Step 2: AI-Based Screening with ChatGPT
To use this approach, you’ll need an OpenAI account and an API key
for accessing GPT models. Once you have your API key, you can customize
your prompt based on your specific screening criteria.
Since API calls incur costs each time they are executed, the code is
currently commented out (#) to prevent unnecessary charges. It’s
recommended to run the extraction process only when needed, and once you
are satisfied with the results, save the output DataFrame to avoid
repeated API costs.
# abstract_text<- Bib$`Abstract Note`
# # Create a Chat Object
# Sys.setenv(OPENAI_API_KEY = "insert api key")
#
# chat <- chat_openai(model = "gpt-4o")
#
# type_include_paper <- type_object(
# "Assess whether the paper should be included based on the following strict inclusion criteria. Return one column per criterion, and a final decision column. If any criterion is not met, the paper is excluded. When uncertain, favor inclusion.",
#
# location_LA_CA = type_string(
# "Extract country names from the abstract. If no location is mentioned, return NA."
# ),
#
# field_study = type_string(
# "Determine whether the study is an **on-field experiment** (not a lab or greenhouse study). If the experiment was conducted on a farm or in natural field conditions, return 'Yes'. If it was conducted in a lab, greenhouse, or controlled environment, return 'No'."
# ),
#
# primary_data = type_string(
# "Check if the study uses **primary data** (original experimental data). If the study collects and analyzes new data, return 'Yes'. If it is a review, synthesis, or modeling study, return 'No'."
# ),
#
# study_type = type_string(
# "Determine whether the study is a review, synthesis, or modeling exercise. If it falls into any of these categories, return 'No'. If it is an experimental field study using primary data, return 'Yes'."
# ),
#
# focus_agriculture = type_string(
# "Check whether the study focuses on **agriculture**. If the study explicitly investigates agricultural practices, return 'Yes'. If it focuses on non-agricultural topics, return 'No'."
# ),
#
# target_crop_livestock = type_string(
# "Determine if the study focuses on **maize, coffee, beans, or cattle**. If any of these are one or more of those crops state 'Yes'. If the study focuses on other crops return 'No'."
# ),
#
# tested_practices = type_array(
# "Extract the **agricultural practices tested in the field** from the following predefined list:
# agroforestry, rotation, intercropping, green manure, crop residue management, mulch, water harvesting, organic fertilizer, biological pest control, selective pesticides, inorganic fertilizer optimization, improved varieties, locally adapted heirloom varieties, reduced tillage, supplemental irrigation.
#
# - Return the practices as a comma-separated list **without additional text**.
# - If no listed practices are tested, return 'No'.",
# items = type_string()
# )
# )
#
#
# # Process abstracts
# abstracts_info <- lapply(abstract_text, function(abstract) {
# tryCatch({
# result <- chat$extract_data(abstract, type = type_include_paper)
#
# data.frame(
# location_LA_CA = result$location_LA_CA %||% NA,
# field_study = result$field_study %||% NA,
# primary_data = result$primary_data %||% NA,
# study_type = result$study_type %||% NA,
# focus_agriculture = result$focus_agriculture %||% NA,
# target_crop_livestock = result$target_crop_livestock %||% NA,
# tested_practices = ifelse(length(result$tested_practices) > 0, paste(result$tested_practices, collapse = ", "), NA)
# )
# }, error = function(e) {
# data.frame(
# location_LA_CA = NA, field_study = NA, primary_data = NA, study_type = NA,
# focus_agriculture = NA, target_crop_livestock = NA, tested_practices = NA
# )
# })
# })
#
# # Convert to DataFrame
# abstracts_df <- bind_rows(abstracts_info)
#
# # Merge extracted information with Bibliography data
# ERA_processed <- cbind(Bib, abstracts_df)
#
# ERA_processed <- ERA_processed[, c(setdiff(names(ERA_processed), "Status"), "Status")]
#
# # Load dplyr for the data-frame pipeline below
# library(dplyr)
#
# # Define list of Latin American and Central American (LAC/CAC) countries
# LAC_CAC_countries <- c("Mexico", "Belize", "Guatemala", "El Salvador", "Honduras",
# "Nicaragua", "Costa Rica", "Panama", "Colombia", "Venezuela",
# "Ecuador", "Peru", "Bolivia", "Paraguay", "Chile", "Argentina",
# "Uruguay", "Brazil")
#
# # Ensure `is_LAC_CAC` is added before filtering
# ERA_processed <- ERA_processed %>%
# mutate(
# is_LAC_CAC = ifelse(location_LA_CA %in% LAC_CAC_countries, "Yes", "No")
# )
#
# # Define required columns
# required_columns <- c("is_LAC_CAC", "field_study", "primary_data", "study_type", "focus_agriculture", "target_crop_livestock")
#
# # Check if all required columns exist
# missing_columns <- setdiff(required_columns, colnames(ERA_processed))
# if (length(missing_columns) > 0) {
# stop(paste("Missing columns in ERA_processed:", paste(missing_columns, collapse = ", ")))
# }
#
# # Apply filtering logic
# ERA_processed <- ERA_processed %>%
# mutate(
# failing_criteria = apply(select(., all_of(required_columns)) %>% as.data.frame(),
# 1, function(row) paste(names(row)[row == "No"], collapse = ", ")),
#
# Status_AI = ifelse(failing_criteria == "", "Included", paste("Excluded - Issues:", failing_criteria))
# ) %>%
#
# # Remove temporary `failing_criteria` column
# select(-failing_criteria)
#
# # Display final processed dataset
# print(ERA_processed)
#
#
# write.csv(ERA_processed, file = "ERA_screened_GPT.csv")
# Build the summary table of screening criteria: one row per criterion,
# pairing its label (Criterion) with its description (Description).
criteria_table <- local({
  # Names are the criterion labels; values are the matching descriptions.
  description_by_criterion <- c(
    "Location (Latin America & Central America)" =
      "Extract country names from the abstract. If the study is conducted in Latin America or Central America, return 'Yes'. Otherwise, return 'No'. If no location is mentioned, return 'NA'.",
    "Field Study" =
      "Determine whether the study is an **on-field experiment** (not a lab or greenhouse study). If conducted on a farm or in natural field conditions, return 'Yes'. Otherwise, return 'No'.",
    "Primary Data" =
      "Check if the study collects **primary data** (original experimental data). If it does, return 'Yes'. If it is a review, synthesis, or modeling study, return 'No'.",
    "Study Type" =
      "Determine whether the study is a **review, synthesis, or modeling exercise**. If it falls into any of these categories, return 'No'. If it is an experimental field study using primary data, return 'Yes'.",
    "Focus on Agriculture" =
      "Check whether the study focuses on **agriculture**. If it explicitly investigates agricultural practices, return 'Yes'. Otherwise, return 'No'.",
    "Target Crops/Livestock" =
      "Determine if the study focuses on **maize, coffee, beans, or cattle**. If one or more of these are studied, return 'Yes'. If the study focuses on other crops or livestock, return 'No'.",
    "Tested Agricultural Practices" =
      "Extract the **agricultural practices tested in the field** from a predefined list (e.g., agroforestry, crop rotation, organic fertilizer). If no listed practices are tested, return 'No'."
  )

  # unname() keeps the labels out of the row names, so rows stay numbered 1..7.
  data.frame(
    Criterion = names(description_by_criterion),
    Description = unname(description_by_criterion)
  )
})
# Render the criteria table as a styled HTML table in the Quarto output
kable_styling(
  kable(
    criteria_table,
    format = "html",
    caption = "Summary of Prompts based on inclusion criteria"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)
Summary of Prompts based on inclusion criteria
|
Criterion
|
Description
|
|
Location (Latin America & Central America)
|
Extract country names from the abstract. If the study is conducted in
Latin America or Central America, return ‘Yes’. Otherwise, return ‘No’.
If no location is mentioned, return ‘NA’.
|
|
Field Study
|
Determine whether the study is an on-field experiment
(not a lab or greenhouse study). If conducted on a farm or in natural
field conditions, return ‘Yes’. Otherwise, return ‘No’.
|
|
Primary Data
|
Check if the study collects primary data (original
experimental data). If it does, return ‘Yes’. If it is a review,
synthesis, or modeling study, return ‘No’.
|
|
Study Type
|
Determine whether the study is a review, synthesis, or modeling
exercise. If it falls into any of these categories, return
‘No’. If it is an experimental field study using primary data, return
‘Yes’.
|
|
Focus on Agriculture
|
Check whether the study focuses on agriculture. If it
explicitly investigates agricultural practices, return ‘Yes’. Otherwise,
return ‘No’.
|
|
Target Crops/Livestock
|
Determine if the study focuses on maize, coffee, beans, or
cattle. If one or more of these are studied, return ‘Yes’. If
the study focuses on other crops or livestock, return ‘No’.
|
|
Tested Agricultural Practices
|
Extract the agricultural practices tested in the field
from a predefined list (e.g., agroforestry, crop rotation, organic
fertilizer). If no listed practices are tested, return ‘No’.
|
# Load the saved GPT screening results and keep only the columns used below.
# The CSV is read once. The path is absolute and machine-specific: replace it
# with the file's location on your machine.
ERA_screened_GPT <- read_csv(
  "C:/Users/mlolita/OneDrive - CGIAR/Documents/AI/data/ERA_screened_GPT.csv"
) %>%
  select(
    `Abstract Note`, location_LA_CA, field_study, primary_data, study_type,
    focus_agriculture, target_crop_livestock, tested_practices, is_LAC_CAC,
    Status, Status_AI
  )
# Wrap each abstract in an HTML <details> element so it renders collapsed
# behind a "Click to expand" toggle (the full text is kept, not truncated).
ERA_screened_GPT <- mutate(
  ERA_screened_GPT,
  `Abstract Note` = paste0(
    "<details><summary>Click to expand</summary><p>",
    `Abstract Note`,
    "</p></details>"
  )
)
# Show the screening results as an interactive, paginated table.
# escape = FALSE lets the <details> markup in the abstract column render as
# HTML instead of being shown as literal text.
ERA_screened_GPT %>%
  datatable(
    rownames = FALSE,
    escape = FALSE,
    options = list(
      pageLength = 5,                 # rows shown per page by default
      lengthMenu = c(5, 10, 25, 50),  # page sizes the reader can pick
      autoWidth = TRUE
    )
  )
Accuracy
# Compare the human screening decision (Status) with the GPT decision
# (Status_AI) and label every abstract with the outcome of that comparison.
ERA_screened_GPT <- ERA_screened_GPT %>%
  mutate(
    Screening_Comparison = case_when(
      grepl("Extracted|Included", Status) & grepl("Included", Status_AI) ~ "Match - Included",
      grepl("Rejected", Status) & grepl("Excluded", Status_AI) ~ "Match - Rejected",
      grepl("Rejected", Status) & grepl("Included", Status_AI) ~ "False Inclusion (GPT Error)",
      grepl("Extracted|Included", Status) & grepl("Excluded", Status_AI) ~ "False Exclusion (GPT Error)",
      TRUE ~ "Other"
    )
  )

# Count occurrences of each comparison outcome
summary_table <- ERA_screened_GPT %>%
  count(Screening_Comparison)

# Compute total accuracy (as a percentage of all abstracts)
total_cases <- nrow(ERA_screened_GPT)
correct_predictions <- sum(
  summary_table$n[
    summary_table$Screening_Comparison %in% c("Match - Included", "Match - Rejected")
  ]
)
accuracy <- round((correct_predictions / total_cases) * 100, 2)

# Class sizes according to the human decision.
# NOTE(review): these use exact matches on Status, while the comparison labels
# above use grepl() pattern matches; confirm Status only takes exact values.
total_include_n <- nrow(subset(ERA_screened_GPT, Status %in% c("Extracted", "Included")))
total_exclude_n <- nrow(subset(ERA_screened_GPT, !Status %in% c("Extracted", "Included")))

# Confusion-matrix cells, looked up by label rather than by row position.
# An outcome with no rows is absent from summary_table; sum() over the empty
# selection then yields 0 instead of picking up a different outcome's count.
tp <- sum(summary_table$n[summary_table$Screening_Comparison %in% "Match - Included"])
tn <- sum(summary_table$n[summary_table$Screening_Comparison %in% "Match - Rejected"])
fp <- sum(summary_table$n[summary_table$Screening_Comparison %in% "False Inclusion (GPT Error)"])
fn <- sum(summary_table$n[summary_table$Screening_Comparison %in% "False Exclusion (GPT Error)"])

# Per-class accuracies and derived metrics (all proportions in [0, 1])
accuracy_include <- tp / total_include_n
accuracy_exclude <- tn / total_exclude_n
balanced_accuracy <- (accuracy_include + accuracy_exclude) / 2
f1_score <- tp / (tp + (0.5 * (fp + fn)))
sensitivity <- tp / (tp + fn)
specificity <- tn / (tn + fp)

# Assemble the metrics table. The four "Accuracy" rows are percentages;
# F1 Score, Sensitivity and Specificity are left as proportions.
accuracy_df <- data.frame(
  Metric = c(
    "Num. Abstracts",
    "Num. Include Abstracts",
    "Num. Exclude Abstracts",
    "Accuracy",
    "Inclusion Accuracy",
    "Exclusion Accuracy",
    "Balanced Accuracy",
    "F1 Score",
    "Sensitivity",
    "Specificity"
  ),
  Value = c(
    total_cases,
    total_include_n,
    total_exclude_n,
    accuracy,
    (accuracy_include * 100),
    (accuracy_exclude * 100),
    (balanced_accuracy * 100),
    f1_score,
    sensitivity, # the most important metric if the concern is papers being excluded by accident
    specificity
  )
)
# Render the comparison counts, with the overall accuracy in the caption
kable_styling(
  kable(
    summary_table,
    format = "html",
    caption = paste("GPT Screening Accuracy:", accuracy, "%")
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)
GPT Screening Accuracy: 81 %
|
Screening_Comparison
|
n
|
|
False Exclusion (GPT Error)
|
14
|
|
False Inclusion (GPT Error)
|
5
|
|
Match - Included
|
10
|
|
Match - Rejected
|
71
|
# Render the detailed accuracy metrics, rounded to two decimals
kable_styling(
  kable(
    accuracy_df,
    format = "html",
    digits = 2,
    caption = "GPT Screening Accuracy Details"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)
GPT Screening Accuracy Details
|
Metric
|
Value
|
|
Num. Abstracts
|
100.00
|
|
Num. Include Abstracts
|
24.00
|
|
Num. Exclude Abstracts
|
76.00
|
|
Accuracy
|
81.00
|
|
Inclusion Accuracy
|
41.67
|
|
Exclusion Accuracy
|
93.42
|
|
Balanced Accuracy
|
67.54
|
|
F1 Score
|
0.51
|
|
Sensitivity
|
0.42
|
|
Specificity
|
0.93
|
Note: Some of the cases counted as false exclusions are actually correct
GPT decisions. During the ACDC extraction process, we unintentionally included
some papers from Africa, even though they should have been excluded
based on the criteria. In these cases, GPT correctly excluded the papers
based on location, while our original extraction mistakenly retained
them.
Cost analysis
# Define given cost and processing parameters
total_cost <- 3.35    # Total cost (USD) for processing `num_abstracts` abstracts
num_abstracts <- 100  # Number of abstracts processed
num_requests <- 7     # Requests per abstract

# Compute cost per abstract and cost per request
cost_per_abstract <- total_cost / num_abstracts
cost_per_request <- total_cost / (num_abstracts * num_requests)

# Dataset sizes for which the cost is extrapolated (already numeric)
abstract_sizes <- c(100, 500, 1000, 5000, 10000, 50000, 100000)

# Linear extrapolation of requests and cost to each dataset size.
# With data.frame()'s default check.names = TRUE the quoted names below become
# Number.of.Abstracts, Total.Requests and Estimated.Cost...USD.
cost_estimates <- data.frame(
  "Number of Abstracts" = abstract_sizes,
  "Total Requests" = abstract_sizes * num_requests,
  "Estimated Cost ($USD)" = round(abstract_sizes * cost_per_abstract, 2)
)

# Create summary table. Labels are built from the parameters above so they
# stay correct if num_abstracts or num_requests is changed.
cost_summary <- data.frame(
  Metric = c(
    paste0("Total Cost for ", num_abstracts, " Abstracts"),
    "Total Requests Made",
    paste0("Cost per Abstract (All ", num_requests, " Requests)"),
    "Cost per GPT Request"
  ),
  Value = c(
    sprintf("$%.2f", total_cost),  # always two decimals, e.g. "$3.50"
    format(num_abstracts * num_requests, big.mark = ","),
    paste0("$", round(cost_per_abstract, 4)),
    paste0("$", round(cost_per_request, 5))
  )
)
# Render the cost summary as a styled HTML table
kable_styling(
  kable(
    cost_summary,
    format = "html",
    caption = "GPT Processing Cost Summary"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)
GPT Processing Cost Summary
|
Metric
|
Value
|
|
Total Cost for 100 Abstracts
|
$3.35
|
|
Total Requests Made
|
700
|
|
Cost per Abstract (All 7 Requests)
|
$0.0335
|
|
Cost per GPT Request
|
$0.00479
|
# Display cost estimates for different dataset sizes.
# cost_estimates carries syntactic column names (e.g. Number.of.Abstracts), so
# readable headers are supplied here. format.args turns off scientific
# notation (otherwise 100 prints as 1e+02) and adds thousands separators.
kable(
  cost_estimates,
  format = "html",
  col.names = c("Number of Abstracts", "Total Requests", "Estimated Cost ($USD)"),
  format.args = list(big.mark = ",", scientific = FALSE),
  caption = "Estimated Costs for Different Dataset Sizes"
) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))
Estimated Costs for Different Dataset Sizes
|
Number.of.Abstracts
|
Total.Requests
|
Estimated.Cost…USD.
|
|
1e+02
|
700
|
3.35
|
|
5e+02
|
3500
|
16.75
|
|
1e+03
|
7000
|
33.50
|
|
5e+03
|
35000
|
167.50
|
|
1e+04
|
70000
|
335.00
|
|
5e+04
|
350000
|
1675.00
|
|
1e+05
|
700000
|
3350.00
|