Screening Agricultural Research Papers with AI

This vignette demonstrates how to automate the screening of agricultural research papers using GPT-based AI models. The goal is to efficiently classify and filter research papers based on predefined inclusion/exclusion criteria, reducing manual workload and ensuring consistency in the selection process.

Step 1: Load & Sample Bibliographic Data

# Extract bibliography data
# Bib <- read_csv("ACDC_abstracts.csv")
# 
# set.seed(444)
# Bib <- Bib %>%
#   sample_n(100)

Step 2: AI-Based Screening with ChatGPT

To use this approach, you’ll need an OpenAI account and an API key for accessing GPT models. Once you have your API key, you can customize your prompt based on your specific screening criteria.

Since API calls incur costs each time they are executed, the code is currently commented out (#) to prevent unnecessary charges. It’s recommended to run the extraction process only when needed, and once you are satisfied with the results, save the output DataFrame to avoid repeated API costs.

# abstract_text<- Bib$`Abstract Note`
# # Create a Chat Object
# Sys.setenv(OPENAI_API_KEY = "insert api key")
# 
# chat <- chat_openai(model = "gpt-4o")
# 
# type_include_paper <- type_object(
#   "Assess whether the paper should be included based on the following strict inclusion criteria. Return one column per criterion, and a final decision column. If any criterion is not met, the paper is excluded. When uncertain, favor inclusion.",
# 
#   location_LA_CA = type_string(
#     "Extract country names from the abstract. If no location is mentioned, return NA."
#   ),
# 
#   field_study = type_string(
#     "Determine whether the study is an **on-field experiment** (not a lab or greenhouse study). If the experiment was conducted on a farm or in natural field conditions, return 'Yes'. If it was conducted in a lab, greenhouse, or controlled environment, return 'No'."
#   ),
# 
#   primary_data = type_string(
#     "Check if the study uses **primary data** (original experimental data). If the study collects and analyzes new data, return 'Yes'. If it is a review, synthesis, or modeling study, return 'No'."
#   ),
# 
#   study_type = type_string(
#     "Determine whether the study is a review, synthesis, or modeling exercise. If it falls into any of these categories, return 'No'. If it is an experimental field study using primary data, return 'Yes'."
#   ),
# 
#   focus_agriculture = type_string(
#     "Check whether the study focuses on **agriculture**. If the study explicitly investigates agricultural practices, return 'Yes'. If it focuses on non-agricultural topics, return 'No'."
#   ),
# 
#   target_crop_livestock = type_string(
#     "Determine if the study focuses on **maize, coffee, beans, or cattle**. If any of these are one or more of those crops state 'Yes'. If the study focuses on other crops return 'No'."
#   ),
# 
#   tested_practices = type_array(
#     "Extract the **agricultural practices tested in the field** from the following predefined list:
#     agroforestry, rotation, intercropping, green manure, crop residue management, mulch, water harvesting, organic fertilizer, biological pest control, selective pesticides, inorganic fertilizer optimization, improved varieties, locally adapted heirloom varieties, reduced tillage, supplemental irrigation.
#     
#     - Return the practices as a comma-separated list **without additional text**.
#     - If no listed practices are tested, return 'No'.",
#     items = type_string()
#   )
# )
# 
# 
# # Process abstracts
# abstracts_info <- lapply(abstract_text, function(abstract) {
#   tryCatch({
#     result <- chat$extract_data(abstract, type = type_include_paper)
#     
#     data.frame(
#       location_LA_CA = result$location_LA_CA %||% NA,
#       field_study = result$field_study %||% NA,
#       primary_data = result$primary_data %||% NA,
#       study_type = result$study_type %||% NA,
#       focus_agriculture = result$focus_agriculture %||% NA,
#       target_crop_livestock = result$target_crop_livestock %||% NA,
#       tested_practices = ifelse(length(result$tested_practices) > 0, paste(result$tested_practices, collapse = ", "), NA)
#     )
#   }, error = function(e) {
#     data.frame(
#       location_LA_CA = NA, field_study = NA, primary_data = NA, study_type = NA,
#       focus_agriculture = NA, target_crop_livestock = NA, tested_practices = NA
#     )
#   })
# })
# 
# # Convert to DataFrame
# abstracts_df <- bind_rows(abstracts_info)
# 
# # Merge extracted information with Bibliography data
# ERA_processed <- cbind(Bib, abstracts_df)
# 
# ERA_processed <- ERA_processed[, c(setdiff(names(ERA_processed), "Status"), "Status")]
# 
# # Define list of Latin American and Central American (LAC/CAC) countries
# library(dplyr)
# 
# # Define list of Latin American and Central American (LAC/CAC) countries
# LAC_CAC_countries <- c("Mexico", "Belize", "Guatemala", "El Salvador", "Honduras", 
#                         "Nicaragua", "Costa Rica", "Panama", "Colombia", "Venezuela", 
#                         "Ecuador", "Peru", "Bolivia", "Paraguay", "Chile", "Argentina", 
#                         "Uruguay", "Brazil")
# 
# # Ensure `is_LAC_CAC` is added before filtering
# ERA_processed <- ERA_processed %>%
#   mutate(
#     is_LAC_CAC = ifelse(location_LA_CA %in% LAC_CAC_countries, "Yes", "No")
#   )
# 
# # Define required columns
# required_columns <- c("is_LAC_CAC", "field_study", "primary_data", "study_type", "focus_agriculture", "target_crop_livestock")
# 
# # Check if all required columns exist
# missing_columns <- setdiff(required_columns, colnames(ERA_processed))
# if (length(missing_columns) > 0) {
#   stop(paste("Missing columns in ERA_processed:", paste(missing_columns, collapse = ", ")))
# }
# 
# # Apply filtering logic
# ERA_processed <- ERA_processed %>%
#   mutate(
#     failing_criteria = apply(select(., all_of(required_columns)) %>% as.data.frame(), 
#                              1, function(row) paste(names(row)[row == "No"], collapse = ", ")),
#     
#     Status_AI = ifelse(failing_criteria == "", "Included", paste("Excluded - Issues:", failing_criteria))
#   ) %>%
#   
#   # Remove temporary `failing_criteria` column
#   select(-failing_criteria)
# 
# # Display final processed dataset
# print(ERA_processed)
# 
# 
# write.csv(ERA_processed, file = "ERA_screened_GPT.csv")

# Two-column lookup table of the screening prompts: one row per inclusion
# criterion, with the instruction text given for that criterion.
criteria_table <- local({
  # Names are the criterion labels; values are the prompt descriptions.
  prompts <- c(
    "Location (Latin America & Central America)" =
      "Extract country names from the abstract. If the study is conducted in Latin America or Central America, return 'Yes'. Otherwise, return 'No'. If no location is mentioned, return 'NA'.",
    "Field Study" =
      "Determine whether the study is an **on-field experiment** (not a lab or greenhouse study). If conducted on a farm or in natural field conditions, return 'Yes'. Otherwise, return 'No'.",
    "Primary Data" =
      "Check if the study collects **primary data** (original experimental data). If it does, return 'Yes'. If it is a review, synthesis, or modeling study, return 'No'.",
    "Study Type" =
      "Determine whether the study is a **review, synthesis, or modeling exercise**. If it falls into any of these categories, return 'No'. If it is an experimental field study using primary data, return 'Yes'.",
    "Focus on Agriculture" =
      "Check whether the study focuses on **agriculture**. If it explicitly investigates agricultural practices, return 'Yes'. Otherwise, return 'No'.",
    "Target Crops/Livestock" =
      "Determine if the study focuses on **maize, coffee, beans, or cattle**. If one or more of these are studied, return 'Yes'. If the study focuses on other crops or livestock, return 'No'.",
    "Tested Agricultural Practices" =
      "Extract the **agricultural practices tested in the field** from a predefined list (e.g., agroforestry, crop rotation, organic fertilizer). If no listed practices are tested, return 'No'."
  )

  data.frame(
    Criterion = names(prompts),
    Description = unname(prompts)
  )
})

# Render the criteria table as a captioned HTML table, then apply compact
# bootstrap styling (striped rows, hover highlight, not full width).
kable_styling(
  kable(
    criteria_table,
    format = "html",
    caption = "Summary of Prompts based on inclusion criteria"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)

Summary of Prompts based on inclusion criteria
Criterion	Description
Location (Latin America & Central America)	Extract country names from the abstract. If the study is conducted in Latin America or Central America, return ‘Yes’. Otherwise, return ‘No’. If no location is mentioned, return ‘NA’.
Field Study	Determine whether the study is an on-field experiment (not a lab or greenhouse study). If conducted on a farm or in natural field conditions, return ‘Yes’. Otherwise, return ‘No’.
Primary Data	Check if the study collects primary data (original experimental data). If it does, return ‘Yes’. If it is a review, synthesis, or modeling study, return ‘No’.
Study Type	Determine whether the study is a review, synthesis, or modeling exercise. If it falls into any of these categories, return ‘No’. If it is an experimental field study using primary data, return ‘Yes’.
Focus on Agriculture	Check whether the study focuses on agriculture. If it explicitly investigates agricultural practices, return ‘Yes’. Otherwise, return ‘No’.
Target Crops/Livestock	Determine if the study focuses on maize, coffee, beans, or cattle. If one or more of these are studied, return ‘Yes’. If the study focuses on other crops or livestock, return ‘No’.
Tested Agricultural Practices	Extract the agricultural practices tested in the field from a predefined list (e.g., agroforestry, crop rotation, organic fertilizer). If no listed practices are tested, return ‘No’.

# Path to the saved screening results. This is a machine-specific absolute
# path: change it to wherever your copy of ERA_screened_GPT.csv lives.
# (Presumably this is the file produced by the write.csv() call in Step 2.)
era_screened_path <- "C:/Users/mlolita/OneDrive - CGIAR/Documents/AI/data/ERA_screened_GPT.csv"

# Load the data once, keeping only the columns that are displayed or compared
# below: the abstract, the per-criterion GPT answers, the human decision
# (`Status`) and the GPT decision (`Status_AI`).
ERA_screened_GPT <- read_csv(era_screened_path) %>%
  select(`Abstract Note`, location_LA_CA, field_study, primary_data, study_type,
         focus_agriculture, target_crop_livestock, tested_practices, is_LAC_CAC, Status, Status_AI)

# Collapse each abstract behind a "Click to expand" toggle. The full text is
# kept (nothing is truncated); it is only hidden until the reader opens it.
# Because the table below is rendered with escape = FALSE, the abstract text is
# HTML-escaped first so that characters such as "<" (e.g. "p < 0.05") or "&"
# cannot break the surrounding <details> markup. "&" must be replaced first,
# otherwise the "&" introduced by "&lt;"/"&gt;" would be escaped a second time.
ERA_screened_GPT <- ERA_screened_GPT %>%
  mutate(`Abstract Note` = paste0(
    "<details><summary>Click to expand</summary><p>",
    gsub(
      ">", "&gt;",
      gsub(
        "<", "&lt;",
        gsub("&", "&amp;", `Abstract Note`, fixed = TRUE),
        fixed = TRUE
      ),
      fixed = TRUE
    ),
    "</p></details>"
  ))

# Create paginated table with expandable abstracts
datatable(
  ERA_screened_GPT, 
  escape = FALSE,  # Render HTML as-is (needed for the <details> wrapper); applies to every column
  rownames = FALSE, 
  options = list(
    pageLength = 5,  # Display 5 rows per page
    lengthMenu = c(5, 10, 25, 50),  # Allow changing number of rows per page
    autoWidth = TRUE
  )
)

Accuracy

# Label every abstract by how the human decision (`Status`) compares with the
# GPT decision (`Status_AI`). Rules are checked in order and the first match
# wins; rows matching none of them are labelled "Other".
ERA_screened_GPT$Screening_Comparison <- with(ERA_screened_GPT, {
  human_included <- grepl("Extracted|Included", Status)
  human_rejected <- grepl("Rejected", Status)
  gpt_included <- grepl("Included", Status_AI)
  gpt_excluded <- grepl("Excluded", Status_AI)

  case_when(
    human_included & gpt_included ~ "Match - Included",
    human_rejected & gpt_excluded ~ "Match - Rejected",
    human_rejected & gpt_included ~ "False Inclusion (GPT Error)",
    human_included & gpt_excluded ~ "False Exclusion (GPT Error)",
    TRUE ~ "Other"
  )
})

# Count how many abstracts fall into each comparison category
summary_table <- ERA_screened_GPT %>%
  count(Screening_Comparison)

# Look up the count for one category by its label. Returns 0 when the category
# does not occur, so the metrics below do not depend on the row order of
# `summary_table` or on every category being present.
count_for <- function(label) {
  sum(summary_table$n[summary_table$Screening_Comparison == label])
}

# Confusion-matrix cells, treating "included" as the positive class
tp <- count_for("Match - Included")
tn <- count_for("Match - Rejected")
fp <- count_for("False Inclusion (GPT Error)")
fn <- count_for("False Exclusion (GPT Error)")

# Overall accuracy, as a percentage of all abstracts (including any "Other")
total_cases <- nrow(ERA_screened_GPT)
correct_predictions <- tp + tn
accuracy <- round((correct_predictions / total_cases) * 100, 2)

# Class sizes according to the human decision.
# NOTE: these use exact matching on `Status`, whereas Screening_Comparison is
# built with pattern matching (grepl); the two agree only when `Status` holds
# exactly "Extracted", "Included" or a value containing "Rejected".
total_include_n <- nrow(subset(ERA_screened_GPT, Status %in% c("Extracted", "Included")))
total_exclude_n <- nrow(subset(ERA_screened_GPT, !Status %in% c("Extracted", "Included")))

# Per-class accuracy and derived metrics (all on a 0-1 scale)
accuracy_include <- tp / total_include_n
accuracy_exclude <- tn / total_exclude_n
balanced_accuracy <- (accuracy_include + accuracy_exclude) / 2
f1_score <- tp / (tp + (0.5 * (fp + fn)))
sensitivity <- tp / (tp + fn)
specificity <- tn / (tn + fp)

# Collect the screening metrics into a two-column table for display.
# Note the mixed units: the four accuracy rows are percentages (0-100), while
# F1 score, sensitivity and specificity are proportions (0-1).
accuracy_df <- local({
  metric_values <- c(
    "Num. Abstracts" = total_cases,
    "Num. Include Abstracts" = total_include_n,
    "Num. Exclude Abstracts" = total_exclude_n,
    "Accuracy" = accuracy,
    "Inclusion Accuracy" = accuracy_include * 100,
    "Exclusion Accuracy" = accuracy_exclude * 100,
    "Balanced Accuracy" = balanced_accuracy * 100,
    "F1 Score" = f1_score,
    # Sensitivity matters most when the main concern is relevant papers being
    # excluded by mistake.
    "Sensitivity" = sensitivity,
    "Specificity" = specificity
  )

  data.frame(
    Metric = names(metric_values),
    Value = unname(metric_values)
  )
})


# Show the per-category counts, with the overall accuracy in the caption
kable_styling(
  kable(
    summary_table,
    format = "html",
    caption = paste("GPT Screening Accuracy:", accuracy, "%")
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)

GPT Screening Accuracy: 81 %
Screening_Comparison	n
False Exclusion (GPT Error)	14
False Inclusion (GPT Error)	5
Match - Included	10
Match - Rejected	71

# Show the detailed accuracy metrics, rounded to two decimals
kable_styling(
  kable(
    accuracy_df,
    format = "html",
    digits = 2,
    caption = "GPT Screening Accuracy Details"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)

GPT Screening Accuracy Details
Metric	Value
Num. Abstracts	100.00
Num. Include Abstracts	24.00
Num. Exclude Abstracts	76.00
Accuracy	81.00
Inclusion Accuracy	41.67
Exclusion Accuracy	93.42
Balanced Accuracy	67.54
F1 Score	0.51
Sensitivity	0.42
Specificity	0.93

Note: Some of the cases counted above as false exclusions (GPT errors) are in fact correct GPT decisions. During the ACDC extraction process, some papers from Africa were unintentionally included even though they do not meet the location criterion. In these cases, GPT correctly excluded the papers based on location, while the original manual extraction mistakenly retained them, so the sensitivity reported above likely understates GPT's actual performance.

Cost analysis

# Observed cost and processing parameters from the screening run
total_cost <- 3.35    # Total cost (USD) for processing 100 abstracts
num_abstracts <- 100  # Number of abstracts processed
num_requests <- 7     # Requests per abstract

# Unit costs derived from the observed run
cost_per_abstract <- total_cost / num_abstracts
cost_per_request <- total_cost / (num_abstracts * num_requests)

# Dataset sizes for which the cost is extrapolated (linear in abstract count)
abstract_sizes <- c(100, 500, 1000, 5000, 10000, 50000, 100000)

# check.names = FALSE keeps the human-readable column headers as written;
# with the default, data.frame() would rewrite them into syntactic names
# (e.g. "Number.of.Abstracts"), which then show up in the rendered table.
cost_estimates <- data.frame(
  "Number of Abstracts" = abstract_sizes,
  "Total Requests" = abstract_sizes * num_requests,
  "Estimated Cost ($USD)" = round(abstract_sizes * cost_per_abstract, 2),
  check.names = FALSE
)

# Headline cost figures for the observed run, pre-formatted as display strings
cost_summary <- local({
  summary_values <- c(
    "Total Cost for 100 Abstracts" = paste0("$", round(total_cost, 2)),
    "Total Requests Made" = format(num_abstracts * num_requests, big.mark = ","),
    "Cost per Abstract (All 7 Requests)" = paste0("$", round(cost_per_abstract, 4)),
    "Cost per GPT Request" = paste0("$", round(cost_per_request, 5))
  )

  data.frame(
    Metric = names(summary_values),
    Value = unname(summary_values)
  )
})

# Render the cost summary as a compact, striped HTML table
kable_styling(
  kable(
    cost_summary,
    format = "html",
    caption = "GPT Processing Cost Summary"
  ),
  full_width = FALSE,
  bootstrap_options = c("striped", "hover", "condensed")
)

GPT Processing Cost Summary
Metric	Value
Total Cost for 100 Abstracts	$3.35
Total Requests Made	700
Cost per Abstract (All 7 Requests)	$0.0335
Cost per GPT Request	$0.00479

# Display cost estimates for different dataset sizes.
# format.args is forwarded to format(): scientific = FALSE stops large counts
# from being printed as 1e+05, and big.mark adds thousands separators.
kable(
  cost_estimates,
  format = "html",
  caption = "Estimated Costs for Different Dataset Sizes",
  format.args = list(scientific = FALSE, big.mark = ",")
) %>%
  kable_styling(full_width = FALSE, bootstrap_options = c("striped", "hover", "condensed"))

Estimated Costs for Different Dataset Sizes
Number.of.Abstracts	Total.Requests	Estimated.Cost…USD.
1e+02	700	3.35
5e+02	3500	16.75
1e+03	7000	33.50
5e+03	35000	167.50
1e+04	70000	335.00
5e+04	350000	1675.00
1e+05	700000	3350.00

Screening Agricultural Research Papers with AI

Lolita Muller

Namita Joshi

Peter Steward

Todd Rosenstock

2025-10-14

Screening Agricultural Research Papers with AI

Step 1: Load & Sample Bibliographic Data

Step 2: AI-Based Screening with ChatGPT

Accuracy

Cost analysis