## First check for the required packages, install if needed, and load the libraries.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
::install("sangerseqR")
BiocManager::install_github("ropensci/bold")
remotes::install_github("ropensci/taxize")
remotes
if (!require("pacman")) install.packages("pacman")
::p_load(maps, ggplot2, dplyr, countrycode, data.table, raster) pacman
Building the BOLD dataset
This notebook queries the BOLD API to fetch all the plant specimen data, including metadata and sequences. We clean as we go to correct for errors in the data on BOLD and adjust for some data being held in private projects on BOLD.
Set up directory
dir.create(file.path("./family_csvs"))
setDTthreads(threads = 6) # if you are working on cluster, this will allow data.table to multithread
Step 1: Obtain a list of plant phyla in BOLD from which to pull records
# Name BOLD plant phyla
<- c("Anthocerotophyta", "Bryophyta", "Cycadophyta", "Ginkgophyta", "Magnoliophyta", "Marchantiophyta", "Pinophyta", "Pteridophyta", "Tracheophyta") #"Gnetophyta", "Psilophyta", "Lycopodiophyta" # these were previously listed a families but are now nested under other families
boldphyla length(boldphyla)
Step 2: retrieve all families descending from a vector of Phyla.
<- vector("list", length(boldphyla))
out0
for(i in 1:length(boldphyla)){
<-taxize::downstream(boldphyla[i], db = "bold", downto = "family")
tempx<- tempx[[1]]$name
out0[[i]]
}<- sort(unlist(out0))
boldfamilies
# Remove families that return errors (private entries exist in BOLD so they can be found "downstream" of their respective phyla, but don't permit downloads of any barcode data and need to be excluded before the next phase
# As taxonomy gets reclassified, and barcode coverage improves, the families below may need to be updated depending on when this analysis is being run. The families below did not have public records at the time we ran our analysis, but they may as database coverage improves over time
<- boldfamilies[which(boldfamilies != "Chonecoleaceae")] # in BOLD but nothing public
boldfamilies <- boldfamilies[which(boldfamilies != "Cytinaceae")] # in BOLD but nothing public
boldfamilies <- boldfamilies[which(boldfamilies != "Eropodiaceae")] # in BOLD but nothing public
boldfamilies <- boldfamilies[which(boldfamilies != "Hydnoraceae")] # in BOLD but nothing public
boldfamilies <-boldfamilies[which(boldfamilies != "Labiatae")] # in BOLD but nothing public
boldfamilies<-boldfamilies[which(boldfamilies != "Leucobryaceae")] # in BOLD but nothing public
boldfamilies<-boldfamilies[which(boldfamilies != "Scrophularaceae")] # misspelling in BOLD but nothing public
boldfamilies<-boldfamilies[which(boldfamilies != "Sterculiaceae")] # in BOLD but nothing public
boldfamilies<-boldfamilies[which(boldfamilies != "Kahakuloaceae")]
boldfamilies# Length of boldfamilies plus those that have no public data (hashed out above) is how many land plant families BOLD recognizes
<- c("Chonecoleaceae", "Cytinaceae", "Eropodiaceae", "Hydnoraceae", "Labiatae", "Leucobryaceae", "Scrophularaceae", "Sterculiaceae", "Kahakuloaceae")
nopublicrecords
# This is where we can download the barcode data, for all families, in batches, knowing that there are public records
<- vector("list", length(boldfamilies))
out1 <- matrix(nrow = length(boldfamilies), ncol = 2)
outnames 1] <- boldfamilies
outnames[,<- c("trnL-F", "rbcL", "matK", "ITS") markernames
Step 3. Run a loop that outputs in each iteration a separate .csv file for each family included in the “boldfamilies” vector. We first split the boldfamilies object into three lists to stay within the request limits for the BOLD API.
<- floor((length(boldfamilies)/3))
first_third <- floor(length(boldfamilies) - floor((length(boldfamilies)/3)))
second_third <- floor(length(boldfamilies))
last_third
<- boldfamilies[1:first_third]
boldfamilies1 <- boldfamilies[first_third:second_third]
boldfamilies2 <- boldfamilies[second_third:last_third]
boldfamilies3
# Download data for the first third of families
for(i in 1:length(boldfamilies1)){
<- bold_seqspec(boldfamilies1[i], marker = markernames)
out1[[i]] <-paste("./family_csvs/",boldfamilies1[i],".csv",sep = "")
filename2] <- filename
outnames[i,write.csv(file = filename, out1[[i]])
}
# Download data for the second third of families
for(i in 2:length(boldfamilies2)){
<- bold_seqspec(boldfamilies2[i], marker = markernames)
out1[[i]] <- paste("./family_csvs/",boldfamilies2[i],".csv",sep = "")
filename 2] <- filename
outnames[i,write.csv(file = filename, out1[[i]])
}
# Download data for the last third of families
for(i in 2:length(boldfamilies3)){
<- bold_seqspec(boldfamilies3[i], marker = markernames)
out1[[i]] <- paste("./family_csvs/",boldfamilies3[i],".csv",sep = "")
filename 2] <- filename
outnames[i,write.csv(file = filename, out1[[i]])
}
Step 4: Write BOLD search term lists to keep track of what was included in this batch effort.
write.csv(file = "ListofBoldPhyla_to_Families", boldfamilies)
<- read.csv(file = "ListofBoldPhyla_to_Families")$x
boldfamilies write.csv(file = "ListofBoldPhyla_to_Families_nopublicrecords", nopublicrecords)
Step 5: Read in the files that were output to make the composite “combtab” dataset across all families.
<- vector("list", length(boldfamilies))
out3 for(i in 1:length(boldfamilies)){
<- paste("./family_csvs/", boldfamilies[i],".csv",sep = "")
filetemp if (file.exists(filetemp))
<- read.csv(filetemp)
out3[[i]] }
Step 6: Obtain list of families that downloaded with zero entries.
<- boldfamilies[which(lapply(out3, ncol) != 81)] # 81 is the column with marker codes - this is selecting elements from boldfamilies where the corresponding elements in out3 do not have 81 columns
zerodata write.csv(file = "ListofBoldPhyla_to_Families_zerodata", zerodata)
.1 <- out3[which(lapply(out3, ncol) == 81)]
out3
<- do.call(rbind, out3.1)
out4 write.csv(file = "BoldPhyla_to_Families.csv", out4)
Step 7: load and merge “out” files of interest, dereplicate, and combine them in ways that list the markers
<- read.csv("BoldPhyla_to_Families.csv")
intab
<- intab[,1:68] # note that in the current combtab we select the following columns listed below
combtab
## columns to keep: "processid", "institution_storing", "phylum_taxID", "phylum_name", "class_taxID", "class_name", "order_taxID", "order_name", "family_taxID", "family_name", "subfamily_taxID", "subfamily_name", "genus_taxID", "genus_name", "species_taxID", "species_name", "subspecies_taxID", "subspecies_name"
<- combtab[which(duplicated(combtab[c(3,8,11:26)]) == F),]
combtab
length(unique(intab$family_name))
length(unique(combtab$family_name))
Step 8. Annotate combtab to include barcodes associated with each specimen - rbcL, matK, trnL, ITS.
<- matrix(nrow = nrow(combtab),ncol = 5, "")
markers colnames(markers) <- c("rbcL","matK","trnL", "ITS2", "multi")
levels(intab$markercode)
which(combtab$processid %in% intab[which(intab$markercode %in% c("rbcL", "rbcLa")),]$processid),1] <- "rbcL"
markers[
which(combtab$processid %in% intab[which(intab$markercode %in% c("matK")),]$processid),2] <- "matK"
markers[
which(combtab$processid %in% intab[which(intab$markercode %in% c("trnL", "trnL-F")),]$processid),3] <- "trnL"
markers[
which(combtab$processid %in% intab[which(intab$markercode %in% c("ITS", "ITS2")),]$processid),4] <- "ITS"
markers[
5] <- apply(markers[,1:4] , 1, paste , collapse = "-" )
markers[,
<- as.data.frame(markers)
marker_df unique(marker_df$multi)
Step 9. Save GENBANK information and combine markers list to combtab object
# Save genbank info
<- matrix(nrow = nrow(combtab),ncol = 4, "")
genbank colnames(genbank) <- c("gb_rbcL","gb_matK","gb_trnL", "gb_ITS")
which(combtab$processid %in% intab[which(intab$markercode %in% c("rbcL", "rbcLa")),]$processid),1] <-intab[which(combtab$processid %in% intab[which(intab$markercode %in% c("rbcL", "rbcLa")),]$processid),]$genbank_accession
genbank[
which(combtab$processid %in% intab[which(intab$markercode %in% c("matK")),]$processid),2] <- intab[which(combtab$processid %in% intab[which(intab$markercode %in% c("matK")),]$processid),]$genbank_accession
genbank[
which(combtab$processid %in% intab[which(intab$markercode %in% c("trnL-F")),]$processid),3] <- intab[which(combtab$processid %in% intab[which(intab$markercode %in% c("trnL", "trnL-F")),]$processid),]$genbank_accession
genbank[
which(combtab$processid %in% intab[which(intab$markercode %in% c("ITS", "ITS2")),]$processid),4] <- intab[which(combtab$processid %in% intab[which(intab$markercode %in% c("ITS", "ITS2")),]$processid),]$genbank_accession
genbank[
# Combine markers list to combtab
<- cbind(combtab, markers, genbank) combtab
Step 10. Filter columns that make the file unnecessarily large and write combtab object
This block will generate the same file structure as dataset S1 in the Supplement.
names(combtab)
<- combtab[, c(2:3,8,11:26,35:36,49:53,57:61,69:77)]
combtabout head(combtabout, 2)
write.csv(file = "../data/Kartzinel_et_al_Dataset_S1_20240725.csv", combtabout)