## First check for the required packages, install if needed, and load the libraries.

# BiocManager is the installer for Bioconductor packages (sangerseqR).
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install("sangerseqR")

# remotes is needed for the GitHub installs below; install it if missing.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
remotes::install_github("ropensci/bold")
remotes::install_github("ropensci/taxize")

# require() attaches pacman when it is already installed; otherwise install it.
if (!require("pacman")) install.packages("pacman")
# p_load() installs (if necessary) and attaches each listed package.
pacman::p_load(maps, ggplot2, dplyr, countrycode, data.table, raster)
Fetching taxonomic data from ITIS
This notebook contains code to fetch family names from ITIS and then create lists of species in each family found in ITIS across the total 730 plant families. The final result here is one file called `allFamNames.csv`.
Step 1: Retrieve taxonomic information for families within the plant clade Embryophyta (multicellular plants, excluding green algae) using the ITIS database.
# Retrieve families under "Embryophyta"
famnames <- taxize::downstream("Embryophyta", db = "itis", downto = "family")
# Embryophyta is multicellular plants (excluding green algae)
fams <- famnames$Embryophyta$taxonname
# Remove family with problematic records
fams <- fams[fams != "Kahukaloaceae"]

# Initialize data structures
# famlist: one slot per family for the taxize::downstream() result.
famlist <- vector("list", length(fams))
# famout: one row per family; column 1 = family name. Columns 2 and 3 are
# filled in by the per-family loop (CSV file name and species count).
famout <- matrix(nrow = length(fams), ncol = 3)
famout[, 1] <- fams
# Print the number of families retained.
length(fams)
We found 730 families returned from ITIS.
Next we can loop over these families and generate CSV files with all the species in these families.
# Loop over families and save a CSV for each.
# For every family: query ITIS through taxize::downstream() with
# downto = "species", write the bound result to
# ../data/family_csvs/<family>.csv, and record the file name (famout column 2)
# and the number of rows whose rankname is "species" (famout column 3).
# seq_along() keeps the loop from running when `fams` is empty
# (1:length(fams) would iterate over c(1, 0)).
for (i in seq_along(fams)) {
  famlist[[i]] <- taxize::downstream(fams[i], db = "itis", downto = "species")
  filename <- paste0("../data/family_csvs/", fams[i], ".csv")
  famout[i, 2] <- filename
  # Bind the downstream() result once and reuse it for the CSV and the count.
  fam_records <- do.call(rbind, famlist[[i]])
  write.csv(file = filename, fam_records)
  famout[i, 3] <- length(which(fam_records$rankname == "species"))
}
Reassemble data from saved CSV files:
# Read the per-family CSV files back in, one list slot per family.
# Slots for families whose CSV file does not exist are left as NULL.
out3 <- vector("list", length(fams))
for (i in seq_along(fams)) {
  # Same directory the download loop writes to ("../data/family_csvs/");
  # reading from "./data/..." would silently skip every file.
  filename <- paste0("../data/family_csvs/", fams[i], ".csv")
  if (file.exists(filename)) {
    out3[[i]] <- read.csv(filename)
  }
}
Combine all data into a single data frame
This block generates the list of families used in the "Correlations between plant species and DNA barcode availability" notebook.
# Stack the per-family data frames into one; NULL slots are dropped by rbind.
out4 <- do.call(rbind, out3)
# Fail loudly if no per-family CSV was loaded, instead of writing an empty file.
if (is.null(out4)) {
  stop("No family CSV files were loaded; nothing to combine.", call. = FALSE)
}
dim(out4)
# Extract family names: keep the part of the X column before the first ".".
out4$family <- sub("[.].*$", "", as.character(out4$X))
head(out4, 2)
write.csv(file = "../data/allFamNames.csv", out4)