Fetching taxonomic data from ITIS

This notebook contains code to fetch family names from ITIS and then build a list of the species that ITIS records in each family, across all 730 plant families. The final result is a single file called allFamNames.csv.

## First check for the required packages, install if needed, and load the libraries.
# Install any missing dependencies, then attach the packages used below.
# Each installer is only invoked when its package is not already available,
# so re-running this cell does not reinstall everything from scratch.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# install_github() lives in remotes, which is not guaranteed to be installed.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}

if (!requireNamespace("sangerseqR", quietly = TRUE)) {
  BiocManager::install("sangerseqR")
}
if (!requireNamespace("bold", quietly = TRUE)) {
  remotes::install_github("ropensci/bold")
}
if (!requireNamespace("taxize", quietly = TRUE)) {
  remotes::install_github("ropensci/taxize")
}

# requireNamespace() is the right availability check here: pacman is only
# ever called through its namespace (pacman::p_load), so it need not be
# attached.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
# p_load() attaches each listed package, installing it first if it is missing.
pacman::p_load(maps, ggplot2, dplyr, countrycode, data.table, raster)

Step 1: Retrieve taxonomic information for families within the plant clade Embryophyta (multicellular plants, excluding green algae) using the ITIS database.

# Query ITIS for every family-rank taxon beneath Embryophyta
# (multicellular plants, excluding green algae).
famnames <- taxize::downstream("Embryophyta", db = "itis", downto = "family")
fams <- famnames[["Embryophyta"]][["taxonname"]]

# Kahukaloaceae has problematic records, so leave it out.
fams <- fams[!(fams == "Kahukaloaceae")]

# Pre-allocate the containers filled by the per-family loop: a three-column
# matrix with the family name in column 1 (columns 2 and 3 start as NA), and
# a list with one slot per family.
famout <- matrix(NA, nrow = length(fams), ncol = 3)
famout[, 1] <- fams
famlist <- vector(mode = "list", length = length(fams))

# Report how many families were retained.
length(fams)

We found 730 families returned from ITIS.

Next we can loop over these families and generate CSV files with all the species in these families.

# Loop over families and save a CSV for each.
# For every family: fetch all taxa down to species rank from ITIS, write the
# combined table to ../data/family_csvs/<family>.csv, and record the file path
# (column 2) and the number of species-rank rows (column 3) in famout.

# write.csv() does not create missing directories, so make sure it exists.
dir.create("../data/family_csvs", recursive = TRUE, showWarnings = FALSE)

for (i in seq_along(fams)) {
  # A single failed ITIS request should not abort the run over all families:
  # warn and move on, leaving this family's CSV unwritten. The result is held
  # in a temporary because assigning NULL via famlist[[i]] would delete the
  # list slot instead of leaving it empty.
  itis_result <- tryCatch(
    taxize::downstream(fams[i], db = "itis", downto = "species"),
    error = function(e) {
      warning(
        "ITIS lookup failed for ", fams[i], ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
  if (is.null(itis_result)) {
    next
  }
  famlist[[i]] <- itis_result

  filename <- paste0("../data/family_csvs/", fams[i], ".csv")
  famout[i, 2] <- filename

  # Row-bind once and reuse the result for both the CSV and the species count.
  fam_taxa <- do.call(rbind, famlist[[i]])
  write.csv(fam_taxa, file = filename)
  famout[i, 3] <- sum(fam_taxa$rankname == "species", na.rm = TRUE)
}

Reassemble data from saved CSV files:

# Read each family's CSV back in, one list element per family. The directory
# must be the one the CSVs were written to ("../data/family_csvs/"); reading
# from a different relative path would make every file.exists() check fail
# and silently produce an empty result. Families with no file on disk keep
# their NULL slot.
out3 <- vector("list", length(fams))
for (i in seq_along(fams)) {
  filename <- paste0("../data/family_csvs/", fams[i], ".csv")
  if (file.exists(filename)) {
    out3[[i]] <- read.csv(filename)
  }
}

Combine all data into a single data frame

This block generates the list of families used in the "Correlations between plant species and DNA barcode availability" notebook.

# Stack the per-family tables into a single data frame and show its size.
out4 <- do.call(rbind, out3)
dim(out4)

# Extract family names: the text before the first "." in column X.
row_labels <- as.character(out4$X)
label_parts <- strsplit(row_labels, "[.]")
out4$family <- do.call(rbind, lapply(label_parts, function(parts) parts[[1]]))

# Preview the first two rows, then save the combined table.
head(out4, 2)
write.csv(out4, file = "../data/allFamNames.csv")