From Qiime2 into R - Initial diagnostics

Packages

library(qiime2R)
library(tidyverse)
library(stringr)
library(vegan)
library(phyloseq)

Import qiime files

## use this vector to set factor levels (this is only needed if you want treatments
## in a certain order for your figures)
### 'levelsvector' is added to steps below.
levelsvector = c("Control", "H2O2","Heat", "Inositol", "SodiumFormate", "SodiumAcetate",
"CitricAcid", "FumaricAcid",  "Lime", "PoultryLitter", "Biochar")

## metadata
# This is the same metadata file that was used in qiime2
metadata<- readr::read_tsv("metadata.tsv")
metadata2 <- metadata[c(2:nrow(metadata)),c(1:length(metadata))] %>%
  tibble::rownames_to_column("spl") %>%
  dplyr::mutate_all(type.convert) %>%
  dplyr::mutate_if(is.factor, as.character) %>%
  tibble::column_to_rownames("spl")

# rename, add or edit factors
metadata2$Treatment[metadata2$Treatment == "AutoclavedControl"] <- "Heat"
metadata2$Treatment <- factor(metadata2$Treatment, levels=levelsvector)
metadata2$Week <- factor(metadata2$Week, levels=c("T0","T39","T52","Reference", "Blank"))
metadata2 <- metadata2 %>% as_tibble() %>% column_to_rownames("#SampleID")

## Import of Qiime2 feature table (ASV abundances)
SVs <- qiime2R::read_qza("feature_table_insertiontreefiltered.qza")

## Import taxonomy file
## I found it easier to use a tsv file. I.e. after creating the qiime taxonomy.qzv I
## opened the file visualisation in https://view.qiime2.org/ and downloaded the .tsv from there.

taxonomy <- read_tsv("taxonomy_silva.tsv") %>%
  filter(Taxon != "categorical")  %>%
  rename(Feature.ID = `Feature ID`)

## re-format the taxonomy file to split the taxonomy into columns
## remove the confidence column
taxtable <- taxonomy %>% as_tibble() %>%
  separate(Taxon, sep=";", c("Kingdom","Phylum","Class","Order","Family","Genus","Species"))  %>%
  dplyr::select(-Confidence) %>%
  column_to_rownames("Feature.ID") %>%
  as.matrix()

## Import qiime tree file, here the insertion-tree.qza, see https://library.qiime2.org/plugins/q2-fragment-insertion/16/
tree <- read_qza("insertion-tree-silva.qza")

Create a phyloseq object

## combine all to a phyloseq object
physeqB <- phyloseq::phyloseq(
  phyloseq::otu_table(SVs$data, taxa_are_rows = T),
  phyloseq::phy_tree(tree$data),
  phyloseq::tax_table(taxtable),
  phyloseq::sample_data(metadata2) )

## Remove those annoying short codes in front of taxa names (i.e. p__ etc) as they
## dont look good in visualisation
tax_table(physeqB)[, "Kingdom"] <- str_replace_all(tax_table(physeqB)[, "Kingdom"], "d__", "")
tax_table(physeqB)[, "Phylum"] <- str_replace_all(tax_table(physeqB)[, "Phylum"], " p__", "")
tax_table(physeqB)[, "Class"] <- str_replace_all(tax_table(physeqB)[, "Class"], " c__", "")
tax_table(physeqB)[, "Order"] <- str_replace_all(tax_table(physeqB)[, "Order"], " o__", "")
tax_table(physeqB)[, "Family"] <- str_replace_all(tax_table(physeqB)[, "Family"], " f__", "")
tax_table(physeqB)[, "Genus"] <- str_replace_all(tax_table(physeqB)[, "Genus"], " g__", "")
tax_table(physeqB)[, "Species"] <- str_replace_all(tax_table(physeqB)[, "Species"], " s__", "")

physeqB

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 22271 taxa and 136 samples ]
## sample_data() Sample Data:       [ 136 samples by 6 sample variables ]
## tax_table()   Taxonomy Table:    [ 22271 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 22271 tips and 22189 internal nodes ]

Initial filtering

Filter out any ASVs that have less than 10 reads and are present in less than 3 samples. Then filter out any phyla that came up as uncharacterized and taxa that came up as mitochondria and chloroplast.

# filter as needed. This will be your final otu-table.
# Although you can still filter for specific analysis if needed.
# minimum of reads per feature
physeqB.flt = prune_taxa(taxa_sums(physeqB) >= 10, physeqB) #minimum reads per feature
physeqB.flt = filter_taxa(physeqB.flt, function(x) sum(x > 0) > (0.015*length(x)), TRUE) #minimum presense in x% of samples (136 * 0.015 = 2 samples)
#filter any phyla that have not been classified i.e. are "NA"
physeqB.flt = subset_taxa(physeqB.flt, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))

# Filter out non-bacteria, mitochondia and chloroplast taxa
physeqB.flt <- physeqB.flt %>%
  subset_taxa(Kingdom == "Bacteria" & Family  != "Mitochondria" & Class   != "Chloroplast"
 & !is.na(Phylum) & !Phylum %in% c("", "uncharacterized") )

physeqB.flt

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 4932 taxa and 136 samples ]
## sample_data() Sample Data:       [ 136 samples by 6 sample variables ]
## tax_table()   Taxonomy Table:    [ 4932 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 4932 tips and 4931 internal nodes ]

Rarefaction curve

Check if you cover enough depth for diversity analyses. In this case I had to remove references, blanks, autoclaved soils and soils treated with hydrogen peroxide before doing diversity analyses.

#filter
`%notin%` = Negate(`%in%`)
#physeqB.flt.norefs <-  prune_samples(sample_data(physeqB.flt)$Name %notin% c("41AutoclavedControlT0","42AutoclavedControlT0","43AutoclavedControlT0","44AutoclavedControlT0"), physeqB.flt)
physeqB.flt.norefs <-  prune_samples(sample_data(physeqB.flt)$Treatment %notin% c("Heat", "H2O2"), physeqB.flt)
physeqB.flt.norefs <-  prune_samples(sample_data(physeqB.flt.norefs)$Treatment != "Reference", physeqB.flt.norefs)
physeqB.flt.norefs <-  prune_samples(sample_data(physeqB.flt.norefs)$Treatment != "Blank", physeqB.flt.norefs)
physeqB.flt.norefs <-  phyloseq::prune_taxa(taxa_sums(physeqB.flt.norefs) != 0, physeqB.flt.norefs)

vegan::rarecurve(t(otu_table(physeqB.flt.norefs)), step=200, sample = min(colSums(otu_table(physeqB.flt.norefs))), label = FALSE, xlab = "Sample Size after filtering")

Prevelance table

Create a prevalence table to see which phyla have low prevalence

## Create a prevalence table to see which phyla have low prevalence
physeqB.flt.prevtab <-  prune_samples(sample_data(physeqB.flt)$Treatment != "Reference", physeqB.flt)
physeqB.flt.prevtab <-  prune_samples(sample_data(physeqB.flt.prevtab)$Treatment != "Blank", physeqB.flt.prevtab)
prevelancedf = apply(X = phyloseq::otu_table(physeqB.flt.prevtab),
                       MARGIN = 1,
                       FUN = function(x){sum(x > 0)})
prevelancedf = data.frame(Prevalence = prevelancedf,
                            TotalAbundance = taxa_sums(physeqB.flt.prevtab),
                            phyloseq::tax_table(physeqB.flt.prevtab))

prevelancedf <- plyr::ddply(prevelancedf, "Phylum", function(df1){
  data.frame(mean_prevalence=mean(df1$Prevalence), total_abundance=sum(df1$TotalAbundance,na.rm = T), stringsAsFactors = F)
  })
prevelancedf

##               Phylum mean_prevalence total_abundance
## 1    Acidobacteriota        7.214912          199741
## 2   Actinobacteriota        8.756279          708045
## 3     Armatimonadota        4.954545            2062
## 4       Bacteroidota        5.981818           20919
## 5   Bdellovibrionota        3.000000              30
## 6        Chloroflexi        9.567042          646439
## 7      Cyanobacteria        3.333333             135
## 8       Dependentiae        3.333333              72
## 9   Desulfobacterota        4.875000            1572
## 10        Firmicutes       14.844250         1101043
## 11   Gemmatimonadota        5.800000           17851
## 12       Myxococcota        5.909091           27926
## 13      Nitrospirota        4.235294            2300
## 14   Planctomycetota        6.509174          177762
## 15    Proteobacteria        8.713961          557906
## 16           RCP2-54        6.666667            3067
## 17 Verrucomicrobiota        8.205128           54178
## 18             WPS-2        5.870968           20024

Note:
This code is an amalgamation from various sources. Apart from putting it together into a pipeline I do not take credit for it.

Last updated on Jan 28, 2021