3 Rarefaction

To control for uneven sequencing effort in amplicon sequence analyses a common approach consist of normalizing the sampling depth by the random subsampling of sequences from each sample down to the lowest but reasonable sample’s depth (it is recommended to not go below 1000 sequences). This normalization method is refereed to as rarefying. While this approach is the subject of considerable debate and statistical criticism (see the 2014 PLOS Computational Biology paper, “Waste not, want not: why rarefying microbiome data is inadmissible” by McMurdie and Holmes) and alternative methods have been developed (DESeq2, cumulative sum scaling (CSS), and more…) rarefaction is still widely used and the most common normalizing method used in the literature.

3.1 Load required libraries

library(vegan)
library(phyloseq)
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(glue)

3.2 Getting ready

Load the data previously generated by the DADA2 workflow and combine into phyloseq object

# Define your path 
path = "~/project/domain/int_data"

asv = read.table(file = glue("{path}/raw_asv.csv"), sep=",", row.names=1, header=TRUE, check.names=FALSE) 
taxa = read.table(file = glue("{path}/raw_taxa.csv"), sep=",", row.names=1, header=TRUE) 
meta = read.table(file = glue("{path}/raw_meta.csv"), sep=",", row.names=1, header=TRUE) 
tree = ape::read.tree( file = glue("{path}/raw_tree.tree"))

# Merge into phyloseq object
ps=phyloseq(otu_table(asv, taxa_are_rows=TRUE), tax_table(as.matrix(taxa)), sample_data(meta), phy_tree(tree))

3.3 Filter low abundant taxa

One of the reasons to filter in this way is to avoid spending much time analyzing taxa that were only rarely seen. This also turns out to be a useful filter of noise (taxa that are actually just artifacts of the data collection process) A step that should probably be considered essential for datasets constructed via heuristic OTU-clustering methods, which are notoriously prone to generating spurious taxa. Here we are removing taxa with a relative abundance less than 0.005% as recommended by Bokulich et al., 2013

# Define threshold for low abundant taxa
minTotRelAbun = 5e-5
# Get total sum for each taxa
x = taxa_sums(ps)
# Identify taxa with a total sum greater than the defined threshold
keepTaxa = (x / sum(x)) > minTotRelAbun
# Filter out from the phyloseq object any taxa not identified in the keepTaxa object
prunedSet = prune_taxa(keepTaxa, ps)
# View how many taxa were removed by sample 
loss_taxa=data.frame(sample_sums(prunedSet), sample_sums(ps), (sample_sums(prunedSet)-sample_sums(ps)))

# Remove samples with 0 sequences !

3.4 Generate rarefaction curve

Generating rarefation curves is the most common method used to visualize the ASVs richness as a function of sample size. (number of sequences). Analyzing such graphics also helps identifying the optimal rarefying threshold.

To generate the rarefaction curves we are using the custom ggrare() function which runs much faster and also ultimately allows the user to modify certain parameter of the function if necessary.

ggrare <- function(physeq_object, step = 10, label = NULL, color = NULL, plot = TRUE, parallel = FALSE, se = TRUE) {
  
  x <- methods::as(phyloseq::otu_table(physeq_object), "matrix")
  if (phyloseq::taxa_are_rows(physeq_object)) { x <- t(x) }
  
  ## This script is adapted from vegan `rarecurve` function
  tot <- rowSums(x)
  S <- rowSums(x > 0)
  nr <- nrow(x)
  
  rarefun <- function(i) {
    cat(paste("rarefying sample", rownames(x)[i]), sep = "\n")
    n <- seq(1, tot[i], by = step)
    if (n[length(n)] != tot[i]) {
      n <- c(n, tot[i])
    }
    y <- vegan::rarefy(x[i, ,drop = FALSE], n, se = se)
    if (nrow(y) != 1) {
      rownames(y) <- c(".S", ".se")
      return(data.frame(t(y), Size = n, Sample = rownames(x)[i]))
    } else {
      return(data.frame(.S = y[1, ], Size = n, Sample = rownames(x)[i]))
    }
  }
  if (parallel) {
    out <- parallel::mclapply(seq_len(nr), rarefun, mc.preschedule = FALSE)
  } else {
    out <- lapply(seq_len(nr), rarefun)
  }
  df <- do.call(rbind, out)
  
  # Get sample data
  if (!is.null(phyloseq::sample_data(physeq_object, FALSE))) {
    sdf <- methods::as(phyloseq::sample_data(physeq_object), "data.frame")
    sdf$Sample <- rownames(sdf)
    data <- merge(df, sdf, by = "Sample")
    labels <- data.frame(x = tot, y = S, Sample = rownames(x))
    labels <- merge(labels, sdf, by = "Sample")
  }
  
  # Add, any custom-supplied plot-mapped variables
  if ( length(color) > 1 ) {
    data$color <- color
    names(data)[names(data) == "color"] <- deparse(substitute(color))
    color <- deparse(substitute(color))
  }
  
  if ( length(label) > 1 ) {
    labels$label <- label
    names(labels)[names(labels) == "label"] <- deparse(substitute(label))
    label <- deparse(substitute(label))
  }
  
  p <- ggplot2::ggplot(data = data,
                       ggplot2::aes_string(x = "Size",
                                           y = ".S",
                                           group = "Sample",
                                           color = color))
  
  p <- p + ggplot2::labs(x = "Number of sequence reads", y = "ASV richness")
  
  if (!is.null(label)) {
    p <- p + ggplot2::geom_text(data = labels,
                                ggplot2::aes_string(x = "x",
                                                    y = "y",
                                                    label = label,
                                                    color = color),
                                size = 4, hjust = 0, show.legend=FALSE)
  }
  
  p <- p + ggplot2::geom_line()
  if (se) { ## add standard error if available
    p <- p +
      ggplot2::geom_ribbon(ggplot2::aes_string(ymin = ".S - .se",
                                               ymax = ".S + .se",
                                               color = NULL,
                                               fill = color),
                           alpha = 0.2)
  }
  if (plot) {
    plot(p)
  }
  invisible(p)
}

Once the function is defined we can use it with the phyloseq object. If you wish to color the different curves based on a certain sample characteristics from you metadata table simply swap characteristic to the column name of interest.

p = ggrare(prunedSet, step = 20, color = "characteristic", label = "Sample", se = FALSE)

We can now either :

rarefy to the lowest sequence number greater than 1000;
or use the information from the graph to determine an optimal threshold.

For option 1

# Get dataframe of sequences per sample
sample_size = as.data.frame(sample_sums(prunedSet))
# Filter for the lowest number above 1000 
rare_value = sample_size[which.max((sample_size[,1] >= 1000)/sample_size[,1]),]
# Rarefy to value identified as rare_value
ps_rare=rarefy_even_depth(prunedSet, rare_value, rngseed = 112, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) 
# Confirm rarefaction as a sanity check 
sample_sums(ps_rare)

For option 2

# Define value for rarefying
rare_value=1019 
# Rarefy to value identified as rare_value
ps_rare=rarefy_even_depth(prunedSet, rare_value, rngseed = 112, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) 
# Confirm rarefaction as a sanity check 
sample_sums(ps_rare)

3.5 Customizing the graph

We are adding a red line which corresponds to the define rarefying threshold. Again, to color your curves based on a certain sample characteristics from you metadata table simply swap characteristic to the column name of interest. Specific colors can also be defined with function scale_color_manual.

p2 = p + geom_vline(xintercept=2530, color= "red", linetype='dashed') + # Add vertical red line and modifying axis limits
  theme(text = element_text(size=12)) + 
  theme_minimal() +
  labs(color="characteristic") +
  scale_color_manual(values = c("#72B3DA", "royalblue4", "#000000","#A0522D"))

3.6 Comparing richness and diversity between rarefied and un-rarefied samples

# Extract ASV count matrix from both phyloseq object 
asv_r=data.frame(otu_table(ps_rare))
asv=data.frame(otu_table(ps))
# Remove samples deleted after rarefaction
asv1=subset(asv, select=c(colnames(asv_r)))

# Calculate Shannon diversity index 
dShannon=ChaoShannon(asv1)
dShannon$rar=diversity(t(asv_r))
# Calculate species richness 
Sr=ChaoRichness(asv1)
Sr$rar=ChaoRichness(asv_r)$Observed

# ------------------ Plotting the results  ------------------ 

gshannon=ggplot(dShannon,aes(x=Estimator, y=rar))+
  stat_cor(method = "spearman", digits = 4, aes(label=paste(rr.label,..p.label..,sep='~`,`~'))) +
  geom_point()+
  theme_minimal()+xlab("Non-rarefied Shannon diversity")+ylab("Rarefied Shannon diversity") +
  geom_smooth(method = "lm", se = F, color = "#D6604D") +
  geom_abline(intercept = 0, slope = 1 , lty=2) +
  xlim(0,7) + ylim(0,7) +
  theme(text = element_text(size=12))


grichness <- ggplot(Sr,aes(x=Estimator, y=rar))+
  stat_cor(method = "spearman", digits = 4, aes(label=paste(rr.label,..p.label..,sep='~`,`~')), label.y=4680) +
  geom_point()+
  theme_minimal()+xlab("Non-rarefied ASV richness")+ylab("Rarefied ASV richness") +
  geom_smooth(method = "lm", se = F, color = "#D6604D") +
  geom_abline(intercept = 0, slope = 1 , lty=2) +
  xlim(0,5000) + ylim(0,5000) + 
  theme(text = element_text(size=12))


combined_plots=ggarrange(p2, ggarrange(gshannon, grichness, ncol=2, labels=c("b","c")), nrow=2, labels="a")


# ------------------ Save plot  ------------------ 

ggsave("rarefaction_curve.pdf", plot=combined_plots, width=16.5, height = 18,units="cm", path="/image/")


# ------------------ Save rarefied table  ------------------ 

write.csv(as.data.frame(as(tax_table(ps_rare), "matrix")), file = glue("{path}/rarefied_taxa.csv"))
write.csv(as.data.frame(as(otu_table(ps_rare), "matrix")),file = glue("{path}/rarefied_asv.csv"))
write.csv(as.data.frame(as(sample_data(ps_rare), "matrix")), file = glue("{path}/rarefied_meta.csv"))
tree.raw = phy_tree(ps_rare)
ape::write.tree(tree.raw , file = glue("{path}/rarefied_tree.tree"))

3.7 Complete code

You can copy-paste the following block of code inside a new markdown document. Code-chunks will be automatically generated and you can use the far right button ( ▶ ) to execute all of the code inside each chunk.

```{r}
# ----------- Load libraries -----------
library(vegan)
library(phyloseq)
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(glue)

# ----------- Define path -----------
path = "~/project/domain/int_data"

asv = read.table(file = glue("{path}/raw_asv.csv"), sep=",", row.names=1, header=TRUE, check.names=FALSE) 
taxa = read.table(file = glue("{path}/raw_taxa.csv"), sep=",", row.names=1, header=TRUE) 
meta = read.table(file = glue("{path}/raw_meta.csv"), sep=",", row.names=1, header=TRUE) 
tree = ape::read.tree( file = glue("{path}/raw_tree.tree"))

# ----------- Merge into phyloseq object -----------
ps=phyloseq(otu_table(asv, taxa_are_rows=TRUE), tax_table(as.matrix(taxa)), sample_data(meta), phy_tree(tree))

# ----------- Remove low abundant taxa -----------
# Set threshold
minTotRelAbun = 5e-5
# Get total sum for each taxa
x = taxa_sums(ps)
# Identify taxa with a total sum greater than the defined threshold
keepTaxa = (x / sum(x)) > minTotRelAbun
# Filter out from the phyloseq object any taxa not identified in the keepTaxa object
prunedSet = prune_taxa(keepTaxa, ps)
# View how many taxa were removed by sample 
loss_taxa=data.frame(sample_sums(prunedSet), sample_sums(ps), (sample_sums(prunedSet)-sample_sums(ps)))

# ----------- Create function -----------
ggrare = function(physeq_object, step = 10, label = NULL, color = NULL, plot = TRUE, parallel = FALSE, se = TRUE) {
  x = methods::as(phyloseq::otu_table(physeq_object), "matrix")
  if (phyloseq::taxa_are_rows(physeq_object)) { x <- t(x) }
  ## This script is adapted from vegan `rarecurve` function
  tot <- rowSums(x)
  S <- rowSums(x > 0)
  nr <- nrow(x)
  rarefun <- function(i) {
    cat(paste("rarefying sample", rownames(x)[i]), sep = "\n")
    n <- seq(1, tot[i], by = step)
    if (n[length(n)] != tot[i]) {
      n <- c(n, tot[i])
    }
    y <- vegan::rarefy(x[i, ,drop = FALSE], n, se = se)
    if (nrow(y) != 1) {
      rownames(y) <- c(".S", ".se")
      return(data.frame(t(y), Size = n, Sample = rownames(x)[i]))
    } else {
      return(data.frame(.S = y[1, ], Size = n, Sample = rownames(x)[i]))
    }
  }
  if (parallel) {
    out <- parallel::mclapply(seq_len(nr), rarefun, mc.preschedule = FALSE)
  } else {
    out <- lapply(seq_len(nr), rarefun)
  }
  df <- do.call(rbind, out)
  # Get sample data
  if (!is.null(phyloseq::sample_data(physeq_object, FALSE))) {
    sdf <- methods::as(phyloseq::sample_data(physeq_object), "data.frame")
    sdf$Sample <- rownames(sdf)
    data <- merge(df, sdf, by = "Sample")
    labels <- data.frame(x = tot, y = S, Sample = rownames(x))
    labels <- merge(labels, sdf, by = "Sample")
  }
  # Add, any custom-supplied plot-mapped variables
  if ( length(color) > 1 ) {
    data$color <- color
    names(data)[names(data) == "color"] <- deparse(substitute(color))
    color <- deparse(substitute(color))
  }
  if ( length(label) > 1 ) {
    labels$label <- label
    names(labels)[names(labels) == "label"] <- deparse(substitute(label))
    label <- deparse(substitute(label))
  }
  p <- ggplot2::ggplot(data = data,
                       ggplot2::aes_string(x = "Size",
                                           y = ".S",
                                           group = "Sample",
                                           color = color))
  p <- p + ggplot2::labs(x = "Number of sequence reads", y = "ASV richness")
  if (!is.null(label)) {
    p <- p + ggplot2::geom_text(data = labels,
                                ggplot2::aes_string(x = "x",
                                                    y = "y",
                                                    label = label,
                                                    color = color),
                                size = 4, hjust = 0, show.legend=FALSE)
  }
  p <- p + ggplot2::geom_line()
  if (se) { ## add standard error if available
    p <- p +
      ggplot2::geom_ribbon(ggplot2::aes_string(ymin = ".S - .se",
                                               ymax = ".S + .se",
                                               color = NULL,
                                               fill = color),
                           alpha = 0.2)
  }
  if (plot) {
    plot(p)
  }
  invisible(p)
}

# ----------- Plot rarefaction curves -----------
p = ggrare(prunedSet, step = 20, color = "characteristic", label = "Sample", se = FALSE) 
```
OPTION 1 : rarefy to the lowest sequence number greater than 1000
```{r}
# Get dataframe of sequences per sample
sample_size = as.data.frame(sample_sums(prunedSet))
# Filter for the lowest number above 1000 
rare_value = sample_size[which.max((sample_size[,1] >= 1000)/sample_size[,1]),]
# Rarefy to value identified as rare_value
ps_rare=rarefy_even_depth(prunedSet, rare_value, rngseed = 112, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) 
# Confirm rarefaction as a sanity check 
sample_sums(ps_rare)
```
OPTION 2 : Use the information from the graph to determine an optimal threshold 
```{r}
# Define value for rarefying
rare_value=1019 
# Rarefy to value identified as rare_value
ps_rare=rarefy_even_depth(prunedSet, rare_value, rngseed = 112, replace = FALSE, trimOTUs = TRUE, verbose = TRUE) 
# Confirm rarefaction as a sanity check 
sample_sums(ps_rare)
``` 

Save rarefied table
```{r}
write.csv(as.data.frame(as(tax_table(ps_rare), "matrix")), file = glue("{path}/rarefied_taxa.csv"))
write.csv(as.data.frame(as(otu_table(ps_rare), "matrix")),file = glue("{path}/rarefied_asv.csv"))
write.csv(as.data.frame(as(sample_data(ps_rare), "matrix")), file = glue("{path}/rarefied_meta.csv"))
tree.raw = phy_tree(ps_rare)
ape::write.tree(tree.raw , file = glue("{path}/rarefied_tree.tree"))
```