diff phyloseq_filter.R @ 0:910739de7b80 draft

planemo upload commit 555f91999a1164a4420492126fa7713c89e3c5f5-dirty
author simon-gladman
date Wed, 05 Sep 2018 22:21:43 -0400
parents
children 9fbb104e16d9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/phyloseq_filter.R	Wed Sep 05 22:21:43 2018 -0400
@@ -0,0 +1,328 @@
+library('getopt')
+library('ape')
+library('ggplot2')
+suppressPackageStartupMessages(library('phyloseq'))
+library(biomformat)
+library(plyr)
+Sys.setenv("DISPLAY"=":1")
+library(biomformat)
+suppressPackageStartupMessages(library(metagenomeSeq))
+suppressPackageStartupMessages(library("doParallel"))
+ncores = ceiling(detectCores() * 0.8)
+registerDoParallel(cores=ncores)
+
+options(warn=-1)
+
+theme_set(theme_bw())
+## ggplot 
+# http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
+# https://gist.github.com/Mikeyj/5429538
+# http://microbiome-tutorials.readthedocs.io/en/latest/_static/Composition.html
+# https://rstudio-pubs-static.s3.amazonaws.com/268156_d3ea37937f4f4469839ab6fa2c483842.html#otus_that_differ_by (stacked bar plot)
+
+## color
+## http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/ 
+
+#http://saml.rilspace.com/creating-a-galaxy-tool-for-r-scripts-that-output-images-and-pdfs
+#http://joey711.github.io/phyloseq-demo/phyloseq-demo.html
+option_specification = matrix(c(
+   'otu_table','o',2,'character',
+   'tax_table','t',2,'character',
+  'meta_table','m',2,'character',
+        'biom','b',2,'character',
+      'filter','f',2,'numeric',
+     'kingdom','k',2,'character',
+      'cutoff','c',2,'numeric',
+        'keep','p',2,'numeric',
+     'outbiom','h',2,'character',
+      'outdir','d',2,'character',
+    'htmlfile','w',2,'character'
+),byrow=TRUE,ncol=4);
+
+
+options <- getopt(option_specification);
+options(bitmapType="cairo")
+
+if (!is.null(options$outdir)) {
+  # Create the directory
+  dir.create(options$outdir,FALSE)
+}
+
+
+
+cutoff_value<-options$cutoff
+### select a kingdom for phyloseq plot (e.g., "phylum")
+#kingdom_str<-colnames(tax_table)[options$kingdom]
+kingdom_str<-options$kingdom
+keep<-options$keep
+filter<-options$filter
+
+### prepare the directory and file name
+pdffile <- gsub("[ ]+", "", paste(options$outdir,"/pdffile.pdf"))
+pngfile_before_filtering <- gsub("[ ]+", "", paste(options$outdir,"/barplot_before_filtering.png"))
+pngfile_after_filtering <- gsub("[ ]+", "", paste(options$outdir,"/barplot_after_filtering.png"))
+pngfile_pre_phyla_filtering <- gsub("[ ]+", "", paste(options$outdir,"/barplot_before_phyla_filtering.png"))
+pngfile_post_phyla_filtering<- gsub("[ ]+", "", paste(options$outdir,"/barplot_after_phyla_filtering.png"))
+htmlfile <- gsub("[ ]+", "", paste(options$htmlfile))
+
+### This function accepts different two different type of BIOM file format
+readBIOM<-function(inBiom){
+	tryCatch({
+		phyloseq_obj<-import_biom(inBiom,parallel=TRUE)
+		return(phyloseq_obj)
+		},
+		error=function(e){
+		biom_obj<-read_biom(inBiom)
+
+		otu_matrix = as(biom_data(biom_obj), "matrix")
+		OTU_TABLE = otu_table(otu_matrix, taxa_are_rows=TRUE)
+		
+		taxonomy_matrix = as.matrix(observation_metadata(biom_obj), rownames.force=TRUE)	
+		TAXONOMY_TABLE = tax_table(taxonomy_matrix)	
+		
+		metadata.temp<-sample_metadata(biom_obj)
+		METADATA_TABLE<-plyr::ldply(metadata.temp, rbind)
+        rownames(METADATA_TABLE)<-as.character(METADATA_TABLE$.id)
+
+		phyloseq_obj = phyloseq(OTU_TABLE, TAXONOMY_TABLE,sample_data(METADATA_TABLE))
+		return(phyloseq_obj)
+		}
+	)
+}
+
+
+create_PDF<-function(pdf_file,OTU_DATAFRAME_BEFORE_FILTERING,OTU_DATAFRAME_AFTER_FILTERING,physeq_pre_phyla_filtering,physeq_post_phyla_filtering,kingdom_str,htmlfile,pngfile_before_filtering,pngfile_after_filtering,pngfile_pre_phyla_filtering,pngfile_post_phyla_filtering){
+    pdf(pdf_file);
+    barplot_before_filtering<-ggplot(OTU_DATAFRAME_BEFORE_FILTERING,aes(rownames(OTU_DATAFRAME_BEFORE_FILTERING))) + 
+                                  geom_bar(aes(weight=Abundance),fill="tomato3") +
+                                  labs(title="Sample Depth Bar Chart",subtitle="Sample Vs Abundance (Before Filtering)",caption="source: Input Biom") +
+                                  xlab("Sample") +
+                                  ylab("Abundance") +
+                                  theme(axis.text.x=element_text(angle=65,vjust=0.6))
+    print(barplot_before_filtering)
+
+    barplot_after_filtering<-ggplot(OTU_DATAFRAME_AFTER_FILTERING,aes(rownames(OTU_DATAFRAME_AFTER_FILTERING))) + 
+                                  geom_bar(aes(weight=Abundance),fill="blue") + 
+                                  labs(title="Sample Depth Bar Chart",subtitle="Sample Vs Abundance (After Filtering)", caption="source: Input Biom") +
+                                  xlab("Sample") +
+                                  ylab("Abundance") +
+                                  theme(axis.text.x=element_text(angle=65,vjust=0.6))
+    print(barplot_after_filtering)
+
+    barplot_pre_phyla_filtering<-plot_bar(physeq_pre_phyla_filtering, x=colnames(sample_data(physeq_pre_phyla_filtering))[1], fill=kingdom_str) + 
+                                 geom_bar(stat="identity", position="stack") +
+                                 labs(title=paste("Sample Depth Bar Chart",kingdom_str,sep=":"),subtitle="Sample Vs Abundance (Pre phyla Filtering)",caption="source: Input Biom") +
+                                 xlab("Sample") +
+                                 ylab("Abundance") +
+                                 theme(axis.text.x=element_text(angle=90,vjust=0.6)) +
+                                 scale_fill_hue()
+    print(barplot_pre_phyla_filtering)
+
+    barplot_post_phyla_filtering<-plot_bar(physeq_post_phyla_filtering, x=colnames(sample_data(physeq_post_phyla_filtering))[1], fill=kingdom_str) +
+                                  geom_bar(stat="identity", position="stack") +
+                                  labs(title=paste("Sample Depth Bar Chart",kingdom_str,sep=":"),subtitle="Sample Vs Abundance (Post phyla Filtering)",caption="source: Input Biom") +
+                                  xlab("Sample") +
+                                  ylab("Abundance") +
+                                  theme(axis.text.x=element_text(angle=90,vjust=0.6)) +
+                                  scale_fill_hue()
+    print(barplot_post_phyla_filtering)
+    garbage<-dev.off();
+
+    #png('barplot_before_filtering.png')
+    bitmap(pngfile_before_filtering,"png16m")
+    barplot_before_filtering_png<-ggplot(OTU_DATAFRAME_BEFORE_FILTERING,aes(rownames(OTU_DATAFRAME_BEFORE_FILTERING))) +
+                                  geom_bar(aes(weight=Abundance),fill="tomato3") +
+                                  labs(title="Sample Depth Bar Chart",subtitle="Sample Vs Abundance (Before Filtering)",caption="source: Input Biom") +
+                                  xlab("Sample") +
+                                  ylab("Abundance") +
+                                  theme(axis.text.x=element_text(angle=65,vjust=0.6))
+    print(barplot_before_filtering_png)
+    garbage<-dev.off()
+
+    #png('barplot_after_filtering.png')
+    bitmap(pngfile_after_filtering,"png16m")
+    barplot_after_filtering_png<-ggplot(OTU_DATAFRAME_AFTER_FILTERING,aes(rownames(OTU_DATAFRAME_AFTER_FILTERING))) +
+                                 geom_bar(aes(weight=Abundance),fill="blue") +
+                                 labs(title="Sample Depth Bar Chart",subtitle="Sample Vs Abundance (After Filtering)", caption="source: Input Biom") +
+                                 xlab("Sample") +
+                                 ylab("Abundance") +
+                                 theme(axis.text.x=element_text(angle=65,vjust=0.6))
+    print(barplot_after_filtering_png)
+    garbage<-dev.off()
+
+    #png('barplot_pre_phyla_filtering.png')
+    bitmap(pngfile_pre_phyla_filtering,"png16m")
+    print(sample_data(physeq_pre_phyla_filtering))
+    barplot_pre_phyla_filtering<-plot_bar(physeq_pre_phyla_filtering, x=colnames(sample_data(physeq_pre_phyla_filtering))[1], fill=kingdom_str) +
+                                 geom_bar(stat="identity", position="stack") +
+                                 labs(title=paste("Sample Depth Bar Chart",kingdom_str,sep=":"),subtitle="Sample Vs Abundance (Pre Phyla Filtering)",caption="source: Input Biom") +
+                                 xlab("Sample") +
+                                 ylab("Abundance") +
+                                 theme(axis.text.x=element_text(angle=90,vjust=0.6)) +
+                                 scale_fill_hue()
+    print(barplot_pre_phyla_filtering)
+    garbage<-dev.off()
+
+    #png('barplot_post_phyla_filtering.png')
+    bitmap(pngfile_post_phyla_filtering,"png16m")
+    barplot_post_phyla_filtering<-plot_bar(physeq_post_phyla_filtering, x=colnames(sample_data(physeq_post_phyla_filtering))[1], fill=kingdom_str) +
+                                  geom_bar(stat="identity", position="stack") +
+                                  labs(title=paste("Sample Depth Bar Chart",kingdom_str,sep=":"),subtitle="Sample Vs Abundance (Post Phyla Filtering)",caption="source: Input Biom") +
+                                  xlab("Sample") +
+                                  ylab("Abundance") +
+                                  theme(axis.text.x=element_text(angle=90,vjust=0.6)) +
+                                  scale_fill_hue()
+    print(barplot_post_phyla_filtering)
+    garbage<-dev.off()
+
+    create_HTML(htmlfile)
+}
+
+create_HTML<-function(htmlfile){
+    htmlfile_handle <- file(htmlfile)
+    html_output = c('<html><body>',
+                    '<table align="center>',
+                    '<tr>',
+                    '<td valign="middle" style="vertical-align:middle;">',
+                        '<a href="pdffile.pdf"><img src="barplot_before_filtering.png"/></a>',
+                    '</td>',
+                    '</tr>',
+                    '<tr>',
+                    '<td valign="middle" style="vertical-align:middle;">',
+                    '<a href="pdffile.pdf"><img src="barplot_after_filtering.png"/></a>',
+                    '</td>',
+                    '</tr>',
+                    '<tr>',
+                    '<td valign="middle" style="vertical-align:middle;">',
+                    '<a href="pdffile.pdf"><img src="barplot_before_phyla_filtering.png"/></a>',
+                    '</td>',
+                    '</tr>',
+                    '<tr>',
+                    '<td valign="middle" style="vertical-align:middle;">',
+                    '<a href="pdffile.pdf"><img src="barplot_after_phyla_filtering.png"/></a>',
+                    '</td>',
+                    '</tr>',
+                    '</table>',
+                    '</html></body>');
+     writeLines(html_output, htmlfile_handle);
+     close(htmlfile_handle);
+}
+
+convert_phyloseq_otutable_to_dataframe<-function(physeq_obj){
+    temp.df<-data.frame(otu_table(physeq_obj))   
+    temp.df.counts<-as.data.frame(colSums(temp.df))
+    colnames(temp.df.counts)<-"Abundance"
+    print(temp.df.counts)
+    return(temp.df.counts)
+}
+
+if(!is.null(options$biom)){
+    
+    #physeq<-import_biom(options$biom)
+    physeq<-readBIOM(options$biom)
+    
+    if(length(rank_names(physeq)) == 8){
+        tax_table(physeq) <- tax_table(physeq)[,-1]
+        colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
+    } else {
+        colnames(tax_table(physeq)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species")
+    }
+    
+    ### select column name from sample table for nmds plot
+    ## which(colnames(sample_data(biom)) == "vegetation_type_id")
+    #category_type<-colnames(sample_data(physeq))[options$subset]
+    #category_type <- options$subset
+
+    ### obtain the unique value in the selected column from sample table
+    #category_option<-unique(sample_data(physeq))[,options$subset]
+
+}else{
+
+    ### read the data into correct data type to create phyloseq object
+    otu_table<-as.matrix(read.table(options$otu_table,header=T,sep="\t"))
+    tax_table<-as.matrix(read.table(options$tax_table,header=T,sep="\t"))
+    sample_table<-read.table(options$meta_table,header=T,sep="\t",stringsAsFactors=F)
+
+
+    ### select column name from sample table for nmds plot
+    #category_type<-colnames(sample_table)[options$category]
+
+    ### obtain the unique value in the selected column from sample table
+    #category_option<-unique(sample_table[,options$category])
+
+
+    ### create a sample object for phyloseq
+    sample_object<-sample_data(sample_table)
+
+    ### create otu object for phyloseq
+    OTU<-otu_table(otu_table, taxa_are_rows = TRUE)
+
+    ### create tax object for phyloseq
+    TAX<-tax_table(tax_table)
+
+    ### create a phyloseq object
+    physeq = phyloseq(OTU,TAX,sample_object)
+}
+    ### make the first column to be the sample ID in the phyloseq object
+   
+    firstColumn = sample_data(physeq)[,1]
+    row_names = rownames(sample_data(physeq))
+    check = all( firstColumn == row_names)
+    if(!check){
+        sample_data(physeq) <- cbind(SampleID= rownames(sample_data(physeq)),sample_data(physeq))
+    }
+
+    
+    ### extract otu table from phyloseq object
+    before_filtering_dataframe_sampleCounts<-convert_phyloseq_otutable_to_dataframe(physeq)
+    
+    ### filtering OTUs based on cutoff value (e.g., 5)
+    #physeq_temp =genefilter_sample(physeq, filterfun_sample(function(x) x > cutoff_value), A=0.1*nsamples(physeq))
+    physeq_temp =genefilter_sample(physeq, filterfun_sample(function(x) x > cutoff_value), A=filter*nsamples(physeq))
+    
+    ### phyloseq object after filtered
+    physeq_filter = prune_taxa(physeq_temp, physeq)
+    
+  
+    
+    ## Transform to even sampling depth
+    #physeq_filter = transform_sample_counts(physeq_filter, function(x) 1E6 * x/sum(x))
+
+    #after_filtering.dataframe<-data.frame(otu_table(physeq_filter))
+    #after_filtering_dataframe_sampleCounts<-as.data.frame(colSums(after_filtering.dataframe))
+    #colnames(after_filtering_dataframe_sampleCounts)<-"Abundance"
+    after_filtering_dataframe_sampleCounts<-convert_phyloseq_otutable_to_dataframe(physeq_filter)
+   
+#    create_PDF(pdffile,before_filtering_dataframe_sampleCounts,after_filtering_dataframe_sampleCounts,htmlfile,pngfile_before_filtering,pngfile_after_filtering)
+
+    # kingdom_str <- as.numeric(kingdom_str)
+    ## Keep only the most abundant five phyla
+
+    ### Phyla - Pre transformation (Transform to even sampling depth)
+    
+    #physeq_filter_pre_transform = tapply(taxa_sums(physeq_filter), tax_table(physeq_filter)[, kingdom_str], sum,na.rm=TRUE)
+    
+    phylum.sum_pre_transform= tapply(taxa_sums(physeq_filter), tax_table(physeq_filter)[, kingdom_str], sum,na.rm=TRUE)
+    
+    topphyla_pre_transform = names(sort(phylum.sum_pre_transform, TRUE))[1:keep]
+    
+    physeq_filter_pre_transform = prune_taxa((tax_table(physeq_filter)[, kingdom_str] %in% topphyla_pre_transform), physeq_filter)
+   
+    ### Phyla - Post Transformation (Transform to even sampling depth)
+    physeq_filter_post_transform = transform_sample_counts(physeq_filter, function(x) 1E6 * x/sum(x))
+
+    phylum.sum_post_transform = tapply(taxa_sums(physeq_filter_post_transform), tax_table(physeq_filter_post_transform)[, kingdom_str], sum,na.rm=TRUE)
+    ### number of most abundance phyla to keep
+    topphyla_post_transform = names(sort(phylum.sum_post_transform, TRUE))[1:keep]
+
+    physeq_filter_post_transform = prune_taxa((tax_table(physeq_filter_post_transform)[, kingdom_str] %in% topphyla_post_transform), physeq_filter_post_transform)
+
+
+    create_PDF(pdffile,before_filtering_dataframe_sampleCounts,after_filtering_dataframe_sampleCounts,physeq_filter_pre_transform,physeq_filter_post_transform,kingdom_str,htmlfile,pngfile_before_filtering,pngfile_after_filtering,pngfile_pre_phyla_filtering,pngfile_post_phyla_filtering)
+
+    ### convert phyloseq object to metagenomeSeq - preparing for BIOM output
+    metagenomeSeq_obj <- phyloseq_to_metagenomeSeq(physeq_filter_post_transform)
+    metagenomeSeq_biom <- MRexperiment2biom(metagenomeSeq_obj)
+
+    ## write biom file
+    write_biom(metagenomeSeq_biom, biom_file=options$outbiom)