changeset 89:875f080136b6 draft

Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
author yhoogstrate
date Wed, 28 Jan 2015 09:22:54 -0500
parents 6be55ae6196c
children f87938c392bf
files edgeR_Differential_Gene_Expression.xml
diffstat 1 files changed, 87 insertions(+), 32 deletions(-) [+]
line wrap: on
line diff
--- a/edgeR_Differential_Gene_Expression.xml	Wed Nov 19 10:02:17 2014 -0500
+++ b/edgeR_Differential_Gene_Expression.xml	Wed Jan 28 09:22:54 2015 -0500
@@ -1,5 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.a">
+<tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.b">
 	<description>RNA-Seq gene expression analysis using edgeR (R package)</description>
 	
 	<requirements>
@@ -36,8 +36,14 @@
 				/dev/null
 			#end if
 			
-			#if $output_MDSplot:
-				$output_MDSplot
+			#if $output_MDSplot_logFC:
+				$output_MDSplot_logFC
+			#else:
+				/dev/null
+			#end if
+			
+			#if $output_MDSplot_bcv:
+				$output_MDSplot_bcv
 			#else:
 				/dev/null
 			#end if
@@ -91,11 +97,18 @@
 			#if $output_format_images.value == "png":
 				echo "Converting PDF figures to PNG" ; 
 				
-				#if $output_MDSplot:
-					#set $output_MDSplot_tmp = str($output_MDSplot)+".png"
+				#if $output_MDSplot_logFC:
+					#set $output_MDSplot_logFC_tmp = str($output_MDSplot_logFC)+".png"
 					
-					gm convert $output_MDSplot $output_MDSplot_tmp ; 
-					mv $output_MDSplot_tmp $output_MDSplot ; 
+					gm convert $output_MDSplot_logFC $output_MDSplot_logFC_tmp ; 
+					mv $output_MDSplot_logFC_tmp $output_MDSplot_logFC ; 
+				#end if
+				
+				#if $output_MDSplot_bcv:
+					#set $output_MDSplot_bcv_tmp = str($output_MDSplot_bcv)+".png"
+					
+					gm convert $output_MDSplot_bcv $output_MDSplot_bcv_tmp ; 
+					mv $output_MDSplot_bcv_tmp $output_MDSplot_bcv ; 
 				#end if
 				
 				#if $output_BCVplot:
@@ -146,7 +159,6 @@
 			grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; 
 			
 			cat stderr.txt >&amp;2
-			
 	</command>
 	
 	<inputs>
@@ -159,11 +171,12 @@
 		
 		<param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
 			<option value="make_output_raw_counts">Raw counts table</option>
-			<option value="make_output_MDSplot">MDS-plot</option>
+			<option value="make_output_MDSplot_logFC">MDS-plot (logFC-method)</option>
+			<option value="make_output_MDSplot_bcv">MDS-plot (BCV-method; much slower)</option>
 			<option value="make_output_BCVplot">BCV-plot</option>
 			<option value="make_output_MAplot">MA-plot</option>
 			<option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
-			<option value="make_output_hierarchical_clustering_plot">Hierarchical custering</option>
+			<option value="make_output_hierarchical_clustering_plot">Hierarchical custering (under contstruction)</option>
 			<option value="make_output_heatmap_plot">Heatmap</option>
 			
 			<option value="make_output_R_stdout">R stdout</option>
@@ -198,14 +211,15 @@
 output_xpkm                         = args[7]							##FPKM file - yet to be implemented
 
 output_raw_counts                   = args[8]
-output_MDSplot                      = args[9]
-output_BCVplot                      = args[10]
-output_MAplot                       = args[11]
-output_PValue_distribution_plot     = args[12]
-output_hierarchical_clustering_plot = args[13]
-output_heatmap_plot                 = args[14]
-output_RData_obj                    = args[15]
-output_format_images                = args[16]
+output_MDSplot_logFC                = args[9]
+output_MDSplot_bcv                  = args[10]
+output_BCVplot                      = args[11]
+output_MAplot                       = args[12]
+output_PValue_distribution_plot     = args[13]
+output_hierarchical_clustering_plot = args[14]
+output_heatmap_plot                 = args[15]
+output_RData_obj                    = args[16]
+output_format_images                = args[17]
 
 
 library(edgeR)
@@ -218,8 +232,8 @@
 colnames(design_matrix) &lt;- make.names(colnames(design_matrix))
 
 for(i in 1:ncol(design_matrix)) {
-  old = design_matrix[,i]
-  design_matrix[,i] = make.names(design_matrix[,i])
+  old &lt;- design_matrix[,i]
+  design_matrix[,i] &lt;- make.names(design_matrix[,i])
   if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
     print("Renaming of factors:")
     print(old)
@@ -236,7 +250,7 @@
 read_counts &lt;- expression_matrix[,columns]
 
 ## 2) In the design matrix, you only want to have samples of which you really have the counts
-columns &lt;- match(colnames(expression_matrix),rownames(design_matrix))
+columns &lt;- match(colnames(read_counts),rownames(design_matrix))
 columns &lt;- columns[!is.na(columns)]
 design_matrix &lt;- design_matrix[columns,,drop=FALSE]
 
@@ -251,6 +265,10 @@
 }
 
 
+## sorting expression matrix with the order of the read_counts
+##order &lt;- match(colnames(read_counts) , rownames(design_matrix))
+##read_counts_ordered  &lt;- read_counts[,order2]
+
 empty_samples &lt;- apply(read_counts,2,function(x) sum(x) == 0)
 if(sum(empty_samples) > 0) {
   write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
@@ -283,31 +301,58 @@
   dge &lt;- estimateGLMTagwiseDisp(dge,design)
   
   
-  if(output_MDSplot != "/dev/null") {
-    write("Creating MDS plot",stdout())
-    ##points &lt;- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
-    points &lt;- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
+  if(output_MDSplot_logFC != "/dev/null") {
+    write("Creating MDS plot (logFC method)",stdout())
+    points &lt;- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
     dev.off()# Kill it
     
     if(output_format_images == "pdf" || output_format_images == "png") {
-      pdf(output_MDSplot)
+      pdf(output_MDSplot_logFC,height=14,width=14)
     } else if(output_format_images == "svg") {
-      svg(output_MDSplot)
+      svg(output_MDSplot_logFC,height=14,width=14)
     }
     ## else {
-    ##  png(output_MDSplot)
+    ##  png(output_MDSplot_logFC)
     ##}
+    ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings
     
     diff_x &lt;- abs(max(points\$x)-min(points\$x))
     diff_y &lt;-(max(points\$y)-min(points\$y))
-    plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
+    plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2")
     points(points\$x,points\$y,pch=20)
-    text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
+    text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4)
     rm(diff_x,diff_y)
     
     dev.off()
   }
   
+  if(output_MDSplot_bcv != "/dev/null") {
+    write("Creating MDS plot (bcv method)",stdout())
+    pdf("/home/youri/Desktop/bcvmds.pdf")
+    points &lt;- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
+    dev.off()# Kill it
+    
+    if(output_format_images == "pdf" || output_format_images == "png") {
+      pdf(output_MDSplot_bcv,height=14,width=14)
+    } else if(output_format_images == "svg") {
+      svg(output_MDSplot_bcv,height=14,width=14)
+    }
+    ## else {
+    ##  png(output_MDSplot_bcv)
+    ##}
+    ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings
+    
+    diff_x &lt;- abs(max(points\$x)-min(points\$x))
+    diff_y &lt;-(max(points\$y)-min(points\$y))
+    plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2")
+    points(points\$x,points\$y,pch=20)
+    text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4)
+    rm(diff_x,diff_y)
+    
+    dev.off()
+  }
+  
+  
   if(output_BCVplot != "/dev/null") {
     write("Creating Biological coefficient of variation plot",stdout())
     
@@ -427,8 +472,18 @@
 			<filter>outputs and ("make_output_raw_counts" in outputs)</filter>
 		</data>
 		
-		<data format="png" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
-			<filter>outputs and ("make_output_MDSplot" in outputs)</filter>
+		<data format="png" name="output_MDSplot_logFC" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (logFC method)">
+			<filter>outputs and ("make_output_MDSplot_logFC" in outputs)</filter>
+			
+			<change_format>
+				<when input="output_format_images" value="png" format="png" />
+				<when input="output_format_images" value="pdf" format="pdf" />
+				<when input="output_format_images" value="svg" format="svg" />
+			</change_format>
+		</data>
+		
+		<data format="png" name="output_MDSplot_bcv" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (bcv method)">
+			<filter>outputs and ("make_output_MDSplot_bcv" in outputs)</filter>
 			
 			<change_format>
 				<when input="output_format_images" value="png" format="png" />