Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 89:875f080136b6 draft
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
author | yhoogstrate |
---|---|
date | Wed, 28 Jan 2015 09:22:54 -0500 |
parents | 6be55ae6196c |
children | f87938c392bf |
files | edgeR_Differential_Gene_Expression.xml |
diffstat | 1 files changed, 87 insertions(+), 32 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_Differential_Gene_Expression.xml Wed Nov 19 10:02:17 2014 -0500 +++ b/edgeR_Differential_Gene_Expression.xml Wed Jan 28 09:22:54 2015 -0500 @@ -1,5 +1,5 @@ <?xml version="1.0" encoding="UTF-8"?> -<tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.a"> +<tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.b"> <description>RNA-Seq gene expression analysis using edgeR (R package)</description> <requirements> @@ -36,8 +36,14 @@ /dev/null #end if - #if $output_MDSplot: - $output_MDSplot + #if $output_MDSplot_logFC: + $output_MDSplot_logFC + #else: + /dev/null + #end if + + #if $output_MDSplot_bcv: + $output_MDSplot_bcv #else: /dev/null #end if @@ -91,11 +97,18 @@ #if $output_format_images.value == "png": echo "Converting PDF figures to PNG" ; - #if $output_MDSplot: - #set $output_MDSplot_tmp = str($output_MDSplot)+".png" + #if $output_MDSplot_logFC: + #set $output_MDSplot_logFC_tmp = str($output_MDSplot_logFC)+".png" - gm convert $output_MDSplot $output_MDSplot_tmp ; - mv $output_MDSplot_tmp $output_MDSplot ; + gm convert $output_MDSplot_logFC $output_MDSplot_logFC_tmp ; + mv $output_MDSplot_logFC_tmp $output_MDSplot_logFC ; + #end if + + #if $output_MDSplot_bcv: + #set $output_MDSplot_bcv_tmp = str($output_MDSplot_bcv)+".png" + + gm convert $output_MDSplot_bcv $output_MDSplot_bcv_tmp ; + mv $output_MDSplot_bcv_tmp $output_MDSplot_bcv ; #end if #if $output_BCVplot: @@ -146,7 +159,6 @@ grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; cat stderr.txt >&2 - </command> <inputs> @@ -159,11 +171,12 @@ <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes"> <option value="make_output_raw_counts">Raw counts table</option> - <option value="make_output_MDSplot">MDS-plot</option> + <option value="make_output_MDSplot_logFC">MDS-plot (logFC-method)</option> + <option value="make_output_MDSplot_bcv">MDS-plot (BCV-method; much slower)</option> <option value="make_output_BCVplot">BCV-plot</option> <option value="make_output_MAplot">MA-plot</option> <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option> - <option value="make_output_hierarchical_clustering_plot">Hierarchical custering</option> + <option value="make_output_hierarchical_clustering_plot">Hierarchical custering (under contstruction)</option> <option value="make_output_heatmap_plot">Heatmap</option> <option value="make_output_R_stdout">R stdout</option> @@ -198,14 +211,15 @@ output_xpkm = args[7] ##FPKM file - yet to be implemented output_raw_counts = args[8] -output_MDSplot = args[9] -output_BCVplot = args[10] -output_MAplot = args[11] -output_PValue_distribution_plot = args[12] -output_hierarchical_clustering_plot = args[13] -output_heatmap_plot = args[14] -output_RData_obj = args[15] -output_format_images = args[16] +output_MDSplot_logFC = args[9] +output_MDSplot_bcv = args[10] +output_BCVplot = args[11] +output_MAplot = args[12] +output_PValue_distribution_plot = args[13] +output_hierarchical_clustering_plot = args[14] +output_heatmap_plot = args[15] +output_RData_obj = args[16] +output_format_images = args[17] library(edgeR) @@ -218,8 +232,8 @@ colnames(design_matrix) <- make.names(colnames(design_matrix)) for(i in 1:ncol(design_matrix)) { - old = design_matrix[,i] - design_matrix[,i] = make.names(design_matrix[,i]) + old <- design_matrix[,i] + design_matrix[,i] <- make.names(design_matrix[,i]) if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) { print("Renaming of factors:") print(old) @@ -236,7 +250,7 @@ read_counts <- expression_matrix[,columns] ## 2) In the design matrix, you only want to have samples of which you really have the counts -columns <- match(colnames(expression_matrix),rownames(design_matrix)) +columns <- match(colnames(read_counts),rownames(design_matrix)) columns <- columns[!is.na(columns)] design_matrix <- design_matrix[columns,,drop=FALSE] @@ -251,6 +265,10 @@ } +## sorting expression matrix with the order of the read_counts +##order <- match(colnames(read_counts) , rownames(design_matrix)) +##read_counts_ordered <- read_counts[,order2] + empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) if(sum(empty_samples) > 0) { write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) @@ -283,31 +301,58 @@ dge <- estimateGLMTagwiseDisp(dge,design) - if(output_MDSplot != "/dev/null") { - write("Creating MDS plot",stdout()) - ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot - points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + if(output_MDSplot_logFC != "/dev/null") { + write("Creating MDS plot (logFC method)",stdout()) + points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot dev.off()# Kill it if(output_format_images == "pdf" || output_format_images == "png") { - pdf(output_MDSplot) + pdf(output_MDSplot_logFC,height=14,width=14) } else if(output_format_images == "svg") { - svg(output_MDSplot) + svg(output_MDSplot_logFC,height=14,width=14) } ## else { - ## png(output_MDSplot) + ## png(output_MDSplot_logFC) ##} + ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings diff_x <- abs(max(points\$x)-min(points\$x)) diff_y <-(max(points\$y)-min(points\$y)) - plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2") + plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2") points(points\$x,points\$y,pch=20) - text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4) + text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) rm(diff_x,diff_y) dev.off() } + if(output_MDSplot_bcv != "/dev/null") { + write("Creating MDS plot (bcv method)",stdout()) + pdf("/home/youri/Desktop/bcvmds.pdf") + points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot + dev.off()# Kill it + + if(output_format_images == "pdf" || output_format_images == "png") { + pdf(output_MDSplot_bcv,height=14,width=14) + } else if(output_format_images == "svg") { + svg(output_MDSplot_bcv,height=14,width=14) + } + ## else { + ## png(output_MDSplot_bcv) + ##} + ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings + + diff_x <- abs(max(points\$x)-min(points\$x)) + diff_y <-(max(points\$y)-min(points\$y)) + plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2") + points(points\$x,points\$y,pch=20) + text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) + rm(diff_x,diff_y) + + dev.off() + } + + if(output_BCVplot != "/dev/null") { write("Creating Biological coefficient of variation plot",stdout()) @@ -427,8 +472,18 @@ <filter>outputs and ("make_output_raw_counts" in outputs)</filter> </data> - <data format="png" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot"> - <filter>outputs and ("make_output_MDSplot" in outputs)</filter> + <data format="png" name="output_MDSplot_logFC" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (logFC method)"> + <filter>outputs and ("make_output_MDSplot_logFC" in outputs)</filter> + + <change_format> + <when input="output_format_images" value="png" format="png" /> + <when input="output_format_images" value="pdf" format="pdf" /> + <when input="output_format_images" value="svg" format="svg" /> + </change_format> + </data> + + <data format="png" name="output_MDSplot_bcv" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (bcv method)"> + <filter>outputs and ("make_output_MDSplot_bcv" in outputs)</filter> <change_format> <when input="output_format_images" value="png" format="png" />