Mercurial > repos > yhoogstrate > edger_with_design_matrix
annotate edgeR_Differential_Gene_Expression.xml @ 94:46745f5666ac draft
Added test
author | yhoogstrate |
---|---|
date | Sat, 28 Mar 2015 08:09:03 -0400 |
parents | 31335aa52b2e |
children | 9dac2146b98c |
rev | line source |
---|---|
25 | 1 <?xml version="1.0" encoding="UTF-8"?> |
91 | 2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.d"> |
25 | 3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description> |
4 | |
5 <requirements> | |
67 | 6 <requirement type="package" version="3.0.3">R</requirement> |
77 | 7 <requirement type="package" version="latest">biocLite_edgeR_limma</requirement> |
25 | 8 </requirements> |
9 | |
79 | 10 <version_command>R --vanilla --slave -e "library(edgeR) ; cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null</version_command> |
11 | |
25 | 12 <command> |
13 <!-- | |
14 The following script is written in the "Cheetah" language: | |
15 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html | |
16 --> | |
17 | |
18 R --vanilla --slave -f $R_script '--args | |
19 $expression_matrix | |
20 $design_matrix | |
21 $contrast | |
22 | |
23 $fdr | |
24 | |
25 $output_count_edgeR | |
26 $output_cpm | |
27 | |
28 /dev/null <!-- Calculation of FPKM/RPKM should come here --> | |
29 | |
30 #if $output_raw_counts: | |
31 $output_raw_counts | |
32 #else: | |
33 /dev/null | |
34 #end if | |
35 | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
36 #if $output_MDSplot_logFC: |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
37 $output_MDSplot_logFC |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
38 #else: |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
39 /dev/null |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
40 #end if |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
41 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
42 #if $output_MDSplot_bcv: |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
43 $output_MDSplot_bcv |
25 | 44 #else: |
45 /dev/null | |
46 #end if | |
47 | |
48 #if $output_BCVplot: | |
49 $output_BCVplot | |
50 #else: | |
51 /dev/null | |
52 #end if | |
53 | |
54 #if $output_MAplot: | |
55 $output_MAplot | |
56 #else: | |
57 /dev/null | |
58 #end if | |
59 | |
60 #if $output_PValue_distribution_plot: | |
61 $output_PValue_distribution_plot | |
62 #else: | |
63 /dev/null | |
64 #end if | |
65 | |
66 #if $output_hierarchical_clustering_plot: | |
67 $output_hierarchical_clustering_plot | |
68 #else: | |
69 /dev/null | |
70 #end if | |
71 | |
72 #if $output_heatmap_plot: | |
73 $output_heatmap_plot | |
74 #else: | |
75 /dev/null | |
76 #end if | |
77 | |
78 #if $output_RData_obj: | |
79 $output_RData_obj | |
80 #else: | |
81 /dev/null | |
82 #end if | |
55 | 83 |
84 $output_format_images | |
85 ' | |
25 | 86 #if $output_R: |
87 > $output_R | |
88 #else: | |
89 > /dev/null | |
90 #end if | |
91 | |
94 | 92 <!-- |
53 | 93 2> stderr.txt ; |
94 | |
95 grep -v 'Calculating library sizes from column' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
96 | |
97 ## Locale error messages: | |
98 grep -v 'During startup - Warning messages' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
99 grep -v 'Setting LC_TIME failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
100 grep -v 'Setting LC_MONETARY failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
101 grep -v 'Setting LC_PAPER failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
102 grep -v 'Setting LC_MEASUREMENT failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
103 grep -v 'Setting LC_CTYPE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
104 grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ; | |
105 | |
106 cat stderr.txt >&2 | |
94 | 107 --> |
25 | 108 </command> |
109 | |
94 | 110 <stdio> |
111 <regex match="Calculating library sizes from column" | |
112 source="stderr" | |
113 level="log" /> | |
114 <regex match="During startup - Warning messages" | |
115 source="stderr" | |
116 level="log" /> | |
117 <regex match="Setting LC_[^ ]+ failed" | |
118 source="stderr" | |
119 level="warning" | |
120 description="LOCALE has not been set correctly" /> | |
121 </stdio> | |
122 | |
25 | 123 <inputs> |
124 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" /> | |
94 | 125 <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your samplenames are identical to those in the expression matrix. Preferably, create the design matrix using 'edgeR: Design- from Expression matrix'." /> |
25 | 126 |
127 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." /> | |
128 | |
129 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" /> | |
130 | |
131 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes"> | |
132 <option value="make_output_raw_counts">Raw counts table</option> | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
133 <option value="make_output_MDSplot_logFC">MDS-plot (logFC-method)</option> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
134 <option value="make_output_MDSplot_bcv">MDS-plot (BCV-method; much slower)</option> |
25 | 135 <option value="make_output_BCVplot">BCV-plot</option> |
136 <option value="make_output_MAplot">MA-plot</option> | |
137 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option> | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
138 <option value="make_output_hierarchical_clustering_plot">Hierarchical custering (under contstruction)</option> |
25 | 139 <option value="make_output_heatmap_plot">Heatmap</option> |
140 | |
43 | 141 <option value="make_output_R_stdout">R stdout</option> |
25 | 142 <option value="make_output_RData_obj">R Data object</option> |
143 </param> | |
55 | 144 |
145 <param name="output_format_images" type="select" label="Output format of images" display="radio"> | |
146 <option value="png">Portable network graphics (.png)</option> | |
147 <option value="pdf">Portable document format (.pdf)</option> | |
148 <option value="svg">Scalable vector graphics (.svg)</option> | |
149 </param> | |
25 | 150 </inputs> |
151 | |
152 <configfiles> | |
153 <configfile name="R_script"> | |
154 library(limma,quietly=TRUE) ## load quietly to avoid unnecessary output on stderr | |
155 library(edgeR,quietly=TRUE) ## load quietly to avoid unnecessary output on stderr | |
156 library(splines,quietly=TRUE) ## load quietly to avoid unnecessary output on stderr | |
157 | |
158 ## Fetch command-line arguments (everything after '--args'); all values arrive as character strings | |
159 args <- commandArgs(trailingOnly = TRUE) | |
160 | |
161 expression_matrix_file = args[1] | |
162 design_matrix_file = args[2] | |
163 contrast = args[3] | |
164 | |
165 fdr = as.numeric(args[4]) ## convert to numeric: comparing the numeric FDR column against a string would be a lexicographic comparison | |
166 | |
167 output_count_edgeR = args[5] | |
168 output_cpm = args[6] | |
169 | |
43 | 170 output_xpkm = args[7] ##FPKM file - yet to be implemented |
25 | 171 |
172 output_raw_counts = args[8] | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
173 output_MDSplot_logFC = args[9] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
174 output_MDSplot_bcv = args[10] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
175 output_BCVplot = args[11] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
176 output_MAplot = args[12] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
177 output_PValue_distribution_plot = args[13] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
178 output_hierarchical_clustering_plot = args[14] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
179 output_heatmap_plot = args[15] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
180 output_RData_obj = args[16] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
181 output_format_images = args[17] |
25 | 182 |
183 | |
184 library(edgeR) | |
185 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) | |
186 ## Obtain read-counts | |
187 | |
188 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) | |
189 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c("")) | |
190 | |
191 colnames(design_matrix) <- make.names(colnames(design_matrix)) | |
192 | |
193 for(i in 1:ncol(design_matrix)) { | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
194 old <- design_matrix[,i] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
195 design_matrix[,i] <- make.names(design_matrix[,i]) |
25 | 196 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) { |
197 print("Renaming of factors:") | |
198 print(old) | |
199 print("To:") | |
200 print(design_matrix[,i]) | |
201 } | |
45 | 202 ## The following line seems to malfunction the script: |
203 ##design_matrix[,i] <- as.factor(design_matrix[,i]) | |
25 | 204 } |
205 | |
44 | 206 ## 1) In the expression matrix, you only want to have the samples described in the design matrix |
25 | 207 columns <- match(rownames(design_matrix),colnames(expression_matrix)) |
43 | 208 columns <- columns[!is.na(columns)] |
25 | 209 read_counts <- expression_matrix[,columns] |
210 | |
44 | 211 ## 2) In the design matrix, you only want to have samples of which you really have the counts |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
212 columns <- match(colnames(read_counts),rownames(design_matrix)) |
44 | 213 columns <- columns[!is.na(columns)] |
214 design_matrix <- design_matrix[columns,,drop=FALSE] | |
25 | 215 |
216 ## Filter out the predefined (non-gene) count rows named by HTSeq and DEXSeq: | |
217 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") | |
218 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned") | |
219 | |
44 | 220 exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) ## row index of each name, NA when absent |
221 exclude <- exclude[is.na(exclude)==0] ## keep only the names that were actually found | |
25 | 222 if(length(exclude) != 0) { ## guard: indexing with an empty negative index would drop every row |
44 | 223 read_counts <- read_counts[-exclude,] |
25 | 224 } |
225 | |
226 | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
227 ## sorting expression matrix with the order of the read_counts |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
228 ##order <- match(colnames(read_counts) , rownames(design_matrix)) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
229 ##read_counts_ordered <- read_counts[,order2] |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
230 |
44 | 231 empty_samples <- apply(read_counts,2,function(x) sum(x) == 0) |
25 | 232 if(sum(empty_samples) > 0) { |
233 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr()) | |
234 write(colnames(read_counts)[empty_samples],stderr()) | |
235 } else { | |
236 | |
237 dge <- DGEList(counts=read_counts,genes=rownames(read_counts)) | |
238 | |
239 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ") | |
240 design_matrix_tmp <- design_matrix | |
241 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp)) | |
242 design <- model.matrix(as.formula(formula),design_matrix_tmp) | |
243 rm(design_matrix_tmp) | |
244 | |
245 # Filter prefixes | |
246 prefixes = colnames(design_matrix)[attr(design,"assign")] | |
247 avoid = nchar(prefixes) == nchar(colnames(design)) | |
248 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) | |
249 replacements[avoid] = colnames(design)[avoid] | |
250 colnames(design) = replacements | |
251 | |
252 # Do normalization | |
253 write("Calculating normalization factors...",stdout()) | |
254 dge <- calcNormFactors(dge) | |
255 write("Estimating common dispersion...",stdout()) | |
256 dge <- estimateGLMCommonDisp(dge,design) | |
257 write("Estimating trended dispersion...",stdout()) | |
258 dge <- estimateGLMTrendedDisp(dge,design) | |
259 write("Estimating tagwise dispersion...",stdout()) | |
260 dge <- estimateGLMTagwiseDisp(dge,design) | |
261 | |
262 | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
263 if(output_MDSplot_logFC != "/dev/null") { |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
264 write("Creating MDS plot (logFC method)",stdout()) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
265 points <- plotMDS.DGEList(dge,top=500,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot |
25 | 266 dev.off()# Kill it |
267 | |
91 | 268 if(output_format_images == "pdf") { |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
269 pdf(output_MDSplot_logFC,height=14,width=14) |
55 | 270 } else if(output_format_images == "svg") { |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
271 svg(output_MDSplot_logFC,height=14,width=14) |
91 | 272 } else { |
273 ## png(output_MDSplot_logFC) | |
274 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
275 | |
276 bitmap(output_MDSplot_logFC,type="png16m",height=14,width=14) | |
70 | 277 } |
91 | 278 |
55 | 279 |
25 | 280 diff_x <- abs(max(points\$x)-min(points\$x)) |
281 diff_y <-(max(points\$y)-min(points\$y)) | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
282 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR logFC-MDS Plot on top 500 genes",type="n", xlab="Leading logFC dim 1", ylab="Leading logFC dim 2") |
25 | 283 points(points\$x,points\$y,pch=20) |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
284 text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) |
25 | 285 rm(diff_x,diff_y) |
286 | |
287 dev.off() | |
288 } | |
289 | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
290 if(output_MDSplot_bcv != "/dev/null") { |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
291 write("Creating MDS plot (bcv method)",stdout()) |
93 | 292 |
293 ## 1. First create a virtual plot to obtain the desired coordinates | |
294 pdf("bcvmds.pdf") | |
295 points <- plotMDS.DGEList(dge,method="bcv",top=500,labels=rep("",nrow(dge\$samples))) | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
296 dev.off()# Kill it |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
297 |
93 | 298 ## 2. Re-plot the coordinates in a new figure with the size and settings. |
91 | 299 if(output_format_images == "pdf") { |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
300 pdf(output_MDSplot_bcv,height=14,width=14) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
301 } else if(output_format_images == "svg") { |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
302 svg(output_MDSplot_bcv,height=14,width=14) |
91 | 303 } else { |
304 ## png(output_MDSplot_bcv) | |
305 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
306 | |
307 bitmap(output_MDSplot_bcv,type="png16m",height=14,width=14) | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
308 } |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
309 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
310 diff_x <- abs(max(points\$x)-min(points\$x)) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
311 diff_y <-(max(points\$y)-min(points\$y)) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
312 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR BCV-MDS Plot",type="n", xlab="Leading BCV dim 1", ylab="Leading BCV dim 2") |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
313 points(points\$x,points\$y,pch=20) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
314 text(points\$x, points\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
315 rm(diff_x,diff_y) |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
316 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
317 dev.off() |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
318 } |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
319 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
320 |
25 | 321 if(output_BCVplot != "/dev/null") { |
322 write("Creating Biological coefficient of variation plot",stdout()) | |
60 | 323 |
91 | 324 if(output_format_images == "pdf") { |
60 | 325 pdf(output_BCVplot) |
326 } else if(output_format_images == "svg") { | |
327 svg(output_BCVplot) | |
91 | 328 } else { |
329 ## png(output_BCVplot) | |
330 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
331 | |
332 bitmap(output_BCVplot,type="png16m") | |
70 | 333 } |
60 | 334 |
25 | 335 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") |
336 dev.off() | |
337 } | |
338 | |
339 | |
340 write("Fitting GLM...",stdout()) | |
341 fit <- glmFit(dge,design) | |
342 | |
343 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout()) | |
344 cont <- c(contrast) | |
345 cont <- makeContrasts(contrasts=cont, levels=design) | |
346 | |
347 lrt <- glmLRT(fit, contrast=cont[,1]) | |
348 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout()) | |
349 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA) | |
350 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA) | |
351 | |
352 ## todo EXPORT FPKM | |
353 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA) | |
354 | |
34 | 355 if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") { |
25 | 356 etable <- topTags(lrt, n=nrow(dge))\$table |
357 etable <- etable[order(etable\$FDR), ] | |
32 | 358 |
359 if(output_MAplot != "/dev/null") { | |
360 write("Creating MA plot...",stdout()) | |
60 | 361 |
91 | 362 if(output_format_images == "pdf") { |
60 | 363 pdf(output_MAplot) |
364 } else if(output_format_images == "svg") { | |
365 svg(output_MAplot) | |
91 | 366 } else { |
367 ## png(output_MAplot) | |
368 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
369 | |
370 bitmap(output_MAplot,type="png16m") | |
70 | 371 } |
60 | 372 |
32 | 373 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) |
374 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red")) | |
375 abline(h=c(-1,1), col="blue") | |
376 dev.off() | |
377 } | |
25 | 378 |
32 | 379 if(output_PValue_distribution_plot != "/dev/null") { |
380 write("Creating P-value distribution plot...",stdout()) | |
60 | 381 |
91 | 382 if(output_format_images == "pdf") { |
383 pdf(output_PValue_distribution_plot,width=14,height=14) | |
60 | 384 } else if(output_format_images == "svg") { |
91 | 385 svg(output_PValue_distribution_plot,width=14,height=14) |
386 } else { | |
387 ## png(output_PValue_distribution_plot) | |
388 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
389 | |
390 bitmap(output_PValue_distribution_plot,type="png16m",width=14,height=14) | |
70 | 391 } |
60 | 392 |
32 | 393 expressed_genes <- subset(etable, PValue < 0.99) |
394 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)") | |
395 center <- sum(h\$counts) / length(h\$counts) | |
396 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2) | |
397 k <- ksmooth(h\$mid, h\$counts) | |
398 lines(k\$x,k\$y,col="red",lwd=2) | |
399 rmsd <- (h\$counts) - center | |
400 rmsd <- rmsd^2 | |
401 rmsd <- sum(rmsd) | |
402 rmsd <- sqrt(rmsd) | |
403 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue") | |
404 ## change e into epsilon somehow | |
405 dev.off() | |
406 } | |
40 | 407 } |
408 | |
409 if(output_heatmap_plot != "/dev/null") { | |
60 | 410 |
91 | 411 if(output_format_images == "pdf") { |
60 | 412 pdf(output_heatmap_plot,width=10.5) |
413 } else if(output_format_images == "svg") { | |
414 svg(output_heatmap_plot,width=10.5) | |
91 | 415 } else { |
416 ## png(output_heatmap_plot) | |
417 ## png does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/ | |
418 | |
419 bitmap(output_heatmap_plot,type="png16m",width=10.5) | |
70 | 420 } |
60 | 421 |
40 | 422 etable2 <- topTags(lrt, n=100)\$table |
423 order <- rownames(etable2) | |
424 cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] | |
425 heatmap(t(cpm_sub)) | |
426 dev.off() | |
25 | 427 } |
428 | |
429 ##output_hierarchical_clustering_plot = args[13] | |
430 | |
35 | 431 if(output_RData_obj != "/dev/null") { |
25 | 432 save.image(output_RData_obj) |
433 } | |
434 | |
435 write("Done!",stdout()) | |
436 } | |
437 </configfile> | |
438 </configfiles> | |
439 | |
440 <outputs> | |
53 | 441 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" /> |
25 | 442 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" /> |
443 | |
444 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts"> | |
53 | 445 <filter>outputs and ("make_output_raw_counts" in outputs)</filter> |
25 | 446 </data> |
447 | |
89
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
448 <data format="png" name="output_MDSplot_logFC" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (logFC method)"> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
449 <filter>outputs and ("make_output_MDSplot_logFC" in outputs)</filter> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
450 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
451 <change_format> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
452 <when input="output_format_images" value="png" format="png" /> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
453 <when input="output_format_images" value="pdf" format="pdf" /> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
454 <when input="output_format_images" value="svg" format="svg" /> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
455 </change_format> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
456 </data> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
457 |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
458 <data format="png" name="output_MDSplot_bcv" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (bcv method)"> |
875f080136b6
Solved a very serious bug: if contrast and design matrix described samples not in the same order, statistical analysis goes wrong
yhoogstrate
parents:
83
diff
changeset
|
459 <filter>outputs and ("make_output_MDSplot_bcv" in outputs)</filter> |
59 | 460 |
461 <change_format> | |
462 <when input="output_format_images" value="png" format="png" /> | |
463 <when input="output_format_images" value="pdf" format="pdf" /> | |
464 <when input="output_format_images" value="svg" format="svg" /> | |
465 </change_format> | |
25 | 466 </data> |
467 | |
60 | 468 <data format="png" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot"> |
53 | 469 <filter>outputs and ("make_output_BCVplot" in outputs)</filter> |
60 | 470 |
471 <change_format> | |
472 <when input="output_format_images" value="png" format="png" /> | |
473 <when input="output_format_images" value="pdf" format="pdf" /> | |
474 <when input="output_format_images" value="svg" format="svg" /> | |
475 </change_format> | |
25 | 476 </data> |
477 | |
60 | 478 <data format="png" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot"> |
53 | 479 <filter>outputs and ("make_output_MAplot" in outputs)</filter> |
60 | 480 |
481 <change_format> | |
482 <when input="output_format_images" value="png" format="png" /> | |
483 <when input="output_format_images" value="pdf" format="pdf" /> | |
484 <when input="output_format_images" value="svg" format="svg" /> | |
485 </change_format> | |
25 | 486 </data> |
487 | |
60 | 488 <data format="png" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution"> |
53 | 489 <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter> |
60 | 490 |
491 <change_format> | |
492 <when input="output_format_images" value="png" format="png" /> | |
493 <when input="output_format_images" value="pdf" format="pdf" /> | |
494 <when input="output_format_images" value="svg" format="svg" /> | |
495 </change_format> | |
25 | 496 </data> |
497 | |
60 | 498 <data format="png" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical clustering"> |
53 | 499 <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter> |
60 | 500 |
501 <change_format> | |
502 <when input="output_format_images" value="png" format="png" /> | |
503 <when input="output_format_images" value="pdf" format="pdf" /> | |
504 <when input="output_format_images" value="svg" format="svg" /> | |
505 </change_format> | |
25 | 506 </data> |
507 | |
60 | 508 <data format="png" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap"> |
53 | 509 <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter> |
60 | 510 |
511 <change_format> | |
512 <when input="output_format_images" value="png" format="png" /> | |
513 <when input="output_format_images" value="pdf" format="pdf" /> | |
514 <when input="output_format_images" value="svg" format="svg" /> | |
515 </change_format> | |
25 | 516 </data> |
517 | |
518 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object"> | |
53 | 519 <filter>outputs and ("make_output_RData_obj" in outputs)</filter> |
25 | 520 </data> |
521 | |
40 | 522 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)" > |
53 | 523 <filter>outputs and ("make_output_R_stdout" in outputs)</filter> |
25 | 524 </data> |
525 </outputs> | |
526 | |
94 | 527 <tests> |
528 <test> | |
529 <param name="expression_matrix" value="Differential_Gene_Expression/expression_matrix.tabular.txt" /> | |
530 <param name="design_matrix" value="Differential_Gene_Expression/design_matrix.tabular.txt" /> | |
531 | |
532 <param name="contrast" value="E-C"/> | |
533 | |
534 <param name="fdr" value="0.05" /> | |
535 | |
536 <param name="output_format_images" value="png" /> | |
537 | |
538 <output name="output_count_edgeR" file="Differential_Gene_Expression/differentially_expressed_genes.tabular.txt" /> | |
539 </test> | |
540 </tests> | |
541 | |
25 | 542 <help> |
543 edgeR: Differential Gene(Expression) Analysis | |
36 | 544 ############################################# |
25 | 545 |
36 | 546 Overview |
547 -------- | |
548 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1]. | |
25 | 549 |
550 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups. | |
36 | 551 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf |
25 | 552 and the limma manual. |
553 | |
554 Because the creation of a design matrix can be complex and time-consuming, especially if no GUI is used, this package comes with an alternative tool that can help you with it. | |
555 This tool is called *edgeR Design Matrix Creator*. | |
556 If the appropriate design matrix (with corresponding links to the files) is given, | |
557 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given. | |
558 | |
559 If you have for example two groups, with an equal weight, you would like to compare either | |
79 | 560 "g1-g2" or "normal-cancer". |
25 | 561 |
36 | 562 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates[2]. |
25 | 563 |
36 | 564 Input |
565 ----- | |
566 Expression matrix | |
567 ^^^^^^^^^^^^^^^^^ | |
568 :: | |
25 | 569 |
570 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n" | |
571 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n" | |
572 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n" | |
573 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n" | |
574 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n" | |
575 [...] | |
576 | |
36 | 577 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.* |
25 | 578 |
36 | 579 Design matrix |
580 ^^^^^^^^^^^^^ | |
581 :: | |
25 | 582 |
583 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n" | |
584 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n" | |
585 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n" | |
586 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n" | |
587 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n" | |
588 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n" | |
589 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n" | |
590 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n" | |
591 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n" | |
592 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n" | |
593 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n" | |
594 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n" | |
595 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n" | |
596 | |
36 | 597 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.* |
25 | 598 |
36 | 599 Contrast |
600 ^^^^^^^^ | |
601 The contrast represents the biological question. There can be many questions asked, e.g.: | |
25 | 602 |
36 | 603 - Tumor-Normal |
604 - African-European | |
605 - 0.5*(Control+Placebo) / Treated | |
25 | 606 |
36 | 607 Installation |
608 ------------ | |
25 | 609 |
610 This tool requires no specific configuration. The following dependencies are installed automatically: | |
36 | 611 |
612 - R | |
613 - Bioconductor | |
79 | 614 - limma |
615 - edgeR | |
25 | 616 |
36 | 617 License |
618 ------- | |
619 - R | |
79 | 620 - GPL 2 & GPL 3 |
36 | 621 - limma |
622 - GPL (>=2) | |
623 - edgeR | |
79 | 624 - GPL (>=2) |
36 | 625 |
626 References | |
627 ---------- | |
628 | |
629 EdgeR | |
630 ^^^^^ | |
631 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.** | |
25 | 632 |
36 | 633 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140. |
634 | |
635 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html | |
636 - http://dx.doi.org/10.1093/bioinformatics/btp616 | |
637 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html | |
25 | 638 |
36 | 639 Test-data (MCF7) |
640 ^^^^^^^^^^^^^^^^ | |
641 **[2] RNA-seq differential expression studies: more sequence or more replication?** | |
642 | |
643 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304. | |
644 | |
645 - http://www.ncbi.nlm.nih.gov/pubmed/24319002 | |
646 - http://dx.doi.org/10.1093/bioinformatics/btt688 | |
647 | |
648 Contact | |
649 ------- | |
79 | 650 |
651 The tool wrapper has been written by Youri Hoogstrate from the Erasmus | |
652 Medical Center (Rotterdam, Netherlands) on behalf of the Translational | |
653 Research IT (TraIT) project: | |
83 | 654 |
25 | 655 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch |
656 | |
79 | 657 More tools by the Translational Research IT (TraIT) project can be found |
658 in the following toolsheds: | |
83 | 659 |
660 http://toolshed.dtls.nl/ | |
661 | |
662 http://toolshed.g2.bx.psu.edu | |
663 | |
664 http://testtoolshed.g2.bx.psu.edu/ | |
79 | 665 |
36 | 666 I would like to thank Hina Riaz - Naz Khan for her helpful contribution. |
25 | 667 </help> |
94 | 668 |
669 <citations> | |
670 <citation type="doi">10.1093/bioinformatics/btp616</citation> | |
671 <citation type="doi">10.1093/bioinformatics/btt688</citation> | |
672 </citations> | |
25 | 673 </tool> |