99
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis" version="3.0.3-latest.d">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
|
6 <requirement type="package" version="3.0.3">R</requirement>
|
|
7 <requirement type="package" version="latest">biocLite_edgeR_limma</requirement>
|
|
8 </requirements>
|
|
9
|
|
10 <stdio>
|
101
|
11 <regex match="Error in .contrasts."
|
99
|
12 source="both"
|
|
13 level="fatal"
|
|
14 description="Have the design- and expression-matrix been swapped?" />
|
|
15 <regex match="Calculating library sizes from column"
|
|
16 source="stderr"
|
|
17 level="log" />
|
|
18 <regex match="During startup - Warning messages"
|
|
19 source="stderr"
|
|
20 level="log" />
|
|
21 <regex match="Setting LC_[^ ]+ failed"
|
|
22 source="stderr"
|
|
23 level="warning"
|
|
24 description="LOCALE has not been set correctly" />
|
|
25 </stdio>
|
|
26
|
|
27 <version_command>echo $(R --version | grep version | grep -v GNU) " , EdgeR version" $(R --vanilla --slave -e "library(edgeR) ; cat(sessionInfo()\$otherPkgs\$edgeR\$Version)" 2> /dev/null | grep -v -i "WARNING: ")</version_command>
|
|
28
|
|
29 <command>
|
|
30 <!--
|
|
31 The following script is written in the "Cheetah" language:
|
|
32 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
33 -->
|
|
34
|
|
35 R --vanilla --slave -f $R_script '--args
|
|
36 $expression_matrix
|
|
37 $design_matrix
|
|
38 $contrast
|
|
39
|
|
40 $fdr
|
|
41
|
|
42 $output_count_edgeR
|
|
43 $output_cpm
|
|
44
|
|
45 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
46
|
|
47 #if $output_raw_counts:
|
|
48 $output_raw_counts
|
|
49 #else:
|
|
50 /dev/null
|
|
51 #end if
|
|
52
|
|
53 #if $output_MDSplot_logFC:
|
|
54 $output_MDSplot_logFC
|
|
55 #else:
|
|
56 /dev/null
|
|
57 #end if
|
|
58
|
|
59 #if $output_MDSplot_bcv:
|
|
60 $output_MDSplot_bcv
|
|
61 #else:
|
|
62 /dev/null
|
|
63 #end if
|
|
64
|
|
65 #if $output_BCVplot:
|
|
66 $output_BCVplot
|
|
67 #else:
|
|
68 /dev/null
|
|
69 #end if
|
|
70
|
|
71 #if $output_MAplot:
|
|
72 $output_MAplot
|
|
73 #else:
|
|
74 /dev/null
|
|
75 #end if
|
|
76
|
|
77 #if $output_PValue_distribution_plot:
|
|
78 $output_PValue_distribution_plot
|
|
79 #else:
|
|
80 /dev/null
|
|
81 #end if
|
|
82
|
|
83 #if $output_hierarchical_clustering_plot:
|
|
84 $output_hierarchical_clustering_plot
|
|
85 #else:
|
|
86 /dev/null
|
|
87 #end if
|
|
88
|
|
89 #if $output_heatmap_plot:
|
|
90 $output_heatmap_plot
|
|
91 #else:
|
|
92 /dev/null
|
|
93 #end if
|
|
94
|
|
95 #if $output_RData_obj:
|
|
96 $output_RData_obj
|
|
97 #else:
|
|
98 /dev/null
|
|
99 #end if
|
|
100
|
|
101 $output_format_images
|
|
102 '
|
|
103 #if $output_R:
|
|
104 > $output_R
|
|
105 #else:
|
|
106 > /dev/null
|
|
107 #end if
|
|
108 </command>
|
|
109
|
|
110 <configfiles>
|
|
111 <configfile name="R_script">
|
|
## Load the analysis packages; quietly=TRUE avoids unnecessary dumping to stderr
library(limma,quietly=TRUE)
library(edgeR,quietly=TRUE)
library(splines,quietly=TRUE)

## Fetch commandline arguments (only those that follow --args)
args <- commandArgs(trailingOnly = TRUE)

## Inputs
expression_matrix_file <- args[1]
design_matrix_file <- args[2]
contrast <- args[3]

## The threshold arrives as text. It has to be converted, because comparing a number
## with a string is done alphabetically: "1e-10" would then sort after "0.05".
fdr <- suppressWarnings(as.numeric(args[4]))
if(is.na(fdr)) {
    stop(paste("The FDR threshold is not a number: ",args[4],sep=""),call.=FALSE)
}

## Outputs that are always written
output_count_edgeR <- args[5]
output_cpm <- args[6]

output_xpkm <- args[7]        ##FPKM file - yet to be implemented

## Optional outputs; the value is compared with "/dev/null" further on to decide
## whether the output has to be created
output_raw_counts <- args[8]
output_MDSplot_logFC <- args[9]
output_MDSplot_bcv <- args[10]
output_BCVplot <- args[11]
output_MAplot <- args[12]
output_PValue_distribution_plot <- args[13]
output_hierarchical_clustering_plot <- args[14]
output_heatmap_plot <- args[15]
output_RData_obj <- args[16]

## Image format of the plots: "pdf", "svg" or anything else for png
output_format_images <- args[17]
|
|
140
|
|
141
|
|
## Obtain read-counts and the design.
## The first column of each table supplies the row names, column names are taken
## literally (check.names=FALSE) and empty cells are read as NA.
expression_matrix <- read.delim(expression_matrix_file,header=TRUE,stringsAsFactors=FALSE,row.names=1,check.names=FALSE,na.strings=c(""))
design_matrix <- read.delim(design_matrix_file,header=TRUE,stringsAsFactors=FALSE,row.names=1,check.names=FALSE,na.strings=c(""))

## Make the factor names syntactically valid R names
colnames(design_matrix) <- make.names(colnames(design_matrix))

## Do the same for the factor levels and report every column that had to be changed.
## seq_len() keeps the loop from running when the design matrix has no columns.
for(i in seq_len(ncol(design_matrix))) {
    old <- design_matrix[,i]
    design_matrix[,i] <- make.names(old)
    if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
        print("Renaming of factors:")
        print(old)
        print("To:")
        print(design_matrix[,i])
    }
    ## The following line seems to malfunction the script:
    ##design_matrix[,i] <- as.factor(design_matrix[,i])
}
|
|
163
|
|
## 1) In the expression matrix, you only want to have the samples described in the design matrix.
##    drop=FALSE keeps a data frame, also when only a single sample is left.
columns <- match(rownames(design_matrix),colnames(expression_matrix))
columns <- columns[!is.na(columns)]
read_counts <- expression_matrix[,columns,drop=FALSE]

## 2) In the design matrix, you only want to have samples of which you really have the counts
columns <- match(colnames(read_counts),rownames(design_matrix))
columns <- columns[!is.na(columns)]
design_matrix <- design_matrix[columns,,drop=FALSE]

## Filter for the predefined HTSeq / DEXSeq counters, which are not genes.
## Newer htseq-count releases prefix these counters with a double underscore.
exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique",
                   "__no_feature","__ambiguous","__too_low_aQual","__not_aligned","__alignment_not_unique")
exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")

exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
exclude <- exclude[!is.na(exclude)]
if(length(exclude) != 0) {
    read_counts <- read_counts[-exclude,,drop=FALSE]
}
|
|
183
|
|
184
|
|
185 ## sorting expression matrix with the order of the read_counts
|
|
186 ##order <- match(colnames(read_counts) , rownames(design_matrix))
|
|
187 ##read_counts_ordered <- read_counts[,order2]
|
|
188
|
|
## NOTE: the names of the top-level objects are kept stable, because save.image() at
## the end of this section writes the complete workspace to the optional RData output.

## Open the graphics device that belongs to the requested image format.
## All further arguments (such as width and height) are handed to that device unchanged.
open_plot_device <- function(filename,image_format,...) {
    if(image_format == "pdf") {
        pdf(filename,...)
    } else if(image_format == "svg") {
        svg(filename,...)
    } else {
        ## png() does not work out of the box in the Galaxy Toolshed Version of R due to its compile settings: https://biostar.usegalaxy.org/p/9170/
        bitmap(filename,type="png16m",...)
    }
}

## Create an MDS plot of the samples and return the plotted coordinates invisibly.
## plotMDS is called through the limma generic, which dispatches to the DGEList method.
create_mds_plot <- function(dge,filename,image_format,method,main,xlab,ylab) {
    ## 1. First create a virtual plot on a scratch device to obtain the desired coordinates
    scratch_file <- tempfile(fileext=".pdf")
    pdf(scratch_file)
    coordinates <- plotMDS(dge,method=method,top=500,labels=rep("",nrow(dge\$samples)))
    dev.off()
    unlink(scratch_file)

    ## 2. Re-plot the coordinates in a new figure; the x-axis is widened to the right
    ##    to leave room for the sample labels, the y-axis gets a small margin
    open_plot_device(filename,image_format,height=14,width=14)

    range_x <- range(coordinates\$x)
    range_y <- range(coordinates\$y)
    diff_x <- abs(range_x[2]-range_x[1])
    diff_y <- range_y[2]-range_y[1]

    plot(c(range_x[1],range_x[2] + 0.45 * diff_x), c(range_y[1] - 0.05 * diff_y,range_y[2] + 0.05 * diff_y), main=main,type="n", xlab=xlab, ylab=ylab)
    points(coordinates\$x,coordinates\$y,pch=20)
    text(coordinates\$x, coordinates\$y,rownames(dge\$samples),cex=1.25,col="gray",pos=4)

    dev.off()

    invisible(coordinates)
}

## Samples without a single read can not be analysed; report them and skip the analysis
empty_samples <- colSums(read_counts) == 0
if(sum(empty_samples) > 0) {
    write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
    write(colnames(read_counts)[empty_samples],stderr())
} else {

    dge <- DGEList(counts=read_counts,genes=rownames(read_counts))

    ## Additive model without intercept that contains every factor of the design matrix
    formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
    design_matrix_tmp <- design_matrix
    colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
    design <- model.matrix(as.formula(formula),design_matrix_tmp)
    rm(design_matrix_tmp)

    ## Filter prefixes: model.matrix() names a column factor name + level; strip the
    ## factor name so that the contrast can be written with the levels only. Columns
    ## that consist of the factor name only keep their name.
    prefixes <- colnames(design_matrix)[attr(design,"assign")]
    avoid <- nchar(prefixes) == nchar(colnames(design))
    replacements <- substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
    replacements[avoid] <- colnames(design)[avoid]
    colnames(design) <- replacements

    ## Do normalization
    write("Calculating normalization factors...",stdout())
    dge <- calcNormFactors(dge)
    write("Estimating common dispersion...",stdout())
    dge <- estimateGLMCommonDisp(dge,design)
    write("Estimating trended dispersion...",stdout())
    dge <- estimateGLMTrendedDisp(dge,design)
    write("Estimating tagwise dispersion...",stdout())
    dge <- estimateGLMTagwiseDisp(dge,design)


    if(output_MDSplot_logFC != "/dev/null") {
        write("Creating MDS plot (logFC method)",stdout())
        points <- create_mds_plot(dge,output_MDSplot_logFC,output_format_images,method="logFC",main="edgeR logFC-MDS Plot on top 500 genes",xlab="Leading logFC dim 1",ylab="Leading logFC dim 2")
    }

    if(output_MDSplot_bcv != "/dev/null") {
        write("Creating MDS plot (bcv method)",stdout())
        points <- create_mds_plot(dge,output_MDSplot_bcv,output_format_images,method="bcv",main="edgeR BCV-MDS Plot",xlab="Leading BCV dim 1",ylab="Leading BCV dim 2")
    }


    if(output_BCVplot != "/dev/null") {
        write("Creating Biological coefficient of variation plot",stdout())

        open_plot_device(output_BCVplot,output_format_images)
        plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
        dev.off()
    }


    write("Fitting GLM...",stdout())
    fit <- glmFit(dge,design)

    write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
    cont <- c(contrast)
    cont <- makeContrasts(contrasts=cont, levels=design)

    lrt <- glmLRT(fit, contrast=cont[,1])
    write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
    write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
    write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)

    ## todo EXPORT FPKM
    ## When the raw counts are not requested the file is /dev/null, so writing is harmless
    write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)

    if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
        etable <- topTags(lrt, n=nrow(dge))\$table
        etable <- etable[order(etable\$FDR), ]

        if(output_MAplot != "/dev/null") {
            write("Creating MA plot...",stdout())

            open_plot_device(output_MAplot,output_format_images)

            ## The threshold is read from the command line and may still be text;
            ## compare numerically, otherwise values such as 1e-10 are not recognised
            ## as being smaller than 0.05
            fdr_threshold <- as.numeric(fdr)

            with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
            with(subset(etable, FDR < fdr_threshold), points(logCPM, logFC, pch=20, col="red"))
            abline(h=c(-1,1), col="blue")
            dev.off()
        }

        if(output_PValue_distribution_plot != "/dev/null") {
            write("Creating P-value distribution plot...",stdout())

            open_plot_device(output_PValue_distribution_plot,output_format_images,width=14,height=14)

            expressed_genes <- subset(etable, PValue < 0.99)
            h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")

            ## Dashed line: average bin height; solid line: kernel smoothed bin heights
            center <- sum(h\$counts) / length(h\$counts)
            lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
            k <- ksmooth(h\$mid, h\$counts)
            lines(k\$x,k\$y,col="red",lwd=2)

            ## Square root of the summed squared deviations of the bin heights from the average
            rmsd <- sqrt(sum((h\$counts - center)^2))
            text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
            ## change e into epsilon somehow
            dev.off()
        }
    }

    if(output_heatmap_plot != "/dev/null") {

        open_plot_device(output_heatmap_plot,output_format_images,width=10.5)

        ## Take the (at most) 100 genes with the smallest P-values by position. The rows
        ## of the test table are in the same order as the rows of dge, so this does not
        ## depend on whether the row names are numbers or gene identifiers.
        top_rows <- order(lrt\$table\$PValue)[seq_len(min(100,nrow(lrt\$table)))]
        cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[top_rows,,drop=FALSE]
        heatmap(t(cpm_sub))
        dev.off()
    }

    ##output_hierarchical_clustering_plot = args[13]

    if(output_RData_obj != "/dev/null") {
        save.image(output_RData_obj)
    }

    write("Done!",stdout())
}
|
|
395 </configfile>
|
|
396 </configfiles>
|
|
397
|
|
398 <inputs>
|
|
399 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
400 <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your sample names are identical to those in the expression matrix. Preferably, create the design matrix using 'edgeR: Design- from Expression matrix'." />
|
|
401
|
|
402 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
403
|
|
404 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
405
|
|
406 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
407 <option value="make_output_raw_counts">Raw counts table</option>
|
|
408 <option value="make_output_MDSplot_logFC">MDS-plot (logFC-method)</option>
|
|
409 <option value="make_output_MDSplot_bcv">MDS-plot (BCV-method; much slower)</option>
|
|
410 <option value="make_output_BCVplot">BCV-plot</option>
|
|
411 <option value="make_output_MAplot">MA-plot</option>
|
|
412 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
413 <option value="make_output_hierarchical_clustering_plot">Hierarchical clustering (under construction)</option>
|
|
414 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
415
|
|
416 <option value="make_output_R_stdout">R stdout</option>
|
|
417 <option value="make_output_RData_obj">R Data object</option>
|
|
418 </param>
|
|
419
|
|
420 <param name="output_format_images" type="select" label="Output format of images" display="radio">
|
|
421 <option value="png">Portable network graphics (.png)</option>
|
|
422 <option value="pdf">Portable document format (.pdf)</option>
|
|
423 <option value="svg">Scalable vector graphics (.svg)</option>
|
|
424 </param>
|
|
425 </inputs>
|
|
426
|
|
427 <outputs>
|
|
428 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" />
|
|
429 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
430
|
|
431 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
|
432 <filter>outputs and ("make_output_raw_counts" in outputs)</filter>
|
|
433 </data>
|
|
434
|
|
435 <data format="png" name="output_MDSplot_logFC" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (logFC method)">
|
|
436 <filter>outputs and ("make_output_MDSplot_logFC" in outputs)</filter>
|
|
437
|
|
438 <change_format>
|
|
439 <when input="output_format_images" value="png" format="png" />
|
|
440 <when input="output_format_images" value="pdf" format="pdf" />
|
|
441 <when input="output_format_images" value="svg" format="svg" />
|
|
442 </change_format>
|
|
443 </data>
|
|
444
|
|
445 <data format="png" name="output_MDSplot_bcv" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot (bcv method)">
|
|
446 <filter>outputs and ("make_output_MDSplot_bcv" in outputs)</filter>
|
|
447
|
|
448 <change_format>
|
|
449 <when input="output_format_images" value="png" format="png" />
|
|
450 <when input="output_format_images" value="pdf" format="pdf" />
|
|
451 <when input="output_format_images" value="svg" format="svg" />
|
|
452 </change_format>
|
|
453 </data>
|
|
454
|
|
455 <data format="png" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
|
456 <filter>outputs and ("make_output_BCVplot" in outputs)</filter>
|
|
457
|
|
458 <change_format>
|
|
459 <when input="output_format_images" value="png" format="png" />
|
|
460 <when input="output_format_images" value="pdf" format="pdf" />
|
|
461 <when input="output_format_images" value="svg" format="svg" />
|
|
462 </change_format>
|
|
463 </data>
|
|
464
|
|
465 <data format="png" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
|
466 <filter>outputs and ("make_output_MAplot" in outputs)</filter>
|
|
467
|
|
468 <change_format>
|
|
469 <when input="output_format_images" value="png" format="png" />
|
|
470 <when input="output_format_images" value="pdf" format="pdf" />
|
|
471 <when input="output_format_images" value="svg" format="svg" />
|
|
472 </change_format>
|
|
473 </data>
|
|
474
|
|
475 <data format="png" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
|
476 <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter>
|
|
477
|
|
478 <change_format>
|
|
479 <when input="output_format_images" value="png" format="png" />
|
|
480 <when input="output_format_images" value="pdf" format="pdf" />
|
|
481 <when input="output_format_images" value="svg" format="svg" />
|
|
482 </change_format>
|
|
483 </data>
|
|
484
|
|
485 <data format="png" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical clustering">
|
|
486 <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
|
487
|
|
488 <change_format>
|
|
489 <when input="output_format_images" value="png" format="png" />
|
|
490 <when input="output_format_images" value="pdf" format="pdf" />
|
|
491 <when input="output_format_images" value="svg" format="svg" />
|
|
492 </change_format>
|
|
493 </data>
|
|
494
|
|
495 <data format="png" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
|
496 <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter>
|
|
497
|
|
498 <change_format>
|
|
499 <when input="output_format_images" value="png" format="png" />
|
|
500 <when input="output_format_images" value="pdf" format="pdf" />
|
|
501 <when input="output_format_images" value="svg" format="svg" />
|
|
502 </change_format>
|
|
503 </data>
|
|
504
|
|
505 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
|
506 <filter>outputs and ("make_output_RData_obj" in outputs)</filter>
|
|
507 </data>
|
|
508
|
|
509 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)" >
|
|
510 <filter>outputs and ("make_output_R_stdout" in outputs)</filter>
|
|
511 </data>
|
|
512 </outputs>
|
|
513
|
|
514 <tests>
|
|
515 <test>
|
|
516 <param name="expression_matrix" value="Differential_Gene_Expression/expression_matrix.tabular.txt" />
|
|
517 <param name="design_matrix" value="Differential_Gene_Expression/design_matrix.tabular.txt" />
|
|
518
|
|
519 <param name="contrast" value="E-C"/>
|
|
520
|
|
521 <param name="fdr" value="0.05" />
|
|
522
|
|
523 <param name="output_format_images" value="png" />
|
|
524
|
|
525 <output name="output_count_edgeR" file="Differential_Gene_Expression/differentially_expressed_genes.tabular.txt" />
|
|
526 </test>
|
|
527 </tests>
|
|
528
|
|
529 <help>
|
|
530 edgeR: Differential Gene(Expression) Analysis
|
|
531 #############################################
|
|
532
|
|
533 Overview
|
|
534 --------
|
|
535 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1].
|
|
536
|
|
537 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
|
538 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
|
539 and the limma manual.
|
|
540
|
|
541 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
542 This tool is called *edgeR Design Matrix Creator*.
|
|
543 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
544 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
545
|
|
546 If you have for example two groups, with an equal weight, you would like to compare either
|
|
547 "g1-g2" or "normal-cancer".
|
|
548
|
|
549 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates [2].
|
|
550
|
|
551 Input
|
|
552 -----
|
|
553 Expression matrix
|
|
554 ^^^^^^^^^^^^^^^^^
|
|
555 ::
|
|
556
|
|
557 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
558 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
559 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
560 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
561 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
562 [...]
|
|
563
|
|
564 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.*
|
|
565
|
|
566 Design matrix
|
|
567 ^^^^^^^^^^^^^
|
|
568 ::
|
|
569
|
|
570 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
571 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
572 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
573 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
574 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
575 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
576 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
577 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
578 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
579 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
580 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
581 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
582 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
583
|
|
584 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.*
|
|
585
|
|
586 Contrast
|
|
587 ^^^^^^^^
|
|
588 The contrast represents the biological question. There can be many questions asked, e.g.:
|
|
589
|
|
590 - Tumor-Normal
|
|
591 - African-European
|
|
592 - 0.5*(Control+Placebo) - Treated
|
|
593
|
|
594 Installation
|
|
595 ------------
|
|
596
|
|
597 This tool requires no specific configurations. The following dependencies are installed automatically:
|
|
598
|
|
599 - R
|
|
600 - Bioconductor
|
|
601 - limma
|
|
602 - edgeR
|
|
603
|
|
604 License
|
|
605 -------
|
|
606 - R
|
|
607 - GPL 2 & GPL 3
|
|
608 - limma
|
|
609 - GPL (>=2)
|
|
610 - edgeR
|
|
611 - GPL (>=2)
|
|
612
|
|
613 References
|
|
614 ----------
|
|
615
|
|
616 EdgeR
|
|
617 ^^^^^
|
|
618 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.**
|
|
619
|
|
620 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140.
|
|
621
|
|
622 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
623 - http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
624 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
|
625
|
|
626 Test-data (MCF7)
|
|
627 ^^^^^^^^^^^^^^^^
|
|
628 **[2] RNA-seq differential expression studies: more sequence or more replication?**
|
|
629
|
|
630 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304.
|
|
631
|
|
632 - http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
633 - http://dx.doi.org/10.1093/bioinformatics/btt688
|
|
634
|
|
635 Contact
|
|
636 -------
|
|
637
|
|
638 The tool wrapper has been written by Youri Hoogstrate from the Erasmus
|
|
639 Medical Center (Rotterdam, Netherlands) on behalf of the Translational
|
|
640 Research IT (TraIT) project:
|
|
641
|
|
642 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
643
|
|
644 More tools by the Translational Research IT (TraIT) project can be found
|
|
645 in the following toolsheds:
|
|
646
|
|
647 http://toolshed.dtls.nl/
|
|
648
|
|
649 http://toolshed.g2.bx.psu.edu
|
|
650
|
|
651 http://testtoolshed.g2.bx.psu.edu/
|
|
652
|
|
653 I would like to thank Hina Riaz - Naz Khan for her helpful contribution.
|
|
654 </help>
|
|
655
|
|
656 <citations>
|
|
657 <citation type="doi">10.1093/bioinformatics/btp616</citation>
|
|
658 <citation type="doi">10.1093/bioinformatics/btt688</citation>
|
|
659 </citations>
|
|
660 </tool>
|