25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
62
|
6 <!--<requirement type="package" version="3.0.1">package_r3_withx</requirement>-->
|
67
|
7 <!--<requirement type="package" version="3.1.0">R</requirement>-->
|
|
8 <requirement type="package" version="3.0.3">R</requirement>
|
29
|
9 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
72
|
10 <requirement type="package" version="1.3.18">graphicsmagick</requirement>
|
25
|
11 </requirements>
|
|
12
|
|
13 <command>
|
|
14 <!--
|
|
15 The following script is written in the "Cheetah" language:
|
|
16 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
17 -->
|
|
18
|
|
19 R --vanilla --slave -f $R_script '--args
|
|
20 $expression_matrix
|
|
21 $design_matrix
|
|
22 $contrast
|
|
23
|
|
24 $fdr
|
|
25
|
|
26 $output_count_edgeR
|
|
27 $output_cpm
|
|
28
|
|
29 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
30
|
|
31 #if $output_raw_counts:
|
|
32 $output_raw_counts
|
|
33 #else:
|
|
34 /dev/null
|
|
35 #end if
|
|
36
|
|
37 #if $output_MDSplot:
|
|
38 $output_MDSplot
|
|
39 #else:
|
|
40 /dev/null
|
|
41 #end if
|
|
42
|
|
43 #if $output_BCVplot:
|
|
44 $output_BCVplot
|
|
45 #else:
|
|
46 /dev/null
|
|
47 #end if
|
|
48
|
|
49 #if $output_MAplot:
|
|
50 $output_MAplot
|
|
51 #else:
|
|
52 /dev/null
|
|
53 #end if
|
|
54
|
|
55 #if $output_PValue_distribution_plot:
|
|
56 $output_PValue_distribution_plot
|
|
57 #else:
|
|
58 /dev/null
|
|
59 #end if
|
|
60
|
|
61 #if $output_hierarchical_clustering_plot:
|
|
62 $output_hierarchical_clustering_plot
|
|
63 #else:
|
|
64 /dev/null
|
|
65 #end if
|
|
66
|
|
67 #if $output_heatmap_plot:
|
|
68 $output_heatmap_plot
|
|
69 #else:
|
|
70 /dev/null
|
|
71 #end if
|
|
72
|
|
73 #if $output_RData_obj:
|
|
74 $output_RData_obj
|
|
75 #else:
|
|
76 /dev/null
|
|
77 #end if
|
55
|
78
|
|
79 $output_format_images
|
|
80 '
|
25
|
81 #if $output_R:
|
|
82 > $output_R
|
|
83 #else:
|
|
84 > /dev/null
|
|
85 #end if
|
|
86
|
53
|
87 2> stderr.txt ;
|
|
88
|
70
|
89 #if $output_format_images.value == "png":
|
|
90 echo "Converting PDF figures to PNG" ;
|
69
|
91
|
70
|
92 #if $output_MDSplot:
|
|
93 #set $output_MDSplot_tmp = str($output_MDSplot)+".png"
|
|
94
|
72
|
95 gm convert $output_MDSplot $output_MDSplot_tmp ;
|
70
|
96 mv $output_MDSplot_tmp $output_MDSplot ;
|
|
97 #end if
|
|
98
|
|
99 #if $output_BCVplot:
|
|
100 #set $output_BCVplot_tmp = str($output_BCVplot)+".png"
|
|
101
|
72
|
102 gm convert $output_BCVplot $output_BCVplot_tmp ;
|
70
|
103 mv $output_BCVplot_tmp $output_BCVplot ;
|
|
104 #end if
|
69
|
105
|
70
|
106 #if $output_MAplot:
|
|
107 #set $output_MAplot_tmp = str($output_MAplot)+".png"
|
|
108
|
72
|
109 gm convert $output_MAplot $output_MAplot_tmp ;
|
70
|
110 mv $output_MAplot_tmp $output_MAplot ;
|
|
111 #end if
|
|
112
|
|
113 #if $output_PValue_distribution_plot:
|
|
114 #set $output_PValue_distribution_plot_tmp = str($output_PValue_distribution_plot)+".png"
|
|
115
|
72
|
116 gm convert $output_PValue_distribution_plot $output_PValue_distribution_plot_tmp ;
|
70
|
117 mv $output_PValue_distribution_plot_tmp $output_PValue_distribution_plot ;
|
|
118 #end if
|
69
|
119
|
70
|
120 #if $output_hierarchical_clustering_plot:
|
|
121 #set $output_hierarchical_clustering_plot_tmp = str($output_hierarchical_clustering_plot)+".png"
|
|
122
|
72
|
123 gm convert $output_hierarchical_clustering_plot $output_hierarchical_clustering_plot_tmp ;
|
70
|
124 mv $output_hierarchical_clustering_plot_tmp $output_hierarchical_clustering_plot ;
|
|
125 #end if
|
|
126
|
|
127 #if $output_heatmap_plot:
|
|
128 #set $output_heatmap_plot_tmp = str($output_heatmap_plot)+".png"
|
|
129
|
72
|
130 gm convert $output_heatmap_plot $output_heatmap_plot_tmp ;
|
70
|
131 mv $output_heatmap_plot_tmp $output_heatmap_plot ;
|
|
132 #end if
|
67
|
133 #end if
|
|
134
|
53
|
135 grep -v 'Calculating library sizes from column' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
136
|
|
137 ## Locale error messages:
|
|
138 grep -v 'During startup - Warning messages' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
139 grep -v 'Setting LC_TIME failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
140 grep -v 'Setting LC_MONETARY failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
141 grep -v 'Setting LC_PAPER failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
142 grep -v 'Setting LC_MEASUREMENT failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
143 grep -v 'Setting LC_CTYPE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
144 grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
145
|
|
146 cat stderr.txt >&2
|
72
|
147
|
25
|
148 </command>
|
|
149
|
|
150 <inputs>
|
|
151 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
152 <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />
|
|
153
|
|
154 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
155
|
|
156 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
157
|
|
158 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
159 <option value="make_output_raw_counts">Raw counts table</option>
|
|
160 <option value="make_output_MDSplot">MDS-plot</option>
|
|
161 <option value="make_output_BCVplot">BCV-plot</option>
|
|
162 <option value="make_output_MAplot">MA-plot</option>
|
|
163 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
164 <option value="make_output_hierarchical_clustering_plot">Hierarchical clustering</option>
|
|
165 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
166
|
43
|
167 <option value="make_output_R_stdout">R stdout</option>
|
25
|
168 <option value="make_output_RData_obj">R Data object</option>
|
|
169 </param>
|
55
|
170
|
|
171 <param name="output_format_images" type="select" label="Output format of images" display="radio">
|
|
172 <option value="png">Portable network graphics (.png)</option>
|
|
173 <option value="pdf">Portable document format (.pdf)</option>
|
|
174 <option value="svg">Scalable vector graphics (.svg)</option>
|
|
175 </param>
|
25
|
176 </inputs>
|
|
177
|
|
178 <configfiles>
|
|
179 <configfile name="R_script">
|
|
180 library(limma,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
181 library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
182 library(splines,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
183
|
|
184 ## Fetch commandline arguments
|
|
185 args <- commandArgs(trailingOnly = TRUE)
|
|
186
|
|
187 expression_matrix_file = args[1]
|
|
188 design_matrix_file = args[2]
|
|
189 contrast = args[3]
|
|
190
|
|
191 fdr = as.numeric(args[4]) ## command-line args are strings; the FDR threshold must be numeric for the comparison in the MA plot
|
|
192
|
|
193 output_count_edgeR = args[5]
|
|
194 output_cpm = args[6]
|
|
195
|
43
|
196 output_xpkm = args[7] ##FPKM file - yet to be implemented
|
25
|
197
|
|
198 output_raw_counts = args[8]
|
|
199 output_MDSplot = args[9]
|
|
200 output_BCVplot = args[10]
|
|
201 output_MAplot = args[11]
|
|
202 output_PValue_distribution_plot = args[12]
|
|
203 output_hierarchical_clustering_plot = args[13]
|
|
204 output_heatmap_plot = args[14]
|
|
205 output_RData_obj = args[15]
|
55
|
206 output_format_images = args[16]
|
25
|
207
|
|
208
|
|
209 library(edgeR)
|
|
210 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
211 ## Obtain read-counts
|
|
212
|
|
213 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
214 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
215
|
|
216 colnames(design_matrix) <- make.names(colnames(design_matrix))
|
|
217
|
|
218 for(i in 1:ncol(design_matrix)) {
|
|
219 old = design_matrix[,i]
|
|
220 design_matrix[,i] = make.names(design_matrix[,i])
|
|
221 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
|
|
222 print("Renaming of factors:")
|
|
223 print(old)
|
|
224 print("To:")
|
|
225 print(design_matrix[,i])
|
|
226 }
|
45
|
227 ## The following line seems to malfunction the script:
|
|
228 ##design_matrix[,i] <- as.factor(design_matrix[,i])
|
25
|
229 }
|
|
230
|
44
|
231 ## 1) In the expression matrix, you only want to have the samples described in the design matrix
|
25
|
232 columns <- match(rownames(design_matrix),colnames(expression_matrix))
|
43
|
233 columns <- columns[!is.na(columns)]
|
25
|
234 read_counts <- expression_matrix[,columns]
|
|
235
|
44
|
236 ## 2) In the design matrix, you only want to have samples of which you really have the counts
|
|
237 columns <- which(rownames(design_matrix) %in% colnames(expression_matrix)) ## keep design-matrix row order so the rows line up with the columns of read_counts
|
|
238 columns <- columns[!is.na(columns)]
|
|
239 design_matrix <- design_matrix[columns,,drop=FALSE]
|
25
|
240
|
|
241 ## Filter for HTSeq predefined counts:
|
|
242 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
243 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
244
|
44
|
245 exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
246 exclude <- exclude[is.na(exclude)==0]
|
25
|
247 if(length(exclude) != 0) {
|
44
|
248 read_counts <- read_counts[-exclude,]
|
25
|
249 }
|
|
250
|
|
251
|
44
|
252 empty_samples <- apply(read_counts,2,function(x) sum(x) == 0)
|
25
|
253 if(sum(empty_samples) > 0) {
|
|
254 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
|
|
255 write(colnames(read_counts)[empty_samples],stderr())
|
|
256 } else {
|
|
257
|
|
258 dge <- DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
259
|
|
260 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
|
|
261 design_matrix_tmp <- design_matrix
|
|
262 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
|
|
263 design <- model.matrix(as.formula(formula),design_matrix_tmp)
|
|
264 rm(design_matrix_tmp)
|
|
265
|
|
266 # Filter prefixes
|
|
267 prefixes = colnames(design_matrix)[attr(design,"assign")]
|
|
268 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
269 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
270 replacements[avoid] = colnames(design)[avoid]
|
|
271 colnames(design) = replacements
|
|
272
|
|
273 # Do normalization
|
|
274 write("Calculating normalization factors...",stdout())
|
|
275 dge <- calcNormFactors(dge)
|
|
276 write("Estimating common dispersion...",stdout())
|
|
277 dge <- estimateGLMCommonDisp(dge,design)
|
|
278 write("Estimating trended dispersion...",stdout())
|
|
279 dge <- estimateGLMTrendedDisp(dge,design)
|
|
280 write("Estimating tagwise dispersion...",stdout())
|
|
281 dge <- estimateGLMTagwiseDisp(dge,design)
|
|
282
|
|
283
|
|
284 if(output_MDSplot != "/dev/null") {
|
|
285 write("Creating MDS plot",stdout())
|
|
286 ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
287 points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
288 dev.off()# Kill it
|
|
289
|
67
|
290 if(output_format_images == "pdf" || output_format_images == "png") {
|
55
|
291 pdf(output_MDSplot)
|
|
292 } else if(output_format_images == "svg") {
|
|
293 svg(output_MDSplot)
|
70
|
294 }
|
|
295 ## else {
|
67
|
296 ## png(output_MDSplot)
|
|
297 ##}
|
55
|
298
|
25
|
299 diff_x <- abs(max(points\$x)-min(points\$x))
|
|
300 diff_y <-(max(points\$y)-min(points\$y))
|
|
301 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
|
|
302 points(points\$x,points\$y,pch=20)
|
|
303 text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
|
|
304 rm(diff_x,diff_y)
|
|
305
|
|
306 dev.off()
|
|
307 }
|
|
308
|
|
309 if(output_BCVplot != "/dev/null") {
|
|
310 write("Creating Biological coefficient of variation plot",stdout())
|
60
|
311
|
67
|
312 if(output_format_images == "pdf" || output_format_images == "png") {
|
60
|
313 pdf(output_BCVplot)
|
|
314 } else if(output_format_images == "svg") {
|
|
315 svg(output_BCVplot)
|
70
|
316 }
|
|
317 ##else {
|
67
|
318 ## png(output_BCVplot)
|
|
319 ##}
|
60
|
320
|
25
|
321 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
322 dev.off()
|
|
323 }
|
|
324
|
|
325
|
|
326 write("Fitting GLM...",stdout())
|
|
327 fit <- glmFit(dge,design)
|
|
328
|
|
329 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
|
|
330 cont <- c(contrast)
|
|
331 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
332
|
|
333 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
334 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
|
|
335 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
|
|
336 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)
|
|
337
|
|
338 ## todo EXPORT FPKM
|
|
339 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)
|
|
340
|
34
|
341 if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
|
25
|
342 etable <- topTags(lrt, n=nrow(dge))\$table
|
|
343 etable <- etable[order(etable\$FDR), ]
|
32
|
344
|
|
345 if(output_MAplot != "/dev/null") {
|
|
346 write("Creating MA plot...",stdout())
|
60
|
347
|
67
|
348 if(output_format_images == "pdf" || output_format_images == "png") {
|
60
|
349 pdf(output_MAplot)
|
|
350 } else if(output_format_images == "svg") {
|
|
351 svg(output_MAplot)
|
70
|
352 }
|
|
353 ##else {
|
67
|
354 ## png(output_MAplot)
|
|
355 ##}
|
60
|
356
|
32
|
357 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
358 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
|
|
359 abline(h=c(-1,1), col="blue")
|
|
360 dev.off()
|
|
361 }
|
25
|
362
|
32
|
363 if(output_PValue_distribution_plot != "/dev/null") {
|
|
364 write("Creating P-value distribution plot...",stdout())
|
60
|
365
|
67
|
366 if(output_format_images == "pdf" || output_format_images == "png") {
|
60
|
367 pdf(output_PValue_distribution_plot)
|
|
368 } else if(output_format_images == "svg") {
|
|
369 svg(output_PValue_distribution_plot)
|
70
|
370 }
|
|
371 ##else {
|
67
|
372 ## png(output_PValue_distribution_plot)
|
|
373 ##}
|
60
|
374
|
32
|
375 expressed_genes <- subset(etable, PValue < 0.99)
|
|
376 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
|
|
377 center <- sum(h\$counts) / length(h\$counts)
|
|
378 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
|
|
379 k <- ksmooth(h\$mid, h\$counts)
|
|
380 lines(k\$x,k\$y,col="red",lwd=2)
|
|
381 rmsd <- (h\$counts) - center
|
|
382 rmsd <- rmsd^2
|
|
383 rmsd <- sum(rmsd)
|
|
384 rmsd <- sqrt(rmsd)
|
|
385 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
|
|
386 ## change e into epsilon somehow
|
|
387 dev.off()
|
|
388 }
|
40
|
389 }
|
|
390
|
|
391 if(output_heatmap_plot != "/dev/null") {
|
60
|
392
|
67
|
393 if(output_format_images == "pdf" || output_format_images == "png") {
|
60
|
394 pdf(output_heatmap_plot,width=10.5)
|
|
395 } else if(output_format_images == "svg") {
|
|
396 svg(output_heatmap_plot,width=10.5)
|
70
|
397 }
|
|
398 ## else {
|
67
|
399 ## png(output_heatmap_plot,width=10.5)
|
|
400 ##}
|
60
|
401
|
40
|
402 etable2 <- topTags(lrt, n=100)\$table
|
|
403 order <- rownames(etable2)
|
|
404 cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),]
|
|
405 heatmap(t(cpm_sub))
|
|
406 dev.off()
|
25
|
407 }
|
|
408
|
|
409 ##output_hierarchical_clustering_plot = args[13]
|
|
410
|
35
|
411 if(output_RData_obj != "/dev/null") {
|
25
|
412 save.image(output_RData_obj)
|
|
413 }
|
|
414
|
|
415 write("Done!",stdout())
|
|
416 }
|
|
417 </configfile>
|
|
418 </configfiles>
|
|
419
|
|
420 <outputs>
|
53
|
421 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" />
|
25
|
422 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
423
|
|
424 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
53
|
425 <filter>outputs and ("make_output_raw_counts" in outputs)</filter>
|
25
|
426 </data>
|
|
427
|
59
|
428 <data format="png" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
|
53
|
429 <filter>outputs and ("make_output_MDSplot" in outputs)</filter>
|
59
|
430
|
|
431 <change_format>
|
|
432 <when input="output_format_images" value="png" format="png" />
|
|
433 <when input="output_format_images" value="pdf" format="pdf" />
|
|
434 <when input="output_format_images" value="svg" format="svg" />
|
|
435 </change_format>
|
25
|
436 </data>
|
|
437
|
60
|
438 <data format="png" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
53
|
439 <filter>outputs and ("make_output_BCVplot" in outputs)</filter>
|
60
|
440
|
|
441 <change_format>
|
|
442 <when input="output_format_images" value="png" format="png" />
|
|
443 <when input="output_format_images" value="pdf" format="pdf" />
|
|
444 <when input="output_format_images" value="svg" format="svg" />
|
|
445 </change_format>
|
25
|
446 </data>
|
|
447
|
60
|
448 <data format="png" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
53
|
449 <filter>outputs and ("make_output_MAplot" in outputs)</filter>
|
60
|
450
|
|
451 <change_format>
|
|
452 <when input="output_format_images" value="png" format="png" />
|
|
453 <when input="output_format_images" value="pdf" format="pdf" />
|
|
454 <when input="output_format_images" value="svg" format="svg" />
|
|
455 </change_format>
|
25
|
456 </data>
|
|
457
|
60
|
458 <data format="png" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
53
|
459 <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter>
|
60
|
460
|
|
461 <change_format>
|
|
462 <when input="output_format_images" value="png" format="png" />
|
|
463 <when input="output_format_images" value="pdf" format="pdf" />
|
|
464 <when input="output_format_images" value="svg" format="svg" />
|
|
465 </change_format>
|
25
|
466 </data>
|
|
467
|
60
|
468 <data format="png" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical clustering">
|
53
|
469 <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
60
|
470
|
|
471 <change_format>
|
|
472 <when input="output_format_images" value="png" format="png" />
|
|
473 <when input="output_format_images" value="pdf" format="pdf" />
|
|
474 <when input="output_format_images" value="svg" format="svg" />
|
|
475 </change_format>
|
25
|
476 </data>
|
|
477
|
60
|
478 <data format="png" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
53
|
479 <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter>
|
60
|
480
|
|
481 <change_format>
|
|
482 <when input="output_format_images" value="png" format="png" />
|
|
483 <when input="output_format_images" value="pdf" format="pdf" />
|
|
484 <when input="output_format_images" value="svg" format="svg" />
|
|
485 </change_format>
|
25
|
486 </data>
|
|
487
|
|
488 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
53
|
489 <filter>outputs and ("make_output_RData_obj" in outputs)</filter>
|
25
|
490 </data>
|
|
491
|
40
|
492 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)" >
|
53
|
493 <filter>outputs and ("make_output_R_stdout" in outputs)</filter>
|
25
|
494 </data>
|
|
495 </outputs>
|
|
496
|
|
497 <help>
|
|
498 edgeR: Differential Gene(Expression) Analysis
|
36
|
499 #############################################
|
25
|
500
|
36
|
501 Overview
|
|
502 --------
|
|
503 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1].
|
25
|
504
|
|
505 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
36
|
506 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
25
|
507 and the limma manual.
|
|
508
|
|
509 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
510 This tool is called *edgeR Design Matrix Creator*.
|
|
511 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
512 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
513
|
|
514 If you have for example two groups, with an equal weight, you would like to compare either
|
|
515 "g1-g2" or "normal-cancer".
|
|
516
|
36
|
517 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates [2].
|
25
|
518
|
36
|
519 Input
|
|
520 -----
|
|
521 Expression matrix
|
|
522 ^^^^^^^^^^^^^^^^^
|
|
523 ::
|
25
|
524
|
|
525 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
526 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
527 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
528 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
529 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
530 [...]
|
|
531
|
36
|
532 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.*
|
25
|
533
|
36
|
534 Design matrix
|
|
535 ^^^^^^^^^^^^^
|
|
536 ::
|
25
|
537
|
|
538 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
539 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
540 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
541 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
542 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
543 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
544 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
545 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
546 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
547 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
548 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
549 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
550 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
551
|
36
|
552 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.*
|
25
|
553
|
36
|
554 Contrast
|
|
555 ^^^^^^^^
|
|
556 The contrast represents the biological question. There can be many questions asked, e.g.:
|
25
|
557
|
36
|
558 - Tumor-Normal
|
|
559 - African-European
|
|
560 - 0.5*(Control+Placebo) / Treated
|
25
|
561
|
36
|
562 Installation
|
|
563 ------------
|
25
|
564
|
|
565 This tool requires no specific configurations. The following dependencies are installed automatically:
|
36
|
566
|
|
567 - R
|
|
568 - Bioconductor
|
25
|
569 - limma
|
36
|
570
|
25
|
571 - edgeR
|
|
572
|
36
|
573 License
|
|
574 -------
|
|
575 - R
|
|
576 - GPL-2 & GPL-3
|
|
577 - limma
|
|
578 - GPL (>=2)
|
|
579 - edgeR
|
|
580 - GPL (>=2)
|
|
581
|
|
582 References
|
|
583 ----------
|
|
584
|
|
585 EdgeR
|
|
586 ^^^^^
|
|
587 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.**
|
25
|
588
|
36
|
589 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140.
|
|
590
|
|
591 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
592 - http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
593 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
25
|
594
|
36
|
595 Test-data (MCF7)
|
|
596 ^^^^^^^^^^^^^^^^
|
|
597 **[2] RNA-seq differential expression studies: more sequence or more replication?**
|
|
598
|
|
599 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304.
|
|
600
|
|
601 - http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
602 - http://dx.doi.org/10.1093/bioinformatics/btt688
|
|
603
|
|
604 Contact
|
|
605 -------
|
25
|
606 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
607 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
608
|
36
|
609 I would like to thank Hina Riaz - Naz Khan for her helpful contribution.
|
25
|
610
|
36
|
611 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
612 http://testtoolshed.g2.bx.psu.edu/
|
25
|
613 </help>
|
|
614 </tool>
|