25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
29
|
6 <requirement type="package" version="3.0.3">package_r_3_0_3</requirement>
|
|
7 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
25
|
8 </requirements>
|
|
9
|
|
10 <command>
|
|
11 <!--
|
|
12 The following script is written in the "Cheetah" language:
|
|
13 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
14 -->
|
|
15
|
|
16 R --vanilla --slave -f $R_script '--args
|
|
17 $expression_matrix
|
|
18 $design_matrix
|
|
19 $contrast
|
|
20
|
|
21 $fdr
|
|
22
|
|
23 $output_count_edgeR
|
|
24 $output_cpm
|
|
25
|
|
26 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
27
|
|
28 #if $output_raw_counts:
|
|
29 $output_raw_counts
|
|
30 #else:
|
|
31 /dev/null
|
|
32 #end if
|
|
33
|
|
34 #if $output_MDSplot:
|
|
35 $output_MDSplot
|
|
36 #else:
|
|
37 /dev/null
|
|
38 #end if
|
|
39
|
|
40 #if $output_BCVplot:
|
|
41 $output_BCVplot
|
|
42 #else:
|
|
43 /dev/null
|
|
44 #end if
|
|
45
|
|
46 #if $output_MAplot:
|
|
47 $output_MAplot
|
|
48 #else:
|
|
49 /dev/null
|
|
50 #end if
|
|
51
|
|
52 #if $output_PValue_distribution_plot:
|
|
53 $output_PValue_distribution_plot
|
|
54 #else:
|
|
55 /dev/null
|
|
56 #end if
|
|
57
|
|
58 #if $output_hierarchical_clustering_plot:
|
|
59 $output_hierarchical_clustering_plot
|
|
60 #else:
|
|
61 /dev/null
|
|
62 #end if
|
|
63
|
|
64 #if $output_heatmap_plot:
|
|
65 $output_heatmap_plot
|
|
66 #else:
|
|
67 /dev/null
|
|
68 #end if
|
|
69
|
|
70 #if $output_RData_obj:
|
|
71 $output_RData_obj
|
|
72 #else:
|
|
73 /dev/null
|
|
74 #end if
|
|
75 '
|
|
76 #if $output_R:
|
|
77 > $output_R
|
|
78 #else:
|
|
79 > /dev/null
|
|
80 #end if
|
|
81
|
|
82 2> stderr.txt
|
|
83 ;
|
|
84 grep -v 'Calculating library sizes from column' stderr.txt 1>&2
|
|
85
|
|
86 </command>
|
|
87
|
|
88 <inputs>
|
|
89 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
90 <param name="design_matrix" type="data" format="tabular" label="Design matrix" hepl="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />
|
|
91
|
|
92 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
93
|
|
94 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
95
|
|
96 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
97 <option value="make_output_raw_counts">Raw counts table</option>
|
|
98 <option value="make_output_MDSplot">MDS-plot</option>
|
|
99 <option value="make_output_BCVplot">BCV-plot</option>
|
|
100 <option value="make_output_MAplot">MA-plot</option>
|
|
101 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
102 <option value="make_output_hierarchical_clustering_plot">Hierarchical custering</option>
|
|
103 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
104
|
|
105 <option value="make_output_R">R stdout</option>
|
|
106 <option value="make_output_RData_obj">R Data object</option>
|
|
107 </param>
|
|
108 </inputs>
|
|
109
|
|
110 <configfiles>
|
|
111 <configfile name="R_script">
|
|
112 library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
113 library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
114 library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
115
|
|
116 ## Fetch commandline arguments
|
|
117 args <- commandArgs(trailingOnly = TRUE)
|
|
118
|
|
119 expression_matrix_file = args[1]
|
|
120 design_matrix_file = args[2]
|
|
121 contrast = args[3]
|
|
122
|
|
123 fdr = args[4]
|
|
124
|
|
125 output_count_edgeR = args[5]
|
|
126 output_cpm = args[6]
|
|
127
|
|
128 output_xpkm = args[7] ##FPKM file - yet to be implemented
|
|
129
|
|
130 output_raw_counts = args[8]
|
|
131 output_MDSplot = args[9]
|
|
132 output_BCVplot = args[10]
|
|
133 output_MAplot = args[11]
|
|
134 output_PValue_distribution_plot = args[12]
|
|
135 output_hierarchical_clustering_plot = args[13]
|
|
136 output_heatmap_plot = args[14]
|
|
137 output_RData_obj = args[15]
|
|
138
|
|
139
|
|
140 library(edgeR)
|
|
141 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
142 ## Obtain read-counts
|
|
143
|
|
144 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
145 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
146
|
|
147 colnames(design_matrix) <- make.names(colnames(design_matrix))
|
|
148
|
|
149 for(i in 1:ncol(design_matrix)) {
|
|
150 old = design_matrix[,i]
|
|
151 design_matrix[,i] = make.names(design_matrix[,i])
|
|
152 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
|
|
153 print("Renaming of factors:")
|
|
154 print(old)
|
|
155 print("To:")
|
|
156 print(design_matrix[,i])
|
|
157 }
|
|
158 design_matrix[,i] <- as.factor(design_matrix[,i])
|
|
159 }
|
|
160
|
|
161
|
|
162 columns <- match(rownames(design_matrix),colnames(expression_matrix))
|
|
163 read_counts <- expression_matrix[,columns]
|
|
164
|
|
165
|
|
166 ## Filter for HTSeq predifined counts:
|
|
167 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
168 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
169
|
|
170 exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
171 exclude = exclude[is.na(exclude)==0]
|
|
172 if(length(exclude) != 0) {
|
|
173 read_counts = read_counts[-exclude,]
|
|
174 }
|
|
175
|
|
176
|
|
177 empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
|
|
178 if(sum(empty_samples) > 0) {
|
|
179 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
|
|
180 write(colnames(read_counts)[empty_samples],stderr())
|
|
181 } else {
|
|
182
|
|
183 dge <- DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
184
|
|
185 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
|
|
186 design_matrix_tmp <- design_matrix
|
|
187 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
|
|
188 design <- model.matrix(as.formula(formula),design_matrix_tmp)
|
|
189 rm(design_matrix_tmp)
|
|
190
|
|
191 # Filter prefixes
|
|
192 prefixes = colnames(design_matrix)[attr(design,"assign")]
|
|
193 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
194 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
195 replacements[avoid] = colnames(design)[avoid]
|
|
196 colnames(design) = replacements
|
|
197
|
|
198 # Do normalization
|
|
199 write("Calculating normalization factors...",stdout())
|
|
200 dge <- calcNormFactors(dge)
|
|
201 write("Estimating common dispersion...",stdout())
|
|
202 dge <- estimateGLMCommonDisp(dge,design)
|
|
203 write("Estimating trended dispersion...",stdout())
|
|
204 dge <- estimateGLMTrendedDisp(dge,design)
|
|
205 write("Estimating tagwise dispersion...",stdout())
|
|
206 dge <- estimateGLMTagwiseDisp(dge,design)
|
|
207
|
|
208
|
|
209 if(output_MDSplot != "/dev/null") {
|
|
210 write("Creating MDS plot",stdout())
|
|
211 ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
212 points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
213 dev.off()# Kill it
|
|
214
|
|
215 pdf(output_MDSplot)
|
|
216 diff_x <- abs(max(points\$x)-min(points\$x))
|
|
217 diff_y <-(max(points\$y)-min(points\$y))
|
|
218 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
|
|
219 points(points\$x,points\$y,pch=20)
|
|
220 text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
|
|
221 rm(diff_x,diff_y)
|
|
222
|
|
223 dev.off()
|
|
224 }
|
|
225
|
|
226 if(output_BCVplot != "/dev/null") {
|
|
227 write("Creating Biological coefficient of variation plot",stdout())
|
|
228 pdf(output_BCVplot)
|
|
229 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
230 dev.off()
|
|
231 }
|
|
232
|
|
233
|
|
234 write("Fitting GLM...",stdout())
|
|
235 fit <- glmFit(dge,design)
|
|
236
|
|
237 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
|
|
238 cont <- c(contrast)
|
|
239 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
240
|
|
241 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
242 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
|
|
243 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
|
|
244 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)
|
|
245
|
|
246 ## todo EXPORT FPKM
|
|
247 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)
|
|
248
|
|
249
|
|
250 if(output_MAplot != "/dev/null") {
|
|
251 write("Creating MA plot...",stdout())
|
|
252
|
|
253 etable <- topTags(lrt, n=nrow(dge))\$table
|
|
254 etable <- etable[order(etable\$FDR), ]
|
|
255 pdf(output_MAplot)
|
|
256 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
257 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
|
|
258 abline(h=c(-1,1), col="blue")
|
|
259 dev.off()
|
|
260 }
|
|
261
|
|
262 if(output_PValue_distribution_plot != "/dev/null") {
|
|
263 write("Creating P-value distribution plot...",stdout())
|
|
264 pdf(output_PValue_distribution_plot)
|
|
265 expressed_genes <- subset(etable, PValue < 0.99)
|
|
266 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
|
|
267 center <- sum(h\$counts) / length(h\$counts)
|
|
268 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
|
|
269 k <- ksmooth(h\$mid, h\$counts)
|
|
270 lines(k\$x,k\$y,col="red",lwd=2)
|
|
271 rmsd <- (h\$counts) - center
|
|
272 rmsd <- rmsd^2
|
|
273 rmsd <- sum(rmsd)
|
|
274 rmsd <- sqrt(rmsd)
|
|
275 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
|
|
276 ## change e into epsilon somehow
|
|
277 dev.off()
|
|
278 }
|
|
279
|
|
280 ##output_hierarchical_clustering_plot = args[13]
|
|
281 ##output_heatmap_plot = args[14]
|
|
282
|
|
283 if(output_RData_obj != "/dev/null/") {
|
|
284 save.image(output_RData_obj)
|
|
285 }
|
|
286
|
|
287 write("Done!",stdout())
|
|
288 }
|
|
289 </configfile>
|
|
290 </configfiles>
|
|
291
|
|
292 <outputs>
|
|
293 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differtially expressed genes" />
|
|
294 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
295
|
|
296 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
|
297 <filter>("make_output_raw_counts" in outputs)</filter>
|
|
298 </data>
|
|
299
|
|
300 <data format="pdf" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
|
|
301 <filter>("make_output_MDSplot" in outputs)</filter>
|
|
302 </data>
|
|
303
|
|
304 <data format="pdf" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
|
305 <filter>("make_output_BCVplot" in outputs)</filter>
|
|
306 </data>
|
|
307
|
|
308 <data format="pdf" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
|
309 <filter>("make_output_MAplot" in outputs)</filter>
|
|
310 </data>
|
|
311
|
|
312 <data format="pdf" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
|
313 <filter>("make_output_PValue_distribution_plot" in outputs)</filter>
|
|
314 </data>
|
|
315
|
|
316 <data format="pdf" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical custering">
|
|
317 <filter>("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
|
318 </data>
|
|
319
|
|
320 <data format="pdf" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
|
321 <filter>("make_output_heatmap_plot" in outputs)</filter>
|
|
322 </data>
|
|
323
|
|
324 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
|
325 <filter>("make_output_R" in outputs)</filter>
|
|
326 </data>
|
|
327
|
|
328 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output" >
|
|
329 <filter>("make_output_RData_obj" in outputs)</filter>
|
|
330 </data>
|
|
331 </outputs>
|
|
332
|
|
333 <help>
|
|
334 edgeR: Differential Gene(Expression) Analysis
|
|
335
|
|
336 **Overview**
|
|
337
|
|
338 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data.
|
|
339 Author: Mark Robinson, Davis McCarthy, Yunshun Chen, Aaron Lun & Gordon Smyth
|
|
340
|
|
341 http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
342 http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
343
|
|
344 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
|
345 More details on this are given in the edgeR manual:
|
|
346 http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
|
347 and the limma manual.
|
|
348
|
|
349 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
350 This tool is called *edgeR Design Matrix Creator*.
|
|
351 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
352 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
353
|
|
354 If you have for example two groups, with an equal weight, you would like to compare either
|
|
355 "g1~g2" or "normal~cancer".
|
|
356
|
|
357 ** Input **
|
|
358
|
|
359 Expression matrix::
|
|
360
|
|
361 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
362 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
363 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
364 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
365 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
366 [...]
|
|
367
|
|
368 Note: Make sure the number of columns in the header is identical to the number of columns in the body.
|
|
369
|
|
370 Design matrix::
|
|
371
|
|
372 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
373 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
374 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
375 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
376 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
377 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
378 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
379 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
380 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
381 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
382 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
383 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
384 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
385
|
|
386 Note: Avoid factor names that are (1) numerical, (2) contain mathematical symbols and preferebly only use letters.
|
|
387
|
|
388 Contrast::
|
|
389
|
|
390 Tumor-Normal
|
|
391 African-European
|
|
392 0.5*(Control+Placebo) / Treated
|
|
393
|
|
394
|
|
395 **Installation**
|
|
396
|
|
397 This tool requires no specific configurations. The following dependencies are installed automatically:
|
|
398 * R
|
|
399 * Bioconductor
|
|
400 - limma
|
|
401 - edgeR
|
|
402
|
|
403 **License**
|
|
404 - R - GPL-2 & GPL-3
|
|
405 - limma - GPL (>=2)
|
|
406 - edgeR - GPL (>=2)
|
|
407
|
|
408 **Contact**
|
|
409
|
|
410 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
411 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
412
|
|
413 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
414 http://toolshed.nbic.nl/
|
|
415
|
|
416 **References**
|
|
417
|
|
418 The test data is coming from: doi: 10.1093/bioinformatics/btt688.
|
|
419 http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
420
|
|
421 </help>
|
|
422 </tool>
|