25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
33
|
6 <requirement type="package" version="3.0.3">R</requirement>
|
29
|
7 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
25
|
8 </requirements>
|
|
9
|
|
10 <command>
|
|
11 <!--
|
|
12 The following script is written in the "Cheetah" language:
|
|
13 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
14 -->
|
|
15
|
|
16 R --vanilla --slave -f $R_script '--args
|
|
17 $expression_matrix
|
|
18 $design_matrix
|
|
19 $contrast
|
|
20
|
|
21 $fdr
|
|
22
|
|
23 $output_count_edgeR
|
|
24 $output_cpm
|
|
25
|
|
26 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
27
|
|
28 #if $output_raw_counts:
|
|
29 $output_raw_counts
|
|
30 #else:
|
|
31 /dev/null
|
|
32 #end if
|
|
33
|
|
34 #if $output_MDSplot:
|
|
35 $output_MDSplot
|
|
36 #else:
|
|
37 /dev/null
|
|
38 #end if
|
|
39
|
|
40 #if $output_BCVplot:
|
|
41 $output_BCVplot
|
|
42 #else:
|
|
43 /dev/null
|
|
44 #end if
|
|
45
|
|
46 #if $output_MAplot:
|
|
47 $output_MAplot
|
|
48 #else:
|
|
49 /dev/null
|
|
50 #end if
|
|
51
|
|
52 #if $output_PValue_distribution_plot:
|
|
53 $output_PValue_distribution_plot
|
|
54 #else:
|
|
55 /dev/null
|
|
56 #end if
|
|
57
|
|
58 #if $output_hierarchical_clustering_plot:
|
|
59 $output_hierarchical_clustering_plot
|
|
60 #else:
|
|
61 /dev/null
|
|
62 #end if
|
|
63
|
|
64 #if $output_heatmap_plot:
|
|
65 $output_heatmap_plot
|
|
66 #else:
|
|
67 /dev/null
|
|
68 #end if
|
|
69
|
|
70 #if $output_RData_obj:
|
|
71 $output_RData_obj
|
|
72 #else:
|
|
73 /dev/null
|
|
74 #end if
|
|
75 '
|
|
76 #if $output_R:
|
|
77 > $output_R
|
|
78 #else:
|
|
79 > /dev/null
|
|
80 #end if
|
|
81
|
|
82 2> stderr.txt
|
|
83 ;
|
|
84 grep -v 'Calculating library sizes from column' stderr.txt 1>&2
|
|
85
|
|
86 </command>
|
|
87
|
|
88 <inputs>
|
|
89 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
90 <param name="design_matrix" type="data" format="tabular" label="Design matrix" hepl="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />
|
|
91
|
|
92 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
93
|
|
94 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
95
|
|
96 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
97 <option value="make_output_raw_counts">Raw counts table</option>
|
|
98 <option value="make_output_MDSplot">MDS-plot</option>
|
|
99 <option value="make_output_BCVplot">BCV-plot</option>
|
|
100 <option value="make_output_MAplot">MA-plot</option>
|
|
101 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
102 <option value="make_output_hierarchical_clustering_plot">Hierarchical custering</option>
|
|
103 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
104
|
|
105 <option value="make_output_R">R stdout</option>
|
|
106 <option value="make_output_RData_obj">R Data object</option>
|
|
107 </param>
|
|
108 </inputs>
|
|
109
|
|
110 <configfiles>
|
|
111 <configfile name="R_script">
|
|
112 library(limma,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
113 library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
114 library(splines,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping
|
|
115
|
|
116 ## Fetch commandline arguments
|
|
117 args <- commandArgs(trailingOnly = TRUE)
|
|
118
|
|
119 expression_matrix_file = args[1]
|
|
120 design_matrix_file = args[2]
|
|
121 contrast = args[3]
|
|
122
|
|
123 fdr = args[4]
|
|
124
|
|
125 output_count_edgeR = args[5]
|
|
126 output_cpm = args[6]
|
|
127
|
|
128 output_xpkm = args[7] ##FPKM file - yet to be implemented
|
|
129
|
|
130 output_raw_counts = args[8]
|
|
131 output_MDSplot = args[9]
|
|
132 output_BCVplot = args[10]
|
|
133 output_MAplot = args[11]
|
|
134 output_PValue_distribution_plot = args[12]
|
|
135 output_hierarchical_clustering_plot = args[13]
|
|
136 output_heatmap_plot = args[14]
|
|
137 output_RData_obj = args[15]
|
|
138
|
|
139
|
|
140 library(edgeR)
|
|
141 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
142 ## Obtain read-counts
|
|
143
|
|
144 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
145 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
146
|
|
147 colnames(design_matrix) <- make.names(colnames(design_matrix))
|
|
148
|
|
149 for(i in 1:ncol(design_matrix)) {
|
|
150 old = design_matrix[,i]
|
|
151 design_matrix[,i] = make.names(design_matrix[,i])
|
|
152 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
|
|
153 print("Renaming of factors:")
|
|
154 print(old)
|
|
155 print("To:")
|
|
156 print(design_matrix[,i])
|
|
157 }
|
|
158 design_matrix[,i] <- as.factor(design_matrix[,i])
|
|
159 }
|
|
160
|
|
161
|
|
162 columns <- match(rownames(design_matrix),colnames(expression_matrix))
|
|
163 read_counts <- expression_matrix[,columns]
|
|
164
|
|
165
|
|
166 ## Filter for HTSeq predifined counts:
|
|
167 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
168 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
169
|
|
170 exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
171 exclude = exclude[is.na(exclude)==0]
|
|
172 if(length(exclude) != 0) {
|
|
173 read_counts = read_counts[-exclude,]
|
|
174 }
|
|
175
|
|
176
|
|
177 empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
|
|
178 if(sum(empty_samples) > 0) {
|
|
179 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
|
|
180 write(colnames(read_counts)[empty_samples],stderr())
|
|
181 } else {
|
|
182
|
|
183 dge <- DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
184
|
|
185 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
|
|
186 design_matrix_tmp <- design_matrix
|
|
187 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
|
|
188 design <- model.matrix(as.formula(formula),design_matrix_tmp)
|
|
189 rm(design_matrix_tmp)
|
|
190
|
|
191 # Filter prefixes
|
|
192 prefixes = colnames(design_matrix)[attr(design,"assign")]
|
|
193 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
194 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
195 replacements[avoid] = colnames(design)[avoid]
|
|
196 colnames(design) = replacements
|
|
197
|
|
198 # Do normalization
|
|
199 write("Calculating normalization factors...",stdout())
|
|
200 dge <- calcNormFactors(dge)
|
|
201 write("Estimating common dispersion...",stdout())
|
|
202 dge <- estimateGLMCommonDisp(dge,design)
|
|
203 write("Estimating trended dispersion...",stdout())
|
|
204 dge <- estimateGLMTrendedDisp(dge,design)
|
|
205 write("Estimating tagwise dispersion...",stdout())
|
|
206 dge <- estimateGLMTagwiseDisp(dge,design)
|
|
207
|
|
208
|
|
209 if(output_MDSplot != "/dev/null") {
|
|
210 write("Creating MDS plot",stdout())
|
|
211 ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
212 points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
213 dev.off()# Kill it
|
|
214
|
|
215 pdf(output_MDSplot)
|
|
216 diff_x <- abs(max(points\$x)-min(points\$x))
|
|
217 diff_y <-(max(points\$y)-min(points\$y))
|
|
218 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
|
|
219 points(points\$x,points\$y,pch=20)
|
|
220 text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
|
|
221 rm(diff_x,diff_y)
|
|
222
|
|
223 dev.off()
|
|
224 }
|
|
225
|
|
226 if(output_BCVplot != "/dev/null") {
|
|
227 write("Creating Biological coefficient of variation plot",stdout())
|
|
228 pdf(output_BCVplot)
|
|
229 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
230 dev.off()
|
|
231 }
|
|
232
|
|
233
|
|
234 write("Fitting GLM...",stdout())
|
|
235 fit <- glmFit(dge,design)
|
|
236
|
|
237 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
|
|
238 cont <- c(contrast)
|
|
239 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
240
|
|
241 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
242 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
|
|
243 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
|
|
244 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)
|
|
245
|
|
246 ## todo EXPORT FPKM
|
|
247 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)
|
|
248
|
|
249
|
34
|
250 if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
|
25
|
251 etable <- topTags(lrt, n=nrow(dge))\$table
|
|
252 etable <- etable[order(etable\$FDR), ]
|
32
|
253
|
|
254 if(output_MAplot != "/dev/null") {
|
|
255 write("Creating MA plot...",stdout())
|
|
256 pdf(output_MAplot)
|
|
257 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
258 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
|
|
259 abline(h=c(-1,1), col="blue")
|
|
260 dev.off()
|
|
261 }
|
25
|
262
|
32
|
263 if(output_PValue_distribution_plot != "/dev/null") {
|
|
264 write("Creating P-value distribution plot...",stdout())
|
|
265 pdf(output_PValue_distribution_plot)
|
|
266 expressed_genes <- subset(etable, PValue < 0.99)
|
|
267 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
|
|
268 center <- sum(h\$counts) / length(h\$counts)
|
|
269 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
|
|
270 k <- ksmooth(h\$mid, h\$counts)
|
|
271 lines(k\$x,k\$y,col="red",lwd=2)
|
|
272 rmsd <- (h\$counts) - center
|
|
273 rmsd <- rmsd^2
|
|
274 rmsd <- sum(rmsd)
|
|
275 rmsd <- sqrt(rmsd)
|
|
276 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
|
|
277 ## change e into epsilon somehow
|
|
278 dev.off()
|
|
279 }
|
25
|
280 }
|
|
281
|
|
282 ##output_hierarchical_clustering_plot = args[13]
|
|
283 ##output_heatmap_plot = args[14]
|
|
284
|
35
|
285 if(output_RData_obj != "/dev/null") {
|
25
|
286 save.image(output_RData_obj)
|
|
287 }
|
|
288
|
|
289 write("Done!",stdout())
|
|
290 }
|
|
291 </configfile>
|
|
292 </configfiles>
|
|
293
|
|
294 <outputs>
|
|
295 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differtially expressed genes" />
|
|
296 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
297
|
|
298 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
|
299 <filter>("make_output_raw_counts" in outputs)</filter>
|
|
300 </data>
|
|
301
|
|
302 <data format="pdf" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
|
|
303 <filter>("make_output_MDSplot" in outputs)</filter>
|
|
304 </data>
|
|
305
|
|
306 <data format="pdf" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
|
307 <filter>("make_output_BCVplot" in outputs)</filter>
|
|
308 </data>
|
|
309
|
|
310 <data format="pdf" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
|
311 <filter>("make_output_MAplot" in outputs)</filter>
|
|
312 </data>
|
|
313
|
|
314 <data format="pdf" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
|
315 <filter>("make_output_PValue_distribution_plot" in outputs)</filter>
|
|
316 </data>
|
|
317
|
|
318 <data format="pdf" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical custering">
|
|
319 <filter>("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
|
320 </data>
|
|
321
|
|
322 <data format="pdf" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
|
323 <filter>("make_output_heatmap_plot" in outputs)</filter>
|
|
324 </data>
|
|
325
|
|
326 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
|
327 <filter>("make_output_R" in outputs)</filter>
|
|
328 </data>
|
|
329
|
|
330 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output" >
|
|
331 <filter>("make_output_RData_obj" in outputs)</filter>
|
|
332 </data>
|
|
333 </outputs>
|
|
334
|
|
335 <help>
|
|
336 edgeR: Differential Gene(Expression) Analysis
|
|
337
|
|
338 **Overview**
|
|
339
|
|
340 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data.
|
|
341 Author: Mark Robinson, Davis McCarthy, Yunshun Chen, Aaron Lun & Gordon Smyth
|
|
342
|
|
343 http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
344 http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
345
|
|
346 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
|
347 More details on this are given in the edgeR manual:
|
|
348 http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
|
349 and the limma manual.
|
|
350
|
|
351 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
352 This tool is called *edgeR Design Matrix Creator*.
|
|
353 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
354 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
355
|
|
356 If you have for example two groups, with an equal weight, you would like to compare either
|
|
357 "g1~g2" or "normal~cancer".
|
|
358
|
|
359 ** Input **
|
|
360
|
|
361 Expression matrix::
|
|
362
|
|
363 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
364 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
365 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
366 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
367 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
368 [...]
|
|
369
|
|
370 Note: Make sure the number of columns in the header is identical to the number of columns in the body.
|
|
371
|
|
372 Design matrix::
|
|
373
|
|
374 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
375 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
376 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
377 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
378 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
379 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
380 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
381 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
382 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
383 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
384 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
385 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
386 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
387
|
|
388 Note: Avoid factor names that are (1) numerical, (2) contain mathematical symbols and preferebly only use letters.
|
|
389
|
|
390 Contrast::
|
|
391
|
|
392 Tumor-Normal
|
|
393 African-European
|
|
394 0.5*(Control+Placebo) / Treated
|
|
395
|
|
396
|
|
397 **Installation**
|
|
398
|
|
399 This tool requires no specific configurations. The following dependencies are installed automatically:
|
|
400 * R
|
|
401 * Bioconductor
|
|
402 - limma
|
|
403 - edgeR
|
|
404
|
|
405 **License**
|
|
406 - R - GPL-2 & GPL-3
|
|
407 - limma - GPL (>=2)
|
|
408 - edgeR - GPL (>=2)
|
|
409
|
|
410 **Contact**
|
|
411
|
|
412 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
413 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
414
|
|
415 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
416 http://toolshed.nbic.nl/
|
|
417
|
|
418 **References**
|
|
419
|
|
420 The test data is coming from: doi: 10.1093/bioinformatics/btt688.
|
|
421 http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
422
|
|
423 </help>
|
|
424 </tool>
|