25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
62
|
6 <!--<requirement type="package" version="3.0.1">package_r3_withx</requirement>-->
|
67
|
7 <!--<requirement type="package" version="3.1.0">R</requirement>-->
|
|
8 <requirement type="package" version="3.0.3">R</requirement>
|
29
|
9 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
25
|
10 </requirements>
|
|
11
|
|
<command><![CDATA[
    ## This command is a template written in the "Cheetah" language:
    ## http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
    ##
    ## The R script receives 16 positional arguments, in the order listed
    ## below. Optional outputs that were not requested are replaced by
    ## /dev/null so that the argument positions stay fixed.

    R --vanilla --slave -f $R_script '--args
        $expression_matrix
        $design_matrix
        $contrast

        $fdr

        $output_count_edgeR
        $output_cpm

        ## Argument 7 is reserved for the FPKM/RPKM table (not implemented yet)
        /dev/null

        #if $output_raw_counts:
            $output_raw_counts
        #else:
            /dev/null
        #end if

        #if $output_MDSplot:
            $output_MDSplot
        #else:
            /dev/null
        #end if

        #if $output_BCVplot:
            $output_BCVplot
        #else:
            /dev/null
        #end if

        #if $output_MAplot:
            $output_MAplot
        #else:
            /dev/null
        #end if

        #if $output_PValue_distribution_plot:
            $output_PValue_distribution_plot
        #else:
            /dev/null
        #end if

        #if $output_hierarchical_clustering_plot:
            $output_hierarchical_clustering_plot
        #else:
            /dev/null
        #end if

        #if $output_heatmap_plot:
            $output_heatmap_plot
        #else:
            /dev/null
        #end if

        #if $output_RData_obj:
            $output_RData_obj
        #else:
            /dev/null
        #end if

        $output_format_images
    '
    #if $output_R:
        > $output_R
    #else:
        > /dev/null
    #end if

    2> stderr.txt ;

    ## The R script renders PDF when "png" is requested, so every plot that
    ## was actually produced is converted in place afterwards. The explicit
    ## "pdf:" / "png:" prefixes tell ImageMagick the formats, because the
    ## dataset paths do not necessarily carry a meaningful extension.
    ## (The hierarchical clustering plot is not listed: the R script does
    ## not write it.)
    #if $output_format_images == "png":
        #for $plot in [$output_MDSplot, $output_BCVplot, $output_MAplot, $output_PValue_distribution_plot, $output_heatmap_plot]:
            #if $plot:
                convert "pdf:${plot}" "png:${plot}.png" && mv "${plot}.png" "${plot}" ;
            #end if
        #end for
    #end if

    ## Drop harmless progress / locale messages from stderr in a single
    ## pass, so that only real problems are reported back to Galaxy.
    grep -v
        -e 'Calculating library sizes from column'
        -e 'During startup - Warning messages'
        -e 'Setting LC_TIME failed'
        -e 'Setting LC_MONETARY failed'
        -e 'Setting LC_PAPER failed'
        -e 'Setting LC_MEASUREMENT failed'
        -e 'Setting LC_CTYPE failed'
        -e 'Setting LC_COLLATE failed'
        stderr.txt > stderr2.txt ;
    mv stderr2.txt stderr.txt ;

    cat stderr.txt >&2
]]></command>
|
|
107
|
|
<inputs>
    <!-- Read-count table: one row per gene, one column per sample. -->
    <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />

    <!-- Sample annotation: one row per sample, one column per factor.
         The attribute must be spelled "help" for the text to be displayed. -->
    <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />

    <!-- Expression handed to limma's makeContrasts() by the R script. -->
    <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />

    <!-- Threshold used to highlight significant genes in the MA-plot. -->
    <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />

    <!-- Each option value is tested by the <filter> of the matching output dataset. -->
    <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
        <option value="make_output_raw_counts">Raw counts table</option>
        <option value="make_output_MDSplot">MDS-plot</option>
        <option value="make_output_BCVplot">BCV-plot</option>
        <option value="make_output_MAplot">MA-plot</option>
        <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
        <option value="make_output_hierarchical_clustering_plot">Hierarchical custering</option>
        <option value="make_output_heatmap_plot">Heatmap</option>

        <option value="make_output_R_stdout">R stdout</option>
        <option value="make_output_RData_obj">R Data object</option>
    </param>

    <!-- Drives the <change_format> rules of all plot outputs. -->
    <param name="output_format_images" type="select" label="Output format of images" display="radio">
        <option value="png">Portable network graphics (.png)</option>
        <option value="pdf">Portable document format (.pdf)</option>
        <option value="svg">Scalable vector graphics (.svg)</option>
    </param>
</inputs>
|
|
135
|
|
<configfiles>
    <configfile name="R_script"><![CDATA[
## This file is a Cheetah template that expands to an R script.
##  - Text from a double hash to the end of the line is a Cheetah comment
##    and is stripped before R sees the script.
##  - Every dollar sign that belongs to R is escaped with a backslash.
##
## The script fits an edgeR GLM on the supplied count table and design
## matrix, tests the requested contrast with a likelihood ratio test and
## writes the result tables plus the optional plots. Optional outputs
## that were not requested arrive as "/dev/null".

library(limma,quietly=TRUE)    ## quietly: avoid unnecessary stderr output
library(edgeR,quietly=TRUE)
library(splines,quietly=TRUE)

## Fetch commandline arguments (positional, see the command section)
args <- commandArgs(trailingOnly = TRUE)

expression_matrix_file = args[1]
design_matrix_file = args[2]
contrast = args[3]

## Arguments arrive as character strings; without the conversion the
## test "FDR < fdr" further down would be a string comparison.
fdr = as.numeric(args[4])

output_count_edgeR = args[5]
output_cpm = args[6]

output_xpkm = args[7]    ## FPKM file - yet to be implemented

output_raw_counts = args[8]
output_MDSplot = args[9]
output_BCVplot = args[10]
output_MAplot = args[11]
output_PValue_distribution_plot = args[12]
output_hierarchical_clustering_plot = args[13]    ## not implemented: no plot is written
output_heatmap_plot = args[14]
output_RData_obj = args[15]
output_format_images = args[16]

## Open the graphics device matching the requested image format.
## "png" is rendered as PDF here; any further arguments (e.g. width) are
## passed on to the device function.
open_plot_device <- function(path, ...) {
    if(output_format_images == "pdf" || output_format_images == "png") {
        pdf(path, ...)
    } else if(output_format_images == "svg") {
        svg(path, ...)
    }
}

## Obtain read-counts and sample annotation
expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))

colnames(design_matrix) <- make.names(colnames(design_matrix))

## Make all factor levels syntactically valid names and report any renaming
for(i in 1:ncol(design_matrix)) {
    old = design_matrix[,i]
    design_matrix[,i] = make.names(design_matrix[,i])
    if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
        print("Renaming of factors:")
        print(old)
        print("To:")
        print(design_matrix[,i])
    }
    ## The following line seems to malfunction the script:
    ##design_matrix[,i] <- as.factor(design_matrix[,i])
}

## 1) In the expression matrix, you only want to have the samples described in the design matrix
columns <- match(rownames(design_matrix),colnames(expression_matrix))
columns <- columns[!is.na(columns)]
read_counts <- expression_matrix[,columns,drop=FALSE]

## 2) In the design matrix, you only want to have samples of which you really have the counts.
##    Match against read_counts (not the unfiltered expression matrix) so that
##    row i of the design matrix describes column i of read_counts.
columns <- match(colnames(read_counts),rownames(design_matrix))
columns <- columns[!is.na(columns)]
design_matrix <- design_matrix[columns,,drop=FALSE]

## Filter for HTSeq / DEXSeq predefined counters
exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")

exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
exclude <- exclude[is.na(exclude)==0]
if(length(exclude) != 0) {
    read_counts <- read_counts[-exclude,]
}

empty_samples <- apply(read_counts,2,function(x) sum(x) == 0)
if(sum(empty_samples) > 0) {
    write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
    write(colnames(read_counts)[empty_samples],stderr())
} else {

    dge <- DGEList(counts=read_counts,genes=rownames(read_counts))

    formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
    design_matrix_tmp <- design_matrix
    colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
    design <- model.matrix(as.formula(formula),design_matrix_tmp)
    rm(design_matrix_tmp)

    ## Filter prefixes: strip the factor name that model.matrix() prepends
    ## to each level, unless the column name consists of the factor name only
    prefixes = colnames(design_matrix)[attr(design,"assign")]
    avoid = nchar(prefixes) == nchar(colnames(design))
    replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
    replacements[avoid] = colnames(design)[avoid]
    colnames(design) = replacements

    ## Do normalization
    write("Calculating normalization factors...",stdout())
    dge <- calcNormFactors(dge)
    write("Estimating common dispersion...",stdout())
    dge <- estimateGLMCommonDisp(dge,design)
    write("Estimating trended dispersion...",stdout())
    dge <- estimateGLMTrendedDisp(dge,design)
    write("Estimating tagwise dispersion...",stdout())
    dge <- estimateGLMTagwiseDisp(dge,design)

    if(output_MDSplot != "/dev/null") {
        write("Creating MDS plot",stdout())
        ## Draw once on the default device only to obtain the coordinates,
        ## then discard that plot and redraw it with readable labels
        points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))
        dev.off()

        open_plot_device(output_MDSplot)

        diff_x <- abs(max(points\$x)-min(points\$x))
        diff_y <-(max(points\$y)-min(points\$y))
        plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
        points(points\$x,points\$y,pch=20)
        text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
        rm(diff_x,diff_y)

        dev.off()
    }

    if(output_BCVplot != "/dev/null") {
        write("Creating Biological coefficient of variation plot",stdout())

        open_plot_device(output_BCVplot)

        plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
        dev.off()
    }

    write("Fitting GLM...",stdout())
    fit <- glmFit(dge,design)

    write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
    cont <- c(contrast)
    cont <- makeContrasts(contrasts=cont, levels=design)

    lrt <- glmLRT(fit, contrast=cont[,1])
    write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
    write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
    write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)

    ## todo EXPORT FPKM
    write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)

    if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
        etable <- topTags(lrt, n=nrow(dge))\$table
        etable <- etable[order(etable\$FDR), ]

        if(output_MAplot != "/dev/null") {
            write("Creating MA plot...",stdout())

            open_plot_device(output_MAplot)

            with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
            with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
            abline(h=c(-1,1), col="blue")
            dev.off()
        }

        if(output_PValue_distribution_plot != "/dev/null") {
            write("Creating P-value distribution plot...",stdout())

            open_plot_device(output_PValue_distribution_plot)

            expressed_genes <- subset(etable, PValue < 0.99)
            h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
            ## Dashed line: mean bin height
            center <- sum(h\$counts) / length(h\$counts)
            lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
            k <- ksmooth(h\$mid, h\$counts)
            lines(k\$x,k\$y,col="red",lwd=2)
            ## Root of the summed squared deviations of the bin heights from their mean
            rmsd <- sqrt(sum(((h\$counts) - center)^2))
            text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
            ## change e into epsilon somehow
            dev.off()
        }
    }

    if(output_heatmap_plot != "/dev/null") {

        open_plot_device(output_heatmap_plot,width=10.5)

        etable2 <- topTags(lrt, n=100)\$table
        ## Row names of the table are the gene identifiers; select the CPM
        ## rows by name (converting the names to numbers would yield NA).
        top_genes <- rownames(etable2)
        cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[top_genes,]
        heatmap(t(cpm_sub))
        dev.off()
    }

    if(output_RData_obj != "/dev/null") {
        save.image(output_RData_obj)
    }

    write("Done!",stdout())
}
    ]]></configfile>
</configfiles>
|
|
372
|
|
<outputs>
    <!-- Tables that are always produced -->
    <data name="output_count_edgeR" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" />
    <data name="output_cpm" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />

    <!-- Optional table: only created when ticked in the "outputs" parameter -->
    <data name="output_raw_counts" format="tabular" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
        <filter>outputs and ("make_output_raw_counts" in outputs)</filter>
    </data>

    <!-- Optional plots: each one is created only when ticked in the "outputs"
         parameter, and its datatype follows the "output_format_images" choice. -->
    <data name="output_MDSplot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
        <filter>outputs and ("make_output_MDSplot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <data name="output_BCVplot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
        <filter>outputs and ("make_output_BCVplot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <data name="output_MAplot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
        <filter>outputs and ("make_output_MAplot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <data name="output_PValue_distribution_plot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
        <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <!-- NOTE(review): the R script in this file does not appear to write this
         plot; confirm before relying on the dataset having content. -->
    <data name="output_hierarchical_clustering_plot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical custering">
        <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <data name="output_heatmap_plot" format="png" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
        <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter>
        <change_format>
            <when input="output_format_images" value="png" format="png" />
            <when input="output_format_images" value="pdf" format="pdf" />
            <when input="output_format_images" value="svg" format="svg" />
        </change_format>
    </data>

    <!-- Optional R workspace image and the captured R stdout (for debugging) -->
    <data name="output_RData_obj" format="RData" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
        <filter>outputs and ("make_output_RData_obj" in outputs)</filter>
    </data>

    <data name="output_R" format="txt" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)">
        <filter>outputs and ("make_output_R_stdout" in outputs)</filter>
    </data>
</outputs>
|
|
449
|
|
450 <help>
|
|
451 edgeR: Differential Gene(Expression) Analysis
|
36
|
452 #############################################
|
25
|
453
|
36
|
454 Overview
|
|
455 --------
|
|
456 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1].
|
25
|
457
|
|
458 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
36
|
459 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
25
|
460 and the limma manual.
|
|
461
|
|
462 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
463 This tool is called *edgeR Design Matrix Creator*.
|
|
464 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
465 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
466
|
|
467 If you have, for example, two groups with equal weight, you would compare them with a contrast such as
|
|
468 "g1-g2" or "normal-cancer".
|
|
469
|
36
|
470 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates [2].
|
25
|
471
|
36
|
472 Input
|
|
473 -----
|
|
474 Expression matrix
|
|
475 ^^^^^^^^^^^^^^^^^
|
|
476 ::
|
25
|
477
|
|
478 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
479 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
480 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
481 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
482 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
483 [...]
|
|
484
|
36
|
485 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.*
|
25
|
486
|
36
|
487 Design matrix
|
|
488 ^^^^^^^^^^^^^
|
|
489 ::
|
25
|
490
|
|
491 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
492 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
493 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
494 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
495 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
496 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
497 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
498 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
499 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
500 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
501 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
502 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
503 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
504
|
36
|
505 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.*
|
25
|
506
|
36
|
507 Contrast
|
|
508 ^^^^^^^^
|
|
509 The contrast represents the biological question. There can be many questions asked, e.g.:
|
25
|
510
|
36
|
511 - Tumor-Normal
|
|
512 - African-European
|
|
513 - 0.5*(Control+Placebo) - Treated
|
25
|
514
|
36
|
515 Installation
|
|
516 ------------
|
25
|
517
|
|
518 This tool requires no specific configurations. The following dependencies are installed automatically:
|
36
|
519
|
|
520 - R
|
|
521 - Bioconductor
|
25
|
522 - limma
|
36
|
523
|
25
|
524 - edgeR
|
|
525
|
36
|
526 License
|
|
527 -------
|
|
528 - R
|
|
529 - GPL-2 & GPL-3
|
|
530 - limma
|
|
531 - GPL (>=2)
|
|
532 - edgeR
|
|
533 - GPL (>=2)
|
|
534
|
|
535 References
|
|
536 ----------
|
|
537
|
|
538 EdgeR
|
|
539 ^^^^^
|
|
540 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.**
|
25
|
541
|
36
|
542 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140.
|
|
543
|
|
544 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
545 - http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
546 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
25
|
547
|
36
|
548 Test-data (MCF7)
|
|
549 ^^^^^^^^^^^^^^^^
|
|
550 **[2] RNA-seq differential expression studies: more sequence or more replication?**
|
|
551
|
|
552 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304.
|
|
553
|
|
554 - http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
555 - http://dx.doi.org/10.1093/bioinformatics/btt688
|
|
556
|
|
557 Contact
|
|
558 -------
|
25
|
559 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
560 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
561
|
36
|
562 I would like to thank Hina Riaz - Naz Khan for her helpful contribution.
|
25
|
563
|
36
|
564 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
565 http://testtoolshed.g2.bx.psu.edu/
|
25
|
566 </help>
|
|
567 </tool>
|