25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
62
|
6 <!--<requirement type="package" version="3.0.1">package_r3_withx</requirement>-->
|
|
7 <requirement type="package" version="3.0.2">R_3_0_2</requirement>
|
29
|
8 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
25
|
9 </requirements>
|
|
10
|
|
11 <command>
|
|
12 <!--
|
|
13 The following script is written in the "Cheetah" language:
|
|
14 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
15 -->
|
|
16
|
|
17 R --vanilla --slave -f $R_script '--args
|
|
18 $expression_matrix
|
|
19 $design_matrix
|
|
20 $contrast
|
|
21
|
|
22 $fdr
|
|
23
|
|
24 $output_count_edgeR
|
|
25 $output_cpm
|
|
26
|
|
27 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
28
|
|
29 #if $output_raw_counts:
|
|
30 $output_raw_counts
|
|
31 #else:
|
|
32 /dev/null
|
|
33 #end if
|
|
34
|
|
35 #if $output_MDSplot:
|
|
36 $output_MDSplot
|
|
37 #else:
|
|
38 /dev/null
|
|
39 #end if
|
|
40
|
|
41 #if $output_BCVplot:
|
|
42 $output_BCVplot
|
|
43 #else:
|
|
44 /dev/null
|
|
45 #end if
|
|
46
|
|
47 #if $output_MAplot:
|
|
48 $output_MAplot
|
|
49 #else:
|
|
50 /dev/null
|
|
51 #end if
|
|
52
|
|
53 #if $output_PValue_distribution_plot:
|
|
54 $output_PValue_distribution_plot
|
|
55 #else:
|
|
56 /dev/null
|
|
57 #end if
|
|
58
|
|
59 #if $output_hierarchical_clustering_plot:
|
|
60 $output_hierarchical_clustering_plot
|
|
61 #else:
|
|
62 /dev/null
|
|
63 #end if
|
|
64
|
|
65 #if $output_heatmap_plot:
|
|
66 $output_heatmap_plot
|
|
67 #else:
|
|
68 /dev/null
|
|
69 #end if
|
|
70
|
|
71 #if $output_RData_obj:
|
|
72 $output_RData_obj
|
|
73 #else:
|
|
74 /dev/null
|
|
75 #end if
|
55
|
76
|
|
77 $output_format_images
|
|
78 '
|
25
|
79 #if $output_R:
|
|
80 > $output_R
|
|
81 #else:
|
|
82 > /dev/null
|
|
83 #end if
|
|
84
|
53
|
85 2> stderr.txt ;
|
|
86
|
|
87 grep -v 'Calculating library sizes from column' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
88
|
|
89 ## Locale error messages:
|
|
90 grep -v 'During startup - Warning messages' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
91 grep -v 'Setting LC_TIME failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
92 grep -v 'Setting LC_MONETARY failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
93 grep -v 'Setting LC_PAPER failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
94 grep -v 'Setting LC_MEASUREMENT failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
95 grep -v 'Setting LC_CTYPE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
96 grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
97
|
|
98 cat stderr.txt >&2
|
25
|
99
|
|
100 </command>
|
|
101
|
|
102 <inputs>
|
|
103 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
104 <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />
|
|
105
|
|
106 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
107
|
|
108 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
109
|
|
110 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
111 <option value="make_output_raw_counts">Raw counts table</option>
|
|
112 <option value="make_output_MDSplot">MDS-plot</option>
|
|
113 <option value="make_output_BCVplot">BCV-plot</option>
|
|
114 <option value="make_output_MAplot">MA-plot</option>
|
|
115 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
116 <option value="make_output_hierarchical_clustering_plot">Hierarchical clustering</option>
|
|
117 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
118
|
43
|
119 <option value="make_output_R_stdout">R stdout</option>
|
25
|
120 <option value="make_output_RData_obj">R Data object</option>
|
|
121 </param>
|
55
|
122
|
|
123 <param name="output_format_images" type="select" label="Output format of images" display="radio">
|
|
124 <option value="png">Portable network graphics (.png)</option>
|
|
125 <option value="pdf">Portable document format (.pdf)</option>
|
|
126 <option value="svg">Scalable vector graphics (.svg)</option>
|
|
127 </param>
|
25
|
128 </inputs>
|
|
129
|
|
130 <configfiles>
|
|
131 <configfile name="R_script">
|
|
132 library(limma,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
133 library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
134 library(splines,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
135
|
|
136 ## Fetch commandline arguments
|
|
137 args <- commandArgs(trailingOnly = TRUE)
|
|
138
|
|
139 expression_matrix_file = args[1]
|
|
140 design_matrix_file = args[2]
|
|
141 contrast = args[3]
|
|
142
|
|
143 fdr = as.numeric(args[4]) ## commandArgs() yields character strings; convert so the FDR threshold is compared numerically instead of as text
|
|
144
|
|
145 output_count_edgeR = args[5]
|
|
146 output_cpm = args[6]
|
|
147
|
43
|
148 output_xpkm = args[7] ##FPKM file - yet to be implemented
|
25
|
149
|
|
150 output_raw_counts = args[8]
|
|
151 output_MDSplot = args[9]
|
|
152 output_BCVplot = args[10]
|
|
153 output_MAplot = args[11]
|
|
154 output_PValue_distribution_plot = args[12]
|
|
155 output_hierarchical_clustering_plot = args[13]
|
|
156 output_heatmap_plot = args[14]
|
|
157 output_RData_obj = args[15]
|
55
|
158 output_format_images = args[16]
|
25
|
159
|
|
160
|
|
161 library(edgeR)
|
|
162 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
163 ## Obtain read-counts
|
|
164
|
|
165 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
166 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
167
|
|
168 colnames(design_matrix) <- make.names(colnames(design_matrix))
|
|
169
|
|
170 for(i in 1:ncol(design_matrix)) {
|
|
171 old = design_matrix[,i]
|
|
172 design_matrix[,i] = make.names(design_matrix[,i])
|
|
173 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
|
|
174 print("Renaming of factors:")
|
|
175 print(old)
|
|
176 print("To:")
|
|
177 print(design_matrix[,i])
|
|
178 }
|
45
|
179 ## The following line seems to malfunction the script:
|
|
180 ##design_matrix[,i] <- as.factor(design_matrix[,i])
|
25
|
181 }
|
|
182
|
44
|
183 ## 1) In the expression matrix, you only want to have the samples described in the design matrix
|
25
|
184 columns <- match(rownames(design_matrix),colnames(expression_matrix))
|
43
|
185 columns <- columns[!is.na(columns)]
|
25
|
186 read_counts <- expression_matrix[,columns]
|
|
187
|
44
|
188 ## 2) In the design matrix, you only want to have samples of which you really have the counts
|
|
189 columns <- match(colnames(read_counts),rownames(design_matrix)) ## follow the column order of read_counts so that design rows and count columns refer to the same samples
|
|
190 columns <- columns[!is.na(columns)]
|
|
191 design_matrix <- design_matrix[columns,,drop=FALSE]
|
25
|
192
|
|
193 ## Filter for HTSeq predefined counts:
|
|
194 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
195 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
196
|
44
|
197 exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
198 exclude <- exclude[is.na(exclude)==0]
|
25
|
199 if(length(exclude) != 0) {
|
44
|
200 read_counts <- read_counts[-exclude,]
|
25
|
201 }
|
|
202
|
|
203
|
44
|
204 empty_samples <- apply(read_counts,2,function(x) sum(x) == 0)
|
25
|
205 if(sum(empty_samples) > 0) {
|
|
206 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
|
|
207 write(colnames(read_counts)[empty_samples],stderr())
|
|
208 } else {
|
|
209
|
|
210 dge <- DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
211
|
|
212 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
|
|
213 design_matrix_tmp <- design_matrix
|
|
214 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
|
|
215 design <- model.matrix(as.formula(formula),design_matrix_tmp)
|
|
216 rm(design_matrix_tmp)
|
|
217
|
|
218 # Filter prefixes
|
|
219 prefixes = colnames(design_matrix)[attr(design,"assign")]
|
|
220 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
221 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
222 replacements[avoid] = colnames(design)[avoid]
|
|
223 colnames(design) = replacements
|
|
224
|
|
225 # Do normalization
|
|
226 write("Calculating normalization factors...",stdout())
|
|
227 dge <- calcNormFactors(dge)
|
|
228 write("Estimating common dispersion...",stdout())
|
|
229 dge <- estimateGLMCommonDisp(dge,design)
|
|
230 write("Estimating trended dispersion...",stdout())
|
|
231 dge <- estimateGLMTrendedDisp(dge,design)
|
|
232 write("Estimating tagwise dispersion...",stdout())
|
|
233 dge <- estimateGLMTagwiseDisp(dge,design)
|
|
234
|
|
235
|
|
236 if(output_MDSplot != "/dev/null") {
|
|
237 write("Creating MDS plot",stdout())
|
|
238 ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
239 points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
240 dev.off()# Kill it
|
|
241
|
55
|
242 if(output_format_images == "pdf") {
|
|
243 pdf(output_MDSplot)
|
|
244 } else if(output_format_images == "svg") {
|
|
245 svg(output_MDSplot)
|
|
246 } else {
|
|
247 png(output_MDSplot)
|
|
248 }
|
|
249
|
25
|
250 diff_x <- abs(max(points\$x)-min(points\$x))
|
|
251 diff_y <-(max(points\$y)-min(points\$y))
|
|
252 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
|
|
253 points(points\$x,points\$y,pch=20)
|
|
254 text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
|
|
255 rm(diff_x,diff_y)
|
|
256
|
|
257 dev.off()
|
|
258 }
|
|
259
|
|
260 if(output_BCVplot != "/dev/null") {
|
|
261 write("Creating Biological coefficient of variation plot",stdout())
|
60
|
262
|
|
263 if(output_format_images == "pdf") {
|
|
264 pdf(output_BCVplot)
|
|
265 } else if(output_format_images == "svg") {
|
|
266 svg(output_BCVplot)
|
|
267 } else {
|
|
268 png(output_BCVplot)
|
|
269 }
|
|
270
|
25
|
271 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
272 dev.off()
|
|
273 }
|
|
274
|
|
275
|
|
276 write("Fitting GLM...",stdout())
|
|
277 fit <- glmFit(dge,design)
|
|
278
|
|
279 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
|
|
280 cont <- c(contrast)
|
|
281 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
282
|
|
283 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
284 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
|
|
285 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
|
|
286 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)
|
|
287
|
|
288 ## todo EXPORT FPKM
|
|
289 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)
|
|
290
|
34
|
291 if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
|
25
|
292 etable <- topTags(lrt, n=nrow(dge))\$table
|
|
293 etable <- etable[order(etable\$FDR), ]
|
32
|
294
|
|
295 if(output_MAplot != "/dev/null") {
|
|
296 write("Creating MA plot...",stdout())
|
60
|
297
|
|
298 if(output_format_images == "pdf") {
|
|
299 pdf(output_MAplot)
|
|
300 } else if(output_format_images == "svg") {
|
|
301 svg(output_MAplot)
|
|
302 } else {
|
|
303 png(output_MAplot)
|
|
304 }
|
|
305
|
32
|
306 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
307 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
|
|
308 abline(h=c(-1,1), col="blue")
|
|
309 dev.off()
|
|
310 }
|
25
|
311
|
32
|
312 if(output_PValue_distribution_plot != "/dev/null") {
|
|
313 write("Creating P-value distribution plot...",stdout())
|
60
|
314
|
|
315 if(output_format_images == "pdf") {
|
|
316 pdf(output_PValue_distribution_plot)
|
|
317 } else if(output_format_images == "svg") {
|
|
318 svg(output_PValue_distribution_plot)
|
|
319 } else {
|
|
320 png(output_PValue_distribution_plot)
|
|
321 }
|
|
322
|
32
|
323 expressed_genes <- subset(etable, PValue < 0.99)
|
|
324 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
|
|
325 center <- sum(h\$counts) / length(h\$counts)
|
|
326 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
|
|
327 k <- ksmooth(h\$mid, h\$counts)
|
|
328 lines(k\$x,k\$y,col="red",lwd=2)
|
|
329 rmsd <- (h\$counts) - center
|
|
330 rmsd <- rmsd^2
|
|
331 rmsd <- sum(rmsd)
|
|
332 rmsd <- sqrt(rmsd)
|
|
333 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
|
|
334 ## change e into epsilon somehow
|
|
335 dev.off()
|
|
336 }
|
40
|
337 }
|
|
338
|
|
339 if(output_heatmap_plot != "/dev/null") {
|
60
|
340
|
|
341 if(output_format_images == "pdf") {
|
|
342 pdf(output_heatmap_plot,width=10.5)
|
|
343 } else if(output_format_images == "svg") {
|
|
344 svg(output_heatmap_plot,width=10.5)
|
|
345 } else {
|
|
346 png(output_heatmap_plot,width=10.5,height=7,units="in",res=72) ## png() sizes default to pixels; give inches explicitly to match the pdf()/svg() branches
|
|
347 }
|
|
348
|
40
|
349 etable2 <- topTags(lrt, n=100)\$table
|
|
350 order <- rownames(etable2)
|
|
351 cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[as.numeric(order),] ## NOTE(review): assumes the row names of the topTags table are numeric row indices; gene-name row names would become NA here - confirm against the installed edgeR version
|
|
352 heatmap(t(cpm_sub))
|
|
353 dev.off()
|
25
|
354 }
|
|
355
|
|
356 ##output_hierarchical_clustering_plot = args[13]
|
|
357
|
35
|
358 if(output_RData_obj != "/dev/null") {
|
25
|
359 save.image(output_RData_obj)
|
|
360 }
|
|
361
|
|
362 write("Done!",stdout())
|
|
363 }
|
|
364 </configfile>
|
|
365 </configfiles>
|
|
366
|
|
367 <outputs>
|
53
|
368 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" />
|
25
|
369 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
370
|
|
371 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
53
|
372 <filter>outputs and ("make_output_raw_counts" in outputs)</filter>
|
25
|
373 </data>
|
|
374
|
59
|
375 <data format="png" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
|
53
|
376 <filter>outputs and ("make_output_MDSplot" in outputs)</filter>
|
59
|
377
|
|
378 <change_format>
|
|
379 <when input="output_format_images" value="png" format="png" />
|
|
380 <when input="output_format_images" value="pdf" format="pdf" />
|
|
381 <when input="output_format_images" value="svg" format="svg" />
|
|
382 </change_format>
|
25
|
383 </data>
|
|
384
|
60
|
385 <data format="png" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
53
|
386 <filter>outputs and ("make_output_BCVplot" in outputs)</filter>
|
60
|
387
|
|
388 <change_format>
|
|
389 <when input="output_format_images" value="png" format="png" />
|
|
390 <when input="output_format_images" value="pdf" format="pdf" />
|
|
391 <when input="output_format_images" value="svg" format="svg" />
|
|
392 </change_format>
|
25
|
393 </data>
|
|
394
|
60
|
395 <data format="png" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
53
|
396 <filter>outputs and ("make_output_MAplot" in outputs)</filter>
|
60
|
397
|
|
398 <change_format>
|
|
399 <when input="output_format_images" value="png" format="png" />
|
|
400 <when input="output_format_images" value="pdf" format="pdf" />
|
|
401 <when input="output_format_images" value="svg" format="svg" />
|
|
402 </change_format>
|
25
|
403 </data>
|
|
404
|
60
|
405 <data format="png" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
53
|
406 <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter>
|
60
|
407
|
|
408 <change_format>
|
|
409 <when input="output_format_images" value="png" format="png" />
|
|
410 <when input="output_format_images" value="pdf" format="pdf" />
|
|
411 <when input="output_format_images" value="svg" format="svg" />
|
|
412 </change_format>
|
25
|
413 </data>
|
|
414
|
60
|
415 <data format="png" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical clustering">
|
53
|
416 <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
60
|
417
|
|
418 <change_format>
|
|
419 <when input="output_format_images" value="png" format="png" />
|
|
420 <when input="output_format_images" value="pdf" format="pdf" />
|
|
421 <when input="output_format_images" value="svg" format="svg" />
|
|
422 </change_format>
|
25
|
423 </data>
|
|
424
|
60
|
425 <data format="png" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
53
|
426 <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter>
|
60
|
427
|
|
428 <change_format>
|
|
429 <when input="output_format_images" value="png" format="png" />
|
|
430 <when input="output_format_images" value="pdf" format="pdf" />
|
|
431 <when input="output_format_images" value="svg" format="svg" />
|
|
432 </change_format>
|
25
|
433 </data>
|
|
434
|
|
435 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
53
|
436 <filter>outputs and ("make_output_RData_obj" in outputs)</filter>
|
25
|
437 </data>
|
|
438
|
40
|
439 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)" >
|
53
|
440 <filter>outputs and ("make_output_R_stdout" in outputs)</filter>
|
25
|
441 </data>
|
|
442 </outputs>
|
|
443
|
|
444 <help>
|
|
445 edgeR: Differential Gene(Expression) Analysis
|
36
|
446 #############################################
|
25
|
447
|
36
|
448 Overview
|
|
449 --------
|
|
450 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1].
|
25
|
451
|
|
452 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
36
|
453 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
25
|
454 and the limma manual.
|
|
455
|
|
456 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
457 This tool is called *edgeR Design Matrix Creator*.
|
|
458 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
459 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
460
|
|
461 If you have for example two groups, with an equal weight, you would like to compare either
|
|
462 "g1-g2" or "normal-cancer".
|
|
463
|
36
|
464 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates [2].
|
25
|
465
|
36
|
466 Input
|
|
467 -----
|
|
468 Expression matrix
|
|
469 ^^^^^^^^^^^^^^^^^
|
|
470 ::
|
25
|
471
|
|
472 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
473 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
474 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
475 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
476 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
477 [...]
|
|
478
|
36
|
479 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.*
|
25
|
480
|
36
|
481 Design matrix
|
|
482 ^^^^^^^^^^^^^
|
|
483 ::
|
25
|
484
|
|
485 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
486 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
487 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
488 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
489 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
490 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
491 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
492 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
493 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
494 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
495 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
496 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
497 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
498
|
36
|
499 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.*
|
25
|
500
|
36
|
501 Contrast
|
|
502 ^^^^^^^^
|
|
503 The contrast represents the biological question. There can be many questions asked, e.g.:
|
25
|
504
|
36
|
505 - Tumor-Normal
|
|
506 - African-European
|
|
507 - 0.5*(Control+Placebo) / Treated
|
25
|
508
|
36
|
509 Installation
|
|
510 ------------
|
25
|
511
|
|
512 This tool requires no specific configurations. The following dependencies are installed automatically:
|
36
|
513
|
|
514 - R
|
|
515 - Bioconductor
|
25
|
516 - limma
|
36
|
517
|
25
|
518 - edgeR
|
|
519
|
36
|
520 License
|
|
521 -------
|
|
522 - R
|
|
523 - GPL-2 & GPL-3
|
|
524 - limma
|
|
525 - GPL (>=2)
|
|
526 - edgeR
|
|
527 - GPL (>=2)
|
|
528
|
|
529 References
|
|
530 ----------
|
|
531
|
|
532 EdgeR
|
|
533 ^^^^^
|
|
534 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.**
|
25
|
535
|
36
|
536 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140.
|
|
537
|
|
538 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
539 - http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
540 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
25
|
541
|
36
|
542 Test-data (MCF7)
|
|
543 ^^^^^^^^^^^^^^^^
|
|
544 **[2] RNA-seq differential expression studies: more sequence or more replication?**
|
|
545
|
|
546 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304.
|
|
547
|
|
548 - http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
549 - http://dx.doi.org/10.1093/bioinformatics/btt688
|
|
550
|
|
551 Contact
|
|
552 -------
|
25
|
553 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
554 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
555
|
36
|
556 I would like to thank Hina Riaz - Naz Khan for her helpful contribution.
|
25
|
557
|
36
|
558 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
559 http://testtoolshed.g2.bx.psu.edu/
|
25
|
560 </help>
|
|
561 </tool>
|