25
|
1 <?xml version="1.0" encoding="UTF-8"?>
|
|
2 <tool id="edger_dge" name="edgeR: Differential Gene(Expression) Analysis">
|
|
3 <description>RNA-Seq gene expression analysis using edgeR (R package)</description>
|
|
4
|
|
5 <requirements>
|
57
|
6 <requirement type="package" version="3.0.1">package_r3_withx</requirement>
|
29
|
7 <requirement type="package" version="latest">package_biocLite_edgeR_limma</requirement>
|
25
|
8 </requirements>
|
|
9
|
|
10 <command>
|
|
11 <!--
|
|
12 The following script is written in the "Cheetah" language:
|
|
13 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
|
|
14 -->
|
|
15
|
|
16 R --vanilla --slave -f $R_script '--args
|
|
17 $expression_matrix
|
|
18 $design_matrix
|
|
19 $contrast
|
|
20
|
|
21 $fdr
|
|
22
|
|
23 $output_count_edgeR
|
|
24 $output_cpm
|
|
25
|
|
26 /dev/null <!-- Calculation of FPKM/RPKM should come here -->
|
|
27
|
|
28 #if $output_raw_counts:
|
|
29 $output_raw_counts
|
|
30 #else:
|
|
31 /dev/null
|
|
32 #end if
|
|
33
|
|
34 #if $output_MDSplot:
|
|
35 $output_MDSplot
|
|
36 #else:
|
|
37 /dev/null
|
|
38 #end if
|
|
39
|
|
40 #if $output_BCVplot:
|
|
41 $output_BCVplot
|
|
42 #else:
|
|
43 /dev/null
|
|
44 #end if
|
|
45
|
|
46 #if $output_MAplot:
|
|
47 $output_MAplot
|
|
48 #else:
|
|
49 /dev/null
|
|
50 #end if
|
|
51
|
|
52 #if $output_PValue_distribution_plot:
|
|
53 $output_PValue_distribution_plot
|
|
54 #else:
|
|
55 /dev/null
|
|
56 #end if
|
|
57
|
|
58 #if $output_hierarchical_clustering_plot:
|
|
59 $output_hierarchical_clustering_plot
|
|
60 #else:
|
|
61 /dev/null
|
|
62 #end if
|
|
63
|
|
64 #if $output_heatmap_plot:
|
|
65 $output_heatmap_plot
|
|
66 #else:
|
|
67 /dev/null
|
|
68 #end if
|
|
69
|
|
70 #if $output_RData_obj:
|
|
71 $output_RData_obj
|
|
72 #else:
|
|
73 /dev/null
|
|
74 #end if
|
55
|
75
|
|
76 $output_format_images
|
|
77 '
|
25
|
78 #if $output_R:
|
|
79 > $output_R
|
|
80 #else:
|
|
81 > /dev/null
|
|
82 #end if
|
|
83
|
53
|
84 2> stderr.txt ;
|
|
85
|
|
86 grep -v 'Calculating library sizes from column' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
87
|
|
88 ## Locale error messages:
|
|
89 grep -v 'During startup - Warning messages' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
90 grep -v 'Setting LC_TIME failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
91 grep -v 'Setting LC_MONETARY failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
92 grep -v 'Setting LC_PAPER failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
93 grep -v 'Setting LC_MEASUREMENT failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
94 grep -v 'Setting LC_CTYPE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
95 grep -v 'Setting LC_COLLATE failed' stderr.txt > stderr2.txt ; rm stderr.txt ; mv stderr2.txt stderr.txt ;
|
|
96
|
|
97 cat stderr.txt >&2
|
25
|
98
|
|
99 </command>
|
|
100
|
|
101 <inputs>
|
|
102 <param name="expression_matrix" type="data" format="tabular" label="Expression (read count) matrix" />
|
|
103 <param name="design_matrix" type="data" format="tabular" label="Design matrix" help="Ensure your samplenames are identical to those in the expression matrix. Preferentially, create the contrast matrix using 'edgeR: Design- from Expression matrix'." />
|
|
104
|
|
105 <param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info: http://www.bioconductor.org/packages/release/bioc/html/limma.html and http://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf." />
|
|
106
|
|
107 <param name="fdr" type="float" min="0" max="1" value="0.05" label="False Discovery Rate (FDR)" />
|
|
108
|
|
109 <param name="outputs" type="select" label="Optional desired outputs" multiple="true" display="checkboxes">
|
|
110 <option value="make_output_raw_counts">Raw counts table</option>
|
|
111 <option value="make_output_MDSplot">MDS-plot</option>
|
|
112 <option value="make_output_BCVplot">BCV-plot</option>
|
|
113 <option value="make_output_MAplot">MA-plot</option>
|
|
114 <option value="make_output_PValue_distribution_plot">P-Value distribution plot</option>
|
|
115 <option value="make_output_hierarchical_clustering_plot">Hierarchical clustering</option>
|
|
116 <option value="make_output_heatmap_plot">Heatmap</option>
|
|
117
|
43
|
118 <option value="make_output_R_stdout">R stdout</option>
|
25
|
119 <option value="make_output_RData_obj">R Data object</option>
|
|
120 </param>
|
55
|
121
|
|
122 <param name="output_format_images" type="select" label="Output format of images" display="radio">
|
|
123 <option value="png">Portable network graphics (.png)</option>
|
|
124 <option value="pdf">Portable document format (.pdf)</option>
|
|
125 <option value="svg">Scalable vector graphics (.svg)</option>
|
|
126 </param>
|
25
|
127 </inputs>
|
|
128
|
|
129 <configfiles>
|
|
130 <configfile name="R_script">
|
|
131 library(limma,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
132 library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
133 library(splines,quietly=TRUE) ## enable quietly to avoid unnecessary stderr dumping
|
|
134
|
|
135 ## Fetch commandline arguments
|
|
136 args <- commandArgs(trailingOnly = TRUE)
|
|
137
|
|
138 expression_matrix_file = args[1]
|
|
139 design_matrix_file = args[2]
|
|
140 contrast = args[3]
|
|
141
|
|
142 fdr = as.numeric(args[4]) ## commandArgs() yields character strings; the FDR threshold must be numeric so comparisons against it are not done as string comparisons
|
|
143
|
|
144 output_count_edgeR = args[5]
|
|
145 output_cpm = args[6]
|
|
146
|
43
|
147 output_xpkm = args[7] ##FPKM file - yet to be implemented
|
25
|
148
|
|
149 output_raw_counts = args[8]
|
|
150 output_MDSplot = args[9]
|
|
151 output_BCVplot = args[10]
|
|
152 output_MAplot = args[11]
|
|
153 output_PValue_distribution_plot = args[12]
|
|
154 output_hierarchical_clustering_plot = args[13]
|
|
155 output_heatmap_plot = args[14]
|
|
156 output_RData_obj = args[15]
|
55
|
157 output_format_images = args[16]
|
25
|
158
|
|
159
|
|
160 library(edgeR)
|
|
161 ##raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
162 ## Obtain read-counts
|
|
163
|
|
164 expression_matrix <- read.delim(expression_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
165 design_matrix <- read.delim(design_matrix_file,header=T,stringsAsFactors=F,row.names=1,check.names=FALSE,na.strings=c(""))
|
|
166
|
|
167 colnames(design_matrix) <- make.names(colnames(design_matrix))
|
|
168
|
|
169 for(i in 1:ncol(design_matrix)) {
|
|
170 old = design_matrix[,i]
|
|
171 design_matrix[,i] = make.names(design_matrix[,i])
|
|
172 if(paste(design_matrix[,i],collapse="\t") != paste(old,collapse="\t")) {
|
|
173 print("Renaming of factors:")
|
|
174 print(old)
|
|
175 print("To:")
|
|
176 print(design_matrix[,i])
|
|
177 }
|
45
|
178 ## The following line seems to malfunction the script:
|
|
179 ##design_matrix[,i] <- as.factor(design_matrix[,i])
|
25
|
180 }
|
|
181
|
44
|
182 ## 1) In the expression matrix, you only want to have the samples described in the design matrix
|
25
|
183 columns <- match(rownames(design_matrix),colnames(expression_matrix))
|
43
|
184 columns <- columns[!is.na(columns)]
|
25
|
185 read_counts <- expression_matrix[,columns]
|
|
186
|
44
|
187 ## 2) In the design matrix, you only want to have samples of which you really have the counts
|
|
188 columns <- match(colnames(expression_matrix),rownames(design_matrix))
|
|
189 columns <- columns[!is.na(columns)]
|
|
190 design_matrix <- design_matrix[match(colnames(read_counts),rownames(design_matrix)),,drop=FALSE] ## keep only samples that have counts AND put the rows in the same order as the columns of read_counts, so every sample keeps its own factor levels
|
25
|
191
|
|
192 ## Filter for HTSeq predefined counts:
|
|
193 exclude_HTSeq <- c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
194 exclude_DEXSeq <- c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
195
|
44
|
196 exclude <- match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
197 exclude <- exclude[is.na(exclude)==0]
|
25
|
198 if(length(exclude) != 0) {
|
44
|
199 read_counts <- read_counts[-exclude,]
|
25
|
200 }
|
|
201
|
|
202
|
44
|
203 empty_samples <- apply(read_counts,2,function(x) sum(x) == 0)
|
25
|
204 if(sum(empty_samples) > 0) {
|
|
205 write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
|
|
206 write(colnames(read_counts)[empty_samples],stderr())
|
|
207 } else {
|
|
208
|
|
209 dge <- DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
210
|
|
211 formula <- paste(c("~0",make.names(colnames(design_matrix))),collapse = " + ")
|
|
212 design_matrix_tmp <- design_matrix
|
|
213 colnames(design_matrix_tmp) <- make.names(colnames(design_matrix_tmp))
|
|
214 design <- model.matrix(as.formula(formula),design_matrix_tmp)
|
|
215 rm(design_matrix_tmp)
|
|
216
|
|
217 # Filter prefixes
|
|
218 prefixes = colnames(design_matrix)[attr(design,"assign")]
|
|
219 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
220 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
221 replacements[avoid] = colnames(design)[avoid]
|
|
222 colnames(design) = replacements
|
|
223
|
|
224 # Do normalization
|
|
225 write("Calculating normalization factors...",stdout())
|
|
226 dge <- calcNormFactors(dge)
|
|
227 write("Estimating common dispersion...",stdout())
|
|
228 dge <- estimateGLMCommonDisp(dge,design)
|
|
229 write("Estimating trended dispersion...",stdout())
|
|
230 dge <- estimateGLMTrendedDisp(dge,design)
|
|
231 write("Estimating tagwise dispersion...",stdout())
|
|
232 dge <- estimateGLMTagwiseDisp(dge,design)
|
|
233
|
|
234
|
|
235 if(output_MDSplot != "/dev/null") {
|
|
236 write("Creating MDS plot",stdout())
|
|
237 ##points <- plotMDS(dge,method="bcv",labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
238 points <- plotMDS.DGEList(dge,labels=rep("",nrow(dge\$samples)))# Get coordinates of unflexible plot
|
|
239 dev.off()# Kill it
|
|
240
|
55
|
241 if(output_format_images == "pdf") {
|
|
242 pdf(output_MDSplot)
|
|
243 } else if(output_format_images == "svg") {
|
|
244 svg(output_MDSplot)
|
|
245 } else {
|
|
246 png(output_MDSplot)
|
|
247 }
|
|
248
|
25
|
249 diff_x <- abs(max(points\$x)-min(points\$x))
|
|
250 diff_y <-(max(points\$y)-min(points\$y))
|
|
251 plot(c(min(points\$x),max(points\$x) + 0.45 * diff_x), c(min(points\$y) - 0.05 * diff_y,max(points\$y) + 0.05 * diff_y), main="edgeR MDS Plot",type="n", xlab="BCV distance 1", ylab="BCV distance 2")
|
|
252 points(points\$x,points\$y,pch=20)
|
|
253 text(points\$x, points\$y,rownames(dge\$samples),cex=0.7,col="gray",pos=4)
|
|
254 rm(diff_x,diff_y)
|
|
255
|
|
256 dev.off()
|
|
257 }
|
|
258
|
|
259 if(output_BCVplot != "/dev/null") {
|
|
260 write("Creating Biological coefficient of variation plot",stdout())
|
60
|
261
|
|
262 if(output_format_images == "pdf") {
|
|
263 pdf(output_BCVplot)
|
|
264 } else if(output_format_images == "svg") {
|
|
265 svg(output_BCVplot)
|
|
266 } else {
|
|
267 png(output_BCVplot)
|
|
268 }
|
|
269
|
25
|
270 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
271 dev.off()
|
|
272 }
|
|
273
|
|
274
|
|
275 write("Fitting GLM...",stdout())
|
|
276 fit <- glmFit(dge,design)
|
|
277
|
|
278 write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
|
|
279 cont <- c(contrast)
|
|
280 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
281
|
|
282 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
283 write(paste("Exporting to file: ",output_count_edgeR,sep=""),stdout())
|
|
284 write.table(file=output_count_edgeR,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=TRUE,col.names=NA)
|
|
285 write.table(file=output_cpm,cpm(dge,normalized.lib.sizes=TRUE),sep="\t",row.names=TRUE,col.names=NA)
|
|
286
|
|
287 ## todo EXPORT FPKM
|
|
288 write.table(file=output_raw_counts,dge\$counts,sep="\t",row.names=TRUE,col.names=NA)
|
|
289
|
34
|
290 if(output_MAplot != "/dev/null" || output_PValue_distribution_plot != "/dev/null") {
|
25
|
291 etable <- topTags(lrt, n=nrow(dge))\$table
|
|
292 etable <- etable[order(etable\$FDR), ]
|
32
|
293
|
|
294 if(output_MAplot != "/dev/null") {
|
|
295 write("Creating MA plot...",stdout())
|
60
|
296
|
|
297 if(output_format_images == "pdf") {
|
|
298 pdf(output_MAplot)
|
|
299 } else if(output_format_images == "svg") {
|
|
300 svg(output_MAplot)
|
|
301 } else {
|
|
302 png(output_MAplot)
|
|
303 }
|
|
304
|
32
|
305 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
306 with(subset(etable, FDR < fdr), points(logCPM, logFC, pch=20, col="red"))
|
|
307 abline(h=c(-1,1), col="blue")
|
|
308 dev.off()
|
|
309 }
|
25
|
310
|
32
|
311 if(output_PValue_distribution_plot != "/dev/null") {
|
|
312 write("Creating P-value distribution plot...",stdout())
|
60
|
313
|
|
314 if(output_format_images == "pdf") {
|
|
315 pdf(output_PValue_distribution_plot)
|
|
316 } else if(output_format_images == "svg") {
|
|
317 svg(output_PValue_distribution_plot)
|
|
318 } else {
|
|
319 png(output_PValue_distribution_plot)
|
|
320 }
|
|
321
|
32
|
322 expressed_genes <- subset(etable, PValue < 0.99)
|
|
323 h <- hist(expressed_genes\$PValue,breaks=nrow(expressed_genes)/15,main="Binned P-Values (< 0.99)")
|
|
324 center <- sum(h\$counts) / length(h\$counts)
|
|
325 lines(c(0,1),c(center,center),lty=2,col="red",lwd=2)
|
|
326 k <- ksmooth(h\$mid, h\$counts)
|
|
327 lines(k\$x,k\$y,col="red",lwd=2)
|
|
328 rmsd <- (h\$counts) - center
|
|
329 rmsd <- rmsd^2
|
|
330 rmsd <- sum(rmsd)
|
|
331 rmsd <- sqrt(rmsd)
|
|
332 text(0,max(h\$counts),paste("e=",round(rmsd,2),sep=""),pos=4,col="blue")
|
|
333 ## change e into epsilon somehow
|
|
334 dev.off()
|
|
335 }
|
40
|
336 }
|
|
337
|
|
338 if(output_heatmap_plot != "/dev/null") {
|
60
|
339
|
|
340 if(output_format_images == "pdf") {
|
|
341 pdf(output_heatmap_plot,width=10.5)
|
|
342 } else if(output_format_images == "svg") {
|
|
343 svg(output_heatmap_plot,width=10.5)
|
|
344 } else {
|
|
345 png(output_heatmap_plot,width=10.5,height=7,units="in",res=72) ## png() measures in pixels by default; request inches so the size matches the pdf()/svg() branches
|
|
346 }
|
|
347
|
40
|
348 etable2 <- topTags(lrt, n=100)\$table
|
|
349 order <- rownames(etable2)
|
|
350 cpm_sub <- cpm(dge,normalized.lib.sizes=TRUE,log=TRUE)[if(all(order %in% rownames(dge\$counts))) order else as.numeric(order),,drop=FALSE] ## select the top genes by row name when topTags reports gene identifiers, otherwise fall back to positional row numbers
|
|
351 heatmap(t(cpm_sub))
|
|
352 dev.off()
|
25
|
353 }
|
|
354
|
|
355 ##output_hierarchical_clustering_plot = args[13]
|
|
356
|
35
|
357 if(output_RData_obj != "/dev/null") {
|
25
|
358 save.image(output_RData_obj)
|
|
359 }
|
|
360
|
|
361 write("Done!",stdout())
|
|
362 }
|
|
363 </configfile>
|
|
364 </configfiles>
|
|
365
|
|
366 <outputs>
|
53
|
367 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - differentially expressed genes" />
|
25
|
368 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
|
|
369
|
|
370 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts">
|
53
|
371 <filter>outputs and ("make_output_raw_counts" in outputs)</filter>
|
25
|
372 </data>
|
|
373
|
59
|
374 <data format="png" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
|
53
|
375 <filter>outputs and ("make_output_MDSplot" in outputs)</filter>
|
59
|
376
|
|
377 <change_format>
|
|
378 <when input="output_format_images" value="png" format="png" />
|
|
379 <when input="output_format_images" value="pdf" format="pdf" />
|
|
380 <when input="output_format_images" value="svg" format="svg" />
|
|
381 </change_format>
|
25
|
382 </data>
|
|
383
|
60
|
384 <data format="png" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
|
53
|
385 <filter>outputs and ("make_output_BCVplot" in outputs)</filter>
|
60
|
386
|
|
387 <change_format>
|
|
388 <when input="output_format_images" value="png" format="png" />
|
|
389 <when input="output_format_images" value="pdf" format="pdf" />
|
|
390 <when input="output_format_images" value="svg" format="svg" />
|
|
391 </change_format>
|
25
|
392 </data>
|
|
393
|
60
|
394 <data format="png" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
|
53
|
395 <filter>outputs and ("make_output_MAplot" in outputs)</filter>
|
60
|
396
|
|
397 <change_format>
|
|
398 <when input="output_format_images" value="png" format="png" />
|
|
399 <when input="output_format_images" value="pdf" format="pdf" />
|
|
400 <when input="output_format_images" value="svg" format="svg" />
|
|
401 </change_format>
|
25
|
402 </data>
|
|
403
|
60
|
404 <data format="png" name="output_PValue_distribution_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - P-Value distribution">
|
53
|
405 <filter>outputs and ("make_output_PValue_distribution_plot" in outputs)</filter>
|
60
|
406
|
|
407 <change_format>
|
|
408 <when input="output_format_images" value="png" format="png" />
|
|
409 <when input="output_format_images" value="pdf" format="pdf" />
|
|
410 <when input="output_format_images" value="svg" format="svg" />
|
|
411 </change_format>
|
25
|
412 </data>
|
|
413
|
60
|
414 <data format="png" name="output_hierarchical_clustering_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Hierarchical clustering">
|
53
|
415 <filter>outputs and ("make_output_hierarchical_clustering_plot" in outputs)</filter>
|
60
|
416
|
|
417 <change_format>
|
|
418 <when input="output_format_images" value="png" format="png" />
|
|
419 <when input="output_format_images" value="pdf" format="pdf" />
|
|
420 <when input="output_format_images" value="svg" format="svg" />
|
|
421 </change_format>
|
25
|
422 </data>
|
|
423
|
60
|
424 <data format="png" name="output_heatmap_plot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - Heatmap">
|
53
|
425 <filter>outputs and ("make_output_heatmap_plot" in outputs)</filter>
|
60
|
426
|
|
427 <change_format>
|
|
428 <when input="output_format_images" value="png" format="png" />
|
|
429 <when input="output_format_images" value="pdf" format="pdf" />
|
|
430 <when input="output_format_images" value="svg" format="svg" />
|
|
431 </change_format>
|
25
|
432 </data>
|
|
433
|
|
434 <data format="RData" name="output_RData_obj" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R data object">
|
53
|
435 <filter>outputs and ("make_output_RData_obj" in outputs)</filter>
|
25
|
436 </data>
|
|
437
|
40
|
438 <data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output (debug)" >
|
53
|
439 <filter>outputs and ("make_output_R_stdout" in outputs)</filter>
|
25
|
440 </data>
|
|
441 </outputs>
|
|
442
|
|
443 <help>
|
|
444 edgeR: Differential Gene(Expression) Analysis
|
36
|
445 #############################################
|
25
|
446
|
36
|
447 Overview
|
|
448 --------
|
|
449 Differential expression analysis of RNA-seq and digital gene expression profiles with biological replication. Uses empirical Bayes estimation and exact tests based on the negative binomial distribution. Also useful for differential signal analysis with other types of genome-scale count data [1].
|
25
|
450
|
|
451 For every experiment, the algorithm requires a design matrix. This matrix describes which samples belong to which groups.
|
36
|
452 More details on this are given in the edgeR manual: http://www.bioconductor.org/packages/2.12/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf
|
25
|
453 and the limma manual.
|
|
454
|
|
455 Because the creation of a design matrix can be complex and time consuming, especially if no GUI is used, this package comes with an alternative tool which can help you with it.
|
|
456 This tool is called *edgeR Design Matrix Creator*.
|
|
457 If the appropriate design matrix (with corresponding links to the files) is given,
|
|
458 the correct contrast ( http://en.wikipedia.org/wiki/Contrast_(statistics) ) has to be given.
|
|
459
|
|
460 If you have for example two groups, with an equal weight, you would like to compare either
|
|
461 "g1-g2" or "normal-cancer".
|
|
462
|
36
|
463 The test function makes use of an MCF7 dataset used in a study that indicates that a higher sequencing depth is not necessarily more important than a higher number of replicates [2].
|
25
|
464
|
36
|
465 Input
|
|
466 -----
|
|
467 Expression matrix
|
|
468 ^^^^^^^^^^^^^^^^^
|
|
469 ::
|
25
|
470
|
|
471 Geneid "\t" Sample-1 "\t" Sample-2 "\t" Sample-3 "\t" Sample-4 [...] "\n"
|
|
472 SMURF "\t" 123 "\t" 21 "\t" 34545 "\t" 98 ... "\n"
|
|
473 BRCA1 "\t" 435 "\t" 6655 "\t" 45 "\t" 55 ... "\n"
|
|
474 LINK33 "\t" 4 "\t" 645 "\t" 345 "\t" 1 ... "\n"
|
|
475 SNORD78 "\t" 498 "\t" 65 "\t" 98 "\t" 27 ... "\n"
|
|
476 [...]
|
|
477
|
36
|
478 *Note: Make sure the number of columns in the header is identical to the number of columns in the body.*
|
25
|
479
|
36
|
480 Design matrix
|
|
481 ^^^^^^^^^^^^^
|
|
482 ::
|
25
|
483
|
|
484 Sample "\t" Condition "\t" Ethnicity "\t" Patient "\t" Batch "\n"
|
|
485 Sample-1 "\t" Tumor "\t" European "\t" 1 "\t" 1 "\n"
|
|
486 Sample-2 "\t" Normal "\t" European "\t" 1 "\t" 1 "\n"
|
|
487 Sample-3 "\t" Tumor "\t" European "\t" 2 "\t" 1 "\n"
|
|
488 Sample-4 "\t" Normal "\t" European "\t" 2 "\t" 1 "\n"
|
|
489 Sample-5 "\t" Tumor "\t" African "\t" 3 "\t" 1 "\n"
|
|
490 Sample-6 "\t" Normal "\t" African "\t" 3 "\t" 1 "\n"
|
|
491 Sample-7 "\t" Tumor "\t" African "\t" 4 "\t" 2 "\n"
|
|
492 Sample-8 "\t" Normal "\t" African "\t" 4 "\t" 2 "\n"
|
|
493 Sample-9 "\t" Tumor "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
494 Sample-10 "\t" Normal "\t" Asian "\t" 5 "\t" 2 "\n"
|
|
495 Sample-11 "\t" Tumor "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
496 Sample-12 "\t" Normal "\t" Asian "\t" 6 "\t" 2 "\n"
|
|
497
|
36
|
498 *Note: Avoid factor names that (1) are numerical or (2) contain mathematical symbols; preferably use only letters.*
|
25
|
499
|
36
|
500 Contrast
|
|
501 ^^^^^^^^
|
|
502 The contrast represents the biological question. There can be many questions asked, e.g.:
|
25
|
503
|
36
|
504 - Tumor-Normal
|
|
505 - African-European
|
|
506 - 0.5*(Control+Placebo) / Treated
|
25
|
507
|
36
|
508 Installation
|
|
509 ------------
|
25
|
510
|
|
511 This tool requires no specific configurations. The following dependencies are installed automatically:
|
36
|
512
|
|
513 - R
|
|
514 - Bioconductor
|
25
|
515 - limma
|
36
|
516
|
25
|
517 - edgeR
|
|
518
|
36
|
519 License
|
|
520 -------
|
|
521 - R
|
|
522 - GPL-2 & GPL-3
|
|
523 - limma
|
|
524 - GPL (>=2)
|
|
525 - edgeR
|
|
526 - GPL (>=2)
|
|
527
|
|
528 References
|
|
529 ----------
|
|
530
|
|
531 EdgeR
|
|
532 ^^^^^
|
|
533 **[1] edgeR: a Bioconductor package for differential expression analysis of digital gene expression data.**
|
25
|
534
|
36
|
535 *Mark D. Robinson, Davis J. McCarthy and Gordon K. Smyth* - Bioinformatics (2010) 26 (1): 139-140.
|
|
536
|
|
537 - http://www.bioconductor.org/packages/2.12/bioc/html/edgeR.html
|
|
538 - http://dx.doi.org/10.1093/bioinformatics/btp616
|
|
539 - http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
|
25
|
540
|
36
|
541 Test-data (MCF7)
|
|
542 ^^^^^^^^^^^^^^^^
|
|
543 **[2] RNA-seq differential expression studies: more sequence or more replication?**
|
|
544
|
|
545 *Yuwen Liu, Jie Zhou and Kevin P. White* - Bioinformatics (2014) 30 (3): 301-304.
|
|
546
|
|
547 - http://www.ncbi.nlm.nih.gov/pubmed/24319002
|
|
548 - http://dx.doi.org/10.1093/bioinformatics/btt688
|
|
549
|
|
550 Contact
|
|
551 -------
|
25
|
552 The tool wrapper has been written by Youri Hoogstrate from the Erasmus Medical Center (Rotterdam, Netherlands) on behalf of the Translational Research IT (TraIT) project:
|
|
553 http://www.ctmm.nl/en/programmas/infrastructuren/traitprojecttranslationeleresearch
|
|
554
|
36
|
555 I would like to thank Hina Riaz - Naz Khan for her helpful contribution.
|
25
|
556
|
36
|
557 More tools by the Translational Research IT (TraIT) project can be found in the following repository:
|
|
558 http://testtoolshed.g2.bx.psu.edu/
|
25
|
559 </help>
|
|
560 </tool>
|