<?xml version="1.0" encoding="UTF-8"?>
<tool id="edger_dge" name="edgeR Differential GeneExpression Analysis">
	<description>RNA-Seq expression analysis using edgeR (R package)</description>

	<command>
		<!--
			The following script is written in the "Cheetah" language:
			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html

			The whitespace-separated words inside the quoted args string are read
			positionally by the R script below (args[1] to args[11]), so their order
			must match the script. "output_FPXM" and "smearPlot" are literal
			placeholder words: no output with those names is declared in this tool.
		-->
		R CMD BATCH --vanilla --slave '--args
			$design_matrix
			$contrast

			$output_count_edgeR
			$output_cpm
			output_FPXM
			$output_raw_counts

			$qc
			$output_MDSplot
			$output_BCVplot
			$output_MAplot
			smearPlot '
		$R_script $output_R
	</command>

	<inputs>
		<param name="design_matrix" type="data" format="tabular" help="Design matrix" />

		<param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info." />

		<param name="qc" type="select" label="Quality control reports">
			<option value="true">Yes</option>
			<option value="false" selected="true">No</option>
		</param>

		<param name="debug" type="select" label="R Debug output">
			<option value="true" selected="true"> Yes</option>
			<option value="false">No</option>
		</param>
	</inputs>

	<configfiles>
		<!--
			R script executed by the command above. It is a Cheetah template, so:
			  * a literal dollar sign in R code must be written with a leading backslash;
			  * R comments use a double hash, which Cheetah strips from the generated script.
			The body is wrapped in CDATA so that R operators containing "<" are valid XML text.
		-->
		<configfile name="R_script"><![CDATA[
library(edgeR)

## Fetch commandline arguments (positional; order is fixed by the command section)
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 7) {
  stop("Expected at least 7 command-line arguments, got ", length(args))
}

designmatrix <- args[1]
contrast     <- args[2]

output_1 <- args[3]   ## table of test results
output_2 <- args[4]   ## CPM table
output_3 <- args[5]   ## FPKM file - to be implemented
output_4 <- args[6]   ## raw counts

## The qc select passes the literal word "true" or "false". Testing for a
## non-empty string (as was done before) is TRUE for both words, so compare
## against "true" explicitly.
QC <- identical(args[7], "true")

output_5 <- args[8]   ## MDS plot (pdf)
output_6 <- args[9]   ## BCV plot (pdf)
output_7 <- args[10]  ## MA plot (pdf)

output_8 <- args[11]  ## placeholder, currently unused


## Read one tab-delimited count file, using its first column as row names.
## The file is treated as having a header line when the first data field of
## its first line is read as text rather than as a number.
read_count_file <- function(path) {
  first_row <- read.delim(path, header = FALSE, stringsAsFactors = FALSE,
                          row.names = 1, nrows = 1)
  has_header <- is.character(first_row[1, 1])
  read.delim(path, header = has_header, stringsAsFactors = FALSE, row.names = 1)
}


## Design matrix layout: column 1 = count file path, column 2 = sample name,
## columns 3 and up = the variables used to build the model.
raw_data <- read.delim(designmatrix, header = TRUE, stringsAsFactors = TRUE)
if (nrow(raw_data) < 1 || ncol(raw_data) < 3) {
  stop("The design matrix needs at least one sample row and three columns ",
       "(count file, sample name, and one or more factors)")
}
count_files  <- as.character(raw_data[, 1])
sample_names <- as.character(raw_data[, 2])

## Obtain read-counts: read every file first, then bind the columns once.
count_tables <- vector("list", length(count_files))
for (i in seq_along(count_files)) {
  print(paste0("Parsing counts from: ", count_files[i]))
  count_tables[[i]] <- read_count_file(count_files[i])

  ## Columns are bound by position, so every file must list the same
  ## features in the same order; otherwise counts would be misaligned.
  if (i > 1 && !identical(rownames(count_tables[[i]]), rownames(count_tables[[1]]))) {
    stop("Feature identifiers in '", count_files[i],
         "' differ from those in '", count_files[1], "'")
  }
}
read_counts <- do.call(cbind, count_tables)

## Filter for HTSeq / DEXSeq predefined counters, which are not real features:
exclude_HTSeq  <- c("no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique")
exclude_DEXSeq <- c("_ambiguous", "_empty", "_lowaqual", "_notaligned")

is_counter  <- rownames(read_counts) %in% c(exclude_HTSeq, exclude_DEXSeq)
read_counts <- read_counts[!is_counter, , drop = FALSE]

colnames(read_counts) <- sample_names
dge <- DGEList(counts = read_counts, genes = rownames(read_counts))

## Build a no-intercept additive model from all remaining design columns.
design_tmp <- raw_data[-(1:2)]
rownames(design_tmp) <- colnames(dge)
design_formula <- paste(c("~0", colnames(design_tmp)), collapse = " + ")
design <- model.matrix(as.formula(design_formula), design_tmp)

## model.matrix names its columns variable name + level. Strip the variable
## name so that the contrast can be written with the bare level names. Columns
## whose name is exactly the variable name have nothing to strip and are kept.
prefixes <- colnames(design_tmp)[attr(design, "assign")]
avoid <- nchar(prefixes) == nchar(colnames(design))
replacements <- substr(colnames(design), nchar(prefixes) + 1, nchar(colnames(design)))
replacements[avoid] <- colnames(design)[avoid]
colnames(design) <- replacements


print("Calculating normalization factors...")
dge <- calcNormFactors(dge)
print("Estimating common dispersion...")
dge <- estimateGLMCommonDisp(dge, design)
print("Estimating trended dispersion...")
dge <- estimateGLMTrendedDisp(dge, design)
print("Estimating tagwise dispersion...")
dge <- estimateGLMTagwiseDisp(dge, design)


if (QC) {
  print("Creating QC plots...")
  ## MDS Plot
  pdf(output_5)
  plotMDS(dge, main = "edgeR MDS Plot")
  dev.off()
  ## Biological coefficient of variation plot
  pdf(output_6)
  plotBCV(dge, cex = 0.4, main = "edgeR: Biological coefficient of variation (BCV) vs abundance")
  dev.off()
}


print("Fitting GLM...")
fit <- glmFit(dge, design)

print(paste0("Performing likelihood ratio test: ", contrast))
cont <- makeContrasts(contrasts = c(contrast), levels = design)

lrt <- glmLRT(fit, contrast = cont[, 1])

## Full result table (all features); reused for the MA plot below.
result_table <- topTags(lrt, n = nrow(read_counts))\$table

print(paste0("Exporting to file: ", output_1))
write.table(file = output_1, result_table, sep = "\t", row.names = TRUE)
write.table(file = output_2, cpm(dge, normalized.lib.sizes = TRUE), sep = "\t")
## todo EXPORT FPKM
write.table(file = output_4, dge\$counts, sep = "\t")


if (QC) {
  print("Creating MA plots...")

  etable <- result_table[order(result_table\$FDR), ]
  pdf(output_7)
  with(etable, plot(logCPM, logFC, pch = 20, main = "edgeR: Fold change vs abundance"))
  ## Highlight features with FDR below 0.05 in red
  with(subset(etable, FDR < 0.05), points(logCPM, logFC, pch = 20, col = "red"))
  abline(h = c(-1, 1), col = "blue")
  dev.off()
}
print("Done!")
]]></configfile>
	</configfiles>

	<outputs>
		<data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" />
		<data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
		<data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" />

		<!-- R CMD BATCH log; only kept when "R Debug output" is set to Yes -->
		<data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output" >
			<filter>(debug == "true")</filter>
		</data>

		<!-- The three PDF reports are only kept when "Quality control reports" is set to Yes -->
		<data format="pdf" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
			<filter>(qc == "true")</filter>
		</data>

		<data format="pdf" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
			<filter>(qc == "true")</filter>
		</data>

		<data format="pdf" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
			<filter>(qc == "true")</filter>
		</data>
	</outputs>

	<help>
		input: Design matrix using "create Design matrix" tool
		input: contrast
	</help>
</tool>