comparison edgeR_DGE.xml @ 4:b1aee4b59049 draft

Uploaded
author yhoogstrate
date Thu, 09 Jan 2014 02:55:02 -0500
parents 521bfa975110
children 149a52c74f39
comparison
equal deleted inserted replaced
3:df239301559a 4:b1aee4b59049
6 <!-- 6 <!--
7 The following script is written in the "Cheetah" language: 7 The following script is written in the "Cheetah" language:
8 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html 8 http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
9 --> 9 -->
10 10
11 R CMD BATCH --vanilla '--args 11 R CMD BATCH --vanilla --slave '--args
12 $design_matrix 12 $design_matrix
13 $contrast 13 $contrast
14 14
15 $output_count_edgeR 15 $output_count_edgeR
16 $output_cpm 16 $output_cpm
20 $qc 20 $qc
21 $output_MDSplot 21 $output_MDSplot
22 $output_BCVplot 22 $output_BCVplot
23 $output_MAplot 23 $output_MAplot
24 smearPlot ' 24 smearPlot '
25 /home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE/edgeR_DGE_test.R $output_R 25 $R_script $output_R
26 </command> 26 </command>
27 27
28 <inputs> 28 <inputs>
29 <param name="design_matrix" type="data" format="tabular" help="Design matrix" /> 29 <param name="design_matrix" type="data" format="tabular" help="Design matrix" />
30 30
39 <option value="true" selected="true"> Yes</option> 39 <option value="true" selected="true"> Yes</option>
40 <option value="false">No</option> 40 <option value="false">No</option>
41 </param> 41 </param>
42 </inputs> 42 </inputs>
43 43
44 <configfiles>
45 <configfile name="R_script">
46 library(edgeR)
47
48 ## Fetch command-line arguments (positions follow the '--args' list of the command section)
49 args <- commandArgs(trailingOnly = TRUE)
50 designmatrix <- args[1]
51 contrast <- args[2]
52
53 output_1 <- args[3] ## likelihood ratio test result table
54 output_2 <- args[4] ## normalized CPM table
55 output_3 <- args[5] ##FPKM file - to be implemented
56 output_4 <- args[6] ## raw counts table
57
58 QC <- isTRUE(as.logical(args[7])) ## parse "true"/"false"; nchar(...) > 0 was TRUE for any non-empty value, also "false"
59
60 output_5 <- args[8] ## MDS plot (pdf)
61 output_6 <- args[9] ## BCV plot (pdf)
62 output_7 <- args[10] ## MA plot (pdf)
63
64 output_8 <- args[11] ## not used further down in this script
65
66
67
68
69
70 raw_data <- read.delim(designmatrix, header = TRUE, stringsAsFactors = TRUE)
71
72 ## Obtain read-counts: column 1 of the design matrix is used as the path of one count file per row
73 read_counts <- read.delim(as.character(raw_data[1, 1]), header = FALSE, stringsAsFactors = FALSE, row.names = 1)
74 for (i in seq_len(nrow(raw_data))[-1]) { ## rows 2..n; empty for a single-row design, whereas 2:1 would count down
75 print("parsing counts from:")
76 print(raw_data[i, 1])
77 read_counts <- cbind(read_counts, read.delim(as.character(raw_data[i, 1]), header = FALSE, stringsAsFactors = FALSE, row.names = 1))
78 print(i)
79 }
80
81
82
83 ## Filter out the predefined HTSeq / DEXSeq summary counters (rows that are not real features):
84 exclude_HTSeq <- c("no_feature", "ambiguous", "too_low_aQual", "not_aligned", "alignment_not_unique")
85 exclude_DEXSeq <- c("_ambiguous", "_empty", "_lowaqual", "_notaligned")
86 exclude_HTSeq <- c(exclude_HTSeq, paste0("__", exclude_HTSeq)) ## newer htseq-count output prefixes these names with "__"
87 exclude <- match(c(exclude_HTSeq, exclude_DEXSeq), rownames(read_counts))
88 exclude <- exclude[!is.na(exclude)]
89 if (length(exclude) != 0) {
90 read_counts <- read_counts[-exclude, , drop = FALSE] ## drop = FALSE keeps a data frame even with a single sample column
91 }
92
93
94
95
96
97
98
99
100
101 colnames(read_counts) <- raw_data[, 2]
102 dge <- DGEList(counts = read_counts, genes = rownames(read_counts))
103
104 design_tmp <- raw_data[3:length(raw_data)]
105 rownames(design_tmp) <- colnames(dge)
106 design_formula <- as.formula(paste(c("~0", colnames(design_tmp)), collapse = " + "))
107 design <- model.matrix(design_formula, design_tmp)
108 ## Strip the variable-name prefix from the design column names so only the remainder is left
109 term_names <- colnames(design_tmp)[attr(design, "assign")]
110 full_names <- colnames(design)
111 short_names <- substr(full_names, nchar(term_names) + 1, nchar(full_names))
112 keep_full <- nchar(term_names) == nchar(full_names) ## name is exactly the variable name: nothing would remain after stripping
113 colnames(design) <- replace(short_names, keep_full, full_names[keep_full])
114
115
116
117 print("Calculating normalization factors...")
118 dge <- calcNormFactors(dge) ## every step below stores its result back on dge
119 print("Estimating common dispersion...")
120 dge <- estimateGLMCommonDisp(dge, design)
121 print("Estimating trended dispersion...")
122 dge <- estimateGLMTrendedDisp(dge, design)
123 print("Estimating tagwise dispersion...")
124 dge <- estimateGLMTagwiseDisp(dge, design)
125
126
127
128
129 if (QC) {
130 print("Creating QC plots...")
131 #### MDS Plot, written as pdf to output_5
132 pdf(file = output_5)
133 plotMDS(dge, main = "edgeR MDS Plot")
134 dev.off()
135 #### Biological coefficient of variation plot, written as pdf to output_6
136 pdf(file = output_6)
137 plotBCV(dge, cex = 0.4, main = "edgeR: Biological coefficient of variation (BCV) vs abundance")
138 dev.off()
139 }
140
141
142
143 print("Fitting GLM...")
144 fit <- glmFit(dge, design)
145
146 print(paste0("Performing likelihood ratio test: ", contrast))
147 cont <- c(contrast)
148 cont <- makeContrasts(contrasts = cont, levels = design)
149
150 lrt <- glmLRT(fit, contrast = cont[, 1])
151 print(paste0("Exporting to file: ", output_1))
152 write.table(file = output_1, topTags(lrt, n = nrow(read_counts))[["table"]], sep = "\t", row.names = TRUE) ## [["table"]]: a bare dollar sign would be read as a Cheetah placeholder in this configfile
153 write.table(file = output_2, cpm(dge, normalized.lib.sizes = TRUE), sep = "\t")
154 ## todo EXPORT FPKM
155 write.table(file = output_4, dge[["counts"]], sep = "\t") ## [["counts"]] for the same Cheetah reason
156
157
158
159 if (QC) {
160 print("Creating MA plots...")
161 ## Full result table ordered by FDR; features with FDR below 0.05 are drawn in red
162 ## (columns are accessed with [[ ]] because a bare dollar sign would be read as a Cheetah placeholder)
163 etable <- topTags(lrt, n = nrow(dge))[["table"]]
164 etable <- etable[order(etable[["FDR"]]), ]
165 pdf(output_7)
166 with(etable, plot(logCPM, logFC, pch = 20, main = "edgeR: Fold change vs abundance"))
167 with(subset(etable, FDR < 0.05), points(logCPM, logFC, pch = 20, col = "red"))
168 abline(h = c(-1, 1), col = "blue")
169 dev.off()
170 }
171 print("Done!")
172 </configfile>
173 </configfiles>
174
44 <outputs> 175 <outputs>
45 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" /> 176 <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" />
46 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" /> 177 <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
47 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" /> 178 <data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" />
48 179