3
|
1 #!/usr/bin/env Rscript
|
|
2
|
|
3 # edgeR citation:
|
|
4 # Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression
|
|
5 # analysis of digital gene expression data. Bioinformatics 26, 139-140
|
|
6 #
|
|
7 # Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag
|
|
8 # abundance. Bioinformatics 23, 2881-2887
|
|
9
|
|
10 # Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with
|
|
11 # applications to SAGE data. Biostatistics, 9, 321-332
|
|
12
|
|
13 # McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq
|
|
14 # experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297
|
|
15
|
|
16 # R script Author:
|
|
17 # - MSc. René Böttcher (Erasmus MC)
|
|
18
|
|
19 # Hooked into Galaxy Server:
|
|
20 # - MSc. Youri Hoogstrate (Erasmus MC)
|
|
21
|
|
22
|
|
23 setwd("/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE")
|
|
24
|
|
25 library(edgeR)
|
|
26
|
|
27 # Fetch commandline arguments
|
|
28 args <- commandArgs(trailingOnly = TRUE)
|
|
29 designmatrix = args[1]
|
|
30 contrast = args[2]
|
|
31
|
|
32 output_1 = args[3]
|
|
33 output_2 = args[4]
|
|
34 output_3 = args[5] #FPKM file - to be implemented
|
|
35 output_4 = args[6]
|
|
36
|
|
37 QC = nchar(args[7]) > 0
|
|
38
|
|
39 output_5 = args[8]
|
|
40 output_6 = args[9]
|
|
41 output_7 = args[10]
|
|
42
|
|
43 output_8 = args[11]
|
|
44
|
|
45
|
|
46
|
|
47
|
|
48 library(edgeR)
|
|
49 raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
|
|
50
|
|
51 # Obtain read-counts
|
|
52 read_counts = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1)
|
|
53 for(i in 2:length(raw_data[,1])) {
|
|
54 print("parsing counts from:")
|
|
55 print(raw_data[i,1])
|
|
56 read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1))
|
|
57 print(i)
|
|
58 }
|
|
59
|
|
60
|
|
61
|
|
62 # Filter for HTSeq predifined counts:
|
|
63 exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
|
|
64 exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")
|
|
65
|
|
66 exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
|
|
67 exclude = exclude[is.na(exclude)==0]
|
|
68 if(length(exclude) != 0) {
|
|
69 read_counts = read_counts[-exclude,]
|
|
70 }
|
|
71
|
|
72
|
|
73
|
|
74
|
|
75
|
|
76
|
|
77
|
|
78
|
|
79
|
|
80 colnames(read_counts) = raw_data[,2]
|
|
81 dge = DGEList(counts=read_counts,genes=rownames(read_counts))
|
|
82
|
|
83 design_tmp <- raw_data[3:length(raw_data)]
|
|
84 rownames(design_tmp) <- colnames(dge)
|
|
85 formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
|
|
86 design <- model.matrix(as.formula(formula),design_tmp)
|
|
87
|
|
88 prefixes = colnames(design_tmp)[attr(design,"assign")]
|
|
89 avoid = nchar(prefixes) == nchar(colnames(design))
|
|
90 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
|
|
91 replacements[avoid] = colnames(design)[avoid]
|
|
92 colnames(design) = replacements
|
|
93
|
|
94
|
|
95
|
|
96 print("Calculating normalization factors...")
|
|
97 dge = calcNormFactors(dge)
|
|
98 print("Estimating common dispersion...")
|
|
99 dge = estimateGLMCommonDisp(dge,design)
|
|
100 print("Estimating trended dispersion...")
|
|
101 dge = estimateGLMTrendedDisp(dge,design)
|
|
102 print("Estimating tagwise dispersion...")
|
|
103 dge = estimateGLMTagwiseDisp(dge,design)
|
|
104
|
|
105
|
|
106
|
|
107
|
|
108 if (QC == TRUE) {
|
|
109 print("Creating QC plots...")
|
|
110 ## MDS Plot
|
|
111 pdf(output_5)
|
|
112 plotMDS(dge, main="edgeR MDS Plot")
|
|
113 dev.off()
|
|
114 ## Biological coefficient of variation plot
|
|
115 pdf(output_6)
|
|
116 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
|
|
117 dev.off()
|
|
118 }
|
|
119
|
|
120
|
|
121
|
|
122 print("Fitting GLM...")
|
|
123 fit = glmFit(dge,design)
|
|
124
|
|
125 print(paste("Performing likelihood ratio test: ",contrast,sep=""))
|
|
126 cont <- c(contrast)
|
|
127 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
128
|
|
129 lrt <- glmLRT(fit, contrast=cont[,1])
|
|
130 print(paste("Exporting to file: ",output_1,sep=""))
|
|
131 write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T)
|
|
132 write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
|
|
133 # todo EXPORT FPKM
|
|
134 write.table(file=output_4,dge$counts,sep="\t")
|
|
135
|
|
136
|
|
137
|
|
138 if (QC == TRUE) {
|
|
139 print("Creating MA plots...")
|
|
140
|
|
141
|
|
142 etable <- topTags(lrt, n=nrow(dge))$table
|
|
143 etable <- etable[order(etable$FDR), ]
|
|
144 pdf(output_7)
|
|
145 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
|
|
146 with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
|
|
147 abline(h=c(-1,1), col="blue")
|
|
148 dev.off()
|
|
149 }
|
|
150 print("Done!")
|
|
151
|