Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison edgeR_DGE_test.R @ 3:df239301559a draft
Uploaded
author | yhoogstrate |
---|---|
date | Thu, 09 Jan 2014 02:44:37 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:521bfa975110 | 3:df239301559a |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # edgeR citation: | |
4 # Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression | |
5 # analysis of digital gene expression data. Bioinformatics 26, 139-140 | |
6 # | |
7 # Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag | |
8 # abundance. Bioinformatics 23, 2881-2887 | |
9 | |
10 # Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with | |
11 # applications to SAGE data. Biostatistics, 9, 321-332 | |
12 | |
13 # McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq | |
14 # experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297 | |
15 | |
16 # R script Author: | |
17 # - MSc. René Böttcher (Erasmus MC) | |
18 | |
19 # Hooked into Galaxy Server: | |
20 # - MSc. Youri Hoogstrate (Erasmus MC) | |
21 | |
22 | |
23 setwd("/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE") | |
24 | |
25 library(edgeR) | |
26 | |
27 # Fetch commandline arguments | |
28 args <- commandArgs(trailingOnly = TRUE) | |
29 designmatrix = args[1] | |
30 contrast = args[2] | |
31 | |
32 output_1 = args[3] | |
33 output_2 = args[4] | |
34 output_3 = args[5] #FPKM file - to be implemented | |
35 output_4 = args[6] | |
36 | |
37 QC = nchar(args[7]) > 0 | |
38 | |
39 output_5 = args[8] | |
40 output_6 = args[9] | |
41 output_7 = args[10] | |
42 | |
43 output_8 = args[11] | |
44 | |
45 | |
46 | |
47 | |
48 library(edgeR) | |
49 raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) | |
50 | |
51 # Obtain read-counts | |
52 read_counts = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1) | |
53 for(i in 2:length(raw_data[,1])) { | |
54 print("parsing counts from:") | |
55 print(raw_data[i,1]) | |
56 read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1)) | |
57 print(i) | |
58 } | |
59 | |
60 | |
61 | |
62 # Filter for HTSeq predifined counts: | |
63 exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") | |
64 exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned") | |
65 | |
66 exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) | |
67 exclude = exclude[is.na(exclude)==0] | |
68 if(length(exclude) != 0) { | |
69 read_counts = read_counts[-exclude,] | |
70 } | |
71 | |
72 | |
73 | |
74 | |
75 | |
76 | |
77 | |
78 | |
79 | |
80 colnames(read_counts) = raw_data[,2] | |
81 dge = DGEList(counts=read_counts,genes=rownames(read_counts)) | |
82 | |
83 design_tmp <- raw_data[3:length(raw_data)] | |
84 rownames(design_tmp) <- colnames(dge) | |
85 formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") | |
86 design <- model.matrix(as.formula(formula),design_tmp) | |
87 | |
88 prefixes = colnames(design_tmp)[attr(design,"assign")] | |
89 avoid = nchar(prefixes) == nchar(colnames(design)) | |
90 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) | |
91 replacements[avoid] = colnames(design)[avoid] | |
92 colnames(design) = replacements | |
93 | |
94 | |
95 | |
96 print("Calculating normalization factors...") | |
97 dge = calcNormFactors(dge) | |
98 print("Estimating common dispersion...") | |
99 dge = estimateGLMCommonDisp(dge,design) | |
100 print("Estimating trended dispersion...") | |
101 dge = estimateGLMTrendedDisp(dge,design) | |
102 print("Estimating tagwise dispersion...") | |
103 dge = estimateGLMTagwiseDisp(dge,design) | |
104 | |
105 | |
106 | |
107 | |
108 if (QC == TRUE) { | |
109 print("Creating QC plots...") | |
110 ## MDS Plot | |
111 pdf(output_5) | |
112 plotMDS(dge, main="edgeR MDS Plot") | |
113 dev.off() | |
114 ## Biological coefficient of variation plot | |
115 pdf(output_6) | |
116 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") | |
117 dev.off() | |
118 } | |
119 | |
120 | |
121 | |
122 print("Fitting GLM...") | |
123 fit = glmFit(dge,design) | |
124 | |
125 print(paste("Performing likelihood ratio test: ",contrast,sep="")) | |
126 cont <- c(contrast) | |
127 cont <- makeContrasts(contrasts=cont, levels=design) | |
128 | |
129 lrt <- glmLRT(fit, contrast=cont[,1]) | |
130 print(paste("Exporting to file: ",output_1,sep="")) | |
131 write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T) | |
132 write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") | |
133 # todo EXPORT FPKM | |
134 write.table(file=output_4,dge$counts,sep="\t") | |
135 | |
136 | |
137 | |
138 if (QC == TRUE) { | |
139 print("Creating MA plots...") | |
140 | |
141 | |
142 etable <- topTags(lrt, n=nrow(dge))$table | |
143 etable <- etable[order(etable$FDR), ] | |
144 pdf(output_7) | |
145 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) | |
146 with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) | |
147 abline(h=c(-1,1), col="blue") | |
148 dev.off() | |
149 } | |
150 print("Done!") | |
151 |