Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison edgeR_DGE_test.R @ 3:df239301559a draft
Uploaded
| author | yhoogstrate |
|---|---|
| date | Thu, 09 Jan 2014 02:44:37 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 2:521bfa975110 | 3:df239301559a |
|---|---|
| 1 #!/usr/bin/env Rscript | |
| 2 | |
| 3 # edgeR citation: | |
| 4 # Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor package for differential expression | |
| 5 # analysis of digital gene expression data. Bioinformatics 26, 139-140 | |
| 6 # | |
| 7 # Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing differences in tag | |
| 8 # abundance. Bioinformatics 23, 2881-2887 | |
| 9 | |
| 10 # Robinson MD and Smyth GK (2008). Small-sample estimation of negative binomial dispersion, with | |
| 11 # applications to SAGE data. Biostatistics, 9, 321-332 | |
| 12 | |
| 13 # McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis of multifactor RNA-Seq | |
| 14 # experiments with respect to biological variation. Nucleic Acids Research 40, 4288-4297 | |
| 15 | |
| 16 # R script Author: | |
| 17 # - MSc. René Böttcher (Erasmus MC) | |
| 18 | |
| 19 # Hooked into Galaxy Server: | |
| 20 # - MSc. Youri Hoogstrate (Erasmus MC) | |
| 21 | |
| 22 | |
| 23 setwd("/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE") | |
| 24 | |
| 25 library(edgeR) | |
| 26 | |
| 27 # Fetch commandline arguments | |
| 28 args <- commandArgs(trailingOnly = TRUE) | |
| 29 designmatrix = args[1] | |
| 30 contrast = args[2] | |
| 31 | |
| 32 output_1 = args[3] | |
| 33 output_2 = args[4] | |
| 34 output_3 = args[5] #FPKM file - to be implemented | |
| 35 output_4 = args[6] | |
| 36 | |
| 37 QC = nchar(args[7]) > 0 | |
| 38 | |
| 39 output_5 = args[8] | |
| 40 output_6 = args[9] | |
| 41 output_7 = args[10] | |
| 42 | |
| 43 output_8 = args[11] | |
| 44 | |
| 45 | |
| 46 | |
| 47 | |
| 48 library(edgeR) | |
| 49 raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) | |
| 50 | |
| 51 # Obtain read-counts | |
| 52 read_counts = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1) | |
| 53 for(i in 2:length(raw_data[,1])) { | |
| 54 print("parsing counts from:") | |
| 55 print(raw_data[i,1]) | |
| 56 read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1)) | |
| 57 print(i) | |
| 58 } | |
| 59 | |
| 60 | |
| 61 | |
| 62 # Filter for HTSeq predifined counts: | |
| 63 exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") | |
| 64 exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned") | |
| 65 | |
| 66 exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) | |
| 67 exclude = exclude[is.na(exclude)==0] | |
| 68 if(length(exclude) != 0) { | |
| 69 read_counts = read_counts[-exclude,] | |
| 70 } | |
| 71 | |
| 72 | |
| 73 | |
| 74 | |
| 75 | |
| 76 | |
| 77 | |
| 78 | |
| 79 | |
| 80 colnames(read_counts) = raw_data[,2] | |
| 81 dge = DGEList(counts=read_counts,genes=rownames(read_counts)) | |
| 82 | |
| 83 design_tmp <- raw_data[3:length(raw_data)] | |
| 84 rownames(design_tmp) <- colnames(dge) | |
| 85 formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") | |
| 86 design <- model.matrix(as.formula(formula),design_tmp) | |
| 87 | |
| 88 prefixes = colnames(design_tmp)[attr(design,"assign")] | |
| 89 avoid = nchar(prefixes) == nchar(colnames(design)) | |
| 90 replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) | |
| 91 replacements[avoid] = colnames(design)[avoid] | |
| 92 colnames(design) = replacements | |
| 93 | |
| 94 | |
| 95 | |
| 96 print("Calculating normalization factors...") | |
| 97 dge = calcNormFactors(dge) | |
| 98 print("Estimating common dispersion...") | |
| 99 dge = estimateGLMCommonDisp(dge,design) | |
| 100 print("Estimating trended dispersion...") | |
| 101 dge = estimateGLMTrendedDisp(dge,design) | |
| 102 print("Estimating tagwise dispersion...") | |
| 103 dge = estimateGLMTagwiseDisp(dge,design) | |
| 104 | |
| 105 | |
| 106 | |
| 107 | |
| 108 if (QC == TRUE) { | |
| 109 print("Creating QC plots...") | |
| 110 ## MDS Plot | |
| 111 pdf(output_5) | |
| 112 plotMDS(dge, main="edgeR MDS Plot") | |
| 113 dev.off() | |
| 114 ## Biological coefficient of variation plot | |
| 115 pdf(output_6) | |
| 116 plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") | |
| 117 dev.off() | |
| 118 } | |
| 119 | |
| 120 | |
| 121 | |
| 122 print("Fitting GLM...") | |
| 123 fit = glmFit(dge,design) | |
| 124 | |
| 125 print(paste("Performing likelihood ratio test: ",contrast,sep="")) | |
| 126 cont <- c(contrast) | |
| 127 cont <- makeContrasts(contrasts=cont, levels=design) | |
| 128 | |
| 129 lrt <- glmLRT(fit, contrast=cont[,1]) | |
| 130 print(paste("Exporting to file: ",output_1,sep="")) | |
| 131 write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T) | |
| 132 write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") | |
| 133 # todo EXPORT FPKM | |
| 134 write.table(file=output_4,dge$counts,sep="\t") | |
| 135 | |
| 136 | |
| 137 | |
| 138 if (QC == TRUE) { | |
| 139 print("Creating MA plots...") | |
| 140 | |
| 141 | |
| 142 etable <- topTags(lrt, n=nrow(dge))$table | |
| 143 etable <- etable[order(etable$FDR), ] | |
| 144 pdf(output_7) | |
| 145 with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) | |
| 146 with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) | |
| 147 abline(h=c(-1,1), col="blue") | |
| 148 dev.off() | |
| 149 } | |
| 150 print("Done!") | |
| 151 |
