Mercurial > repos > yhoogstrate > edger_with_design_matrix
changeset 4:b1aee4b59049 draft
Uploaded
author | yhoogstrate |
---|---|
date | Thu, 09 Jan 2014 02:55:02 -0500 |
parents | df239301559a |
children | c0cd78c1c10d |
files | edgeR_DGE.xml |
diffstat | 1 files changed, 133 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/edgeR_DGE.xml Thu Jan 09 02:44:37 2014 -0500 +++ b/edgeR_DGE.xml Thu Jan 09 02:55:02 2014 -0500 @@ -8,7 +8,7 @@ http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html --> - R CMD BATCH --vanilla '--args + R CMD BATCH --vanilla --slave '--args $design_matrix $contrast @@ -22,7 +22,7 @@ $output_BCVplot $output_MAplot smearPlot ' - /home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE/edgeR_DGE_test.R $output_R + $R_script $output_R </command> <inputs> @@ -41,6 +41,137 @@ </param> </inputs> + <configfiles> + <configfile name="R_script"> +library(edgeR) + +## Fetch commandline arguments +args <- commandArgs(trailingOnly = TRUE) +designmatrix = args[1] +contrast = args[2] + +output_1 = args[3] +output_2 = args[4] +output_3 = args[5] ##FPKM file - to be implemented +output_4 = args[6] + +QC = nchar(args[7]) > 0 + +output_5 = args[8] +output_6 = args[9] +output_7 = args[10] + +output_8 = args[11] + + + + +library(edgeR) +raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T) + +## Obtain read-counts +read_counts = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1) +for(i in 2:length(raw_data[,1])) { + print("parsing counts from:") + print(raw_data[i,1]) + read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1)) + print(i) +} + + + +## Filter for HTSeq predifined counts: +exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique") +exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned") + +exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts)) +exclude = exclude[is.na(exclude)==0] +if(length(exclude) != 0) { + read_counts = read_counts[-exclude,] +} + + + + + + + + + +colnames(read_counts) = raw_data[,2] +dge = DGEList(counts=read_counts,genes=rownames(read_counts)) + +design_tmp <- raw_data[3:length(raw_data)] +rownames(design_tmp) <- colnames(dge) +formula = paste(c("~0",colnames(design_tmp)),collapse = " + ") +design <- model.matrix(as.formula(formula),design_tmp) + +prefixes = colnames(design_tmp)[attr(design,"assign")] +avoid = nchar(prefixes) == nchar(colnames(design)) +replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design))) +replacements[avoid] = colnames(design)[avoid] +colnames(design) = replacements + + + +print("Calculating normalization factors...") +dge = calcNormFactors(dge) +print("Estimating common dispersion...") +dge = estimateGLMCommonDisp(dge,design) +print("Estimating trended dispersion...") +dge = estimateGLMTrendedDisp(dge,design) +print("Estimating tagwise dispersion...") +dge = estimateGLMTagwiseDisp(dge,design) + + + + +if (QC == TRUE) { + print("Creating QC plots...") + #### MDS Plot + pdf(output_5) + plotMDS(dge, main="edgeR MDS Plot") + dev.off() + #### Biological coefficient of variation plot + pdf(output_6) + plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance") + dev.off() +} + + + +print("Fitting GLM...") +fit = glmFit(dge,design) + +print(paste("Performing likelihood ratio test: ",contrast,sep="")) +cont <- c(contrast) +cont <- makeContrasts(contrasts=cont, levels=design) + +lrt <- glmLRT(fit, contrast=cont[,1]) +print(paste("Exporting to file: ",output_1,sep="")) +write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T) +write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t") +## todo EXPORT FPKM +write.table(file=output_4,dge$counts,sep="\t") + + + +if (QC == TRUE) { + print("Creating MA plots...") + + + etable <- topTags(lrt, n=nrow(dge))$table + etable <- etable[order(etable$FDR), ] + pdf(output_7) + with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance")) + with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red")) + abline(h=c(-1,1), col="blue") + dev.off() +} +print("Done!") + </configfile> + </configfiles> + <outputs> <data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" /> <data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />