changeset 4:b1aee4b59049 draft

Uploaded
author yhoogstrate
date Thu, 09 Jan 2014 02:55:02 -0500
parents df239301559a
children c0cd78c1c10d
files edgeR_DGE.xml
diffstat 1 files changed, 133 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/edgeR_DGE.xml	Thu Jan 09 02:44:37 2014 -0500
+++ b/edgeR_DGE.xml	Thu Jan 09 02:55:02 2014 -0500
@@ -8,7 +8,7 @@
 			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
 		-->
 		
-		R CMD BATCH --vanilla '--args
+		R CMD BATCH --vanilla --slave '--args
 			$design_matrix
 			$contrast
 			
@@ -22,7 +22,7 @@
 			$output_BCVplot
 			$output_MAplot
 			smearPlot '
-			/home/youri/Desktop/galaxy/tools/TraIT/edgeR/DGE/edgeR_DGE_test.R $output_R
+			$R_script $output_R
 	</command>
 	
 	<inputs>
@@ -41,6 +41,137 @@
 		</param>
 	</inputs>
 	
+	<configfiles>
+		<configfile name="R_script">
+library(edgeR)
+ 
+## Fetch commandline arguments
+args <- commandArgs(trailingOnly = TRUE)
+designmatrix        = args[1]
+contrast            = args[2]
+
+output_1            = args[3]
+output_2            = args[4]
+output_3            = args[5]		##FPKM file - to be implemented
+output_4            = args[6]
+
+QC                  = nchar(args[7]) > 0
+
+output_5            = args[8]
+output_6            = args[9]
+output_7            = args[10]
+
+output_8            = args[11]
+
+
+
+
+library(edgeR)
+raw_data <- read.delim(designmatrix,header=T,stringsAsFactors=T)
+
+## Obtain read-counts
+read_counts 		= read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1)
+for(i in 2:length(raw_data[,1])) {
+  print("parsing counts from:")
+  print(raw_data[i,1])
+  read_counts = cbind(read_counts,read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1))
+  print(i)
+}
+
+
+
+## Filter for HTSeq predifined counts:
+exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
+exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")
+
+exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
+exclude = exclude[is.na(exclude)==0]
+if(length(exclude) != 0)  {
+  read_counts = read_counts[-exclude,]
+}
+
+
+
+
+
+
+
+
+
+colnames(read_counts) = raw_data[,2]
+dge 					= DGEList(counts=read_counts,genes=rownames(read_counts))
+
+design_tmp <- raw_data[3:length(raw_data)]
+rownames(design_tmp)     <- colnames(dge)
+formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
+design <- model.matrix(as.formula(formula),design_tmp)
+
+prefixes = colnames(design_tmp)[attr(design,"assign")]
+avoid = nchar(prefixes) == nchar(colnames(design))
+replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
+replacements[avoid] = colnames(design)[avoid]
+colnames(design) = replacements
+
+
+
+print("Calculating normalization factors...")
+dge		= calcNormFactors(dge)
+print("Estimating common dispersion...")
+dge 	= estimateGLMCommonDisp(dge,design)
+print("Estimating trended dispersion...")
+dge 	= estimateGLMTrendedDisp(dge,design)
+print("Estimating tagwise dispersion...")
+dge 	= estimateGLMTagwiseDisp(dge,design)
+
+
+
+
+if (QC == TRUE) {
+  print("Creating QC plots...")
+  #### MDS Plot
+  pdf(output_5)
+  plotMDS(dge, main="edgeR MDS Plot")
+  dev.off()
+  #### Biological coefficient of variation plot
+  pdf(output_6)
+  plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
+  dev.off()
+}
+
+
+
+print("Fitting GLM...")
+fit 	= glmFit(dge,design)
+
+print(paste("Performing likelihood ratio test: ",contrast,sep=""))
+cont <- c(contrast)
+cont <- makeContrasts(contrasts=cont, levels=design)
+
+lrt <- glmLRT(fit, contrast=cont[,1])
+print(paste("Exporting to file: ",output_1,sep=""))
+write.table(file=output_1,topTags(lrt,n=nrow(read_counts))$table,sep="\t",row.names=T)
+write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
+## todo EXPORT FPKM
+write.table(file=output_4,dge$counts,sep="\t")
+
+
+
+if (QC == TRUE) {
+  print("Creating MA plots...")
+  
+  
+  etable <- topTags(lrt, n=nrow(dge))$table
+  etable <- etable[order(etable$FDR), ]
+  pdf(output_7)
+  with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
+  with(subset(etable, FDR<0.05), points(logCPM, logFC, pch=20, col="red"))
+  abline(h=c(-1,1), col="blue")
+  dev.off()
+}
+print("Done!")
+		</configfile>
+	</configfiles>
+	
 	<outputs>
 		<data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" />
 		<data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />