Mercurial > repos > yhoogstrate > edger_with_design_matrix

--- a/edgeR_DGE.xml	Tue May 20 05:26:34 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,212 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<tool id="edger_dge" name="edgeR Differential GeneExpression Analysis">
-	<description>RNA-Seq expression analysis using edgeR (R package)</description>
-
-	<command>
-		<!--
-			The following script is written in the "Cheetah" language:
-			http://www.cheetahtemplate.org/docs/users_guide_html_multipage/contents.html
-		-->
-
-		R --vanilla --slave -f $R_script '--args
-			$design_matrix
-			$contrast
-
-			$output_count_edgeR
-			$output_cpm
-			output_FPXM
-			$output_raw_counts
-
-			$qc
-			$output_MDSplot
-			$output_BCVplot
-			$output_MAplot
-			smearPlot '
-			> $output_R
-			2> stderr.txt
-			&amp;&amp;
-			grep -v 'Calculating library sizes from column' stderr.txt 1>&amp;2
-
-	</command>
-
-	<inputs>
-		<param name="design_matrix" type="data" format="tabular" help="Design matrix" />
-
-		<param name="contrast" type="text" label="Contrast (biological question)" help="e.g. 'tumor-normal' or '(G1+G2)/2-G3' using the factors chosen in the design matrix. Read the 'makeContrasts' manual from Limma package for more info." />
-
-		<param name="qc" type="select" label="Quality control reports">
-			<option value="true" selected="true">Yes</option>
-			<option value="false">No</option>
-		</param>
-
-		<param name="debug" type="select" label="R Debug output">
-			<option value="true"> Yes</option>
-			<option value="false" selected="true">No</option>
-		</param>
-	</inputs>
-
-	<configfiles>
-		<configfile name="R_script">
-library(edgeR,quietly=TRUE) ## enable quietly to avoid unnecessaity stderr dumping for loading limma
-
-## Fetch commandline arguments
-args &lt;- commandArgs(trailingOnly = TRUE)
-designmatrix        = args[1]
-contrast            = args[2]
-
-output_1            = args[3]
-output_2            = args[4]
-output_3            = args[5]		##FPKM file - yet to be implemented
-output_4            = args[6]
-
-QC                  = nchar(args[7]) > 0
-
-output_5            = args[8]
-output_6            = args[9]
-output_7            = args[10]
-
-output_8            = args[11]
-
-
-library(edgeR)
-raw_data &lt;- read.delim(designmatrix,header=T,stringsAsFactors=T)
-
-## Obtain read-counts
-
-header = read.delim(as.character(raw_data[1,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
-has_header = (class(header[1,1]) == "character")
-
-read_counts = read.delim(as.character(raw_data[1,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
-
-for(i in 2:length(raw_data[,1])) {
-  write("parsing counts from:",stdout())
-  write(raw_data[i,1],stdout())
-
-  header = read.delim(as.character(raw_data[i,1]),header=F,stringsAsFactors=F,row.names=1,nrows=1)
-  has_header = (class(header[1,1]) == "character")
-  table = read.delim(as.character(raw_data[i,1]),header=has_header,stringsAsFactors=F,row.names=1)[1]
-
-  read_counts = cbind(read_counts,table)
-}
-
-colnames(read_counts) = as.character(raw_data[,2])
-
-
-
-## Filter for HTSeq predifined counts:
-exclude_HTSeq = c("no_feature","ambiguous","too_low_aQual","not_aligned","alignment_not_unique")
-exclude_DEXSeq = c("_ambiguous","_empty","_lowaqual","_notaligned")
-
-exclude = match(c(exclude_HTSeq, exclude_DEXSeq),rownames(read_counts))
-exclude = exclude[is.na(exclude)==0]
-if(length(exclude) != 0)  {
-  read_counts = read_counts[-exclude,]
-}
-
-
-
-empty_samples = apply(read_counts,2,function(x) sum(x) == 0)
-if(sum(empty_samples) > 0) {
-  write(paste("There are ",sum(empty_samples)," empty samples found:",sep=""),stderr())
-  write(colnames(read_counts)[empty_samples],stderr())
-} else {
-  dge = DGEList(counts=read_counts,genes=rownames(read_counts))
-
-  design_tmp &lt;- raw_data[3:length(raw_data)]
-  rownames(design_tmp)     &lt;- colnames(dge)
-  formula = paste(c("~0",colnames(design_tmp)),collapse = " + ")
-  design &lt;- model.matrix(as.formula(formula),design_tmp)
-
-  prefixes = colnames(design_tmp)[attr(design,"assign")]
-  avoid = nchar(prefixes) == nchar(colnames(design))
-  replacements = substr(colnames(design),nchar(prefixes)+1,nchar(colnames(design)))
-  replacements[avoid] = colnames(design)[avoid]
-  colnames(design) = replacements
-
-
-
-  write("Calculating normalization factors...",stdout())
-  dge = calcNormFactors(dge)
-  write("Estimating common dispersion...",stdout())
-  dge = estimateGLMCommonDisp(dge,design)
-  write("Estimating trended dispersion...",stdout())
-  dge = estimateGLMTrendedDisp(dge,design)
-  write("Estimating tagwise dispersion...",stdout())
-  dge = estimateGLMTagwiseDisp(dge,design)
-
-
-
-
-  if(QC == TRUE) {
-    write("Creating QC plots...",stdout())
-    #### MDS Plot
-    pdf(output_5)
-    plotMDS(dge, main="edgeR MDS Plot")
-    dev.off()
-    #### Biological coefficient of variation plot
-    pdf(output_6)
-    plotBCV(dge, cex=0.4, main="edgeR: Biological coefficient of variation (BCV) vs abundance")
-    dev.off()
-  }
-
-
-
-  write("Fitting GLM...",stdout())
-  fit   = glmFit(dge,design)
-
-  write(paste("Performing likelihood ratio test: ",contrast,sep=""),stdout())
-  cont &lt;- c(contrast)
-  cont &lt;- makeContrasts(contrasts=cont, levels=design)
-
-  lrt &lt;- glmLRT(fit, contrast=cont[,1])
-  write(paste("Exporting to file: ",output_1,sep=""),stdout())
-  write.table(file=output_1,topTags(lrt,n=nrow(read_counts))\$table,sep="\t",row.names=T)
-  write.table(file=output_2,cpm(dge,normalized.lib.sizes=TRUE),sep="\t")
-  ## todo EXPORT FPKM
-  write.table(file=output_4,dge\$counts,sep="\t")
-
-
-
-  if(QC == TRUE) {
-    write("Creating MA plots...",stdout())
-
-    etable &lt;- topTags(lrt, n=nrow(dge))\$table
-    etable &lt;- etable[order(etable\$FDR), ]
-    pdf(output_7)
-    with(etable, plot(logCPM, logFC, pch=20, main="edgeR: Fold change vs abundance"))
-    with(subset(etable, FDR&lt;0.05), points(logCPM, logFC, pch=20, col="red"))
-    abline(h=c(-1,1), col="blue")
-    dev.off()
-  }
-  write("Done!",stdout())
-}
-		</configfile>
-	</configfiles>
-
-	<outputs>
-		<data format="tabular" name="output_count_edgeR" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - table" />
-		<data format="tabular" name="output_cpm" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - CPM" />
-		<data format="tabular" name="output_raw_counts" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - raw counts" />
-
-		<data format="txt" name="output_R" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - R output" >
-			<filter>(debug == "true")</filter>
-		</data>
-
-		<data format="pdf" name="output_MDSplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MDS-plot">
-			<filter>(qc == "true")</filter>
-		</data>
-
-		<data format="pdf" name="output_BCVplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - BCV-plot">
-			<filter>(qc == "true")</filter>
-		</data>
-
-		<data format="pdf" name="output_MAplot" label="edgeR DGE on ${design_matrix.hid}: ${design_matrix.name} - MA-plot">
-			<filter>(qc == "true")</filter>
-		</data>
-	</outputs>
-
-	<help>
-		input: Design matrix using "create Design matrix" tool
-		input: contrast
-	</help>
-</tool>
--- a/edgeR_Design_Matrix.listing.py	Tue May 20 05:26:34 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,26 +0,0 @@
-
-def listfiles(unpaired_samples,paired_samples):
-	file_list = []
-
-	if(unpaired_samples != None):
-		if(type(unpaired_samples) == type([])):
-			for sample in unpaired_samples:
-				file_list.append([sample.name,sample.file_name,False])
-
-		## The following takes care of a bug in galaxy;
-		## if only one history object is selected, it isn't returned as a list but as single object.
-		## This is probably a rudimentair part of code that originates from the time that multiple data-objects were not implemented.
-
-		else:
-			file_list.append([unpaired_samples.name,unpaired_samples.file_name,False])
-
-	if(paired_samples != None):
-		pair_id = 0
-		for pair in paired_samples:
-			pair_id += 1
-			sample_id = 0
-			for sample in pair['samples']:
-				sample_id += 1
-				file_list.append([sample['sample'].name+" *[Pair "+str(pair_id)+"; sample "+str(sample_id)+"]",sample['sample'].file_name,False])
-
-	return file_list
--- a/edgeR_Design_Matrix.py	Tue May 20 05:26:34 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,178 +0,0 @@
-#!/usr/bin/env python
-import argparse, os, shutil, sys, tempfile, subprocess
-
-
-class sampleContainer:
-	def __init__(self):
-		self.samples = []
-		self.pairs = []
-		self.pair_index = {}
-		self.treatments = {}
-		self.treatment_types = {}
-		self.names = {}
-
-	def add_sample(self,sample):
-		if(sample in self.samples):
-			sys.stderr.write("Error:\n* Sample has been added multiple times: "+sample+"\n")
-			sys.exit(1)
-		else:
-			self.samples.append(sample)
-			print "     - Added: "+sample
-
-	def add_pair(self,pair):
-		print "   - Adding pair:"
-		for sample in pair:
-			self.add_sample(sample)
-		self.pairs.append(pair)
-		pair_id = len(self.pairs)
-
-		for sample in pair:
-			self.pair_index[sample] = pair_id
-
-	def add_treatment_argument(self,treatment_argument,name=None):
-		print " - Parsing treatment"
-
-		treatment = {"factor_index":{},"sample_index":{}}
-		only_integers = True
-
-		i = 1
-		for item in treatment_argument:
-			if(i % 2):
-				factor = item
-				if(treatment['factor_index'].has_key(factor)):
-					sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
-				else:
-					print "   - Adding factor: "+factor
-					treatment["factor_index"][factor] = []
-					if(not factor.isdigit()):
-						only_integers = False
-			else:
-				for sample in item.split(","):
-					treatment["factor_index"][factor].append(sample)
-					if(treatment["sample_index"].has_key(sample)):
-						sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
-					else:
-						treatment["sample_index"][sample] = factor
-			i += 1
-
-		treatment_factors = sorted(treatment["factor_index"].keys())
-
-		if(name == None):
-			treatment["name"] = "_vs_".join(treatment_factors)
-		else:
-			treatment["name"] = str(name)
-
-		if(len(treatment["sample_index"]) != len(self.samples)):
-			sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
-
-		if(only_integers):
-			treatment_type = "integer"
-		else:
-			treatment_type = "string"
-
-		if(self.treatments.has_key(treatment["name"])):
-			sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
-		else:
-			self.treatments[treatment["name"]] = treatment
-			self.treatment_types[treatment["name"]] = treatment_type
-			print "     - Treatment of type: "+treatment_type+" is valid: "+treatment["name"]
-
-	def add_names_argument(self,names_argument):
-		for sample in names_argument:
-			self.add_name(sample)
-
-	def add_name(self,argument):
-		argument = argument.split(":",1)
-		self.names[argument[0]] = argument[1]
-
-	def add_unpaired_samples_argument(self,argument):
-		print " - Adding unpaired samples"
-		for sample in argument:
-			self.add_sample(sample)
-
-	def add_paired_samples_argument(self,argument):
-		print " - Adding paired samples"
-		pair = []
-		for potential_sample in argument:
-			if(potential_sample == ":"):
-				self.add_pair(pair)
-				pair = []
-			else:
-				pair.append(potential_sample)
-
-	def add_treatments_argument(self,argument):
-		print " - Adding treatments"
-		treatment_argument = []
-		for potential_treatment in argument:
-			if(potential_treatment == ":"):
-				self.add_treatment_argument(treatment_argument)
-				treatment_argument = []
-			else:
-				treatment_argument.append(potential_treatment)
-
-	def add_pairings(self):
-		samples = sorted(self.samples)
-		treatment_arguments = []
-
-		if len(self.pairs) > 0:
-			for pair_id in range(len(self.pairs)):
-				# add to treatments
-				treatment_arguments.append(str(pair_id+1))
-				treatment_arguments.append(",".join(self.pairs[pair_id]))
-
-				# remove from samples
-				for sample in self.pairs[pair_id]:
-					samples.remove(sample)
-
-		# Add remaining samples as separate factors
-		i = len(self.pairs)+1
-		for sample in samples:
-			treatment_arguments.append(str(i))
-			treatment_arguments.append(sample)
-			i += 1
-
-		treatment_arguments.append(":")
-
-		self.add_treatments_argument(treatment_arguments)
-
-	def export(self,output):
-		# Open file stream
-		if(args.output == "-"):
-			fh = sys.stdout
-		else:
-			fh = open(args.output,"w")
-
-		# Write header:
-		fh.write("filename\tname\t"+"\t".join(self.treatments.keys())+"\n")
-
-		#fh.write("#\ttype")
-		#for treatment in self.treatments.keys():
-		#	fh.write("\t"+self.treatment_types[treatment])
-		#fh.write("\n")
-
-		# Write body:
-		for sample in self.samples:
-			fh.write(sample+"\t"+self.names[sample])
-			for treatment in self.treatments.keys():
-				fh.write("\t"+self.treatments[treatment]["sample_index"][sample])
-			fh.write("\n")
-
-		fh.close()
-
-if __name__=="__main__":
-	parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
-	parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
-	parser.add_argument("-u","--unpaired-samples", nargs="*", help='Unpaired sample files.')
-	parser.add_argument("-p","--paired-samples", nargs="*", help='Paired sample files: pat_x_sample_1.txt, .')
-	parser.add_argument("-t","--treatments", nargs="+", help='Treatment conditions',required=True)
-	parser.add_argument("-n","--sample-names", nargs="+", help='Paired sample files: pat_x_sample_1.txt:NAME .')
-
-	args = parser.parse_args()
-
-	s = sampleContainer()
-	s.add_unpaired_samples_argument(args.unpaired_samples)
-	s.add_paired_samples_argument(args.paired_samples)
-	s.add_treatments_argument(args.treatments)
-	s.add_names_argument(args.sample_names)
-	s.add_pairings()
-	s.export(args.output)
--- a/edgeR_Design_Matrix.xml	Tue May 20 05:26:34 2014 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<tool id="edger_design_matrix_creator" name="edgeR Design Matrix Creator">
-	<description>Create the experimental design for edgeR by a few clicks in Galaxy</description>
-
-	<command interpreter="python">
-		edgeR_Design_Matrix.py
-			-o $contrast_matrix
-			-u
-				#for $unpaired_sample in $unpaired_samples
-					${unpaired_sample}
-				#end for
-			-p
-				#for $paired_sample in $paired_samples
-					#for $sample in $paired_sample.samples
-						${sample.sample}
-					#end for
-					:
-				#end for
-			-t
-				#for $treatment in $check.treatments
-					#for $factor in $treatment.factors
-						"$factor.name"
-						$factor.samples
-					#end for
-					:
-				#end for
-			-n
-				#for $unpaired_sample in $unpaired_samples
-					"${unpaired_sample}:${unpaired_sample.name}"
-				#end for
-				#for $paired_sample in $paired_samples
-					#for $sample in $paired_sample.samples
-						"${sample.sample}:${sample.sample.name}"
-					#end for
-				#end for
-	</command>
-
-	<code file="edgeR_Design_Matrix.listing.py" />
-
-	<inputs>
-		<param name="unpaired_samples" type="data" format="tabular" label="Unpaired read-count dataset" help="from featureCounts/DEXSeq-count/HTSeq-count, etc.  Press [ctrl] and click the sample to unselect it." multiple="true" />
-
-		<repeat name="paired_samples" title="Add paired samples; per patient">
-			<repeat name="samples" title="Add sample for patient" min="2">
-				<param name="sample" type="data" format="tabular" label="Read-count dataset that belongs to a pair" help="from featureCounts/DEXSeq-count/HTSeq-count, etc." />
-			</repeat>
-		</repeat>
-
-		<conditional name="check">
-			<param name="continue" help="When clicking this variable, the treatment menu will appear" type="boolean" truevalue="yes" />
-
-			<when value="yes">
-				<repeat name="treatments" title="Treatments" min="1">
-					<repeat name="factors" title="Factor" min="2" help="e.g. 'normal' or 'tumor', 'untreated', 'recurrent', 'metastatic', or 'group_1',...,'group_n' etc.">
-						<param name="name" type="text" />
-						<param name="samples" type="select" label="Corresponding samples" multiple="true" dynamic_options="listfiles(unpaired_samples,paired_samples)"  />
-					</repeat>
-				</repeat>
-			</when>
-		</conditional>
-
-	</inputs>
-
-	<outputs>
-		<data format="tabular" name="contrast_matrix" label="Contrast matrix" />
-	</outputs>
-
-	<help>
-Examples::
-	 2 group, unpaired::
-		Group name 1 = Healthy
-		Group name 2 = Tumor
-
-	</help>
-</tool>