view edgeR_Design_Matrix.py @ 10:a6ce4b7710d8 draft

Uploaded
author yhoogstrate
date Thu, 09 Jan 2014 05:42:44 -0500
parents 82bff7137e81
children
line wrap: on
line source

#!/usr/bin/env python
import argparse, os, shutil, sys, tempfile, subprocess


class sampleContainer:
	def __init__(self):
		self.samples = []
		self.pairs = []
		self.pair_index = {}
		self.treatments = {}
		self.treatment_types = {}
		self.names = {}
	
	def add_sample(self,sample):
		if(sample in self.samples):
			sys.stderr.write("Error:\n* Sample has been added multiple times: "+sample+"\n")
			sys.exit(1)
		else:
			self.samples.append(sample)
			print "     - Added: "+sample
	
	def add_pair(self,pair):
		print "   - Adding pair:"
		for sample in pair:
			self.add_sample(sample)
		self.pairs.append(pair)
		pair_id = len(self.pairs)
		
		for sample in pair:
			self.pair_index[sample] = pair_id
	
	def add_treatment_argument(self,treatment_argument,name=None):
		print " - Parsing treatment"
		
		treatment = {"factor_index":{},"sample_index":{}}
		only_integers = True
		
		i = 1
		for item in treatment_argument:
			if(i % 2):
				factor = item
				if(treatment['factor_index'].has_key(factor)):
					sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
				else:
					print "   - Adding factor: "+factor
					treatment["factor_index"][factor] = []
					if(not factor.isdigit()):
						only_integers = False
			else:
				for sample in item.split(","):
					treatment["factor_index"][factor].append(sample)
					if(treatment["sample_index"].has_key(sample)):
						sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
					else:
						treatment["sample_index"][sample] = factor
			i += 1
		
		treatment_factors = sorted(treatment["factor_index"].keys())
		
		if(name == None):
			treatment["name"] = "_vs_".join(treatment_factors)
		else:
			treatment["name"] = str(name)
		
		if(len(treatment["sample_index"]) != len(self.samples)):
			sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
		
		if(only_integers):
			treatment_type = "integer"
		else:
			treatment_type = "string"
		
		if(self.treatments.has_key(treatment["name"])):
			sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
		else:
			self.treatments[treatment["name"]] = treatment
			self.treatment_types[treatment["name"]] = treatment_type
			print "     - Treatment of type: "+treatment_type+" is valid: "+treatment["name"]
	
	def add_names_argument(self,names_argument):
		for sample in names_argument:
			self.add_name(sample)
	
	def add_name(self,argument):
		argument = argument.split(":",1)
		self.names[argument[0]] = argument[1]
	
	def add_unpaired_samples_argument(self,argument):
		print " - Adding unpaired samples"
		for sample in argument:
			self.add_sample(sample)
	
	def add_paired_samples_argument(self,argument):
		print " - Adding paired samples"
		pair = []
		for potential_sample in argument:
			if(potential_sample == ":"):
				self.add_pair(pair)
				pair = []
			else:
				pair.append(potential_sample)
	
	def add_treatments_argument(self,argument):
		print " - Adding treatments"
		treatment_argument = []
		for potential_treatment in argument:
			if(potential_treatment == ":"):
				self.add_treatment_argument(treatment_argument)
				treatment_argument = []
			else:
				treatment_argument.append(potential_treatment)
	
	def add_pairings(self):
		samples = sorted(self.samples)
		treatment_arguments = []
		
		if len(self.pairs) > 0:
			for pair_id in range(len(self.pairs)):
				# add to treatments
				treatment_arguments.append(str(pair_id+1))
				treatment_arguments.append(",".join(self.pairs[pair_id]))
				
				# remove from samples
				for sample in self.pairs[pair_id]:
					samples.remove(sample)
		
		# Add remaining samples as separate factors
		i = len(self.pairs)+1
		for sample in samples:
			treatment_arguments.append(str(i))
			treatment_arguments.append(sample)
			i += 1
		
		treatment_arguments.append(":")
		
		self.add_treatments_argument(treatment_arguments)
	
	def export(self,output):
		# Open file stream
		if(args.output == "-"):
			fh = sys.stdout
		else:
			fh = open(args.output,"w")
		
		# Write header:
		fh.write("filename\tname\t"+"\t".join(self.treatments.keys())+"\n")
		
		#fh.write("#\ttype")
		#for treatment in self.treatments.keys():
		#	fh.write("\t"+self.treatment_types[treatment])
		#fh.write("\n")
		
		# Write body:
		for sample in self.samples:
			fh.write(sample+"\t"+self.names[sample])
			for treatment in self.treatments.keys():
				fh.write("\t"+self.treatments[treatment]["sample_index"][sample])
			fh.write("\n")
		
		fh.close()

if __name__=="__main__":
	parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
	parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
	parser.add_argument("-u","--unpaired-samples", nargs="*", help='Unpaired sample files.')
	parser.add_argument("-p","--paired-samples", nargs="*", help='Paired sample files: pat_x_sample_1.txt, .')
	parser.add_argument("-t","--treatments", nargs="+", help='Treatment conditions',required=True)
	parser.add_argument("-n","--sample-names", nargs="+", help='Paired sample files: pat_x_sample_1.txt:NAME .')
	
	args = parser.parse_args()
	
	s = sampleContainer()
	s.add_unpaired_samples_argument(args.unpaired_samples)
	s.add_paired_samples_argument(args.paired_samples)
	s.add_treatments_argument(args.treatments)
	s.add_names_argument(args.sample_names)
	s.add_pairings()
	s.export(args.output)