| 79 | 1 #!/usr/bin/env python | 
|  | 2 | 
|  | 3 import argparse, os, shutil, sys, tempfile, subprocess | 
|  | 4 | 
|  | 5 __version_info__ = ('1', '0', '0')#, 'beta') | 
|  | 6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3] | 
|  | 7 __author__ = 'Youri Hoogstrate' | 
|  | 8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools' | 
|  | 9 __license__ = 'GNU General Public License v3 (GPLv3)' | 
|  | 10 | 
|  | 11 | 
|  | 12 class sampleContainer: | 
|  | 13 	def __init__(self): | 
|  | 14 		self.samples = [] | 
|  | 15 		self.treatments = {} | 
|  | 16 		self.treatment_index = [] | 
|  | 17 		self.treatment_types = {} | 
|  | 18 | 
|  | 19 	def do_decode(self,encoded_str): | 
|  | 20 		return encoded_str.decode("base64").strip().replace("\t",'') | 
|  | 21 | 
|  | 22 	def add_samples(self,argument): | 
|  | 23 		print " - Adding samples" | 
|  | 24 		for sample in argument: | 
|  | 25 			self.add_sample(self.do_decode(sample)) | 
|  | 26 | 
|  | 27 	def add_sample(self,sample): | 
|  | 28 		if(sample in self.samples): | 
|  | 29 			sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") | 
|  | 30 			sys.exit(1) | 
|  | 31 		else: | 
|  | 32 			self.samples.append(sample) | 
|  | 33 			print "     - Added: "+sample | 
|  | 34 | 
|  | 35 	def add_blocking(self,argument): | 
|  | 36 		print " - Adding paired samples" | 
|  | 37 		pair = [] | 
|  | 38 		for block in argument: | 
|  | 39 			self.add_block(block) | 
|  | 40 | 
|  | 41 	def add_block(self,blocks): | 
|  | 42 		blocks = blocks.split(":") | 
|  | 43 		as_treatment = blocks[0] | 
|  | 44 		blocks = blocks[1:] | 
|  | 45 | 
|  | 46 		used_samples = [] | 
|  | 47 		indexed_samples = {} | 
|  | 48 | 
|  | 49 		for i in range(len(blocks)): | 
|  | 50 			block = blocks[i] | 
|  | 51 			samples = self.get_samples_from_block(block) | 
|  | 52 			indexed_samples[i+1] = [] | 
|  | 53 			for sample in samples: | 
|  | 54 				if(sample in used_samples): | 
|  | 55 					sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") | 
|  | 56 					sys.exit(0) | 
|  | 57 				else: | 
|  | 58 					indexed_samples[i+1] = block | 
|  | 59 				used_samples.append(sample) | 
|  | 60 | 
|  | 61 		for sample in self.samples: | 
|  | 62 			if(sample not in used_samples): | 
|  | 63 				i = i + 1 | 
|  | 64 				indexed_samples[i+1] = str(sample).encode('base64').strip() | 
|  | 65 | 
|  | 66 		for index in indexed_samples.keys(): | 
|  | 67 			key = str(index).encode('base64').strip() | 
|  | 68 			as_treatment += ":"+key+":"+indexed_samples[index] | 
|  | 69 | 
|  | 70 		self.add_treatment(as_treatment) | 
|  | 71 | 
|  | 72 	def get_samples_from_block(self,decoded_block): | 
|  | 73 		return [ self.do_decode(x) for x in decoded_block.split(",")] | 
|  | 74 | 
|  | 75 	def add_treatments(self,argument): | 
|  | 76 		print " - Adding treatments" | 
|  | 77 		for treatment in argument: | 
|  | 78 			self.add_treatment(treatment) | 
|  | 79 | 
|  | 80 	def add_treatment(self,treatment_argument): | 
|  | 81 		print " - Parsing treatment" | 
|  | 82 | 
|  | 83 | 
|  | 84 		treatment_argument = treatment_argument.split(":") | 
|  | 85 		name = self.do_decode(treatment_argument[0]) | 
|  | 86 		treatment_argument = treatment_argument[1:] | 
|  | 87 | 
|  | 88 | 
|  | 89 		treatment = {"factor_index":{},"sample_index":{}} | 
|  | 90 		only_integers = True | 
|  | 91 | 
|  | 92 		i = 1 | 
|  | 93 		for item in treatment_argument: | 
|  | 94 			if(i % 2): | 
|  | 95 				factor = self.do_decode(item) | 
|  | 96 | 
|  | 97 				if(treatment['factor_index'].has_key(factor)): | 
|  | 98 					sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") | 
|  | 99 					sys.exit(0) | 
|  | 100 				else: | 
|  | 101 					print "   - Adding factor: "+factor | 
|  | 102 					treatment["factor_index"][factor] = [] | 
|  | 103 					if(not factor.isdigit()): | 
|  | 104 						only_integers = False | 
|  | 105 			else: | 
|  | 106 				for sample in item.split(","): | 
|  | 107 					sample = self.do_decode(sample) | 
|  | 108 | 
|  | 109 					if(not sample in self.samples): | 
|  | 110 						sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") | 
|  | 111 						sys.exit(0) | 
|  | 112 | 
|  | 113 					treatment["factor_index"][factor].append(sample) | 
|  | 114 					if(treatment["sample_index"].has_key(sample)): | 
|  | 115 						sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") | 
|  | 116 						sys.exit(0) | 
|  | 117 					else: | 
|  | 118 						treatment["sample_index"][sample] = factor | 
|  | 119 			i += 1 | 
|  | 120 | 
|  | 121 		treatment_factors = sorted(treatment["factor_index"].keys()) | 
|  | 122 | 
|  | 123 		if(name == None): | 
|  | 124 			treatment["name"] = "_vs_".join(treatment_factors) | 
|  | 125 		else: | 
|  | 126 			treatment["name"] = str(name) | 
|  | 127 | 
|  | 128 		if(len(treatment["sample_index"]) != len(self.samples)): | 
|  | 129 			sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") | 
|  | 130 | 
|  | 131 		if(only_integers): | 
|  | 132 			treatment_type = "integer" | 
|  | 133 		else: | 
|  | 134 			treatment_type = "string" | 
|  | 135 | 
|  | 136 		if(self.treatments.has_key(treatment["name"])): | 
|  | 137 			sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") | 
|  | 138 		else: | 
|  | 139 			self.treatments[treatment["name"]] = treatment | 
|  | 140 			self.treatment_index.append(treatment["name"]) | 
|  | 141 			self.treatment_types[treatment["name"]] = treatment_type | 
|  | 142 			print "     - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" | 
|  | 143 | 
|  | 144 	def export(self,output): | 
|  | 145 		# Open file stream | 
|  | 146 		if(args.output == "-"): | 
|  | 147 			fh = sys.stdout | 
|  | 148 		else: | 
|  | 149 			fh = open(args.output,"w") | 
|  | 150 | 
|  | 151 		# Write header: | 
|  | 152 		fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n") | 
|  | 153 | 
|  | 154 		# Write body: | 
|  | 155 		for sample in self.samples: | 
|  | 156 			fh.write(sample) | 
|  | 157 			for treatment_id in self.treatment_index: | 
|  | 158 				treatment = self.treatments[treatment_id] | 
|  | 159 				fh.write("\t"+treatment["sample_index"][sample]) | 
|  | 160 			fh.write("\n") | 
|  | 161 | 
|  | 162 		fh.close() | 
|  | 163 | 
|  | 164 if __name__=="__main__": | 
|  | 165 	parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") | 
|  | 166 	parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) | 
|  | 167 	parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') | 
|  | 168 	parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') | 
|  | 169 	parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) | 
|  | 170 	parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') | 
|  | 171 | 
|  | 172 	args = parser.parse_args() | 
|  | 173 | 
|  | 174 	columns = None | 
|  | 175 	if(args.columns_file): | 
|  | 176 		with open(args.columns_file, "r") as f: | 
|  | 177 			listed_columns = [None] + f.readline().strip("\n").split("\t") | 
|  | 178 			for i in range(1,len(listed_columns)): | 
|  | 179 				listed_columns[i] =  listed_columns[i].encode('base64').replace('\n','') | 
|  | 180 | 
|  | 181 	s = sampleContainer() | 
|  | 182 | 
|  | 183 	if(listed_columns): | 
|  | 184 		columns = [] | 
|  | 185 		for sample in args.sample_names: | 
|  | 186 			columns.append(listed_columns[int(sample)]) | 
|  | 187 | 
|  | 188 | 
|  | 189 		treatments = [] | 
|  | 190 		for treatment in args.treatments: | 
|  | 191 			treatment = treatment.split(":") | 
|  | 192 			for i in range(1,len(treatment)): | 
|  | 193 				if(i%2 == 0): | 
|  | 194 					treatment_tmp = treatment[i].split(",") | 
|  | 195 					for j in range(len(treatment_tmp)): | 
|  | 196 						treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] | 
|  | 197 					treatment[i] = ",".join(treatment_tmp) | 
|  | 198 | 
|  | 199 			treatments.append(":".join(treatment)) | 
|  | 200 | 
|  | 201 		blockings = [] | 
|  | 202 		if(args.blocking): | 
|  | 203 			for blocking in args.blocking: | 
|  | 204 				blocking = blocking.split(":") | 
|  | 205 				for i in range(1,len(blocking)): | 
|  | 206 					block = blocking[i].split(",") | 
|  | 207 					for j in range(len(block)): | 
|  | 208 						block[j] = listed_columns[int(block[j])] | 
|  | 209 					blocking[i] = ",".join(block) | 
|  | 210 				blockings.append(":".join(blocking)) | 
|  | 211 | 
|  | 212 		s.add_samples(columns) | 
|  | 213 		s.add_treatments(treatments) | 
|  | 214 		s.add_blocking(blockings) | 
|  | 215 | 
|  | 216 	else: | 
|  | 217 		s.add_samples(args.sample_names) | 
|  | 218 		s.add_treatments(args.treatments) | 
|  | 219 		if(args.blocking): | 
|  | 220 			s.add_blocking(args.blocking) | 
|  | 221 | 
|  | 222 	s.export(args.output) |