| 
79
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 import argparse, os, shutil, sys, tempfile, subprocess
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 __version_info__ = ('1', '0', '0')#, 'beta')
 | 
| 
 | 
     6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3]
 | 
| 
 | 
     7 __author__ = 'Youri Hoogstrate'
 | 
| 
 | 
     8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools'
 | 
| 
 | 
     9 __license__ = 'GNU General Public License v3 (GPLv3)'
 | 
| 
 | 
    10 
 | 
| 
 | 
    11 
 | 
| 
 | 
    12 class sampleContainer:
 | 
| 
 | 
    13 	def __init__(self):
 | 
| 
 | 
    14 		self.samples = []
 | 
| 
 | 
    15 		self.treatments = {}
 | 
| 
 | 
    16 		self.treatment_index = []
 | 
| 
 | 
    17 		self.treatment_types = {}
 | 
| 
 | 
    18 	
 | 
| 
 | 
    19 	def do_decode(self,encoded_str):
 | 
| 
 | 
    20 		return encoded_str.decode("base64").strip().replace("\t",'')
 | 
| 
 | 
    21 	
 | 
| 
 | 
    22 	def add_samples(self,argument):
 | 
| 
 | 
    23 		print " - Adding samples"
 | 
| 
 | 
    24 		for sample in argument:
 | 
| 
 | 
    25 			self.add_sample(self.do_decode(sample))
 | 
| 
 | 
    26 	
 | 
| 
 | 
    27 	def add_sample(self,sample):
 | 
| 
 | 
    28 		if(sample in self.samples):
 | 
| 
 | 
    29 			sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n")
 | 
| 
 | 
    30 			sys.exit(1)
 | 
| 
 | 
    31 		else:
 | 
| 
 | 
    32 			self.samples.append(sample)
 | 
| 
 | 
    33 			print "     - Added: "+sample
 | 
| 
 | 
    34 	
 | 
| 
 | 
    35 	def add_blocking(self,argument):
 | 
| 
 | 
    36 		print " - Adding paired samples"
 | 
| 
 | 
    37 		pair = []
 | 
| 
 | 
    38 		for block in argument:
 | 
| 
 | 
    39 			self.add_block(block)
 | 
| 
 | 
    40 	
 | 
| 
 | 
    41 	def add_block(self,blocks):
 | 
| 
 | 
    42 		blocks = blocks.split(":")
 | 
| 
 | 
    43 		as_treatment = blocks[0]
 | 
| 
 | 
    44 		blocks = blocks[1:]
 | 
| 
 | 
    45 		
 | 
| 
 | 
    46 		used_samples = []
 | 
| 
 | 
    47 		indexed_samples = {}
 | 
| 
 | 
    48 		
 | 
| 
 | 
    49 		for i in range(len(blocks)):
 | 
| 
 | 
    50 			block = blocks[i]
 | 
| 
 | 
    51 			samples = self.get_samples_from_block(block)
 | 
| 
 | 
    52 			indexed_samples[i+1] = []
 | 
| 
 | 
    53 			for sample in samples:
 | 
| 
 | 
    54 				if(sample in used_samples):
 | 
| 
 | 
    55 					sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n")
 | 
| 
 | 
    56 					sys.exit(0)
 | 
| 
 | 
    57 				else:
 | 
| 
 | 
    58 					indexed_samples[i+1] = block
 | 
| 
 | 
    59 				used_samples.append(sample)
 | 
| 
 | 
    60 		
 | 
| 
 | 
    61 		for sample in self.samples:
 | 
| 
 | 
    62 			if(sample not in used_samples):
 | 
| 
 | 
    63 				i = i + 1
 | 
| 
 | 
    64 				indexed_samples[i+1] = str(sample).encode('base64').strip()
 | 
| 
 | 
    65 		
 | 
| 
 | 
    66 		for index in indexed_samples.keys():
 | 
| 
 | 
    67 			key = str(index).encode('base64').strip()
 | 
| 
 | 
    68 			as_treatment += ":"+key+":"+indexed_samples[index]
 | 
| 
 | 
    69 		
 | 
| 
 | 
    70 		self.add_treatment(as_treatment)
 | 
| 
 | 
    71 	
 | 
| 
 | 
    72 	def get_samples_from_block(self,decoded_block):
 | 
| 
 | 
    73 		return [ self.do_decode(x) for x in decoded_block.split(",")]
 | 
| 
 | 
    74 	
 | 
| 
 | 
    75 	def add_treatments(self,argument):
 | 
| 
 | 
    76 		print " - Adding treatments"
 | 
| 
 | 
    77 		for treatment in argument:
 | 
| 
 | 
    78 			self.add_treatment(treatment)
 | 
| 
 | 
    79 	
 | 
| 
 | 
    80 	def add_treatment(self,treatment_argument):
 | 
| 
 | 
    81 		print " - Parsing treatment"
 | 
| 
 | 
    82 		
 | 
| 
 | 
    83 		
 | 
| 
 | 
    84 		treatment_argument = treatment_argument.split(":")
 | 
| 
 | 
    85 		name = self.do_decode(treatment_argument[0])
 | 
| 
 | 
    86 		treatment_argument = treatment_argument[1:]
 | 
| 
 | 
    87 		
 | 
| 
 | 
    88 		
 | 
| 
 | 
    89 		treatment = {"factor_index":{},"sample_index":{}}
 | 
| 
 | 
    90 		only_integers = True
 | 
| 
 | 
    91 		
 | 
| 
 | 
    92 		i = 1
 | 
| 
 | 
    93 		for item in treatment_argument:
 | 
| 
 | 
    94 			if(i % 2):
 | 
| 
 | 
    95 				factor = self.do_decode(item)
 | 
| 
 | 
    96 				
 | 
| 
 | 
    97 				if(treatment['factor_index'].has_key(factor)):
 | 
| 
 | 
    98 					sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
 | 
| 
 | 
    99 					sys.exit(0)
 | 
| 
 | 
   100 				else:
 | 
| 
 | 
   101 					print "   - Adding factor: "+factor
 | 
| 
 | 
   102 					treatment["factor_index"][factor] = []
 | 
| 
 | 
   103 					if(not factor.isdigit()):
 | 
| 
 | 
   104 						only_integers = False
 | 
| 
 | 
   105 			else:
 | 
| 
 | 
   106 				for sample in item.split(","):
 | 
| 
 | 
   107 					sample = self.do_decode(sample)
 | 
| 
 | 
   108 					
 | 
| 
 | 
   109 					if(not sample in self.samples):
 | 
| 
 | 
   110 						sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n")
 | 
| 
 | 
   111 						sys.exit(0)
 | 
| 
 | 
   112 					
 | 
| 
 | 
   113 					treatment["factor_index"][factor].append(sample)
 | 
| 
 | 
   114 					if(treatment["sample_index"].has_key(sample)):
 | 
| 
 | 
   115 						sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
 | 
| 
 | 
   116 						sys.exit(0)
 | 
| 
 | 
   117 					else:
 | 
| 
 | 
   118 						treatment["sample_index"][sample] = factor
 | 
| 
 | 
   119 			i += 1
 | 
| 
 | 
   120 		
 | 
| 
 | 
   121 		treatment_factors = sorted(treatment["factor_index"].keys())
 | 
| 
 | 
   122 		
 | 
| 
 | 
   123 		if(name == None):
 | 
| 
 | 
   124 			treatment["name"] = "_vs_".join(treatment_factors)
 | 
| 
 | 
   125 		else:
 | 
| 
 | 
   126 			treatment["name"] = str(name)
 | 
| 
 | 
   127 		
 | 
| 
 | 
   128 		if(len(treatment["sample_index"]) != len(self.samples)):
 | 
| 
 | 
   129 			sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
 | 
| 
 | 
   130 		
 | 
| 
 | 
   131 		if(only_integers):
 | 
| 
 | 
   132 			treatment_type = "integer"
 | 
| 
 | 
   133 		else:
 | 
| 
 | 
   134 			treatment_type = "string"
 | 
| 
 | 
   135 		
 | 
| 
 | 
   136 		if(self.treatments.has_key(treatment["name"])):
 | 
| 
 | 
   137 			sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
 | 
| 
 | 
   138 		else:
 | 
| 
 | 
   139 			self.treatments[treatment["name"]] = treatment
 | 
| 
 | 
   140 			self.treatment_index.append(treatment["name"])
 | 
| 
 | 
   141 			self.treatment_types[treatment["name"]] = treatment_type
 | 
| 
 | 
   142 			print "     - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid"
 | 
| 
 | 
   143 	
 | 
| 
 | 
   144 	def export(self,output):
 | 
| 
 | 
   145 		# Open file stream
 | 
| 
 | 
   146 		if(args.output == "-"):
 | 
| 
 | 
   147 			fh = sys.stdout
 | 
| 
 | 
   148 		else:
 | 
| 
 | 
   149 			fh = open(args.output,"w")
 | 
| 
 | 
   150 		
 | 
| 
 | 
   151 		# Write header:
 | 
| 
 | 
   152 		fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n")
 | 
| 
 | 
   153 		
 | 
| 
 | 
   154 		# Write body:
 | 
| 
 | 
   155 		for sample in self.samples:
 | 
| 
 | 
   156 			fh.write(sample)
 | 
| 
 | 
   157 			for treatment_id in self.treatment_index:
 | 
| 
 | 
   158 				treatment = self.treatments[treatment_id]
 | 
| 
 | 
   159 				fh.write("\t"+treatment["sample_index"][sample])
 | 
| 
 | 
   160 			fh.write("\n")
 | 
| 
 | 
   161 		
 | 
| 
 | 
   162 		fh.close()
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 if __name__=="__main__":
 | 
| 
 | 
   165 	parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
 | 
| 
 | 
   166 	parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
 | 
| 
 | 
   167 	parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
 | 
| 
 | 
   168 	parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
 | 
| 
 | 
   169 	parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True)
 | 
| 
 | 
   170 	parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')
 | 
| 
 | 
   171 	
 | 
| 
 | 
   172 	args = parser.parse_args()
 | 
| 
 | 
   173 	
 | 
| 
 | 
   174 	columns = None
 | 
| 
 | 
   175 	if(args.columns_file):
 | 
| 
 | 
   176 		with open(args.columns_file, "r") as f:
 | 
| 
 | 
   177 			listed_columns = [None] + f.readline().strip("\n").split("\t")
 | 
| 
 | 
   178 			for i in range(1,len(listed_columns)):
 | 
| 
 | 
   179 				listed_columns[i] =  listed_columns[i].encode('base64').replace('\n','')
 | 
| 
 | 
   180 	
 | 
| 
 | 
   181 	s = sampleContainer()
 | 
| 
 | 
   182 	
 | 
| 
 | 
   183 	if(listed_columns):
 | 
| 
 | 
   184 		columns = []
 | 
| 
 | 
   185 		for sample in args.sample_names:
 | 
| 
 | 
   186 			columns.append(listed_columns[int(sample)])
 | 
| 
 | 
   187 		
 | 
| 
 | 
   188 		
 | 
| 
 | 
   189 		treatments = []
 | 
| 
 | 
   190 		for treatment in args.treatments:
 | 
| 
 | 
   191 			treatment = treatment.split(":")
 | 
| 
 | 
   192 			for i in range(1,len(treatment)):
 | 
| 
 | 
   193 				if(i%2 == 0):
 | 
| 
 | 
   194 					treatment_tmp = treatment[i].split(",")
 | 
| 
 | 
   195 					for j in range(len(treatment_tmp)):
 | 
| 
 | 
   196 						treatment_tmp[j] = listed_columns[int(treatment_tmp[j])]
 | 
| 
 | 
   197 					treatment[i] = ",".join(treatment_tmp)
 | 
| 
 | 
   198 					
 | 
| 
 | 
   199 			treatments.append(":".join(treatment))
 | 
| 
 | 
   200 		
 | 
| 
 | 
   201 		blockings = []
 | 
| 
 | 
   202 		if(args.blocking):
 | 
| 
 | 
   203 			for blocking in args.blocking:
 | 
| 
 | 
   204 				blocking = blocking.split(":")
 | 
| 
 | 
   205 				for i in range(1,len(blocking)):
 | 
| 
 | 
   206 					block = blocking[i].split(",")
 | 
| 
 | 
   207 					for j in range(len(block)):
 | 
| 
 | 
   208 						block[j] = listed_columns[int(block[j])]
 | 
| 
 | 
   209 					blocking[i] = ",".join(block)
 | 
| 
 | 
   210 				blockings.append(":".join(blocking))
 | 
| 
 | 
   211 		
 | 
| 
 | 
   212 		s.add_samples(columns)
 | 
| 
 | 
   213 		s.add_treatments(treatments)
 | 
| 
 | 
   214 		s.add_blocking(blockings)
 | 
| 
 | 
   215 	
 | 
| 
 | 
   216 	else:
 | 
| 
 | 
   217 		s.add_samples(args.sample_names)
 | 
| 
 | 
   218 		s.add_treatments(args.treatments)
 | 
| 
 | 
   219 		if(args.blocking):
 | 
| 
 | 
   220 			s.add_blocking(args.blocking)
 | 
| 
 | 
   221 	
 | 
| 
 | 
   222 	s.export(args.output)
 |