comparison design_matrix_creator.py @ 31:9e9b98a1cb12 draft

Uploaded
author yhoogstrate
date Tue, 20 May 2014 05:28:50 -0400
parents
children 7fc17dc83777
comparison
equal deleted inserted replaced
30:171f3cf6233d 31:9e9b98a1cb12
1 #!/usr/bin/env python
2 import argparse, os, shutil, sys, tempfile, subprocess
3
4
5 class sampleContainer:
6 def __init__(self):
7 self.samples = []
8 self.treatments = {}
9 self.treatment_types = {}
10
11 def do_decode(self,encoded_str):
12 return encoded_str.decode("base64").strip().replace("\t",'')
13
14 def add_samples(self,argument):
15 print " - Adding samples"
16 for sample in argument:
17 self.add_sample(self.do_decode(sample))
18
19 def add_sample(self,sample):
20 if(sample in self.samples):
21 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n")
22 sys.exit(1)
23 else:
24 self.samples.append(sample)
25 print " - Added: "+sample
26
27 def add_blocking(self,argument):
28 print " - Adding paired samples"
29 pair = []
30 for block in argument:
31 self.add_block(block)
32
33 def add_block(self,blocks):
34 blocks = blocks.split(":")
35 as_treatment = blocks[0]
36 blocks = blocks[1:]
37
38 used_samples = []
39 indexed_samples = {}
40
41 for i in range(len(blocks)):
42 block = blocks[i]
43 samples = self.get_samples_from_block(block)
44 indexed_samples[i+1] = []
45 for sample in samples:
46 if(sample in used_samples):
47 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n")
48 sys.exit(0)
49 else:
50 indexed_samples[i+1] = block
51 used_samples.append(sample)
52
53 for sample in self.samples:
54 if(sample not in used_samples):
55 i = i + 1
56 indexed_samples[i+1] = str(sample).encode('base64').strip()
57
58 for index in indexed_samples.keys():
59 key = str(index).encode('base64').strip()
60 as_treatment += ":"+key+":"+indexed_samples[index]
61
62 self.add_treatment(as_treatment)
63
64 def get_samples_from_block(self,decoded_block):
65 return [ self.do_decode(x) for x in decoded_block.split(",")]
66
67 def add_treatments(self,argument):
68 print " - Adding treatments"
69 for treatment in argument:
70 self.add_treatment(treatment)
71
72 def add_treatment(self,treatment_argument):
73 print " - Parsing treatment"
74
75
76 treatment_argument = treatment_argument.split(":")
77 name = self.do_decode(treatment_argument[0])
78 treatment_argument = treatment_argument[1:]
79
80
81 treatment = {"factor_index":{},"sample_index":{}}
82 only_integers = True
83
84 i = 1
85 for item in treatment_argument:
86 if(i % 2):
87 factor = self.do_decode(item)
88
89 if(treatment['factor_index'].has_key(factor)):
90 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
91 sys.exit(0)
92 else:
93 print " - Adding factor: "+factor
94 treatment["factor_index"][factor] = []
95 if(not factor.isdigit()):
96 only_integers = False
97 else:
98 for sample in item.split(","):
99 sample = self.do_decode(sample)
100
101 if(not sample in self.samples):
102 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n")
103 sys.exit(0)
104
105 treatment["factor_index"][factor].append(sample)
106 if(treatment["sample_index"].has_key(sample)):
107 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
108 sys.exit(0)
109 else:
110 treatment["sample_index"][sample] = factor
111 i += 1
112
113 treatment_factors = sorted(treatment["factor_index"].keys())
114
115 if(name == None):
116 treatment["name"] = "_vs_".join(treatment_factors)
117 else:
118 treatment["name"] = str(name)
119
120 if(len(treatment["sample_index"]) != len(self.samples)):
121 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
122
123 if(only_integers):
124 treatment_type = "integer"
125 else:
126 treatment_type = "string"
127
128 if(self.treatments.has_key(treatment["name"])):
129 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
130 else:
131 self.treatments[treatment["name"]] = treatment
132 self.treatment_types[treatment["name"]] = treatment_type
133 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid"
134
135 def export(self,output):
136 # Open file stream
137 if(args.output == "-"):
138 fh = sys.stdout
139 else:
140 fh = open(args.output,"w")
141
142 # Write header:
143 fh.write("sample-name\t"+"\t".join(self.treatments.keys())+"\n")
144
145 # Write body:
146 for sample in self.samples:
147 fh.write(sample)
148 for treatment in self.treatments.keys():
149 fh.write("\t"+self.treatments[treatment]["sample_index"][sample])
150 fh.write("\n")
151
152 fh.close()
153
154 if __name__=="__main__":
155 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
156 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
157 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
158 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
159 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True)
160 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')
161
162 args = parser.parse_args()
163
164 columns = None
165 if(args.columns_file):
166 with open(args.columns_file, "r") as f:
167 listed_columns = [None] + f.readline().strip("\n").split("\t")
168 for i in range(1,len(listed_columns)):
169 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','')
170
171 s = sampleContainer()
172
173 if(listed_columns):
174 columns = []
175 for sample in args.sample_names:
176 columns.append(listed_columns[int(sample)])
177
178
179 treatments = []
180 for treatment in args.treatments:
181 treatment = treatment.split(":")
182 for i in range(1,len(treatment)):
183 if(i%2 == 0):
184 treatment_tmp = treatment[i].split(",")
185 for j in range(len(treatment_tmp)):
186 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])]
187 treatment[i] = ",".join(treatment_tmp)
188
189 treatments.append(":".join(treatment))
190
191 blockings = []
192 if(args.blocking):
193 for blocking in args.blocking:
194 blocking = blocking.split(":")
195 for i in range(1,len(blocking)):
196 block = blocking[i].split(",")
197 for j in range(len(block)):
198 block[j] = listed_columns[int(block[j])]
199 blocking[i] = ",".join(block)
200 blockings.append(":".join(blocking))
201
202 s.add_samples(columns)
203 s.add_treatments(treatments)
204 s.add_blocking(blockings)
205
206 else:
207 s.add_samples(args.sample_names)
208 s.add_treatments(args.treatments)
209 if(args.blocking):
210 s.add_blocking(args.blocking)
211
212 s.export(args.output)