Mercurial > repos > yhoogstrate > edger_with_design_matrix
comparison design_matrix_creator.py @ 31:9e9b98a1cb12 draft
Uploaded
author | yhoogstrate |
---|---|
date | Tue, 20 May 2014 05:28:50 -0400 |
parents | |
children | 7fc17dc83777 |
comparison
equal
deleted
inserted
replaced
30:171f3cf6233d | 31:9e9b98a1cb12 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse, os, shutil, sys, tempfile, subprocess | |
3 | |
4 | |
5 class sampleContainer: | |
6 def __init__(self): | |
7 self.samples = [] | |
8 self.treatments = {} | |
9 self.treatment_types = {} | |
10 | |
11 def do_decode(self,encoded_str): | |
12 return encoded_str.decode("base64").strip().replace("\t",'') | |
13 | |
14 def add_samples(self,argument): | |
15 print " - Adding samples" | |
16 for sample in argument: | |
17 self.add_sample(self.do_decode(sample)) | |
18 | |
19 def add_sample(self,sample): | |
20 if(sample in self.samples): | |
21 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n") | |
22 sys.exit(1) | |
23 else: | |
24 self.samples.append(sample) | |
25 print " - Added: "+sample | |
26 | |
27 def add_blocking(self,argument): | |
28 print " - Adding paired samples" | |
29 pair = [] | |
30 for block in argument: | |
31 self.add_block(block) | |
32 | |
33 def add_block(self,blocks): | |
34 blocks = blocks.split(":") | |
35 as_treatment = blocks[0] | |
36 blocks = blocks[1:] | |
37 | |
38 used_samples = [] | |
39 indexed_samples = {} | |
40 | |
41 for i in range(len(blocks)): | |
42 block = blocks[i] | |
43 samples = self.get_samples_from_block(block) | |
44 indexed_samples[i+1] = [] | |
45 for sample in samples: | |
46 if(sample in used_samples): | |
47 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n") | |
48 sys.exit(0) | |
49 else: | |
50 indexed_samples[i+1] = block | |
51 used_samples.append(sample) | |
52 | |
53 for sample in self.samples: | |
54 if(sample not in used_samples): | |
55 i = i + 1 | |
56 indexed_samples[i+1] = str(sample).encode('base64').strip() | |
57 | |
58 for index in indexed_samples.keys(): | |
59 key = str(index).encode('base64').strip() | |
60 as_treatment += ":"+key+":"+indexed_samples[index] | |
61 | |
62 self.add_treatment(as_treatment) | |
63 | |
64 def get_samples_from_block(self,decoded_block): | |
65 return [ self.do_decode(x) for x in decoded_block.split(",")] | |
66 | |
67 def add_treatments(self,argument): | |
68 print " - Adding treatments" | |
69 for treatment in argument: | |
70 self.add_treatment(treatment) | |
71 | |
72 def add_treatment(self,treatment_argument): | |
73 print " - Parsing treatment" | |
74 | |
75 | |
76 treatment_argument = treatment_argument.split(":") | |
77 name = self.do_decode(treatment_argument[0]) | |
78 treatment_argument = treatment_argument[1:] | |
79 | |
80 | |
81 treatment = {"factor_index":{},"sample_index":{}} | |
82 only_integers = True | |
83 | |
84 i = 1 | |
85 for item in treatment_argument: | |
86 if(i % 2): | |
87 factor = self.do_decode(item) | |
88 | |
89 if(treatment['factor_index'].has_key(factor)): | |
90 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n") | |
91 sys.exit(0) | |
92 else: | |
93 print " - Adding factor: "+factor | |
94 treatment["factor_index"][factor] = [] | |
95 if(not factor.isdigit()): | |
96 only_integers = False | |
97 else: | |
98 for sample in item.split(","): | |
99 sample = self.do_decode(sample) | |
100 | |
101 if(not sample in self.samples): | |
102 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n") | |
103 sys.exit(0) | |
104 | |
105 treatment["factor_index"][factor].append(sample) | |
106 if(treatment["sample_index"].has_key(sample)): | |
107 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n") | |
108 sys.exit(0) | |
109 else: | |
110 treatment["sample_index"][sample] = factor | |
111 i += 1 | |
112 | |
113 treatment_factors = sorted(treatment["factor_index"].keys()) | |
114 | |
115 if(name == None): | |
116 treatment["name"] = "_vs_".join(treatment_factors) | |
117 else: | |
118 treatment["name"] = str(name) | |
119 | |
120 if(len(treatment["sample_index"]) != len(self.samples)): | |
121 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n") | |
122 | |
123 if(only_integers): | |
124 treatment_type = "integer" | |
125 else: | |
126 treatment_type = "string" | |
127 | |
128 if(self.treatments.has_key(treatment["name"])): | |
129 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n") | |
130 else: | |
131 self.treatments[treatment["name"]] = treatment | |
132 self.treatment_types[treatment["name"]] = treatment_type | |
133 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid" | |
134 | |
135 def export(self,output): | |
136 # Open file stream | |
137 if(args.output == "-"): | |
138 fh = sys.stdout | |
139 else: | |
140 fh = open(args.output,"w") | |
141 | |
142 # Write header: | |
143 fh.write("sample-name\t"+"\t".join(self.treatments.keys())+"\n") | |
144 | |
145 # Write body: | |
146 for sample in self.samples: | |
147 fh.write(sample) | |
148 for treatment in self.treatments.keys(): | |
149 fh.write("\t"+self.treatments[treatment]["sample_index"][sample]) | |
150 fh.write("\n") | |
151 | |
152 fh.close() | |
153 | |
154 if __name__=="__main__": | |
155 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.") | |
156 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True) | |
157 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)') | |
158 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)') | |
159 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True) | |
160 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"') | |
161 | |
162 args = parser.parse_args() | |
163 | |
164 columns = None | |
165 if(args.columns_file): | |
166 with open(args.columns_file, "r") as f: | |
167 listed_columns = [None] + f.readline().strip("\n").split("\t") | |
168 for i in range(1,len(listed_columns)): | |
169 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','') | |
170 | |
171 s = sampleContainer() | |
172 | |
173 if(listed_columns): | |
174 columns = [] | |
175 for sample in args.sample_names: | |
176 columns.append(listed_columns[int(sample)]) | |
177 | |
178 | |
179 treatments = [] | |
180 for treatment in args.treatments: | |
181 treatment = treatment.split(":") | |
182 for i in range(1,len(treatment)): | |
183 if(i%2 == 0): | |
184 treatment_tmp = treatment[i].split(",") | |
185 for j in range(len(treatment_tmp)): | |
186 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])] | |
187 treatment[i] = ",".join(treatment_tmp) | |
188 | |
189 treatments.append(":".join(treatment)) | |
190 | |
191 blockings = [] | |
192 if(args.blocking): | |
193 for blocking in args.blocking: | |
194 blocking = blocking.split(":") | |
195 for i in range(1,len(blocking)): | |
196 block = blocking[i].split(",") | |
197 for j in range(len(block)): | |
198 block[j] = listed_columns[int(block[j])] | |
199 blocking[i] = ",".join(block) | |
200 blockings.append(":".join(blocking)) | |
201 | |
202 s.add_samples(columns) | |
203 s.add_treatments(treatments) | |
204 s.add_blocking(blockings) | |
205 | |
206 else: | |
207 s.add_samples(args.sample_names) | |
208 s.add_treatments(args.treatments) | |
209 if(args.blocking): | |
210 s.add_blocking(args.blocking) | |
211 | |
212 s.export(args.output) |