31
|
1 #!/usr/bin/env python
|
|
import argparse
import base64
import os
import shutil
import subprocess
import sys
import tempfile
|
|
3
|
|
4
|
|
class sampleContainer:
    """Collect samples and treatments and write them as a design matrix.

    Sample, treatment and factor names are passed in Base64-encoded; fields
    of one treatment are separated by ``:`` and the samples of one factor by
    ``,``.  Every validation failure is reported on stderr and terminates
    the process with exit status 1.
    """

    def __init__(self):
        # Decoded sample names, in the order they were added.
        self.samples = []
        # Treatment name -> {"name", "factor_index", "sample_index"}.
        self.treatments = {}
        # Treatment names in insertion order; this is the output column order.
        self.treatment_index = []
        # Treatment name -> "integer" or "string".
        self.treatment_types = {}

    @staticmethod
    def _fail(message):
        """Write *message* to stderr and exit with a non-zero status."""
        sys.stderr.write("Error:\n* " + message + "\n")
        sys.exit(1)

    @staticmethod
    def _do_encode(value):
        """Return the Base64 text for *value* (any object, via ``%s``).

        Works on Python 2 and 3: the payload is converted to bytes before
        encoding and the result is always returned as a native string
        without line breaks.
        """
        if not isinstance(value, bytes):
            value = ("%s" % (value,)).encode("utf-8")
        encoded = base64.b64encode(value)
        return encoded if isinstance(encoded, str) else encoded.decode("ascii")

    def do_decode(self, encoded_str):
        """Decode a Base64 string; strip outer whitespace and drop all tabs.

        Tabs are removed because the decoded values end up in a
        tab-separated output file.
        """
        decoded = base64.b64decode(encoded_str)
        if not isinstance(decoded, str):
            # Python 3: b64decode yields bytes.
            decoded = decoded.decode("utf-8")
        return decoded.strip().replace("\t", "")

    def add_samples(self, argument):
        """Register every Base64-encoded sample name in *argument*.

        ``None`` is treated as an empty list.
        """
        print(" - Adding samples")
        for sample in argument or []:
            self.add_sample(self.do_decode(sample))

    def add_sample(self, sample):
        """Register one decoded sample name; exits if it is already known."""
        if sample in self.samples:
            self._fail("Non-unique sample: " + sample)
        self.samples.append(sample)
        print(" - Added: " + sample)

    def add_blocking(self, argument):
        """Register every blocking description in *argument*.

        ``None`` is treated as an empty list.
        """
        print(" - Adding paired samples")
        for block in argument or []:
            self.add_block(block)

    def add_block(self, blocks):
        """Turn one blocking description into a treatment and register it.

        *blocks* has the form ``name:samples:samples:...`` where ``name`` is
        Base64-encoded and each ``samples`` field is a comma-separated list
        of Base64-encoded sample names.  Blocks are numbered from 1; every
        registered sample that is not part of any block gets a block of its
        own with the next free number.  Exits if a sample occurs twice.
        """
        fields = blocks.split(":")
        as_treatment = fields[0]
        encoded_blocks = fields[1:]

        used_samples = set()
        indexed_samples = {}
        for number, block in enumerate(encoded_blocks, 1):
            for sample in self.get_samples_from_block(block):
                if sample in used_samples:
                    self._fail("Blocking contains multiple times the same sample: " + sample)
                used_samples.add(sample)
            # The still-encoded field is reused verbatim as the sample list.
            indexed_samples[number] = block

        # An explicit counter (instead of the leaked loop variable) keeps
        # this working when the description contains no blocks at all.
        next_number = len(encoded_blocks) + 1
        for sample in self.samples:
            if sample not in used_samples:
                indexed_samples[next_number] = self._do_encode(sample)
                next_number += 1

        # Sorted so the factor order does not depend on dict ordering.
        for number in sorted(indexed_samples):
            as_treatment += ":" + self._do_encode(number) + ":" + indexed_samples[number]

        self.add_treatment(as_treatment)

    def get_samples_from_block(self, decoded_block):
        """Return the decoded sample names of one comma-separated block.

        Despite the parameter name, the items of *decoded_block* are still
        Base64-encoded when they arrive here.
        """
        return [self.do_decode(x) for x in decoded_block.split(",")]

    def add_treatments(self, argument):
        """Register every treatment description in *argument*."""
        print(" - Adding treatments")
        for treatment in argument or []:
            self.add_treatment(treatment)

    def add_treatment(self, treatment_argument):
        """Parse and register one treatment.

        *treatment_argument* has the form
        ``name:factor:samples:factor:samples:...``; all names are
        Base64-encoded and each ``samples`` field is comma-separated.  An
        empty name falls back to the sorted factor names joined by
        ``_vs_``.  The treatment type is "integer" when every factor name
        consists of digits only, otherwise "string".

        Exits with status 1 when a factor is repeated, a sample is unknown
        or assigned to two factors, the treatment does not cover every
        registered sample, or a treatment of the same name already exists.
        """
        print(" - Parsing treatment")

        fields = treatment_argument.split(":")
        name = self.do_decode(fields[0])

        treatment = {"factor_index": {}, "sample_index": {}}
        only_integers = True
        factor = None

        # After the name, fields alternate: factor name, then its samples.
        for position, item in enumerate(fields[1:]):
            if position % 2 == 0:
                factor = self.do_decode(item)
                if factor in treatment["factor_index"]:
                    self._fail("Factor has been added multiple times to treatment: " + factor)
                print(" - Adding factor: " + factor)
                treatment["factor_index"][factor] = []
                if not factor.isdigit():
                    only_integers = False
                continue

            for encoded_sample in item.split(","):
                sample = self.do_decode(encoded_sample)
                if sample not in self.samples:
                    self._fail("Unknown sample: " + sample)
                if sample in treatment["sample_index"]:
                    self._fail("Factor has been added to treatment before: " + sample + "/" + factor + ", factors must be mutually exclusive!")
                treatment["factor_index"][factor].append(sample)
                treatment["sample_index"][sample] = factor

        if not name:
            name = "_vs_".join(sorted(treatment["factor_index"]))
        treatment["name"] = str(name)

        # export() looks up every sample in every treatment, so an
        # incomplete treatment has to be rejected here.
        if len(treatment["sample_index"]) != len(self.samples):
            self._fail("The number of samples for treatment '" + treatment["name"] + "' (" + str(len(treatment["sample_index"])) + ") is different from the total number of samples (" + str(len(self.samples)) + ").")

        if treatment["name"] in self.treatments:
            self._fail("Treatment was already added: '" + treatment["name"] + "'")

        treatment_type = "integer" if only_integers else "string"
        self.treatments[treatment["name"]] = treatment
        self.treatment_index.append(treatment["name"])
        self.treatment_types[treatment["name"]] = treatment_type
        print(" - Treatment \"" + treatment["name"] + "\" of type \"" + treatment_type + "\" is valid")

    def export(self, output):
        """Write the design matrix as tab-separated text.

        *output* is a file path, or ``"-"`` for stdout.  The first line is
        ``sample-name`` followed by the treatment names; each further line
        holds one sample and its factor for every treatment.  stdout is
        flushed but left open.
        """
        lines = ["sample-name\t" + "\t".join(self.treatment_index) + "\n"]
        for sample in self.samples:
            cells = [sample]
            for treatment_id in self.treatment_index:
                cells.append(self.treatments[treatment_id]["sample_index"][sample])
            lines.append("\t".join(cells) + "\n")

        if output == "-":
            sys.stdout.writelines(lines)
            sys.stdout.flush()
        else:
            with open(output, "w") as fh:
                fh.writelines(lines)
|
|
156
|
|
def read_column_uids(path):
    """Return the Base64-encoded column names from the first line of *path*.

    The line is split on tabs.  Index 0 of the result is a ``None``
    placeholder so that the 1-based column numbers given on the command
    line can be used directly as list indices.
    """
    with open(path, "r") as handle:
        names = handle.readline().rstrip("\r\n").split("\t")

    uids = [None]
    for name in names:
        raw = name if isinstance(name, bytes) else name.encode("utf-8")
        encoded = base64.b64encode(raw)
        # Python 3 returns bytes; the rest of the script works on text.
        uids.append(encoded if isinstance(encoded, str) else encoded.decode("ascii"))
    return uids


def resolve_column(index_text, uids):
    """Return the encoded UID for the 1-based column number *index_text*.

    Exits with status 1 when the value is not an integer or lies outside
    ``1 .. len(uids) - 1``; this keeps 0 from selecting the ``None``
    placeholder and negative numbers from wrapping around.
    """
    try:
        position = int(index_text)
    except ValueError:
        position = 0
    if not 1 <= position < len(uids):
        sys.stderr.write("Error:\n* Invalid column index: " + index_text + "\n")
        sys.exit(1)
    return uids[position]


def translate_spec(spec, uids, alternating):
    """Replace the column numbers in a ``:``-separated spec by encoded UIDs.

    Field 0 (the name) is always left untouched.  With *alternating* true
    (treatments: ``name:factor:samples:factor:samples``) only the
    even-numbered fields hold column numbers; otherwise (blocking:
    ``name:samples:samples``) every field after the name does.
    """
    fields = spec.split(":")
    for position in range(1, len(fields)):
        if alternating and position % 2:
            continue  # factor name, not a sample list
        fields[position] = ",".join(
            resolve_column(index, uids) for index in fields[position].split(",")
        )
    return ":".join(fields)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
    parser.add_argument("-o", "--output", help="Output file, '-' for stdout.", required=True)
    parser.add_argument("-c", "--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
    parser.add_argument("-s", "--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
    parser.add_argument("-t", "--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)', required=True)
    parser.add_argument("-b", "--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')

    args = parser.parse_args()

    # argparse leaves these as None when the option is not given.
    sample_names = args.sample_names or []
    blocking = args.blocking or []

    s = sampleContainer()

    if args.columns_file:
        # Samples are given as 1-based column numbers of the columns file;
        # translate them to the encoded names the container expects.
        listed_columns = read_column_uids(args.columns_file)
        columns = [resolve_column(sample, listed_columns) for sample in sample_names]
        treatments = [translate_spec(t, listed_columns, True) for t in args.treatments]
        blockings = [translate_spec(b, listed_columns, False) for b in blocking]

        s.add_samples(columns)
        s.add_treatments(treatments)
        s.add_blocking(blockings)
    else:
        s.add_samples(sample_names)
        s.add_treatments(args.treatments)
        if blocking:
            s.add_blocking(blocking)

    s.export(args.output)
|