31
|
1 #!/usr/bin/env python
|
|
2 import argparse, os, shutil, sys, tempfile, subprocess
|
|
3
|
|
4
|
|
5 class sampleContainer:
|
|
6 def __init__(self):
|
|
7 self.samples = []
|
|
8 self.treatments = {}
|
|
9 self.treatment_types = {}
|
|
10
|
|
11 def do_decode(self,encoded_str):
|
|
12 return encoded_str.decode("base64").strip().replace("\t",'')
|
|
13
|
|
14 def add_samples(self,argument):
|
|
15 print " - Adding samples"
|
|
16 for sample in argument:
|
|
17 self.add_sample(self.do_decode(sample))
|
|
18
|
|
19 def add_sample(self,sample):
|
|
20 if(sample in self.samples):
|
|
21 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n")
|
|
22 sys.exit(1)
|
|
23 else:
|
|
24 self.samples.append(sample)
|
|
25 print " - Added: "+sample
|
|
26
|
|
27 def add_blocking(self,argument):
|
|
28 print " - Adding paired samples"
|
|
29 pair = []
|
|
30 for block in argument:
|
|
31 self.add_block(block)
|
|
32
|
|
33 def add_block(self,blocks):
|
|
34 blocks = blocks.split(":")
|
|
35 as_treatment = blocks[0]
|
|
36 blocks = blocks[1:]
|
|
37
|
|
38 used_samples = []
|
|
39 indexed_samples = {}
|
|
40
|
|
41 for i in range(len(blocks)):
|
|
42 block = blocks[i]
|
|
43 samples = self.get_samples_from_block(block)
|
|
44 indexed_samples[i+1] = []
|
|
45 for sample in samples:
|
|
46 if(sample in used_samples):
|
|
47 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n")
|
|
48 sys.exit(0)
|
|
49 else:
|
|
50 indexed_samples[i+1] = block
|
|
51 used_samples.append(sample)
|
|
52
|
|
53 for sample in self.samples:
|
|
54 if(sample not in used_samples):
|
|
55 i = i + 1
|
|
56 indexed_samples[i+1] = str(sample).encode('base64').strip()
|
|
57
|
|
58 for index in indexed_samples.keys():
|
|
59 key = str(index).encode('base64').strip()
|
|
60 as_treatment += ":"+key+":"+indexed_samples[index]
|
|
61
|
|
62 self.add_treatment(as_treatment)
|
|
63
|
|
64 def get_samples_from_block(self,decoded_block):
|
|
65 return [ self.do_decode(x) for x in decoded_block.split(",")]
|
|
66
|
|
67 def add_treatments(self,argument):
|
|
68 print " - Adding treatments"
|
|
69 for treatment in argument:
|
|
70 self.add_treatment(treatment)
|
|
71
|
|
72 def add_treatment(self,treatment_argument):
|
|
73 print " - Parsing treatment"
|
|
74
|
|
75
|
|
76 treatment_argument = treatment_argument.split(":")
|
|
77 name = self.do_decode(treatment_argument[0])
|
|
78 treatment_argument = treatment_argument[1:]
|
|
79
|
|
80
|
|
81 treatment = {"factor_index":{},"sample_index":{}}
|
|
82 only_integers = True
|
|
83
|
|
84 i = 1
|
|
85 for item in treatment_argument:
|
|
86 if(i % 2):
|
|
87 factor = self.do_decode(item)
|
|
88
|
|
89 if(treatment['factor_index'].has_key(factor)):
|
|
90 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
|
|
91 sys.exit(0)
|
|
92 else:
|
|
93 print " - Adding factor: "+factor
|
|
94 treatment["factor_index"][factor] = []
|
|
95 if(not factor.isdigit()):
|
|
96 only_integers = False
|
|
97 else:
|
|
98 for sample in item.split(","):
|
|
99 sample = self.do_decode(sample)
|
|
100
|
|
101 if(not sample in self.samples):
|
|
102 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n")
|
|
103 sys.exit(0)
|
|
104
|
|
105 treatment["factor_index"][factor].append(sample)
|
|
106 if(treatment["sample_index"].has_key(sample)):
|
|
107 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
|
|
108 sys.exit(0)
|
|
109 else:
|
|
110 treatment["sample_index"][sample] = factor
|
|
111 i += 1
|
|
112
|
|
113 treatment_factors = sorted(treatment["factor_index"].keys())
|
|
114
|
|
115 if(name == None):
|
|
116 treatment["name"] = "_vs_".join(treatment_factors)
|
|
117 else:
|
|
118 treatment["name"] = str(name)
|
|
119
|
|
120 if(len(treatment["sample_index"]) != len(self.samples)):
|
|
121 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
|
|
122
|
|
123 if(only_integers):
|
|
124 treatment_type = "integer"
|
|
125 else:
|
|
126 treatment_type = "string"
|
|
127
|
|
128 if(self.treatments.has_key(treatment["name"])):
|
|
129 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
|
|
130 else:
|
|
131 self.treatments[treatment["name"]] = treatment
|
|
132 self.treatment_types[treatment["name"]] = treatment_type
|
|
133 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid"
|
|
134
|
|
135 def export(self,output):
|
|
136 # Open file stream
|
|
137 if(args.output == "-"):
|
|
138 fh = sys.stdout
|
|
139 else:
|
|
140 fh = open(args.output,"w")
|
|
141
|
|
142 # Write header:
|
|
143 fh.write("sample-name\t"+"\t".join(self.treatments.keys())+"\n")
|
|
144
|
|
145 # Write body:
|
|
146 for sample in self.samples:
|
|
147 fh.write(sample)
|
|
148 for treatment in self.treatments.keys():
|
|
149 fh.write("\t"+self.treatments[treatment]["sample_index"][sample])
|
|
150 fh.write("\n")
|
|
151
|
|
152 fh.close()
|
|
153
|
|
154 if __name__=="__main__":
|
|
155 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
|
|
156 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
|
|
157 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
|
|
158 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
|
|
159 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True)
|
|
160 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')
|
|
161
|
|
162 args = parser.parse_args()
|
|
163
|
|
164 columns = None
|
|
165 if(args.columns_file):
|
|
166 with open(args.columns_file, "r") as f:
|
|
167 listed_columns = [None] + f.readline().strip("\n").split("\t")
|
|
168 for i in range(1,len(listed_columns)):
|
|
169 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','')
|
|
170
|
|
171 s = sampleContainer()
|
|
172
|
|
173 if(listed_columns):
|
|
174 columns = []
|
|
175 for sample in args.sample_names:
|
|
176 columns.append(listed_columns[int(sample)])
|
|
177
|
|
178
|
|
179 treatments = []
|
|
180 for treatment in args.treatments:
|
|
181 treatment = treatment.split(":")
|
|
182 for i in range(1,len(treatment)):
|
|
183 if(i%2 == 0):
|
|
184 treatment_tmp = treatment[i].split(",")
|
|
185 for j in range(len(treatment_tmp)):
|
|
186 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])]
|
|
187 treatment[i] = ",".join(treatment_tmp)
|
|
188
|
|
189 treatments.append(":".join(treatment))
|
|
190
|
|
191 blockings = []
|
|
192 if(args.blocking):
|
|
193 for blocking in args.blocking:
|
|
194 blocking = blocking.split(":")
|
|
195 for i in range(1,len(blocking)):
|
|
196 block = blocking[i].split(",")
|
|
197 for j in range(len(block)):
|
|
198 block[j] = listed_columns[int(block[j])]
|
|
199 blocking[i] = ",".join(block)
|
|
200 blockings.append(":".join(blocking))
|
|
201
|
|
202 s.add_samples(columns)
|
|
203 s.add_treatments(treatments)
|
|
204 s.add_blocking(blockings)
|
|
205
|
|
206 else:
|
|
207 s.add_samples(args.sample_names)
|
|
208 s.add_treatments(args.treatments)
|
|
209 if(args.blocking):
|
|
210 s.add_blocking(args.blocking)
|
|
211
|
|
212 s.export(args.output)
|