79
|
1 #!/usr/bin/env python
|
|
2
|
|
3 import argparse, os, shutil, sys, tempfile, subprocess
|
|
4
|
|
5 __version_info__ = ('1', '0', '0')#, 'beta')
|
|
6 __version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3])+"-"+__version_info__[3]
|
|
7 __author__ = 'Youri Hoogstrate'
|
|
8 __homepage__ = 'https://bitbucket.org/EMCbioinf/galaxy-tool-shed-tools'
|
|
9 __license__ = 'GNU General Public License v3 (GPLv3)'
|
|
10
|
|
11
|
|
12 class sampleContainer:
|
|
13 def __init__(self):
|
|
14 self.samples = []
|
|
15 self.treatments = {}
|
|
16 self.treatment_index = []
|
|
17 self.treatment_types = {}
|
|
18
|
|
19 def do_decode(self,encoded_str):
|
|
20 return encoded_str.decode("base64").strip().replace("\t",'')
|
|
21
|
|
22 def add_samples(self,argument):
|
|
23 print " - Adding samples"
|
|
24 for sample in argument:
|
|
25 self.add_sample(self.do_decode(sample))
|
|
26
|
|
27 def add_sample(self,sample):
|
|
28 if(sample in self.samples):
|
|
29 sys.stderr.write("Error:\n* Non-unique sample: "+sample+"\n")
|
|
30 sys.exit(1)
|
|
31 else:
|
|
32 self.samples.append(sample)
|
|
33 print " - Added: "+sample
|
|
34
|
|
35 def add_blocking(self,argument):
|
|
36 print " - Adding paired samples"
|
|
37 pair = []
|
|
38 for block in argument:
|
|
39 self.add_block(block)
|
|
40
|
|
41 def add_block(self,blocks):
|
|
42 blocks = blocks.split(":")
|
|
43 as_treatment = blocks[0]
|
|
44 blocks = blocks[1:]
|
|
45
|
|
46 used_samples = []
|
|
47 indexed_samples = {}
|
|
48
|
|
49 for i in range(len(blocks)):
|
|
50 block = blocks[i]
|
|
51 samples = self.get_samples_from_block(block)
|
|
52 indexed_samples[i+1] = []
|
|
53 for sample in samples:
|
|
54 if(sample in used_samples):
|
|
55 sys.stderr.write("Error:\n* Blocking contains multiple times the same sample: "+sample+"\n")
|
|
56 sys.exit(0)
|
|
57 else:
|
|
58 indexed_samples[i+1] = block
|
|
59 used_samples.append(sample)
|
|
60
|
|
61 for sample in self.samples:
|
|
62 if(sample not in used_samples):
|
|
63 i = i + 1
|
|
64 indexed_samples[i+1] = str(sample).encode('base64').strip()
|
|
65
|
|
66 for index in indexed_samples.keys():
|
|
67 key = str(index).encode('base64').strip()
|
|
68 as_treatment += ":"+key+":"+indexed_samples[index]
|
|
69
|
|
70 self.add_treatment(as_treatment)
|
|
71
|
|
72 def get_samples_from_block(self,decoded_block):
|
|
73 return [ self.do_decode(x) for x in decoded_block.split(",")]
|
|
74
|
|
75 def add_treatments(self,argument):
|
|
76 print " - Adding treatments"
|
|
77 for treatment in argument:
|
|
78 self.add_treatment(treatment)
|
|
79
|
|
80 def add_treatment(self,treatment_argument):
|
|
81 print " - Parsing treatment"
|
|
82
|
|
83
|
|
84 treatment_argument = treatment_argument.split(":")
|
|
85 name = self.do_decode(treatment_argument[0])
|
|
86 treatment_argument = treatment_argument[1:]
|
|
87
|
|
88
|
|
89 treatment = {"factor_index":{},"sample_index":{}}
|
|
90 only_integers = True
|
|
91
|
|
92 i = 1
|
|
93 for item in treatment_argument:
|
|
94 if(i % 2):
|
|
95 factor = self.do_decode(item)
|
|
96
|
|
97 if(treatment['factor_index'].has_key(factor)):
|
|
98 sys.stderr.write("Error:\n* Factor has been added multiple times to treatment: "+factor+"\n")
|
|
99 sys.exit(0)
|
|
100 else:
|
|
101 print " - Adding factor: "+factor
|
|
102 treatment["factor_index"][factor] = []
|
|
103 if(not factor.isdigit()):
|
|
104 only_integers = False
|
|
105 else:
|
|
106 for sample in item.split(","):
|
|
107 sample = self.do_decode(sample)
|
|
108
|
|
109 if(not sample in self.samples):
|
|
110 sys.stderr.write("Error:\n* Unknown sample: "+sample+"\n")
|
|
111 sys.exit(0)
|
|
112
|
|
113 treatment["factor_index"][factor].append(sample)
|
|
114 if(treatment["sample_index"].has_key(sample)):
|
|
115 sys.stderr.write("Error:\n* Factor has been added to treatment before: "+sample+"/"+factor+", factors must be mutually exclusive!\n")
|
|
116 sys.exit(0)
|
|
117 else:
|
|
118 treatment["sample_index"][sample] = factor
|
|
119 i += 1
|
|
120
|
|
121 treatment_factors = sorted(treatment["factor_index"].keys())
|
|
122
|
|
123 if(name == None):
|
|
124 treatment["name"] = "_vs_".join(treatment_factors)
|
|
125 else:
|
|
126 treatment["name"] = str(name)
|
|
127
|
|
128 if(len(treatment["sample_index"]) != len(self.samples)):
|
|
129 sys.stderr.write("Error:\n* The number of samples for treatment '"+treatment["name"]+"' ("+str(len(treatment["sample_index"]))+") is different from the total number of samples ("+str(len(self.samples))+").\n")
|
|
130
|
|
131 if(only_integers):
|
|
132 treatment_type = "integer"
|
|
133 else:
|
|
134 treatment_type = "string"
|
|
135
|
|
136 if(self.treatments.has_key(treatment["name"])):
|
|
137 sys.stderr.write("Error:\n* Treatment was already added: '"+treatment["name"]+"\n")
|
|
138 else:
|
|
139 self.treatments[treatment["name"]] = treatment
|
|
140 self.treatment_index.append(treatment["name"])
|
|
141 self.treatment_types[treatment["name"]] = treatment_type
|
|
142 print " - Treatment \""+treatment["name"]+"\" of type \""+treatment_type+"\" is valid"
|
|
143
|
|
144 def export(self,output):
|
|
145 # Open file stream
|
|
146 if(args.output == "-"):
|
|
147 fh = sys.stdout
|
|
148 else:
|
|
149 fh = open(args.output,"w")
|
|
150
|
|
151 # Write header:
|
|
152 fh.write("sample-name\t"+"\t".join(self.treatment_index)+"\n")
|
|
153
|
|
154 # Write body:
|
|
155 for sample in self.samples:
|
|
156 fh.write(sample)
|
|
157 for treatment_id in self.treatment_index:
|
|
158 treatment = self.treatments[treatment_id]
|
|
159 fh.write("\t"+treatment["sample_index"][sample])
|
|
160 fh.write("\n")
|
|
161
|
|
162 fh.close()
|
|
163
|
|
164 if __name__=="__main__":
|
|
165 parser = argparse.ArgumentParser(description="Create an edgeR design matrix with read-count datasets.")
|
|
166 parser.add_argument("-o","--output", help="Output file, '-' for stdout.",required=True)
|
|
167 parser.add_argument("-c","--columns-file", nargs="?", help='Use columns of [this] file as UIDs (counting from 1)')
|
|
168 parser.add_argument("-s","--sample-names", nargs="*", help='Sample names (UIDs that correspond to the columns in the expression matrix)')
|
|
169 parser.add_argument("-t","--treatments", nargs="+", help='Treatment or conditions: "name::sample:condition& (sample-names and conditions have to be provided using Base64 encoding to avoid weird characters)',required=True)
|
|
170 parser.add_argument("-b","--blocking", nargs="+", help='Description of sample blocking: "blocking_condition*&sample-1-name&sample-2-name&sample-n-name"')
|
|
171
|
|
172 args = parser.parse_args()
|
|
173
|
|
174 columns = None
|
|
175 if(args.columns_file):
|
|
176 with open(args.columns_file, "r") as f:
|
|
177 listed_columns = [None] + f.readline().strip("\n").split("\t")
|
|
178 for i in range(1,len(listed_columns)):
|
|
179 listed_columns[i] = listed_columns[i].encode('base64').replace('\n','')
|
|
180
|
|
181 s = sampleContainer()
|
|
182
|
|
183 if(listed_columns):
|
|
184 columns = []
|
|
185 for sample in args.sample_names:
|
|
186 columns.append(listed_columns[int(sample)])
|
|
187
|
|
188
|
|
189 treatments = []
|
|
190 for treatment in args.treatments:
|
|
191 treatment = treatment.split(":")
|
|
192 for i in range(1,len(treatment)):
|
|
193 if(i%2 == 0):
|
|
194 treatment_tmp = treatment[i].split(",")
|
|
195 for j in range(len(treatment_tmp)):
|
|
196 treatment_tmp[j] = listed_columns[int(treatment_tmp[j])]
|
|
197 treatment[i] = ",".join(treatment_tmp)
|
|
198
|
|
199 treatments.append(":".join(treatment))
|
|
200
|
|
201 blockings = []
|
|
202 if(args.blocking):
|
|
203 for blocking in args.blocking:
|
|
204 blocking = blocking.split(":")
|
|
205 for i in range(1,len(blocking)):
|
|
206 block = blocking[i].split(",")
|
|
207 for j in range(len(block)):
|
|
208 block[j] = listed_columns[int(block[j])]
|
|
209 blocking[i] = ",".join(block)
|
|
210 blockings.append(":".join(blocking))
|
|
211
|
|
212 s.add_samples(columns)
|
|
213 s.add_treatments(treatments)
|
|
214 s.add_blocking(blockings)
|
|
215
|
|
216 else:
|
|
217 s.add_samples(args.sample_names)
|
|
218 s.add_treatments(args.treatments)
|
|
219 if(args.blocking):
|
|
220 s.add_blocking(args.blocking)
|
|
221
|
|
222 s.export(args.output)
|