Mercurial > repos > nanettec > lookup
comparison lookup.py @ 0:eac0e302ab3a draft
Uploaded
author | nanettec |
---|---|
date | Thu, 25 Feb 2016 04:58:09 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:eac0e302ab3a |
---|---|
1 """ | |
2 @summary: The information from the Markers file and the QTL Cartographer Z file, | |
3 are combined to proportionally estimate a base pair position at each 2cM interval. | |
4 The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions. | |
5 @author: nanette.coetzer@gmail.com | |
6 @version 5 | |
7 | |
8 """ | |
9 import optparse, sys | |
10 import subprocess | |
11 import tempfile | |
12 import os, re | |
13 | |
14 def stop_err( msg ): | |
15 sys.stderr.write( "%s\n" % msg ) | |
16 sys.exit() | |
17 | |
18 def __main__(): | |
19 #Parse Command Line | |
20 parser = optparse.OptionParser() | |
21 parser.add_option("-i", "--input1", default=None, dest="input1", | |
22 help="Markers file") | |
23 parser.add_option("-j", "--input2", default=None, dest="input2", | |
24 help="QTL Cartographer Z file") | |
25 parser.add_option("-k", "--input3", default=None, dest="input3", | |
26 help="Chromosome length file") | |
27 parser.add_option("-o", "--output1", default=None, dest="output1", | |
28 help="Lookup file") | |
29 parser.add_option("-p", "--output2", default=None, dest="output2", | |
30 help="Lookup summary file") | |
31 | |
32 (options, args) = parser.parse_args() | |
33 | |
34 try: | |
35 open(options.input1, "r").close() | |
36 except TypeError, e: | |
37 stop_err("You need to supply the Markers file:\n" + str(e)) | |
38 except IOError, e: | |
39 stop_err("Can not open the Markers file:\n" + str(e)) | |
40 | |
41 try: | |
42 open(options.input2, "r").close() | |
43 except TypeError, e: | |
44 stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e)) | |
45 except IOError, e: | |
46 stop_err("Can not open the QTL Cartographer Z file:\n" + str(e)) | |
47 | |
48 try: | |
49 open(options.input3, "r").close() | |
50 except TypeError, e: | |
51 stop_err("You need to supply the Chromosome length file:\n" + str(e)) | |
52 except IOError, e: | |
53 stop_err("Can not open the Chromosome length file:\n" + str(e)) | |
54 | |
55 | |
56 ############################################## | |
57 | |
58 in1 = open(options.input1,'r') # Markers.txt | |
59 prev_c = "0" | |
60 count = 0 | |
61 marker_dict = {} # For each unique chr + marker, store the cM + bp positions | |
62 last_marker_dict = {} # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately] | |
63 | |
64 for line in in1: # markers file | |
65 count += 1 | |
66 if line.startswith("c"): | |
67 # skip the header line | |
68 if count != 1: | |
69 # use info in each line to populate marker_dict | |
70 i = line.strip().split("\t") | |
71 marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] | |
72 chr = i[0] | |
73 if chr != prev_c and prev_c != "0": | |
74 # if the chr number changes, add previous marker info to last_marker_dict | |
75 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp | |
76 prev_c = i[0] | |
77 prev_m = i[1] | |
78 prev_cM = i[3] | |
79 prev_bp = i[4] | |
80 else: | |
81 # no header row | |
82 # use info in each line to populate marker_dict | |
83 i = line.strip().split("\t") | |
84 marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4] | |
85 chr = i[0] | |
86 if chr != prev_c and prev_c != "0": | |
87 # if the chr number changes, add previous marker info to last_marker_dict | |
88 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp | |
89 prev_c = i[0] | |
90 prev_m = i[1] | |
91 prev_cM = i[3] | |
92 prev_bp = i[4] | |
93 | |
94 # add the last marker (on last chr) info to last_marker_dict | |
95 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp | |
96 in1.close() | |
97 | |
98 ############################################## | |
99 | |
100 in1 = open(options.input1,'r') | |
101 s2_l2 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) bp position | |
102 s1_l1 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) cM position | |
103 count = 0 | |
104 pc = 0 | |
105 if line.startswith("c"): | |
106 for line in in1: | |
107 count += 1 | |
108 if count != 1: | |
109 i = line.strip().split("\t") | |
110 if i[0] == pc: | |
111 s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] | |
112 s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] | |
113 else: | |
114 pass | |
115 pc = i[0] # previous chr | |
116 pm = i[1] # previous marker | |
117 pcM = i[3] # previous cM position | |
118 pbp = i[4] # previous bp position | |
119 else: | |
120 for line in in1: | |
121 count += 1 | |
122 #if count != 1: | |
123 i = line.strip().split("\t") | |
124 if i[0] == pc: | |
125 s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4] | |
126 s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3] | |
127 else: | |
128 pass | |
129 pc = i[0] # previous chr | |
130 pm = i[1] # previous marker | |
131 pcM = i[3] # previous cM position | |
132 pbp = i[4] # previous bp position | |
133 in1.close() | |
134 | |
135 ############################################## | |
136 | |
137 id = 0 | |
138 check1 = "" | |
139 check2 = "" | |
140 prev_m = 0 | |
141 prev_c = "0" | |
142 # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file) | |
143 r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+") | |
144 | |
145 # create temp output file | |
146 lookup_temp = tempfile.mktemp() | |
147 temp_outfile = open(lookup_temp,'w') | |
148 temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n") | |
149 | |
150 in2 = open(options.input2,'r') | |
151 for line in in2: # z file | |
152 if check2 == "": | |
153 count = 0 | |
154 i = line.strip() | |
155 m1 = r.match(i) | |
156 try: | |
157 c = m1.group(1) | |
158 ans = 1 | |
159 except: | |
160 ans = 0 | |
161 if ans == 1: | |
162 c = m1.group(1) | |
163 m = m1.group(2) | |
164 p = m1.group(3) | |
165 # if the previous marker number is larger than the current marker number, a new chromosome started | |
166 # before carrying on, add the last marker of the previous chromosome | |
167 if int(prev_m) > int(m): | |
168 id += 1 | |
169 add_m = int(prev_m)+1 | |
170 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] | |
171 add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] | |
172 add_p = (float(add_cM) + 0.01)/100 | |
173 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") | |
174 #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m] | |
175 # Calculation: | |
176 s1 = float(s1_l1[c+"\t"+m].split("\t")[0]) | |
177 l1 = float(s1_l1[c+"\t"+m].split("\t")[1]) | |
178 s2 = float(s2_l2[c+"\t"+m].split("\t")[0]) | |
179 l2 = float(s2_l2[c+"\t"+m].split("\t")[1]) | |
180 v1 = float(p)*100-0.01 | |
181 x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0)) | |
182 check1 = "done" | |
183 id += 1 | |
184 temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n") | |
185 # store previous chr and marker | |
186 prev_m = m | |
187 prev_c = c | |
188 # do not go on to the other blocks in the z.txt file | |
189 if ans == 0 and check1 == "done": | |
190 check2 = "stop" | |
191 # add the last marker of the last chromosome | |
192 id += 1 | |
193 add_m = int(prev_m)+1 | |
194 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0] | |
195 add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1] | |
196 add_p = (float(add_cM) + 0.01)/100 | |
197 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n") | |
198 | |
199 in2.close() | |
200 temp_outfile.close() | |
201 | |
202 ############################################## | |
203 | |
204 # Aim1: add cM_length column | |
205 # Aim2: create lookup summary file | |
206 temp_infile = open(lookup_temp,'r') | |
207 in3 = open(options.input3,'r') # chr_length.txt | |
208 out1 = open(options.output1,'w') # lookup.txt | |
209 out2 = open(options.output2,'w') # lookup_summary.txt | |
210 | |
211 out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n") | |
212 chr_dict = {} | |
213 for line in in3: | |
214 l = line.strip().split("\t") | |
215 if l[0].startswith("chr"): | |
216 chr = l[0][3:] | |
217 else: | |
218 chr = l[0] | |
219 chr_dict[chr] = l[1] | |
220 | |
221 tot_num_int_pos = 0 | |
222 count = 0 | |
223 # calc totals for summary file | |
224 tot_markers = 0 | |
225 tot_bp = 0 | |
226 tot_cM = 0 | |
227 tot_int_pos = 0 | |
228 tot_bins = 0 | |
229 for line in temp_infile: | |
230 l = line.strip().split("\t") | |
231 if count == 0: | |
232 out1.write(line.strip()+"\n") | |
233 if count > 1: | |
234 if l[4] != "0.0": | |
235 length_cm = float(l[4])-float(prev_cm) | |
236 out1.write(prev_line+"\t"+str(length_cm)+"\n") | |
237 else: | |
238 # always the end of a chromosome | |
239 out1.write(prev_line+"\t0\n") | |
240 # summary file | |
241 pl = prev_line.split("\t") | |
242 num_int_pos = int(pl[0]) - tot_num_int_pos | |
243 tot_num_int_pos += num_int_pos | |
244 out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") | |
245 # calc totals for summary file | |
246 tot_markers += int(pl[2]) | |
247 tot_bp += int(chr_dict[pl[1]]) | |
248 tot_cM += float(pl[4]) | |
249 tot_int_pos += num_int_pos | |
250 tot_bins += (num_int_pos-1) | |
251 prev_line = line.strip() | |
252 prev_cm = l[4] | |
253 count += 1 | |
254 # the end of the last chromosome | |
255 out1.write(prev_line+"\t0\n") | |
256 # summary file | |
257 pl = prev_line.split("\t") | |
258 num_int_pos = int(pl[0]) - tot_num_int_pos | |
259 tot_num_int_pos += num_int_pos | |
260 out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n") | |
261 # calc totals for summary file | |
262 tot_markers += int(pl[2]) | |
263 tot_bp += int(chr_dict[pl[1]]) | |
264 tot_cM += float(pl[4]) | |
265 tot_int_pos += num_int_pos | |
266 tot_bins += (num_int_pos-1) | |
267 out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n") | |
268 | |
269 temp_infile.close() | |
270 in3.close() | |
271 out1.close() | |
272 out2.close() | |
273 | |
274 ############################################## | |
275 | |
276 if __name__=="__main__": | |
277 __main__() | |
278 | |
279 | |
280 |