comparison lookup.py @ 0:eac0e302ab3a draft

Uploaded
author nanettec
date Thu, 25 Feb 2016 04:58:09 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:eac0e302ab3a
1 """
2 @summary: The information from the Markers file and the QTL Cartographer Z file,
3 are combined to proportionally estimate a base pair position at each 2cM interval.
4 The Lookup file can then serve as a lookup table to convert between base pair positions centimorgan positions.
5 @author: nanette.coetzer@gmail.com
6 @version 5
7
8 """
9 import optparse, sys
10 import subprocess
11 import tempfile
12 import os, re
13
14 def stop_err( msg ):
15 sys.stderr.write( "%s\n" % msg )
16 sys.exit()
17
18 def __main__():
19 #Parse Command Line
20 parser = optparse.OptionParser()
21 parser.add_option("-i", "--input1", default=None, dest="input1",
22 help="Markers file")
23 parser.add_option("-j", "--input2", default=None, dest="input2",
24 help="QTL Cartographer Z file")
25 parser.add_option("-k", "--input3", default=None, dest="input3",
26 help="Chromosome length file")
27 parser.add_option("-o", "--output1", default=None, dest="output1",
28 help="Lookup file")
29 parser.add_option("-p", "--output2", default=None, dest="output2",
30 help="Lookup summary file")
31
32 (options, args) = parser.parse_args()
33
34 try:
35 open(options.input1, "r").close()
36 except TypeError, e:
37 stop_err("You need to supply the Markers file:\n" + str(e))
38 except IOError, e:
39 stop_err("Can not open the Markers file:\n" + str(e))
40
41 try:
42 open(options.input2, "r").close()
43 except TypeError, e:
44 stop_err("You need to supply the QTL Cartographer Z file:\n" + str(e))
45 except IOError, e:
46 stop_err("Can not open the QTL Cartographer Z file:\n" + str(e))
47
48 try:
49 open(options.input3, "r").close()
50 except TypeError, e:
51 stop_err("You need to supply the Chromosome length file:\n" + str(e))
52 except IOError, e:
53 stop_err("Can not open the Chromosome length file:\n" + str(e))
54
55
56 ##############################################
57
58 in1 = open(options.input1,'r') # Markers.txt
59 prev_c = "0"
60 count = 0
61 marker_dict = {} # For each unique chr + marker, store the cM + bp positions
62 last_marker_dict = {} # For the last marker on each chr, store the cM + bp positions [reason: these intervals are not included in the z.txt file, so add separately]
63
64 for line in in1: # markers file
65 count += 1
66 if line.startswith("c"):
67 # skip the header line
68 if count != 1:
69 # use info in each line to populate marker_dict
70 i = line.strip().split("\t")
71 marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
72 chr = i[0]
73 if chr != prev_c and prev_c != "0":
74 # if the chr number changes, add previous marker info to last_marker_dict
75 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
76 prev_c = i[0]
77 prev_m = i[1]
78 prev_cM = i[3]
79 prev_bp = i[4]
80 else:
81 # no header row
82 # use info in each line to populate marker_dict
83 i = line.strip().split("\t")
84 marker_dict[i[0]+"\t"+i[1]] = i[3]+"\t"+i[4]
85 chr = i[0]
86 if chr != prev_c and prev_c != "0":
87 # if the chr number changes, add previous marker info to last_marker_dict
88 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
89 prev_c = i[0]
90 prev_m = i[1]
91 prev_cM = i[3]
92 prev_bp = i[4]
93
94 # add the last marker (on last chr) info to last_marker_dict
95 last_marker_dict[prev_c+"\t"+prev_m] = prev_cM+"\t"+prev_bp
96 in1.close()
97
98 ##############################################
99
100 in1 = open(options.input1,'r')
101 s2_l2 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) bp position
102 s1_l1 = {} # For each unique chr + marker, store the previous (smallest) and current (largest) cM position
103 count = 0
104 pc = 0
105 if line.startswith("c"):
106 for line in in1:
107 count += 1
108 if count != 1:
109 i = line.strip().split("\t")
110 if i[0] == pc:
111 s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
112 s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
113 else:
114 pass
115 pc = i[0] # previous chr
116 pm = i[1] # previous marker
117 pcM = i[3] # previous cM position
118 pbp = i[4] # previous bp position
119 else:
120 for line in in1:
121 count += 1
122 #if count != 1:
123 i = line.strip().split("\t")
124 if i[0] == pc:
125 s2_l2[pc+"\t"+pm] = pbp+"\t"+i[4]
126 s1_l1[pc+"\t"+pm] = pcM+"\t"+i[3]
127 else:
128 pass
129 pc = i[0] # previous chr
130 pm = i[1] # previous marker
131 pcM = i[3] # previous cM position
132 pbp = i[4] # previous bp position
133 in1.close()
134
135 ##############################################
136
137 id = 0
138 check1 = ""
139 check2 = ""
140 prev_m = 0
141 prev_c = "0"
142 # parse the z.txt file, to extract only the chr, marker and interval position (of the first block in the file)
143 r = re.compile("^(\d+)\s+(\d+)\s+(\d+.\d+)\s+")
144
145 # create temp output file
146 lookup_temp = tempfile.mktemp()
147 temp_outfile = open(lookup_temp,'w')
148 temp_outfile.write("id\tchr\tmarker\tinterval\tcM\tbp\tlength_cM\n")
149
150 in2 = open(options.input2,'r')
151 for line in in2: # z file
152 if check2 == "":
153 count = 0
154 i = line.strip()
155 m1 = r.match(i)
156 try:
157 c = m1.group(1)
158 ans = 1
159 except:
160 ans = 0
161 if ans == 1:
162 c = m1.group(1)
163 m = m1.group(2)
164 p = m1.group(3)
165 # if the previous marker number is larger than the current marker number, a new chromosome started
166 # before carrying on, add the last marker of the previous chromosome
167 if int(prev_m) > int(m):
168 id += 1
169 add_m = int(prev_m)+1
170 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
171 add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
172 add_p = (float(add_cM) + 0.01)/100
173 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
174 #print c+"\t"+m+"\t"+p+"\t"+str(float(p)*100-0.01)+"\t"+marker_dict[c+"\t"+m]
175 # Calculation:
176 s1 = float(s1_l1[c+"\t"+m].split("\t")[0])
177 l1 = float(s1_l1[c+"\t"+m].split("\t")[1])
178 s2 = float(s2_l2[c+"\t"+m].split("\t")[0])
179 l2 = float(s2_l2[c+"\t"+m].split("\t")[1])
180 v1 = float(p)*100-0.01
181 x2 = int(round((((v1-s1)*(l2-s2)/(l1-s1))+s2),0))
182 check1 = "done"
183 id += 1
184 temp_outfile.write(str(id)+"\t"+str(c)+"\t"+str(m)+"\t"+str(p)+"\t"+str(float(p)*100-0.01)+"\t"+str(x2)+"\n")
185 # store previous chr and marker
186 prev_m = m
187 prev_c = c
188 # do not go on to the other blocks in the z.txt file
189 if ans == 0 and check1 == "done":
190 check2 = "stop"
191 # add the last marker of the last chromosome
192 id += 1
193 add_m = int(prev_m)+1
194 add_cM = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[0]
195 add_bp = last_marker_dict[prev_c+"\t"+str(add_m)].split("\t")[1]
196 add_p = (float(add_cM) + 0.01)/100
197 temp_outfile.write(str(id)+"\t"+str(prev_c)+"\t"+str(add_m)+"\t"+str(add_p)+"\t"+str(add_cM)+"\t"+str(add_bp)+"\n")
198
199 in2.close()
200 temp_outfile.close()
201
202 ##############################################
203
204 # Aim1: add cM_length column
205 # Aim2: create lookup summary file
206 temp_infile = open(lookup_temp,'r')
207 in3 = open(options.input3,'r') # chr_length.txt
208 out1 = open(options.output1,'w') # lookup.txt
209 out2 = open(options.output2,'w') # lookup_summary.txt
210
211 out2.write("chr\tmarkers\tcM\tbp\tinterval positions\tbins\n")
212 chr_dict = {}
213 for line in in3:
214 l = line.strip().split("\t")
215 if l[0].startswith("chr"):
216 chr = l[0][3:]
217 else:
218 chr = l[0]
219 chr_dict[chr] = l[1]
220
221 tot_num_int_pos = 0
222 count = 0
223 # calc totals for summary file
224 tot_markers = 0
225 tot_bp = 0
226 tot_cM = 0
227 tot_int_pos = 0
228 tot_bins = 0
229 for line in temp_infile:
230 l = line.strip().split("\t")
231 if count == 0:
232 out1.write(line.strip()+"\n")
233 if count > 1:
234 if l[4] != "0.0":
235 length_cm = float(l[4])-float(prev_cm)
236 out1.write(prev_line+"\t"+str(length_cm)+"\n")
237 else:
238 # always the end of a chromosome
239 out1.write(prev_line+"\t0\n")
240 # summary file
241 pl = prev_line.split("\t")
242 num_int_pos = int(pl[0]) - tot_num_int_pos
243 tot_num_int_pos += num_int_pos
244 out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
245 # calc totals for summary file
246 tot_markers += int(pl[2])
247 tot_bp += int(chr_dict[pl[1]])
248 tot_cM += float(pl[4])
249 tot_int_pos += num_int_pos
250 tot_bins += (num_int_pos-1)
251 prev_line = line.strip()
252 prev_cm = l[4]
253 count += 1
254 # the end of the last chromosome
255 out1.write(prev_line+"\t0\n")
256 # summary file
257 pl = prev_line.split("\t")
258 num_int_pos = int(pl[0]) - tot_num_int_pos
259 tot_num_int_pos += num_int_pos
260 out2.write(str(pl[1])+"\t"+str(pl[2])+"\t"+str(pl[4])+"\t"+chr_dict[pl[1]]+"\t"+str(num_int_pos)+"\t"+str(num_int_pos-1)+"\n")
261 # calc totals for summary file
262 tot_markers += int(pl[2])
263 tot_bp += int(chr_dict[pl[1]])
264 tot_cM += float(pl[4])
265 tot_int_pos += num_int_pos
266 tot_bins += (num_int_pos-1)
267 out2.write("Total\t"+str(tot_markers)+"\t"+str(tot_cM)+"\t"+str(tot_bp)+"\t"+str(tot_int_pos)+"\t"+str(tot_bins)+"\n")
268
269 temp_infile.close()
270 in3.close()
271 out1.close()
272 out2.close()
273
274 ##############################################
275
276 if __name__=="__main__":
277 __main__()
278
279
280