Mercurial > repos > devteam > microsats_alignment_level
comparison microsats_alignment_level.py @ 0:68d598b4433e
Imported from capsule None
author | devteam |
---|---|
date | Tue, 01 Apr 2014 09:12:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:68d598b4433e |
---|---|
1 #!/usr/bin/env python | |
2 #Guruprasad Ananda | |
3 """ | |
4 Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output. | |
5 """ | |
6 import os | |
7 import re | |
8 import string | |
9 import sys | |
10 import tempfile | |
11 | |
12 def reverse_complement(text): | |
13 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" ) | |
14 comp = [ch for ch in text.translate(DNA_COMP)] | |
15 comp.reverse() | |
16 return "".join(comp) | |
17 | |
18 | |
19 def main(): | |
20 if len(sys.argv) != 8: | |
21 print >> sys.stderr, "Insufficient number of arguments." | |
22 sys.exit() | |
23 | |
24 infile = open(sys.argv[1],'r') | |
25 separation = int(sys.argv[2]) | |
26 outfile = sys.argv[3] | |
27 mono_threshold = int(sys.argv[5]) | |
28 non_mono_threshold = int(sys.argv[6]) | |
29 allow_different_units = int(sys.argv[7]) | |
30 | |
31 print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" % ( separation, mono_threshold, non_mono_threshold, allow_different_units==1 ) | |
32 try: | |
33 fout = open(outfile, "w") | |
34 print >> fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit" | |
35 #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik") | |
36 sputnik_cmd = "sputnik" | |
37 input = infile.read() | |
38 block_num = 0 | |
39 input = input.replace('\r','\n') | |
40 for block in input.split('\n\n'): | |
41 block_num += 1 | |
42 tmpin = tempfile.NamedTemporaryFile() | |
43 tmpout = tempfile.NamedTemporaryFile() | |
44 tmpin.write(block.strip()) | |
45 cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name | |
46 try: | |
47 os.system(cmdline) | |
48 except Exception: | |
49 continue | |
50 sputnik_out = tmpout.read() | |
51 tmpin.close() | |
52 tmpout.close() | |
53 if sputnik_out != "": | |
54 if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')): | |
55 continue | |
56 align_block = block.strip().split('>') | |
57 | |
58 lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6} | |
59 blockdict = {} | |
60 r = 0 | |
61 namelist = [] | |
62 for k, sput_block in enumerate(sputnik_out.split('>')[1:]): | |
63 whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip() | |
64 p = re.compile('\n(\S*nucleotide)') | |
65 repeats = p.split(sput_block.strip()) | |
66 repeats_count = len(repeats) | |
67 j = 1 | |
68 name = repeats[0].strip() | |
69 try: | |
70 coords = re.search('\d+[-_:]\d+', name).group() | |
71 coords = coords.replace('_', '-').replace(':', '-') | |
72 except Exception: | |
73 coords = '0-0' | |
74 r += 1 | |
75 blockdict[r] = {} | |
76 try: | |
77 sp_name = name[:name.index('.')] | |
78 chr_name = name[name.index('.'):name.index('(')] | |
79 namelist.append(sp_name + chr_name) | |
80 except: | |
81 namelist.append(name[:20]) | |
82 while j < repeats_count: | |
83 try: | |
84 if repeats[j].strip() not in lendict: | |
85 j += 2 | |
86 continue | |
87 | |
88 if blockdict[r].has_key('types'): | |
89 blockdict[r]['types'].append(repeats[j].strip()) #type of microsat | |
90 else: | |
91 blockdict[r]['types'] = [repeats[j].strip()] #type of microsat | |
92 | |
93 start = int(repeats[j+1].split('--')[0].split(':')[0].strip()) | |
94 #check to see if there are gaps before the start of the repeat, and change the start accordingly | |
95 sgaps = 0 | |
96 ch_pos = start - 1 | |
97 while ch_pos >= 0: | |
98 if whole_seq[ch_pos] == '-': | |
99 sgaps += 1 | |
100 else: | |
101 break #break at the 1st non-gap character | |
102 ch_pos -= 1 | |
103 if blockdict[r].has_key('starts'): | |
104 blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS | |
105 else: | |
106 blockdict[r]['starts'] = [start+sgaps] | |
107 | |
108 end = int(repeats[j+1].split('--')[0].split(':')[1].strip()) | |
109 #check to see if there are gaps after the end of the repeat, and change the end accordingly | |
110 egaps = 0 | |
111 for ch in whole_seq[end:]: | |
112 if ch == '-': | |
113 egaps += 1 | |
114 else: | |
115 break #break at the 1st non-gap character | |
116 if blockdict[r].has_key('ends'): | |
117 blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS | |
118 else: | |
119 blockdict[r]['ends'] = [end+egaps] | |
120 | |
121 repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence | |
122 repeat_len = repeats[j+1].split('--')[1].split()[1].strip() | |
123 gap_count = repeat_seq.count('-') | |
124 #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count | |
125 repeat_len = str(int(repeat_len) - gap_count) | |
126 | |
127 rel_start = blockdict[r]['starts'][-1] | |
128 gaps_before_start = whole_seq[:rel_start].count('-') | |
129 | |
130 if blockdict[r].has_key('gaps_before_start'): | |
131 blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths | |
132 else: | |
133 blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths | |
134 | |
135 whole_seq_start = int(coords.split('-')[0]) | |
136 if blockdict[r].has_key('whole_seq_start'): | |
137 blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths | |
138 else: | |
139 blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths | |
140 | |
141 if blockdict[r].has_key('lengths'): | |
142 blockdict[r]['lengths'].append(repeat_len) #lengths | |
143 else: | |
144 blockdict[r]['lengths'] = [repeat_len] #lengths | |
145 | |
146 if blockdict[r].has_key('counts'): | |
147 blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit | |
148 else: | |
149 blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit | |
150 | |
151 if blockdict[r].has_key('units'): | |
152 blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit | |
153 else: | |
154 blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit | |
155 | |
156 except Exception: | |
157 pass | |
158 j += 2 | |
159 #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'. | |
160 delete_index_list = [] | |
161 for ind, item in enumerate(blockdict[r]['ends']): | |
162 try: | |
163 if blockdict[r]['starts'][ind+1]-item < separation: | |
164 if ind not in delete_index_list: | |
165 delete_index_list.append(ind) | |
166 if ind+1 not in delete_index_list: | |
167 delete_index_list.append(ind+1) | |
168 except Exception: | |
169 pass | |
170 for index in delete_index_list: #mark them for deletion | |
171 try: | |
172 blockdict[r]['starts'][index] = 'marked' | |
173 blockdict[r]['ends'][index] = 'marked' | |
174 blockdict[r]['types'][index] = 'marked' | |
175 blockdict[r]['gaps_before_start'][index] = 'marked' | |
176 blockdict[r]['whole_seq_start'][index] = 'marked' | |
177 blockdict[r]['lengths'][index] = 'marked' | |
178 blockdict[r]['counts'][index] = 'marked' | |
179 blockdict[r]['units'][index] = 'marked' | |
180 except Exception: | |
181 pass | |
182 #remove 'marked' elements from all the lists | |
183 """ | |
184 for key in blockdict[r].keys(): | |
185 for elem in blockdict[r][key]: | |
186 if elem == 'marked': | |
187 blockdict[r][key].remove(elem) | |
188 """ | |
189 #print blockdict | |
190 | |
191 #make sure that the blockdict has keys for both the species | |
192 if (1 not in blockdict) or (2 not in blockdict): | |
193 continue | |
194 | |
195 visited_2 = [0 for x in range(len(blockdict[2]['starts']))] | |
196 for ind1, coord_s1 in enumerate(blockdict[1]['starts']): | |
197 if coord_s1 == 'marked': | |
198 continue | |
199 coord_e1 = blockdict[1]['ends'][ind1] | |
200 out = [] | |
201 for ind2, coord_s2 in enumerate(blockdict[2]['starts']): | |
202 if coord_s2 == 'marked': | |
203 visited_2[ind2] = 1 | |
204 continue | |
205 coord_e2 = blockdict[2]['ends'][ind2] | |
206 #skip if the 2 repeats are not of the same type or don't have the same repeating unit. | |
207 if allow_different_units == 0: | |
208 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): | |
209 continue | |
210 else: | |
211 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2): | |
212 continue | |
213 #print >> sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2) | |
214 #skip if the repeat number thresholds are not met | |
215 if blockdict[1]['types'][ind1] == 'mononucleotide': | |
216 if (int(blockdict[1]['counts'][ind1]) < mono_threshold): | |
217 continue | |
218 else: | |
219 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): | |
220 continue | |
221 | |
222 if blockdict[2]['types'][ind2] == 'mononucleotide': | |
223 if (int(blockdict[2]['counts'][ind2]) < mono_threshold): | |
224 continue | |
225 else: | |
226 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): | |
227 continue | |
228 #print "s1,e1=%s,%s; s2,e2=%s,%s" % ( coord_s1, coord_e1, coord_s2, coord_e2 ) | |
229 if (coord_s1 in range(coord_s2, coord_e2)) or (coord_e1 in range(coord_s2, coord_e2)): | |
230 out.append(str(block_num)) | |
231 out.append(namelist[0]) | |
232 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] | |
233 rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) | |
234 out.append(str(rel_start)) | |
235 out.append(str(rel_end)) | |
236 out.append(blockdict[1]['types'][ind1]) | |
237 out.append(blockdict[1]['lengths'][ind1]) | |
238 out.append(blockdict[1]['counts'][ind1]) | |
239 out.append(blockdict[1]['units'][ind1]) | |
240 out.append(namelist[1]) | |
241 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] | |
242 rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) | |
243 out.append(str(rel_start)) | |
244 out.append(str(rel_end)) | |
245 out.append(blockdict[2]['types'][ind2]) | |
246 out.append(blockdict[2]['lengths'][ind2]) | |
247 out.append(blockdict[2]['counts'][ind2]) | |
248 out.append(blockdict[2]['units'][ind2]) | |
249 print >> fout, '\t'.join(out) | |
250 visited_2[ind2] = 1 | |
251 out = [] | |
252 | |
253 if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet. | |
254 for ind2, coord_s2 in enumerate(blockdict[2]['starts']): | |
255 if coord_s2 == 'marked': | |
256 continue | |
257 if visited_2[ind] != 0: | |
258 continue | |
259 coord_e2 = blockdict[2]['ends'][ind2] | |
260 out = [] | |
261 for ind1, coord_s1 in enumerate(blockdict[1]['starts']): | |
262 if coord_s1 == 'marked': | |
263 continue | |
264 coord_e1 = blockdict[1]['ends'][ind1] | |
265 #skip if the 2 repeats are not of the same type or don't have the same repeating unit. | |
266 if allow_different_units == 0: | |
267 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]): | |
268 continue | |
269 else: | |
270 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2: | |
271 continue | |
272 #skip if the repeat number thresholds are not met | |
273 if blockdict[1]['types'][ind1] == 'mononucleotide': | |
274 if (int(blockdict[1]['counts'][ind1]) < mono_threshold): | |
275 continue | |
276 else: | |
277 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold): | |
278 continue | |
279 | |
280 if blockdict[2]['types'][ind2] == 'mononucleotide': | |
281 if (int(blockdict[2]['counts'][ind2]) < mono_threshold): | |
282 continue | |
283 else: | |
284 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold): | |
285 continue | |
286 | |
287 if (coord_s2 in range(coord_s1, coord_e1)) or (coord_e2 in range(coord_s1, coord_e1)): | |
288 out.append(str(block_num)) | |
289 out.append(namelist[0]) | |
290 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1] | |
291 rel_end = rel_start + int(blockdict[1]['lengths'][ind1]) | |
292 out.append(str(rel_start)) | |
293 out.append(str(rel_end)) | |
294 out.append(blockdict[1]['types'][ind1]) | |
295 out.append(blockdict[1]['lengths'][ind1]) | |
296 out.append(blockdict[1]['counts'][ind1]) | |
297 out.append(blockdict[1]['units'][ind1]) | |
298 out.append(namelist[1]) | |
299 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2] | |
300 rel_end = rel_start + int(blockdict[2]['lengths'][ind2]) | |
301 out.append(str(rel_start)) | |
302 out.append(str(rel_end)) | |
303 out.append(blockdict[2]['types'][ind2]) | |
304 out.append(blockdict[2]['lengths'][ind2]) | |
305 out.append(blockdict[2]['counts'][ind2]) | |
306 out.append(blockdict[2]['units'][ind2]) | |
307 print >> fout, '\t'.join(out) | |
308 visited_2[ind2] = 1 | |
309 out = [] | |
310 | |
311 #print >> fout, blockdict | |
312 except Exception, exc: | |
313 print >> sys.stderr, "type(exc),args,exc: %s, %s, %s" % ( type(exc), exc.args, exc ) | |
314 | |
315 | |
316 if __name__ == "__main__": | |
317 main() |