comparison trams.py @ 3:1f00946b18c2 draft default tip

Uploaded
author rijst
date Wed, 12 Dec 2012 09:09:45 -0500
parents cc961e057668
children
comparison
equal deleted inserted replaced
2:7e46920d9664 3:1f00946b18c2
1 #################
2 transl_table = 11
3 intro_message = ''' +------------------------------------------------------------------+
4 | Tool for Rapid Annotation of Microbial SNPs (TRAMS): a simple |
5 | program for rapid annotation of genomic variation in prokaryotes |
6 | |
7 | Developed by: Richard A. Reumerman, Paul R. Herron, |
8 | Paul A. Hoskisson and Vartul Sangal |
9 +------------------------------------------------------------------+\n'''
10 #################
11
12 import sys
13 import time
14 start = time.clock()
15
16 # Command line files: SNP REF REF-TYPE ANNOT OVERL SUM;
17 if len(sys.argv) < 7:
18 exit("\nNot enough arguments given.\nUsage: TRAMS_Galaxy.py [SNP.] [REF.] [ANNOT.] [OVERL.] [SUM.]")
19 try:
20 file_snps = open(sys.argv[1], "rU")
21 except IOError as e:
22 exit("Error trying to open '"+sys.argv[1]+"': {1}".format(e.errno, e.strerror))
23 try:
24 file_ref = open(sys.argv[2], "rU")
25 except IOError as e:
26 exit("Error trying to open '"+sys.argv[2]+"': {1}".format(e.errno, e.strerror))
27
28 filetype_reference = sys.argv[3]
29
30 try:
31 file_out = open(sys.argv[4], "w")
32 except IOError as e:
33 exit("Error trying to open '"+sys.argv[4]+"': {1}".format(e.errno, e.strerror))
34 try:
35 file_overlap = open(sys.argv[5], "w")
36 except IOError as e:
37 exit("Error trying to open '"+sys.argv[5]+"': {1}".format(e.errno, e.strerror))
38 try:
39 file_summary = open(sys.argv[6], "w")
40 except IOError as e:
41 exit("Error trying to open '"+sys.argv[6]+"': {1}".format(e.errno, e.strerror))
42
43 import Bio
44 from Bio import SeqIO, SeqFeature
45 from Bio.SeqRecord import SeqRecord
46 from Bio.Seq import Seq
47 from Bio.Alphabet import generic_dna, IUPAC
48 from Bio.Data import CodonTable
49
50 modules_loaded = time.clock()
51
52 def non_coding_calc(gene, pos = 0):
53 '''This function takes a pseudogene and returns the number of bases
54 located in between the sub-features before 'pos'. Returns 0 if 'pseudo' = False.
55 Input: {start, subfeats, pseudo}, pos (default = 0)'''
56 if not gene['pseudo']: return 0
57
58 non_coding_bases = 0
59 prev_subfeat_end = gene['start']
60 if gene['strand'] == -1:
61 for subfeature in gene['subfeats']:
62 if subfeature.location._start.position < pos:
63 prev_subfeat_end = subfeature.location._end.position
64 continue
65 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end)
66 prev_subfeat_end = subfeature.location._end.position
67 else:
68 for subfeature in gene['subfeats']:
69 non_coding_bases += (subfeature.location._start.position - prev_subfeat_end)
70 prev_subfeat_end = subfeature.location._end.position
71 if prev_subfeat_end >= pos and pos != 0: break
72
73 return non_coding_bases
74
75
76 def region_calc(bounds,length):
77 regions = []
78 lastend=i=0
79 while i < len(bounds):
80 if bounds[i]['start'] > lastend:# Intergenic region present;
81 regions.append([lastend,bounds[i]['start'],-1])
82 lastend = bounds[i]['start']
83 else:
84 regions.append([bounds[i]['start'],bounds[i]['end'],i])
85 if bounds[i]['end'] > lastend:
86 lastend = bounds[i]['end']
87 i += 1
88
89 if regions[-1][1] < length:# Final tail of genome;
90 regions.append([lastend,length,-1])
91
92 return regions
93
94
95 def overlap_calc(bounds):
96 '''This function takes an array of feature starts and ends and
97 returns an array of starts and ends of all overlapping regions.
98 Input: [{start,end}]'''
99 i = 0
100 overlaps = []
101 while i < len(bounds) - 1:
102 for downstr in bounds[i+1:]:
103 if downstr[0] < bounds[i][1]:# Features overlap;
104 if downstr[1] < bounds[i][1]:# Complete overlap;
105 overlaps.append([downstr[0],downstr[1],bounds[i][2],downstr[2],[0,0]])
106 else:# Partial overlap;
107 overlaps.append([downstr[0],bounds[i][1],bounds[i][2],downstr[2],[0,0]])
108 else:# No use looking further;
109 break
110
111 i += 1
112
113 return overlaps
114
115
116 def match_feature(bounds,pos,prev=0):
117 '''This function checks if a position is located inside a feature and
118 returns the feature's number if found or -1 if none is found.
119 Input: {start,end},pos,prev_feat (default = 0)'''
120 for i in range(prev, len(bounds)):
121 if (pos >= bounds[i]['start']) and (pos < bounds[i]['end']):
122 return i
123 elif pos < bounds[i]['start']:# No use looking further
124 return -1
125
126 return -1
127
128
129 def write_output(line,target=file_out):
130 '''This function takes the 2 dimensional array containing all the SNP
131 data. It contains an array of information on the feature and an array
132 for each strain for which SNPs are given.
133 Input: [[pos],[ref],[cells],[cells],etc]'''
134 target.write('\n'+str(line[0][0]))
135 for cell in line[1]:
136 target.write('\t'+str(cell))
137 for strain in line[2:]:
138 target.write('\t')
139 for cell in strain:
140 target.write('\t'+str(cell))
141
142 target.flush()
143
144
145 def new_codon_calc(ref_codon, new_base, pos_in_cod):
146 return str(ref_codon[0:pos_in_cod-1]+new_base+ref_codon[pos_in_cod:len(ref_codon)])
147
148
149 def mut_type_check(ref_res, ref_codon, pos_in_gene, new_base, new_codon):
150 if str(new_codon).lower() == str(ref_codon).lower():
151 return ['','','','']
152 new_residue = Seq(new_codon).translate(table=transl_table)
153 if str(new_residue) == str(ref_res):
154 mut_type = 'synonymous'
155 elif (pos_in_gene / 3) < 1 and str(ref_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons:# position 0,1 or 2 and SNP is in start codon;
156 if str(new_codon).upper() in CodonTable.unambiguous_dna_by_id[transl_table].start_codons: mut_type = 'nonsynonymous'
157 else: mut_type = 'nonstart'
158 elif str(new_residue) == '*': mut_type = 'nonsense'
159 elif str(ref_res) == '*': mut_type = 'nonstop'
160 else: mut_type = 'nonsynonymous'
161
162 return [mut_type,new_base,new_codon,new_residue]
163
164
165 def codon_process(codon):
166 '''This function processes a codon. It loops through it 3 times,
167 once to determine which is the highest position mutated, once to
168 fill in the cells for the output file and once to output all lines.
169 Input: [empty,start_pos,[line1],[line2],[line3],strand]
170 It also uses global variable strain_nr'''
171 lastposition = [-1]*(strain_nr-1)
172 new_codons = ['']*(strain_nr-1)
173 if codon[-1] == -1:# Change codon position order for -1 features;
174 temp = codon [1:-1]
175 temp.reverse()
176 codon[1:-1] = temp
177 for i,line in enumerate(codon[1:-1],1):
178 if line == '': continue
179 for j,strain in enumerate(line[2:]):
180 if strain[1] in ['a','g','c','t']:
181 lastposition[j] = i
182 new_codons[j] = codon[i][1][8]
183
184 for i,line in enumerate(codon[1:-1],1):
185 if codon[-1] == -1: pos_in_cod = 4-i
186 else: pos_in_cod = i
187
188 if line == '': continue
189 for j,strain in enumerate(line[2:]):
190 if i == lastposition[j]: # Check for synonymous etc.;
191 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod)
192 codon[i][j+2] = mut_type_check(line[1][9],line[1][8],codon[0],strain[1],new_codons[j])
193 straininfo[j][codon[i][j+2][0]] += 1# Counting;
194 elif strain[1] in ['a','g','c','t']:
195 codon[i][j+2] = ['MNP',strain[1],'','']
196 straininfo[j]['mnps'] += 1
197 new_codons[j] = new_codon_calc(new_codons[j],strain[1],pos_in_cod)
198 elif strain[0] == 'Allele missing': codon[i][j+2] = strain
199 else: codon[i][j+2] = ['']*4
200
201 for line in codon[1:-1]:
202 if line != '': write_output(line)
203
204 def feature_props(feature):
205 properties = {'type':feature.type,'strand':feature.location._strand,
206 'sequence':feature.extract(seq_record.seq),'pseudo': False,
207 'locus_tag':'','gene_name':'','product':'',
208 'start':int(feature.location._start.position),
209 'end':int(feature.location._end.position)}
210 if 'pseudo' in feature.qualifiers:
211 properties['pseudo'] = True
212 properties['type'] = 'pseudogene'
213 properties['pure_seq'] = properties['sequence']
214 if properties['strand'] == -1:
215 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position].reverse_complement()
216 else:
217 properties['sequence'] = seq_record.seq[feature.location._start.position:feature.location._end.position]
218 if feature.sub_features: properties['subfeats'] = feature.sub_features
219 if 'locus_tag' in feature.qualifiers: properties['locus_tag'] = feature.qualifiers['locus_tag'][0]
220 if 'gene' in feature.qualifiers: properties['gene_name']= feature.qualifiers['gene'][0]
221 if feature.type in ['tRNA','rRNA','CDS']: properties['product'] = feature.qualifiers['product'][0]
222
223 return properties
224
225 # Read embl/genbank file for information on sequence features;
226 try:
227 seq_record = SeqIO.parse(file_ref, filetype_reference).next()
228 except:
229 file_ref.close()
230 quit("Error reading "+sys.argv[2]+", please check file for errors.")
231 file_ref.close()
232
233 # Loop through genome features and save relevant properties;
234 feats = []# Dictionary of properties;
235
236 feature_types = {'intergenic':0,'gene':0,'pseudogene':0}
237 feat_temp_store = ''
238 for feature in seq_record.features:
239 # Check if gene is defined as other feature (e.g. CDS). Else, save info from 'gene';
240 if feat_temp_store != '':
241 if (feature.location._start.position == feat_temp_store.location._start.position and
242 feature.location._end.position == feat_temp_store.location._end.position):# Gene also defined as other feature;
243 feat_temp_store = ''
244 else:# Gene not also defined as CDS;
245 feats.append(feature_props(feat_temp_store))
246 feat_temp_store = ''
247 elif feature.type == 'gene':
248 feat_temp_store = feature
249
250 if not feature.type in ['source','gene','misc_feature']:
251 if not feature.type in feature_types and feature.type != 'CDS': feature_types[feature.type] = 0
252 feats.append(feature_props(feature))
253
254
255 feat_props = sorted(feats, key=lambda cells:int(cells['start']))
256 feat_boundaries = [{'start':item['start'],'end':item['end']} for item in feat_props]
257 regions = region_calc(feat_boundaries,len(seq_record.seq))
258 feat_overlap = overlap_calc(regions)
259
260 reference_loaded = time.clock()
261
262 # Create array of SNPs from input file for processing;
263 lines = [line.split('\t') for line in file_snps if line.strip()]
264 file_snps.close()
265 # First line contains headers, extract number of strains etc;
266 headers = lines[0]
267 snp_table = sorted(lines[1:], key=lambda cells:int(cells[0]))
268
269 snps_loaded = time.clock()
270
271 # Print output file headers;
272 headers[-1] = headers[-1].rstrip()# Remove newline character;
273 strain_nr = len(headers)-1
274 strains_found = 'Found '+str(strain_nr)+' strains: '+headers[1]+' (reference)'
275 first_line = '\t'+headers[1]+'\t'*9
276 second_line = 'Position\tFeature\tLocus tag\tGene\tProduct\tStart\tEnd\tStrand\tRef. base\tRef. codon\tRef. res.'
277 straininfo = [0]*(len(headers[2:]))
278 for i,strain in enumerate(headers[2:]):
279 straininfo[i] = {'snps':0,'mnps':0,'synonymous':0,'nonsynonymous':0,'nonstart':0,'nonstop':0,'nonsense':0}
280 straininfo[i].update(feature_types)
281 strains_found += ', '+strain
282 first_line += '\t\t'+strain+'\t'*3
283 second_line += '\t\tSNP type\tNew base\tNew codon\tNew res.'
284
285 file_out.write(first_line+'\n'+second_line)
286 file_out.flush()
287
288 # Loop through SNPs from array and process them;
289 props = {}# Properties of a feature;
290 prev_snp = ''# Position of previous SNP;
291 to_write = []# Information of current SNP;
292 compl_bases = {'a':'t','t':'a','g':'c','c':'g'}
293 firstsnp = True# First snp of region, or of codon in cases of 3 positions in codon mutated;
294 prev_start=j=k=0
295 overlap_snps = []
296 codon = ['']*5# Array of codon positions. First item is position of first base of codon in the gene;
297
298 for region in regions:
299 firstsnp = True
300 i = prev_start
301 while i < len(snp_table):# Loop through SNPs
302 snp_entry = snp_table[i]
303 if not str(snp_entry[0]).isdigit():# Not a valid line, skip;
304 i += 1
305 continue
306
307 pos = int(snp_entry[0])-1
308 if pos < region[0]:# Not inside region yet;
309 i += 1
310 continue
311 elif firstsnp and pos < region[1]:
312 prev_start = i
313 elif pos >= region[1]:# End of region, process and next;
314 if not firstsnp and codon != ['','','','','']:
315 codon_process(codon)
316 break
317
318 # Documentation of SNPs in feature overlaps;
319 while j < len(feat_overlap)-1 and pos > feat_overlap[j][1]: j += 1
320 k = j
321 while k < len(feat_overlap) and pos >= feat_overlap[k][0]:
322 if pos < feat_overlap[k][1]:
323 if feat_overlap[k][4][0] == 0:
324 feat_overlap[k][4][0] = pos
325 feat_overlap[k][4][1] = pos
326 k += 1
327
328
329 snp_entry[-1] = snp_entry[-1].rstrip()# Remove newline character at end of line;
330 mnp=in_feat=False
331 snp_feat = region[2]
332 ref_base = snp_entry[1]
333
334 to_write = [[pos+1]]
335
336 # Output feature properties and reference situation;
337 if snp_feat == -1:
338 codon = ['']*5
339 to_write.append(['intergenic','','','','','','',ref_base.upper(),'',''])
340 elif feat_props[snp_feat]['type'] not in ['CDS','gene','pseudogene']:# In feature, but non-coding;
341 codon = ['']*5
342 props = feat_props[snp_feat]
343 if props['strand'] == -1: ref_base = (compl_bases[snp_entry[1].lower()])
344 else: ref_base = snp_entry[1]
345 to_write.append([props['type'],props['locus_tag'],props['gene_name'],
346 props['product'],props['start']+1,props['end'],
347 '',ref_base.upper(),'',''])
348 else:# in CDS/gene feature, check codon etc;
349 props = feat_props[snp_feat]
350 sequence = props['sequence']
351 if props['strand'] == -1:
352 pos_in_gene = props['end'] - pos - 1# Python counting
353 ref_base = (compl_bases[snp_entry[1].lower()])
354 else:
355 pos_in_gene = pos - props['start']# Python counting
356 ref_base = snp_entry[1]
357
358 in_feat = True
359 if props['pseudo'] and 'subfeats' in props:# Pseudogene that needs special attention;
360 in_feat = False
361 subfeat_boundaries = [{'start':item.location._start.position,'end':item.location._end.position}
362 for item in props['subfeats']]
363 snp_subfeat = match_feature(subfeat_boundaries,pos)
364 if snp_subfeat != -1:
365 in_feat = True
366 pos_in_gene -= non_coding_calc({'start':props['start'],'subfeats':props['subfeats'],
367 'pseudo':True,'strand':props['strand']},pos)
368 sequence = props['pure_seq']
369
370 if not in_feat:# In pseudogene non-coding region;
371 codon = ['']*5
372 to_write.append(['non coding',props['locus_tag'],props['gene_name'],props['product'],
373 props['start']+1,props['end'],props['strand'],ref_base.upper(),
374 '',''])
375 else:# In coding region;
376 pos_in_cod = (pos_in_gene+1)%3
377 if pos_in_cod == 0: pos_in_cod = 3# Remainder of division 0 means 3rd place in codon;
378
379 old_codon = sequence[pos_in_gene-pos_in_cod+1:pos_in_gene-pos_in_cod+4].upper()
380 old_residue = old_codon.translate(table=transl_table)
381 to_write.append([props['type'],props['locus_tag'],props['gene_name'],props['product'],
382 props['start']+1,props['end'],props['strand'],ref_base.upper(),
383 old_codon,old_residue])
384
385 if in_feat and not firstsnp and (pos >= prev_snp):# Check if snp is in same codon as previous snp. Position check for overlapping features;
386 if props['strand'] == 1 and (pos - prev_snp + 1) < pos_in_cod:# Same codon (Positive strand);
387 mnp = True
388 elif props['strand'] == -1 and (pos - prev_snp + 1) <= (3 - pos_in_cod):# Same codon (negative strand);
389 mnp = True
390
391 # Process previous codon if not MNP;
392 if in_feat and not mnp:
393 if not firstsnp:
394 codon_process(codon)
395 codon = [pos_in_gene-pos_in_cod+1,'','','',props['strand']]
396
397
398 for l, snp in enumerate(snp_entry[2:]):# Loop through SNPs/strains;
399
400 snp = snp.lower()
401 if snp == '':# Empty cell;
402 to_write.append(['','','',''])
403 continue
404
405 if snp == '-': # Feature not present in this strain;
406 to_write.append(['Allele missing','','',''])
407 continue
408
409 if snp_feat == -1:# Intergenic;
410 if snp == ref_base.lower():
411 to_write.append(['']*4)
412 else:
413 to_write.append(['',snp,'',''])
414 straininfo[l]['intergenic'] += 1
415 straininfo[l]['snps'] += 1
416 continue
417
418 if props['strand'] == -1:
419 snp = compl_bases[snp]
420
421 if snp == ref_base.lower():
422 to_write.append(['']*4)
423 else:
424 to_write.append(['',snp,'',''])
425 straininfo[l]['snps'] += 1
426 if props['type'] != 'CDS':
427 straininfo[l][props['type']] += 1
428
429
430
431 if props['type'] in ['CDS','gene','pseudogene'] and in_feat:
432 codon[pos_in_cod] = to_write
433 else:
434 write_output(to_write)
435
436 if firstsnp: firstsnp = False
437 prev_snp = pos+1
438 i += 1
439
440
441 if codon != ['','','','','']: codon_process(codon)
442
443 file_out.close()
444
445 end = time.clock()
446
447 file_summary.write("\n")
448 file_summary.write(intro_message)
449 file_summary.write('\n'+strains_found+'.\n')
450
451 file_summary.write("\nFinished annotation. Total time: %s s\n\n" % round(end-start,1))
452
453
454 file_overlap.write('SNP start\tSNP end\tFeature 1\tLocus tag\tProduct\t\tFeature 2\tLocus tag\tProduct')
455 for overlap in feat_overlap:
456 if overlap[4] != [0,0]:
457 overlap[4][0]+=1
458 overlap[4][1]+=1
459 if overlap[4][0] == overlap[4][1]: overlap[4][1] = ''
460 write_output([[str(overlap[4][0])],[str(overlap[4][1]),feat_props[overlap[2]]['type'],feat_props[overlap[2]]['locus_tag'],feat_props[overlap[2]]['product']],
461 [feat_props[overlap[3]]['type'],feat_props[overlap[3]]['locus_tag'],feat_props[overlap[3]]['product']]],
462 file_overlap)
463
464
465 for i,strain in enumerate(headers[2:]):
466 file_summary.write("\n")
467 info = straininfo[i]
468 file_summary.write("+ Strain %s:\n" % strain)
469 file_summary.write(" %s SNPs found\n" % info['snps'])
470 file_summary.write(" Number of SNPs found CDS features: %s\n" % (info['mnps']+info['nonstart']+info['nonstop']+info['nonsense']+
471 info['synonymous']+info['nonsynonymous']))
472 file_summary.write(" (of which in pseudogenes: %s)\n" % info['pseudogene'])
473 file_summary.write(" - MNPs: %s\n" % info['mnps'])
474 file_summary.write(" - Synonymous: %s\n" % info['synonymous'])
475 file_summary.write(" - Nonsynonymous: %s\n" % info['nonsynonymous'])
476 file_summary.write(" - Nonsense: %s\n" % info['nonsense'])
477 file_summary.write(" - Nonstart: %s\n" % info['nonstart'])
478 file_summary.write(" - Nonstop: %s\n" % info['nonstop'])
479 file_summary.write(" Intergenic: %s\n" % info['intergenic'])
480
481 for typ in feature_types:
482 if typ not in ['intergenic','pseudogene'] and info[typ] != 0:
483 file_summary.write(" %s: %s\n" % (typ,info[typ]))
484 file_summary.flush()
485
486 file_overlap.close()
487 file_summary.close()