Mercurial > repos > yating-l > jbrowse_hub
comparison blastxmlToGff3.py @ 0:e4f3f2ed4fa5 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f18ea51d27ec7addfa6413716391cfefebc8acbc-dirty
author | yating-l |
---|---|
date | Fri, 10 Mar 2017 13:48:19 -0500 |
parents | |
children | d8049deb0c97 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e4f3f2ed4fa5 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 from Bio.Blast import NCBIXML | |
5 from collections import OrderedDict | |
6 import utils | |
7 | |
8 | |
9 def align2cigar(hsp_query, hsp_reference): | |
10 ''' | |
11 Build CIGAR representation from an hsp_query | |
12 input: | |
13 hsp_query | |
14 hsp_sbjct | |
15 output: | |
16 CIGAR string | |
17 ''' | |
18 query = hsp_query | |
19 ref = hsp_reference | |
20 # preType, curType: | |
21 # 'M' represents match, | |
22 # 'I' represents insert a gap into the reference sequence, | |
23 # 'D' represents insert a gap into the target (delete from reference) | |
24 # some ideas of this algin2cigar function are coming from | |
25 # https://gist.github.com/ozagordi/099bdb796507da8d9426 | |
26 prevType = 'M' | |
27 curType = 'M' | |
28 count = 0 | |
29 cigar = [] | |
30 num = len(query) | |
31 for i in range(num): | |
32 if query[i] == '-': | |
33 curType = 'D' | |
34 elif ref[i] == '-': | |
35 curType = 'I' | |
36 else: | |
37 curType = 'M' | |
38 if curType == prevType: | |
39 count += 1 | |
40 else: | |
41 cigar.append('%s%d' % (prevType, count)) | |
42 prevType = curType | |
43 count = 1 | |
44 cigar.append('%s%d' % (curType, count)) | |
45 return ' '.join(cigar) | |
46 | |
47 def gff3_writer(blast_records, gff3_file): | |
48 gff3 = open(gff3_file, 'w') | |
49 gff3.write("##gff-version 3\n") | |
50 seq_regions = dict() | |
51 for blast_record in blast_records: | |
52 query_name = blast_record.query.split(" ")[0] | |
53 source = blast_record.application | |
54 for alignment in blast_record.alignments: | |
55 title = alignment.title.split(" ") | |
56 seqid = title[len(title) - 1] | |
57 length = alignment.length | |
58 feature_type = 'match' | |
59 for hsp in alignment.hsps: | |
60 field = OrderedDict() | |
61 attribute = OrderedDict() | |
62 ref = hsp.sbjct | |
63 query = hsp.query | |
64 field['seqid'] = seqid | |
65 field['source'] = source | |
66 field['type'] = feature_type | |
67 if seqid not in seq_regions: | |
68 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(length) + '\n') | |
69 seq_regions[seqid] = length | |
70 field['start'] = hsp.sbjct_start | |
71 ref_length = len(ref.replace('-', '')) | |
72 # if run tblastn, the actual length of reference should be multiplied by 3 | |
73 if source.lower() == "tblastn": | |
74 ref_length *= 3 | |
75 field['end'] = field['start'] + ref_length - 1 | |
76 field['score'] = hsp.expect | |
77 #decide if the alignment in the same strand or reverse strand | |
78 #reading frame | |
79 # (+, +), (0, 0), (-, -) => + | |
80 # (+, -), (-, +) => - | |
81 if hsp.frame[1] * hsp.frame[0] > 0: | |
82 field['strand'] = '+' | |
83 elif hsp.frame[1] * hsp.frame[0] < 0: | |
84 field['strand'] = '-' | |
85 else: | |
86 if hsp.frame[0] + hsp.frame[1] >= 0: | |
87 field['strand'] = '+' | |
88 else: | |
89 field['strand'] = '-' | |
90 field['phase'] = '.' | |
91 | |
92 target_start = hsp.query_start | |
93 target_len = len(query.replace('-', '')) | |
94 # if run blastx, the actual length of query should be multiplied by 3 | |
95 if source.lower() == "blastx": | |
96 target_len *= 3 | |
97 target_end = target_start + target_len -1 | |
98 attribute['ID'] = field['seqid'] + '_' + str(field['start']) + '_' + str(field['end']) + '_' + query_name + '_' + str(target_start) + '_' + str(target_end) | |
99 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) | |
100 attribute['Gap'] = align2cigar(query, ref) | |
101 # show reading frame attribute only if the frame is not (0, 0) | |
102 if hsp.frame[0] != 0 or hsp.frame[1] != 0: | |
103 attribute['reading_frame'] = str(hsp.frame[0]) + ", " + str(hsp.frame[1]) | |
104 utils.write_features(field, attribute, gff3) | |
105 | |
106 def blastxml2gff3(xml_file, gff3_file): | |
107 result_handle = open(xml_file) | |
108 blast_records = NCBIXML.parse(result_handle) | |
109 gff3_writer(blast_records, gff3_file) | |
110 | |
111 if __name__ == "__main__": | |
112 blastxml2gff3("../tblastn_dmel.blastxml", "gff3.txt") | |
113 |