Mercurial > repos > yating-l > jbrowse_hub
comparison blastxmlToGff3.py @ 45:15b4a74722d1 draft
planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit 73ede51a133dc8914812195a1b4cd512bfb8f23d
author | yating-l |
---|---|
date | Wed, 22 Mar 2017 12:07:32 -0400 |
parents | d8049deb0c97 |
children | 6803152ea92a |
comparison
equal
deleted
inserted
replaced
44:37422f705e9b | 45:15b4a74722d1 |
---|---|
43 count = 1 | 43 count = 1 |
44 cigar.append('%s%d' % (curType, count)) | 44 cigar.append('%s%d' % (curType, count)) |
45 return ' '.join(cigar) | 45 return ' '.join(cigar) |
46 | 46 |
47 def gff3_writer(blast_records, gff3_file): | 47 def gff3_writer(blast_records, gff3_file): |
48 gff3 = open(gff3_file, 'w') | 48 gff3 = open(gff3_file, 'a') |
49 gff3.write("##gff-version 3\n") | 49 gff3.write("##gff-version 3\n") |
50 seq_regions = dict() | 50 seq_regions = dict() |
51 for blast_record in blast_records: | 51 for blast_record in blast_records: |
52 query_name = blast_record.query.split(" ")[0] | 52 query_name = blast_record.query.split(" ")[0] |
53 source = blast_record.application | 53 source = blast_record.application |
54 method = blast_record.matrix | |
54 for alignment in blast_record.alignments: | 55 for alignment in blast_record.alignments: |
56 group = { | |
57 "parent_field" : OrderedDict(), | |
58 "parent_attribute" : OrderedDict(), | |
59 "alignments" : [] | |
60 } | |
55 title = alignment.title.split(" ") | 61 title = alignment.title.split(" ") |
56 seqid = title[len(title) - 1] | 62 contig_name = title[len(title) - 1] |
57 length = alignment.length | 63 length = alignment.length |
58 feature_type = 'match' | 64 group['parent_field']['seqid'] = contig_name |
65 group['parent_field']['source'] = source | |
66 group['parent_field']['type'] = 'match' | |
67 group['parent_attribute']['ID'] = contig_name + '_' + query_name | |
68 group['parent_attribute']['method'] = method | |
69 group['parent_attribute']['length'] = length | |
70 if contig_name not in seq_regions: | |
71 gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n') | |
72 seq_regions[contig_name] = length | |
73 match_num = 0 | |
74 coords = [length, 0] | |
59 for hsp in alignment.hsps: | 75 for hsp in alignment.hsps: |
76 hsp_align = {} | |
60 field = OrderedDict() | 77 field = OrderedDict() |
61 attribute = OrderedDict() | 78 attribute = OrderedDict() |
62 ref = hsp.sbjct | 79 ref = hsp.sbjct |
63 query = hsp.query | 80 query = hsp.query |
64 field['seqid'] = seqid | 81 field['seqid'] = contig_name |
65 field['source'] = source | 82 field['source'] = source |
66 field['type'] = feature_type | 83 field['type'] = 'match_part' |
67 if seqid not in seq_regions: | 84 |
68 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(length) + '\n') | |
69 seq_regions[seqid] = length | |
70 field['start'] = hsp.sbjct_start | 85 field['start'] = hsp.sbjct_start |
86 if field['start'] < coords[0]: | |
87 coords[0] = field['start'] | |
71 ref_length = len(ref.replace('-', '')) | 88 ref_length = len(ref.replace('-', '')) |
72 # if run tblastn, the actual length of reference should be multiplied by 3 | 89 # if run tblastn, the actual length of reference should be multiplied by 3 |
73 if source.lower() == "tblastn": | 90 if source.lower() == "tblastn": |
74 ref_length *= 3 | 91 ref_length *= 3 |
75 field['end'] = field['start'] + ref_length - 1 | 92 field['end'] = field['start'] + ref_length - 1 |
76 field['score'] = hsp.expect | 93 if field['end'] > coords[1]: |
94 coords[1] = field['end'] | |
95 field['score'] = hsp.score | |
77 #decide if the alignment in the same strand or reverse strand | 96 #decide if the alignment in the same strand or reverse strand |
78 #reading frame | 97 #reading frame |
79 # (+, +), (0, 0), (-, -) => + | 98 # (+, +), (0, 0), (-, -) => + |
80 # (+, -), (-, +) => - | 99 # (+, -), (-, +) => - |
81 if hsp.frame[1] * hsp.frame[0] > 0: | 100 if hsp.frame[1] * hsp.frame[0] > 0: |
93 target_len = len(query.replace('-', '')) | 112 target_len = len(query.replace('-', '')) |
94 # if run blastx, the actual length of query should be multiplied by 3 | 113 # if run blastx, the actual length of query should be multiplied by 3 |
95 if source.lower() == "blastx": | 114 if source.lower() == "blastx": |
96 target_len *= 3 | 115 target_len *= 3 |
97 target_end = target_start + target_len -1 | 116 target_end = target_start + target_len -1 |
98 attribute['ID'] = field['seqid'] + '_' + str(field['start']) + '_' + str(field['end']) + '_' + query_name + '_' + str(target_start) + '_' + str(target_end) | 117 attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num) |
118 attribute['Parent'] = group['parent_attribute']['ID'] | |
99 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) | 119 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) |
100 attribute['Gap'] = align2cigar(query, ref) | 120 attribute['Gap'] = align2cigar(query, ref) |
101 #store the query sequence in the file in order to display alignment with BlastAlignment plugin | 121 #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin |
122 attribute['subject'] = hsp.sbjct | |
102 attribute['query'] = hsp.query | 123 attribute['query'] = hsp.query |
124 attribute['match'] = hsp.match | |
125 attribute['gaps'] = attribute['match'].count(' ') | |
126 similar = attribute['match'].count('+') | |
127 attribute['identities'] = len(attribute['match']) - similar - attribute['gaps'] | |
128 attribute['positives'] = attribute['identities'] + similar | |
129 attribute['expect'] = hsp.expect | |
103 # show reading frame attribute only if the frame is not (0, 0) | 130 # show reading frame attribute only if the frame is not (0, 0) |
104 if hsp.frame[0] != 0 or hsp.frame[1] != 0: | 131 attribute['frame'] = hsp.frame[1] |
105 attribute['reading_frame'] = str(hsp.frame[0]) + ", " + str(hsp.frame[1]) | 132 match_num += 1 |
106 utils.write_features(field, attribute, gff3) | 133 hsp_align['field'] = field |
134 hsp_align['attribute'] = attribute | |
135 group['alignments'].append(hsp_align) | |
136 group['parent_field']['start'] = coords[0] | |
137 group['parent_field']['end'] = coords[1] | |
138 group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.' | |
139 group['parent_attribute']['match_num'] = match_num | |
140 utils.write_features(group['parent_field'], group['parent_attribute'], gff3) | |
141 for align in group['alignments']: | |
142 utils.write_features(align['field'], align['attribute'], gff3) | |
143 gff3.close() | |
144 | |
107 | 145 |
108 def blastxml2gff3(xml_file, gff3_file): | 146 def blastxml2gff3(xml_file, gff3_file): |
109 result_handle = open(xml_file) | 147 result_handle = open(xml_file) |
110 blast_records = NCBIXML.parse(result_handle) | 148 blast_records = NCBIXML.parse(result_handle) |
111 gff3_writer(blast_records, gff3_file) | 149 gff3_writer(blast_records, gff3_file) |
112 | 150 |
113 if __name__ == "__main__": | 151 if __name__ == "__main__": |
114 blastxml2gff3("../tblastn_dmel.blastxml", "gff3.txt") | 152 blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt") |
115 | 153 |