Mercurial > repos > yating-l > jbrowsearchivecreator
comparison blastxmlToGff3.py @ 0:8d1cf7ce65cd draft
planemo upload for repository https://github.com/Yating-L/jbrowse-archive-creator.git commit d583ac16a6c6942730ea536eb59cc37941816030-dirty
| author | yating-l |
|---|---|
| date | Thu, 18 May 2017 17:25:33 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8d1cf7ce65cd |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 | |
| 4 from Bio.Blast import NCBIXML | |
| 5 from collections import OrderedDict | |
| 6 import utils | |
| 7 | |
| 8 | |
| 9 def align2cigar(hsp_query, hsp_reference): | |
| 10 """ | |
| 11 Build CIGAR representation from an hsp_query | |
| 12 input: | |
| 13 hsp_query | |
| 14 hsp_sbjct | |
| 15 output: | |
| 16 CIGAR string | |
| 17 """ | |
| 18 query = hsp_query | |
| 19 ref = hsp_reference | |
| 20 # preType, curType: | |
| 21 # 'M' represents match, | |
| 22 # 'I' represents insert a gap into the reference sequence, | |
| 23 # 'D' represents insert a gap into the target (delete from reference) | |
| 24 # some ideas of this algin2cigar function are coming from | |
| 25 # https://gist.github.com/ozagordi/099bdb796507da8d9426 | |
| 26 prevType = 'M' | |
| 27 curType = 'M' | |
| 28 count = 0 | |
| 29 cigar = [] | |
| 30 num = len(query) | |
| 31 for i in range(num): | |
| 32 if query[i] == '-': | |
| 33 curType = 'D' | |
| 34 elif ref[i] == '-': | |
| 35 curType = 'I' | |
| 36 else: | |
| 37 curType = 'M' | |
| 38 if curType == prevType: | |
| 39 count += 1 | |
| 40 else: | |
| 41 cigar.append('%s%d' % (prevType, count)) | |
| 42 prevType = curType | |
| 43 count = 1 | |
| 44 cigar.append('%s%d' % (curType, count)) | |
| 45 return ' '.join(cigar) | |
| 46 | |
| 47 def gff3_writer(blast_records, gff3_file): | |
| 48 gff3 = open(gff3_file, 'a') | |
| 49 gff3.write("##gff-version 3\n") | |
| 50 seq_regions = dict() | |
| 51 for blast_record in blast_records: | |
| 52 query_name = blast_record.query.split(" ")[0] | |
| 53 source = blast_record.application | |
| 54 method = blast_record.matrix | |
| 55 for alignment in blast_record.alignments: | |
| 56 group = { | |
| 57 "parent_field" : OrderedDict(), | |
| 58 "parent_attribute" : OrderedDict(), | |
| 59 "alignments" : [] | |
| 60 } | |
| 61 title = alignment.title.split(" ") | |
| 62 contig_name = title[len(title) - 1] | |
| 63 length = alignment.length | |
| 64 group['parent_field']['seqid'] = contig_name | |
| 65 group['parent_field']['source'] = source | |
| 66 group['parent_field']['type'] = 'match' | |
| 67 group['parent_attribute']['ID'] = contig_name + '_' + query_name | |
| 68 group['parent_attribute']['method'] = method | |
| 69 group['parent_attribute']['length'] = length | |
| 70 if contig_name not in seq_regions: | |
| 71 gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n') | |
| 72 seq_regions[contig_name] = length | |
| 73 match_num = 0 | |
| 74 coords = [length, 0] | |
| 75 for hsp in alignment.hsps: | |
| 76 hsp_align = {} | |
| 77 field = OrderedDict() | |
| 78 attribute = OrderedDict() | |
| 79 ref = hsp.sbjct | |
| 80 query = hsp.query | |
| 81 field['seqid'] = contig_name | |
| 82 field['source'] = source | |
| 83 field['type'] = 'match_part' | |
| 84 | |
| 85 field['start'] = hsp.sbjct_start | |
| 86 if field['start'] < coords[0]: | |
| 87 coords[0] = field['start'] | |
| 88 ref_length = len(ref.replace('-', '')) | |
| 89 # if run tblastn, the actual length of reference should be multiplied by 3 | |
| 90 if source.lower() == "tblastn": | |
| 91 ref_length *= 3 | |
| 92 field['end'] = field['start'] + ref_length - 1 | |
| 93 if field['end'] > coords[1]: | |
| 94 coords[1] = field['end'] | |
| 95 field['score'] = hsp.score | |
| 96 #decide if the alignment in the same strand or reverse strand | |
| 97 #reading frame | |
| 98 # (+, +), (0, 0), (-, -) => + | |
| 99 # (+, -), (-, +) => - | |
| 100 if hsp.frame[1] * hsp.frame[0] > 0: | |
| 101 field['strand'] = '+' | |
| 102 elif hsp.frame[1] * hsp.frame[0] < 0: | |
| 103 field['strand'] = '-' | |
| 104 else: | |
| 105 if hsp.frame[0] + hsp.frame[1] >= 0: | |
| 106 field['strand'] = '+' | |
| 107 else: | |
| 108 field['strand'] = '-' | |
| 109 field['phase'] = '.' | |
| 110 | |
| 111 target_start = hsp.query_start | |
| 112 target_len = len(query.replace('-', '')) | |
| 113 # if run blastx, the actual length of query should be multiplied by 3 | |
| 114 if source.lower() == "blastx": | |
| 115 target_len *= 3 | |
| 116 target_end = target_start + target_len -1 | |
| 117 attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num) | |
| 118 attribute['Parent'] = group['parent_attribute']['ID'] | |
| 119 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) | |
| 120 attribute['Gap'] = align2cigar(query, ref) | |
| 121 #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin | |
| 122 attribute['subject'] = hsp.sbjct | |
| 123 attribute['query'] = hsp.query | |
| 124 attribute['match'] = hsp.match | |
| 125 attribute['gaps'] = attribute['match'].count(' ') | |
| 126 similar = attribute['match'].count('+') | |
| 127 attribute['identities'] = len(attribute['match']) - similar - attribute['gaps'] | |
| 128 attribute['positives'] = attribute['identities'] + similar | |
| 129 attribute['expect'] = hsp.expect | |
| 130 # show reading frame attribute only if the frame is not (0, 0) | |
| 131 attribute['frame'] = hsp.frame[1] | |
| 132 match_num += 1 | |
| 133 hsp_align['field'] = field | |
| 134 hsp_align['attribute'] = attribute | |
| 135 group['alignments'].append(hsp_align) | |
| 136 group['parent_field']['start'] = coords[0] | |
| 137 group['parent_field']['end'] = coords[1] | |
| 138 group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.' | |
| 139 group['parent_attribute']['match_num'] = match_num | |
| 140 group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end'])) | |
| 141 utils.write_features(group['parent_field'], group['parent_attribute'], gff3) | |
| 142 prev_end = -1 | |
| 143 for align in group['alignments']: | |
| 144 overlap = '' | |
| 145 if align['field']['start'] <= prev_end: | |
| 146 overlap += str(align['field']['start']) + ',' + str(prev_end) | |
| 147 prev_end = align['field']['end'] | |
| 148 align['attribute']['overlap'] = overlap | |
| 149 utils.write_features(align['field'], align['attribute'], gff3) | |
| 150 gff3.close() | |
| 151 | |
| 152 def blastxml2gff3(xml_file, gff3_file): | |
| 153 result_handle = open(xml_file) | |
| 154 blast_records = NCBIXML.parse(result_handle) | |
| 155 gff3_writer(blast_records, gff3_file) | |
| 156 | |
| 157 if __name__ == "__main__": | |
| 158 blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt") | |
| 159 |
