comparison blastxmlToGff3.py @ 45:15b4a74722d1 draft

planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit 73ede51a133dc8914812195a1b4cd512bfb8f23d
author yating-l
date Wed, 22 Mar 2017 12:07:32 -0400
parents d8049deb0c97
children 6803152ea92a
comparison
equal deleted inserted replaced
44:37422f705e9b 45:15b4a74722d1
43 count = 1 43 count = 1
44 cigar.append('%s%d' % (curType, count)) 44 cigar.append('%s%d' % (curType, count))
45 return ' '.join(cigar) 45 return ' '.join(cigar)
46 46
47 def gff3_writer(blast_records, gff3_file): 47 def gff3_writer(blast_records, gff3_file):
48 gff3 = open(gff3_file, 'w') 48 gff3 = open(gff3_file, 'a')
49 gff3.write("##gff-version 3\n") 49 gff3.write("##gff-version 3\n")
50 seq_regions = dict() 50 seq_regions = dict()
51 for blast_record in blast_records: 51 for blast_record in blast_records:
52 query_name = blast_record.query.split(" ")[0] 52 query_name = blast_record.query.split(" ")[0]
53 source = blast_record.application 53 source = blast_record.application
54 method = blast_record.matrix
54 for alignment in blast_record.alignments: 55 for alignment in blast_record.alignments:
56 group = {
57 "parent_field" : OrderedDict(),
58 "parent_attribute" : OrderedDict(),
59 "alignments" : []
60 }
55 title = alignment.title.split(" ") 61 title = alignment.title.split(" ")
56 seqid = title[len(title) - 1] 62 contig_name = title[len(title) - 1]
57 length = alignment.length 63 length = alignment.length
58 feature_type = 'match' 64 group['parent_field']['seqid'] = contig_name
65 group['parent_field']['source'] = source
66 group['parent_field']['type'] = 'match'
67 group['parent_attribute']['ID'] = contig_name + '_' + query_name
68 group['parent_attribute']['method'] = method
69 group['parent_attribute']['length'] = length
70 if contig_name not in seq_regions:
71 gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
72 seq_regions[contig_name] = length
73 match_num = 0
74 coords = [length, 0]
59 for hsp in alignment.hsps: 75 for hsp in alignment.hsps:
76 hsp_align = {}
60 field = OrderedDict() 77 field = OrderedDict()
61 attribute = OrderedDict() 78 attribute = OrderedDict()
62 ref = hsp.sbjct 79 ref = hsp.sbjct
63 query = hsp.query 80 query = hsp.query
64 field['seqid'] = seqid 81 field['seqid'] = contig_name
65 field['source'] = source 82 field['source'] = source
66 field['type'] = feature_type 83 field['type'] = 'match_part'
67 if seqid not in seq_regions: 84
68 gff3.write("##sequence-region " + field['seqid'] + ' 1 ' + str(length) + '\n')
69 seq_regions[seqid] = length
70 field['start'] = hsp.sbjct_start 85 field['start'] = hsp.sbjct_start
86 if field['start'] < coords[0]:
87 coords[0] = field['start']
71 ref_length = len(ref.replace('-', '')) 88 ref_length = len(ref.replace('-', ''))
72 # if run tblastn, the actual length of reference should be multiplied by 3 89 # if run tblastn, the actual length of reference should be multiplied by 3
73 if source.lower() == "tblastn": 90 if source.lower() == "tblastn":
74 ref_length *= 3 91 ref_length *= 3
75 field['end'] = field['start'] + ref_length - 1 92 field['end'] = field['start'] + ref_length - 1
76 field['score'] = hsp.expect 93 if field['end'] > coords[1]:
94 coords[1] = field['end']
95 field['score'] = hsp.score
77 #decide if the alignment in the same strand or reverse strand 96 #decide if the alignment in the same strand or reverse strand
78 #reading frame 97 #reading frame
79 # (+, +), (0, 0), (-, -) => + 98 # (+, +), (0, 0), (-, -) => +
80 # (+, -), (-, +) => - 99 # (+, -), (-, +) => -
81 if hsp.frame[1] * hsp.frame[0] > 0: 100 if hsp.frame[1] * hsp.frame[0] > 0:
93 target_len = len(query.replace('-', '')) 112 target_len = len(query.replace('-', ''))
94 # if run blastx, the actual length of query should be multiplied by 3 113 # if run blastx, the actual length of query should be multiplied by 3
95 if source.lower() == "blastx": 114 if source.lower() == "blastx":
96 target_len *= 3 115 target_len *= 3
97 target_end = target_start + target_len -1 116 target_end = target_start + target_len -1
98 attribute['ID'] = field['seqid'] + '_' + str(field['start']) + '_' + str(field['end']) + '_' + query_name + '_' + str(target_start) + '_' + str(target_end) 117 attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
118 attribute['Parent'] = group['parent_attribute']['ID']
99 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end) 119 attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
100 attribute['Gap'] = align2cigar(query, ref) 120 attribute['Gap'] = align2cigar(query, ref)
101 #store the query sequence in the file in order to display alignment with BlastAlignment plugin 121 #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
122 attribute['subject'] = hsp.sbjct
102 attribute['query'] = hsp.query 123 attribute['query'] = hsp.query
124 attribute['match'] = hsp.match
125 attribute['gaps'] = attribute['match'].count(' ')
126 similar = attribute['match'].count('+')
127 attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
128 attribute['positives'] = attribute['identities'] + similar
129 attribute['expect'] = hsp.expect
103 # show reading frame attribute only if the frame is not (0, 0) 130 # show reading frame attribute only if the frame is not (0, 0)
104 if hsp.frame[0] != 0 or hsp.frame[1] != 0: 131 attribute['frame'] = hsp.frame[1]
105 attribute['reading_frame'] = str(hsp.frame[0]) + ", " + str(hsp.frame[1]) 132 match_num += 1
106 utils.write_features(field, attribute, gff3) 133 hsp_align['field'] = field
134 hsp_align['attribute'] = attribute
135 group['alignments'].append(hsp_align)
136 group['parent_field']['start'] = coords[0]
137 group['parent_field']['end'] = coords[1]
138 group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
139 group['parent_attribute']['match_num'] = match_num
140 utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
141 for align in group['alignments']:
142 utils.write_features(align['field'], align['attribute'], gff3)
143 gff3.close()
144
107 145
108 def blastxml2gff3(xml_file, gff3_file): 146 def blastxml2gff3(xml_file, gff3_file):
109 result_handle = open(xml_file) 147 result_handle = open(xml_file)
110 blast_records = NCBIXML.parse(result_handle) 148 blast_records = NCBIXML.parse(result_handle)
111 gff3_writer(blast_records, gff3_file) 149 gff3_writer(blast_records, gff3_file)
112 150
113 if __name__ == "__main__": 151 if __name__ == "__main__":
114 blastxml2gff3("../tblastn_dmel.blastxml", "gff3.txt") 152 blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
115 153