Mercurial > repos > iuc > virannot_blast2tsv
comparison blast2tsv.py @ 0:b82ce29791e7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
author | iuc |
---|---|
date | Wed, 21 Aug 2024 13:12:59 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b82ce29791e7 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # Name: blast2tsv | |
5 # Author(s): Sebastien Theil, Marie Lefebvre - INRAE | |
6 # Aims: Convert blast xml output to tsv and add taxonomy | |
7 | |
8 | |
9 import argparse | |
10 import csv | |
11 import logging as log | |
12 import os | |
13 | |
14 from Bio import Entrez | |
15 from Bio import SeqIO | |
16 from Bio.Blast import NCBIXML | |
17 from ete3 import NCBITaxa | |
18 | |
19 ncbi = NCBITaxa() | |
20 | |
21 | |
def main():
    """Command-line entry point: parse options, set logging, convert XML to TSV."""
    opts = _set_options()
    _set_log_level(opts.verbosity)
    _write_tsv(opts, _read_xml(opts))
27 | |
28 | |
29 def _guess_database(accession): | |
30 """Guess the correct database for querying based off the format of the accession""" | |
31 if accession.isdigit(): | |
32 db = 'taxonomy' | |
33 else: | |
34 database_mappings_refseq = {'AC': 'nuccore', 'NC': 'nuccore', 'NG': 'nuccore', | |
35 'NT': 'nuccore', 'NW': 'nuccore', 'NZ': 'nuccore', | |
36 'AP': 'protein', 'NP': 'protein', 'YP': 'protein', | |
37 'XP': 'protein', 'WP': 'protein', 'OX': 'nuccore'} | |
38 try: | |
39 db = database_mappings_refseq[accession[0:2]] | |
40 except KeyError: | |
41 db = 'nuccore' | |
42 log.warning("DB not found for " + accession + ". Set to nuccore.") | |
43 return db | |
44 | |
45 | |
def _read_xml(options):
    """
    Parse XML blast results file
    Keep only the first hit

    For each query/alignment pair, HSP statistics (identities, score, e-value,
    alignment length) are accumulated over the HSPs lying on the same strand
    as the first HSP; other-strand HSPs are discarded.  The hit's taxonomy is
    resolved via Entrez + ete3.  Queries passing the e-value/overlap/score
    thresholds map to a dict of hit attributes; failing queries map to
    ``[query_length]`` only.

    NOTE(review): every alignment of the same query overwrites the previous
    entry in ``xml_results``, so the LAST alignment wins, not the first —
    confirm which is intended.
    """
    log.info("Read XML file.")
    results = open(options.xml_file, 'r')
    records = NCBIXML.parse(results)
    xml_results = {}
    for blast_record in records:
        for aln in blast_record.alignments:
            hit_count = 1
            for hit in aln.hsps:
                hsp = {}
                if hit_count == 1:
                    # First HSP: initialise the cumulative counters.
                    first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0  # strand
                    cumul_hit_identity = hit.identities if hit.identities else 0
                    cumul_hit_score = hit.bits  # hit score
                    cumul_hit_evalue = hit.expect  # evalue
                    cumul_hit_length = hit.align_length if hit.align_length is not None else 0
                    hit_count = hit_count + 1
                else:
                    # all HSPs in different strand than 1st HSPs will be discarded.
                    if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
                        cumul_hit_identity = cumul_hit_identity + hit.identities
                        cumul_hit_length = cumul_hit_length + hit.align_length
                        cumul_hit_evalue = cumul_hit_evalue + hit.expect
                        cumul_hit_score = cumul_hit_score + hit.bits
                        hit_count = hit_count + 1
            # hit_count was incremented once per kept HSP on top of its
            # initial value of 1, so subtract 1 to get the HSPs accumulated.
            if hit_count == 1:
                final_hit_count = hit_count
            elif hit_count > 1:
                final_hit_count = hit_count - 1
            hsp["evalue"] = cumul_hit_evalue / final_hit_count  # The smaller the E-value, the better the match
            hsp["query_id"] = blast_record.query  # or query_id
            hsp["query_length"] = blast_record.query_length  # length of the query
            hsp["accession"] = aln.accession.replace("ref|", "")
            hsp["description"] = aln.hit_def
            hsp["hit_length"] = aln.length  # length of the hit
            hsp["hsp_length"] = hit.align_length  # length of the hsp alignment
            hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
            if cumul_hit_length == 0:
                hsp["percentIdentity"] = round(cumul_hit_identity, 1)  # identity percentage
            else:
                hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1)  # identity percentage
            hsp["score"] = cumul_hit_score  # The higher the bit-score, the better the sequence similarity
            hsp["num_hsps"] = final_hit_count
            hsp["hit_cumul_length"] = cumul_hit_length
            hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
            # Resolve taxonomy: Entrez esummary yields the taxid, ete3 expands
            # it to the lineage (root rank dropped via ordered[1:]).
            db = _guess_database(hsp["accession"])
            try:
                handle = Entrez.esummary(db=db, id=hsp["accession"])
                taxid = str(int(Entrez.read(handle)[0]['TaxId']))
                handle.close()
                log.info("Taxid found for " + hsp["accession"])
                lineage = ncbi.get_lineage(taxid)
                names = ncbi.get_taxid_translator(lineage)
                ordered = [names[tid] for tid in lineage]
                taxonomy = ordered[1:]
                hsp["tax_id"] = taxid
                hsp["taxonomy"] = ';'.join(taxonomy)
                hsp["organism"] = taxonomy[-1]
            except RuntimeError:
                hsp["tax_id"] = ""
                hsp["taxonomy"] = ""
                hsp["organism"] = ""
                log.warning(f"RuntimeError - Taxid not found for {hsp['accession']}")
            except Exception as err:
                # Best-effort: a failed taxonomy lookup must not abort the run.
                hsp["tax_id"] = ""
                hsp["taxonomy"] = ""
                hsp["organism"] = ""
                log.warning(f"Taxid not found for {hsp['accession']}. The error is {err}")
            # Keep the full record only when it passes all thresholds;
            # otherwise remember just the query length (a 1-element list).
            if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
                    hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
                xml_results[hsp["query_id"]] = hsp
            else:
                xml_results[hsp["query_id"]] = [hsp["query_length"]]

    return xml_results
125 | |
126 | |
127 def _get_overlap_value(algo, hsp, type, qlength): | |
128 """ | |
129 Set hsp or hit overlap values for hit and query | |
130 Return array [query_overlap, hit_overlap] | |
131 """ | |
132 if type == 'hsp': | |
133 q_align_len = qlength | |
134 h_align_len = hsp["hsp_length"] | |
135 else: | |
136 q_align_len = qlength | |
137 h_align_len = hsp["hit_cumul_length"] | |
138 | |
139 if algo == 'BLASTX': | |
140 if q_align_len: | |
141 query_overlap = (q_align_len * 3 / q_align_len) * 100 | |
142 if hsp["hit_length"]: | |
143 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
144 elif algo == 'TBLASTN': | |
145 if q_align_len: | |
146 query_overlap = (q_align_len / q_align_len) * 100 | |
147 if hsp["hit_length"]: | |
148 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
149 elif algo == 'TBLASTX': | |
150 if q_align_len: | |
151 query_overlap = (q_align_len * 3 / hsp["hsp_length"]) * 100 | |
152 if hsp["hit_length"]: | |
153 hit_overlap = (h_align_len * 3 / hsp["hit_length"]) * 100 | |
154 else: | |
155 if q_align_len: | |
156 query_overlap = (q_align_len / q_align_len) * 100 | |
157 if hsp["hit_length"]: | |
158 hit_overlap = (h_align_len / hsp["hit_length"]) * 100 | |
159 if query_overlap is None: | |
160 query_overlap = 0 | |
161 if query_overlap > 100: | |
162 query_overlap = 100 | |
163 if 'hit_overlap' not in locals(): | |
164 hit_overlap = 0 | |
165 if hit_overlap > 100: | |
166 hit_overlap = 100 | |
167 | |
168 return [round(query_overlap, 0), round(hit_overlap, 0)] | |
169 | |
170 | |
def _write_tsv(options, hits):
    """
    Write the aggregated hits to <output>/blast2tsv_output.tab.

    ``hits`` maps query_id -> dict of hit attributes (annotated contig) or
    query_id -> ``[query_length]`` (contig that failed the filters); the two
    cases are distinguished below via ``len(hits[h]) > 1``.

    Fix vs. the original: the FASTA handle was opened inline and never
    closed; the output file is also now managed with ``with``.
    """
    # Map contig id -> read-count columns from the optional tab file.
    rn_list = {}
    if options.rn_file is not None:
        with open(options.rn_file) as rn:
            rows = (line.split('\t') for line in rn)
            rn_list = {row[0]: row[1:] for row in rows}
    with open(options.fasta_file) as fasta_handle:
        fasta = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))
    headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    tsv_file = options.output + "/blast2tsv_output.tab"
    log.info("Write output file: " + tsv_file)
    with open(tsv_file, "w+") as f:
        f.write(headers)
        for h in hits:
            if options.rn_file is not None:
                read_nb = ''.join(rn_list[h]).replace("\n", "")
            else:
                read_nb = ''
            if len(hits[h]) > 1:
                # Annotated hit: emit every column.
                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
                f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
                f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
                f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
                f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
                f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
                if h in fasta:
                    f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
                else:
                    f.write(hits[h]["taxonomy"] + "\t\"\"")
                f.write("\n")
            else:
                # Filtered-out contig: only its query length is known.
                f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
                f.write("\n")
    _create_abundance(options, tsv_file)
210 | |
211 | |
212 def _create_abundance(options, tsv_file): | |
213 """ | |
214 extract values from tsv files | |
215 and create abundance files | |
216 """ | |
217 log.info("Calculating abundance.") | |
218 file_path = tsv_file | |
219 abundance = dict() | |
220 with open(tsv_file, 'r') as current_file: | |
221 log.debug("Reading " + file_path) | |
222 csv_reader = csv.reader(current_file, delimiter='\t') | |
223 line_count = 0 | |
224 for row in csv_reader: | |
225 if line_count == 0: | |
226 # headers | |
227 line_count += 1 | |
228 else: | |
229 # no annotation | |
230 if len(row) == 16: | |
231 if row[14] != "": | |
232 nb_reads = row[2] | |
233 if nb_reads == "": | |
234 current_reads_nb = 0 | |
235 log.debug("No reads number for " + row[1]) | |
236 else: | |
237 current_reads_nb = int(nb_reads) | |
238 contig_id = row[14] | |
239 if contig_id in abundance: | |
240 # add reads | |
241 abundance[contig_id]["reads_nb"] = abundance[row[14]]["reads_nb"] + current_reads_nb | |
242 abundance[contig_id]["contigs_nb"] = abundance[row[14]]["contigs_nb"] + 1 | |
243 else: | |
244 # init reads for this taxo | |
245 abundance[contig_id] = {} | |
246 abundance[contig_id]["reads_nb"] = current_reads_nb | |
247 abundance[contig_id]["contigs_nb"] = 1 | |
248 else: | |
249 log.debug("No annotations for contig " + row[1]) | |
250 else: | |
251 log.debug("No annotations for contig " + row[1]) | |
252 log.debug(abundance) | |
253 reads_file = open(options.output + "/blast2tsv_reads.txt", "w+") | |
254 for taxo in abundance: | |
255 reads_file.write(str(abundance[taxo]["reads_nb"])) | |
256 reads_file.write("\t") | |
257 reads_file.write("\t".join(taxo.split(";"))) | |
258 reads_file.write("\n") | |
259 reads_file.close() | |
260 log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt") | |
261 contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+") | |
262 for taxo in abundance: | |
263 contigs_file.write(str(abundance[taxo]["contigs_nb"])) | |
264 contigs_file.write("\t") | |
265 contigs_file.write("\t".join(taxo.split(";"))) | |
266 contigs_file.write("\n") | |
267 contigs_file.close() | |
268 log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt") | |
269 | |
270 | |
271 def _set_options(): | |
272 parser = argparse.ArgumentParser() | |
273 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
274 parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file') | |
275 parser.add_argument('-c', '--contigs', help='FASTA file with contigs sequence.', action='store', required=True, dest='fasta_file') | |
276 parser.add_argument('-me', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
277 parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap', action='store', type=int, default=5, dest='min_qov') | |
278 parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap', action='store', type=int, default=5, dest='min_hov') | |
279 parser.add_argument('-s', '--min_score', help='Minimum score', action='store', type=int, default=30, dest='min_score') | |
280 parser.add_argument('-a', '--algo', help='Blast type detection (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo') | |
281 parser.add_argument('-o', '--out', help='The output file (.csv).', action='store', type=str, default='./blast2tsv', dest='output') | |
282 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
283 args = parser.parse_args() | |
284 return args | |
285 | |
286 | |
287 def _set_log_level(verbosity): | |
288 if verbosity == 1: | |
289 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
290 log.basicConfig(level=log.INFO, format=log_format) | |
291 elif verbosity == 3: | |
292 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
293 log.basicConfig(level=log.DEBUG, format=log_format) | |
294 | |
295 | |
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()