comparison blast2tsv.py @ 0:b82ce29791e7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
author iuc
date Wed, 21 Aug 2024 13:12:59 +0000
#!/usr/bin/env python3


# Name: blast2tsv
# Author(s): Sebastien Theil, Marie Lefebvre - INRAE
# Aims: convert BLAST XML output to TSV and add taxonomy


import argparse
import csv
import logging as log
import os

from Bio import Entrez
from Bio import SeqIO
from Bio.Blast import NCBIXML
from ete3 import NCBITaxa

# NCBITaxa opens the local ete3 taxonomy database (downloaded from NCBI on
# first use, under ~/.etetoolkit/ by default)
ncbi = NCBITaxa()


def main():
    options = _set_options()
    _set_log_level(options.verbosity)
    hits = _read_xml(options)
    _write_tsv(options, hits)


def _guess_database(accession):
    """Guess the correct database for querying based on the format of the accession"""
    if accession.isdigit():
        db = 'taxonomy'
    else:
        database_mappings_refseq = {'AC': 'nuccore', 'NC': 'nuccore', 'NG': 'nuccore',
                                    'NT': 'nuccore', 'NW': 'nuccore', 'NZ': 'nuccore',
                                    'AP': 'protein', 'NP': 'protein', 'YP': 'protein',
                                    'XP': 'protein', 'WP': 'protein', 'OX': 'nuccore'}
        try:
            db = database_mappings_refseq[accession[0:2]]
        except KeyError:
            db = 'nuccore'
            log.warning("DB not found for " + accession + ". Set to nuccore.")
    return db


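# Illustrative examples (not part of the original script) of how the prefix
# mapping above resolves accessions:
#   _guess_database("9606")          -> 'taxonomy'  (all digits)
#   _guess_database("NC_001477.1")   -> 'nuccore'   (RefSeq 'NC' prefix)
#   _guess_database("YP_009724390")  -> 'protein'   (RefSeq 'YP' prefix)
#   _guess_database("AB123456")      -> 'nuccore'   (unknown prefix, fallback)
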
def _read_xml(options):
    """
    Parse the XML BLAST results file.
    Keep only the first hit for each query.
    """
    log.info("Read XML file.")
    xml_results = {}
    with open(options.xml_file, 'r') as results:
        records = NCBIXML.parse(results)
        for blast_record in records:
            for aln in blast_record.alignments:
                hit_count = 1
                for hit in aln.hsps:
                    hsp = {}
                    if hit_count == 1:
                        first_hit_frame = hit.frame[1] if len(hit.frame) > 0 else 0  # strand
                        cumul_hit_identity = hit.identities if hit.identities else 0
                        cumul_hit_score = hit.bits  # bit score
                        cumul_hit_evalue = hit.expect  # e-value
                        cumul_hit_length = hit.align_length if hit.align_length is not None else 0
                        hit_count = hit_count + 1
                    else:
                        # all HSPs on a different strand than the first HSP are discarded
                        if (first_hit_frame > 0 and hit.frame[1] > 0) or (first_hit_frame < 0 and hit.frame[1] < 0):
                            cumul_hit_identity = cumul_hit_identity + hit.identities
                            cumul_hit_length = cumul_hit_length + hit.align_length
                            cumul_hit_evalue = cumul_hit_evalue + hit.expect
                            cumul_hit_score = cumul_hit_score + hit.bits
                            hit_count = hit_count + 1
                    if hit_count == 1:
                        final_hit_count = hit_count
                    elif hit_count > 1:
                        # hit_count starts at 1, so the number of HSPs kept is one less
                        final_hit_count = hit_count - 1
                    hsp["evalue"] = cumul_hit_evalue / final_hit_count  # the smaller the e-value, the better the match
                    hsp["query_id"] = blast_record.query
                    hsp["query_length"] = blast_record.query_length  # length of the query
                    hsp["accession"] = aln.accession.replace("ref|", "")
                    hsp["description"] = aln.hit_def
                    hsp["hit_length"] = aln.length  # length of the hit
                    hsp["hsp_length"] = hit.align_length  # length of the HSP alignment
                    hsp["queryOverlap"] = _get_overlap_value(options.algo, hsp, 'hsp', hsp["query_length"])[0]
                    if cumul_hit_length == 0:
                        hsp["percentIdentity"] = round(cumul_hit_identity, 1)  # identity percentage
                    else:
                        hsp["percentIdentity"] = round(cumul_hit_identity / cumul_hit_length * 100, 1)  # identity percentage
                    hsp["score"] = cumul_hit_score  # the higher the bit score, the better the sequence similarity
                    hsp["num_hsps"] = final_hit_count
                    hsp["hit_cumul_length"] = cumul_hit_length
                    hsp["hitOverlap"] = _get_overlap_value(options.algo, hsp, 'hit', hsp["query_length"])[1]
                    db = _guess_database(hsp["accession"])
                    try:
                        handle = Entrez.esummary(db=db, id=hsp["accession"])
                        taxid = str(int(Entrez.read(handle)[0]['TaxId']))
                        handle.close()
                        log.info("Taxid found for " + hsp["accession"])
                        lineage = ncbi.get_lineage(taxid)
                        names = ncbi.get_taxid_translator(lineage)
                        ordered = [names[tid] for tid in lineage]
                        taxonomy = ordered[1:]  # drop the 'root' node
                        hsp["tax_id"] = taxid
                        hsp["taxonomy"] = ';'.join(taxonomy)
                        hsp["organism"] = taxonomy[-1]
                    except RuntimeError:
                        hsp["tax_id"] = ""
                        hsp["taxonomy"] = ""
                        hsp["organism"] = ""
                        log.warning(f"RuntimeError - Taxid not found for {hsp['accession']}")
                    except Exception as err:
                        hsp["tax_id"] = ""
                        hsp["taxonomy"] = ""
                        hsp["organism"] = ""
                        log.warning(f"Taxid not found for {hsp['accession']}. The error is {err}")
                    if hsp["evalue"] <= options.max_evalue and hsp["queryOverlap"] >= options.min_qov and \
                            hsp["hitOverlap"] >= options.min_hov and hsp["score"] >= options.min_score:
                        xml_results[hsp["query_id"]] = hsp
                    else:
                        # failed filters: keep only the query length
                        xml_results[hsp["query_id"]] = [hsp["query_length"]]

    return xml_results


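# Illustrative lookup flow for the taxonomy block above (requires network
# access for Entrez and the local ete3 taxonomy database; the accession is
# an example, not from the original script):
#   Entrez.esummary(db='nuccore', id='NC_001477')  -> summary with a 'TaxId'
#   ncbi.get_lineage(taxid)            -> root-to-leaf list of taxids
#   ncbi.get_taxid_translator(lineage) -> {taxid: scientific name}
# The names are then joined with ';' into the "taxonomy" column.
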
def _get_overlap_value(algo, hsp, align_type, qlength):
    """
    Compute the overlap percentages for the query and the hit, either for a
    single HSP ('hsp') or for the cumulated HSPs of a hit ('hit').
    Return [query_overlap, hit_overlap], rounded and capped at 100.
    """
    if align_type == 'hsp':
        align_len = hsp["hsp_length"]
    else:
        align_len = hsp["hit_cumul_length"]

    query_overlap = 0
    hit_overlap = 0
    if algo == 'BLASTX':
        # nucleotide query against a protein database:
        # alignment lengths are in amino acids, the query length in nucleotides
        if qlength:
            query_overlap = (align_len * 3 / qlength) * 100
        if hsp["hit_length"]:
            hit_overlap = (align_len / hsp["hit_length"]) * 100
    elif algo == 'TBLASTN':
        # protein query against a translated nucleotide database
        if qlength:
            query_overlap = (align_len / qlength) * 100
        if hsp["hit_length"]:
            hit_overlap = (align_len * 3 / hsp["hit_length"]) * 100
    elif algo == 'TBLASTX':
        # translated nucleotide query against a translated nucleotide database
        if qlength:
            query_overlap = (align_len * 3 / qlength) * 100
        if hsp["hit_length"]:
            hit_overlap = (align_len * 3 / hsp["hit_length"]) * 100
    else:
        # BLASTN, BLASTP and similar: query and hit are in the same units
        if qlength:
            query_overlap = (align_len / qlength) * 100
        if hsp["hit_length"]:
            hit_overlap = (align_len / hsp["hit_length"]) * 100
    query_overlap = min(query_overlap, 100)
    hit_overlap = min(hit_overlap, 100)

    return [round(query_overlap, 0), round(hit_overlap, 0)]


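# Worked example (illustrative, assuming the corrected formulas above): for a
# BLASTX hit with hsp_length = 100 amino acids on a 600 nt query and a
# 350 aa subject, the single-HSP ('hsp') call returns
#   query_overlap = 100 * 3 / 600 * 100 = 50.0  -> 50
#   hit_overlap   = 100 / 350 * 100    ~= 28.6  -> 29
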
def _write_tsv(options, hits):
    """
    Write the TSV output file
    """
    # map each contig ID to its number of mapped reads
    if options.rn_file is not None:
        with open(options.rn_file) as rn:
            rows = (line.split('\t') for line in rn)
            rn_list = {row[0]: row[1:] for row in rows}
    fasta = SeqIO.to_dict(SeqIO.parse(open(options.fasta_file), 'fasta'))
    headers = "#algo\tquery_id\tnb_reads\tquery_length\taccession\tdescription\torganism\tpercentIdentity\tnb_hsps\tqueryOverlap\thitOverlap\tevalue\tscore\ttax_id\ttaxonomy\tsequence\n"
    if not os.path.exists(options.output):
        os.mkdir(options.output)
    tsv_file = options.output + "/blast2tsv_output.tab"
    log.info("Write output file: " + tsv_file)
    f = open(tsv_file, "w+")
    f.write(headers)
    for h in hits:
        if options.rn_file is not None:
            read_nb = ''.join(rn_list[h]).replace("\n", "")
        else:
            read_nb = ''
        if len(hits[h]) > 1:
            # annotated query: hits[h] is the hsp dictionary
            f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h]["query_length"]) + "\t")
            f.write(hits[h]["accession"] + "\t" + hits[h]["description"] + "\t")
            f.write(hits[h]["organism"] + "\t" + str(hits[h]["percentIdentity"]) + "\t")
            f.write(str(hits[h]["num_hsps"]) + "\t" + str(hits[h]["queryOverlap"]) + "\t")
            f.write(str(hits[h]["hitOverlap"]) + "\t" + str(hits[h]["evalue"]) + "\t")
            f.write(str(hits[h]["score"]) + "\t" + str(hits[h]["tax_id"]) + "\t")
            if h in fasta:
                f.write(hits[h]["taxonomy"] + "\t" + str(fasta[h].seq))
            else:
                f.write(hits[h]["taxonomy"] + "\t\"\"")
            f.write("\n")
        else:
            # filtered out: hits[h] only holds the query length
            f.write(options.algo + "\t" + h + "\t" + read_nb + "\t" + str(hits[h])[1:-1] + "\t")
            f.write("\n")
    f.close()
    _create_abundance(options, tsv_file)


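# Expected --read-count format (an assumption inferred from the parsing
# above): one tab-separated "contig_id<TAB>read_count" pair per line, e.g.
#   contig_1	123
# which yields read_nb == "123" for query_id "contig_1".
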
def _create_abundance(options, tsv_file):
    """
    Extract values from the TSV file
    and create the abundance files
    """
    log.info("Calculating abundance.")
    file_path = tsv_file
    abundance = dict()
    with open(tsv_file, 'r') as current_file:
        log.debug("Reading " + file_path)
        csv_reader = csv.reader(current_file, delimiter='\t')
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # header line
                line_count += 1
            else:
                # fully annotated rows have 16 columns
                if len(row) == 16:
                    if row[14] != "":
                        nb_reads = row[2]
                        if nb_reads == "":
                            current_reads_nb = 0
                            log.debug("No reads number for " + row[1])
                        else:
                            current_reads_nb = int(nb_reads)
                        taxonomy = row[14]  # abundance is aggregated per taxonomy string
                        if taxonomy in abundance:
                            # add reads and contigs to the existing taxonomy entry
                            abundance[taxonomy]["reads_nb"] = abundance[taxonomy]["reads_nb"] + current_reads_nb
                            abundance[taxonomy]["contigs_nb"] = abundance[taxonomy]["contigs_nb"] + 1
                        else:
                            # initialize counters for this taxonomy
                            abundance[taxonomy] = {}
                            abundance[taxonomy]["reads_nb"] = current_reads_nb
                            abundance[taxonomy]["contigs_nb"] = 1
                    else:
                        log.debug("No annotations for contig " + row[1])
                else:
                    log.debug("No annotations for contig " + row[1])
    log.debug(abundance)
    reads_file = open(options.output + "/blast2tsv_reads.txt", "w+")
    for taxo in abundance:
        reads_file.write(str(abundance[taxo]["reads_nb"]))
        reads_file.write("\t")
        reads_file.write("\t".join(taxo.split(";")))
        reads_file.write("\n")
    reads_file.close()
    log.info("Abundance file created " + options.output + "/blast2tsv_reads.txt")
    contigs_file = open(options.output + "/blast2tsv_contigs.txt", "w+")
    for taxo in abundance:
        contigs_file.write(str(abundance[taxo]["contigs_nb"]))
        contigs_file.write("\t")
        contigs_file.write("\t".join(taxo.split(";")))
        contigs_file.write("\n")
    contigs_file.close()
    log.info("Abundance file created " + options.output + "/blast2tsv_contigs.txt")


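# Example (illustrative): two annotated contigs sharing the taxonomy string
# "Viruses;Riboviria" with 120 and 30 mapped reads produce
#   blast2tsv_reads.txt:    150	Viruses	Riboviria
#   blast2tsv_contigs.txt:  2	Viruses	Riboviria
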
def _set_options():
    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--xml', help='XML file with BLAST results.', action='store', required=True, dest='xml_file')
    parser.add_argument('-rn', '--read-count', help='Tab-delimited file associating seqID with read number.', action='store', dest='rn_file')
    parser.add_argument('-c', '--contigs', help='FASTA file with contig sequences.', action='store', required=True, dest='fasta_file')
    parser.add_argument('-me', '--max_evalue', help='Maximum e-value.', action='store', type=float, default=0.0001, dest='max_evalue')
    parser.add_argument('-qov', '--min_query_overlap', help='Minimum query overlap.', action='store', type=int, default=5, dest='min_qov')
    parser.add_argument('-mhov', '--min_hit_overlap', help='Minimum hit overlap.', action='store', type=int, default=5, dest='min_hov')
    parser.add_argument('-s', '--min_score', help='Minimum score.', action='store', type=int, default=30, dest='min_score')
    parser.add_argument('-a', '--algo', help='BLAST flavour (BLASTN|BLASTP|BLASTX|TBLASTX|TBLASTN|DIAMONDX).', action='store', type=str, default='BLASTX', dest='algo')
    parser.add_argument('-o', '--out', help='The output directory.', action='store', type=str, default='./blast2tsv', dest='output')
    parser.add_argument('-v', '--verbosity', help='Verbose level.', action='store', type=int, choices=[1, 2, 3, 4], default=1)
    args = parser.parse_args()
    return args


def _set_log_level(verbosity):
    # only levels 1 (INFO) and 3 (DEBUG) configure logging;
    # levels 2 and 4 currently leave the logging defaults in place
    if verbosity == 1:
        log_format = '%(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.INFO, format=log_format)
    elif verbosity == 3:
        log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
        log.basicConfig(level=log.DEBUG, format=log_format)


if __name__ == "__main__":
    main()
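
# Example invocation (file names are illustrative, flags are the ones
# defined in _set_options above):
#   python blast2tsv.py -x blast_results.xml -c contigs.fasta \
#       -rn read_counts.tsv -a BLASTX -o ./blast2tsv -v 1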