Mercurial > repos > iuc > virannot_blast2tsv
comparison rps2tsv.py @ 0:b82ce29791e7 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
author | iuc |
---|---|
date | Wed, 21 Aug 2024 13:12:59 +0000 |
parents | |
children | 0c7f6833f9ad |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b82ce29791e7 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # Name: rps2ecsv | |
5 # Author: Marie Lefebvre - INRAE | |
6 # Aims: Convert rpsblast xml output to csv and add taxonomy | |
7 | |
8 | |
9 import argparse | |
10 import json | |
11 import logging as log | |
12 from urllib import request | |
13 from urllib.error import HTTPError, URLError | |
14 | |
15 from Bio.Blast import NCBIXML | |
16 from ete3 import NCBITaxa | |
17 | |
18 ncbi = NCBITaxa() | |
19 | |
20 | |
21 def main(): | |
22 options = _set_options() | |
23 _set_log_level(options.verbosity) | |
24 hits = _read_xml(options) | |
25 _write_tsv(options, hits) | |
26 | |
27 | |
28 def _read_xml(options): | |
29 """ | |
30 Parse XML RPSblast results file | |
31 """ | |
32 log.info("Read XML file " + options.xml_file) | |
33 xml = open(options.xml_file, 'r') | |
34 records = NCBIXML.parse(xml) | |
35 xml_results = {} | |
36 for blast_record in records: | |
37 for aln in blast_record.alignments: | |
38 for hit in aln.hsps: | |
39 hsp = {} | |
40 hit_evalue = hit.expect | |
41 if hit_evalue > options.max_evalue: | |
42 continue | |
43 hit_frame = hit.frame[0] # frame | |
44 hit_evalue = hit.expect # evalue | |
45 hit_startQ = hit.query_start | |
46 hit_endQ = hit.query_end | |
47 hsp["frame"] = hit_frame | |
48 hsp["evalue"] = hit_evalue | |
49 hsp["startQ"] = hit_startQ | |
50 hsp["endQ"] = hit_endQ | |
51 hsp["query_id"] = blast_record.query_id | |
52 hsp["cdd_id"] = aln.hit_def.split(",")[0] | |
53 hsp["hit_id"] = aln.hit_id | |
54 hsp["query_length"] = blast_record.query_length # length of the query | |
55 hsp["description"] = aln.hit_def | |
56 hsp["accession"] = aln.accession | |
57 hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF") | |
58 log.info("Requeting Interpro for " + hsp["pfam_id"]) | |
59 url = "https://www.ebi.ac.uk/interpro/api/taxonomy/uniprot/entry/pfam/" + hsp["pfam_id"] | |
60 req = request.Request(url) | |
61 try: | |
62 response = request.urlopen(req) | |
63 except HTTPError as e: | |
64 log.debug('Http error for interpro: ', e.code) | |
65 except URLError as e: | |
66 log.debug('Url error for interpro: ', e.reason) | |
67 else: | |
68 encoded_response = response.read() | |
69 decoded_response = encoded_response.decode() | |
70 payload = json.loads(decoded_response) | |
71 kingdoms = [] | |
72 for item in payload["results"][:6]: | |
73 if item["metadata"]["parent"] is not None: | |
74 lineage_parent = item["metadata"]["parent"] | |
75 translation = ncbi.get_taxid_translator([int(lineage_parent)]) | |
76 names = list(translation.values()) | |
77 if len(names) > 0: | |
78 if names[0] == "root": | |
79 taxonomy = names[1:] # remove 'root' at the begining | |
80 else: | |
81 taxonomy = names | |
82 else: | |
83 taxonomy = names | |
84 if len(taxonomy) != 0: | |
85 kingdoms.append(taxonomy[0]) | |
86 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4} | |
87 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True)) | |
88 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items()) | |
89 hsp["taxonomy"] = concat_freq | |
90 xml_results[hsp["query_id"]] = hsp | |
91 return xml_results | |
92 | |
93 | |
94 def _write_tsv(options, hits): | |
95 """ | |
96 Write output | |
97 """ | |
98 log.info("Write output file " + options.output) | |
99 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n" | |
100 f = open(options.output, "w+") | |
101 f.write(headers) | |
102 for h in hits: | |
103 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t") | |
104 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t") | |
105 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t") | |
106 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"]) | |
107 f.write("\n") | |
108 f.close() | |
109 | |
110 | |
111 def _set_options(): | |
112 parser = argparse.ArgumentParser() | |
113 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file') | |
114 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue') | |
115 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output') | |
116 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1) | |
117 args = parser.parse_args() | |
118 return args | |
119 | |
120 | |
121 def _set_log_level(verbosity): | |
122 if verbosity == 1: | |
123 log_format = '%(asctime)s %(levelname)-8s %(message)s' | |
124 log.basicConfig(level=log.INFO, format=log_format) | |
125 elif verbosity == 3: | |
126 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s' | |
127 log.basicConfig(level=log.DEBUG, format=log_format) | |
128 | |
129 | |
130 if __name__ == "__main__": | |
131 main() |