comparison rps2tsv.py @ 0:b82ce29791e7 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/virAnnot commit ab5e1189217b6ed5f1c5d7c5ff6b79b6a4c18cff
author iuc
date Wed, 21 Aug 2024 13:12:59 +0000
parents
children 0c7f6833f9ad
comparison
equal deleted inserted replaced
-1:000000000000 0:b82ce29791e7
1 #!/usr/bin/env python3
2
3
4 # Name: rps2ecsv
5 # Author: Marie Lefebvre - INRAE
6 # Aims: Convert rpsblast xml output to csv and add taxonomy
7
8
9 import argparse
10 import json
11 import logging as log
12 from urllib import request
13 from urllib.error import HTTPError, URLError
14
15 from Bio.Blast import NCBIXML
16 from ete3 import NCBITaxa
17
18 ncbi = NCBITaxa()
19
20
21 def main():
22 options = _set_options()
23 _set_log_level(options.verbosity)
24 hits = _read_xml(options)
25 _write_tsv(options, hits)
26
27
28 def _read_xml(options):
29 """
30 Parse XML RPSblast results file
31 """
32 log.info("Read XML file " + options.xml_file)
33 xml = open(options.xml_file, 'r')
34 records = NCBIXML.parse(xml)
35 xml_results = {}
36 for blast_record in records:
37 for aln in blast_record.alignments:
38 for hit in aln.hsps:
39 hsp = {}
40 hit_evalue = hit.expect
41 if hit_evalue > options.max_evalue:
42 continue
43 hit_frame = hit.frame[0] # frame
44 hit_evalue = hit.expect # evalue
45 hit_startQ = hit.query_start
46 hit_endQ = hit.query_end
47 hsp["frame"] = hit_frame
48 hsp["evalue"] = hit_evalue
49 hsp["startQ"] = hit_startQ
50 hsp["endQ"] = hit_endQ
51 hsp["query_id"] = blast_record.query_id
52 hsp["cdd_id"] = aln.hit_def.split(",")[0]
53 hsp["hit_id"] = aln.hit_id
54 hsp["query_length"] = blast_record.query_length # length of the query
55 hsp["description"] = aln.hit_def
56 hsp["accession"] = aln.accession
57 hsp["pfam_id"] = hsp["description"].split(",")[0].replace("pfam", "PF")
58 log.info("Requeting Interpro for " + hsp["pfam_id"])
59 url = "https://www.ebi.ac.uk/interpro/api/taxonomy/uniprot/entry/pfam/" + hsp["pfam_id"]
60 req = request.Request(url)
61 try:
62 response = request.urlopen(req)
63 except HTTPError as e:
64 log.debug('Http error for interpro: ', e.code)
65 except URLError as e:
66 log.debug('Url error for interpro: ', e.reason)
67 else:
68 encoded_response = response.read()
69 decoded_response = encoded_response.decode()
70 payload = json.loads(decoded_response)
71 kingdoms = []
72 for item in payload["results"][:6]:
73 if item["metadata"]["parent"] is not None:
74 lineage_parent = item["metadata"]["parent"]
75 translation = ncbi.get_taxid_translator([int(lineage_parent)])
76 names = list(translation.values())
77 if len(names) > 0:
78 if names[0] == "root":
79 taxonomy = names[1:] # remove 'root' at the begining
80 else:
81 taxonomy = names
82 else:
83 taxonomy = names
84 if len(taxonomy) != 0:
85 kingdoms.append(taxonomy[0])
86 frequency = {kingdom: kingdoms.count(kingdom) for kingdom in kingdoms} # {'Pseudomonadota': 9, 'cellular organisms': 4}
87 sorted_freq = dict(sorted(frequency.items(), key=lambda x: x[1], reverse=True))
88 concat_freq = ";".join("{}({})".format(k, v) for k, v in sorted_freq.items())
89 hsp["taxonomy"] = concat_freq
90 xml_results[hsp["query_id"]] = hsp
91 return xml_results
92
93
94 def _write_tsv(options, hits):
95 """
96 Write output
97 """
98 log.info("Write output file " + options.output)
99 headers = "#query_id\tquery_length\tcdd_id\thit_id\tevalue\tstartQ\tendQ\tframe\tdescription\tsuperkingdom\n"
100 f = open(options.output, "w+")
101 f.write(headers)
102 for h in hits:
103 f.write(h + "\t" + str(hits[h]["query_length"]) + "\t")
104 f.write(hits[h]["cdd_id"] + "\t" + hits[h]["hit_id"] + "\t" + str(hits[h]["evalue"]) + "\t")
105 f.write(str(hits[h]["startQ"]) + "\t" + str(hits[h]["endQ"]) + "\t" + str(hits[h]["frame"]) + "\t")
106 f.write(hits[h]["description"] + "\t" + hits[h]["taxonomy"])
107 f.write("\n")
108 f.close()
109
110
111 def _set_options():
112 parser = argparse.ArgumentParser()
113 parser.add_argument('-x', '--xml', help='XML files with results of blast', action='store', required=True, dest='xml_file')
114 parser.add_argument('-e', '--max_evalue', help='Max evalue', action='store', type=float, default=0.0001, dest='max_evalue')
115 parser.add_argument('-o', '--out', help='The output file (.tab).', action='store', type=str, default='./rps2tsv_output.tab', dest='output')
116 parser.add_argument('-v', '--verbosity', help='Verbose level', action='store', type=int, choices=[1, 2, 3, 4], default=1)
117 args = parser.parse_args()
118 return args
119
120
121 def _set_log_level(verbosity):
122 if verbosity == 1:
123 log_format = '%(asctime)s %(levelname)-8s %(message)s'
124 log.basicConfig(level=log.INFO, format=log_format)
125 elif verbosity == 3:
126 log_format = '%(filename)s:%(lineno)s - %(asctime)s %(levelname)-8s %(message)s'
127 log.basicConfig(level=log.DEBUG, format=log_format)
128
129
130 if __name__ == "__main__":
131 main()