annotate blast2html.py @ 3:b98111afad32 branch

branch
author Jan Kanis <jan.code@jankanis.nl>
date Thu, 23 Jul 2015 11:34:25 +0200
parents 9272a08cb8fe
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
1 #!/usr/bin/env python3
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
2 # -*- coding: utf-8 -*-
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
3
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
4 # Actually this program works with both python 2 and 3, tested against python 2.6
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
5
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
6 # Copyright The Hyve B.V. 2014-2015
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
7 # License: GPL version 3 or (at your option) any higher version
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
8
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
9 from __future__ import unicode_literals, division, print_function
1
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
10 import sys
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
11 import argparse
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
12 import six, codecs, io
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
13 from os import path
0
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
14
1
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
15 # print("Hello World")
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
16 # print("The tool is working!")
0
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
17
Jan Kanis <jan.code@jankanis.nl>
parents:
diff changeset
18
1
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
def main(argv=None):
    """Parse the command line and write the placeholder HTML page.

    The input and template files are opened by argparse (so that missing or
    unreadable files are reported as ordinary usage errors) and closed again;
    their contents are not read.  The page written to the output is a fixed
    "Hello World" document.

    Args:
        argv: Optional list of command line arguments, excluding the program
            name.  When None (the default) argparse reads ``sys.argv[1:]``.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``--help``.
    """
    default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja')

    parser = argparse.ArgumentParser(
        description="Convert a BLAST XML result into a nicely readable html page",
        usage="{0} [-i] INPUT [-o OUTPUT] [--genelink-template URL_TEMPLATE] [--dbname DBNAME]".format(sys.argv[0]))
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('positional_arg', metavar='INPUT', nargs='?', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file, same as -i/--input')
    input_group.add_argument('-i', '--input', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file')
    parser.add_argument('-o', '--output', type=argparse.FileType(mode='w'), default=sys.stdout,
                        help='The output html file')
    # We just want the file name here, so jinja can open the file
    # itself. But it is easier to just use a FileType so argparse can
    # handle the errors. This introduces a small race condition when
    # jinja later tries to re-open the template file, but we don't
    # care too much.
    parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template,
                        help='The template file to use. Defaults to blast2html.html.jinja')

    parser.add_argument('--dbname', type=str, default='Gene Bank',
                        help="The link text to use for external links to a gene bank database. Defaults to 'Gene Bank'")
    parser.add_argument('--genelink-template', metavar='URL_TEMPLATE',
                        default='http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign',
                        help="""A link template to link hits to a gene bank webpage. The template string is a
                        Python format string. It can contain the following replacement elements: {id[N]}, {fullid},
                        {defline[N]}, {fulldefline}, {accession}, where N is a number. id[N] and defline[N] will be
                        replaced by the Nth element of the id or defline, where '|' is the field separator.

                        The default is 'http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign',
                        which is a link to the NCBI nucleotide database.""")

    parser.add_argument('--db-config-dir',
                        help="""The directory where databases are configured in blastdb*.loc files. These files
                        are consulted for creating a gene bank link. The files should conform to the format that
                        Galaxy's BLAST expect, i.e. tab-separated tables (with lines starting with '#' ignored),
                        with two extra fields, for a total of five fields per line instead of three.. The third
                        field of each line should be a database path as used by BLAST. The fourth field is the
                        human readable database name, and the fifth a template link to the gene bank conforming
                        to the syntax for the --genelink-template option. Entries in these config files override
                        links specified using --genelink-template and --dbname.""")

    args = parser.parse_args(argv)
    # The positional INPUT and -i/--input are interchangeable; normalise on
    # args.input so the rest of the function only looks in one place.
    if args.input is None:
        args.input = args.positional_arg
    if args.input is None:
        parser.error('no input specified')

    if sys.version_info[0] == 2:
        # The argparse.FileType wrapper doesn't support an encoding
        # argument, so for python 2 we need to wrap or reopen the
        # output. The input files are already read as utf-8 by the
        # respective libraries.
        #
        # One option is using codecs, but the codecs' writelines()
        # method doesn't support streaming but collects all output and
        # writes at once (see Python issues #5445 and #21910). On the
        # other hand the io module is slower (though not
        # significantly).
        if args.output is sys.stdout:
            # stdout cannot be reopened by name (its name is '<stdout>'),
            # so wrap the existing descriptor and leave it open on close.
            args.output = io.open(sys.stdout.fileno(), 'w', encoding='utf-8', closefd=False)
        else:
            args.output.close()
            args.output = io.open(args.output.name, 'w', encoding='utf-8')

    # The template was only opened so argparse could validate it.
    args.template.close()

    try:
        args.output.write("<html><title>Hello World!</title><body><h1>Hello World!</h1><p>It works!</p></body></html>\n")
    finally:
        # Never close the process-wide standard streams; just flush stdout.
        if args.output is sys.stdout:
            args.output.flush()
        else:
            args.output.close()
        if args.input is not sys.stdin:
            args.input.close()
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
96
9272a08cb8fe update v2
Jan Kanis <jan.code@jankanis.nl>
parents: 0
diff changeset
# Run the command line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()