Mercurial > repos > jankanis > blast2html
comparison blast2html.py @ 98:4378d11f0ed7 draft
implement configurable gene bank links
| author | Jan Kanis <jan.code@jankanis.nl> |
|---|---|
| date | Mon, 30 Jun 2014 16:49:45 +0200 |
| parents | 9fb1a7d67317 |
| children | 02b795b784e1 |
comparison
equal
deleted
inserted
replaced
| 97:df9fd5f35967 | 98:4378d11f0ed7 |
|---|---|
| 13 import warnings | 13 import warnings |
| 14 import six, codecs | 14 import six, codecs |
| 15 from six.moves import builtins | 15 from six.moves import builtins |
| 16 from os import path | 16 from os import path |
| 17 from itertools import repeat | 17 from itertools import repeat |
| 18 from collections import defaultdict | |
| 18 import argparse | 19 import argparse |
| 19 from lxml import objectify | 20 from lxml import objectify |
| 20 import jinja2 | 21 import jinja2 |
| 22 | |
| 23 builtin_str = str | |
| 24 str = six.text_type | |
| 21 | 25 |
| 22 | 26 |
| 23 | 27 |
| 24 _filters = dict(float='float') | 28 _filters = dict(float='float') |
| 25 def filter(func_or_name): | 29 def filter(func_or_name): |
| 73 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.""" | 77 """Split a hit.Hit_def that contains multiple titles up, splitting out the hit ids from the titles.""" |
| 74 id_titles = hit.Hit_def.text.split('>') | 78 id_titles = hit.Hit_def.text.split('>') |
| 75 | 79 |
| 76 titles = [] | 80 titles = [] |
| 77 for t in id_titles[1:]: | 81 for t in id_titles[1:]: |
| 78 fullid, title = t.split(' ', 1) | 82 id, title = t.split(' ', 1) |
| 79 hitid, id = fullid.split('|', 2)[1:3] | 83 titles.append(argparse.Namespace(Hit_id = id, |
| 80 titles.append(dict(id = id, | 84 Hit_def = title, |
| 81 hitid = hitid, | 85 Hit_accession = '', |
| 82 fullid = fullid, | 86 getroottree = hit.getroottree)) |
| 83 title = title)) | |
| 84 return titles | 87 return titles |
| 85 | 88 |
| 86 @filter | 89 @filter |
| 87 def hitid(hit): | 90 def hitid(hit): |
| 88 hitid = hit.Hit_id.text | 91 return str(hit.Hit_id) |
| 89 s = hitid.split('|', 2) | |
| 90 if len(s) >= 2: | |
| 91 return s[1] | |
| 92 return hitid | |
| 93 | |
| 94 @filter | |
| 95 def seqid(hit): | |
| 96 hitid = hit.Hit_id.text | |
| 97 s = hitid.split('|', 2) | |
| 98 if len(s) >= 3: | |
| 99 return s[2] | |
| 100 return hitid | |
| 101 | 92 |
| 102 | 93 |
| 103 @filter | 94 @filter |
| 104 def alignment_pre(hsp): | 95 def alignment_pre(hsp): |
| 105 """Create the preformatted alignment blocks""" | 96 """Create the preformatted alignment blocks""" |
| 175 return 'Plus' | 166 return 'Plus' |
| 176 elif frame == -1: | 167 elif frame == -1: |
| 177 return 'Minus' | 168 return 'Minus' |
| 178 raise Exception("frame should be either +1 or -1") | 169 raise Exception("frame should be either +1 or -1") |
| 179 | 170 |
| 180 def genelink(hit, type='genbank', hsp=None): | 171 # def genelink(hit, type='genbank', hsp=None): |
| 181 if not isinstance(hit, six.string_types): | 172 # if not isinstance(hit, six.string_types): |
| 182 hit = hitid(hit) | 173 # hit = hitid(hit) |
| 183 link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type) | 174 # link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type) |
| 184 if hsp != None: | 175 # if hsp != None: |
| 185 link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to']) | 176 # link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to']) |
| 186 return link | 177 # return link |
| 187 | 178 |
| 188 | 179 |
| 189 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139 | 180 # javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139 |
| 190 # I've removed the html escapes, since html escaping is already being performed by the template engine. | 181 # I've removed the html escapes, since html escaping is already being performed by the template engine. |
| 191 | 182 |
| 216 Javascript string literal escape. Note that this only escapes data | 207 Javascript string literal escape. Note that this only escapes data |
| 217 for embedding within javascript string literals, not in general | 208 for embedding within javascript string literals, not in general |
| 218 javascript snippets. | 209 javascript snippets. |
| 219 """ | 210 """ |
| 220 | 211 |
| 221 value = six.text_type(value) | 212 value = str(value) |
| 222 | 213 |
| 223 for bad, good in _js_escapes: | 214 for bad, good in _js_escapes: |
| 224 value = value.replace(bad, good) | 215 value = value.replace(bad, good) |
| 225 | 216 |
| 226 return value | 217 return value |
| 238 | 229 |
| 239 colors = ('black', 'blue', 'green', 'magenta', 'red') | 230 colors = ('black', 'blue', 'green', 'magenta', 'red') |
| 240 | 231 |
| 241 max_scale_labels = 10 | 232 max_scale_labels = 10 |
| 242 | 233 |
| 243 def __init__(self, input, templatedir, templatename): | 234 def __init__(self, input, templatedir, templatename, genelinks={}): |
| 244 self.input = input | 235 self.input = input |
| 245 self.templatename = templatename | 236 self.templatename = templatename |
| 237 self.genelinks = genelinks | |
| 246 | 238 |
| 247 self.blast = objectify.parse(self.input).getroot() | 239 self.blast = objectify.parse(self.input).getroot() |
| 248 self.loader = jinja2.FileSystemLoader(searchpath=templatedir) | 240 self.loader = jinja2.FileSystemLoader(searchpath=templatedir) |
| 249 self.environment = jinja2.Environment(loader=self.loader, | 241 self.environment = jinja2.Environment(loader=self.loader, |
| 250 lstrip_blocks=True, trim_blocks=True, autoescape=True) | 242 lstrip_blocks=True, trim_blocks=True, autoescape=True) |
| 273 ) | 265 ) |
| 274 | 266 |
| 275 result = template.render(blast=self.blast, | 267 result = template.render(blast=self.blast, |
| 276 iterations=self.blast.BlastOutput_iterations.Iteration, | 268 iterations=self.blast.BlastOutput_iterations.Iteration, |
| 277 colors=self.colors, | 269 colors=self.colors, |
| 278 genelink=genelink, | |
| 279 params=params) | 270 params=params) |
| 280 if six.PY2: | 271 if six.PY2: |
| 281 result = result.encode('utf-8') | 272 result = result.encode('utf-8') |
| 282 output.write(result) | 273 output.write(result) |
| 283 | 274 |
| 349 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))), | 340 totalscore = "{0:.1f}".format(sum(hsp_val('Hsp_bit-score'))), |
| 350 cover = "{0:.0%}".format(cover_count / query_length), | 341 cover = "{0:.0%}".format(cover_count / query_length), |
| 351 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))), | 342 e_value = "{0:.4g}".format(min(hsp_val('Hsp_evalue'))), |
| 352 # FIXME: is this the correct formula vv? | 343 # FIXME: is this the correct formula vv? |
| 353 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6 | 344 # float(...) because non-flooring division doesn't work with lxml elements in python 2.6 |
| 354 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps))), | 345 ident = "{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps)))) |
| 355 accession = hit.Hit_accession) | 346 |
| 347 @filter | |
| 348 def genelink(self, hit, text=None, clas=None, display_nolink=True): | |
| 349 if text is None: | |
| 350 text = hitid(hit) | |
| 351 db = hit.getroottree().getroot().BlastOutput_db | |
| 352 if isinstance(self.genelinks, six.string_types): | |
| 353 template = self.genelinks | |
| 354 else: | |
| 355 template = self.genelinks.get(db) | |
| 356 if template is None: | |
| 357 return text if display_nolink else '' | |
| 358 args = dict(id=hitid(hit).split('|'), | |
| 359 fullid=hitid(hit), | |
| 360 defline=str(hit.Hit_def).split('|'), | |
| 361 fulldefline=str(hit.Hit_def), | |
| 362 accession=str(hit.Hit_accession)) | |
| 363 try: | |
| 364 link = template.format(**args) | |
| 365 except Exception as e: | |
| 366 warnings.warn('Error in formatting gene bank link {} with {}: {}'.format(template, args, e)) | |
| 367 return text if display_nolink else '' | |
| 368 classattr = 'class="{}" '.format(jinja2.escape(clas)) if clas is not None else '' | |
| 369 return jinja2.Markup("<a {}href=\"{}\">{}</a>".format(classattr, jinja2.escape(link), jinja2.escape(text))) | |
| 370 | |
| 371 | |
| 372 def read_genelinks(dir): | |
| 373 links = {} | |
| 374 for f in ('blastdb.loc', 'blastdb_p.loc', 'blastdb_d.loc'): | |
| 375 try: | |
| 376 f = open(path.join(dir, f)) | |
| 377 for l in f.readlines(): | |
| 378 if l.strip().startswith('#'): | |
| 379 continue | |
| 380 line = l.split('\t') | |
| 381 try: | |
| 382 links[line[2]] = line[3] | |
| 383 except IndexError: | |
| 384 continue | |
| 385 f.close() | |
| 386 except OSError: | |
| 387 continue | |
| 388 if not links: | |
| 389 warnings.warn("No gene bank link templates found") | |
| 390 return links | |
| 356 | 391 |
| 357 | 392 |
| 358 def main(): | 393 def main(): |
| 359 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja') | 394 default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja') |
| 360 | 395 |
| 372 # handle the errors. This introduces a small race condition when | 407 # handle the errors. This introduces a small race condition when |
| 373 # jinja later tries to re-open the template file, but we don't | 408 # jinja later tries to re-open the template file, but we don't |
| 374 # care too much. | 409 # care too much. |
| 375 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template, | 410 parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template, |
| 376 help='The template file to use. Defaults to blast_html.html.jinja') | 411 help='The template file to use. Defaults to blast_html.html.jinja') |
| 377 | 412 |
| 413 dblink_group = parser.add_mutually_exclusive_group() | |
| 414 dblink_group.add_argument('--genelink-template', default='http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign', | |
| 415 help="""A link template to link hits to a gene bank webpage. The template string is a | |
| 416 Python format string. It can contain the following replacement elements: {id[N]}, {fullid}, | |
| 417 {defline[N]}, {fulldefline}, {accession}, where N is a number. id[N] and defline[N] will be | |
| 418 replaced by the Nth element of the id or defline, where '|' is the field separator. | |
| 419 | |
| 420 The default is 'http://www.ncbi.nlm.nih.gov/nucleotide/{accession}?report=genbank&log$=nuclalign', | |
| 421 which is a link to the NCBI nucleotide database.""") | |
| 422 | |
| 423 dblink_group.add_argument('--db-config-dir', | |
| 424 help="""The directory where databases are configured in blastdb*.loc files. These files | |
| 425 are consulted for creating a gene bank link. The files should be tab-separated tables (with lines | |
| 426 starting with '#' ignored), where the third field of a line should be a database path and the fourth | |
| 427 a genebank link template conforming to the --genelink-template option syntax. | |
| 428 | |
| 429 This option is incompatible with --genelink-template.""") | |
| 430 | |
| 378 args = parser.parse_args() | 431 args = parser.parse_args() |
| 379 if args.input == None: | 432 if args.input == None: |
| 380 args.input = args.positional_arg | 433 args.input = args.positional_arg |
| 381 if args.input == None: | 434 if args.input == None: |
| 382 parser.error('no input specified') | 435 parser.error('no input specified') |
| 384 templatedir, templatename = path.split(args.template.name) | 437 templatedir, templatename = path.split(args.template.name) |
| 385 args.template.close() | 438 args.template.close() |
| 386 if not templatedir: | 439 if not templatedir: |
| 387 templatedir = '.' | 440 templatedir = '.' |
| 388 | 441 |
| 389 b = BlastVisualize(args.input, templatedir, templatename) | 442 if args.db_config_dir is None: |
| 443 genelinks = args.genelink_template | |
| 444 elif not path.isdir(args.db_config_dir): | |
| 445 parser.error('db-config-dir does not exist or is not a directory') | |
| 446 else: | |
| 447 genelinks = read_genelinks(args.db_config_dir) | |
| 448 | |
| 449 b = BlastVisualize(args.input, templatedir, templatename, genelinks) | |
| 390 b.render(args.output) | 450 b.render(args.output) |
| 391 | 451 |
| 392 | 452 |
| 393 if __name__ == '__main__': | 453 if __name__ == '__main__': |
| 394 main() | 454 main() |
