Mercurial > repos > dfornika > blast_report_basic
changeset 0:5dfd84907521 draft
planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit bc359460bb66db7946cc68ccbd47cd479624c4a1-dirty
author | dfornika |
---|---|
date | Tue, 03 Mar 2020 00:14:34 +0000 |
parents | |
children | a63f676fe808 |
files | blast_report.py blast_report_basic.xml templates/template1.tmpl templates/template2.tmpl tool-data/blast_reference_bins.loc.sample tool-data/blast_report_templates.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 8 files changed, 560 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
'''Report on BLAST results.

python blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] [-r]
'''
import argparse
import re
import sys


# Zero-based indexes of the tabular BLAST columns this script reads.
PIDENT_COL = 2
DESCR_COL = 25
SUBJ_ID_COL = 12
SCORE_COL = 11
PCOV_COL = 24


def stop_err(msg):
    '''Write msg to stderr and exit with status 1.'''
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


class BLASTBin:
    '''A labelled collection of accessions loaded from a file.

    Each line of the file contributes one key to self.dict: the text before
    the first '.' (i.e. the accession without its version suffix).
    '''

    def __init__(self, label, file):
        self.label = label
        self.dict = {}

        with open(file) as file_in:
            for line in file_in:
                self.dict[line.rstrip().split('.')[0]] = ''

    def __str__(self):
        return "label: %s dict: %s" % (self.label, str(self.dict))


class BLASTQuery:
    '''All retained matches and filter statistics for one query id.'''

    def __init__(self, query_id):
        self.query_id = query_id
        self.matches = []
        self.match_accessions = {}
        self.bins = {}  # {bin label: [indexes into self.matches]}
        self.pident_filtered = 0
        self.kw_filtered = 0
        self.kw_filtered_breakdown = {}  # {kw: count}

    def __str__(self):
        # Report this query's own bin labels; self.bins is keyed by label.
        return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \
            % (self.query_id,
               str(len(self.matches)),
               str(list(self.bins)),
               str(self.pident_filtered),
               str(self.kw_filtered),
               str(self.kw_filtered_breakdown))


class BLASTMatch:
    '''One retained BLAST hit, as exposed to the report template.'''

    def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
        self.subject_acc = subject_acc
        self.subject_descr = subject_descr
        self.score = score
        self.p_cov = p_cov
        self.p_ident = p_ident
        self.bins = subject_bins

    def __str__(self):
        return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
            % (self.subject_acc,
               self.subject_descr,
               str(self.score),
               str(round(self.p_cov, 2)),
               str(round(self.p_ident, 2)))


def parse_args(argv=None):
    '''Parse the command line (sys.argv[1:] when argv is None).

    The four file arguments are declared as positionals so argparse itself
    reports a missing one; options may appear before or after them.
    '''
    parser = argparse.ArgumentParser(description='Report on BLAST results.')
    parser.add_argument('input_tab', help='tabular BLAST results to read')
    parser.add_argument('cheetah_tmpl', help='Cheetah template used to render the HTML report')
    parser.add_argument('output_html', help='path of the HTML report to write')
    parser.add_argument('output_tab', help='path of the filtered tabular results to write')
    parser.add_argument('-f', '--filter',
                        dest='filter',
                        help='[filter_pident]:[filterkw1,...,filterkwN]')
    parser.add_argument('-b', '--bins',
                        dest='bins',
                        help='bin1_label=bin1_path[,...binN_label=binN_path]')
    parser.add_argument('-r', '--redundant',
                        dest='redundant',
                        default=False,
                        action='store_true',
                        help='keep only the first hit per query and accession')
    return parser.parse_args(argv)


def parse_filter(filter_arg):
    '''Split a "[filter_pident]:[kw1,...,kwN]" option value.

    Returns (filter_pident, filter_kws). A missing option or an empty
    identity part yields 0, which filters nothing; blank keywords are dropped
    and the remaining ones are stripped of surrounding whitespace.
    '''
    if filter_arg is None:
        return 0, []
    # partition() keeps a value without ':' from being reused as a keyword.
    pident_text, _, kws_text = filter_arg.partition(':')
    filter_pident = 0
    if pident_text.strip():
        filter_pident = float(pident_text)
    filter_kws = [kw.strip() for kw in kws_text.split(',') if kw.strip()]
    return filter_pident, filter_kws


def load_bins(bins_arg):
    '''Build the BLASTBin list from "label=path[,label=path...]".'''
    if not bins_arg:
        return []
    return [BLASTBin(label_file.split('=')[0], label_file.split('=')[-1])
            for label_file in bins_arg.split(',')]


def subject_accessions(subject_id):
    '''Return every 4th '|'-separated field of subject_id, starting at the 4th.

    For "gi|1|ref|NR_1.1|;gi|2|gb|AB2.1|" this gives ['NR_1.1', 'AB2.1'].
    '''
    return subject_id.split('|')[1::2][1::2]


def process_hits(lines, bins, filter_pident, filter_kws, redundant, kept_out):
    '''Filter and bin tabular BLAST lines, grouping them by query.

    lines         -- iterable of tab-separated BLAST result lines
    bins          -- list of BLASTBin objects (may be empty)
    filter_pident -- hits with a lower percentage identity are dropped
    filter_kws    -- regular expressions (matched case-insensitively against
                     the description column); a hit matching any is dropped
    redundant     -- when true, keep only the first hit seen for each query
                     and first accession
    kept_out      -- writable file-like object; every retained line is
                     written to it unchanged

    Returns the list of BLASTQuery objects in order of first appearance.
    Calls stop_err() on a line that has too few columns or no accession.
    '''
    # Compile once instead of re-searching the raw pattern for every row.
    kw_patterns = [(kw, re.compile(kw, re.IGNORECASE)) for kw in filter_kws]
    queries = []
    query = None

    for line in lines:
        # Strip the line terminator so it does not end up in the description.
        row = line.rstrip('\r\n')
        if not row:
            continue
        cols = row.split('\t')
        if len(cols) <= DESCR_COL:
            stop_err("Expected at least %d tab-separated columns but found %d: %s"
                     % (DESCR_COL + 1, len(cols), row))

        if query is None or cols[0] != query.query_id:
            query = BLASTQuery(cols[0])
            queries.append(query)

        accs = subject_accessions(cols[SUBJ_ID_COL])
        if not accs:
            stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])

        # hsp option: keep best (first) hit only for each query and accession id.
        if redundant:
            if accs[0] in query.match_accessions:
                continue
            query.match_accessions[accs[0]] = ''

        # FILTER BY PIDENT (a threshold of 0 never filters anything)
        p_ident = float(cols[PIDENT_COL])
        if p_ident < filter_pident:
            query.pident_filtered += 1
            continue

        # FILTER BY KEY WORDS: every matching keyword is counted, the hit once.
        descrs = cols[DESCR_COL]
        matched_kws = [kw for kw, pattern in kw_patterns if pattern.search(descrs)]
        if matched_kws:
            for kw in matched_kws:
                query.kw_filtered_breakdown[kw] = query.kw_filtered_breakdown.get(kw, 0) + 1
            query.kw_filtered += 1
            continue
        descr = descrs.split(';')[0]

        # ATTEMPT BIN: a hit joins a bin when any of its accessions
        # (version suffix removed) is listed in that bin.
        subj_bins = []
        match_index = len(query.matches)
        for blast_bin in bins:
            if any(acc.split('.')[0] in blast_bin.dict for acc in accs):
                query.bins.setdefault(blast_bin.label, []).append(match_index)
                subj_bins.append(blast_bin.label)

        score = int(float(cols[SCORE_COL]))
        p_cov = float(cols[PCOV_COL])

        # SAVE RESULT
        query.matches.append(
            BLASTMatch(accs[0], descr, score, p_cov, p_ident, subj_bins)
        )
        kept_out.write(line)

    return queries


def render_report(queries, cheetah_tmpl, output_html):
    '''Render queries through the Cheetah template into output_html.'''
    # Imported here so the parsing helpers above remain importable on
    # systems where Cheetah is not installed.
    from Cheetah.Template import Template

    namespace = {'queries': queries}
    html = Template(file=cheetah_tmpl, searchList=[namespace])
    with open(output_html, 'w') as out_html:
        out_html.write(str(html))


def main(argv=None):
    '''Command-line entry point: filter the hits, then write both outputs.'''
    args = parse_args(argv)

    # BINS
    bins = load_bins(args.bins)
    print('database bins: %s' % str([blast_bin.label for blast_bin in bins]))

    # FILTERS
    filter_pident, filter_kws = parse_filter(args.filter)
    print('filter_pident: %s filter_kws: %s' % (str(filter_pident), str(filter_kws)))

    if args.redundant:
        print('Throwing out redundant hits...')

    # RESULTS!
    with open(args.input_tab) as tab_in, open(args.output_tab, 'w') as tab_out:
        queries = process_hits(tab_in, bins, filter_pident, filter_kws,
                               args.redundant, tab_out)

    render_report(queries, args.cheetah_tmpl, args.output_html)


if __name__ == '__main__':
    main()
<tool id="blast_report_basic" name="BLAST report" version="0.1.0+galaxy0" >
  <description>Produce an HTML table report of BLAST results</description>
  <!-- NOTE(review): blast_report.py parses -b as label=path[,label=path...];
       confirm that the values selected from blast_reference_bins have that form. -->
  <command detect_errors="error_code">
    <![CDATA[
      python '${__tool_directory__}/blast_report.py'
        -f '${filter_pident}:${filter_kws}'
      #if str($bins) != "None"
        -b "${bins}"
      #end if
      $discard_redundant
      '${tabular_blast_report_input}'
      '${__tool_directory__}/templates/template2.tmpl'
      '${out_html}'
      '${out_tab}'
    ]]>
  </command>
  <inputs>
    <param name="tabular_blast_report_input" type="data" format="tabular" label="Tabular BLAST results (extended 26 columns)"/>
    <param name="filter_pident" type="integer" min="90" max="100" value="97" label="Minimum percentage identity"/>
    <param name="filter_kws" type="text" size="50" label="Comma-separated list of description keyword filters" value="bovine,clone,environmental,swine,uncultivated,uncultured,unidentified"/>
    <param name="bins" type="select" label="Database bins" multiple="true" display="checkboxes">
      <options from_data_table="blast_reference_bins">
        <validator type="no_options" message="No BLAST reference bins available" />
      </options>
    </param>
    <param name="discard_redundant" type="boolean" truevalue="-r" falsevalue="" label="Throw out redundant hits?"/>
  </inputs>
  <outputs>
    <data name="out_html" format="html" label="${tool.name} on data ${tabular_blast_report_input.hid}: report"/>
    <data name="out_tab" format="tabular" label="${tool.name} on data ${tabular_blast_report_input.hid}: tabular results"/>
  </outputs>
  <help>

.. class:: infomark

**What it does**

This tool produces an HTML report for each query in a tabular BLAST file.

----

**Tabular BLAST results**

BLAST results for one or more queries in extended 26 column tabular format.

----

**Report template**

The report template dictates the format of the HTML report.
Note that changing the template from the standard "Top 20 hits shown, toggle remainder" to "Euzby results shown first" causes
the order of the results in the HTML report and the tabular BLAST results (if outputted) to be inconsistent with each other.

----

**Minimum percentage identity**

Filter by percentage identity. This filter is applied before the description keyword filters.

----

**Comma-separated list of description keyword filters**

Filter by description keywords. Do not include spaces (unless your keyword is two words). These are applied
after the percentage identity filter.

----

**Database bins**

Bin the results by accession number into "database bins."

----

**Throw out redundant hits?**

Only the first hit for any accession number will be reported.

----

**Output tabular BLAST results?**

This option produces a tabular BLAST file with the same results as those shown in the report.

  </help>
</tool>
## HTML report of BLAST matches, one table per query.
## Expects $queries in the search list: each item provides query_id, matches,
## bins, pident_filtered, kw_filtered and kw_filtered_breakdown.
#silent import time
## Number of matches shown before the remainder is folded into a hidden <tbody>.
#set $display_m = 20
#set $header = '<tr class="header"><th>Accession</th><th>Description</th><th>Score</th><th>% Coverage</th><th>% Identity</th></tr>'
<html>
  <head>
    <style>
      body {
        font-size:0.75em;
      }
      table, tr {
        width: 100%;
      }
      table {
        border-collapse: collapse;
        border: 1px solid black;
      }
      tr.header {
        background-color: lightgrey;
      }
      th {
        border: 1px solid black;
      }
      td {
        border-left: 1px solid black;
        border-right: 1px solid black;
        border-bottom: 1px dashed grey;
      }
      td.descr {
        font-size: 80%;
      }
      h3 {
        page-break-before: always;
        color: blue;
      }
      h3.first {
        page-break-before: avoid;
      }
      span.super {
        color: navy;
        font-size: 75%;
        vertical-align: top;
      }
    </style>
    <script>
      function toggle(id){
        var element = document.getElementById(id)
        console.log(id)
        if (element.style.display == 'none') {
          //console.log(element.tagName);
          if (element.tagName == 'TBODY') element.style.display = 'table-row-group';
          else if (element.tagName == 'TD') element.style.display = 'table-cell';
          else element.style.display = 'block';
        } else {
          element.style.display = 'none';
        }
      }
    </script>
  </head>
  <body>
    #set $q = 0
    #for $query in $queries
      ## Number this query's bins from 1; the numbers are shown as superscripts.
      #set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)])
      #set $m = 0
      <h3 id="${query.query_id}" #if $q == 0 then'class="first"' else '' #>$query.query_id</h3>
      <br/>
      <table id="${query.query_id}_matches">
        #if len($query.matches) == 0:
          <tr class="header"><th colspan="5">No matches to report</th></tr>
      </table>
        #else:
          $header
          #for $match in $query.matches:
            ## Matches from index $display_m onwards go into a hidden <tbody>.
            #if $m == $display_m
          <tbody id="${query.query_id}_extra" style="display:none">
            #end if
            <tr>
              <td>$match.subject_acc <span class="super">#echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))#</span></td>
              <td class="descr">$match.subject_descr</td>
              <td>$match.score</td>
              <td>$match.p_cov</td>
              <td>$match.p_ident</td>
            </tr>
            #set $m += 1
          #end for
          ## The hidden <tbody> is only opened when there are more than
          ## $display_m matches, so only then close it and offer the toggle.
          #if $m > $display_m
          </tbody>
          <tr>
            <td id="${query.query_id}_show" align="center" colspan="5" >Displaying ${display_m}/$m matches. <a href="#${query.query_id}_extra" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Show the remaining results.</a></td>
            <td id="${query.query_id}_hide" align="center" colspan="5" style="display:none"><a href="#${query.query_id}" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Hide the last #echo $m - $display_m # results.</a></td>
          </tr>
          #end if
      </table>
          #if len($bin_symbols) > 0:
      <p>#echo ', '.join(['<span class="super">%s</span> %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#</p>
          #end if
        #end if
        #if $query.pident_filtered > 0:
      <p>$query.pident_filtered results filtered by % Identity.</p>
        #end if
        #if $query.kw_filtered > 0:
      <p>$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.</p>
        #end if
      <p>Report produced on #echo time.strftime("%d/%m/%Y") #.</p>
      <hr noshade size="1" color="blue">
      #set $q += 1
    #end for
  </body>
</html>
## HTML report of BLAST matches, one table per query, with the matches of the
## 'Euzby' bin (when the query has one) moved to the top of the table.
## Expects $queries in the search list: each item provides query_id, matches,
## bins, pident_filtered, kw_filtered and kw_filtered_breakdown.
#silent import time
## Number of matches shown before the remainder is folded into a hidden <tbody>.
#set $display_m = 20
#set $header = '<tr class="header"><th>Accession</th><th>Description</th><th>Score</th><th>% Coverage</th><th>% Identity</th></tr>'
<html>
  <head>
    <style>
      body {
        font-size:0.75em;
      }
      table, tr {
        width: 100%;
      }
      table {
        border-collapse: collapse;
        border: 1px solid black;
      }
      tr.header {
        background-color: lightgrey;
      }
      th {
        border: 1px solid black;
      }
      td {
        border-left: 1px solid black;
        border-right: 1px solid black;
        border-bottom: 1px dashed grey;
      }
      td.descr {
        font-size: 80%;
      }
      h3 {
        page-break-before: always;
        color: blue;
      }
      h3.first {
        page-break-before: avoid;
      }
      span.super {
        color: navy;
        font-size: 75%;
        vertical-align: top;
      }
    </style>
    <script>
      function toggle(id){
        var element = document.getElementById(id)
        console.log(id)
        if (element.style.display == 'none') {
          //console.log(element.tagName);
          if (element.tagName == 'TBODY') element.style.display = 'table-row-group';
          else if (element.tagName == 'TD') element.style.display = 'table-cell';
          else element.style.display = 'block';
        } else {
          element.style.display = 'none';
        }
      }
    </script>
  </head>
  <body>
    #set $q = 0
    #for $query in $queries
      ## Number this query's bins from 1; the numbers are shown as superscripts.
      #set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)])
      #set $m = 0
      <h3 id="${query.query_id}" #if $q == 0 then'class="first"' else '' #>$query.query_id</h3>
      <br/>
      <table id="${query.query_id}_matches">
        ## Stays -1 when the query has no 'Euzby' bin, so no separator row is drawn.
        #set $num_of_euzby = -1
        #if len($query.matches) == 0:
          <tr class="header"><th colspan="5">No matches to report</th></tr>
      </table>
        #else:
          $header
          ## Move the 'Euzby' matches to the front, keeping their relative order.
          ## Popping from the highest index down keeps the remaining indexes valid.
          #try
            #set $priority = $query.bins['Euzby']
            #set $front = []
            #for $i in reversed($priority)
              #silent $front.append($query.matches.pop($i))
            #end for
            #set $num_of_euzby = len($front)
            #silent $front.reverse()
            #silent $front.extend($query.matches)
            #set $query.matches = $front
          #except
            #pass
          #end try
          #for $match in $query.matches:
            ## Matches from index $display_m onwards go into a hidden <tbody>.
            #if $m == $display_m
          <tbody id="${query.query_id}_extra" style="display:none">
            #end if
            ##if $m>0 and set($match.bins)!=set($query.matches[m-1].bins)
            ##put an empty line to separate Euzby records from other records
            #if $m==$num_of_euzby and $m>0
            <tr><td align="center" colspan="5"> </td></tr>
            #end if
            <tr>
              <td>$match.subject_acc <span class="super">#echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))#</span></td>
              <td class="descr">$match.subject_descr</td>
              <td>$match.score</td>
              <td>$match.p_cov</td>
              <td>$match.p_ident</td>
            </tr>
            #set $m += 1
          #end for
          ## The hidden <tbody> is only opened when there are more than
          ## $display_m matches, so only then close it and offer the toggle.
          #if $m > $display_m
          </tbody>
          <tr>
            <td id="${query.query_id}_show" align="center" colspan="5" >Displaying ${display_m}/$m matches. <a href="#${query.query_id}_extra" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Show the remaining results.</a></td>
            <td id="${query.query_id}_hide" align="center" colspan="5" style="display:none"><a href="#${query.query_id}" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Hide the last #echo $m - $display_m # results.</a></td>
          </tr>
          #end if
      </table>
          #if len($bin_symbols) > 0:
      <p>#echo ', '.join(['<span class="super">%s</span> %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#</p>
          #end if
        #end if
        #if $query.pident_filtered > 0:
      <p>$query.pident_filtered results filtered by % Identity.</p>
        #end if
        #if $query.kw_filtered > 0:
      <p>$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.</p>
        #end if
      <p>Report produced on #echo time.strftime("%d/%m/%Y") #.</p>
      <hr noshade size="1" color="blue">
      #set $q += 1
    #end for
  </body>
</html>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blast_reference_bins.loc.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,7 @@ +# Expect three columns, tab separated, as follows: +# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work) +# - name (Galaxy shows this in the UI) +# - path (Path to the blast reference bin file) +# +# e.g. +# f45ee89a-d456-469a-8aeb-54cdfea821ec<tab>16S Microbial NCBI<tab>/path/to/16S_Microbial_NCBI.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blast_report_templates.loc.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,7 @@ +# Expect three columns, tab separated, as follows: +# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work) +# - name (Galaxy shows this in the UI) +# - path (Path to the blast report template (cheetah format)) +# +# e.g. +# f45ee89a-d456-469a-8aeb-54cdfea821ec<tab>Default BLAST Report Template<tab>/path/to/template.tmpl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of BLAST reference bins in the required format --> + <table name="blast_reference_bins" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/blast_reference_bins.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,8 @@ +<?xml version="1.0"?> +<tables> + <!-- Locations of BLAST report templates in the required format --> + <table name="blast_report_templates" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/blast_report_templates.loc" /> + </table> +</tables>