changeset 0:5dfd84907521 draft

planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit bc359460bb66db7946cc68ccbd47cd479624c4a1-dirty
author dfornika
date Tue, 03 Mar 2020 00:14:34 +0000
parents
children a63f676fe808
files blast_report.py blast_report_basic.xml templates/template1.tmpl templates/template2.tmpl tool-data/blast_reference_bins.loc.sample tool-data/blast_report_templates.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 8 files changed, 560 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_report.py	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+'''Report on BLAST results.
+
+python blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] [-r]
+'''
+import argparse
+import re
+import sys
+
+from Cheetah.Template import Template
+
+
+def stop_err( msg ):
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)
+
+
+class BLASTBin:
+    def __init__(self, label, file):
+        self.label = label
+        self.dict = {}
+        
+        file_in = open(file)
+        for line in file_in:
+            self.dict[line.rstrip().split('.')[0]] = ''
+        file_in.close()
+    
+    def __str__(self):
+        return "label: %s    dict: %s" % (self.label, str(self.dict))
+
+
+class BLASTQuery:
+    def __init__(self, query_id):
+        self.query_id = query_id
+        self.matches = []
+        self.match_accessions = {}
+        self.bins = {} #{bin(label):[match indexes]}
+        self.pident_filtered = 0
+        self.kw_filtered = 0
+        self.kw_filtered_breakdown = {} #{kw:count}
+        
+    def __str__(self):
+        return "query_id: %s    len(matches): %s    bins (labels only): %s    pident_filtered: %s    kw_filtered: %s    kw_filtered_breakdown: %s" \
+            % (self.query_id,
+               str(len(self.matches)),
+               str([bin.label for bin in bins]),
+               str(self.pident_filtered),
+               str(self.kw_filtered),
+               str(self.kw_filtered_breakdown))
+
+
+class BLASTMatch:
+    def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
+        self.subject_acc = subject_acc
+        self.subject_descr = subject_descr
+        self.score = score
+        self.p_cov = p_cov
+        self.p_ident = p_ident
+        self.bins = subject_bins
+        
+    def __str__(self):
+        return "subject_acc: %s    subject_descr: %s    score: %s    p-cov: %s    p-ident: %s" \
+            % (self.subject_acc,
+               self.subject_descr,
+               str(self.score),
+               str(round(self.p_cov,2)),
+               str(round(self.p_ident, 2)))
+
+
+
+#PARSE OPTIONS AND ARGUMENTS
+parser = argparse.ArgumentParser()
+
+parser.add_argument('-f', '--filter',
+                    type='string',
+                    dest='filter',
+                    )
+parser.add_argument('-b', '--bins',
+                    type='string',
+                    dest='bins'
+                    )
+parser.add_argument('-r', '--redundant',
+                    dest='redundant',
+                    default=False,
+                    action='store_true'
+                    )
+args = parser.parse_args()
+
+try:
+    input_tab, cheetah_tmpl, output_html, output_tab = args
+except:
+    stop_err('you must supply the arguments input_tab, cheetah_tmpl and output_html.')
+# print('input_tab: %s    cheetah_tmpl: %s    output_html: %s    output_tab: %s' % (input_tab, cheetah_tmpl, output_html, output_tab))
+
+
+#BINS
+bins=[]
+if args.bins != None:
+    bins = list([BLASTBin(label_file.split('=')[0],label_file.split('=')[-1]) for label_file in args.bins.split(',')])
+print('database bins: %s' % str([bin.label for bin in bins]))
+
+    #FILTERS
+filter_pident = 0
+filter_kws = []
+if args.filter != None:
+    pident_kws = args.filter.split(':')
+    filter_pident = float(pident_kws[0])
+    filter_kws = pident_kws[-1].split(',')
+print('filter_pident: %s    filter_kws: %s' % (str(filter_pident), str(filter_kws)))
+
+if args.redundant:
+    print('Throwing out redundant hits...')
+
+#RESULTS!
+PIDENT_COL = 2
+DESCR_COL = 25
+SUBJ_ID_COL = 12
+SCORE_COL = 11
+PCOV_COL = 24
+queries = []
+current_query = ''
+output_tab = open(output_tab, 'w')
+    
+with open(input_tab) as input_tab:
+    for line in input_tab:
+        cols = line.split('\t')
+        if cols[0] != current_query:
+            current_query = cols[0]
+            queries.append(BLASTQuery(current_query))
+
+        try:
+            accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2]
+        except IndexError as e:
+            stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])
+
+        #hsp option: keep best (first) hit only for each query and accession id.
+        if args.redundant:
+            if accs[0] in queries[-1].match_accessions:
+                continue #don't save the result and skip to the next
+            else:
+                queries[-1].match_accessions[accs[0]] = ''
+
+
+        p_ident = float(cols[PIDENT_COL])
+        #FILTER BY PIDENT
+        if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True
+            queries[-1].pident_filtered += 1
+            continue
+        
+        descrs = cols[DESCR_COL]
+        #FILTER BY KEY WORDS
+        filter_by_kw = False
+        for kw in filter_kws:
+            kw = kw.strip() #Fix by Damion D Nov 2013
+            if kw != '' and re.search(kw, descrs, re.IGNORECASE):
+                filter_by_kw = True
+                try:
+                    queries[-1].kw_filtered_breakdown[kw] += 1
+                except:
+                    queries[-1].kw_filtered_breakdown[kw] = 1
+        if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True
+            queries[-1].kw_filtered += 1
+            continue
+        descr = descrs.split(';')[0]
+        
+        #ATTEMPT BIN
+        subj_bins = []
+        for bin in bins: #if we are not binning, bins = [] so for loop not entered
+            for acc in accs:
+                if acc.split('.')[0] in bin.dict:
+                    try:
+                        queries[-1].bins[bin.label].append(len(queries[-1].matches))
+                    except:
+                        queries[-1].bins[bin.label] = [len(queries[-1].matches)]
+                    subj_bins.append(bin.label)
+                    break #this result has been binned to this bin so break
+        acc = accs[0]
+        
+        score = int(float(cols[SCORE_COL]))
+        p_cov = float(cols[PCOV_COL])
+        
+        #SAVE RESULT
+        queries[-1].matches.append(
+            BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)
+        )
+        output_tab.write(line)            
+input_tab.close()
+output_tab.close()
+
+'''
+for query in queries:
+    print(query)
+    for match in query.matches:
+        print('    %s' % str(match))
+    for bin in query.bins:
+        print('    bin: %s' % bin)
+        for x in query.bins[bin]:
+            print('        %s' % str(query.matches[x]))
+'''
+
+namespace = {'queries': queries}
+html = Template(file=cheetah_tmpl, searchList=[namespace])
+out_html = open(output_html, 'w')
+out_html.write(str(html))
+out_html.close()
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blast_report_basic.xml	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,86 @@
+<tool id="blast_report_basic" name="BLAST report" version="0.1.0+galaxy0" >
+    <description>Produce an HTML table report of BLAST results</description>
+    <!-- NOTE(review): blast_report.py parses the -b value as label=path[,label=path...].
+         "${bins}" expands to the selected data-table values; confirm that these carry
+         label=path pairs, otherwise the script cannot open the bin files. -->
+    <command detect_errors="error_code">
+        <![CDATA[
+          python '${__tool_directory__}/blast_report.py'
+            -f '${filter_pident}:${filter_kws}'
+            #if str($bins) != "None"
+              -b "${bins}"
+            #end if
+            $discard_redundant
+            '${tabular_blast_report_input}'
+            '${__tool_directory__}/templates/template2.tmpl'
+            '${out_html}'
+            '${out_tab}'
+        ]]>
+    </command>
+    <inputs>
+        <param name="tabular_blast_report_input" type="data" format="tabular" label="Tabular BLAST results (extended 26 columns)"/>
+        <param name="filter_pident" type="integer" min="90" max="100" value="97" label="Minimum percentage identity"/>
+        <param name="filter_kws" type="text" size="50" label="Comma-separated list of description keyword filters" value="bovine,clone,environmental,swine,uncultivated,uncultured,unidentified"/>
+        <param name="bins" type="select" label="Database bins" multiple="true" display="checkboxes">
+            <options from_data_table="blast_reference_bins">
+                <validator type="no_options" message="No BLAST reference bins available" />
+            </options>
+        </param>
+        <param name="discard_redundant" type="boolean" truevalue="-r" falsevalue="" label="Throw out redundant hits?"/>
+    </inputs>
+    <outputs>
+        <data name="out_html" format="html" label="${tool.name} on data ${tabular_blast_report_input.hid}: report"/>
+        <data name="out_tab" format="tabular" label="${tool.name} on data ${tabular_blast_report_input.hid}: tabular results"/>
+    </outputs>
+    <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool produces an HTML report for each query in a tabular BLAST file.
+
+----
+
+**Tabular BLAST results**
+
+BLAST results for one or more queries, in extended 26-column tabular format.
+
+----
+
+**Report template**
+
+The report template dictates the format of the HTML report.
+This tool uses a fixed template that lists hits belonging to the "Euzby" bin first. As a result, the order of
+the results in the HTML report and in the tabular BLAST output can be inconsistent with each other.
+
+----
+
+**Minimum percentage identity**
+
+Filter by percentage identity. This filter is applied before the description keyword filters.
+
+----
+
+**Comma-separated list of description keyword filters**
+
+Filter by description keywords. Do not include spaces (unless your keyword is two words). These are applied
+after the percentage identity filter.
+
+----
+
+**Database bins**
+
+Bin the results by accession number into "database bins."
+
+----
+
+**Throw out redundant hits?**
+
+Only the first hit for any accession number will be reported.
+
+----
+
+**Tabular output**
+
+A tabular BLAST file is always produced; it contains the same results as those shown in the report.
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/template1.tmpl	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,108 @@
+#silent import time
+#set $display_m = 20
+#set $header = '<tr class="header"><th>Accession</th><th>Description</th><th>Score</th><th>% Coverage</th><th>% Identity</th></tr>'
+<html>
+	<head>
+		<style>
+			body {
+				font-size:0.75em;
+			}
+			table, tr {
+				width: 100%;
+			}
+			table {
+				border-collapse: collapse;
+				border: 1px solid black;
+			}
+			tr.header {
+				background-color: lightgrey;
+			}
+			th {
+				border: 1px solid black;
+			}
+			td {
+				border-left: 1px solid black;
+				border-right: 1px solid black;
+				border-bottom: 1px dashed grey;
+			}
+			td.descr {
+				font-size: 80%;
+			}
+			h3 {
+				page-break-before: always;
+				color: blue;
+			}
+			h3.first {
+				page-break-before: avoid;
+			}
+			span.super {
+				color: navy;
+				font-size: 75%;
+				vertical-align: top;
+			}
+		</style>
+		<script>
+			function toggle(id){
+				var element = document.getElementById(id)
+				console.log(id)
+				if (element.style.display == 'none') {
+					//console.log(element.tagName);
+					if (element.tagName == 'TBODY') element.style.display = 'table-row-group';
+					else if (element.tagName == 'TD') element.style.display = 'table-cell';
+					else element.style.display = 'block';
+				} else {
+					element.style.display = 'none';
+				}
+			}
+		</script>
+	</head>
+	<body>
+		## One section per query: heading, table of matches, bin legend, filter summary.
+		#set $q = 0
+		#for $query in $queries
+			## Number this query's bins from 1; the number is printed next to each binned hit.
+			#set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)])
+			#set $m = 0
+			<h3 id="${query.query_id}" #if $q == 0 then'class="first"' else '' #>$query.query_id</h3>
+			<br/>
+			<table id="${query.query_id}_matches">
+			#if len($query.matches) == 0:
+				<tr class="header"><th colspan="5">No matches to report</th></tr>
+			</table>
+			#else:
+			$header
+			#for $match in $query.matches:
+				## Rows after the first $display_m go into a hidden tbody that toggle() reveals.
+				#if $m == $display_m
+				<tbody id="${query.query_id}_extra" style="display:none">
+				#end if
+				<tr>
+					<td>$match.subject_acc <span class="super">#echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))#</span></td>
+					<td class="descr">$match.subject_descr</td>
+					<td>$match.score</td>
+					<td>$match.p_cov</td>
+					<td>$match.p_ident</td>
+				</tr>
+			#set $m += 1
+			#end for
+			## The hidden tbody is only opened when there are more than $display_m rows,
+			## so only then close it and offer the show/hide links.
+			#if $m > $display_m
+			</tbody>
+			<tr>
+				<td id="${query.query_id}_show" align="center" colspan="5" >Displaying ${display_m}/$m matches. <a href="#${query.query_id}_extra" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Show the remaining results.</a></td>
+				<td id="${query.query_id}_hide" align="center" colspan="5" style="display:none"><a href="#${query.query_id}" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Hide the last #echo $m - $display_m # results.</a></td>
+			</tr>
+			#end if
+			</table>
+			#if len($bin_symbols) > 0:
+			<p>#echo ', '.join(['<span class="super">%s</span> %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#</p>
+			#end if
+			#end if
+			#if $query.pident_filtered > 0:
+			<p>$query.pident_filtered results filtered by % Identity.</p>
+			#end if
+			#if $query.kw_filtered > 0:
+			<p>$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.</p>
+			#end if
+			<p>Report produced on #echo time.strftime("%d/%m/%Y") #.</p>
+			<hr noshade size="1" color="blue">
+		#set $q += 1
+		#end for
+	</body>
+</html>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/templates/template2.tmpl	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,127 @@
+#silent import time
+#set $display_m = 20
+#set $header = '<tr class="header"><th>Accession</th><th>Description</th><th>Score</th><th>% Coverage</th><th>% Identity</th></tr>'
+<html>
+	<head>
+		<style>
+			body {
+				font-size:0.75em;
+			}
+			table, tr {
+				width: 100%;
+			}
+			table {
+				border-collapse: collapse;
+				border: 1px solid black;
+			}
+			tr.header {
+				background-color: lightgrey;
+			}
+			th {
+				border: 1px solid black;
+			}
+			td {
+				border-left: 1px solid black;
+				border-right: 1px solid black;
+				border-bottom: 1px dashed grey;
+			}
+			td.descr {
+				font-size: 80%;
+			}
+			h3 {
+				page-break-before: always;
+				color: blue;
+			}
+			h3.first {
+				page-break-before: avoid;
+			}
+			span.super {
+				color: navy;
+				font-size: 75%;
+				vertical-align: top;
+			}
+		</style>
+		<script>
+			function toggle(id){
+				var element = document.getElementById(id)
+				console.log(id)
+				if (element.style.display == 'none') {
+					//console.log(element.tagName);
+					if (element.tagName == 'TBODY') element.style.display = 'table-row-group';
+					else if (element.tagName == 'TD') element.style.display = 'table-cell';
+					else element.style.display = 'block';
+				} else {
+					element.style.display = 'none';
+				}
+			}
+		</script>
+	</head>
+	<body>
+		## One section per query: heading, table of matches ('Euzby' bin first), bin legend, filter summary.
+		#set $q = 0
+		#for $query in $queries
+			## Number this query's bins from 1; the number is printed next to each binned hit.
+			#set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)])
+			#set $m = 0
+			<h3 id="${query.query_id}" #if $q == 0 then'class="first"' else '' #>$query.query_id</h3>
+			<br/>
+			<table id="${query.query_id}_matches">
+			#set $num_of_euzby = -1
+			#if len($query.matches) == 0:
+				<tr class="header"><th colspan="5">No matches to report</th></tr>
+			</table>
+			#else:
+			$header
+			## Move the matches indexed under the 'Euzby' bin to the front, keeping their
+			## relative order. Indexes are popped from the highest down so that the
+			## remaining indexes stay valid.
+			#try
+			#set $priority = $query.bins['Euzby']
+			#set $front = []
+			#for $i in reversed($priority)
+			#silent $front.append($query.matches.pop($i))
+			#end for
+			#set $num_of_euzby = len($front)
+			#silent $front.reverse()
+			#silent $front.extend($query.matches)
+			#set $query.matches = $front
+			#except
+			## Presumably reached when the query has no 'Euzby' bin; the order is then left as is.
+			#pass
+			#end try
+			#for $match in $query.matches:
+				## Rows after the first $display_m go into a hidden tbody that toggle() reveals.
+				#if $m == $display_m
+				<tbody id="${query.query_id}_extra" style="display:none">
+				#end if
+				##if $m>0 and set($match.bins)!=set($query.matches[m-1].bins)
+				##put an empty line to separate Euzby records from other records
+				#if $m==$num_of_euzby and $m>0
+				<tr><td align="center" colspan="5">&nbsp;</td></tr>
+				#end if
+				<tr>
+					<td>$match.subject_acc <span class="super">#echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))#</span></td>
+					<td class="descr">$match.subject_descr</td>
+					<td>$match.score</td>
+					<td>$match.p_cov</td>
+					<td>$match.p_ident</td>
+				</tr>
+			#set $m += 1
+			#end for
+			## The hidden tbody is only opened when there are more than $display_m rows,
+			## so only then close it and offer the show/hide links.
+			#if $m > $display_m
+			</tbody>
+			<tr>
+				<td id="${query.query_id}_show" align="center" colspan="5" >Displaying ${display_m}/$m matches. <a href="#${query.query_id}_extra" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Show the remaining results.</a></td>
+				<td id="${query.query_id}_hide" align="center" colspan="5" style="display:none"><a href="#${query.query_id}" onclick="toggle('${query.query_id}_extra'); toggle('${query.query_id}_show'); toggle('${query.query_id}_hide');">Hide the last #echo $m - $display_m # results.</a></td>
+			</tr>
+			#end if
+			</table>
+			#if len($bin_symbols) > 0:
+			<p>#echo ', '.join(['<span class="super">%s</span> %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#</p>
+			#end if
+			#end if
+			#if $query.pident_filtered > 0:
+			<p>$query.pident_filtered results filtered by % Identity.</p>
+			#end if
+			#if $query.kw_filtered > 0:
+			<p>$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.</p>
+			#end if
+			<p>Report produced on #echo time.strftime("%d/%m/%Y") #.</p>
+			<hr noshade size="1" color="blue">
+		#set $q += 1
+		#end for
+	</body>
+</html>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blast_reference_bins.loc.sample	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,7 @@
+# Expect three columns, tab separated, as follows:
+# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work)
+# - name (Galaxy shows this in the UI)
+# - path (Path to the blast reference bin file)
+#
+# e.g.
+# f45ee89a-d456-469a-8aeb-54cdfea821ec<tab>16S Microbial NCBI<tab>/path/to/16S_Microbial_NCBI.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/blast_report_templates.loc.sample	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,7 @@
+# Expect three columns, tab separated, as follows:
+# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work)
+# - name (Galaxy shows this in the UI)
+# - path (Path to the blast report template (cheetah format))
+#
+# e.g.
+# f45ee89a-d456-469a-8aeb-54cdfea821ec<tab>Default BLAST Report Template<tab>/path/to/template.tmpl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of BLAST reference bins in the required format.
+         Rows are read from the .loc file below with columns value, name, path;
+         lines starting with '#' are comments. -->
+    <table name="blast_reference_bins" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="tool-data/blast_reference_bins.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Tue Mar 03 00:14:34 2020 +0000
@@ -0,0 +1,8 @@
+<?xml version="1.0"?>
+<tables>
+    <!-- Locations of BLAST report templates in the required format -->
+    <!-- NOTE(review): blast_report_basic.xml reads its options from the
+         "blast_reference_bins" table, which this test configuration does not
+         define. Confirm whether that table should be declared here as well. -->
+    <table name="blast_report_templates" comment_char="#">
+        <columns>value, name, path</columns>
+        <file path="${__HERE__}/test-data/blast_report_templates.loc" />
+    </table>
+</tables>