# HG changeset patch # User dfornika # Date 1583194474 0 # Node ID 5dfd8490752166dcd0dc9d240de0a5065ac911ce planemo upload for repository https://github.com/public-health-bioinformatics/galaxy_tools/blob/master/tools/blast_report_basic commit bc359460bb66db7946cc68ccbd47cd479624c4a1-dirty diff -r 000000000000 -r 5dfd84907521 blast_report.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast_report.py Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,209 @@
#!/usr/bin/env python
'''Report on BLAST results.

python blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] [-r]

Reads a tabular BLAST file, optionally filters hits by percentage identity
and by description keywords, optionally bins hits by accession, writes the
retained rows to output_tab and renders an HTML report with a Cheetah
template.
'''
import argparse
import re
import sys

try:
    from Cheetah.Template import Template
except ImportError:
    # Keep the parsing helpers importable without Cheetah; rendering the
    # report reports the missing dependency explicitly.
    Template = None


# Zero-based column indexes read from each tab-separated input row.
PIDENT_COL = 2
DESCR_COL = 25
SUBJ_ID_COL = 12
SCORE_COL = 11
PCOV_COL = 24


def stop_err(msg):
    '''Write msg to stderr and exit with status 1.'''
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


class BLASTBin:
    '''A labelled set of accessions loaded from a file.

    Each line of the file contributes one key to self.dict: the line with
    trailing whitespace removed, cut at the first '.'.
    '''

    def __init__(self, label, file):
        self.label = label
        self.dict = {}

        with open(file) as file_in:
            for line in file_in:
                self.dict[line.rstrip().split('.')[0]] = ''

    def __str__(self):
        return "label: %s dict: %s" % (self.label, str(self.dict))


class BLASTQuery:
    '''All retained matches and filter counters for one query id.'''

    def __init__(self, query_id):
        self.query_id = query_id
        self.matches = []
        self.match_accessions = {}
        self.bins = {}  # {bin label: [match indexes]}
        self.pident_filtered = 0
        self.kw_filtered = 0
        self.kw_filtered_breakdown = {}  # {kw: count}

    def __str__(self):
        # self.bins is keyed by label, so its keys are the labels to show.
        return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \
            % (self.query_id,
               str(len(self.matches)),
               str(list(self.bins)),
               str(self.pident_filtered),
               str(self.kw_filtered),
               str(self.kw_filtered_breakdown))


class BLASTMatch:
    '''One retained BLAST hit.'''

    def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
        self.subject_acc = subject_acc
        self.subject_descr = subject_descr
        self.score = score
        self.p_cov = p_cov
        self.p_ident = p_ident
        self.bins = subject_bins

    def __str__(self):
        return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
            % (self.subject_acc,
               self.subject_descr,
               str(self.score),
               str(round(self.p_cov, 2)),
               str(round(self.p_ident, 2)))


def build_parser():
    '''Return the command-line parser for this script.'''
    parser = argparse.ArgumentParser(description='Report on BLAST results.')
    parser.add_argument('input_tab', help='tabular BLAST results to read')
    parser.add_argument('cheetah_tmpl', help='Cheetah template for the HTML report')
    parser.add_argument('output_html', help='path of the HTML report to write')
    parser.add_argument('output_tab', help='path of the filtered tabular output to write')
    # argparse options are strings by default; type must be a callable.
    parser.add_argument('-f', '--filter', dest='filter',
                        help='[filter_pident]:[filterkw1,...,filterkwN]')
    parser.add_argument('-b', '--bins', dest='bins',
                        help='bin1_label=bin1_path[,...binN_label=binN_path]')
    parser.add_argument('-r', '--redundant', dest='redundant', default=False,
                        action='store_true',
                        help='keep only the first hit per query and accession')
    return parser


def parse_filter(spec):
    '''Split a "[pident]:[kw1,...,kwN]" filter into (pident, keywords).

    A missing or empty pident part means 0 (no identity filtering). Keywords
    are stripped and empty ones dropped. A spec without ':' is a pident only.
    '''
    if not spec:
        return 0, []
    pident_part, _, kw_part = spec.partition(':')
    pident_part = pident_part.strip()
    filter_pident = 0
    if pident_part:
        try:
            filter_pident = float(pident_part)
        except ValueError:
            stop_err('Minimum percentage identity is not a number: %r' % pident_part)
    filter_kws = [kw.strip() for kw in kw_part.split(',') if kw.strip()]
    return filter_pident, filter_kws


def parse_bins(spec):
    '''Build a BLASTBin for every "label=path" entry of a comma-separated spec.'''
    if not spec:
        return []
    bins = []
    for label_file in spec.split(','):
        if not label_file.strip():
            continue
        label, sep, path = label_file.partition('=')
        if not sep:
            stop_err('Database bins must be given as label=path, got: %r' % label_file)
        bins.append(BLASTBin(label, path))
    return bins


def _parse_number(value, column_name, line_no):
    '''Return value as a float, or stop with a message naming the bad cell.'''
    try:
        return float(value)
    except ValueError:
        stop_err('Line %d: %s is not a number: %r' % (line_no, column_name, value))


def parse_blast_results(lines, bins, filter_pident=0, filter_kws=(), redundant=False, output=None):
    '''Group BLAST rows by query and apply the filters.

    lines         -- iterable of tab-separated rows
    bins          -- list of BLASTBin used to label matches (may be empty)
    filter_pident -- rows with a lower percentage identity are counted and dropped
    filter_kws    -- regular expressions (matched case-insensitively) against
                     the description column; matching rows are counted and dropped
    redundant     -- if true, keep only the first row per query and accession
    output        -- optional file-like object; every retained row is written to it

    Returns the list of BLASTQuery objects in input order. Consecutive rows
    sharing the same first column belong to the same query.
    '''
    try:
        patterns = [(kw, re.compile(kw, re.IGNORECASE))
                    for kw in (raw.strip() for raw in filter_kws) if kw]
    except re.error as err:
        stop_err('Invalid description keyword filter: %s' % err)

    min_cols = max(PIDENT_COL, DESCR_COL, SUBJ_ID_COL, SCORE_COL, PCOV_COL) + 1
    queries = []
    current_query = None

    for line_no, line in enumerate(lines, 1):
        if not line.strip():
            continue
        # Drop the line terminator so it does not end up in the last column.
        cols = line.rstrip('\r\n').split('\t')
        if len(cols) < min_cols:
            stop_err('Line %d: expected at least %d tab-separated columns, found %d'
                     % (line_no, min_cols, len(cols)))

        if cols[0] != current_query:
            current_query = cols[0]
            queries.append(BLASTQuery(current_query))
        query = queries[-1]

        # Subject ids look like "gi|<n>|<db>|<accession>|..."; keep every
        # fourth field, i.e. the accessions.
        accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2]
        if not accs:
            stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])

        # hsp option: keep best (first) hit only for each query and accession id.
        if redundant:
            if accs[0] in query.match_accessions:
                continue
            query.match_accessions[accs[0]] = ''

        # FILTER BY PIDENT (filter_pident == 0 never filters anything out)
        p_ident = _parse_number(cols[PIDENT_COL], 'percentage identity', line_no)
        if p_ident < filter_pident:
            query.pident_filtered += 1
            continue

        # FILTER BY KEY WORDS: every matching keyword is counted, the row once.
        descrs = cols[DESCR_COL]
        matched_kws = [kw for kw, pattern in patterns if pattern.search(descrs)]
        if matched_kws:
            for kw in matched_kws:
                query.kw_filtered_breakdown[kw] = query.kw_filtered_breakdown.get(kw, 0) + 1
            query.kw_filtered += 1
            continue
        descr = descrs.split(';')[0]

        # ATTEMPT BIN: a row joins a bin as soon as one of its accessions is in it.
        subj_bins = []
        for blast_bin in bins:
            if any(acc.split('.')[0] in blast_bin.dict for acc in accs):
                query.bins.setdefault(blast_bin.label, []).append(len(query.matches))
                subj_bins.append(blast_bin.label)

        score = int(_parse_number(cols[SCORE_COL], 'score', line_no))
        p_cov = _parse_number(cols[PCOV_COL], 'percentage coverage', line_no)

        # SAVE RESULT
        query.matches.append(
            BLASTMatch(accs[0], descr, score, p_cov, p_ident, subj_bins)
        )
        if output is not None:
            output.write(line)

    return queries


def render_report(cheetah_tmpl, queries):
    '''Return the HTML produced by the Cheetah template for queries.'''
    if Template is None:
        stop_err('The Cheetah package is required to render the report.')
    namespace = {'queries': queries}
    return str(Template(file=cheetah_tmpl, searchList=[namespace]))


def main(argv=None):
    '''Command-line entry point; argv defaults to sys.argv[1:].'''
    args = build_parser().parse_args(argv)

    # BINS
    bins = parse_bins(args.bins)
    print('database bins: %s' % str([blast_bin.label for blast_bin in bins]))

    # FILTERS
    filter_pident, filter_kws = parse_filter(args.filter)
    print('filter_pident: %s filter_kws: %s' % (str(filter_pident), str(filter_kws)))

    if args.redundant:
        print('Throwing out redundant hits...')

    # RESULTS!
    with open(args.input_tab) as tab_in:
        with open(args.output_tab, 'w') as tab_out:
            queries = parse_blast_results(tab_in, bins, filter_pident, filter_kws,
                                          args.redundant, tab_out)

    html = render_report(args.cheetah_tmpl, queries)
    with open(args.output_html, 'w') as out_html:
        out_html.write(html)


if __name__ == '__main__':
    main()

# The line below is the start of the next file in this changeset
# (blast_report_basic.xml), kept unchanged; it is not part of blast_report.py.
# diff -r 000000000000 -r 5dfd84907521 blast_report_basic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/blast_report_basic.xml Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,86 @@ + + Produce an HTML table report of BLAST results + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +**What it does** + +This tool produces a HTML report for each query in a tabular BLAST file. + +---- + +**Tabular BLAST results** + +One or more query's BLAST results in extended 26 column tabular format. + +---- + +**Report template** + +The report template dictates the format of the HTML report. +Note that changing the template from the standard "Top 20 hits shown, toggle remainder" to "Euzby results shown first" causes +the order of the results in the HTML report and the tabular BLAST results (if outputted) to be inconsistent with each other. + +---- + +**Minimum percentage identity** + +Filter by percentage identity. This filter is applied before the description keyword filters. + +---- + +**Comma-separated list of description keyword filters** + +Filter by description keywords. Do not include spaces (unless your keyword is two words). These are applied +after the percentage identity filter. + +---- + +**Database bins** + +Bin the results by accession number into "database bins." + +---- + +**Throw out redundant hits?** + +Only the first hit for any accession number will be reported.
+ +---- + +**Output tabular BLAST results?** + +This option produces a tabular BLAST file with the same results as those shown in the report. + + + diff -r 000000000000 -r 5dfd84907521 templates/template1.tmpl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/template1.tmpl Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,108 @@ +#silent import time +#set $display_m = 20 +#set $header = 'AccessionDescriptionScore% Coverage% Identity' + + + + + + + #set $q = 0 + #for $query in $queries + #set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)]) + #set $m = 0 +

$query.query_id

+
+ + #if len($query.matches) == 0: + +
No matches to report
+ #else: + $header + #for $match in $query.matches: + #if $m == $display_m + + #end if + + $match.subject_acc #echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))# + $match.subject_descr + $match.score + $match.p_cov + $match.p_ident + + #set $m += 1 + #end for + #if $m >= $display_m + + Displaying ${display_m}/$m matches. Show the remaining results. + Hide the last #echo $m - $display_m # results. + + + #end if + + #if len($bin_symbols) > 0: +

#echo ', '.join(['%s %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#

+ #end if + #end if + #if $query.pident_filtered > 0: +

$query.pident_filtered results filtered by % Identity.

+ #end if + #if $query.kw_filtered > 0: +

$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.

+ #end if +

Report produced on #echo time.strftime("%d/%m/%Y") #.

+
+ #set $q += 1 + #end for + + diff -r 000000000000 -r 5dfd84907521 templates/template2.tmpl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/templates/template2.tmpl Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,127 @@ +#silent import time +#set $display_m = 20 +#set $header = 'AccessionDescriptionScore% Coverage% Identity' + + + + + + + #set $q = 0 + #for $query in $queries + #set $bin_symbols = dict([($bin,$i) for $i, $bin in enumerate($query.bins, 1)]) + #set $m = 0 +

$query.query_id

+
+ + #set $num_of_euzby = -1 + #if len($query.matches) == 0: + +
No matches to report
+ #else: + $header + #try + #set $priority = $query.bins['Euzby'] + #set $front = [] + #for $i in reversed($priority) + #silent $front.append($query.matches.pop($i)) + #end for + #set $num_of_euzby = len($front) + #silent $front.reverse() + #silent $front.extend($query.matches) + #set $query.matches = $front + #except + #pass + #end try + #for $match in $query.matches: + #if $m == $display_m + + #end if + ##if $m>0 and set($match.bins)!=set($query.matches[m-1].bins) + ##put an empty line to separate Euzby records from other records + #if $m==$num_of_euzby and $m>0 +   + #end if + + $match.subject_acc #echo ', '.join(sorted([str($bin_symbols[$bin]) for $bin in $match.bins]))# + $match.subject_descr + $match.score + $match.p_cov + $match.p_ident + + #set $m += 1 + #end for + #if $m >= $display_m + + Displaying ${display_m}/$m matches. Show the remaining results. + Hide the last #echo $m - $display_m # results. + + + #end if + + #if len($bin_symbols) > 0: +

#echo ', '.join(['%s %s'%($bin_symbols[$bin],$bin) for $bin in $query.bins])#

+ #end if + #end if + #if $query.pident_filtered > 0: +

$query.pident_filtered results filtered by % Identity.

+ #end if + #if $query.kw_filtered > 0: +

$query.kw_filtered results filtered by description keywords: #echo ', '.join(list(["%s matches to '%s'" % (str($query.kw_filtered_breakdown[$kw]),$kw) for $kw in $query.kw_filtered_breakdown])) #.

+ #end if +

Report produced on #echo time.strftime("%d/%m/%Y") #.

+
+ #set $q += 1 + #end for + + diff -r 000000000000 -r 5dfd84907521 tool-data/blast_reference_bins.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blast_reference_bins.loc.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,7 @@ +# Expect three columns, tab separated, as follows: +# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work) +# - name (Galaxy shows this in the UI) +# - path (Path to the blast reference bin file) +# +# e.g. +# f45ee89a-d456-469a-8aeb-54cdfea821ec16S Microbial NCBI/path/to/16S_Microbial_NCBI.tab diff -r 000000000000 -r 5dfd84907521 tool-data/blast_report_templates.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/blast_report_templates.loc.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,7 @@ +# Expect three columns, tab separated, as follows: +# - value (Galaxy records this in the Galaxy DB, consider using a UUID but any unique value will work) +# - name (Galaxy shows this in the UI) +# - path (Path to the blast report template (cheetah format)) +# +# e.g. +# f45ee89a-d456-469a-8aeb-54cdfea821ecDefault BLAST Report Template/path/to/template.tmpl diff -r 000000000000 -r 5dfd84907521 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+
diff -r 000000000000 -r 5dfd84907521 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Mar 03 00:14:34 2020 +0000 @@ -0,0 +1,8 @@ + + + + + value, name, path + +
+