Mercurial > repos > dfornika > blast_report
changeset 0:18b097eb1a51 draft
planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/blast_report commit 006cbba6513492f5a06b573c676400a2d464520b-dirty
author | dfornika |
---|---|
date | Tue, 10 Sep 2019 12:51:57 -0400 |
parents | |
children | a56a5519d60a |
files | blast_report.py blast_report.xml |
diffstat | 2 files changed, 300 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
'''Report on BLAST results.

Reads a tabular BLAST file, drops hits by percentage identity and by
description keyword, optionally assigns the surviving hits to "database
bins" by accession, writes the surviving rows to a tabular file and renders
an HTML report from a Cheetah template.

python blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] [-r]
'''
from __future__ import print_function

import optparse
import re
import sys

# Zero-based column indexes read from each tab-separated input row.
PIDENT_COL = 2
SCORE_COL = 11
SUBJ_ID_COL = 12
PCOV_COL = 24
DESCR_COL = 25
# Highest index used above, plus one: every row needs this many columns.
MIN_COLUMNS = DESCR_COL + 1


def stop_err(msg):
    '''Write msg to stderr and terminate the process with exit status 1.'''
    sys.stderr.write("%s\n" % msg)
    sys.exit(1)


class BLASTBin(object):
    '''A labelled set of accessions loaded from a file, one per line.

    label -- name of the bin.
    file  -- path of the accession list; the part of each line before the
             first '.' is stored as a key of self.dict.
    '''

    def __init__(self, label, file):
        self.label = label
        self.dict = {}

        with open(file) as file_in:
            for line in file_in:
                accession = line.rstrip()
                if accession:  # blank lines would only add an empty key
                    self.dict[accession.split('.')[0]] = ''

    def __str__(self):
        return "label: %s dict: %s" % (self.label, str(self.dict))


class BLASTQuery(object):
    '''Everything collected for one query id of the input file.'''

    def __init__(self, query_id):
        self.query_id = query_id
        self.matches = []
        self.match_accessions = {}
        self.bins = {}  # {bin label: [indexes into self.matches]}
        self.pident_filtered = 0
        self.kw_filtered = 0
        self.kw_filtered_breakdown = {}  # {keyword: count}

    def __str__(self):
        # self.bins is keyed by label, so the labels are simply its keys.
        return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \
            % (self.query_id,
               str(len(self.matches)),
               str(list(self.bins)),
               str(self.pident_filtered),
               str(self.kw_filtered),
               str(self.kw_filtered_breakdown))


class BLASTMatch(object):
    '''One retained hit of a query.'''

    def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins):
        self.subject_acc = subject_acc
        self.subject_descr = subject_descr
        self.score = score
        self.p_cov = p_cov
        self.p_ident = p_ident
        self.bins = subject_bins

    def __str__(self):
        return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
            % (self.subject_acc,
               self.subject_descr,
               str(self.score),
               str(round(self.p_cov, 2)),
               str(round(self.p_ident, 2)))


def build_parser():
    '''Return the optparse parser for the command line.'''
    parser = optparse.OptionParser(
        description='Report on BLAST results.',
        usage='python blast_report.py input_tab cheetah_tmpl output_html output_tab [options]')
    parser.add_option('-f', '--filter',
                      type='string',
                      dest='filter',
                      help='[filter_pident]:[filterkw1,...,filterkwN]')
    parser.add_option('-b', '--bins',
                      type='string',
                      dest='bins',
                      help='bin1_label=bin1_path[,...binN_label=binN_path]')
    parser.add_option('-r', '--redundant',
                      dest='hsp',
                      default=False,
                      action='store_true',
                      help='keep only the first hit per query and accession')
    return parser


def parse_filter(filter_opt):
    '''Split a "[pident]:[kw1,...,kwN]" option value.

    Returns (filter_pident, filter_kws). A missing identity threshold is 0.0
    and a value without ':' carries no keywords. Exits through stop_err when
    the threshold is not a number.
    '''
    pident_part, sep, kw_part = filter_opt.partition(':')
    pident_part = pident_part.strip()
    filter_pident = 0.0
    if pident_part:
        try:
            filter_pident = float(pident_part)
        except ValueError:
            stop_err('invalid minimum percentage identity in filter: %r' % pident_part)
    filter_kws = kw_part.split(',') if sep else []
    return filter_pident, filter_kws


def parse_bins(bins_opt):
    '''Build one BLASTBin per "label=path" entry of a comma-separated list.

    Only the first '=' separates label from path, so the path may itself
    contain '='. An entry without '=' is used as both label and path.
    '''
    bins = []
    for label_file in bins_opt.split(','):
        if not label_file:
            continue
        label, sep, path = label_file.partition('=')
        if not sep:
            path = label
        bins.append(BLASTBin(label, path))
    return bins


def extract_accessions(subject_id):
    '''Return every fourth '|'-separated field of subject_id, from the fourth.

    For ids shaped like "gi|<n>|<db>|<accession>|", optionally repeated, this
    yields the accession fields.
    '''
    return subject_id.split('|')[3::4]


def compile_keywords(filter_kws):
    '''Return [(keyword, compiled pattern)] for every non-blank keyword.

    Keywords are stripped and compiled once as case-insensitive regular
    expressions, so regex metacharacters in a keyword are significant.
    '''
    compiled = []
    for kw in filter_kws:
        kw = kw.strip()
        if kw:
            compiled.append((kw, re.compile(kw, re.IGNORECASE)))
    return compiled


def collect_queries(input_tab, output_tab, bins, filter_pident, filter_kws, drop_redundant):
    '''Filter and bin the rows of input_tab, copying survivors to output_tab.

    input_tab      -- path of the tabular BLAST file to read.
    output_tab     -- path that receives the rows that pass every filter.
    bins           -- list of BLASTBin used to bin hits by accession.
    filter_pident  -- rows with a lower percentage identity are dropped.
    filter_kws     -- rows whose description matches any keyword are dropped.
    drop_redundant -- keep only the first row per query and first accession.

    Returns the list of BLASTQuery objects in input order. Exits through
    stop_err on a row with too few columns or without an accession.
    '''
    keywords = compile_keywords(filter_kws)
    queries = []
    current_query = None  # sentinel: the first row always opens a query

    with open(input_tab) as tab_in, open(output_tab, 'w') as tab_out:
        for line_no, line in enumerate(tab_in, 1):
            # Drop the line ending so it cannot leak into the last column.
            stripped = line.rstrip('\r\n')
            if not stripped.strip():
                continue
            cols = stripped.split('\t')
            if len(cols) < MIN_COLUMNS:
                stop_err('line %d of %s has %d columns; expected at least %d'
                         % (line_no, input_tab, len(cols), MIN_COLUMNS))

            if cols[0] != current_query:
                current_query = cols[0]
                queries.append(BLASTQuery(current_query))
            query = queries[-1]

            accs = extract_accessions(cols[SUBJ_ID_COL])
            if not accs:
                stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])

            # Redundancy is recorded before the filters run, so a filtered
            # first hit still suppresses later hits of the same accession.
            if drop_redundant:
                if accs[0] in query.match_accessions:
                    continue
                query.match_accessions[accs[0]] = ''

            # FILTER BY PIDENT
            p_ident = float(cols[PIDENT_COL])
            if p_ident < filter_pident:
                query.pident_filtered += 1
                continue

            # FILTER BY KEY WORDS (every matching keyword is counted)
            descrs = cols[DESCR_COL]
            matched_kws = [kw for kw, pattern in keywords if pattern.search(descrs)]
            if matched_kws:
                for kw in matched_kws:
                    query.kw_filtered_breakdown[kw] = query.kw_filtered_breakdown.get(kw, 0) + 1
                query.kw_filtered += 1
                continue
            descr = descrs.split(';')[0]

            # ATTEMPT BIN: a hit joins a bin when any of its accessions,
            # version suffix removed, is listed in that bin.
            match_index = len(query.matches)
            subj_bins = []
            for blast_bin in bins:
                if any(acc.split('.')[0] in blast_bin.dict for acc in accs):
                    query.bins.setdefault(blast_bin.label, []).append(match_index)
                    subj_bins.append(blast_bin.label)

            score = int(float(cols[SCORE_COL]))
            p_cov = float(cols[PCOV_COL])

            # SAVE RESULT
            query.matches.append(BLASTMatch(accs[0], descr, score, p_cov, p_ident, subj_bins))
            tab_out.write(line)
    return queries


def render_report(queries, cheetah_tmpl, output_html):
    '''Render cheetah_tmpl with {'queries': queries} into output_html.'''
    # Imported here so the rest of the module works without Cheetah installed.
    from Cheetah.Template import Template
    html = Template(file=cheetah_tmpl, searchList=[{'queries': queries}])
    with open(output_html, 'w') as out_html:
        out_html.write(str(html))


def main(argv=None):
    '''Command-line entry point; argv defaults to sys.argv[1:].'''
    options, args = build_parser().parse_args(argv)
    if len(args) != 4:
        stop_err('you must supply the arguments input_tab, cheetah_tmpl, output_html and output_tab.')
    input_tab, cheetah_tmpl, output_html, output_tab = args

    # BINS
    bins = []
    if options.bins is not None:
        bins = parse_bins(options.bins)
    print('database bins: %s' % str([blast_bin.label for blast_bin in bins]))

    # FILTERS
    filter_pident = 0
    filter_kws = []
    if options.filter is not None:
        filter_pident, filter_kws = parse_filter(options.filter)
    print('filter_pident: %s filter_kws: %s' % (str(filter_pident), str(filter_kws)))

    if options.hsp:
        print('Throwing out redundant hits...')

    # RESULTS!
    queries = collect_queries(input_tab, output_tab, bins, filter_pident, filter_kws, options.hsp)
    render_report(queries, cheetah_tmpl, output_html)


if __name__ == '__main__':
    main()
<!-- Galaxy tool wrapper for blast_report.py -->
<tool id="blast_report" name="BLAST report" force_history_refresh="true" version="1.1.0">
  <description>Report on BLAST results</description>
  <command>
    <![CDATA[
      '$__tool_directory__/blast_report.py'
      '${in_tab}'
      '${tmpl}'
      '${out_html}'
      '${out_tab}'
      ## Cheetah placeholders are written ${name}; '{$name}' would emit literal
      ## braces around the value, which the script cannot parse as a number.
      -f '${filter_pident}:${filter_kws}'
      #if str($bins) != "None"
        -b '${bins}'
      #end if
      #if $hsp_bool
        -r
      #end if
    ]]>
  </command>
  <inputs>
    <param name="in_tab" type="data" format="tabular" label="Tabular BLAST results (extended 26 columns)"/>
    <param name="tmpl" type="select" optional="false" label="Report template">
      <options from_file="bccdc_blast_report_templates.loc">
        <column name="value" index="1"/>
        <column name="name" index="0"/>
      </options>
    </param>
    <param name="filter_pident" type="integer" min="90" max="100" value="97" label="Minimum percentage identity"/>
    <param name="filter_kws" type="text" size="50" label="Comma-separated list of description keyword filters" value="bovine,clone,environmental,swine,uncultivated,uncultured,unidentified"/>
    <param name="bins" type="select" label="Database bins" multiple="true" display="checkboxes">
      <options from_file="bccdc_blast_bins.loc">
        <column name="value" index="1"/>
        <column name="name" index="0"/>
      </options>
    </param>
    <!--<repeat name="hist_bins" title="History database bins">
      <param name="filter" type="data" format="csv" label="History database bin"/>
    </repeat>-->
    <param name="hsp_bool" type="boolean" label="Throw out redundant hits?"/>
    <param name="tab_bool" type="boolean" label="Output tabular file?"/>
  </inputs>
  <outputs>
    <data name="out_html" format="html" label="$tool.name on data $in_tab.hid: report"/>
    <!-- The tabular output is only kept in the history when tab_bool is set. -->
    <data name="out_tab" format="tabular" label="$tool.name on data $in_tab.hid: tabular results">
      <filter> tab_bool </filter>
    </data>
  </outputs>
  <help>
.. class:: infomark

**What it does**

This tool produces an HTML report for each query in a tabular BLAST file.

----

**Tabular BLAST results**

BLAST results for one or more queries in extended 26 column tabular format.

----

**Report template**

The report template dictates the format of the HTML report.
Note that changing the template from the standard "Top 20 hits shown, toggle remainder" to "Euzby results shown first" causes
the order of the results in the HTML report and the tabular BLAST results (if outputted) to be inconsistent with each other.

----

**Minimum percentage identity**

Filter by percentage identity. This filter is applied before the description keyword filters.

----

**Comma-separated list of description keyword filters**

Filter by description keywords. Do not include spaces (unless your keyword is two words). These are applied
after the percentage identity filter.

----

**Database bins**

Bin the results by accession number into "database bins."

----

**Throw out redundant hits?**

Only the first hit for any accession number will be reported.

----

**Output tabular file?**

This option produces a tabular BLAST file with the same results as those shown in the report.

  </help>
</tool>