Mercurial > repos > dfornika > blast_report_basic

--- a/blast_report.py	Tue Mar 03 10:11:18 2020 +0000
+++ b/blast_report.py	Tue Mar 03 10:55:12 2020 +0000
@@ -2,19 +2,14 @@

 from __future__ import print_function

-'''Report on BLAST results.
-
-python blast_report.py input_tab cheetah_tmpl output_html output_tab [-i [min_identity]] [-f filterkw1,...,filterkwN]] [-b bin1_label bin1_path[,...binN_label binN_path]]
-'''
-
 import argparse
 import re
 import sys

 from Cheetah.Template import Template
-from pprint import pprint
+

-def stop_err( msg ):
+def stop_err(msg):
     sys.stderr.write("%s\n" % msg)
     sys.exit(1)

@@ -23,12 +18,12 @@
     def __init__(self, label, file):
         self.label = label
         self.dict = {}
-
+
         file_in = open(file)
         for line in file_in:
             self.dict[line.rstrip().split('.')[0]] = ''
         file_in.close()
-
+
     def __str__(self):
         return "label: %s    dict: %s" % (self.label, str(self.dict))

@@ -38,13 +33,21 @@
         self.query_id = query_id
         self.matches = []
         self.match_accessions = {}
-        self.bins = {} #{bin(label):[match indexes]}
+        self.bins = {}  # {bin(label):[match indexes]}
         self.pident_filtered = 0
         self.kw_filtered = 0
-        self.kw_filtered_breakdown = {} #{kw:count}
-
+        self.kw_filtered_breakdown = {}  # {kw:count}
+
     def __str__(self):
-        return "query_id: %s    len(matches): %s    bins (labels only): %s    pident_filtered: %s    kw_filtered: %s    kw_filtered_breakdown: %s" \
+        format_string = "\t".join([
+            "query_id: %s",
+            "len(matches): %s",
+            "bins (labels only): %s",
+            "pident_filtered: %s",
+            "kw_filtered: %s",
+            "kw_filtered_breakdown: %s"
+        ])
+        return format_string \
             % (self.query_id,
                str(len(self.matches)),
                str([bin.label for bin in bins]),
@@ -61,17 +64,17 @@
         self.p_cov = p_cov
         self.p_ident = p_ident
         self.bins = subject_bins
-
+
     def __str__(self):
         return "subject_acc: %s    subject_descr: %s    score: %s    p-cov: %s    p-ident: %s" \
             % (self.subject_acc,
                self.subject_descr,
                str(self.score),
-               str(round(self.p_cov,2)),
+               str(round(self.p_cov, 2)),
                str(round(self.p_ident, 2)))


-#PARSE OPTIONS AND ARGUMENTS
+# PARSE OPTIONS AND ARGUMENTS
 parser = argparse.ArgumentParser()

 parser.add_argument('-f', '--filter-keywords',
@@ -97,20 +100,15 @@

 args = parser.parse_args()

-pprint(args.bins)
-
-print('input_tab: %s    cheetah_tmpl: %s    output_html: %s    output_tab: %s' % (args.input_tab, args.cheetah_tmpl, args.output_html, args.output_tab))
-
-
-#BINS
-bins=[]
-if args.bins != None:
+# BINS
+bins = []
+if args.bins is not None:
     for bin in args.bins:
         bins.append(BLASTBin(bin[0], bin[1]))

 print('database bins: %s' % str([bin.label for bin in bins]))

-#FILTERS
+# FILTERS
 filter_pident = 0
 filter_kws = []
 if args.filter_keywords:
@@ -129,7 +127,7 @@
 queries = []
 current_query = ''
 output_tab = open(args.output_tab, 'w')
-
+
 with open(args.input_tab) as input_tab:
     for line in input_tab:
         cols = line.split('\t')
@@ -142,22 +140,21 @@
         except IndexError as e:
             stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])

-        #hsp option: keep best (first) hit only for each query and accession id.
+        # keep best (first) hit only for each query and accession id.
         if args.discard_redundant:
             if accs[0] in queries[-1].match_accessions:
-                continue #don't save the result and skip to the next
+                continue  # don't save the result and skip to the next
             else:
                 queries[-1].match_accessions[accs[0]] = ''

-
         p_ident = float(cols[PIDENT_COL])
-        #FILTER BY PIDENT
-        if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True
+        # FILTER BY PIDENT
+        if p_ident < filter_pident:  # if we are not filtering, filter_pident == 0 and this will never evaluate to True
             queries[-1].pident_filtered += 1
             continue
-
+
         descrs = cols[DESCR_COL]
-        #FILTER BY KEY WORDS
+        # FILTER BY KEY WORDS
         filter_by_kw = False
         for kw in filter_kws:
             kw = kw.strip()
@@ -165,34 +162,34 @@
                 filter_by_kw = True
                 try:
                     queries[-1].kw_filtered_breakdown[kw] += 1
-                except:
+                except KeyError:  # first hit for this keyword
                     queries[-1].kw_filtered_breakdown[kw] = 1
-        if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True
+        if filter_by_kw:  # if we are not filtering, for loop will not be entered and this will never be True
             queries[-1].kw_filtered += 1
             continue
         descr = descrs.split(';')[0]
-
-        #ATTEMPT BIN
+
+        # ATTEMPT BIN
         subj_bins = []
-        for bin in bins: #if we are not binning, bins = [] so for loop not entered
+        for bin in bins:  # if we are not binning, bins = [] so for loop not entered
             for acc in accs:
                 if acc.split('.')[0] in bin.dict:
                     try:
                         queries[-1].bins[bin.label].append(len(queries[-1].matches))
-                    except:
+                    except KeyError:  # first match for this bin label
                         queries[-1].bins[bin.label] = [len(queries[-1].matches)]
                     subj_bins.append(bin.label)
-                    break #this result has been binned to this bin so break
+                    break  # this result has been binned to this bin so break
         acc = accs[0]
-
+
         score = int(float(cols[SCORE_COL]))
         p_cov = float(cols[PCOV_COL])
-
-        #SAVE RESULT
+
+        # SAVE RESULT
         queries[-1].matches.append(
             BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)
         )
-        output_tab.write(line)
+        output_tab.write(line)
 input_tab.close()
 output_tab.close()

@@ -212,4 +209,3 @@
 out_html = open(args.output_html, 'w')
 out_html.write(str(html))
 out_html.close()
-