comparison blast_report.py @ 30:1d6a2561e05e draft

Uploaded
author dfornika
date Tue, 03 Mar 2020 10:55:12 +0000
parents c3b7a0a72107
children
comparison
equal deleted inserted replaced
29:c3b7a0a72107 30:1d6a2561e05e
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 from __future__ import print_function 3 from __future__ import print_function
4
5 '''Report on BLAST results.
6
7 python blast_report.py input_tab cheetah_tmpl output_html output_tab [-i [min_identity]] [-f filterkw1,...,filterkwN]] [-b bin1_label bin1_path[,...binN_label binN_path]]
8 '''
9 4
10 import argparse 5 import argparse
11 import re 6 import re
12 import sys 7 import sys
13 8
14 from Cheetah.Template import Template 9 from Cheetah.Template import Template
15 from pprint import pprint 10
16 11
17 def stop_err( msg ): 12 def stop_err(msg):
18 sys.stderr.write("%s\n" % msg) 13 sys.stderr.write("%s\n" % msg)
19 sys.exit(1) 14 sys.exit(1)
20 15
21 16
22 class BLASTBin: 17 class BLASTBin:
23 def __init__(self, label, file): 18 def __init__(self, label, file):
24 self.label = label 19 self.label = label
25 self.dict = {} 20 self.dict = {}
26 21
27 file_in = open(file) 22 file_in = open(file)
28 for line in file_in: 23 for line in file_in:
29 self.dict[line.rstrip().split('.')[0]] = '' 24 self.dict[line.rstrip().split('.')[0]] = ''
30 file_in.close() 25 file_in.close()
31 26
32 def __str__(self): 27 def __str__(self):
33 return "label: %s dict: %s" % (self.label, str(self.dict)) 28 return "label: %s dict: %s" % (self.label, str(self.dict))
34 29
35 30
36 class BLASTQuery: 31 class BLASTQuery:
37 def __init__(self, query_id): 32 def __init__(self, query_id):
38 self.query_id = query_id 33 self.query_id = query_id
39 self.matches = [] 34 self.matches = []
40 self.match_accessions = {} 35 self.match_accessions = {}
41 self.bins = {} #{bin(label):[match indexes]} 36 self.bins = {} # {bin(label):[match indexes]}
42 self.pident_filtered = 0 37 self.pident_filtered = 0
43 self.kw_filtered = 0 38 self.kw_filtered = 0
44 self.kw_filtered_breakdown = {} #{kw:count} 39 self.kw_filtered_breakdown = {} # {kw:count}
45 40
46 def __str__(self): 41 def __str__(self):
47 return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \ 42 format_string = "\t".join([
43 "query_id: %s",
44 "len(matches): %s",
45 "bins (labels only): %s",
46 "pident_filtered: %s",
47 "kw_filtered: %s",
48 "kw_filtered_breakdown: %s"
49 ])
50 return format_string \
48 % (self.query_id, 51 % (self.query_id,
49 str(len(self.matches)), 52 str(len(self.matches)),
50 str([bin.label for bin in bins]), 53 str([bin.label for bin in bins]),
51 str(self.pident_filtered), 54 str(self.pident_filtered),
52 str(self.kw_filtered), 55 str(self.kw_filtered),
59 self.subject_descr = subject_descr 62 self.subject_descr = subject_descr
60 self.score = score 63 self.score = score
61 self.p_cov = p_cov 64 self.p_cov = p_cov
62 self.p_ident = p_ident 65 self.p_ident = p_ident
63 self.bins = subject_bins 66 self.bins = subject_bins
64 67
65 def __str__(self): 68 def __str__(self):
66 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ 69 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \
67 % (self.subject_acc, 70 % (self.subject_acc,
68 self.subject_descr, 71 self.subject_descr,
69 str(self.score), 72 str(self.score),
70 str(round(self.p_cov,2)), 73 str(round(self.p_cov, 2)),
71 str(round(self.p_ident, 2))) 74 str(round(self.p_ident, 2)))
72 75
73 76
74 #PARSE OPTIONS AND ARGUMENTS 77 # PARSE OPTIONS AND ARGUMENTS
75 parser = argparse.ArgumentParser() 78 parser = argparse.ArgumentParser()
76 79
77 parser.add_argument('-f', '--filter-keywords', 80 parser.add_argument('-f', '--filter-keywords',
78 dest='filter_keywords', 81 dest='filter_keywords',
79 ) 82 )
95 parser.add_argument('output_html') 98 parser.add_argument('output_html')
96 parser.add_argument('output_tab') 99 parser.add_argument('output_tab')
97 100
98 args = parser.parse_args() 101 args = parser.parse_args()
99 102
100 pprint(args.bins) 103 # BINS
101 104 bins = []
102 print('input_tab: %s cheetah_tmpl: %s output_html: %s output_tab: %s' % (args.input_tab, args.cheetah_tmpl, args.output_html, args.output_tab)) 105 if args.bins is not None:
103
104
105 #BINS
106 bins=[]
107 if args.bins != None:
108 for bin in args.bins: 106 for bin in args.bins:
109 bins.append(BLASTBin(bin[0], bin[1])) 107 bins.append(BLASTBin(bin[0], bin[1]))
110 108
111 print('database bins: %s' % str([bin.label for bin in bins])) 109 print('database bins: %s' % str([bin.label for bin in bins]))
112 110
113 #FILTERS 111 # FILTERS
114 filter_pident = 0 112 filter_pident = 0
115 filter_kws = [] 113 filter_kws = []
116 if args.filter_keywords: 114 if args.filter_keywords:
117 filter_kws = args.filter_keywords.split(',') 115 filter_kws = args.filter_keywords.split(',')
118 print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws))) 116 print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws)))
127 SCORE_COL = 11 125 SCORE_COL = 11
128 PCOV_COL = 25 126 PCOV_COL = 25
129 queries = [] 127 queries = []
130 current_query = '' 128 current_query = ''
131 output_tab = open(args.output_tab, 'w') 129 output_tab = open(args.output_tab, 'w')
132 130
133 with open(args.input_tab) as input_tab: 131 with open(args.input_tab) as input_tab:
134 for line in input_tab: 132 for line in input_tab:
135 cols = line.split('\t') 133 cols = line.split('\t')
136 if cols[0] != current_query: 134 if cols[0] != current_query:
137 current_query = cols[0] 135 current_query = cols[0]
140 try: 138 try:
141 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2] 139 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2]
142 except IndexError as e: 140 except IndexError as e:
143 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) 141 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL])
144 142
145 #hsp option: keep best (first) hit only for each query and accession id. 143 # keep best (first) hit only for each query and accession id.
146 if args.discard_redundant: 144 if args.discard_redundant:
147 if accs[0] in queries[-1].match_accessions: 145 if accs[0] in queries[-1].match_accessions:
148 continue #don't save the result and skip to the next 146 continue # don't save the result and skip to the next
149 else: 147 else:
150 queries[-1].match_accessions[accs[0]] = '' 148 queries[-1].match_accessions[accs[0]] = ''
151 149
152
153 p_ident = float(cols[PIDENT_COL]) 150 p_ident = float(cols[PIDENT_COL])
154 #FILTER BY PIDENT 151 # FILTER BY PIDENT
155 if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True 152 if p_ident < filter_pident: # if we are not filtering, filter_pident == 0 and this will never evaluate to True
156 queries[-1].pident_filtered += 1 153 queries[-1].pident_filtered += 1
157 continue 154 continue
158 155
159 descrs = cols[DESCR_COL] 156 descrs = cols[DESCR_COL]
160 #FILTER BY KEY WORDS 157 # FILTER BY KEY WORDS
161 filter_by_kw = False 158 filter_by_kw = False
162 for kw in filter_kws: 159 for kw in filter_kws:
163 kw = kw.strip() 160 kw = kw.strip()
164 if kw != '' and re.search(kw, descrs, re.IGNORECASE): 161 if kw != '' and re.search(kw, descrs, re.IGNORECASE):
165 filter_by_kw = True 162 filter_by_kw = True
166 try: 163 try:
167 queries[-1].kw_filtered_breakdown[kw] += 1 164 queries[-1].kw_filtered_breakdown[kw] += 1
168 except: 165 except Exception as e:
169 queries[-1].kw_filtered_breakdown[kw] = 1 166 queries[-1].kw_filtered_breakdown[kw] = 1
170 if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True 167 if filter_by_kw: # if we are not filtering, for loop will not be entered and this will never be True
171 queries[-1].kw_filtered += 1 168 queries[-1].kw_filtered += 1
172 continue 169 continue
173 descr = descrs.split(';')[0] 170 descr = descrs.split(';')[0]
174 171
175 #ATTEMPT BIN 172 # ATTEMPT BIN
176 subj_bins = [] 173 subj_bins = []
177 for bin in bins: #if we are not binning, bins = [] so for loop not entered 174 for bin in bins: # if we are not binning, bins = [] so for loop not entered
178 for acc in accs: 175 for acc in accs:
179 if acc.split('.')[0] in bin.dict: 176 if acc.split('.')[0] in bin.dict:
180 try: 177 try:
181 queries[-1].bins[bin.label].append(len(queries[-1].matches)) 178 queries[-1].bins[bin.label].append(len(queries[-1].matches))
182 except: 179 except Exception as e:
183 queries[-1].bins[bin.label] = [len(queries[-1].matches)] 180 queries[-1].bins[bin.label] = [len(queries[-1].matches)]
184 subj_bins.append(bin.label) 181 subj_bins.append(bin.label)
185 break #this result has been binned to this bin so break 182 break # this result has been binned to this bin so break
186 acc = accs[0] 183 acc = accs[0]
187 184
188 score = int(float(cols[SCORE_COL])) 185 score = int(float(cols[SCORE_COL]))
189 p_cov = float(cols[PCOV_COL]) 186 p_cov = float(cols[PCOV_COL])
190 187
191 #SAVE RESULT 188 # SAVE RESULT
192 queries[-1].matches.append( 189 queries[-1].matches.append(
193 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins) 190 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)
194 ) 191 )
195 output_tab.write(line) 192 output_tab.write(line)
196 input_tab.close() 193 input_tab.close()
197 output_tab.close() 194 output_tab.close()
198 195
199 ''' 196 '''
200 for query in queries: 197 for query in queries:
210 namespace = {'queries': queries} 207 namespace = {'queries': queries}
211 html = Template(file=args.cheetah_tmpl, searchList=[namespace]) 208 html = Template(file=args.cheetah_tmpl, searchList=[namespace])
212 out_html = open(args.output_html, 'w') 209 out_html = open(args.output_html, 'w')
213 out_html.write(str(html)) 210 out_html.write(str(html))
214 out_html.close() 211 out_html.close()
215