Mercurial > repos > dfornika > blast_report_basic
comparison blast_report.py @ 30:1d6a2561e05e draft
Uploaded
author | dfornika |
---|---|
date | Tue, 03 Mar 2020 10:55:12 +0000 |
parents | c3b7a0a72107 |
children |
comparison
equal
deleted
inserted
replaced
29:c3b7a0a72107 | 30:1d6a2561e05e |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 from __future__ import print_function | 3 from __future__ import print_function |
4 | |
5 '''Report on BLAST results. | |
6 | |
7 python blast_report.py input_tab cheetah_tmpl output_html output_tab [-i [min_identity]] [-f filterkw1,...,filterkwN]] [-b bin1_label bin1_path[,...binN_label binN_path]] | |
8 ''' | |
9 | 4 |
10 import argparse | 5 import argparse |
11 import re | 6 import re |
12 import sys | 7 import sys |
13 | 8 |
14 from Cheetah.Template import Template | 9 from Cheetah.Template import Template |
15 from pprint import pprint | 10 |
16 | 11 |
17 def stop_err( msg ): | 12 def stop_err(msg): |
18 sys.stderr.write("%s\n" % msg) | 13 sys.stderr.write("%s\n" % msg) |
19 sys.exit(1) | 14 sys.exit(1) |
20 | 15 |
21 | 16 |
22 class BLASTBin: | 17 class BLASTBin: |
23 def __init__(self, label, file): | 18 def __init__(self, label, file): |
24 self.label = label | 19 self.label = label |
25 self.dict = {} | 20 self.dict = {} |
26 | 21 |
27 file_in = open(file) | 22 file_in = open(file) |
28 for line in file_in: | 23 for line in file_in: |
29 self.dict[line.rstrip().split('.')[0]] = '' | 24 self.dict[line.rstrip().split('.')[0]] = '' |
30 file_in.close() | 25 file_in.close() |
31 | 26 |
32 def __str__(self): | 27 def __str__(self): |
33 return "label: %s dict: %s" % (self.label, str(self.dict)) | 28 return "label: %s dict: %s" % (self.label, str(self.dict)) |
34 | 29 |
35 | 30 |
36 class BLASTQuery: | 31 class BLASTQuery: |
37 def __init__(self, query_id): | 32 def __init__(self, query_id): |
38 self.query_id = query_id | 33 self.query_id = query_id |
39 self.matches = [] | 34 self.matches = [] |
40 self.match_accessions = {} | 35 self.match_accessions = {} |
41 self.bins = {} #{bin(label):[match indexes]} | 36 self.bins = {} # {bin(label):[match indexes]} |
42 self.pident_filtered = 0 | 37 self.pident_filtered = 0 |
43 self.kw_filtered = 0 | 38 self.kw_filtered = 0 |
44 self.kw_filtered_breakdown = {} #{kw:count} | 39 self.kw_filtered_breakdown = {} # {kw:count} |
45 | 40 |
46 def __str__(self): | 41 def __str__(self): |
47 return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \ | 42 format_string = "\t".join([ |
43 "query_id: %s", | |
44 "len(matches): %s", | |
45 "bins (labels only): %s", | |
46 "pident_filtered: %s", | |
47 "kw_filtered: %s", | |
48 "kw_filtered_breakdown: %s" | |
49 ]) | |
50 return format_string \ | |
48 % (self.query_id, | 51 % (self.query_id, |
49 str(len(self.matches)), | 52 str(len(self.matches)), |
50 str([bin.label for bin in bins]), | 53 str([bin.label for bin in bins]), |
51 str(self.pident_filtered), | 54 str(self.pident_filtered), |
52 str(self.kw_filtered), | 55 str(self.kw_filtered), |
59 self.subject_descr = subject_descr | 62 self.subject_descr = subject_descr |
60 self.score = score | 63 self.score = score |
61 self.p_cov = p_cov | 64 self.p_cov = p_cov |
62 self.p_ident = p_ident | 65 self.p_ident = p_ident |
63 self.bins = subject_bins | 66 self.bins = subject_bins |
64 | 67 |
65 def __str__(self): | 68 def __str__(self): |
66 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ | 69 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ |
67 % (self.subject_acc, | 70 % (self.subject_acc, |
68 self.subject_descr, | 71 self.subject_descr, |
69 str(self.score), | 72 str(self.score), |
70 str(round(self.p_cov,2)), | 73 str(round(self.p_cov, 2)), |
71 str(round(self.p_ident, 2))) | 74 str(round(self.p_ident, 2))) |
72 | 75 |
73 | 76 |
74 #PARSE OPTIONS AND ARGUMENTS | 77 # PARSE OPTIONS AND ARGUMENTS |
75 parser = argparse.ArgumentParser() | 78 parser = argparse.ArgumentParser() |
76 | 79 |
77 parser.add_argument('-f', '--filter-keywords', | 80 parser.add_argument('-f', '--filter-keywords', |
78 dest='filter_keywords', | 81 dest='filter_keywords', |
79 ) | 82 ) |
95 parser.add_argument('output_html') | 98 parser.add_argument('output_html') |
96 parser.add_argument('output_tab') | 99 parser.add_argument('output_tab') |
97 | 100 |
98 args = parser.parse_args() | 101 args = parser.parse_args() |
99 | 102 |
100 pprint(args.bins) | 103 # BINS |
101 | 104 bins = [] |
102 print('input_tab: %s cheetah_tmpl: %s output_html: %s output_tab: %s' % (args.input_tab, args.cheetah_tmpl, args.output_html, args.output_tab)) | 105 if args.bins is not None: |
103 | |
104 | |
105 #BINS | |
106 bins=[] | |
107 if args.bins != None: | |
108 for bin in args.bins: | 106 for bin in args.bins: |
109 bins.append(BLASTBin(bin[0], bin[1])) | 107 bins.append(BLASTBin(bin[0], bin[1])) |
110 | 108 |
111 print('database bins: %s' % str([bin.label for bin in bins])) | 109 print('database bins: %s' % str([bin.label for bin in bins])) |
112 | 110 |
113 #FILTERS | 111 # FILTERS |
114 filter_pident = 0 | 112 filter_pident = 0 |
115 filter_kws = [] | 113 filter_kws = [] |
116 if args.filter_keywords: | 114 if args.filter_keywords: |
117 filter_kws = args.filter_keywords.split(',') | 115 filter_kws = args.filter_keywords.split(',') |
118 print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws))) | 116 print('minimum percent identity: %s filter_kws: %s' % (str(args.min_identity), str(filter_kws))) |
127 SCORE_COL = 11 | 125 SCORE_COL = 11 |
128 PCOV_COL = 25 | 126 PCOV_COL = 25 |
129 queries = [] | 127 queries = [] |
130 current_query = '' | 128 current_query = '' |
131 output_tab = open(args.output_tab, 'w') | 129 output_tab = open(args.output_tab, 'w') |
132 | 130 |
133 with open(args.input_tab) as input_tab: | 131 with open(args.input_tab) as input_tab: |
134 for line in input_tab: | 132 for line in input_tab: |
135 cols = line.split('\t') | 133 cols = line.split('\t') |
136 if cols[0] != current_query: | 134 if cols[0] != current_query: |
137 current_query = cols[0] | 135 current_query = cols[0] |
140 try: | 138 try: |
141 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2] | 139 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2] |
142 except IndexError as e: | 140 except IndexError as e: |
143 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) | 141 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) |
144 | 142 |
145 #hsp option: keep best (first) hit only for each query and accession id. | 143 # keep best (first) hit only for each query and accession id. |
146 if args.discard_redundant: | 144 if args.discard_redundant: |
147 if accs[0] in queries[-1].match_accessions: | 145 if accs[0] in queries[-1].match_accessions: |
148 continue #don't save the result and skip to the next | 146 continue # don't save the result and skip to the next |
149 else: | 147 else: |
150 queries[-1].match_accessions[accs[0]] = '' | 148 queries[-1].match_accessions[accs[0]] = '' |
151 | 149 |
152 | |
153 p_ident = float(cols[PIDENT_COL]) | 150 p_ident = float(cols[PIDENT_COL]) |
154 #FILTER BY PIDENT | 151 # FILTER BY PIDENT |
155 if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True | 152 if p_ident < filter_pident: # if we are not filtering, filter_pident == 0 and this will never evaluate to True |
156 queries[-1].pident_filtered += 1 | 153 queries[-1].pident_filtered += 1 |
157 continue | 154 continue |
158 | 155 |
159 descrs = cols[DESCR_COL] | 156 descrs = cols[DESCR_COL] |
160 #FILTER BY KEY WORDS | 157 # FILTER BY KEY WORDS |
161 filter_by_kw = False | 158 filter_by_kw = False |
162 for kw in filter_kws: | 159 for kw in filter_kws: |
163 kw = kw.strip() | 160 kw = kw.strip() |
164 if kw != '' and re.search(kw, descrs, re.IGNORECASE): | 161 if kw != '' and re.search(kw, descrs, re.IGNORECASE): |
165 filter_by_kw = True | 162 filter_by_kw = True |
166 try: | 163 try: |
167 queries[-1].kw_filtered_breakdown[kw] += 1 | 164 queries[-1].kw_filtered_breakdown[kw] += 1 |
168 except: | 165 except Exception as e: |
169 queries[-1].kw_filtered_breakdown[kw] = 1 | 166 queries[-1].kw_filtered_breakdown[kw] = 1 |
170 if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True | 167 if filter_by_kw: # if we are not filtering, for loop will not be entered and this will never be True |
171 queries[-1].kw_filtered += 1 | 168 queries[-1].kw_filtered += 1 |
172 continue | 169 continue |
173 descr = descrs.split(';')[0] | 170 descr = descrs.split(';')[0] |
174 | 171 |
175 #ATTEMPT BIN | 172 # ATTEMPT BIN |
176 subj_bins = [] | 173 subj_bins = [] |
177 for bin in bins: #if we are not binning, bins = [] so for loop not entered | 174 for bin in bins: # if we are not binning, bins = [] so for loop not entered |
178 for acc in accs: | 175 for acc in accs: |
179 if acc.split('.')[0] in bin.dict: | 176 if acc.split('.')[0] in bin.dict: |
180 try: | 177 try: |
181 queries[-1].bins[bin.label].append(len(queries[-1].matches)) | 178 queries[-1].bins[bin.label].append(len(queries[-1].matches)) |
182 except: | 179 except Exception as e: |
183 queries[-1].bins[bin.label] = [len(queries[-1].matches)] | 180 queries[-1].bins[bin.label] = [len(queries[-1].matches)] |
184 subj_bins.append(bin.label) | 181 subj_bins.append(bin.label) |
185 break #this result has been binned to this bin so break | 182 break # this result has been binned to this bin so break |
186 acc = accs[0] | 183 acc = accs[0] |
187 | 184 |
188 score = int(float(cols[SCORE_COL])) | 185 score = int(float(cols[SCORE_COL])) |
189 p_cov = float(cols[PCOV_COL]) | 186 p_cov = float(cols[PCOV_COL]) |
190 | 187 |
191 #SAVE RESULT | 188 # SAVE RESULT |
192 queries[-1].matches.append( | 189 queries[-1].matches.append( |
193 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins) | 190 BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins) |
194 ) | 191 ) |
195 output_tab.write(line) | 192 output_tab.write(line) |
196 input_tab.close() | 193 input_tab.close() |
197 output_tab.close() | 194 output_tab.close() |
198 | 195 |
199 ''' | 196 ''' |
200 for query in queries: | 197 for query in queries: |
210 namespace = {'queries': queries} | 207 namespace = {'queries': queries} |
211 html = Template(file=args.cheetah_tmpl, searchList=[namespace]) | 208 html = Template(file=args.cheetah_tmpl, searchList=[namespace]) |
212 out_html = open(args.output_html, 'w') | 209 out_html = open(args.output_html, 'w') |
213 out_html.write(str(html)) | 210 out_html.write(str(html)) |
214 out_html.close() | 211 out_html.close() |
215 |