Mercurial > repos > dfornika > blast_report
comparison blast_report.py @ 0:18b097eb1a51 draft
planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/blast_report commit 006cbba6513492f5a06b573c676400a2d464520b-dirty
author | dfornika |
---|---|
date | Tue, 10 Sep 2019 12:51:57 -0400 |
parents | |
children | 2b4f30c6b50a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:18b097eb1a51 |
---|---|
1 #!/usr/bin/env python | |
2 from __future__ import print_function | |
3 | |
4 '''Report on BLAST results. | |
5 | |
6 python bccdc_blast_report.py input_tab cheetah_tmpl output_html output_tab [-f [filter_pident]:[filterkw1,...,filterkwN]] [-b bin1_label=bin1_path[,...binN_label=binN_path]] | |
7 ''' | |
8 | |
9 import optparse | |
10 import re | |
11 import sys | |
12 | |
13 def stop_err( msg ): | |
14 sys.stderr.write("%s\n" % msg) | |
15 sys.exit(1) | |
16 | |
17 class BLASTBin: | |
18 def __init__(self, label, file): | |
19 self.label = label | |
20 self.dict = {} | |
21 | |
22 file_in = open(file) | |
23 for line in file_in: | |
24 self.dict[line.rstrip().split('.')[0]] = '' | |
25 file_in.close() | |
26 | |
27 def __str__(self): | |
28 return "label: %s dict: %s" % (self.label, str(self.dict)) | |
29 | |
30 class BLASTQuery: | |
31 def __init__(self, query_id): | |
32 self.query_id = query_id | |
33 self.matches = [] | |
34 self.match_accessions = {} | |
35 self.bins = {} #{bin(label):[match indexes]} | |
36 self.pident_filtered = 0 | |
37 self.kw_filtered = 0 | |
38 self.kw_filtered_breakdown = {} #{kw:count} | |
39 | |
40 def __str__(self): | |
41 return "query_id: %s len(matches): %s bins (labels only): %s pident_filtered: %s kw_filtered: %s kw_filtered_breakdown: %s" \ | |
42 % (self.query_id, | |
43 str(len(self.matches)), | |
44 str([bin.label for bin in bins]), | |
45 str(self.pident_filtered), | |
46 str(self.kw_filtered), | |
47 str(self.kw_filtered_breakdown)) | |
48 | |
49 class BLASTMatch: | |
50 def __init__(self, subject_acc, subject_descr, score, p_cov, p_ident, subject_bins): | |
51 self.subject_acc = subject_acc | |
52 self.subject_descr = subject_descr | |
53 self.score = score | |
54 self.p_cov = p_cov | |
55 self.p_ident = p_ident | |
56 self.bins = subject_bins | |
57 | |
58 def __str__(self): | |
59 return "subject_acc: %s subject_descr: %s score: %s p-cov: %s p-ident: %s" \ | |
60 % (self.subject_acc, | |
61 self.subject_descr, | |
62 str(self.score), | |
63 str(round(self.p_cov,2)), | |
64 str(round(self.p_ident, 2))) | |
65 | |
66 #PARSE OPTIONS AND ARGUMENTS | |
67 parser = optparse.OptionParser(description='Report on BLAST results.', | |
68 usage='python bccdc_blast_report_generator.py input_tabut cheetah_tmpl output_html [output_id output_dir] [options]') | |
69 | |
70 parser.add_option('-f', '--filter', | |
71 type='string', | |
72 dest='filter', | |
73 ) | |
74 parser.add_option('-b', '--bins', | |
75 type='string', | |
76 dest='bins' | |
77 ) | |
78 parser.add_option('-r', '--redundant', | |
79 dest='hsp', | |
80 default=False, | |
81 action='store_true' | |
82 ) | |
83 options, args = parser.parse_args() | |
84 | |
85 try: | |
86 input_tab, cheetah_tmpl, output_html, output_tab = args | |
87 except: | |
88 stop_err('you must supply the arguments input_tab, cheetah_tmpl and output_html.') | |
89 #print('input_tab: %s cheetah_tmpl: %s output_html: %s output_tab: %s' % (input_tab, cheetah_tmpl, output_html, output_tab)) | |
90 | |
91 #BINS | |
92 bins=[] | |
93 if options.bins != None: | |
94 bins = list([BLASTBin(label_file.split('=')[0],label_file.split('=')[-1]) for label_file in options.bins.split(',')]) | |
95 print('database bins: %s' % str([bin.label for bin in bins])) | |
96 | |
97 #FILTERS | |
98 filter_pident = 0 | |
99 filter_kws = [] | |
100 if options.filter != None: | |
101 pident_kws = options.filter.split(':') | |
102 filter_pident = float(pident_kws[0]) | |
103 filter_kws = pident_kws[-1].split(',') | |
104 print('filter_pident: %s filter_kws: %s' % (str(filter_pident), str(filter_kws))) | |
105 | |
106 if options.hsp: | |
107 print('Throwing out redundant hits...') | |
108 | |
109 #RESULTS! | |
110 PIDENT_COL = 2 | |
111 DESCR_COL = 25 | |
112 SUBJ_ID_COL = 12 | |
113 SCORE_COL = 11 | |
114 PCOV_COL = 24 | |
115 queries = [] | |
116 current_query = '' | |
117 output_tab = open(output_tab, 'w') | |
118 with open(input_tab) as input_tab: | |
119 for line in input_tab: | |
120 cols = line.split('\t') | |
121 if cols[0] != current_query: | |
122 current_query = cols[0] | |
123 queries.append(BLASTQuery(current_query)) | |
124 | |
125 try: | |
126 accs = cols[SUBJ_ID_COL].split('|')[1::2][1::2] | |
127 except IndexError as e: | |
128 stop_err("Problem with splitting:" + cols[SUBJ_ID_COL]) | |
129 | |
130 #hsp option: keep best (first) hit only for each query and accession id. | |
131 if options.hsp: | |
132 if accs[0] in queries[-1].match_accessions: | |
133 continue #don't save the result and skip to the next | |
134 else: | |
135 queries[-1].match_accessions[accs[0]] = '' | |
136 | |
137 | |
138 p_ident = float(cols[PIDENT_COL]) | |
139 #FILTER BY PIDENT | |
140 if p_ident < filter_pident: #if we are not filtering, filter_pident == 0 and this will never evaluate to True | |
141 queries[-1].pident_filtered += 1 | |
142 continue | |
143 | |
144 descrs = cols[DESCR_COL] | |
145 #FILTER BY KEY WORDS | |
146 filter_by_kw = False | |
147 for kw in filter_kws: | |
148 kw = kw.strip() #Fix by Damion D Nov 2013 | |
149 if kw != '' and re.search(kw, descrs, re.IGNORECASE): | |
150 filter_by_kw = True | |
151 try: | |
152 queries[-1].kw_filtered_breakdown[kw] += 1 | |
153 except: | |
154 queries[-1].kw_filtered_breakdown[kw] = 1 | |
155 if filter_by_kw: #if we are not filtering, for loop will not be entered and this will never be True | |
156 queries[-1].kw_filtered += 1 | |
157 continue | |
158 descr = descrs.split(';')[0] | |
159 | |
160 #ATTEMPT BIN | |
161 subj_bins = [] | |
162 for bin in bins: #if we are not binning, bins = [] so for loop not entered | |
163 for acc in accs: | |
164 if acc.split('.')[0] in bin.dict: | |
165 try: | |
166 queries[-1].bins[bin.label].append(len(queries[-1].matches)) | |
167 except: | |
168 queries[-1].bins[bin.label] = [len(queries[-1].matches)] | |
169 subj_bins.append(bin.label) | |
170 break #this result has been binned to this bin so break | |
171 acc = accs[0] | |
172 | |
173 score = int(float(cols[SCORE_COL])) | |
174 p_cov = float(cols[PCOV_COL]) | |
175 | |
176 #SAVE RESULT | |
177 queries[-1].matches.append(BLASTMatch(acc, descr, score, p_cov, p_ident, subj_bins)) | |
178 output_tab.write(line) | |
179 input_tab.close() | |
180 output_tab.close() | |
181 | |
182 ''' | |
183 for query in queries: | |
184 print(query) | |
185 for match in query.matches: | |
186 print(' %s' % str(match)) | |
187 for bin in query.bins: | |
188 print(' bin: %s' % bin) | |
189 for x in query.bins[bin]: | |
190 print(' %s' % str(query.matches[x])) | |
191 ''' | |
192 | |
193 from Cheetah.Template import Template | |
194 namespace = {'queries': queries} | |
195 html = Template(file=cheetah_tmpl, searchList=[namespace]) | |
196 out_html = open(output_html, 'w') | |
197 out_html.write(str(html)) | |
198 out_html.close() |