Mercurial > repos > saketkc > polyphen2
comparison polyphen2_web/polyphen2_web.py @ 0:b319f980c9e6 draft
Uploaded
author | saketkc |
---|---|
date | Mon, 14 Apr 2014 17:27:06 -0400 |
parents | |
children | 3c40b02934ad |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:b319f980c9e6 |
---|---|
1 #!/usr/bin/python | |
2 from bs4 import BeautifulSoup | |
3 import argparse | |
4 import sys | |
5 import time | |
6 import os | |
7 import tempfile | |
8 import requests | |
9 import shutil | |
10 import csv | |
# CGI endpoint that accepts PolyPhen-2 batch query submissions (GGI pipeline).
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
# Base URL prepended to the relative result-file links scraped from the
# job-management page.
result_url = 'http://genetics.bwh.harvard.edu'

# NOTE(review): refresh_interval is not referenced anywhere in this file.
refresh_interval = 30
# Overall polling time limit in seconds (accepted by poll_for_files).
TIMEOUT = 60 * 60
# Seconds to sleep between successive polls of the job-management page.
TIME_DELAY = 7
# Maximum number of polls before giving up.
MAX_TRIES = 30

# Genome assembly version used for chromosome
# coordinates of the SNPs in user input
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped;
# values are the numeric codes the web form expects for SNPFILTER.
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in genomic SNPs annotation report
# ('' means all categories; see also the 'All' -> '' mapping in make_request).
SNPFUNCTION = ['c', 'm', '']
33 | |
34 | |
def stop_err(msg, err=1):
    """Report a fatal error on standard error and abort the process.

    msg -- message to print (a newline is appended)
    err -- exit status to terminate with (default 1)
    """
    message = '%s\n' % msg
    sys.stderr.write(message)
    sys.exit(err)
38 | |
39 | |
class Polyphen2Web:
    """Thin client for the PolyPhen-2 batch-query web service.

    Submits a whitespace-separated variant list to the GGI CGI endpoint,
    polls the job-management page until result links appear, then
    downloads the result files to caller-supplied paths.
    """

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        # ucscdb        -- genome assembly name (e.g. 'hg19' / 'hg18')
        # model_name    -- classifier model ('HumDiv' / 'HumVar')
        # snp_filter    -- key into the module-level SNPFILTER map
        # snp_function  -- functional category code ('c', 'm', '' or 'All')
        # file_location -- path of the tab-separated input file
        # email         -- optional address for server-side notification
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        """Parse an HTML string into a BeautifulSoup tree.

        The stdlib parser is named explicitly so behaviour does not depend
        on which optional parsers (lxml, html5lib) happen to be installed.
        """
        return BeautifulSoup(string, 'html.parser')

    def make_request(self):
        """Submit the batch query and return the server session id.

        Returns the 'sid' string scraped from the response page, or None
        when no sid input is present (callers treat that as failure).
        """
        # NOTE: the original code also wrote an unused CSV copy of the
        # input into a temp dir and immediately deleted it; that dead
        # code (which also leaked two file handles) has been removed.
        # The server expects space-separated fields, so tabs are
        # collapsed and stray separator runs removed.
        with open(self.file_location, 'r') as fh:
            contents = fh.read().replace(
                '\t', ' ').replace('::::::::::::::', '')
        if self.snp_function == 'All':
            # 'All' is represented by an empty SNPFUNC value in the form.
            self.snp_function = ''
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': self.notify_me or '',
        }
        response = requests.post(submission_url, data=payload)
        soup = self.soupify(response.content)
        sid_tag = soup.find('input', {'name': 'sid'})
        try:
            return sid_tag['value']
        except (TypeError, KeyError):
            # find() returned None, or the tag lacks a 'value' attribute.
            return None

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the job-management page until result links appear.

        sid        -- session id returned by make_request()
        max_tries  -- abort (via stop_err) after this many polls
        time_delay -- seconds to sleep between polls
        timeout    -- overall wall-clock limit in seconds; previously
                      accepted but ignored, now enforced (unreachable
                      with the default max_tries * time_delay budget)

        Returns a dict mapping 'full_file', 'snps_file', 'log_file' and
        'short_file' to their download URLs.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        start_time = time.time()
        tries = 0
        while True:
            tries += 1
            if tries > max_tries:
                stop_err('Number of tries exceeded!')
            if time.time() - start_time > timeout:
                stop_err('Polling timed out!')
            response = requests.post(submission_url, data=payload)
            soup = self.soupify(response.content)
            all_tables = soup.findAll('table')
            if all_tables:
                try:
                    # presumably the second-to-last table lists running
                    # jobs -- TODO confirm against the live page layout
                    running_jobs_table = all_tables[-2]
                except IndexError:
                    running_jobs_table = None
                if running_jobs_table is not None:
                    rows = running_jobs_table.findAll('tr')
                    if len(rows) == 1:
                        hrefs = rows[0].findAll('a')
                        if len(hrefs) >= 3:
                            # All result files share the path prefix of
                            # the first link (everything before '-').
                            base = result_url + hrefs[0]['href'].split('-')[0]
                            return {
                                'full_file': base + '-full.txt',
                                'snps_file': base + '-snps.txt',
                                'log_file': base + '-log.txt',
                                'short_file': base + '-short.txt',
                            }
            time.sleep(time_delay)

    def save_to_files(self, url_dict, args):
        """Download each URL in *url_dict* and move it to its target path.

        url_dict -- mapping of result key -> download URL
        args     -- mapping of the same keys -> destination file paths

        Returns True.  URLs that do not answer HTTP 200 are silently
        skipped, preserving the original best-effort behaviour.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            # .items() instead of Py2-only .iteritems() (2/3 compatible).
            for key, url in url_dict.items():
                response = requests.get(url, stream=True)
                if response.status_code == 200:
                    tmp_path = os.path.join(tmp_dir, key)
                    with open(tmp_path, 'wb') as f:
                        for chunk in response.iter_content(128):
                            f.write(chunk)
                    shutil.move(tmp_path, args[key])
        finally:
            # Clean up even when a download raises (temp dir previously
            # leaked on any exception).
            if os.path.exists(tmp_dir):
                shutil.rmtree(tmp_dir)
        return True
154 | |
155 | |
def main(args):
    """Parse command-line options, submit the query, poll, and download.

    args -- argv-style list of option strings (usually sys.argv[1:])
    Returns True on success; exits via stop_err() on failure.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-u',
                     '--ucscdb',
                     dest='ucscdb',
                     choices=UCSCDB,
                     required=True, type=str)
    cli.add_argument('-m', '--model',
                     dest='modelname', choices=MODELNAME,
                     required=True, type=str)
    cli.add_argument('-fl', '--filter',
                     '--snpfilter', dest='snpfilter',
                     choices=SNPFILTER.keys(),
                     required=True, type=str)
    cli.add_argument('-i', '--input',
                     dest='input', nargs='?',
                     required=True, type=str,
                     default=sys.stdin)
    cli.add_argument('-e', '--email',
                     dest='email',
                     required=False, default=None)
    cli.add_argument('--log', dest='log_file',
                     required=True, default=None, type=str)
    cli.add_argument('--short', dest='short_file',
                     required=True, default=None, type=str)
    cli.add_argument('--full', dest='full_file',
                     required=True, default=None, type=str)
    cli.add_argument('--snp', dest='snps_file',
                     required=True, default=None, type=str)
    cli.add_argument('--function', dest='snpfunction',
                     required=True, type=str)
    opts = vars(cli.parse_args(args))

    client = Polyphen2Web(ucscdb=opts['ucscdb'],
                          model_name=opts['modelname'],
                          snp_filter=opts['snpfilter'],
                          snp_function=opts['snpfunction'],
                          file_location=opts['input'],
                          email=opts['email'])
    sid = client.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')
    url_dict = client.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')
    # Map each result key to the destination path supplied on the CLI.
    locations = dict((key, opts[key]) for key in url_dict)
    client.save_to_files(url_dict, locations)
    return True
206 | |
207 if __name__ == '__main__': | |
208 main(sys.argv[1:]) |