comparison polyphen2_web/polyphen2_web.py @ 0:b319f980c9e6 draft

Uploaded
author saketkc
date Mon, 14 Apr 2014 17:27:06 -0400
parents
children 3c40b02934ad
comparison
equal deleted inserted replaced
-1:000000000000 0:b319f980c9e6
1 #!/usr/bin/python
2 from bs4 import BeautifulSoup
3 import argparse
4 import sys
5 import time
6 import os
7 import tempfile
8 import requests
9 import shutil
10 import csv
# CGI endpoint that receives both the batch submission and the later
# status-refresh requests.
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
# Host prefix prepended to the relative result paths scraped from the job page.
result_url = 'http://genetics.bwh.harvard.edu'

# NOTE(review): not referenced anywhere else in the visible source.
refresh_interval = 30
# Defaults for the `timeout`, `time_delay` and `max_tries` parameters of
# Polyphen2Web.poll_for_files; `time_delay` is handed to time.sleep(), so it
# is expressed in seconds.
TIMEOUT = 3600  # 60 * 60
TIME_DELAY = 7
MAX_TRIES = 30

# Accepted genome assembly versions for the chromosome coordinates of the
# SNPs supplied by the user.
UCSCDB = ['hg19', 'hg18']
# Accepted classifier models for the predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Transcript sets onto which genomic SNPs are mapped: the key is the
# command-line choice, the value is what gets sent in the SNPFILTER field.
SNPFILTER = {'All': 0, 'Canonical': 1, 'CCDS': 3}
# Functional SNP categories to include in the genomic SNPs annotation report.
SNPFUNCTION = ['c', 'm', '']
33
34
def stop_err(msg, err=1):
    """Write ``msg`` plus a newline to standard error, then exit.

    The process terminates with exit status ``err`` (1 by default).
    """
    line = '%s\n' % msg
    sys.stderr.write(line)
    # sys.exit() is a thin wrapper that raises SystemExit; raise it directly.
    raise SystemExit(err)
38
39
class Polyphen2Web:
    """Client for the PolyPhen-2 (``PPHWeb2``) batch-query web form.

    Typical use: ``make_request()`` submits the input file and returns a
    session id, ``poll_for_files()`` waits until the job page lists result
    links, and ``save_to_files()`` downloads those results.
    """

    # Bytes requested per chunk while streaming a result file to disk.
    DOWNLOAD_CHUNK_SIZE = 8192

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        """Remember the submission settings.

        ucscdb        -- sent as the ``UCSCDB`` form field.
        model_name    -- sent as the ``MODELNAME`` form field.
        snp_filter    -- key of the module-level ``SNPFILTER`` mapping.
        snp_function  -- sent as ``SNPFUNC``; ``'All'`` is translated to ``''``.
        file_location -- path of the batch input file to upload.
        email         -- optional address sent as ``NOTIFYME``.
        """
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        """Parse ``string`` (an HTML document) into a BeautifulSoup tree."""
        # NOTE(review): no parser is pinned, so BeautifulSoup picks whichever
        # one is installed.  Pinning one could change which <table> ends up
        # at index -2 in poll_for_files, so it is deliberately left as is.
        return BeautifulSoup(string)

    def make_request(self):
        """Submit the batch query and return its session id.

        The input file is read as text, tabs are replaced by spaces and every
        ``'::::::::::::::'`` marker is removed before the text is POSTed to
        ``submission_url``.

        Returns the ``value`` attribute of the ``<input name="sid">`` element
        of the response page, or ``None`` when that element or attribute is
        missing.  Raises ``KeyError`` if ``self.snp_filter`` is not a key of
        ``SNPFILTER``, and whatever ``open``/``requests.post`` raise.
        """
        # The previous implementation also copied the input into a temporary
        # CSV file that nothing ever read; that dead step (and the file
        # handles it leaked) has been dropped.
        with open(self.file_location, 'r') as handle:
            contents = handle.read()
        contents = contents.replace('\t', ' ').replace('::::::::::::::', '')

        # The form expects an empty SNPFUNC value to mean "all categories".
        if self.snp_function == 'All':
            self.snp_function = ''

        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': self.notify_me or '',
        }
        response = requests.post(submission_url, data=payload)
        sid_input = self.soupify(response.content).find(
            'input', {'name': 'sid'})
        if sid_input is None:
            return None
        return sid_input.get('value')

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the job page until the result links appear.

        sid        -- session id returned by ``make_request``.
        max_tries  -- maximum number of refresh requests to send.
        time_delay -- seconds to sleep after an unsuccessful attempt.
        timeout    -- overall budget in seconds, checked before each attempt;
                      ``None`` disables the check.

        Returns a dict with the keys ``full_file``, ``snps_file``,
        ``log_file`` and ``short_file`` mapped to result URLs.  If the tries
        or the time budget run out, ``stop_err`` terminates the process.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        deadline = None if timeout is None else time.time() + timeout
        for _ in range(max_tries):
            if deadline is not None and time.time() > deadline:
                stop_err('Timed out while waiting for the results!')
            response = requests.post(submission_url, data=payload)
            url_dict = self._extract_result_urls(
                self.soupify(response.content))
            if url_dict:
                return url_dict
            time.sleep(time_delay)
        stop_err('Number of tries exceeded!')

    def _extract_result_urls(self, soup):
        """Return the result URLs advertised by a job page, or ``None``.

        The links are looked up in the second-to-last ``<table>`` of the
        page.  They are only trusted when that table has exactly one row
        with at least three anchors; anything else is reported as "not
        ready yet".
        """
        tables = soup.findAll('table')
        if len(tables) < 2:
            return None
        rows = tables[-2].findAll('tr')
        if len(rows) != 1:
            return None
        anchors = rows[0].findAll('a')
        if len(anchors) < 3:
            return None
        href = anchors[0].get('href')
        if not href:
            return None
        # Every result file shares the part of the href before the first '-'.
        prefix = result_url + href.split('-')[0]
        return {
            'full_file': prefix + '-full.txt',
            'snps_file': prefix + '-snps.txt',
            'log_file': prefix + '-log.txt',
            'short_file': prefix + '-short.txt',
        }

    def save_to_files(self, url_dict, args):
        """Download every URL in ``url_dict`` to its destination in ``args``.

        url_dict -- mapping of result name to URL (see ``poll_for_files``).
        args     -- mapping of the same names to destination paths.

        Each file is streamed into a private temporary directory and then
        moved into place.  URLs that do not answer with HTTP 200 are skipped.
        Always returns ``True``; the temporary directory is removed even when
        a download or move raises.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            # .items() works on both Python 2 and 3 (iteritems() is 2-only).
            for key, url in url_dict.items():
                response = requests.get(url, stream=True)
                if response.status_code != 200:
                    # Best effort, as before: a missing result is not fatal.
                    continue
                staged = os.path.join(tmp_dir, key)
                with open(staged, 'wb') as out:
                    for chunk in response.iter_content(
                            self.DOWNLOAD_CHUNK_SIZE):
                        out.write(chunk)
                shutil.move(staged, args[key])
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        return True
154
155
def main(args):
    """Run one PolyPhen-2 web query from command-line style arguments.

    ``args`` is the argument list without the program name.  The function
    submits the input file, waits for the results and stores them at the
    paths given by ``--log``, ``--short``, ``--full`` and ``--snp``.

    Returns ``True`` on success.  Invalid arguments make argparse exit;
    a missing session id or missing result links end the process through
    ``stop_err``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--ucscdb',
                        dest='ucscdb', choices=UCSCDB,
                        required=True, type=str)
    parser.add_argument('-m', '--model',
                        dest='modelname', choices=MODELNAME,
                        required=True, type=str)
    parser.add_argument('-fl', '--filter', '--snpfilter',
                        dest='snpfilter', choices=sorted(SNPFILTER),
                        required=True, type=str)
    # The option used to combine required=True with nargs='?' and
    # default=sys.stdin: the default could never apply, and a bare "-i"
    # produced None, which crashed later in open().  A path is now mandatory.
    parser.add_argument('-i', '--input',
                        dest='input',
                        required=True, type=str)
    parser.add_argument('-e', '--email',
                        dest='email',
                        required=False, default=None)
    parser.add_argument('--log', dest='log_file',
                        required=True, type=str)
    parser.add_argument('--short', dest='short_file',
                        required=True, type=str)
    parser.add_argument('--full', dest='full_file',
                        required=True, type=str)
    parser.add_argument('--snp', dest='snps_file',
                        required=True, type=str)
    parser.add_argument('--function', dest='snpfunction',
                        required=True, type=str)
    options = vars(parser.parse_args(args))

    polyphen2_web = Polyphen2Web(ucscdb=options['ucscdb'],
                                 model_name=options['modelname'],
                                 snp_filter=options['snpfilter'],
                                 snp_function=options['snpfunction'],
                                 file_location=options['input'],
                                 email=options['email'])
    sid = polyphen2_web.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')

    url_dict = polyphen2_web.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')

    # The url_dict keys double as the argparse destinations of the
    # output-path options, so each result maps straight to its target path.
    locations = {key: options[key] for key in url_dict}
    polyphen2_web.save_to_files(url_dict, locations)
    return True
206
# Script entry point: hand main() the command-line arguments without the
# program name when the file is executed directly.
if __name__ == '__main__':
    main(args=sys.argv[1:])