polyphen2_web/polyphen2_web.py @ 0:b319f980c9e6 (draft)
| field | value |
|---|---|
| description | Uploaded |
| author | saketkc |
| date | Mon, 14 Apr 2014 17:27:06 -0400 |
| parents | (none) |
| children | 3c40b02934ad |
| before | after |
|---|---|
| -1:000000000000 | 0:b319f980c9e6 |
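The file below is the initial revision of `polyphen2_web.py`, a standalone Python 2 script that submits a batch of SNPs to the PolyPhen-2 web service at genetics.bwh.harvard.edu, polls the job queue until the run finishes, and downloads the four report files (full, short, log, and snps). It depends on `requests` and BeautifulSoup 4.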
```python
#!/usr/bin/python
"""Submit a batch of SNPs to the PolyPhen-2 web service and fetch the
result files once the job completes. Written for Python 2; depends on
requests and BeautifulSoup 4."""
from bs4 import BeautifulSoup
import argparse
import csv
import os
import shutil
import sys
import tempfile
import time

import requests

submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
result_url = 'http://genetics.bwh.harvard.edu'

refresh_interval = 30  # not referenced below; polling uses TIME_DELAY
TIMEOUT = 60 * 60      # overall polling budget in seconds
TIME_DELAY = 7         # seconds to sleep between polls
MAX_TRIES = 30         # maximum number of polling attempts

# Genome assembly version used for the chromosome
# coordinates of the SNPs in the user input.
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped.
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in the genomic SNP annotation report.
SNPFUNCTION = ['c', 'm', '']


def stop_err(msg, err=1):
    sys.stderr.write('%s\n' % msg)
    sys.exit(err)


class Polyphen2Web:

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        # Name the parser explicitly so bs4 does not warn about choosing one.
        return BeautifulSoup(string, 'html.parser')

    def make_request(self):
        """Submit the batch query; return the session id (sid) or None."""
        # A tab-separated copy of the input is written to a scratch dir;
        # it is not used by the submission below and is cleaned up on return.
        tmp_dir = tempfile.mkdtemp()
        path = os.path.join(tmp_dir, 'csv_file')
        with open(self.file_location, 'rb') as src, open(path, 'wb') as fh:
            csv.writer(fh).writerows(csv.reader(src, delimiter='\t'))
        # The service expects space-separated fields, so tabs are normalised
        # and stray separator runs are stripped.
        with open(self.file_location, 'r') as fh:
            contents = fh.read().replace('\t', ' ').replace(
                '::::::::::::::', '')
        if self.snp_function == 'All':
            self.snp_function = ''
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': '',
        }
        if self.notify_me:
            payload['NOTIFYME'] = self.notify_me
        response = requests.post(submission_url, data=payload)
        soup = self.soupify(response.content)
        # The confirmation page carries the session id in a hidden input.
        sid_soup = soup.find('input', {'name': 'sid'})
        try:
            sid = sid_soup['value']
        except (TypeError, KeyError):
            sid = None
        shutil.rmtree(tmp_dir)
        return sid

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):  # timeout is currently unused
        """Poll the job-management page until the result links appear.

        Returns a dict mapping result keys to download URLs; exits via
        stop_err() if the job does not finish within max_tries polls.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        tries = 0
        while True:
            tries += 1
            if tries > max_tries:
                stop_err('Number of tries exceeded!')
            response = requests.post(submission_url, data=payload)
            soup = self.soupify(response.content)
            all_tables = soup.findAll('table')
            if all_tables:
                try:
                    # The second-to-last table lists the running jobs.
                    running_jobs_table = all_tables[-2]
                except IndexError:
                    running_jobs_table = None
                if running_jobs_table:
                    rows = running_jobs_table.findAll('tr')
                    # A single remaining row means the job has finished and
                    # now carries the links to the result files.
                    if len(rows) == 1:
                        hrefs = rows[0].findAll('a')
                        if len(hrefs) >= 3:
                            # The four report URLs share a common prefix.
                            path = hrefs[0]['href'].split('-')[0]
                            return {
                                'full_file': result_url + path + '-full.txt',
                                'snps_file': result_url + path + '-snps.txt',
                                'log_file': result_url + path + '-log.txt',
                                'short_file': result_url + path + '-short.txt',
                            }
            time.sleep(time_delay)

    def save_to_files(self, url_dict, args):
        """Download each result file, then move it to its target location."""
        tmp_dir = tempfile.mkdtemp()
        for key, value in url_dict.iteritems():
            r = requests.get(value, stream=True)
            if r.status_code == 200:
                path = os.path.join(tmp_dir, key)
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(128):
                        f.write(chunk)
                shutil.move(path, args[key])
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        return True


def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', '--ucscdb', dest='ucscdb',
                        choices=UCSCDB, required=True, type=str)
    parser.add_argument('-m', '--model', dest='modelname',
                        choices=MODELNAME, required=True, type=str)
    parser.add_argument('-fl', '--filter', '--snpfilter', dest='snpfilter',
                        choices=SNPFILTER.keys(), required=True, type=str)
    parser.add_argument('-i', '--input', dest='input', nargs='?',
                        required=True, type=str, default=sys.stdin)
    parser.add_argument('-e', '--email', dest='email',
                        required=False, default=None)
    parser.add_argument('--log', dest='log_file', required=True, type=str)
    parser.add_argument('--short', dest='short_file', required=True, type=str)
    parser.add_argument('--full', dest='full_file', required=True, type=str)
    parser.add_argument('--snp', dest='snps_file', required=True, type=str)
    parser.add_argument('--function', dest='snpfunction',
                        required=True, type=str)
    args_s = vars(parser.parse_args(args))
    polyphen2_web = Polyphen2Web(ucscdb=args_s['ucscdb'],
                                 model_name=args_s['modelname'],
                                 snp_filter=args_s['snpfilter'],
                                 snp_function=args_s['snpfunction'],
                                 file_location=args_s['input'],
                                 email=args_s['email'])
    sid = polyphen2_web.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')
    url_dict = polyphen2_web.poll_for_files(sid)
    locations = {}
    if not url_dict:
        stop_err('There was an error downloading the output files!')
    for key in url_dict.keys():
        locations[key] = args_s[key]
    polyphen2_web.save_to_files(url_dict, locations)
    return True


if __name__ == '__main__':
    main(sys.argv[1:])
```
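For reference, here is a minimal sketch of driving the class programmatically instead of through the command line. It assumes the file is importable as `polyphen2_web` and that `batch.txt` is a hypothetical input file holding one PolyPhen-2 batch query per line; like the script itself, it targets Python 2.

```python
# A minimal sketch, assuming this file is importable as `polyphen2_web`
# and that `batch.txt` (hypothetical) holds one batch query per line.
from polyphen2_web import Polyphen2Web, stop_err

web = Polyphen2Web(ucscdb='hg19', model_name='HumDiv',
                   snp_filter='Canonical', snp_function='All',
                   file_location='batch.txt', email=None)
sid = web.make_request()        # submit the batch; session id or None
if not sid:
    stop_err('Submission failed.')
urls = web.poll_for_files(sid)  # blocks until the job row shows result links
web.save_to_files(urls, {       # map result keys to local destinations
    'full_file': 'pph2-full.txt',
    'short_file': 'pph2-short.txt',
    'log_file': 'pph2-log.txt',
    'snps_file': 'pph2-snps.txt',
})
```

The equivalent command-line invocation passes the same values through `-u/--ucscdb`, `-m/--model`, `-fl/--filter`, `--function`, `-i/--input`, and the four output-path options (`--full`, `--short`, `--log`, `--snp`).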
