Mercurial > repos > saketkc > polyphen2
diff polyphen2_web/polyphen2_web.py @ 0:b319f980c9e6 draft
Uploaded
author | saketkc |
---|---|
date | Mon, 14 Apr 2014 17:27:06 -0400 |
parents | |
children | 3c40b02934ad |
line wrap: on
line diff
#!/usr/bin/python
"""Command-line client for the PolyPhen-2 batch web interface.

The script posts a batch of SNPs to the submission CGI, polls the job
management page until links to the result files show up, and downloads
the four report files (full, snps, log, short) to the paths given on the
command line.
"""
from bs4 import BeautifulSoup
import argparse
import sys
import time
import os
import tempfile
import requests
import shutil
import csv
submission_url = 'http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi'
result_url = 'http://genetics.bwh.harvard.edu'

refresh_interval = 30
TIMEOUT = 60 * 60
TIME_DELAY = 7
MAX_TRIES = 30

# Genome assembly version used for chromosome
# coordinates of the SNPs in user input
UCSCDB = ['hg19', 'hg18']
# Classifier model used for predictions.
MODELNAME = ['HumDiv', 'HumVar']

# Set of transcripts on which genomic SNPs will be mapped
SNPFILTER = {
    'All': 0,
    'Canonical': 1,
    'CCDS': 3,
}
# Functional SNP categories to include in genomic SNPs annotation report
SNPFUNCTION = ['c', 'm', '']


def stop_err(msg, err=1):
    """Write ``msg`` to stderr and exit the process with status ``err``."""
    sys.stderr.write('%s\n' % msg)
    sys.exit(err)


class Polyphen2Web(object):
    """Thin client around the PolyPhen-2 batch submission CGI."""

    def __init__(self, ucscdb=None, model_name=None, snp_filter=None,
                 snp_function=None, file_location=None, email=None):
        """Store the submission parameters.

        ucscdb        -- genome assembly name sent as ``UCSCDB``
        model_name    -- classifier model sent as ``MODELNAME``
        snp_filter    -- key of ``SNPFILTER`` selecting the transcript set
        snp_function  -- value sent as ``SNPFUNC`` ('All' is sent as '')
        file_location -- path of the batch query file to upload
        email         -- optional address sent as ``NOTIFYME``
        """
        self.ucscdb = ucscdb
        self.model_name = model_name
        self.snp_filter = snp_filter
        self.snp_function = snp_function
        self.file_location = file_location
        self.notify_me = email

    def soupify(self, string):
        """Parse ``string`` as markup and return the BeautifulSoup tree."""
        # NOTE(review): no parser is named, so bs4 picks the best one
        # installed. Left as-is because poll_for_files selects a table by
        # position and a different parser could change the result.
        return BeautifulSoup(string)

    def make_request(self):
        """Submit the batch query and return the session id.

        Reads the query file, replaces tabs with spaces, strips the
        literal '::::::::::::::' marker and posts the text to the
        submission URL.

        Returns the ``value`` of the ``<input name="sid">`` element in the
        response, or None when that element (or its value) is missing.
        """
        with open(self.file_location, 'r') as handle:
            contents = handle.read()
        contents = contents.replace('\t', ' ').replace('::::::::::::::', '')
        if self.snp_function == 'All':
            self.snp_function = ''
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'query',
            '_ggi_batch': contents,
            '_ggi_target_pipeline': '1',
            'MODELNAME': self.model_name,
            'UCSCDB': self.ucscdb,
            'SNPFILTER': SNPFILTER[self.snp_filter],
            'SNPFUNC': self.snp_function,
            'NOTIFYME': self.notify_me or '',
        }
        request = requests.post(submission_url, data=payload)
        soup = self.soupify(request.content)
        sid_soup = soup.find('input', {'name': 'sid'})
        try:
            return sid_soup['value']
        except (TypeError, KeyError):
            # TypeError: no such <input> (find returned None);
            # KeyError: the element has no ``value`` attribute.
            return None

    def _extract_result_urls(self, soup):
        """Return the result-file URLs found in a management page.

        Looks at the second-to-last table of the page; when it has exactly
        one row with at least three links, the first link's href (up to
        its first '-') is used as the common prefix of the four files.
        Returns None when the page does not have that shape.
        """
        all_tables = soup.find_all('table')
        if len(all_tables) < 2:
            return None
        rows = all_tables[-2].find_all('tr')
        if len(rows) != 1:
            return None
        hrefs = rows[0].find_all('a')
        if len(hrefs) < 3:
            return None
        path = hrefs[0]['href'].split('-')[0]
        return {
            'full_file': result_url + path + '-full.txt',
            'snps_file': result_url + path + '-snps.txt',
            'log_file': result_url + path + '-log.txt',
            'short_file': result_url + path + '-short.txt',
        }

    def poll_for_files(self, sid,
                       max_tries=MAX_TRIES,
                       time_delay=TIME_DELAY,
                       timeout=TIMEOUT):
        """Poll the management page until the result links appear.

        sid        -- session id returned by make_request
        max_tries  -- number of refresh requests before giving up
        time_delay -- seconds to sleep after each unsuccessful request
        timeout    -- currently unused; kept so existing callers that
                      pass it keep working

        Returns a dict mapping 'full_file', 'snps_file', 'log_file' and
        'short_file' to their URLs. Exits through stop_err once
        ``max_tries`` requests have been made without success.
        """
        payload = {
            '_ggi_project': 'PPHWeb2',
            '_ggi_origin': 'manage',
            '_ggi_target_manage': 'Refresh',
            'sid': sid,
        }
        for _ in range(max_tries):
            request = requests.post(submission_url, data=payload)
            url_dict = self._extract_result_urls(
                self.soupify(request.content))
            if url_dict:
                return url_dict
            time.sleep(time_delay)
        stop_err('Number of tries exceeded!')

    def save_to_files(self, url_dict, args):
        """Download every URL in ``url_dict`` to the path in ``args``.

        url_dict -- mapping of file key to URL (see poll_for_files)
        args     -- mapping of the same keys to destination paths

        Each file is streamed into a temporary directory and then moved
        to its destination. Responses with a status other than 200 are
        skipped. Always returns True.
        """
        tmp_dir = tempfile.mkdtemp()
        try:
            for key, value in url_dict.items():
                r = requests.get(value, stream=True)
                if r.status_code != 200:
                    continue
                path = os.path.join(tmp_dir, key)
                with open(path, 'wb') as f:
                    for chunk in r.iter_content(128):
                        f.write(chunk)
                shutil.move(path, args[key])
        finally:
            # Clean up even when a download or move fails part-way.
            shutil.rmtree(tmp_dir, ignore_errors=True)
        return True


def main(args):
    """Parse ``args``, run the submission and download the reports.

    ``args`` is the argument list without the program name. Exits through
    stop_err when no session id or no result links could be obtained;
    returns True otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-u',
                        '--ucscdb',
                        dest='ucscdb',
                        choices=UCSCDB,
                        required=True, type=str)
    parser.add_argument('-m', '--model',
                        dest='modelname', choices=MODELNAME,
                        required=True, type=str)
    parser.add_argument('-fl', '--filter',
                        '--snpfilter', dest='snpfilter',
                        choices=SNPFILTER.keys(),
                        required=True, type=str)
    # The value is opened as a path, so a path is always required here.
    parser.add_argument('-i', '--input',
                        dest='input',
                        required=True, type=str)
    parser.add_argument('-e', '--email',
                        dest='email',
                        required=False, default=None)
    parser.add_argument('--log', dest='log_file',
                        required=True, default=None, type=str)
    parser.add_argument('--short', dest='short_file',
                        required=True, default=None, type=str)
    parser.add_argument('--full', dest='full_file',
                        required=True, default=None, type=str)
    parser.add_argument('--snp', dest='snps_file',
                        required=True, default=None, type=str)
    parser.add_argument('--function', dest='snpfunction',
                        required=True, type=str)
    args_s = vars(parser.parse_args(args))
    polyphen2_web = Polyphen2Web(ucscdb=args_s['ucscdb'],
                                 model_name=args_s['modelname'],
                                 snp_filter=args_s['snpfilter'],
                                 snp_function=args_s['snpfunction'],
                                 file_location=args_s['input'],
                                 email=args_s['email'])
    sid = polyphen2_web.make_request()
    if not sid:
        stop_err(
            'Something went wrong! The tracking id could not be retrieved.')
    url_dict = polyphen2_web.poll_for_files(sid)
    if not url_dict:
        stop_err('There was error downloading the output files!')
    # The url_dict keys double as the argparse dest names of the outputs.
    locations = {key: args_s[key] for key in url_dict}
    polyphen2_web.save_to_files(url_dict, locations)
    return True


if __name__ == '__main__':
    main(sys.argv[1:])