jjohnson / data_manager_snpsift_dbnsfp — data_manager/data_manager_snpsift_dbnsfp.py @ 0:da5d5dc2e55c (draft, default, tip)
author: jjohnson
date: Wed, 09 Dec 2015 13:58:01 -0500
#!/usr/bin/env python
import sys
import os
import os.path
import re
import optparse
import urllib
import tarfile
import gzip
import json
import zipfile
import shutil
import pysam
from pysam import ctabix

"""
# Install dbNSFP databases from the dbNSFP site

# Download a dbNSFP database
$ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
# Uncompress
$ unzip dbNSFP2.4.zip
# Create a single-file version
$ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
# Compress using the block-gzip algorithm
$ bgzip dbNSFP2.4.txt
# Create a tabix index
$ tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz

data_table:
<table name="snpsift_dbnsfps" comment_char="#">
    <columns>key, build, name, value, annotations</columns>
    <file path="tool-data/snpsift_dbnsfps.loc" />
</table>

#id                 build   description         path                         annotations
#GRCh37_dbNSFP2.4   GRCh37  GRCh37 dbNSFP2.4    /depot/snpeff/dbNSFP2.4.gz   SIFT_pred,Uniprot_acc
#GRCh38_dbNSFP2.7   GRCh38  GRCh38 dbNSFP2.7    /depot/snpeff/dbNSFP2.7.gz   SIFT_pred,Uniprot_acc
"""

data_table = 'snpsift_dbnsfps'
softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
tokenize = re.compile(r'(\d+)|(\D+)').findall
dbNSFP_name_pat = 'dbNSFP(v|_light)?(\d*).*?'


def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit(1)


def get_nsfp_genome_version(name):
    # Infer the reference genome build from the dbNSFP/dbscSNV archive name:
    # dbscSNV releases are hg19; dbNSFP 3.x is hg38, 2.x is hg19, earlier is hg18.
    genome_version = 'hg19'
    dbNSFP_name_pat = '(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
    m = re.match(dbNSFP_name_pat, name)
    if m:
        (base, mid, ver) = m.groups()
        if base == 'dbscSNV':
            genome_version = 'hg19'
        else:
            genome_version = 'hg38' if ver == '3' else 'hg19' if ver == '2' else 'hg18'
    return genome_version


def get_annotations(gzip_path):
    # Read the header line of the bgzipped dbNSFP file and return the
    # annotation column names (everything after the chr, pos, ref, alt columns).
    annotations = None
    fh = None
    try:
        fh = gzip.open(gzip_path, 'r')
        buf = fh.read(10000)
        lines = buf.splitlines()
        headers = lines[0].split('\t')
        annotations = ','.join([x.strip() for x in headers[4:]])
    except Exception, e:
        stop_err('Error reading annotations %s : %s' % (gzip_path, e))
    finally:
        if fh:
            fh.close()
    return annotations


def tabix_file(input_fname, output_fname):
    # bgzip-compress the tabular file and build a tabix index on
    # chromosome (column 1) and position (column 2); column indices are 0-based.
    print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname)
    ctabix.tabix_compress(input_fname, output_fname, force=True)
    ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
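# Illustrative use of tabix_file (a sketch; the file names are hypothetical).
# It is the in-process equivalent of the bgzip/tabix shell commands shown in the
# module docstring above:
#   bgzip dbNSFP2.4.txt
#   tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
# would correspond to:
#   tabix_file('dbNSFP2.4.txt', 'dbNSFP2.4.txt.gz')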
def natural_sortkey(string):
    # Sort key that orders chr1, chr2, ..., chr10 numerically rather than lexically.
    return tuple(int(num) if num else alpha for num, alpha in tokenize(string))


def download_dbnsfp_database(url, output_file):
    # Download the dbNSFP zip archive and concatenate the per-chromosome files
    # into a single tab-separated file, keeping only the first header line.
    dbnsfp_tsv = None
    file_path = 'downloaded_file'
    urllib.urlretrieve(url, file_path)
    if zipfile.is_zipfile(file_path):
        dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
        wtr = open(dbnsfp_tsv, 'w')
        my_zip = zipfile.ZipFile(file_path, 'r')
        allfiles = [info.filename for info in my_zip.infolist()]
        files = [f for f in allfiles if re.match(dbNSFP_file_pat, f)]
        files = sorted(files, key=natural_sortkey)
        for j, file in enumerate(files):
            fh = my_zip.open(file, 'rU')
            for i, line in enumerate(fh):
                # Skip the header line of every file after the first.
                if j > 0 and i == 0:
                    continue
                wtr.write(line)
        wtr.close()
    return dbnsfp_tsv


def main():
    # Parse Command Line
    parser = optparse.OptionParser()
    parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None,
                      help='dbkey genome version')
    parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None,
                      help='A name for a history snpsiftdbnsfp dataset')
    parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None,
                      help='A name for a softgenetics dbNSFP file')
    parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None,
                      help='A history snpsiftdbnsfp dataset')
    parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None,
                      help='A history dbnsfp_tabular dataset')
    (options, args) = parser.parse_args()

    filename = args[0]
    params = json.loads(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.exists(target_directory):
        os.mkdir(target_directory)
    data_manager_dict = {}

    genome_version = options.dbkey if options.dbkey else 'unknown'
    dbnsfp_tsv = None
    db_name = None
    bgzip_name = None
    bzip_path = None
    if options.softgenetics:
        # Fetch a dbNSFP zip from the softgenetics FTP site and build the tabular file.
        dbnsfp_url = softgenetics_url + options.softgenetics
        db_name = options.db_name if options.db_name else re.sub('\.zip$', '', options.softgenetics)
        genome_version = get_nsfp_genome_version(options.softgenetics)
        tsv = db_name + '.tsv'
        dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
    elif options.dbnsfp_tabular:
        # Use an existing tabular dbNSFP dataset from the history.
        db_name = options.db_name
        dbnsfp_tsv = options.dbnsfp_tabular
    elif options.snpsiftdbnsfp:
        # Copy an already bgzipped and tabix-indexed snpsiftdbnsfp dataset into place.
        (dirpath, bgzip_name) = os.path.split(options.snpsiftdbnsfp)
        idxpath = options.snpsiftdbnsfp + '.tbi'
        shutil.copy(options.snpsiftdbnsfp, target_directory)
        shutil.copy(idxpath, target_directory)
        bzip_path = os.path.join(target_directory, bgzip_name)
        db_name = re.sub('(\.txt)?\.gz$', '', bgzip_name)
    else:
        stop_err('One of --softgenetics, --dbnsfp_tabular, or --snpsiftdbnsfp is required')
    if dbnsfp_tsv:
        # Compress and index the tabular file into the data manager's extra files path.
        bgzip_name = '%s.txt.gz' % db_name
        bzip_path = os.path.join(target_directory, bgzip_name)
        tabix_file(dbnsfp_tsv, bzip_path)
    annotations = get_annotations(bzip_path)

    # Create the SnpSift dbNSFP Reference Data
    data_table_entry = dict(key='%s_%s' % (genome_version, db_name),
                            build=genome_version,
                            name='%s %s' % (genome_version, db_name),
                            value=bgzip_name,
                            annotations=annotations)
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
    data_manager_dict['data_tables'][data_table].append(data_table_entry)

    # Save info to the JSON file
    open(filename, 'wb').write(json.dumps(data_manager_dict))


if __name__ == "__main__":
    main()
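# Example invocation (a sketch; the option values and the Galaxy-supplied JSON
# parameters file below are illustrative, not taken from a real run):
#   python data_manager_snpsift_dbnsfp.py --softgenetics dbNSFPv2.4.zip params.json
# where params.json minimally contains the output dataset's extra files path, e.g.
#   {"output_data": [{"extra_files_path": "/tmp/dbnsfp_extra_files"}]}
# The script writes the bgzipped database and its .tbi index into that directory
# and overwrites params.json with the new snpsift_dbnsfps data table entry.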