data_manager_snpsift_dbnsfp: data_manager/data_manager_snpsift

comparison data_manager/data_manager_snpsift_dbnsfp.py @ 0:da5d5dc2e55c draft default tip

Uploaded

author	jjohnson
date	Wed, 09 Dec 2015 13:58:01 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:da5d5dc2e55c
+#!/usr/bin/env python
+import sys
+import os
+import re
+import optparse
+import urllib
+import tarfile
+import gzip
+import json
+import pysam
+from pysam import ctabix
+import zipfile
+import os.path
+import shutil
+"""
+# Install dbNSFP databases
+# from DbNsfp site
+# Download dbNSFP database
+$ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
+# Uncompress
+$ unzip dbNSFP2.4.zip
+# Create a single file version
+$ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
+# Compress using block-gzip algorithm
+bgzip dbNSFP2.4.txt
+# Create tabix index
+tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
+data_table:
+<table name="snpsift_dbnsfps" comment_char="#">
+<columns>key, build, name, value, annotations</columns>
+<file path="tool-data/snpsift_dbnsfps.loc" />
+</table>
+#id     build   description     path    annotations
+#GRCh37_dbNSFP2.4       GRCh37  GRCh37 dbNSFP2.4        /depot/snpeff/dbNSFP2.4.gz  SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP2.7       GRCh38  GRCh38 dbNSFP2.7        /depot/snpeff/dbNSFP2.7.gz  SIFT_pred,Uniprot_acc
+"""
+data_table = 'snpsift_dbnsfps'
+softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
+dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
+tokenize = re.compile(r'(\d+)|(\D+)').findall
+dbNSFP_name_pat = 'dbNSFP(v|_light)?(\d*).*?'
+def stop_err(msg):
+sys.stderr.write(msg)
+sys.exit(1)
+def get_nsfp_genome_version(name):
+genome_version = 'hg19'
+dbNSFP_name_pat = '(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
+m = re.match(dbNSFP_name_pat,name)
+if m:
+(base,mid,ver) = m.groups()
+if base == 'dbscSNV':
+genome_version = 'hg19'
+else:
+genome_version = 'hg38' if ver == '3' else 'hg19' if ver == '2' else 'hg18'
+return genome_version
+def get_annotations(gzip_path):
+annotations = None
+fh = None
+try:
+fh = gzip.open(gzip_path, 'r')
+buf = fh.read(10000)
+lines = buf.splitlines()
+headers = lines[0].split('\t')
+annotations = ','.join([x.strip() for x in headers[4:]])
+except Exception, e:
+stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
+finally:
+if fh:
+fh.close()
+return annotations
+def tabix_file(input_fname, output_fname):
+print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname)
+ctabix.tabix_compress(input_fname, output_fname, force=True)
+# Column indices are 0-based.
+ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
+def natural_sortkey(string):
+return tuple(int(num) if num else alpha for num, alpha in tokenize(string))
+def download_dbnsfp_database(url, output_file):
+dbnsfp_tsv = None
+file_path = 'downloaded_file'
+urllib.urlretrieve(url, file_path)
+if zipfile.is_zipfile(file_path):
+dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
+wtr = open(dbnsfp_tsv, 'w')
+my_zip = zipfile.ZipFile(file_path, 'r')
+allfiles = [info.filename for info in my_zip.infolist()]
+files = [f for f in allfiles if re.match(dbNSFP_file_pat, f)]
+files = sorted(files, key=natural_sortkey)
+for j, file in enumerate(files):
+fh = my_zip.open(file, 'rU')
+for i, line in enumerate(fh):
+if j > 0 and i == 0:
+continue
+wtr.write(line)
+return dbnsfp_tsv
+def main():
+# Parse Command Line
+parser = optparse.OptionParser()
+parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
+parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
+parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
+parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
+parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
+(options, args) = parser.parse_args()
+filename = args[0]
+params = json.loads(open(filename).read())
+target_directory = params['output_data'][0]['extra_files_path']
+if not os.path.exists(target_directory):
+os.mkdir(target_directory)
+data_manager_dict = {}
+genome_version = options.dbkey if options.dbkey else 'unknown'
+dbnsfp_tsv = None
+db_name = None
+bzip_name = None
+bzip_path = None
+if options.softgenetics:
+dbnsfp_url = softgenetics_url + options.softgenetics
+db_name = options.db_name if options.db_name else re.sub('\.zip$', '', options.softgenetics)
+genome_version = get_nsfp_genome_version(options.softgenetics)
+tsv = db_name + '.tsv'
+dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
+elif options.dbnsfp_tabular:
+db_name = options.db_name
+dbnsfp_tsv = options.dbnsfp_tabular
+elif options.snpsiftdbnsfp:
+(dirpath,bgzip_name) = os.path.split(options.snpsiftdbnsfp)
+idxpath = options.snpsiftdbnsfp + '.tbi'
+shutil.copy(options.snpsiftdbnsfp,target_directory)
+shutil.copy(idxpath,target_directory)
+bzip_path = os.path.join(target_directory, bgzip_name)
+db_name = re.sub('(.txt)?.gz$','',bgzip_name)
+else:
+stop_err('Either --softgenetics or --dbnsfp_tabular required')
+if dbnsfp_tsv:
+bgzip_name = '%s.txt.gz' % db_name
+bzip_path = os.path.join(target_directory, bgzip_name)
+tabix_file(dbnsfp_tsv,bzip_path)
+annotations = get_annotations(bzip_path)
+# Create the SnpSift dbNSFP Reference Data
+data_table_entry = dict(key='%s_%s' % (genome_version, db_name), build=genome_version, name='%s %s' % (genome_version, db_name), value=bgzip_name, annotations=annotations)
+data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+data_manager_dict['data_tables'][data_table].append(data_table_entry)
+# save info to json file
+open(filename, 'wb').write(json.dumps(data_manager_dict))
+if __name__ == "__main__":
+main()

Mercurial > repos > jjohnson > data_manager_snpsift_dbnsfp

comparison data_manager/data_manager_snpsift_dbnsfp.py @ 0:da5d5dc2e55c draft default tip