# HG changeset patch
# User jjohnson
# Date 1449687481 18000
# Node ID da5d5dc2e55cc2a79aa3448ba0e4605e1bfcce28
Uploaded
diff -r 000000000000 -r da5d5dc2e55c data_manager/data_manager_snpsift_dbnsfp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_snpsift_dbnsfp.py Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,170 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import re
+import optparse
+import urllib
+import tarfile
+import gzip
+import json
+import pysam
+from pysam import ctabix
+import zipfile
+import os.path
+import shutil
+
+"""
+# Install dbNSFP databases
+# from DbNsfp site
+ # Download dbNSFP database
+ $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
+ # Uncompress
+ $ unzip dbNSFPv2.4.zip
+ # Create a single file version
+ $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
+ # Compress using block-gzip algorithm
+ $ bgzip dbNSFP2.4.txt
+ # Create tabix index
+ $ tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
+
+data_table:
+
+
+ key, build, name, value, annotations
+
+
+
+#id build description path annotations
+#GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.gz SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP2.7 GRCh38 GRCh38 dbNSFP2.7 /depot/snpeff/dbNSFP2.7.gz SIFT_pred,Uniprot_acc
+
+"""
+
+
+# Name of the data table that receives the new entry.
+data_table = 'snpsift_dbnsfps'
+# Base FTP URL; the requested archive file name is appended to it.
+softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
+# Archive members that hold per-chromosome tables, e.g.
+# "dbNSFP2.4_variant.chr1".  The dot before "chr" is escaped so that it
+# only matches a literal dot.
+dbNSFP_file_pat = r'(dbNSFP(.*)_variant|dbscSNV(.*))\.chr(.*)'
+# Splits a string into (digits, non-digits) pairs; used for natural sorting.
+tokenize = re.compile(r'(\d+)|(\D+)').findall
+# Release-name pattern; a raw string so "\d" is not an invalid str escape.
+dbNSFP_name_pat = r'dbNSFP(v|_light)?(\d*).*?'
+
+
+def stop_err(msg):
+    """Write ``msg`` to standard error and terminate with exit status 1."""
+    sys.stderr.write(msg)
+    raise SystemExit(1)
+
+def get_nsfp_genome_version(name):
+    """Guess the genome build from a dbNSFP/dbscSNV file name.
+
+    The major version is the first run of digits that follows the
+    ``dbNSFP``, ``dbNSFPv`` or ``dbNSFP_light`` prefix.
+
+    :param name: file name such as ``dbNSFPv3.0c.zip``
+    :returns: ``'hg38'`` for dbNSFP major version 3 or later, ``'hg19'`` for
+        major version 2, for dbscSNV files and for names that do not match,
+        and ``'hg18'`` for every other dbNSFP name
+    """
+    m = re.match(r'(dbscSNV|dbNSFP(v|_light)?)(\d*)', name)
+    if not m:
+        return 'hg19'
+    base, _mid, ver = m.groups()
+    if base == 'dbscSNV':
+        return 'hg19'
+    major = int(ver) if ver else 0
+    if major >= 3:
+        # Compare numerically: an exact match on '3' would classify later
+        # releases such as dbNSFP4.1a as 'hg18'.
+        return 'hg38'
+    return 'hg19' if major == 2 else 'hg18'
+
+def get_annotations(gzip_path):
+    """Return the annotation column names of a gzip-compressed table.
+
+    The first line is treated as a tab-separated header.  Its first four
+    columns are skipped; the remaining names are stripped and joined with
+    commas.
+
+    :param gzip_path: path of the gzip-compressed tabular file
+    :returns: comma-separated annotation names
+
+    Any failure (unreadable path, empty file) is reported through
+    ``stop_err``, which exits the process.
+    """
+    annotations = None
+    fh = None
+    try:
+        fh = gzip.open(gzip_path, 'r')
+        # Read the whole header line; a fixed 10000-byte buffer silently
+        # truncates the column list when the header is longer than that.
+        header = fh.readline()
+        if not isinstance(header, str):
+            # Python 3 yields bytes here; Python 2 already yields str.
+            header = header.decode('utf-8')
+        if not header:
+            raise ValueError('file is empty')
+        headers = header.rstrip('\r\n').split('\t')
+        annotations = ','.join([x.strip() for x in headers[4:]])
+    except Exception as e:
+        stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
+    finally:
+        if fh is not None:
+            fh.close()
+    return annotations
+
+
+def tabix_file(input_fname, output_fname):
+    """Compress a tabular file and build its tabix index.
+
+    A progress line is written to standard output first.  The input is then
+    compressed to ``output_fname`` (overwriting an existing file) and
+    indexed on its first column as sequence name and its second column as
+    both start and end position.
+
+    :param input_fname: path of the uncompressed tabular file
+    :param output_fname: path of the compressed file to create
+    """
+    # sys.stdout.write emits the same text as the Python-2-only
+    # ``print >> sys.stdout`` form and also works on Python 3.
+    sys.stdout.write("tabix_file: %s -> %s\n" % (input_fname, output_fname))
+    ctabix.tabix_compress(input_fname, output_fname, force=True)
+    # Column indices are 0-based.
+    ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
+
+
+def natural_sortkey(string):
+    """Build a sort key that orders embedded numbers by value.
+
+    ``string`` is split by the module-level ``tokenize`` into digit and
+    non-digit runs; digit runs become ``int`` and other runs stay ``str``.
+
+    :returns: tuple of the converted runs, in order of appearance
+    """
+    parts = []
+    for digits, text in tokenize(string):
+        parts.append(int(digits) if digits else text)
+    return tuple(parts)
+
+
+def download_dbnsfp_database(url, output_file):
+    """Download a zip archive and merge its per-chromosome tables.
+
+    The archive is saved as ``downloaded_file`` in the working directory.
+    Members whose names match ``dbNSFP_file_pat`` are concatenated in
+    natural sort order; the first line of every member after the first is
+    skipped so the merged file keeps a single header line.
+
+    :param url: location of the archive
+    :param output_file: path of the merged table; ``'dbnsfp_tsv'`` is used
+        when this is empty
+    :returns: path of the merged table, or ``None`` when the downloaded
+        file is not a zip archive
+    """
+    dbnsfp_tsv = None
+    file_path = 'downloaded_file'
+    urllib.urlretrieve(url, file_path)
+    if not zipfile.is_zipfile(file_path):
+        return dbnsfp_tsv
+    dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
+    my_zip = zipfile.ZipFile(file_path, 'r')
+    try:
+        members = [f for f in my_zip.namelist() if re.match(dbNSFP_file_pat, f)]
+        members.sort(key=natural_sortkey)
+        # Close the writer before returning so the caller reads a fully
+        # flushed file.
+        with open(dbnsfp_tsv, 'w') as wtr:
+            for j, member in enumerate(members):
+                fh = my_zip.open(member, 'rU')
+                try:
+                    for i, line in enumerate(fh):
+                        # Keep the header line of the first member only.
+                        if j > 0 and i == 0:
+                            continue
+                        wtr.write(line)
+                finally:
+                    fh.close()
+    finally:
+        my_zip.close()
+    return dbnsfp_tsv
+
+
+def main():
+    """Install one dbNSFP database and record it in the data table JSON.
+
+    The single positional argument is a JSON file.  It is read to find
+    ``output_data[0]['extra_files_path']`` (created if missing) and is then
+    overwritten with the new ``snpsift_dbnsfps`` data table entry.
+
+    The database comes from exactly one of:
+
+    * ``--softgenetics``: archive name downloaded from ``softgenetics_url``
+      and merged into one table; the build is derived from the name.
+    * ``--dbnsfp_tabular``: an existing tabular file.
+    * ``--snpsiftdbnsfp``: an existing compressed file whose ``.tbi`` index
+      sits next to it; both are copied into the target directory.
+
+    Tabular input is compressed and indexed into ``<db_name>.txt.gz`` in the
+    target directory.  Missing or unusable input ends the process through
+    ``stop_err``.
+    """
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
+    parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
+    parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
+    parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
+    parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
+    (options, args) = parser.parse_args()
+
+    if not args:
+        stop_err('Missing path of the JSON parameter file\n')
+    filename = args[0]
+    with open(filename) as params_fh:
+        params = json.load(params_fh)
+    target_directory = params['output_data'][0]['extra_files_path']
+    if not os.path.exists(target_directory):
+        os.mkdir(target_directory)
+    data_manager_dict = {}
+    genome_version = options.dbkey if options.dbkey else 'unknown'
+    dbnsfp_tsv = None
+    db_name = None
+    bgzip_name = None
+    bzip_path = None
+    if options.softgenetics:
+        dbnsfp_url = softgenetics_url + options.softgenetics
+        db_name = options.db_name if options.db_name else re.sub(r'\.zip$', '', options.softgenetics)
+        genome_version = get_nsfp_genome_version(options.softgenetics)
+        tsv = db_name + '.tsv'
+        dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
+        if not dbnsfp_tsv:
+            # The download helper returns None for anything but a zip file.
+            stop_err('Downloaded file %s is not a zip archive\n' % options.softgenetics)
+    elif options.dbnsfp_tabular:
+        # Fall back to the input file's base name so the entry is never
+        # called "None".
+        if options.db_name:
+            db_name = options.db_name
+        else:
+            db_name = os.path.splitext(os.path.basename(options.dbnsfp_tabular))[0]
+        dbnsfp_tsv = options.dbnsfp_tabular
+    elif options.snpsiftdbnsfp:
+        bgzip_name = os.path.basename(options.snpsiftdbnsfp)
+        idxpath = options.snpsiftdbnsfp + '.tbi'
+        shutil.copy(options.snpsiftdbnsfp, target_directory)
+        shutil.copy(idxpath, target_directory)
+        bzip_path = os.path.join(target_directory, bgzip_name)
+        # Escaped dots: strip only a literal ".gz" or ".txt.gz" suffix.
+        db_name = re.sub(r'(\.txt)?\.gz$', '', bgzip_name)
+    else:
+        stop_err('One of --softgenetics, --dbnsfp_tabular or --snpsiftdbnsfp is required\n')
+    if dbnsfp_tsv:
+        bgzip_name = '%s.txt.gz' % db_name
+        bzip_path = os.path.join(target_directory, bgzip_name)
+        tabix_file(dbnsfp_tsv, bzip_path)
+    annotations = get_annotations(bzip_path)
+    # Create the SnpSift dbNSFP Reference Data
+    data_table_entry = dict(key='%s_%s' % (genome_version, db_name), build=genome_version, name='%s %s' % (genome_version, db_name), value=bgzip_name, annotations=annotations)
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+    data_manager_dict['data_tables'][data_table].append(data_table_entry)
+
+    # save info to json file
+    with open(filename, 'w') as out_fh:
+        out_fh.write(json.dumps(data_manager_dict))
+
+# Run the data manager only when this file is executed as a script.
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r da5d5dc2e55c data_manager/data_manager_snpsift_dbnsfp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_snpsift_dbnsfp.xml Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,82 @@
+
+ Install a dbNSFP variant annotation database
+
+ pysam
+
+
+ #import re
+ data_manager_snpsift_dbnsfp.py
+ #if $db.src == 'softgenetics':
+ --softgenetics $db.softgenetics_name
+ #elif $db.src == 'history':
+ #if $db.snpsiftdbnsfp.ext == 'snpsiftdbnsfp':
+ #import os.path
+ --snpsiftdbnsfp $os.path.join($db.snpsiftdbnsfp.extra_files_path, $db.snpsiftdbnsfp.metadata.bgzip)
+ #else
+ --dbnsfp_tabular $db.snpsiftdbnsfp
+ #end if
+ --db_name $db.db_name
+ #if str($db.dbkey).strip() != '':
+ --dbkey "$db.dbkey"
+ #elif str($db.snpsiftdbnsfp.metadata.dbkey) != '?':
+ --dbkey "$db.snpsiftdbnsfp.metadata.dbkey"
+ #end if
+ #end if
+ "$out_file"
+
+
+
+
+
+
+
+
+
+ Download From: ftp://dbnsfp.softgenetics.com/
+ Enter the name of the database, e.g.: dbNSFPv3.0c.zip
+
+ (dbNSFP|dbscSNV).*[.]zip
+
+
+
+
+
+
+ ^\S*$
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This tool installs dbNSFP_ databases to annotate VCF files using SnpSift_dbNSFP_.
+It populates the data table: snpsift_dbnsfps.
+
+.. _dbNSFP: https://sites.google.com/site/jpopgen/dbNSFP
+.. _SnpSift_dbNSFP: http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
+
+Please cite:
+"A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3.", Cingolani P, Platts A, Wang le L, Coon M, Nguyen T, Wang L, Land SJ, Lu X, Ruden DM. Fly (Austin). 2012 Apr-Jun;6(2):80-92. PMID: 22728672 [PubMed - in process]
+
+
+
+
diff -r 000000000000 -r da5d5dc2e55c data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,22 @@
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r da5d5dc2e55c test-data/test_nsfp.data_manager_json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_nsfp.data_manager_json Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,1 @@
+{"data_tables": {"snpsift_dbnsfp": [{"value": "test_nsfp_tsv.txt.gz", "name": "? test_nsfp_tsv", "build": "?", "dbkey": "?_test_nsfp_tsv", "annotations": "hg18_pos(1-coor), genename, SIFT_score, SIFT_pred, Polyphen2_HDIV_score"}]}}
\ No newline at end of file
diff -r 000000000000 -r da5d5dc2e55c test-data/test_nsfp.tsv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_nsfp.tsv Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,7 @@
+#chr pos(1-coor) ref alt hg18_pos(1-coor) genename SIFT_score SIFT_pred Polyphen2_HDIV_score
+1 69134 A C 58997 OR4F5 0.03 D 0.043
+1 69134 A G 58997 OR4F5 0.09 T 0.0
+1 69134 A T 58997 OR4F5 0.03 D 0.308
+4 100239319 T A 100458342 ADH1B 0 D 0.021
+4 100239319 T C 100458342 ADH1B 0.15 T 0.0
+4 100239319 T G 100458342 ADH1B 0 D 0.0
diff -r 000000000000 -r da5d5dc2e55c tool-data/snpsift_dbnsfps.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpsift_dbnsfps.loc.sample Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,3 @@
+#key build description path annotations
+#GRCh37_dbNSFP2.4 GRCh37 GRCh37 dbNSFP2.4 /depot/snpeff/dbNSFP2.4.txt.gz SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP3.1c GRCh38 GRCh38 dbNSFP3.1c /depot/snpeff/dbNSFP3.1c.txt.gz SIFT_pred,Uniprot_acc
diff -r 000000000000 -r da5d5dc2e55c tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,7 @@
+
+
+ key, build, name, value, annotations
+
+
+
+
diff -r 000000000000 -r da5d5dc2e55c tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Wed Dec 09 13:58:01 2015 -0500
@@ -0,0 +1,6 @@
+
+
+
+
+
+