Mercurial > repos > iuc > data_manager_snpsift_dbnsfp

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_snpsift_dbnsfp.py	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,167 @@
+#!/usr/bin/env python
+
+import gzip
+import json
+import optparse
+import os
+import os.path
+import re
+import shutil
+import sys
+import urllib
+import zipfile
+
+from pysam import ctabix
+
+"""
+# Install dbNSFP databases
+# from DbNsfp site
+  # Download dbNSFP database
+    $ wget ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/dbNSFPv2.4.zip
+  # Uncompress
+    $ unzip dbNSFPv2.4.zip
+  # Create a single file version
+    $ (head -n 1 dbNSFP2.4_variant.chr1 ; cat dbNSFP2.4_variant.chr* | grep -v "^#") > dbNSFP2.4.txt
+  # Compress using block-gzip algorithm
+    $ bgzip dbNSFP2.4.txt
+  # Create tabix index
+    $ tabix -s 1 -b 2 -e 2 dbNSFP2.4.txt.gz
+
+data_table:
+
+    <table name="snpsift_dbnsfps" comment_char="#">
+        <columns>key, build, name, value, annotations</columns>
+        <file path="tool-data/snpsift_dbnsfps.loc" />
+    </table>
+
+#key    build   description     path    annotations
+#GRCh37_dbNSFP2.4       GRCh37  GRCh37 dbNSFP2.4        /depot/snpeff/dbNSFP2.4.txt.gz  SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP3.1c      GRCh38  GRCh38 dbNSFP3.1c       /depot/snpeff/dbNSFP3.1c.txt.gz SIFT_pred,Uniprot_acc
+"""
+
+data_table = 'snpsift_dbnsfps'
+softgenetics_url = 'ftp://dbnsfp:dbnsfp@dbnsfp.softgenetics.com/'
+dbNSFP_file_pat = '(dbNSFP(.*)_variant|dbscSNV(.*)).chr(.*)'
+tokenize = re.compile(r'(\d+)|(\D+)').findall
+dbNSFP_name_pat = 'dbNSFP(v|_light)?(\d*).*?'
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit(1)
+
+
+def get_nsfp_genome_version(name):
+    genome_version = 'hg19'
+    dbNSFP_name_pat = '(dbscSNV|dbNSFP(v|_light)?)(\d*).*?'
+    m = re.match(dbNSFP_name_pat, name)
+    if m:
+        (base, mid, ver) = m.groups()
+        if base == 'dbscSNV':
+            genome_version = 'hg19'
+        else:
+            genome_version = 'hg38' if ver == '3' else 'hg19' if ver == '2' else 'hg18'
+    return genome_version
+
+
+def get_annotations(gzip_path):
+    annotations = None
+    fh = None
+    try:
+        fh = gzip.open(gzip_path, 'r')
+        buf = fh.read(10000)
+        lines = buf.splitlines()
+        headers = lines[0].split('\t')
+        annotations = ','.join([x.strip() for x in headers[4:]])
+    except Exception as e:
+        stop_err('Error Reading annotations %s : %s' % (gzip_path, e))
+    finally:
+        if fh:
+            fh.close()
+    return annotations
+
+
+def tabix_file(input_fname, output_fname):
+    print >> sys.stdout, "tabix_file: %s -> %s" % (input_fname, output_fname)
+    ctabix.tabix_compress(input_fname, output_fname, force=True)
+    # Column indices are 0-based.
+    ctabix.tabix_index(output_fname, seq_col=0, start_col=1, end_col=1)
+
+
+def natural_sortkey(string):
+    return tuple(int(num) if num else alpha for num, alpha in tokenize(string))
+
+
+def download_dbnsfp_database(url, output_file):
+    dbnsfp_tsv = None
+    file_path = 'downloaded_file'
+    urllib.urlretrieve(url, file_path)
+    with zipfile.ZipFile(file_path, 'r') as my_zip:
+        dbnsfp_tsv = output_file if output_file else 'dbnsfp_tsv'
+        wtr = open(dbnsfp_tsv, 'w')
+        allfiles = [info.filename for info in my_zip.infolist()]
+        files = [f for f in allfiles if re.match(dbNSFP_file_pat, f)]
+        files = sorted(files, key=natural_sortkey)
+        for j, file in enumerate(files):
+            fh = my_zip.open(file, 'rU')
+            for i, line in enumerate(fh):
+                if j > 0 and i == 0:
+                    continue
+                wtr.write(line)
+    return dbnsfp_tsv
+
+
+def main():
+    # Parse Command Line
+    parser = optparse.OptionParser()
+    parser.add_option('-g', '--dbkey', dest='dbkey', action='store', type="string", default=None, help='dbkey genome version')
+    parser.add_option('-n', '--db_name', dest='db_name', action='store', type="string", default=None, help='A name for a history snpsiftdbnsfp dataset')
+    parser.add_option('-s', '--softgenetics', dest='softgenetics', action='store', type="string", default=None, help='A name for softgenetics dbNSFP file')
+    parser.add_option('-H', '--snpsiftdbnsfp', dest='snpsiftdbnsfp', action='store', type="string", default=None, help='A history snpsiftdbnsfp dataset')
+    parser.add_option('-T', '--dbnsfp_tabular', dest='dbnsfp_tabular', action='store', type="string", default=None, help='A history dbnsfp_tabular dataset')
+    (options, args) = parser.parse_args()
+
+    filename = args[0]
+    params = json.loads(open(filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    if not os.path.exists(target_directory):
+        os.mkdir(target_directory)
+    data_manager_dict = {}
+    genome_version = options.dbkey if options.dbkey else 'unknown'
+    dbnsfp_tsv = None
+    db_name = None
+    bzip_path = None
+    if options.softgenetics:
+        dbnsfp_url = softgenetics_url + options.softgenetics
+        db_name = options.db_name if options.db_name else re.sub('\.zip$', '', options.softgenetics)
+        genome_version = get_nsfp_genome_version(options.softgenetics)
+        tsv = db_name + '.tsv'
+        dbnsfp_tsv = download_dbnsfp_database(dbnsfp_url, tsv)
+    elif options.dbnsfp_tabular:
+        db_name = options.db_name
+        dbnsfp_tsv = options.dbnsfp_tabular
+    elif options.snpsiftdbnsfp:
+        (dirpath, bgzip_name) = os.path.split(options.snpsiftdbnsfp)
+        idxpath = options.snpsiftdbnsfp + '.tbi'
+        shutil.copy(options.snpsiftdbnsfp, target_directory)
+        shutil.copy(idxpath, target_directory)
+        bzip_path = os.path.join(target_directory, bgzip_name)
+        db_name = re.sub('(.txt)?.gz$', '', bgzip_name)
+    else:
+        stop_err('Either --softgenetics or --dbnsfp_tabular required')
+    if dbnsfp_tsv:
+        bgzip_name = '%s.txt.gz' % db_name
+        bzip_path = os.path.join(target_directory, bgzip_name)
+        tabix_file(dbnsfp_tsv, bzip_path)
+    annotations = get_annotations(bzip_path)
+    # Create the SnpSift dbNSFP Reference Data
+    data_table_entry = dict(key='%s_%s' % (genome_version, db_name), build=genome_version, name='%s %s' % (genome_version, db_name), value=bgzip_name, annotations=annotations)
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table] = data_manager_dict['data_tables'].get(data_table, [])
+    data_manager_dict['data_tables'][data_table].append(data_table_entry)
+
+    # save info to json file
+    open(filename, 'wb').write(json.dumps(data_manager_dict))
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_snpsift_dbnsfp.xml	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,89 @@
+<tool id="data_manager_snpsift_dbnsfp" name="SnpSift dbNSFP" version="4.1.0" tool_type="manage_data">
+    <description>Install a dbNSFP variant annotation database</description>
+    <requirements>
+        <requirement type="package" version="0.7.7">pysam</requirement>
+    </requirements>
+    <stdio>
+        <exit_code range=":-1"  level="fatal"   description="Error: Cannot open file" />
+        <exit_code range="1:"  level="fatal"   description="Error" />
+    </stdio>
+    <command interpreter="python">
+        #import re
+        data_manager_snpsift_dbnsfp.py
+        #if $db.src == 'softgenetics':
+            --softgenetics "$db.softgenetics_name"
+        #elif $db.src == 'history':
+            #if $db.snpsiftdbnsfp.ext == 'snpsiftdbnsfp':
+                #import os.path
+                --snpsiftdbnsfp "$os.path.join($db.snpsiftdbnsfp.extra_files_path, $db.snpsiftdbnsfp.metadata.bgzip)"
+            #else
+                --dbnsfp_tabular "$db.snpsiftdbnsfp"
+            #end if
+            --db_name "$db.db_name"
+            #if str($db.dbkey).strip() != '':
+                --dbkey "$db.dbkey"
+            #elif str($db.snpsiftdbnsfp.metadata.dbkey) != '?':
+                --dbkey "$db.snpsiftdbnsfp.metadata.dbkey"
+            #end if
+        #end if
+        "$out_file"
+        </command>
+    <inputs>
+        <conditional name="db">
+            <param name="src" type="select" label="Source for dbNSFP file">
+                <option value="softgenetics">Jpopgen dbNSFP from softgenetics</option>
+                <option value="history">from your history</option>
+            </param>
+            <when value="softgenetics">
+                <param name="softgenetics_name" type="text" value="" label="dbNSFP file name at softgenetics ftp site">
+                  <help>Download From:  ftp://dbnsfp.softgenetics.com/
+                        Enter the name of the database, e.g.:  dbNSFPv3.0c.zip
+                  </help>
+                  <validator type="regex"  message="A dbNSFP or dbscSNV .zip">(dbNSFP|dbscSNV).*[.]zip</validator>
+                </param>
+            </when>
+            <when value="history">
+                <param name="snpsiftdbnsfp" type="data" format="snpsiftdbnsfp,dbnsfp.tabular" label="A snpsift dbnsfp from your history"
+                 help="This can be generated by converting a tabular file set to type: dbnsfp.tabular"/>
+                <param name="db_name" type="text" value="" label="The unique name to give this dbnsfp database">
+                  <validator type="length" min="3" max="20" message="Must have between 3 and 20 characters"/>
+                  <validator type="regex" message="No whitespace allowed">^\S*$</validator>
+                </param>
+                <param name="dbkey" type="text" value="hg19" optional="true" label="DBKEY to assign to this dbNSFP database" />
+            </when>
+        </conditional>
+    </inputs>
+
+    <outputs>
+           <data name="out_file" format="data_manager_json" label="${tool.name}"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="src" value="history"/>
+            <param name="snpsiftdbnsfp" value="test_nsfp.tsv" ftype="dbnsfp.tabular"/>
+            <param name="dbkey" value="hg19"/>
+            <param name="db_name" value="test_nsfp_tsv" />
+            <output name="out_file" file="test_nsfp.data_manager_json"/>
+        </test>
+    </tests>
+    <help>
+
+This tool installs dbNSFP_ databases to annotate VCF files using SnpSift_dbNSFP_.
+It populates data table: snpsift_dbnsfps
+
+.. _dbNSFP: https://sites.google.com/site/jpopgen/dbNSFP
+.. _SnpSift_dbNSFP: http://snpeff.sourceforge.net/SnpSift.html#dbNSFP
+
+Please refer to https://sites.google.com/site/jpopgen/dbNSFP for which citations to use with specific dbNSFP database versions.
+
+    </help>
+    <citations>
+        <citation type="doi">10.1002/humu.21517</citation>
+        <citation type="doi">10.1002/humu.22376</citation>
+        <citation type="doi">10.1002/humu.22932</citation>
+        <citation type="doi">10.1093/hmg/ddu733</citation>
+        <citation type="doi">10.1093/nar/gku1206</citation>
+        <citation type="doi">10.3389/fgene.2012.00035</citation>
+    </citations>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<data_managers>
+  <data_manager tool_file="data_manager/data_manager_snpsift_dbnsfp.xml" id="data_manager_snpsift_dbnsfp" >
+    <data_table name="snpsift_dbnsfps">  <!-- Defines a Data Table to be modified. -->
+      <output> <!-- Handle the output of the Data Manager Tool -->
+        <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="build" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="value" output_ref="out_file" >
+          <move type="directory" relativize_symlinks="True">
+            <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpSift/v4_1/dbnsfp</target>
+          </move>
+          <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpSift/v4_1/dbnsfp/${value}</value_translation>
+          <value_translation type="function">abspath</value_translation>
+        </column>
+        <column name="annotations" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+      </output>
+    </data_table>
+  </data_manager>
+</data_managers>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_nsfp.data_manager_json	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,1 @@
+{"data_tables": {"snpsift_dbnsfps": [{"value": "test_nsfp_tsv.txt.gz", "name": "hg19 test_nsfp_tsv", "build": "hg19", "key": "hg19_test_nsfp_tsv", "annotations": "hg18_pos(1-coor),genename,SIFT_score,SIFT_pred,Polyphen2_HDIV_score"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_nsfp.tsv	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,7 @@
+#chr	pos(1-coor)	ref	alt	hg18_pos(1-coor)	genename	SIFT_score	SIFT_pred	Polyphen2_HDIV_score
+1	69134	A	C	58997	OR4F5	0.03	D	0.043
+1	69134	A	G	58997	OR4F5	0.09	T	0.0
+1	69134	A	T	58997	OR4F5	0.03	D	0.308
+4	100239319	T	A	100458342	ADH1B	0	D	0.021
+4	100239319	T	C	100458342	ADH1B	0.15	T	0.0
+4	100239319	T	G	100458342	ADH1B	0	D	0.0
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpsift_dbnsfps.loc.sample	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,3 @@
+#key	build	description	path	annotations
+#GRCh37_dbNSFP2.4	GRCh37	GRCh37 dbNSFP2.4	/depot/snpeff/dbNSFP2.4.txt.gz	SIFT_pred,Uniprot_acc
+#GRCh38_dbNSFP3.1c	GRCh38	GRCh38 dbNSFP3.1c	/depot/snpeff/dbNSFP3.1c.txt.gz	SIFT_pred,Uniprot_acc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,7 @@
+<tables>
+    <table name="snpsift_dbnsfps" comment_char="#">
+        <columns>key, build, name, value, annotations</columns>
+        <file path="tool-data/snpsift_dbnsfps.loc" />
+    </table>
+</tables>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml	Tue Jun 07 10:22:35 2016 -0400
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<tool_dependency>
+    <package name="pysam" version="0.7.7">
+        <repository changeset_revision="ca10c522f37e" name="package_pysam_0_7_7" owner="iuc" toolshed="https://testtoolshed.g2.bx.psu.edu" />
+    </package>
+</tool_dependency>