changeset 5:78bcf4ac437c

Use tool_data_table with key and version columns added to allow for multiple versions in a .loc file
author Jim Johnson <jj@umn.edu>
date Tue, 13 Jan 2015 12:54:20 -0600 (2015-01-13)
parents 6a378d0f4856
children a329eda0cdff
files data_manager/data_manager_snpEff_databases.py data_manager/data_manager_snpEff_download.py data_manager/data_manager_snpEff_download.xml data_manager_conf.xml repository_dependencies.xml tool-data/snpeff4_annotations.loc.sample tool-data/snpeff4_databases.loc.sample tool-data/snpeff4_genomedb.loc.sample tool-data/snpeff4_regulationdb.loc.sample tool-data/snpeffv_annotations.loc.sample tool-data/snpeffv_databases.loc.sample tool-data/snpeffv_genomedb.loc.sample tool-data/snpeffv_regulationdb.loc.sample tool_data_table_conf.xml.sample
diffstat 14 files changed, 134 insertions(+), 53 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/data_manager_snpEff_databases.py	Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager/data_manager_snpEff_databases.py	Tue Jan 13 12:54:20 2015 -0600
@@ -18,6 +18,28 @@
     sys.stderr.write(msg)
     sys.exit(1)
 
+def getSnpeffVersion(jar_path):
+    snpeff_version = 'SnpEff ?.?'
+    (snpEff_dir,snpEff_jar) = os.path.split(jar_path)
+    stderr_path = 'snpeff.err'
+    stderr_fh = open(stderr_path,'w')
+    args = [ 'java','-jar', ]
+    args.append( snpEff_jar )
+    args.append( '-h' )
+    proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() )
+    return_code = proc.wait()
+    if return_code != 255:
+        sys.exit( return_code )
+    stderr_fh.close()
+    fh = open(stderr_path,'r')
+    for line in fh:
+        m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line)
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+            break
+    fh.close()
+    return snpeff_version
+
 def fetch_databases(data_manager_dict, target_directory, jar_path):
     (snpEff_dir,snpEff_jar) = os.path.split(jar_path)
     if not os.path.exists(target_directory):
@@ -35,9 +57,10 @@
     if return_code:
         sys.exit( return_code )
     databases_output.close()
+    snpeff_version = getSnpeffVersion(jar_path)
     try:
         data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
-        data_manager_dict['data_tables']['snpeff4_databases'] = data_manager_dict['data_tables'].get( 'snpeff4_databases', [] )
+        data_manager_dict['data_tables']['snpeffv_databases'] = data_manager_dict['data_tables'].get( 'snpeffv_databases', [] )
         data_table_entries = []
         fh = open(databases_path,'r')
         for i,line in enumerate(fh):
@@ -50,8 +73,9 @@
                 if genome_version == '30c2c903' or fields[1].strip() == 'TestCase' or fields[1].strip().startswith('Test_'):
                     continue
                 description = fields[1].strip() + ' : ' + genome_version
-                data_table_entries.append(dict(value=genome_version, name=description))
-        data_manager_dict['data_tables']['snpeff4_databases'] = data_table_entries
+                key = snpeff_version + '_' + genome_version
+                data_table_entries.append(dict(key=key, version=snpeff_version, value=genome_version, name=description))
+        data_manager_dict['data_tables']['snpeffv_databases'] = data_table_entries
     except Exception, e:
         stop_err( 'Error parsing %s %s\n' % (config,str( e )) )
     else:
--- a/data_manager/data_manager_snpEff_download.py	Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager/data_manager_snpEff_download.py	Tue Jan 13 12:54:20 2015 -0600
@@ -9,6 +9,7 @@
 import shutil
 import optparse
 import urllib2
+import gzip
 from ftplib import FTP
 import tarfile
 
@@ -64,6 +65,49 @@
         return ','.join(descriptions)
     return organisms    
 
+def getSnpeffVersion(jar_path):
+    snpeff_version = 'SnpEff ?.?'
+    (snpEff_dir,snpEff_jar) = os.path.split(jar_path)
+    stderr_path = 'snpeff.err'
+    stderr_fh = open(stderr_path,'w')
+    args = [ 'java','-jar', ]
+    args.append( snpEff_jar )
+    args.append( '-h' )
+    proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() )
+    return_code = proc.wait()
+    if return_code != 255:
+        sys.exit( return_code )
+    stderr_fh.close()
+    fh = open(stderr_path,'r')
+    for line in fh:
+        m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line)
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+            break
+    fh.close()
+    return snpeff_version
+
+# Starting with SnpEff 4.1 the .bin files contain the SnpEff version:
+# Example - the first 3 lines of GRCh37.75/snpEffectPredictor.bin (uncompressed):
+"""
+SnpEff  4.1
+CHROMOSOME      2       1       0       179197  GL000219.1      false
+CHROMOSOME      3       1       0       81347269        HSCHR17_1       false
+"""
+def getSnpeffVersionFromFile(path):
+    snpeff_version = None
+    try:
+        fh = gzip.open(path, 'rb')
+        buf = fh.read(100)
+        lines = buf.splitlines()
+        m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip())
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+        fh.close()
+    except Exception, e:
+        stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) )
+    return snpeff_version   
+
 """
 # Download human database 'hg19'
 java -jar snpEff.jar download -v hg19
@@ -74,7 +118,7 @@
 regulation_HeLa-S3.bin
 regulation_pattern = 'regulation_(.+).bin'
 """
-def download_database(data_manager_dict, target_directory, jar_path,config,genome_version,organism):
+def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism):
     ## get data_dir from config 
     ##---
     ## Databases are stored here
@@ -103,25 +147,28 @@
     #  annotation files that are included in snpEff by a flag
     annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'}
     genome_path = os.path.join(data_dir,genome_version)
+    snpeff_version = getSnpeffVersion(jar_path)
+    key  = snpeff_version + '_' + genome_version 
     if os.path.isdir(genome_path):
         for root, dirs, files in os.walk(genome_path):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # if snpEffectPredictor.bin download succeeded
                     name = genome_version + (' : ' + organism if organism else '') 
-                    data_table_entry = dict(value=genome_version, name=name, path=data_dir)
-                    _add_data_table_entry( data_manager_dict, 'snpeff4_genomedb', data_table_entry )
+                    # version = getSnpeffVersionFromFile(os.path.join(root,fname))
+                    data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir)
+                    _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry )
                 else:
                     m = re.match(regulation_pattern,fname)
                     if m:
                         name = m.groups()[0]
-                        data_table_entry = dict(genome=genome_version,value=name, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_regulationdb', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry )
                     elif fname in annotations_dict:
                         value = annotations_dict[fname]
                         name = value.lstrip('-')
-                        data_table_entry = dict(genome=genome_version,value=value, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_annotations', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry )
     return data_manager_dict
 
 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):
--- a/data_manager/data_manager_snpEff_download.xml	Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager/data_manager_snpEff_download.xml	Tue Jan 13 12:54:20 2015 -0600
@@ -28,15 +28,15 @@
                 <assert_contents>
                     <!-- Check that a genome was added -->
                     <has_text text="GRCh38.76" />
-                    <has_text text="snpeff4_regulationdb" />
-                    <has_text text="snpeff4_annotations" />
+                    <has_text text="snpeffv_regulationdb" />
+                    <has_text text="snpeffv_annotations" />
                 </assert_contents>
             </output>
         </test>
     </tests>
     <help>
 
-This tool downloads a SnpEff database and populates data tables: snpeff4_genomedb, snpeff4_regulationdb, and snpeff4_annotations.
+This tool downloads a SnpEff database and populates data tables: snpeffv_genomedb, snpeffv_regulationdb, and snpeffv_annotations.
 
 To see the list of available SnpEff genomes run the "SnpEff Databases" data manager which records the available genome databases in data table: snpeff4_databases 
 
--- a/data_manager_conf.xml	Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager_conf.xml	Tue Jan 13 12:54:20 2015 -0600
@@ -1,36 +1,44 @@
 <?xml version="1.0"?>
 <data_managers>
-  <data_manager tool_file="data_manager/data_manager_snpEff_databases.xml" id="data_manager_snpeff4_databases" >
-    <data_table name="snpeff4_databases">  <!-- Defines a Data Table to be modified. -->
+  <data_manager tool_file="data_manager/data_manager_snpEff_databases.xml" id="data_manager_snpeff_databases" >
+    <data_table name="snpeffv_databases">  <!-- Defines a Data Table to be modified. -->
       <output> <!-- Handle the output of the Data Manager Tool -->
+        <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool -->
       </output>
     </data_table>
   </data_manager>
-  <data_manager tool_file="data_manager/data_manager_snpEff_download.xml" id="data_manager_snpeff4_download" >
-    <data_table name="snpeff4_genomedb">  <!-- Defines a Data Table to be modified. -->
+  <data_manager tool_file="data_manager/data_manager_snpEff_download.xml" id="data_manager_snpeff_download" >
+    <data_table name="snpeffv_genomedb">  <!-- Defines a Data Table to be modified. -->
       <output> <!-- Handle the output of the Data Manager Tool -->
+        <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="path" output_ref="out_file" >
           <move type="directory" relativize_symlinks="True">
-            <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpEff/v4/data</target>
+            <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpEff/v4_0/data</target>
           </move>
-          <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpEff/v4/data</value_translation>
+          <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpEff/v4_0/data</value_translation>
           <value_translation type="function">abspath</value_translation>
         </column>
       </output>
     </data_table>
-    <data_table name="snpeff4_regulationdb">  <!-- Defines a Data Table to be modified. -->
+    <data_table name="snpeffv_regulationdb">  <!-- Defines a Data Table to be modified. -->
       <output> <!-- Handle the output of the Data Manager Tool -->
+        <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="genome" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
       </output>
     </data_table>
-    <data_table name="snpeff4_annotations">  <!-- Defines a Data Table to be modified. -->
+    <data_table name="snpeffv_annotations">  <!-- Defines a Data Table to be modified. -->
       <output> <!-- Handle the output of the Data Manager Tool -->
+        <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+        <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="genome" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
         <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
--- a/repository_dependencies.xml	Thu Oct 23 05:43:46 2014 -0500
+++ b/repository_dependencies.xml	Tue Jan 13 12:54:20 2015 -0600
@@ -1,4 +1,4 @@
 <?xml version="1.0"?>
 <repositories description="This requires the SnpEff datatype definitions.">
-    <repository name="snpeff_datatypes" owner="jjohnson" changeset_revision="b33911fdbac4" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
+    <repository name="snpeff_datatypes" owner="jjohnson" changeset_revision="9efd0d32fe8a" toolshed="http://testtoolshed.g2.bx.psu.edu/" />
 </repositories>
--- a/tool-data/snpeff4_annotations.loc.sample	Thu Oct 23 05:43:46 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-## Regulation Databases for SnpEff 
-## These are from the list on: http://snpeff.sourceforge.net/download.html
-#genome	annotation_name description
-#GRCh37.71	nextprot	nextprot
-#GRCh37.71	motif	motif
--- a/tool-data/snpeff4_databases.loc.sample	Thu Oct 23 05:43:46 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-## Available Databases for SnpEff 
-## These are from the list on: http://snpeff.sourceforge.net/download.html
-## the Description field in this sample is "Genome : Version" 
-#Version	Description
-#GRCh37.68	Homo sapiens : GRCh37.68
--- a/tool-data/snpeff4_genomedb.loc.sample	Thu Oct 23 05:43:46 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-## Downloaded Databases for SnpEff 
-## These are from the list on: http://snpeff.sourceforge.net/download.html
-## the Description field in this sample is "Genome : Version" 
-#Version        Description	data_dir path
-#GRCh37.68      Homo sapiens : GRCh37.68	/home/galaxy/snpEff/data
--- a/tool-data/snpeff4_regulationdb.loc.sample	Thu Oct 23 05:43:46 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,4 +0,0 @@
-## Regulation Databases for SnpEff 
-## These are from the list on: http://snpeff.sourceforge.net/download.html
-#genome	regulation_name description
-#GRCh37.70	CD4	CD4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpeffv_annotations.loc.sample	Tue Jan 13 12:54:20 2015 -0600
@@ -0,0 +1,5 @@
+## Annotation Databases for SnpEff 
+## These are from the list on: http://snpeff.sourceforge.net/download.html
+#key	snpeff_version	genome	annotation_name description
+#SnpEff4.0_GRCh37.75	SnpEff4.0	GRCh37.75	nextprot	nextprot
+#SnpEff4.1_GRCh38.76	SnpEff4.1	GRCh38.76	motif	motif
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpeffv_databases.loc.sample	Tue Jan 13 12:54:20 2015 -0600
@@ -0,0 +1,5 @@
+## Available Databases for SnpEff 
+## These are from the list on: http://snpeff.sourceforge.net/download.html
+## the Description field in this sample is "Genome : Version" 
+#key	snpeff_version	Version	Description
+#SnpEff4.0_GRCh37.75	SnpEff4.0	GRCh37.75	Homo sapiens : GRCh37.75
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpeffv_genomedb.loc.sample	Tue Jan 13 12:54:20 2015 -0600
@@ -0,0 +1,6 @@
+## Downloaded Databases for SnpEff 
+## These are from the list on: http://snpeff.sourceforge.net/download.html
+## the Description field in this sample is "Genome : Version" 
+#Key	snpeff_version	Version	Description	data_dir path
+#SnpEff4.0_GRCh37.74	SnpEff4.0	GRCh37.74	Homo sapiens : GRCh37.74	/home/galaxy/snpEff/v4_0/data
+#SnpEff4.1_GRCh38.76	SnpEff4.1	GRCh38.76	Homo sapiens : GRCh38.76	/home/galaxy/snpEff/v4_1/data
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/snpeffv_regulationdb.loc.sample	Tue Jan 13 12:54:20 2015 -0600
@@ -0,0 +1,5 @@
+## Regulation Databases for SnpEff 
+## These are from the list on: http://snpeff.sourceforge.net/download.html
+#Key	snpeff_version	genome	regulation_name description
+#SnpEff4.0_GRCh37.74	SnpEff4.0	GRCh37.74	CD4	CD4
+#SnpEff4.1_GRCh38.76	SnpEff4.1	GRCh38.76	CD4	CD4
--- a/tool_data_table_conf.xml.sample	Thu Oct 23 05:43:46 2014 -0500
+++ b/tool_data_table_conf.xml.sample	Tue Jan 13 12:54:20 2015 -0600
@@ -1,19 +1,19 @@
 <tables>
-    <table name="snpeff4_databases" comment_char="#">
-        <columns>value, name</columns>
-        <file path="tool-data/snpeff4_databases.loc" />
+    <table name="snpeffv_genomedb" comment_char="#">
+        <columns>key, version, value, name, path</columns>
+        <file path="tool-data/snpeffv_genomedb.loc" />
     </table>
-    <table name="snpeff4_genomedb" comment_char="#">
-        <columns>value, name, path</columns>
-        <file path="tool-data/snpeff4_genomedb.loc" />
+    <table name="snpeffv_regulationdb" comment_char="#">
+        <columns>key, version, genome, value, name</columns>
+        <file path="tool-data/snpeffv_regulationdb.loc" />
     </table>
-    <table name="snpeff4_regulationdb" comment_char="#">
-        <columns>genome, value, name</columns>
-        <file path="tool-data/snpeff4_regulationdb.loc" />
+    <table name="snpeffv_annotations" comment_char="#">
+        <columns>key, version, genome, value, name</columns>
+        <file path="tool-data/snpeffv_annotations.loc" />
     </table>
-    <table name="snpeff4_annotations" comment_char="#">
-        <columns>genome, value, name</columns>
-        <file path="tool-data/snpeff4_annotations.loc" />
+    <table name="snpeffv_databases" comment_char="#">
+        <columns>key, version, value, name</columns>
+        <file path="tool-data/snpeffv_databases.loc" />
     </table>
 </tables>