diff data_manager/data_manager_snpEff_download.py @ 5:78bcf4ac437c

Use tool_data_table with key and version columns added to allow for multiple versions in a .loc file
author Jim Johnson <jj@umn.edu>
date Tue, 13 Jan 2015 12:54:20 -0600
parents c6769a700e55
children
line wrap: on
line diff
--- a/data_manager/data_manager_snpEff_download.py	Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager/data_manager_snpEff_download.py	Tue Jan 13 12:54:20 2015 -0600
@@ -9,6 +9,7 @@
 import shutil
 import optparse
 import urllib2
+import gzip
 from ftplib import FTP
 import tarfile
 
@@ -64,6 +65,49 @@
         return ','.join(descriptions)
     return organisms    
 
+def getSnpeffVersion(jar_path):
+    snpeff_version = 'SnpEff ?.?'
+    (snpEff_dir,snpEff_jar) = os.path.split(jar_path)
+    stderr_path = 'snpeff.err'
+    stderr_fh = open(stderr_path,'w')
+    args = [ 'java','-jar', ]
+    args.append( snpEff_jar )
+    args.append( '-h' )
+    proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() )
+    return_code = proc.wait()
+    if return_code != 255:
+        sys.exit( return_code )
+    stderr_fh.close()
+    fh = open(stderr_path,'r')
+    for line in fh:
+        m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line)
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+            break
+    fh.close()
+    return snpeff_version
+
+# Starting with SnpEff 4.1 the .bin files contain the SnpEff version:
+# Example - the first 3 line of GRCh37.75/snpEffectPredictor.bin (uncompressed):
+"""
+SnpEff  4.1
+CHROMOSOME      2       1       0       179197  GL000219.1      false
+CHROMOSOME      3       1       0       81347269        HSCHR17_1       false
+"""
+def getSnpeffVersionFromFile(path):
+    snpeff_version = None
+    try:
+        fh = gzip.open(path, 'rb')
+        buf = fh.read(100)
+        lines = buf.splitlines()
+        m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip())
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+        fh.close()
+    except Exception, e:
+        stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) )
+    return snpeff_version   
+
 """
 # Download human database 'hg19'
 java -jar snpEff.jar download -v hg19
@@ -74,7 +118,7 @@
 regulation_HeLa-S3.bin
 regulation_pattern = 'regulation_(.+).bin'
 """
-def download_database(data_manager_dict, target_directory, jar_path,config,genome_version,organism):
+def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism):
     ## get data_dir from config 
     ##---
     ## Databases are stored here
@@ -103,25 +147,28 @@
     #  annotation files that are included in snpEff by a flag
     annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'}
     genome_path = os.path.join(data_dir,genome_version)
+    snpeff_version = getSnpeffVersion(jar_path)
+    key  = snpeff_version + '_' + genome_version 
     if os.path.isdir(genome_path):
         for root, dirs, files in os.walk(genome_path):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # if snpEffectPredictor.bin download succeeded
                     name = genome_version + (' : ' + organism if organism else '') 
-                    data_table_entry = dict(value=genome_version, name=name, path=data_dir)
-                    _add_data_table_entry( data_manager_dict, 'snpeff4_genomedb', data_table_entry )
+                    # version = getSnpeffVersionFromFile(os.path.join(root,fname))
+                    data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir)
+                    _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry )
                 else:
                     m = re.match(regulation_pattern,fname)
                     if m:
                         name = m.groups()[0]
-                        data_table_entry = dict(genome=genome_version,value=name, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_regulationdb', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry )
                     elif fname in annotations_dict:
                         value = annotations_dict[fname]
                         name = value.lstrip('-')
-                        data_table_entry = dict(genome=genome_version,value=value, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_annotations', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry )
     return data_manager_dict
 
 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):