Mercurial > repos > jjohnson > data_manager_snpeff
diff data_manager/data_manager_snpEff_download.py @ 5:78bcf4ac437c
Use tool_data_table with key and version columns added to allow for multiple versions in a .loc file
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 13 Jan 2015 12:54:20 -0600 |
parents | c6769a700e55 |
children |
line wrap: on
line diff
--- a/data_manager/data_manager_snpEff_download.py Thu Oct 23 05:43:46 2014 -0500
+++ b/data_manager/data_manager_snpEff_download.py Tue Jan 13 12:54:20 2015 -0600
@@ -9,6 +9,7 @@
 import shutil
 import optparse
 import urllib2
+import gzip
 from ftplib import FTP
 import tarfile
 
@@ -64,6 +65,49 @@
         return ','.join(descriptions)
     return organisms
 
+def getSnpeffVersion(jar_path):
+    snpeff_version = 'SnpEff ?.?'
+    (snpEff_dir,snpEff_jar) = os.path.split(jar_path)
+    stderr_path = 'snpeff.err'
+    stderr_fh = open(stderr_path,'w')
+    args = [ 'java','-jar', ]
+    args.append( snpEff_jar )
+    args.append( '-h' )
+    proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() )
+    return_code = proc.wait()
+    if return_code != 255:
+        sys.exit( return_code )
+    stderr_fh.close()
+    fh = open(stderr_path,'r')
+    for line in fh:
+        m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line)
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+            break
+    fh.close()
+    return snpeff_version
+
+# Starting with SnpEff 4.1 the .bin files contain the SnpEff version:
+# Example - the first 3 line of GRCh37.75/snpEffectPredictor.bin (uncompressed):
+"""
+SnpEff 4.1
+CHROMOSOME 2 1 0 179197 GL000219.1 false
+CHROMOSOME 3 1 0 81347269 HSCHR17_1 false
+"""
+def getSnpeffVersionFromFile(path):
+    snpeff_version = None
+    try:
+        fh = gzip.open(path, 'rb')
+        buf = fh.read(100)
+        lines = buf.splitlines()
+        m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip())
+        if m:
+            snpeff_version = m.groups()[0] + m.groups()[1]
+        fh.close()
+    except Exception, e:
+        stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) )
+    return snpeff_version
+
 """
 # Download human database 'hg19'
 java -jar snpEff.jar download -v hg19
@@ -74,7 +118,7 @@
 regulation_HeLa-S3.bin
 regulation_pattern = 'regulation_(.+).bin'
 """
-def download_database(data_manager_dict, target_directory, jar_path,config,genome_version,organism):
+def download_database(data_manager_dict, target_directory, jar_path, config, genome_version, organism):
     ## get data_dir from config
     ##---
     ## Databases are stored here
@@ -103,25 +147,28 @@
     # annotation files that are included in snpEff by a flag
     annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'}
     genome_path = os.path.join(data_dir,genome_version)
+    snpeff_version = getSnpeffVersion(jar_path)
+    key = snpeff_version + '_' + genome_version
     if os.path.isdir(genome_path):
         for root, dirs, files in os.walk(genome_path):
             for fname in files:
                 if fname.startswith('snpEffectPredictor'):
                     # if snpEffectPredictor.bin download succeeded
                     name = genome_version + (' : ' + organism if organism else '')
-                    data_table_entry = dict(value=genome_version, name=name, path=data_dir)
-                    _add_data_table_entry( data_manager_dict, 'snpeff4_genomedb', data_table_entry )
+                    # version = getSnpeffVersionFromFile(os.path.join(root,fname))
+                    data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir)
+                    _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry )
                 else:
                     m = re.match(regulation_pattern,fname)
                     if m:
                         name = m.groups()[0]
-                        data_table_entry = dict(genome=genome_version,value=name, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_regulationdb', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry )
                     elif fname in annotations_dict:
                         value = annotations_dict[fname]
                         name = value.lstrip('-')
-                        data_table_entry = dict(genome=genome_version,value=value, name=name)
-                        _add_data_table_entry( data_manager_dict, 'snpeff4_annotations', data_table_entry )
+                        data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name)
+                        _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry )
     return data_manager_dict
 
 def _add_data_table_entry( data_manager_dict, data_table, data_table_entry ):