Mercurial > repos > jjohnson > data_manager_snpeff
changeset 5:78bcf4ac437c
Use tool_data_table with key and version columns added to allow for multiple versions in a .loc file
author | Jim Johnson <jj@umn.edu> |
---|---|
date | Tue, 13 Jan 2015 12:54:20 -0600 (2015-01-13) |
parents | 6a378d0f4856 |
children | a329eda0cdff |
files | data_manager/data_manager_snpEff_databases.py data_manager/data_manager_snpEff_download.py data_manager/data_manager_snpEff_download.xml data_manager_conf.xml repository_dependencies.xml tool-data/snpeff4_annotations.loc.sample tool-data/snpeff4_databases.loc.sample tool-data/snpeff4_genomedb.loc.sample tool-data/snpeff4_regulationdb.loc.sample tool-data/snpeffv_annotations.loc.sample tool-data/snpeffv_databases.loc.sample tool-data/snpeffv_genomedb.loc.sample tool-data/snpeffv_regulationdb.loc.sample tool_data_table_conf.xml.sample |
diffstat | 14 files changed, 134 insertions(+), 53 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_snpEff_databases.py Thu Oct 23 05:43:46 2014 -0500 +++ b/data_manager/data_manager_snpEff_databases.py Tue Jan 13 12:54:20 2015 -0600 @@ -18,6 +18,28 @@ sys.stderr.write(msg) sys.exit(1) +def getSnpeffVersion(jar_path): + snpeff_version = 'SnpEff ?.?' + (snpEff_dir,snpEff_jar) = os.path.split(jar_path) + stderr_path = 'snpeff.err' + stderr_fh = open(stderr_path,'w') + args = [ 'java','-jar', ] + args.append( snpEff_jar ) + args.append( '-h' ) + proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) + return_code = proc.wait() + if return_code != 255: + sys.exit( return_code ) + stderr_fh.close() + fh = open(stderr_path,'r') + for line in fh: + m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line) + if m: + snpeff_version = m.groups()[0] + m.groups()[1] + break + fh.close() + return snpeff_version + def fetch_databases(data_manager_dict, target_directory, jar_path): (snpEff_dir,snpEff_jar) = os.path.split(jar_path) if not os.path.exists(target_directory): @@ -35,9 +57,10 @@ if return_code: sys.exit( return_code ) databases_output.close() + snpeff_version = getSnpeffVersion(jar_path) try: data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) - data_manager_dict['data_tables']['snpeff4_databases'] = data_manager_dict['data_tables'].get( 'snpeff4_databases', [] ) + data_manager_dict['data_tables']['snpeffv_databases'] = data_manager_dict['data_tables'].get( 'snpeffv_databases', [] ) data_table_entries = [] fh = open(databases_path,'r') for i,line in enumerate(fh): @@ -50,8 +73,9 @@ if genome_version == '30c2c903' or fields[1].strip() == 'TestCase' or fields[1].strip().startswith('Test_'): continue description = fields[1].strip() + ' : ' + genome_version - data_table_entries.append(dict(value=genome_version, name=description)) - data_manager_dict['data_tables']['snpeff4_databases'] = data_table_entries + key = snpeff_version + '_' + genome_version + 
data_table_entries.append(dict(key=key, version=snpeff_version, value=genome_version, name=description)) + data_manager_dict['data_tables']['snpeffv_databases'] = data_table_entries except Exception, e: stop_err( 'Error parsing %s %s\n' % (config,str( e )) ) else:
--- a/data_manager/data_manager_snpEff_download.py Thu Oct 23 05:43:46 2014 -0500 +++ b/data_manager/data_manager_snpEff_download.py Tue Jan 13 12:54:20 2015 -0600 @@ -9,6 +9,7 @@ import shutil import optparse import urllib2 +import gzip from ftplib import FTP import tarfile @@ -64,6 +65,49 @@ return ','.join(descriptions) return organisms +def getSnpeffVersion(jar_path): + snpeff_version = 'SnpEff ?.?' + (snpEff_dir,snpEff_jar) = os.path.split(jar_path) + stderr_path = 'snpeff.err' + stderr_fh = open(stderr_path,'w') + args = [ 'java','-jar', ] + args.append( snpEff_jar ) + args.append( '-h' ) + proc = subprocess.Popen( args=args, shell=False, cwd=snpEff_dir, stderr=stderr_fh.fileno() ) + return_code = proc.wait() + if return_code != 255: + sys.exit( return_code ) + stderr_fh.close() + fh = open(stderr_path,'r') + for line in fh: + m = re.match('^[Ss]npEff version (SnpEff)\s*(\d+\.\d+).*$',line) + if m: + snpeff_version = m.groups()[0] + m.groups()[1] + break + fh.close() + return snpeff_version + +# Starting with SnpEff 4.1 the .bin files contain the SnpEff version: +# Example - the first 3 line of GRCh37.75/snpEffectPredictor.bin (uncompressed): +""" +SnpEff 4.1 +CHROMOSOME 2 1 0 179197 GL000219.1 false +CHROMOSOME 3 1 0 81347269 HSCHR17_1 false +""" +def getSnpeffVersionFromFile(path): + snpeff_version = None + try: + fh = gzip.open(path, 'rb') + buf = fh.read(100) + lines = buf.splitlines() + m = re.match('^(SnpEff)\s+(\d+\.\d+).*$',lines[0].strip()) + if m: + snpeff_version = m.groups()[0] + m.groups()[1] + fh.close() + except Exception, e: + stop_err( 'Error parsing SnpEff version from: %s %s\n' % (path,str( e )) ) + return snpeff_version + """ # Download human database 'hg19' java -jar snpEff.jar download -v hg19 @@ -74,7 +118,7 @@ regulation_HeLa-S3.bin regulation_pattern = 'regulation_(.+).bin' """ -def download_database(data_manager_dict, target_directory, jar_path,config,genome_version,organism): +def download_database(data_manager_dict, 
target_directory, jar_path, config, genome_version, organism): ## get data_dir from config ##--- ## Databases are stored here @@ -103,25 +147,28 @@ # annotation files that are included in snpEff by a flag annotations_dict = {'nextProt.bin' : '-nextprot','motif.bin': '-motif'} genome_path = os.path.join(data_dir,genome_version) + snpeff_version = getSnpeffVersion(jar_path) + key = snpeff_version + '_' + genome_version if os.path.isdir(genome_path): for root, dirs, files in os.walk(genome_path): for fname in files: if fname.startswith('snpEffectPredictor'): # if snpEffectPredictor.bin download succeeded name = genome_version + (' : ' + organism if organism else '') - data_table_entry = dict(value=genome_version, name=name, path=data_dir) - _add_data_table_entry( data_manager_dict, 'snpeff4_genomedb', data_table_entry ) + # version = getSnpeffVersionFromFile(os.path.join(root,fname)) + data_table_entry = dict(key=key,version=snpeff_version,value=genome_version, name=name, path=data_dir) + _add_data_table_entry( data_manager_dict, 'snpeffv_genomedb', data_table_entry ) else: m = re.match(regulation_pattern,fname) if m: name = m.groups()[0] - data_table_entry = dict(genome=genome_version,value=name, name=name) - _add_data_table_entry( data_manager_dict, 'snpeff4_regulationdb', data_table_entry ) + data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=name, name=name) + _add_data_table_entry( data_manager_dict, 'snpeffv_regulationdb', data_table_entry ) elif fname in annotations_dict: value = annotations_dict[fname] name = value.lstrip('-') - data_table_entry = dict(genome=genome_version,value=value, name=name) - _add_data_table_entry( data_manager_dict, 'snpeff4_annotations', data_table_entry ) + data_table_entry = dict(key=key,version=snpeff_version,genome=genome_version,value=value, name=name) + _add_data_table_entry( data_manager_dict, 'snpeffv_annotations', data_table_entry ) return data_manager_dict def _add_data_table_entry( 
data_manager_dict, data_table, data_table_entry ):
--- a/data_manager/data_manager_snpEff_download.xml Thu Oct 23 05:43:46 2014 -0500 +++ b/data_manager/data_manager_snpEff_download.xml Tue Jan 13 12:54:20 2015 -0600 @@ -28,15 +28,15 @@ <assert_contents> <!-- Check that a genome was added --> <has_text text="GRCh38.76" /> - <has_text text="snpeff4_regulationdb" /> - <has_text text="snpeff4_annotations" /> + <has_text text="snpeffv_regulationdb" /> + <has_text text="snpeffv_annotations" /> </assert_contents> </output> </test> </tests> <help> -This tool downloads a SnpEff database and populates data tables: snpeff4_genomedb, snpeff4_regulationdb, and snpeff4_annotations. +This tool downloads a SnpEff database and populates data tables: snpeffv_genomedb, snpeffv_regulationdb, and snpeffv_annotations. To see the list of available SnpEff genomes run the "SnpEff Databases" data manager which records the available genome databases in data table: snpeff4_databases
--- a/data_manager_conf.xml Thu Oct 23 05:43:46 2014 -0500 +++ b/data_manager_conf.xml Tue Jan 13 12:54:20 2015 -0600 @@ -1,36 +1,44 @@ <?xml version="1.0"?> <data_managers> - <data_manager tool_file="data_manager/data_manager_snpEff_databases.xml" id="data_manager_snpeff4_databases" > - <data_table name="snpeff4_databases"> <!-- Defines a Data Table to be modified. --> + <data_manager tool_file="data_manager/data_manager_snpEff_databases.xml" id="data_manager_snpeff_databases" > + <data_table name="snpeffv_databases"> <!-- Defines a Data Table to be modified. --> <output> <!-- Handle the output of the Data Manager Tool --> + <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> </output> </data_table> </data_manager> - <data_manager tool_file="data_manager/data_manager_snpEff_download.xml" id="data_manager_snpeff4_download" > - <data_table name="snpeff4_genomedb"> <!-- Defines a Data Table to be modified. --> + <data_manager tool_file="data_manager/data_manager_snpEff_download.xml" id="data_manager_snpeff_download" > + <data_table name="snpeffv_genomedb"> <!-- Defines a Data Table to be modified. 
--> <output> <!-- Handle the output of the Data Manager Tool --> + <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="path" output_ref="out_file" > <move type="directory" relativize_symlinks="True"> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpEff/v4/data</target> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">snpEff/v4_0/data</target> </move> - <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpEff/v4/data</value_translation> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/snpEff/v4_0/data</value_translation> <value_translation type="function">abspath</value_translation> </column> </output> </data_table> - <data_table name="snpeff4_regulationdb"> <!-- Defines a Data Table to be modified. --> + <data_table name="snpeffv_regulationdb"> <!-- Defines a Data Table to be modified. --> <output> <!-- Handle the output of the Data Manager Tool --> + <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="genome" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> </output> </data_table> - <data_table name="snpeff4_annotations"> <!-- Defines a Data Table to be modified. --> + <data_table name="snpeffv_annotations"> <!-- Defines a Data Table to be modified. 
--> <output> <!-- Handle the output of the Data Manager Tool --> + <column name="key" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="genome" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool -->
--- a/repository_dependencies.xml Thu Oct 23 05:43:46 2014 -0500 +++ b/repository_dependencies.xml Tue Jan 13 12:54:20 2015 -0600 @@ -1,4 +1,4 @@ <?xml version="1.0"?> <repositories description="This requires the SnpEff datatype definitions."> - <repository name="snpeff_datatypes" owner="jjohnson" changeset_revision="b33911fdbac4" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> + <repository name="snpeff_datatypes" owner="jjohnson" changeset_revision="9efd0d32fe8a" toolshed="http://testtoolshed.g2.bx.psu.edu/" /> </repositories>
--- a/tool-data/snpeff4_annotations.loc.sample Thu Oct 23 05:43:46 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -## Regulation Databases for SnpEff -## These are from the list on: http://snpeff.sourceforge.net/download.html -#genome annotation_name description -#GRCh37.71 nextprot nextprot -#GRCh37.71 motif motif
--- a/tool-data/snpeff4_databases.loc.sample Thu Oct 23 05:43:46 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -## Available Databases for SnpEff -## These are from the list on: http://snpeff.sourceforge.net/download.html -## the Description field in this sample is "Genome : Version" -#Version Description -#GRCh37.68 Homo sapiens : GRCh37.68
--- a/tool-data/snpeff4_genomedb.loc.sample Thu Oct 23 05:43:46 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,5 +0,0 @@ -## Downloaded Databases for SnpEff -## These are from the list on: http://snpeff.sourceforge.net/download.html -## the Description field in this sample is "Genome : Version" -#Version Description data_dir path -#GRCh37.68 Homo sapiens : GRCh37.68 /home/galaxy/snpEff/data
--- a/tool-data/snpeff4_regulationdb.loc.sample Thu Oct 23 05:43:46 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -## Regulation Databases for SnpEff -## These are from the list on: http://snpeff.sourceforge.net/download.html -#genome regulation_name description -#GRCh37.70 CD4 CD4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/snpeffv_annotations.loc.sample Tue Jan 13 12:54:20 2015 -0600 @@ -0,0 +1,5 @@ +## Annotation Databases for SnpEff +## These are from the list on: http://snpeff.sourceforge.net/download.html +#key snpeff_version genome annotation_name description +#SnpEff4.0_GRCh37.75 SnpEff4.0 GRCh37.75 nextprot nextprot +#SnpEff4.1_GRCh38.76 SnpEff4.1 GRCh38.76 motif motif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/snpeffv_databases.loc.sample Tue Jan 13 12:54:20 2015 -0600 @@ -0,0 +1,5 @@ +## Available Databases for SnpEff +## These are from the list on: http://snpeff.sourceforge.net/download.html +## the Description field in this sample is "Genome : Version" +#key snpeff_version Version Description +#SnpEff4.0_GRCh37.75 SnpEff4.0 GRCh37.75 Homo sapiens : GRCh37.75
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/snpeffv_genomedb.loc.sample Tue Jan 13 12:54:20 2015 -0600 @@ -0,0 +1,6 @@ +## Downloaded Databases for SnpEff +## These are from the list on: http://snpeff.sourceforge.net/download.html +## the Description field in this sample is "Genome : Version" +#Key snpeff_version Version Description data_dir path +#SnpEff4.0_GRCh37.74 SnpEff4.0 GRCh37.74 Homo sapiens : GRCh37.74 /home/galaxy/snpEff/v4_0/data +#SnpEff4.1_GRCh38.76 SnpEff4.1 GRCh38.76 Homo sapiens : GRCh38.76 /home/galaxy/snpEff/v4_1/data
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/snpeffv_regulationdb.loc.sample Tue Jan 13 12:54:20 2015 -0600 @@ -0,0 +1,5 @@ +## Regulation Databases for SnpEff +## These are from the list on: http://snpeff.sourceforge.net/download.html +#Key snpeff_version genome regulation_name description +#SnpEff4.0_GRCh37.74 SnpEff4.0 GRCh37.74 CD4 CD4 +#SnpEff4.1_GRCh38.76 SnpEff4.1 GRCh38.76 CD4 CD4
--- a/tool_data_table_conf.xml.sample Thu Oct 23 05:43:46 2014 -0500 +++ b/tool_data_table_conf.xml.sample Tue Jan 13 12:54:20 2015 -0600 @@ -1,19 +1,19 @@ <tables> - <table name="snpeff4_databases" comment_char="#"> - <columns>value, name</columns> - <file path="tool-data/snpeff4_databases.loc" /> + <table name="snpeffv_genomedb" comment_char="#"> + <columns>key, version, value, name, path</columns> + <file path="tool-data/snpeffv_genomedb.loc" /> </table> - <table name="snpeff4_genomedb" comment_char="#"> - <columns>value, name, path</columns> - <file path="tool-data/snpeff4_genomedb.loc" /> + <table name="snpeffv_regulationdb" comment_char="#"> + <columns>key, version, genome, value, name</columns> + <file path="tool-data/snpeffv_regulationdb.loc" /> </table> - <table name="snpeff4_regulationdb" comment_char="#"> - <columns>genome, value, name</columns> - <file path="tool-data/snpeff4_regulationdb.loc" /> + <table name="snpeffv_annotations" comment_char="#"> + <columns>key, version, genome, value, name</columns> + <file path="tool-data/snpeffv_annotations.loc" /> </table> - <table name="snpeff4_annotations" comment_char="#"> - <columns>genome, value, name</columns> - <file path="tool-data/snpeff4_annotations.loc" /> + <table name="snpeffv_databases" comment_char="#"> + <columns>key, version, value, name</columns> + <file path="tool-data/snpeffv_databases.loc" /> </table> </tables>