Mercurial > repos > sh477 > data_manager_vep_cache_downloader
changeset 7:7890790d2afd draft
Fully working now and improved several things
author | sh477 |
---|---|
date | Tue, 01 Mar 2022 18:12:26 +0000 |
parents | 3bd006fa2be2 |
children | 3f3246b8972e |
files | data_manager/data_manager_vep_cache_download.py data_manager/data_manager_vep_cache_download.xml data_manager_conf.xml test-data/from_test-meta.data_manager.json test-data/vep_versioned_annotation_cache.loc test-data/vep_versioned_caches.loc tool-data/vep_versioned_annotation_cache.loc.sample tool-data/vep_versioned_caches.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 10 files changed, 38 insertions(+), 38 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_vep_cache_download.py Mon Feb 28 14:42:50 2022 +0000 +++ b/data_manager/data_manager_vep_cache_download.py Tue Mar 01 18:12:26 2022 +0000 @@ -18,9 +18,11 @@ # Process parameters for metadata and file download url = params['param_dict']['url'].rstrip("/") + "/" + params['param_dict']['file_name'].lstrip("/") - m = re.search(r"_([^_]*?)_vep_(\d+?)_", params['param_dict']['file_name']) - version = str(m.group(2)) - cache_type = m.group(1) if m.group(1) == "merged" or m.group(1) == "refseq" else "default" + m = re.search(r"(.*?)(merged|refseq)?_vep_(\d+?)_", params['param_dict']['file_name']) + version = str(m.group(3)) + cache_type = m.group(2) if m.group(2) else "default" + species = m.group(1).rstrip("_") + display_name = f"{species.capitalize().replace('_', ' ')} {params['param_dict']['dbkey']} (V{version}{'' if cache_type == 'default' else ', ' + cache_type.capitalize()})" # Download and extract given cache archive, remove archive afterwards final_file, headers = urlretrieve(url, os.path.join(target_directory, params['param_dict']['file_name'])) @@ -32,20 +34,19 @@ # Construct metadata for the new data table entry data_manager_dict = { 'data_tables': { - 'vep_versioned_caches': [ + 'vep_versioned_annotation_cache': [ { 'value': params['param_dict']['file_name'].strip(".tar.gz"), 'dbkey': params['param_dict']['dbkey'], 'version': version, 'cachetype': cache_type, - 'name': params['param_dict']['display_name'], + 'name': display_name, + 'species': species, 'path': './%s' % params['param_dict']['file_name'].strip(".tar.gz") } ] } } - - #assert 42 == 0, str(data_manager_dict) # Save metadata to out_file with open(sys.argv[1], 'w') as fh:
--- a/data_manager/data_manager_vep_cache_download.xml Mon Feb 28 14:42:50 2022 +0000 +++ b/data_manager/data_manager_vep_cache_download.xml Tue Mar 01 18:12:26 2022 +0000 @@ -11,27 +11,26 @@ label="DBKEY of genome that the VEP cache data is for" help="" /> <param name="url" type="text" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/" - label="FTP root url for VEP cache files" help=""/> - <param name="file_name" type="text" label="File name of cache file to be downloaded from root url." help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/> - <param name="display_name" type="text" label="Display name used in data-selection dropdowns." help="E.g. Homo sapiens hg38 (V105)"/> + label="FTP root url for VEP cache files" help="Release number should be equal to desired VEP version"/> + <param name="file_name" type="text" label="File name of cache file to be downloaded from root url" help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/> </inputs> <outputs> <data name="out_file" format="data_manager_json"/> </outputs> <tests> <test> - <param name="dbkey" value="ce11"/> + <param name="dbkey" value="ci3"/> <param name="url" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/"/> - <param name="file_name" value="caenorhabditis_elegans_vep_105_WBcel235.tar.gz"/> - <param name="display_name" value="C. elegans ce11 (V105)"/> + <param name="file_name" value="ciona_intestinalis_refseq_vep_105_KH.tar.gz"/> <output name="out_file" file="from_test-meta.data_manager.json"/> </test> </tests> <help> This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the -"vep_versioned_caches" data table. You should use the indexed version of the cache files and it is strongly recommended to -use the cache files which version number matches the VEP version number. Note that for most genomes there are three versions -of cache data available: default, refseq and merged (combining the former two). Choose the one suitable for your usage. +"vep_versioned_annotation_cache" data table. You should use the indexed version of the cache files and it is strongly +recommended to use the cache files which version number matches the VEP version number. Note that for most genomes there +are three versions of cache data available: default, refseq and merged (combining the former two). Choose the one suitable +for your usage. A general introduction to the VEP cache and download links can be found on the official website: https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html
--- a/data_manager_conf.xml Mon Feb 28 14:42:50 2022 +0000 +++ b/data_manager_conf.xml Tue Mar 01 18:12:26 2022 +0000 @@ -1,13 +1,14 @@ <?xml version="1.0"?> <data_managers> <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" > - <data_table name="vep_versioned_caches"> <!-- Defines a Data Table to be modified. --> + <data_table name="vep_versioned_annotation_cache"> <!-- Defines a Data Table to be modified. --> <output> <!-- Handle the output of the Data Manager Tool --> <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="cachetype" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="species" /> <!-- columns that are going to be specified by the Data Manager Tool --> <column name="path" output_ref="out_file" > <move type="directory" relativize_symlinks="True"> <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target>
--- a/test-data/from_test-meta.data_manager.json Mon Feb 28 14:42:50 2022 +0000 +++ b/test-data/from_test-meta.data_manager.json Tue Mar 01 18:12:26 2022 +0000 @@ -1,1 +1,1 @@ -{"data_tables": {"vep_versioned_caches": [{"cachetype": "default", "dbkey": "ce11", "name": "C. elegans ce11 (V105)", "path": "./caenorhabditis_elegans_vep_105_WBcel235", "value": "caenorhabditis_elegans_vep_105_WBcel235", "version": "105"}]}} \ No newline at end of file +{"data_tables": {"vep_versioned_annotation_cache": [{"cachetype": "refseq", "dbkey": "ci3", "name": "Ciona intestinalis ci3 (V105, Refseq)", "path": "./ciona_intestinalis_refseq_vep_105_KH", "species": "ciona_intestinalis", "value": "ciona_intestinalis_refseq_vep_105_KH", "version": "105"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vep_versioned_annotation_cache.loc Tue Mar 01 18:12:26 2022 +0000 @@ -0,0 +1,2 @@ +#<value> <dbkey> <version> <cachetype> <name> <species> <path> +# \ No newline at end of file
--- a/test-data/vep_versioned_caches.loc Mon Feb 28 14:42:50 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -#<value> <dbkey> <version> <cachetype> <name> <path> -# -caenorhabditis_elegans_vep_105_WBcel235 ce11 105 default C. elegans ce11 (V105) /home/sebastian/galaxy/tool-data/vep/105/ce11/default
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/vep_versioned_annotation_cache.loc.sample Tue Mar 01 18:12:26 2022 +0000 @@ -0,0 +1,11 @@ +#This file describes vep cache data and its metadata available on the server. +#The data table has the format (white space characters are TAB characters): +# +#<value> <dbkey> <version> <cachetype> <name> <species> <path> +# +#So, vep_versioned_annotation_cache.loc tables could look like this: +# +#homo_sapiens_vep_105_GRCh38 hg38 105 default Homo sapiens hg38 (V105) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/default +#homo_sapiens_refseq_vep_105_GRCh38 hg38 105 refseq Homo sapiens hg38 (V105, Refseq) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/refseq +#homo_sapiens_merged_vep_105_GRCh38 hg38 105 merged Homo sapiens hg38 (V105, Merged) homo_sapiens /path/to/vep_versioned_annotation_cache/105/hg38/merged +# \ No newline at end of file
--- a/tool-data/vep_versioned_caches.loc.sample Mon Feb 28 14:42:50 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,11 +0,0 @@ -#This file describes vep cache data and its metadata available on the server. -#The data table has the format (white space characters are TAB characters): -# -#<value> <dbkey> <version> <cachetype> <name> <path> -# -#So, vep_versioned_caches.loc tables could look like this: -# -#homo_sapiens_vep_105_GRCh38 hg38 105 default Homo sapiens hg38 (V105) /path/to/vep_versioned_caches/105/hg38/default -#homo_sapiens_refseq_vep_105_GRCh38 hg38 105 refseq Homo sapiens hg38 refseq (V105) /path/to/vep_versioned_caches/105/hg38/refseq -#homo_sapiens_merged_vep_105_GRCh38 hg38 105 merged Homo sapiens hg38 merged (V105) /path/to/vep_versioned_caches/105/hg38/merged -# \ No newline at end of file
--- a/tool_data_table_conf.xml.sample Mon Feb 28 14:42:50 2022 +0000 +++ b/tool_data_table_conf.xml.sample Tue Mar 01 18:12:26 2022 +0000 @@ -1,8 +1,8 @@ <tables> <!-- Table of installed versioned vep cache data --> - <table name="vep_versioned_caches" comment_char="#"> - <columns>value, dbkey, version, cachetype, name, path</columns> - <file path="tool-data/vep_versioned_caches.loc" /> + <table name="vep_versioned_annotation_cache" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, species, path</columns> + <file path="tool-data/vep_versioned_annotation_cache.loc" /> </table> <!-- Locations of dbkeys and len files under genome directory --> <table name="__dbkeys__" comment_char="#">
--- a/tool_data_table_conf.xml.test Mon Feb 28 14:42:50 2022 +0000 +++ b/tool_data_table_conf.xml.test Tue Mar 01 18:12:26 2022 +0000 @@ -1,8 +1,8 @@ <tables> <!-- Table of installed versioned vep cache data --> - <table name="vep_versioned_caches" comment_char="#"> - <columns>value, dbkey, version, cachetype, name, path</columns> - <file path="${__HERE__}/test-data/vep_versioned_caches.loc" /> + <table name="vep_versioned_annotation_cache" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, species, path</columns> + <file path="${__HERE__}/test-data/vep_versioned_annotation_cache.loc" /> </table> <!-- Locations of dbkeys and len files under genome directory --> <table name="__dbkeys__" comment_char="#">