#!/usr/bin/env python
"""Galaxy data manager script: download and install an Ensembl VEP cache.

Usage: data_manager_vep_cache_download.py <out_file>

``out_file`` is a JSON file that is read for the job parameters and then
overwritten with the metadata for the new data table entry.
"""

import datetime
import json
import os
import re
from urllib.request import urlretrieve
import sys
import tarfile

# Suffix of the cache archives; removed to build the entry value and path.
_ARCHIVE_SUFFIX = ".tar.gz"

# Captures the name token right before "_vep_" (group 1) and the numeric
# release right after it (group 2), e.g. "_merged_vep_105_".
_FILE_NAME_PATTERN = re.compile(r"_([^_]*?)_vep_(\d+?)_")

# Tokens before "_vep_" that name a cache type; anything else is "default".
_NAMED_CACHE_TYPES = ("merged", "refseq")


def _parse_cache_file_name(file_name):
    """Return ``(version, cache_type)`` parsed from a cache archive name.

    Raises:
        ValueError: if ``file_name`` does not contain ``_<token>_vep_<digits>_``.
    """
    match = _FILE_NAME_PATTERN.search(file_name)
    if match is None:
        raise ValueError(
            "Cannot derive version and cache type from file name %r; "
            "expected a name like 'homo_sapiens_merged_vep_105_GRCh38.tar.gz'"
            % file_name
        )
    token, version = match.groups()
    cache_type = token if token in _NAMED_CACHE_TYPES else "default"
    return version, cache_type


def _strip_archive_suffix(file_name):
    """Return ``file_name`` without a trailing ".tar.gz", if present."""
    # str.strip() would remove any of the characters ". t a r g z" from both
    # ends (mangling e.g. "gallus_..."), so cut the suffix explicitly.
    if file_name.endswith(_ARCHIVE_SUFFIX):
        return file_name[:-len(_ARCHIVE_SUFFIX)]
    return file_name


def main():
    """Download a cache archive, unpack it and write the data table entry.

    Reads the JSON parameters from ``sys.argv[1]``, downloads
    ``<url>/<file_name>`` into the job's ``extra_files_path``, extracts the
    archive there, deletes the archive and finally overwrites ``sys.argv[1]``
    with the JSON metadata describing the new entry.

    Raises:
        ValueError: if the file name does not follow the VEP cache naming
            scheme (checked before anything is downloaded).
    """
    out_file = sys.argv[1]
    with open(out_file) as fh:
        params = json.load(fh)
    param_dict = params['param_dict']
    file_name = param_dict['file_name']

    # Validate the name first so a bad parameter fails before the download.
    version, cache_type = _parse_cache_file_name(file_name)
    # Only the last path component is used locally, so the archive always
    # lands inside the target directory.
    archive_name = os.path.basename(file_name)
    entry_id = _strip_archive_suffix(archive_name)

    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    # Download and extract the given cache archive, remove archive afterwards
    url = param_dict['url'].rstrip("/") + "/" + file_name.lstrip("/")
    archive_path, _ = urlretrieve(
        url, os.path.join(target_directory, archive_name)
    )
    try:
        with tarfile.open(archive_path, "r:gz") as tar:
            # The archive comes from a remote server: use the "data"
            # extraction filter where this Python provides it, so members
            # cannot escape the target directory.
            if hasattr(tarfile, "data_filter"):
                tar.extractall(target_directory, filter="data")
            else:
                tar.extractall(target_directory)
    finally:
        # Never leave the (possibly large or corrupt) archive behind.
        os.remove(archive_path)

    # Construct metadata for the new data table entry
    # NOTE(review): the descriptions elsewhere in this changeset call the
    # table "vep_versioned_caches"; the key below is kept as committed --
    # confirm it matches the data_table name in data_manager_conf.xml.
    data_manager_dict = {
        'data_tables': {
            'vep_cache_databases': [
                {
                    'value': entry_id,
                    'dbkey': param_dict['dbkey'],
                    'version': version,
                    'cachetype': cache_type,
                    'name': param_dict['display_name'],
                    'path': './%s' % entry_id,
                }
            ]
        }
    }

    # Save metadata to out_file
    with open(out_file, 'w') as fh:
        json.dump(data_manager_dict, fh, sort_keys=True)


if __name__ == "__main__":
    main()
+ +A general introduction to the VEP cache and download links can be found on the official website: +https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html + + + 10.1186/s13059-016-0974-4 + + diff -r b0c0733f58da -r 612026ea6db5 data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,21 @@ + + + + + + + + + + + + + vep/${version}/${dbkey}/${cachetype} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/vep/${version}/${dbkey}/${cachetype}/ + abspath + + + + + \ No newline at end of file diff -r b0c0733f58da -r 612026ea6db5 data_manager_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml.sample Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,12 @@ + + + + value, dbkey, version, cachetype, name, path + +
+ + + value, name, len_path + +
+
\ No newline at end of file diff -r b0c0733f58da -r 612026ea6db5 data_manager_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml.test Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,12 @@ + + + + value, dbkey, version, cachetype, name, path + +
+ + + value, name, len_path + +
+
\ No newline at end of file diff -r b0c0733f58da -r 612026ea6db5 test-data/dbkeys.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dbkeys.loc Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,3 @@ +# +hg38 Human hg38 a_path +ce11 C. elegans ce11 a_path diff -r b0c0733f58da -r 612026ea6db5 test-data/from_test-meta.data_manager.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/from_test-meta.data_manager.json Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"vep_cache_databases": [{"value": "caenorhabditis_elegans_vep_105_WBcel235", "dbkey": "ce11", "version": "105", "cachetype": "default", "name": "C. elegans c11 (V105)", "path": "./caenorhabditis_elegans_vep_105_WBcel235"}]}} \ No newline at end of file diff -r b0c0733f58da -r 612026ea6db5 test-data/vep_versioned_caches.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vep_versioned_caches.loc Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,2 @@ +# +# diff -r b0c0733f58da -r 612026ea6db5 tool-data/dbkeys.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dbkeys.loc.sample Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,1 @@ +# diff -r b0c0733f58da -r 612026ea6db5 tool-data/vep_versioned_caches.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/vep_versioned_caches.loc.sample Tue Feb 22 16:26:44 2022 +0000 @@ -0,0 +1,11 @@ +#This file describes vep cache data and its metadata available on the server. 
+#The data table has the format (white space characters are TAB characters): +# +#value dbkey version cachetype name path +# +#So, vep_versioned_caches.loc tables could look like this: +# +#homo_sapiens_vep_105_GRCh38 hg38 105 default Homo sapiens hg38 (V105) /path/to/vep_versioned_caches/105/hg38/default +#homo_sapiens_refseq_vep_105_GRCh38 hg38 105 refseq Homo sapiens hg38 refseq (V105) /path/to/vep_versioned_caches/105/hg38/refseq +#homo_sapiens_merged_vep_105_GRCh38 hg38 105 merged Homo sapiens hg38 merged (V105) /path/to/vep_versioned_caches/105/hg38/merged +# \ No newline at end of file