Mercurial > repos > sh477 > data_manager_vep_cache_downloader
changeset 5:a3dba0440f08 draft
Reformatting
author | sh477 |
---|---|
date | Mon, 28 Feb 2022 09:17:29 +0000 |
parents | 97cd72b5130e |
children | 3bd006fa2be2 |
files | .shed.yml data_manager/data_manager_vep_cache_download.py data_manager/data_manager_vep_cache_download.xml data_manager_conf.xml data_manager_conf.xml.sample data_manager_conf.xml.test test-data/dbkeys.loc test-data/vep_versioned_caches.loc tool-data/dbkeys.loc.sample |
diffstat | 9 files changed, 157 insertions(+), 157 deletions(-) [+] |
line wrap: on
line diff
--- a/.shed.yml Wed Feb 23 12:26:11 2022 +0000 +++ b/.shed.yml Mon Feb 28 09:17:29 2022 +0000 @@ -1,11 +1,11 @@ -categories: -- Data Managers -description: Download and install annotation cache files for Ensembl VEP -long_description: | - This tool downloads given versions of VEP cache annotation files and makes - them available to Ensembl VEP in Galaxy via the "vep_versioned_caches" data - table. -name: data_manager_vep_cache_downloader -owner: sh477 -remote_repository_url: +categories: +- Data Managers +description: Download and install annotation cache files for Ensembl VEP +long_description: | + This tool downloads given versions of VEP cache annotation files and makes + them available to Ensembl VEP in Galaxy via the "vep_versioned_caches" data + table. +name: data_manager_vep_cache_downloader +owner: sh477 +remote_repository_url: type: unrestricted \ No newline at end of file
--- a/data_manager/data_manager_vep_cache_download.py Wed Feb 23 12:26:11 2022 +0000 +++ b/data_manager/data_manager_vep_cache_download.py Mon Feb 28 09:17:29 2022 +0000 @@ -1,56 +1,56 @@ -#!/usr/bin/env python - -import datetime -import json -import os -import re -from urllib.request import urlretrieve -import sys -import tarfile - - -def main(): - # Read in given out_file and create target directory for file download - with open(sys.argv[1]) as fh: - params = json.load(fh) - target_directory = params['output_data'][0]['extra_files_path'] - os.mkdir(target_directory) - - # Process parameters for metadata and file download - url = params['param_dict']['url'].rstrip("/") + "/" + params['param_dict']['file_name'].lstrip("/") - m = re.search(r"_([^_]*?)_vep_(\d+?)_", params['param_dict']['file_name']) - version = str(m.group(2)) - cache_type = m.group(1) if m.group(1) == "merged" or m.group(1) == "refseq" else "default" - - # Download and extract given cache archive, remove archive afterwards - final_file, headers = urlretrieve(url, os.path.join(target_directory, params['param_dict']['file_name'])) - tar = tarfile.open(final_file, "r:gz") - tar.extractall(target_directory) - tar.close() - os.remove(final_file) - - # Construct metadata for the new data table entry - data_manager_dict = { - 'data_tables': { - 'vep_versioned_caches': [ - { - 'value': params['param_dict']['file_name'].strip(".tar.gz"), - 'dbkey': params['param_dict']['dbkey'], - 'version': version, - 'cachetype': cache_type, - 'name': params['param_dict']['display_name'], - 'path': './%s' % params['param_dict']['file_name'].strip(".tar.gz") - } - ] - } - } - - #assert 42 == 0, str(data_manager_dict) - - # Save metadata to out_file - with open(sys.argv[1], 'w') as fh: - json.dump(data_manager_dict, fh, sort_keys=True) - - -if __name__ == "__main__": - main() +#!/usr/bin/env python + +import datetime +import json +import os +import re +from urllib.request import urlretrieve +import sys +import tarfile + + +def main(): + # Read in given out_file and create target directory for file download + with open(sys.argv[1]) as fh: + params = json.load(fh) + target_directory = params['output_data'][0]['extra_files_path'] + os.mkdir(target_directory) + + # Process parameters for metadata and file download + url = params['param_dict']['url'].rstrip("/") + "/" + params['param_dict']['file_name'].lstrip("/") + m = re.search(r"_([^_]*?)_vep_(\d+?)_", params['param_dict']['file_name']) + version = str(m.group(2)) + cache_type = m.group(1) if m.group(1) == "merged" or m.group(1) == "refseq" else "default" + + # Download and extract given cache archive, remove archive afterwards + final_file, headers = urlretrieve(url, os.path.join(target_directory, params['param_dict']['file_name'])) + tar = tarfile.open(final_file, "r:gz") + tar.extractall(target_directory) + tar.close() + os.remove(final_file) + + # Construct metadata for the new data table entry + data_manager_dict = { + 'data_tables': { + 'vep_versioned_caches': [ + { + 'value': params['param_dict']['file_name'].strip(".tar.gz"), + 'dbkey': params['param_dict']['dbkey'], + 'version': version, + 'cachetype': cache_type, + 'name': params['param_dict']['display_name'], + 'path': './%s' % params['param_dict']['file_name'].strip(".tar.gz") + } + ] + } + } + + #assert 42 == 0, str(data_manager_dict) + + # Save metadata to out_file + with open(sys.argv[1], 'w') as fh: + json.dump(data_manager_dict, fh, sort_keys=True) + + +if __name__ == "__main__": + main()
--- a/data_manager/data_manager_vep_cache_download.xml Wed Feb 23 12:26:11 2022 +0000 +++ b/data_manager/data_manager_vep_cache_download.xml Mon Feb 28 09:17:29 2022 +0000 @@ -1,42 +1,42 @@ -<tool id="data_manager_vep_cache_download" name="Download and install VEP cache" version="0.1" tool_type="manage_data"> - <description>ToDo:the cache files required by VEP</description> - <requirements> - <requirement type="package" version="3.9">python</requirement> - </requirements> - <command detect_errors="exit_code"> - python '$__tool_directory__/data_manager_vep_cache_download.py' '$out_file' - </command> - <inputs> - <param name="dbkey" type="genomebuild" - label="DBKEY of genome that the VEP cache data is for" - help="" /> - <param name="url" type="text" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/" - label="FTP root url for VEP cache files" help=""/> - <param name="file_name" type="text" label="File name of cache file to be downloaded from root url." help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/> - <param name="display_name" type="text" label="Display name used in data-selection dropdowns." help="E.g. Homo sapiens hg38 (V105)"/> - </inputs> - <outputs> - <data name="out_file" format="data_manager_json"/> - </outputs> - <tests> - <test> - <param name="dbkey" value="ce11"/> - <param name="url" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/"/> - <param name="file_name" value="caenorhabditis_elegans_vep_105_WBcel235.tar.gz"/> - <param name="display_name" value="C. elegans ce11 (V105)"/> - <output name="out_file" file="from_test-meta.data_manager.json"/> - </test> - </tests> - <help> -This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the -"vep_versioned_caches" data table. You should use the indexed version of the cache files and it is strongly recommended to -use the cache files which version number matches the VEP version number. Note that for most genomes there are three versions -of cache data available: default, refseq and merged (combining the former two). Choose the one suitable for your usage. - -A general introduction to the VEP cache and download links can be found on the official website: -https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html - </help> - <citations> - <citation type="doi">10.1186/s13059-016-0974-4</citation> - </citations> -</tool> +<tool id="data_manager_vep_cache_download" name="Download and install VEP cache" version="0.1" tool_type="manage_data"> + <description>versioned annotation files for VEP</description> + <requirements> + <requirement type="package" version="3.9">python</requirement> + </requirements> + <command detect_errors="exit_code"> + python '$__tool_directory__/data_manager_vep_cache_download.py' '$out_file' + </command> + <inputs> + <param name="dbkey" type="genomebuild" + label="DBKEY of genome that the VEP cache data is for" + help="" /> + <param name="url" type="text" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/" + label="FTP root url for VEP cache files" help=""/> + <param name="file_name" type="text" label="File name of cache file to be downloaded from root url." help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/> + <param name="display_name" type="text" label="Display name used in data-selection dropdowns." help="E.g. Homo sapiens hg38 (V105)"/> + </inputs> + <outputs> + <data name="out_file" format="data_manager_json"/> + </outputs> + <tests> + <test> + <param name="dbkey" value="ce11"/> + <param name="url" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/"/> + <param name="file_name" value="caenorhabditis_elegans_vep_105_WBcel235.tar.gz"/> + <param name="display_name" value="C. elegans ce11 (V105)"/> + <output name="out_file" file="from_test-meta.data_manager.json"/> + </test> + </tests> + <help> +This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the +"vep_versioned_caches" data table. You should use the indexed version of the cache files and it is strongly recommended to +use the cache files which version number matches the VEP version number. Note that for most genomes there are three versions +of cache data available: default, refseq and merged (combining the former two). Choose the one suitable for your usage. + +A general introduction to the VEP cache and download links can be found on the official website: +https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html + </help> + <citations> + <citation type="doi">10.1186/s13059-016-0974-4</citation> + </citations> +</tool> \ No newline at end of file
--- a/data_manager_conf.xml Wed Feb 23 12:26:11 2022 +0000 +++ b/data_manager_conf.xml Mon Feb 28 09:17:29 2022 +0000 @@ -1,21 +1,21 @@ -<?xml version="1.0"?> -<data_managers> - <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" > - <data_table name="vep_versioned_caches"> <!-- Defines a Data Table to be modified. --> - <output> <!-- Handle the output of the Data Manager Tool --> - <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> - <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool --> - <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> - <column name="cachetype" /> <!-- columns that are going to be specified by the Data Manager Tool --> - <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> - <column name="path" output_ref="out_file" > - <move type="directory" relativize_symlinks="True"> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target> - </move> - <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/vep/${version}/${dbkey}/${cachetype}/</value_translation> - <value_translation type="function">abspath</value_translation> - </column> - </output> - </data_table> - </data_manager> -</data_managers> +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" > + <data_table name="vep_versioned_caches"> <!-- Defines a Data Table to be modified. --> + <output> <!-- Handle the output of the Data Manager Tool --> + <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="cachetype" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="name" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="path" output_ref="out_file" > + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/vep/${version}/${dbkey}/${cachetype}/</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers> \ No newline at end of file
--- a/data_manager_conf.xml.sample Wed Feb 23 12:26:11 2022 +0000 +++ b/data_manager_conf.xml.sample Mon Feb 28 09:17:29 2022 +0000 @@ -1,12 +1,12 @@ -<tables> - <!-- Table of installed versioned vep cache data --> - <table name="vep_versioned_caches" comment_char="#"> - <columns>value, dbkey, version, cachetype, name, path</columns> - <file path="tool-data/vep_versioned_caches.loc" /> - </table> - <!-- Locations of dbkeys and len files under genome directory --> - <table name="__dbkeys__" comment_char="#"> - <columns>value, name, len_path</columns> - <file path="tool-data/dbkeys.loc" /> - </table> -</tables> +<tables> + <!-- Table of installed versioned vep cache data --> + <table name="vep_versioned_caches" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, path</columns> + <file path="tool-data/vep_versioned_caches.loc" /> + </table> + <!-- Locations of dbkeys and len files under genome directory --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="tool-data/dbkeys.loc" /> + </table> +</tables> \ No newline at end of file
--- a/data_manager_conf.xml.test Wed Feb 23 12:26:11 2022 +0000 +++ b/data_manager_conf.xml.test Mon Feb 28 09:17:29 2022 +0000 @@ -1,12 +1,12 @@ -<tables> - <!-- Table of installed versioned vep cache data --> - <table name="vep_versioned_caches" comment_char="#"> - <columns>value, dbkey, version, cachetype, name, path</columns> - <file path="${__HERE__}/test-data/vep_versioned_caches.loc" /> - </table> - <!-- Locations of dbkeys and len files under genome directory --> - <table name="__dbkeys__" comment_char="#"> - <columns>value, name, len_path</columns> - <file path="${__HERE__}/test-data/dbkeys.loc" /> - </table> -</tables> +<tables> + <!-- Table of installed versioned vep cache data --> + <table name="vep_versioned_caches" comment_char="#"> + <columns>value, dbkey, version, cachetype, name, path</columns> + <file path="${__HERE__}/test-data/vep_versioned_caches.loc" /> + </table> + <!-- Locations of dbkeys and len files under genome directory --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="${__HERE__}/test-data/dbkeys.loc" /> + </table> +</tables> \ No newline at end of file
--- a/test-data/dbkeys.loc Wed Feb 23 12:26:11 2022 +0000 +++ b/test-data/dbkeys.loc Mon Feb 28 09:17:29 2022 +0000 @@ -1,3 +1,3 @@ #<dbkey> <display_name> <len_file_path> hg38 Human hg38 a_path -ce11 C. elegans ce11 a_path +ce11 C. elegans ce11 a_path \ No newline at end of file