Mercurial > repos > sh477 > data_manager_vep_cache_downloader

--- a/data_manager/data_manager_vep_cache_download.py	Mon Feb 28 14:42:50 2022 +0000
+++ b/data_manager/data_manager_vep_cache_download.py	Tue Mar 01 18:12:26 2022 +0000
@@ -18,9 +18,11 @@

     # Process parameters for metadata and file download
     url = params['param_dict']['url'].rstrip("/") + "/" + params['param_dict']['file_name'].lstrip("/")
-    m = re.search(r"_([^_]*?)_vep_(\d+?)_", params['param_dict']['file_name'])
-    version = str(m.group(2))
-    cache_type = m.group(1) if m.group(1) == "merged" or m.group(1) == "refseq" else "default"
+    m = re.search(r"(.*?)(merged|refseq)?_vep_(\d+?)_", params['param_dict']['file_name'])
+    version = str(m.group(3))
+    cache_type = m.group(2) if m.group(2) else "default"
+    species = m.group(1).rstrip("_")
+    display_name = f"{species.capitalize().replace('_', ' ')} {params['param_dict']['dbkey']} (V{version}{'' if cache_type == 'default' else ', ' + cache_type.capitalize()})"

     # Download and extract given cache archive, remove archive afterwards
     final_file, headers = urlretrieve(url, os.path.join(target_directory, params['param_dict']['file_name']))
@@ -32,20 +34,19 @@
     # Construct metadata for the new data table entry
     data_manager_dict = {
         'data_tables': {
-            'vep_versioned_caches': [
+            'vep_versioned_annotation_cache': [
                 {
                     'value': params['param_dict']['file_name'].strip(".tar.gz"),
                     'dbkey': params['param_dict']['dbkey'],
                     'version': version,
                     'cachetype': cache_type,
-                    'name': params['param_dict']['display_name'],
+                    'name': display_name,
+                    'species': species,
                     'path': './%s' % params['param_dict']['file_name'].strip(".tar.gz")
                 }
             ]
         }
     }
-
-    #assert 42 == 0, str(data_manager_dict)

     # Save metadata to out_file
     with open(sys.argv[1], 'w') as fh:
--- a/data_manager/data_manager_vep_cache_download.xml	Mon Feb 28 14:42:50 2022 +0000
+++ b/data_manager/data_manager_vep_cache_download.xml	Tue Mar 01 18:12:26 2022 +0000
@@ -11,27 +11,26 @@
             label="DBKEY of genome that the VEP cache data is for"
             help="" />
         <param name="url" type="text" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/"
-            label="FTP root url for VEP cache files" help=""/>
-        <param name="file_name" type="text" label="File name of cache file to be downloaded from root url." help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/>
-        <param name="display_name" type="text" label="Display name used in data-selection dropdowns." help="E.g. Homo sapiens hg38 (V105)"/>
+            label="FTP root url for VEP cache files" help="Release number should be equal to desired VEP version"/>
+        <param name="file_name" type="text" label="File name of cache file to be downloaded from root url" help="E.g. homo_sapiens_vep_105_GRCh38.tar.gz"/>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json"/>
     </outputs>
     <tests>
     <test>
-        <param name="dbkey" value="ce11"/>
+        <param name="dbkey" value="ci3"/>
         <param name="url" value="http://ftp.ensembl.org/pub/release-105/variation/indexed_vep_cache/"/>
-        <param name="file_name" value="caenorhabditis_elegans_vep_105_WBcel235.tar.gz"/>
-        <param name="display_name" value="C. elegans ce11 (V105)"/>
+        <param name="file_name" value="ciona_intestinalis_refseq_vep_105_KH.tar.gz"/>
         <output name="out_file" file="from_test-meta.data_manager.json"/>
         </test>
     </tests>
     <help>
 This tool downloads given versions of VEP cache annotation files and makes them available to Ensembl VEP in Galaxy via the
-"vep_versioned_caches" data table. You should use the indexed version of the cache files and it is strongly recommended to
-use the cache files which version number matches the VEP version number. Note that for most genomes there are three versions
-of cache data available: default, refseq and merged (combining the former two). Choose the one suitable for your usage.
+"vep_versioned_annotation_cache" data table. You should use the indexed version of the cache files and it is strongly
+recommended to use the cache files whose version number matches the VEP version number. Note that for most genomes there
+are three versions of cache data available: default, refseq and merged (combining the former two). Choose the one suitable
+for your usage.

 A general introduction to the VEP cache and download links can be found on the official website:
 https://www.ensembl.org/info/docs/tools/vep/script/vep_cache.html
--- a/data_manager_conf.xml	Mon Feb 28 14:42:50 2022 +0000
+++ b/data_manager_conf.xml	Tue Mar 01 18:12:26 2022 +0000
@@ -1,13 +1,14 @@
 <?xml version="1.0"?>
 <data_managers>
     <data_manager tool_file="data_manager/data_manager_vep_cache_download.xml" id="data_manager_vep_cache_download" >
-        <data_table name="vep_versioned_caches">  <!-- Defines a Data Table to be modified. -->
+        <data_table name="vep_versioned_annotation_cache">  <!-- Defines a Data Table to be modified. -->
             <output> <!-- Handle the output of the Data Manager Tool -->
                 <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
                 <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool -->
                 <column name="version" /> <!-- columns that are going to be specified by the Data Manager Tool -->
                 <column name="cachetype" />  <!-- columns that are going to be specified by the Data Manager Tool -->
                 <column name="name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+				<column name="species" />  <!-- columns that are going to be specified by the Data Manager Tool -->
                 <column name="path" output_ref="out_file" >
                     <move type="directory" relativize_symlinks="True">
                         <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">vep/${version}/${dbkey}/${cachetype}</target>
--- a/test-data/from_test-meta.data_manager.json	Mon Feb 28 14:42:50 2022 +0000
+++ b/test-data/from_test-meta.data_manager.json	Tue Mar 01 18:12:26 2022 +0000
@@ -1,1 +1,1 @@
-{"data_tables": {"vep_versioned_caches": [{"cachetype": "default", "dbkey": "ce11", "name": "C. elegans ce11 (V105)", "path": "./caenorhabditis_elegans_vep_105_WBcel235", "value": "caenorhabditis_elegans_vep_105_WBcel235", "version": "105"}]}}
\ No newline at end of file
+{"data_tables": {"vep_versioned_annotation_cache": [{"cachetype": "refseq", "dbkey": "ci3", "name": "Ciona intestinalis ci3 (V105, Refseq)", "path": "./ciona_intestinalis_refseq_vep_105_KH", "species": "ciona_intestinalis", "value": "ciona_intestinalis_refseq_vep_105_KH", "version": "105"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/vep_versioned_annotation_cache.loc	Tue Mar 01 18:12:26 2022 +0000
@@ -0,0 +1,2 @@
+#<value>									<dbkey>			<version>		<cachetype>		<name>		<species>						<path>
+#
\ No newline at end of file
--- a/test-data/vep_versioned_caches.loc	Mon Feb 28 14:42:50 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-#<value>									<dbkey>			<version>		<cachetype>		<name>								<path>
-#
-caenorhabditis_elegans_vep_105_WBcel235	ce11	105	default	C. elegans ce11 (V105)	/home/sebastian/galaxy/tool-data/vep/105/ce11/default
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/vep_versioned_annotation_cache.loc.sample	Tue Mar 01 18:12:26 2022 +0000
@@ -0,0 +1,11 @@
+#This file describes the VEP cache data and associated metadata available on the server.
+#The data table has the format (whitespace characters are TAB characters):
+#
+#<value>									<dbkey>			<version>		<cachetype>		<name>								<species>			<path>
+#
+#So, vep_versioned_annotation_cache.loc tables could look like this:
+#
+#homo_sapiens_vep_105_GRCh38				hg38			105				default			Homo sapiens hg38 (V105)			homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/default
+#homo_sapiens_refseq_vep_105_GRCh38			hg38			105				refseq			Homo sapiens hg38 (V105, Refseq)	homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/refseq
+#homo_sapiens_merged_vep_105_GRCh38			hg38			105				merged			Homo sapiens hg38 (V105, Merged)	homo_sapiens		/path/to/vep_versioned_annotation_cache/105/hg38/merged
+#
\ No newline at end of file
--- a/tool-data/vep_versioned_caches.loc.sample	Mon Feb 28 14:42:50 2022 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,11 +0,0 @@
-#This file describes vep cache data and its metadata available on the server.
-#The data table has the format (white space characters are TAB characters):
-#
-#<value>									<dbkey>			<version>		<cachetype>		<name>								<path>
-#
-#So, vep_versioned_caches.loc tables could look like this:
-#
-#homo_sapiens_vep_105_GRCh38				hg38			105				default			Homo sapiens hg38 (V105)			/path/to/vep_versioned_caches/105/hg38/default
-#homo_sapiens_refseq_vep_105_GRCh38			hg38			105				refseq			Homo sapiens hg38 refseq (V105)		/path/to/vep_versioned_caches/105/hg38/refseq
-#homo_sapiens_merged_vep_105_GRCh38			hg38			105				merged			Homo sapiens hg38 merged (V105)		/path/to/vep_versioned_caches/105/hg38/merged
-#
\ No newline at end of file
--- a/tool_data_table_conf.xml.sample	Mon Feb 28 14:42:50 2022 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Mar 01 18:12:26 2022 +0000
@@ -1,8 +1,8 @@
 <tables>
     <!-- Table of installed versioned vep cache data -->
-    <table name="vep_versioned_caches" comment_char="#">
-        <columns>value, dbkey, version, cachetype, name, path</columns>
-        <file path="tool-data/vep_versioned_caches.loc" />
+    <table name="vep_versioned_annotation_cache" comment_char="#">
+        <columns>value, dbkey, version, cachetype, name, species, path</columns>
+        <file path="tool-data/vep_versioned_annotation_cache.loc" />
     </table>
     <!-- Locations of dbkeys and len files under genome directory -->
     <table name="__dbkeys__" comment_char="#">
--- a/tool_data_table_conf.xml.test	Mon Feb 28 14:42:50 2022 +0000
+++ b/tool_data_table_conf.xml.test	Tue Mar 01 18:12:26 2022 +0000
@@ -1,8 +1,8 @@
 <tables>
     <!-- Table of installed versioned vep cache data -->
-    <table name="vep_versioned_caches" comment_char="#">
-        <columns>value, dbkey, version, cachetype, name, path</columns>
-        <file path="${__HERE__}/test-data/vep_versioned_caches.loc" />
+    <table name="vep_versioned_annotation_cache" comment_char="#">
+        <columns>value, dbkey, version, cachetype, name, species, path</columns>
+        <file path="${__HERE__}/test-data/vep_versioned_annotation_cache.loc" />
     </table>
     <!-- Locations of dbkeys and len files under genome directory -->
     <table name="__dbkeys__" comment_char="#">