changeset 0:3e344aedb267 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_interproscan commit 2f5d27a375fcc2e8d77914b3d9e402a9e2df2d97"
author iuc
date Mon, 15 Nov 2021 17:20:14 +0000
parents
children d9a238ff2bc3
files data_manager/interproscan.py data_manager/interproscan.xml data_manager_conf.xml test-data/interproscan.loc tool-data/interproscan.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 7 files changed, 276 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/interproscan.py	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+import argparse
+import hashlib
+import json
+import operator
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tarfile
+
+import requests
+
+
+# Base URL of the GitHub API for the InterProScan repository; list_tags()
+# appends 'tags' to it to enumerate the available releases.
+GH_REPO_API = 'https://api.github.com/repos/ebi-pf-team/interproscan/'
+# URL templates on the EBI server for the checksum file and the full tarball.
+# The '{version}' placeholder is filled with the release tag via str.format().
+MD5_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz.md5'
+DATA_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz'
+
+# For tests: download a smaller archive containing *some* data
+# (the GitHub source archive of the tag, used when --partial is given).
+PARTIAL_URL = 'https://github.com/ebi-pf-team/interproscan/archive/{version}.tar.gz'
+
+
+def list_tags(url=None):
+    """Return the sorted list of InterProScan release tags found on GitHub.
+
+    Follows the ``next`` pagination links of the tags API until the last
+    page, keeping only tag names shaped like ``5.52-86.0``.
+
+    :param url: API page to start from; defaults to ``GH_REPO_API + 'tags'``.
+    :return: matching tag names, sorted as strings. The accepted pattern is
+        fixed-width, so this is also the version order and the last element
+        is the most recent release.
+    :raises requests.HTTPError: if the API answers with an error status.
+    """
+    if not url:
+        url = GH_REPO_API + 'tags'
+
+    tag_pattern = re.compile(r"^[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]$")
+
+    tags = []
+    while url:
+        resp = requests.get(url=url)
+        # Fail early: an error payload is a JSON object, not a list of tags,
+        # and would otherwise surface as a confusing TypeError below.
+        resp.raise_for_status()
+        tags += [tag['name'] for tag in resp.json() if tag_pattern.match(tag['name'])]
+        # requests exposes the parsed Link header; no 'next' means last page
+        url = resp.links.get('next', {}).get('url')
+
+    return sorted(tags)
+
+
+def download_file(url, dest):
+    """Stream the content found at ``url`` into the local file ``dest``.
+
+    The HTTP status is checked before ``dest`` is opened, so nothing is
+    written when the server answers with an error.
+    """
+    with requests.get(url, stream=True) as response:
+        response.raise_for_status()
+        with open(dest, 'wb') as out_fh:
+            # Write the body in 8 KiB pieces instead of loading it at once
+            out_fh.writelines(response.iter_content(chunk_size=8192))
+
+
+def _md5sum(path, chunk_size=1024 * 1024):
+    """Return the hexadecimal MD5 digest of the file at ``path``.
+
+    The file is read ``chunk_size`` bytes at a time so that a large tarball
+    does not have to be loaded in memory at once.
+    """
+    digest = hashlib.md5()
+    with open(path, 'rb') as fh:
+        for chunk in iter(lambda: fh.read(chunk_size), b''):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def main():
+    """Download InterProScan data and register it in a Galaxy data table.
+
+    Steps, in order:
+
+    1. read the data manager JSON file given on the command line;
+    2. pick the requested version, or the latest tag when none is given;
+    3. fetch the MD5 file, download the tarball (full or ``--partial``) and,
+       for the full tarball, verify its checksum;
+    4. extract the archive, keep only its data directory as
+       ``<output>/data`` and delete the tarball and the rest of the tree;
+    5. run ``initial_setup.py`` with a properties file pointing at that
+       data directory;
+    6. append the new entry to the data table and write the JSON back.
+
+    :raises RuntimeError: if the version is not a known tag, the MD5 file is
+        malformed, the checksum does not match, or ``interproscan.sh`` is
+        not found in PATH.
+
+    Exits with the return code of ``initial_setup.py`` when it is non-zero.
+    """
+    parser = argparse.ArgumentParser(description='Download data for InterProScan')
+    parser.add_argument('--partial', dest='partial', action='store_true', help='Only download a small subset of data (for testing)')
+    parser.add_argument('-v', '--version', help='Specify an InterProScan version (default: latest)')
+    parser.add_argument("datatable_name")
+    parser.add_argument("galaxy_datamanager_filename")
+
+    args = parser.parse_args()
+
+    with open(args.galaxy_datamanager_filename) as fh:
+        config = json.load(fh)
+
+    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)
+    data_manager_dict = {"data_tables": config.get("data_tables", {})}
+    table_entries = data_manager_dict["data_tables"].setdefault(args.datatable_name, [])
+
+    # Locate the InterProScan installation before downloading anything, so a
+    # missing executable is reported clearly and early.
+    ipr_script = shutil.which("interproscan.sh")
+    if ipr_script is None:
+        raise RuntimeError("Could not find interproscan.sh in PATH")
+    ipr_dir = os.path.dirname(os.path.realpath(ipr_script))
+
+    os.mkdir(output_directory)
+
+    all_tags = list_tags()
+
+    if args.version:
+        if args.version not in all_tags:
+            raise RuntimeError("Version '%s' is not valid" % args.version)
+        tag = args.version
+    else:
+        tag = all_tags[-1]
+
+    print("Will download data for InterProScan version: %s" % tag)
+
+    print("Getting MD5 checksum:")
+    md5 = requests.get(url=MD5_URL.format(version=tag)).text
+    if not re.match(r"^([a-fA-F\d]{32})  interproscan-[0-9]\.[0-9]{2}-[0-9]{2}\.[0-9]-64-bit.tar.gz$", md5):
+        raise RuntimeError("Got invalid MD5 from the InterProScan FTP server: '%s'" % md5)
+    print("%s" % md5)
+
+    if args.partial:
+        print("Downloading partial data tarball...")
+        source_url = PARTIAL_URL.format(version=tag)
+    else:
+        print("Downloading data tarball...")
+        source_url = DATA_URL.format(version=tag)
+    dest_tar = os.path.join(output_directory, source_url.split('/')[-1])
+    download_file(source_url, dest_tar)
+
+    if not args.partial:
+        # The published checksum only applies to the full tarball
+        print("Finished, now checking md5...")
+        md5_computed = _md5sum(dest_tar)
+        # hexdigest() is lower case while the regex above accepts both cases
+        if not md5.lower().startswith(md5_computed):
+            raise RuntimeError("MD5 check failed: computed '%s', expected '%s'" % (md5_computed, md5))
+
+    print("Ok, now extracting data...")
+    with tarfile.open(dest_tar, "r:gz") as tar:
+        tar.extractall(output_directory)
+
+    extracted_dir = os.path.join(output_directory, 'interproscan-%s' % tag)
+    data_dir = os.path.join(output_directory, 'data')
+
+    if args.partial:
+        print("Moving partial data files around...")
+        shutil.move(os.path.join(extracted_dir, 'core/jms-implementation/support-mini-x86-32/data/'), data_dir)
+    else:
+        print("Moving data files around...")
+        # Move only the data subdirectory: moving the whole extracted tree
+        # would nest the data one level too deep and leave nothing for the
+        # rmtree() below to remove.
+        shutil.move(os.path.join(extracted_dir, 'data'), data_dir)
+
+    print("Done, removing tarball and unneeded files...")
+    os.remove(dest_tar)
+    shutil.rmtree(extracted_dir)
+
+    print("Running initial_setup.py (index hmm models)...")
+    # Write a temp properties file in work dir, identical to the installed one
+    # except for data.directory which must point at the freshly fetched data.
+    # NOTE(review): presumably initial_setup.py picks this file up from the
+    # current directory -- confirm.
+    prop_file_src = os.path.join(ipr_dir, 'interproscan.properties')
+    with open(prop_file_src, 'r') as prop:
+        prop_content = prop.read()
+    # A callable replacement keeps any backslash in the path literal
+    prop_content = re.sub(r'^data\.directory=.*$', lambda _: 'data.directory=%s' % data_dir, prop_content, flags=re.M)
+    with open('interproscan.properties', 'w') as prop:
+        prop.write(prop_content)
+    # Run the index command; it inherits stdout/stderr, so its output shows
+    # up directly in the job logs.
+    sys.stdout.flush()
+    return_code = subprocess.call([os.path.join(ipr_dir, 'initial_setup.py')])
+    if return_code:
+        print("Error running initial_setup.py.", file=sys.stderr)
+        sys.exit(return_code)
+
+    table_entries.append(
+        dict(
+            value=tag,
+            description="InterProScan %s" % tag,
+            interproscan_version=tag,
+            path=output_directory,
+        )
+    )
+
+    print("Saving data table content...")
+
+    # Most recent version first
+    table_entries.sort(key=operator.itemgetter("value"), reverse=True)
+    with open(args.galaxy_datamanager_filename, "w") as fh:
+        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)
+
+    print("Finished.")
+
+
+# Run the download only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/interproscan.xml	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,69 @@
+<tool id="data_manager_interproscan" name="InterProScan data manager" version="0.0.1" tool_type="manage_data" profile="20.01">
+    <requirements>
+        <requirement type="package" version="5.52-86.0">interproscan</requirement>
+        <requirement type="package" version="2.26.0">requests</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/interproscan.py'
+$partial_data
+--version '$version'
+'interproscan'
+'${output_file}'
+    ]]></command>
+    <inputs>
+        <param name="partial_data" type="hidden" value="" help="Used for testing"/>
+        <param name="version" type="text" value="" label="Version to download" help="Leave empty to download the latest version">
+            <validator type="regex" message="Version must be a valid InterProScan version (e.g. 5.52-86.0)">^([0-9]+\.[0-9]+-[0-9]+\.[0-9]+)?$</validator>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="partial_data" value="--partial"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="InterProScan 5."/>
+                    <has_text text='"interproscan_version": "5.'/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="Pressed and indexed" />
+                <has_text text="Completed indexing the hmm models" />
+            </assert_stdout>
+        </test>
+        <test>
+            <param name="partial_data" value="--partial"/>
+            <param name="version" value="5.51-85.0"/>
+            <output name="output_file">
+                <assert_contents>
+                    <has_text text="InterProScan 5.51-85.0"/>
+                    <has_text text='"interproscan_version": "5.51-85.0'/>
+                </assert_contents>
+            </output>
+            <assert_stdout>
+                <has_text text="Pressed and indexed" />
+                <has_text text="Completed indexing the hmm models" />
+            </assert_stdout>
+        </test>
+        <test expect_failure="true">
+            <param name="partial_data" value="--partial"/>
+            <param name="version" value="xxxx"/>
+            <assert_stderr>
+                <has_text text="Version must be a valid InterProScan version" />
+            </assert_stderr>
+        </test>
+    </tests>
+    <help><![CDATA[
+        This data manager fetches data from the EBI FTP server for the InterProScan
+        annotation tool and updates the InterProScan data table.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1093/bioinformatics/btu031</citation>
+        <citation type="doi">10.7717/peerj.167</citation>
+        <citation type="doi">10.1093/bioinformatics/17.9.847</citation>
+        <citation type="doi">10.1093/nar/gki442</citation>
+        <citation type="doi">10.1093/nar/gkn785</citation>
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,19 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/interproscan.xml" id="data_manager_interproscan">
+        <data_table name="interproscan">
+            <output>
+                <!-- Column names must match the keys written by interproscan.py
+                     and the columns declared in tool_data_table_conf.xml -->
+                <column name="value" />
+                <column name="description" />
+                <column name="interproscan_version" />
+                <column name="path" output_ref="output_file" >
+                    <move type="directory" relativize_symlinks="True">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">interproscan/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/interproscan/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/interproscan.loc	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,8 @@
+# this is a tab separated file describing the location of interproscan databases used for the
+# interproscan annotation tool
+#
+# the columns are:
+# value	description	interproscan_version	path
+#
+# for example
+# 5.52-86.0	InterProScan 5.52-86.0	5.52-86.0	/tmp/database/interproscan/5.52-86.0/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/interproscan.loc.sample	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,8 @@
+# this is a tab separated file describing the location of interproscan databases used for the
+# interproscan annotation tool
+#
+# the columns are:
+# value	description	interproscan_version	path
+#
+# for example
+# 5.52-86.0	InterProScan 5.52-86.0	5.52-86.0	/tmp/database/interproscan/5.52-86.0/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,6 @@
+<tables>
+    <table name="interproscan" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, description, interproscan_version, path</columns>
+        <file path="tool-data/interproscan.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Mon Nov 15 17:20:14 2021 +0000
@@ -0,0 +1,6 @@
+<tables>
+    <table name="interproscan" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, description, interproscan_version, path</columns>
+        <file path="${__HERE__}/test-data/interproscan.loc" />
+    </table>
+</tables>