Mercurial > repos > wolma > data_manager_packaged_annotation_data

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/install_packaged_annotation_data.py	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+import argparse
+import datetime
+import json
+import os
+import re
+
+from urllib.request import urlretrieve
+
+import yaml
+
+
+class PackagedAnnotationMeta():
+    """Validated metadata (kept in `self.meta`) of one annotation package."""
+
+    @classmethod
+    def from_file(cls, fname):
+        """Load the YAML metadata file `fname` and wrap its content."""
+        with open(fname) as fi:  # close the handle instead of leaking it
+            return cls(yaml.safe_load(fi))
+
+    def __init__(self, meta_dict):
+        """Validate and store `meta_dict`, which is modified in place."""
+        # Raises KeyError for any missing or empty required value.
+        if 'build' not in meta_dict:
+            meta_dict['build'] = datetime.date.today().isoformat()
+        if 'volume' not in meta_dict:
+            meta_dict['volume'] = 1
+        for key in ['name', 'build', 'volume', 'refgenome', 'records']:
+            if not meta_dict.get(key):
+                raise KeyError(
+                    'Required info "{0}" missing from metadata'.format(key)
+                )
+        for key in ['id', 'name', 'version', 'format', 'source']:
+            for record in meta_dict['records']:
+                if not record.get(key):
+                    # {0} shows the offending record, {1} the missing key
+                    raise KeyError(
+                        '{0}\n'
+                        'Required info "{1}" missing from record metadata'
+                        .format(record, key)
+                    )
+        self.meta = meta_dict
+        self.meta['id'] = self._get_id()
+
+    def _get_id(self):
+        """Return '__'-joined name, refgenome, volume and build, sanitized."""
+        components = [
+            self.meta['name'], self.meta['refgenome'],
+            str(self.meta['volume']), str(self.meta['build'])
+        ]
+        return '__'.join(
+            re.sub(r'[^a-zA-Z_0-9\-]', '', i.replace(' ', '_'))
+            for i in components
+        )
+
+    def records(self, full_record_names=False):
+        """Yield a copy of each record, optionally with its full name."""
+        for record in self.meta['records']:
+            ret = record.copy()
+            if full_record_names:
+                ret['name'] = self._full_record_name(record)
+            yield ret
+
+    def fullname(self):
+        """Return the human-readable name of the whole package."""
+        return '{0} ({1}, vol:{2}/build:{3})'.format(
+            self.meta['name'], self.meta['refgenome'],
+            self.meta['volume'], self.meta['build']
+        )
+
+    def _full_record_name(self, record):
+        """Return the human-readable name of `record` in this package."""
+        # NOTE(review): 'build{5}' lacks the ':' used by fullname(); left
+        # unchanged because the recorded test output expects this text.
+        return '{0} ({1}, {2}; from {3}/vol:{4}/build{5})'.format(
+            record['name'], record['version'], self.meta['refgenome'],
+            self.meta['name'], self.meta['volume'], self.meta['build']
+        )
+
+    def dump(self, fname):
+        """Write the metadata (with defaults and 'id') to `fname` as YAML."""
+        with open(fname, 'w') as fo:
+            yaml.dump(
+                self.meta, fo, allow_unicode=False, default_flow_style=False
+            )
+
+
+def fetch_data(source_url, target_file):
+    """Download `source_url` and save the content as `target_file`."""
+    urlretrieve(source_url, target_file)
+
+
+def install_data(data, target_directory):
+    """Unused placeholder; always raises NotImplementedError.
+
+    The previous body built a bowtie2 index using names that are not
+    defined in this module (fasta_filename, index_id, subprocess, sys,
+    cmd_quote), so any call failed with a NameError.
+    """
+    raise NotImplementedError(
+        'install_data() is not implemented for packaged annotation data'
+    )
+
+
+def meta_to_dm_records(meta, dbkey=None):
+    """Return one data table row (a dict) per record of `meta`.
+
+    A truthy `dbkey` takes the place of the package's refgenome.
+    """
+    return [
+        dict(
+            value='{0}:{1}'.format(meta.meta['id'], entry['id']),
+            dbkey=dbkey or meta.meta['refgenome'],
+            data_name=entry['name'], data_id=entry['id'],
+            data_format=entry['format'],
+            package_id=meta.meta['id'],
+            package_name=meta.fullname(), path=''
+        )
+        for entry in meta.records(full_record_names=True)
+    ]
+
+
+if __name__ == "__main__":
+    # Fetch every record of the metadata file, write the data manager JSON.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('metadata')
+    parser.add_argument(
+        '-o', '--galaxy-datamanager-json',
+        required=True
+    )
+    parser.add_argument('-t', '--target-directory', default=None)
+    parser.add_argument('--dbkey', default=None)
+    args = parser.parse_args()
+
+    if args.target_directory:
+        # also creates missing parent directories; fine if it exists already
+        os.makedirs(args.target_directory, exist_ok=True)
+    else:
+        args.target_directory = os.getcwd()
+
+    meta = PackagedAnnotationMeta.from_file(args.metadata)
+
+    for record in meta.records():
+        fetch_data(
+            record['source'],
+            os.path.join(args.target_directory, record['id'])
+        )
+
+    meta.dump(os.path.join(args.target_directory, 'meta.yml'))
+
+    # Finally, we prepare the metadata for the new data table record ...
+    data_manager_dict = {
+        'data_tables': {
+            'packaged_annotation_data': meta_to_dm_records(meta, args.dbkey)
+        }
+    }
+
+    # ... and save it to the json results file
+    with open(args.galaxy_datamanager_json, 'w') as fh:
+        json.dump(data_manager_dict, fh, sort_keys=True)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/install_packaged_annotation_data.xml	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,39 @@
+<tool id="data_manager_packaged_annotation_data" name="Download and install packaged collections of genome annotation data" version="0.0.1" tool_type="manage_data" profile="19.01">
+    <description>fetching</description>
+    <requirements>
+        <requirement type="package" version="3.7">python</requirement>
+        <requirement type="package" version="5.1.1">pyyaml</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/install_packaged_annotation_data.py'
+  --target-directory '${out_file.extra_files_path}'
+  -o '$out_file'
+  --dbkey '$dbkey'
+  '$metadata'
+    ]]></command>
+    <inputs>
+        <param name="dbkey" type="genomebuild"
+        label="DBKEY of genome that the annotation data is for"
+        help="Take a look at the refgenome value from the metadata file to guide you in your selection." />
+        <param name="metadata" type="data" format="txt" label="Metadata describing the package and its contents" />
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <tests>
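+        <!-- TODO: need some way to test that new entry was added to data table. NOTE: test-meta.yml sets no "build", so the script inserts today's date, while the expected output hard-codes 2021-12-16. -->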
+        <test>
+            <param name="dbkey" value="hg19"/>
+            <param name="metadata" value="test-meta.yml"/>
+            <output name="out_file" file="from_test-meta.data_manager.json"/>
+        </test>
+    </tests>
+    <help>
+**What it does**
+
+This tool fetches and installs packages of genome annotation datasets that are
+not tightly bound to specific tools, but generic enough to be of use for many different tools.
+
+It populates the "packaged_annotation_data" data table.
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,25 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/install_packaged_annotation_data.xml" id="data_manager_packaged_annotation_data" >
+        <data_table name="packaged_annotation_data">  <!-- Defines a Data Table to be modified. -->
+            <output> <!-- Handle the output of the Data Manager Tool -->
+                <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="data_name" /> <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="data_id" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="data_format" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="package_id" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="package_name" />  <!-- columns that are going to be specified by the Data Manager Tool -->
+                <column name="path" output_ref="out_file" >
+                    <move type="directory" relativize_symlinks="True">
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">packaged_annotation_data/${dbkey}/${package_id}/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/packaged_annotation_data/${dbkey}/${package_id}/${path}/</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dbkeys.loc	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,2 @@
+#<dbkey>		<display_name>	<len_file_path>
+hg19	Human hg19	a_path
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/from_test-meta.data_manager.json	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,1 @@
+{"data_tables": {"packaged_annotation_data": [{"data_format": "bed", "data_id": "hotspots.data", "data_name": "CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build2021-12-16)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__2021-12-16", "package_name": "Cancer variant data (hg19, vol:1/build:2021-12-16)", "path": "", "value": "Cancer_variant_data__hg19__1__2021-12-16:hotspots.data"}, {"data_format": "bed", "data_id": "civic.variants", "data_name": "CIViC variants (01-Feb-2019, hg19; from Cancer variant data/vol:1/build2021-12-16)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__2021-12-16", "package_name": "Cancer variant data (hg19, vol:1/build:2021-12-16)", "path": "", "value": "Cancer_variant_data__hg19__1__2021-12-16:civic.variants"}]}}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/packaged_annotation_data.loc	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,3 @@
+#<value>		<dbkey>			<data_name>				<data_id>		<data_format>	<package_id>			<package_name>		<path>
+#
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test-meta.yml	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,17 @@
+name: Cancer variant data
+refgenome: hg19
+records:
+  - id: hotspots.data
+    name: CancerHotspots
+    version: v2
+    doi: 10.1158/2159-8290.CD-17-0321
+    format: bed
+    source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/hotspots.bed
+    checksum: md5:ec8ec9afd4ae4935ac474e150e4e90aa
+  - id: civic.variants
+    name: CIViC variants
+    version: 01-Feb-2019
+    doi: http://dx.doi.org/10.1038/ng.3774
+    format: bed
+    source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/01-Feb-2019-CIVic.bed
+    checksum: md5:9e42bb7492be9e0011bf29b7e4f83f41
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/dbkeys.loc.sample	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,1 @@
+#<dbkey>		<display_name>	<len_file_path>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/packaged_annotation_data.loc.sample	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,20 @@
+#This file describes genome annotation data packages and their contents
+#available on the server.
+#Such data can consist of any number of individual files in a variety of
+#formats (e.g., bed, vcf, tabular) describing any features with respect to the
+#genome with the associated dbkey.
+#The directory referenced in the <path> column of the table is expected to
+#contain the file listed under <data_id> and a meta.yml file with details about
+#the annotation package volume and all of its contents.
+#This data table has the format (white space characters are TAB characters):
+#
+#<value>														<dbkey>			<data_name>																	<data_id>		<data_format>	<package_id>								<package_name>					<path>
+#
+#So, packaged_annotation_data.loc tables could look like this:
+#
+#dbSNP__hg19__1__1:dbSNP.tidy									hg19			dbSNP tidy (b147.20160601, hg19; from dbSNP/vol:1/build:1)					dbSNP.tidy		vcf_bgzip		dbSNP__hg19__1__1					dbSNP (hg19, vol:1/build:1)							/path/to/packaged_annotation_data/hg19/dbSNP/1/1
+#Cancer_variant_data__hg19__1__1:hotspots.data						hg19			CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build:1)				hotspots.data	bed				Cancer_variant_data__hg19__1__1		Cancer variant data (hg19, vol:1/build:1)			/path/to/packaged_annotation_data/hg19/Cancer_variant_data/1/1
+#Cancer_gene_data__hg19__1__1:civic.genes							hg19			CIViC genes (01-Feb-2019, hg19; from Cancer gene data/vol:1/build:1)			civic.genes		tabular			Cancer_gene_data__hg19__1__1		Cancer gene data (hg19, vol:1/build:1)					/path/to/packaged_annotation_data/hg19/Cancer_gene_data/1/1
+#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv3		NC_045512.2		ARTIC (v3, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1)	ARTICv3			bed6			SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1	SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1)	/path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1
+#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv4		NC_045512.2		ARTIC (v4, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1)	ARTICv4			bed6			SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1	SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1)	/path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1
+#
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Table of installed structured annotation data -->
+    <table name="packaged_annotation_data" comment_char="#">
+        <columns>value, dbkey, data_name, data_id, data_format, package_id, package_name, path</columns>
+        <file path="tool-data/packaged_annotation_data.loc" />
+    </table>
+    <!-- Locations of dbkeys and len files under genome directory -->
+    <table name="__dbkeys__" comment_char="#">
+        <columns>value, name, len_path</columns>
+        <file path="tool-data/dbkeys.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Table of installed structured annotation data -->
+    <table name="packaged_annotation_data" comment_char="#">
+        <columns>value, dbkey, data_name, data_id, data_format, package_id, package_name, path</columns>
+        <file path="${__HERE__}/test-data/packaged_annotation_data.loc" />
+    </table>
+    <!-- Locations of dbkeys and len files under genome directory -->
+    <table name="__dbkeys__" comment_char="#">
+        <columns>value, name, len_path</columns>
+        <file path="${__HERE__}/test-data/dbkeys.loc" />
+    </table>
+</tables>