# HG changeset patch
# User wolma
# Date 1639679183 0
# Node ID 632d33df6758faf8cd7b9bea3d8f43efe616cff4
"planemo upload commit 3dc5291eccd1fb516be67694c18a27bda5f69f91"
diff -r 000000000000 -r 632d33df6758 data_manager/install_packaged_annotation_data.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/install_packaged_annotation_data.py Thu Dec 16 18:26:23 2021 +0000
@@ -0,0 +1,210 @@
+#!/usr/bin/env python
+"""Galaxy data manager: fetch and install packaged annotation data.
+
+Reads a package metadata YAML file, downloads the source of every record
+listed in it into the target directory, writes the completed metadata to
+meta.yml in that directory, and emits the data manager JSON describing the
+new rows of the "packaged_annotation_data" data table.
+"""
+
+import argparse
+import datetime
+import json
+import os
+import re
+
+from urllib.request import urlretrieve
+
+import yaml
+
+
+class PackagedAnnotationMeta():
+    """Validated metadata of one annotation data package.
+
+    The metadata dict is kept in ``self.meta``; besides the parsed content
+    it always holds 'build', 'volume' and a computed 'id'.
+    """
+
+    @classmethod
+    def from_file(cls, fname):
+        """Build an instance from the YAML metadata file *fname*."""
+        # Close the file deterministically instead of leaking the handle.
+        with open(fname) as fi:
+            return cls(yaml.safe_load(fi))
+
+    def __init__(self, meta_dict):
+        """Fill in defaults and validate *meta_dict* (modified in place).
+
+        'build' defaults to today's ISO date and 'volume' to 1.
+        Raises KeyError if a required package-level or record-level key
+        is missing or has a false value.
+        """
+        if 'build' not in meta_dict:
+            meta_dict['build'] = datetime.date.today().isoformat()
+        if 'volume' not in meta_dict:
+            meta_dict['volume'] = 1
+
+        required_meta = ['name', 'build', 'volume', 'refgenome', 'records']
+        for key in required_meta:
+            if not meta_dict.get(key):
+                raise KeyError(
+                    'Required info "{0}" missing from metadata'
+                    .format(key)
+                )
+        required_record_meta = ['id', 'name', 'version', 'format', 'source']
+        for key in required_record_meta:
+            for record in meta_dict['records']:
+                if not record.get(key):
+                    # {0} is the offending record, {1} the missing key.
+                    raise KeyError(
+                        '{0}\n'
+                        'Required info "{1}" missing from record metadata'
+                        .format(record, key)
+                    )
+        self.meta = meta_dict
+        self.meta['id'] = self._get_id()
+
+    def _get_id(self):
+        """Return the package id: name, refgenome, volume and build.
+
+        The parts are joined with '__'; in each part spaces become '_'
+        and characters other than letters, digits, '_' and '-' are removed.
+        """
+        components = [
+            self.meta['name'],
+            self.meta['refgenome'],
+            str(self.meta['volume']),
+            str(self.meta['build'])
+        ]
+        return '__'.join(
+            [
+                re.sub(r'[^a-zA-Z_0-9\-]', '', i.replace(' ', '_'))
+                for i in components
+            ]
+        )
+
+    def records(self, full_record_names=False):
+        """Yield a shallow copy of each record.
+
+        With *full_record_names* the 'name' of each copy is replaced by
+        the result of _full_record_name().
+        """
+        for record in self.meta['records']:
+            ret = record.copy()
+            if full_record_names:
+                ret['name'] = self._full_record_name(record)
+            yield ret
+
+    def fullname(self):
+        """Return the display name of the package."""
+        return '{0} ({1}, vol:{2}/build:{3})'.format(
+            self.meta['name'],
+            self.meta['refgenome'],
+            self.meta['volume'],
+            self.meta['build']
+        )
+
+    def _full_record_name(self, record):
+        """Return the display name of *record* within this package."""
+        # NOTE(review): 'build{5}' lacks the ':' that fullname() uses; kept
+        # because test-data/from_test-meta.data_manager.json pins this form.
+        return '{0} ({1}, {2}; from {3}/vol:{4}/build{5})'.format(
+            record['name'], record['version'],
+            self.meta['refgenome'],
+            self.meta['name'],
+            self.meta['volume'],
+            self.meta['build']
+        )
+
+    def dump(self, fname):
+        """Write the metadata as YAML to the file *fname*."""
+        with open(fname, 'w') as fo:
+            yaml.dump(
+                self.meta, fo, allow_unicode=False, default_flow_style=False
+            )
+
+
+def fetch_data(source_url, target_file):
+    """Download *source_url* and store it as *target_file*."""
+    urlretrieve(source_url, target_file)
+
+
+def install_data(data, target_directory):
+    # NOTE(review): never called in this file, and the body uses names that
+    # are not defined or imported here (fasta_filename, index_id, subprocess,
+    # sys, cmd_quote), so any call raises NameError. Looks like a leftover
+    # from another data manager - confirm and remove.
+    # TODO: allow multiple FASTA input files
+    fasta_base_name = os.path.split( fasta_filename )[-1]
+    sym_linked_fasta_filename = os.path.join( target_directory, fasta_base_name )
+    os.symlink( fasta_filename, sym_linked_fasta_filename )
+    args = ['bowtie2-build', sym_linked_fasta_filename, index_id]
+    proc = subprocess.Popen(args=args, shell=False, cwd=target_directory)
+    return_code = proc.wait()
+    if return_code:
+        print("Error building index.", file=sys.stderr)
+        sys.exit(return_code)
+    return [' '.join(cmd_quote(arg) for arg in args)]
+
+
+def meta_to_dm_records(meta, dbkey=None):
+    """Return one data table row (dict) per record of *meta*.
+
+    *dbkey* overrides the package's 'refgenome' as the row's dbkey; 'path'
+    is left empty here.
+    """
+    data_table_rows = []
+    for record in meta.records(full_record_names=True):
+        data_table_rows.append(
+            {
+                'value': '{0}:{1}'.format(meta.meta['id'], record['id']),
+                'dbkey': dbkey or meta.meta['refgenome'],
+                'data_name': record['name'],
+                'data_id': record['id'],
+                'data_format': record['format'],
+                'package_id': meta.meta['id'],
+                'package_name': meta.fullname(),
+                'path': ''
+            }
+        )
+    return data_table_rows
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('metadata')
+    parser.add_argument(
+        '-o', '--galaxy-datamanager-json',
+        required=True
+    )
+    parser.add_argument('-t', '--target-directory', default=None)
+    parser.add_argument('--dbkey', default=None)
+    args = parser.parse_args()
+
+    if args.target_directory:
+        # makedirs also creates missing parent directories.
+        os.makedirs(args.target_directory, exist_ok=True)
+    else:
+        args.target_directory = os.getcwd()
+
+    meta = PackagedAnnotationMeta.from_file(args.metadata)
+
+    # Each record is downloaded to a file named after its id.
+    for record in meta.records():
+        fetch_data(
+            record['source'],
+            os.path.join(args.target_directory, record['id'])
+        )
+
+    meta.dump(os.path.join(args.target_directory, 'meta.yml'))
+
+    # Finally, we prepare the metadata for the new data table record ...
+    data_manager_dict = {
+        'data_tables': {
+            'packaged_annotation_data': meta_to_dm_records(meta, args.dbkey)
+        }
+    }
+
+    # ... and save it to the json results file
+    with open(args.galaxy_datamanager_json, 'w') as fh:
+        json.dump(data_manager_dict, fh, sort_keys=True)
diff -r 000000000000 -r 632d33df6758 data_manager/install_packaged_annotation_data.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/install_packaged_annotation_data.xml Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,39 @@ + + fetching + + python + pyyaml + + + + + + + + + + + + + + + + + + +**What it does** + +This tool fetches and installs packages of genome annotation datasets that are +not tightly bound to specific tools, but generic enough to be of use for many different tools. + +It populates the "packaged_annotation_data" data table.
+ + diff -r 000000000000 -r 632d33df6758 data_manager_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + packaged_annotation_data/${dbkey}/${package_id}/${path} + + ${GALAXY_DATA_MANAGER_DATA_PATH}/packaged_annotation_data/${dbkey}/${package_id}/${path}/ + abspath + + + + + + + diff -r 000000000000 -r 632d33df6758 test-data/dbkeys.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dbkeys.loc Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,2 @@ +# +hg19 Human hg19 a_path diff -r 000000000000 -r 632d33df6758 test-data/from_test-meta.data_manager.json --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/from_test-meta.data_manager.json Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"packaged_annotation_data": [{"data_format": "bed", "data_id": "hotspots.data", "data_name": "CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build2021-12-16)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__2021-12-16", "package_name": "Cancer variant data (hg19, vol:1/build:2021-12-16)", "path": "", "value": "Cancer_variant_data__hg19__1__2021-12-16:hotspots.data"}, {"data_format": "bed", "data_id": "civic.variants", "data_name": "CIViC variants (01-Feb-2019, hg19; from Cancer variant data/vol:1/build2021-12-16)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__2021-12-16", "package_name": "Cancer variant data (hg19, vol:1/build:2021-12-16)", "path": "", "value": "Cancer_variant_data__hg19__1__2021-12-16:civic.variants"}]}} \ No newline at end of file diff -r 000000000000 -r 632d33df6758 test-data/packaged_annotation_data.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/packaged_annotation_data.loc Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,3 @@ +# +# + diff -r 000000000000 -r 632d33df6758 test-data/test-meta.yml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-meta.yml Thu Dec 
16 18:26:23 2021 +0000 @@ -0,0 +1,17 @@ +name: Cancer variant data +refgenome: hg19 +records: + - id: hotspots.data + name: CancerHotspots + version: v2 + doi: 10.1158/2159-8290.CD-17-0321 + format: bed + source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/hotspots.bed + checksum: md5:ec8ec9afd4ae4935ac474e150e4e90aa + - id: civic.variants + name: CIViC variants + version: 01-Feb-2019 + doi: http://dx.doi.org/10.1038/ng.3774 + format: bed + source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/01-Feb-2019-CIVic.bed + checksum: md5:9e42bb7492be9e0011bf29b7e4f83f41 diff -r 000000000000 -r 632d33df6758 tool-data/dbkeys.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dbkeys.loc.sample Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,1 @@ +# diff -r 000000000000 -r 632d33df6758 tool-data/packaged_annotation_data.loc.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/packaged_annotation_data.loc.sample Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,20 @@ +#This file describes genome annotation data packages and their contents +#available on the server. +#Such data can consist of any number of individual files in a variety of +#formats (e.g., bed, vcf, tabular) describing any features with respect to the +#genome with the associated dbkey. +#The directory referenced in the <path> column of the table is expected to +#contain the file listed under <data_id> and a meta.yml file with details about +#the annotation package volume and all of its contents.
+#This data table has the format (white space characters are TAB characters): +# +#<value> <dbkey> <data_name> <data_id> <data_format> <package_id> <package_name> <path> +# +#So, packaged_annotation_data.loc tables could look like this: +# +#dbSNP__hg19__1__1:dbSNP.tidy hg19 dbSNP tidy (b147.20160601, hg19; from dbSNP/vol:1/build:1) dbSNP.tidy vcf_bgzip dbSNP__hg19__1__1 dbSNP (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/dbSNP/1/1 +#Cancer_variant_data__hg19__1__1:hotspots.data hg19 CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build:1) hotspots.data bed Cancer_variant_data__hg19__1__1 Cancer variant data (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/Cancer_variant_data/1/1 +#Cancer_gene_data__hg19__1__1:civic.genes hg19 CIViC genes (01-Feb-2019, hg19; from Cancer gene data/vol:1/build:1) civic.genes tabular Cancer_gene_data__hg19__1__1 Cancer gene data (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/Cancer_variant_data/1/1 +#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv3 NC_045512.2 ARTIC (v3, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1) ARTICv3 bed6 SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1 SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1) /path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1 +#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv4 NC_045512.2 ARTIC (v4, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1) ARTICv4 bed6 SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1 SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1) /path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1 +# diff -r 000000000000 -r 632d33df6758 tool_data_table_conf.xml.sample --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,12 @@ + + + + value, dbkey, data_name, data_id, data_format, package_id, package_name, path + +
+ + + value, name, len_path + +
+
diff -r 000000000000 -r 632d33df6758 tool_data_table_conf.xml.test --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Thu Dec 16 18:26:23 2021 +0000 @@ -0,0 +1,12 @@ + + + + value, dbkey, data_name, data_id, data_format, package_id, package_name, path + +
+ + + value, name, len_path + +
+