Mercurial > repos > matthias > data_manager_dada2
changeset 0:419037fe1150 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/master/data_managers/data_manager_dada2 commit 42eb67646e47bef13eed672ff6b9d06b1d82ae3d-dirty
author | matthias |
---|---|
date | Thu, 07 Mar 2019 09:33:43 -0500 |
parents | |
children | 1c50cfb0c0ab |
files | data_manager/.dada_fetcher.xml.swp data_manager/.data_manager.py.swp data_manager/.megan_tools_fetcher.xml.swp data_manager/dada_fetcher.xml data_manager/data_manager.py data_manager_conf.xml test-data/SSURef_Nr99_132_tax_silva_to_NCBI_synonyms_json tool-data/.dada2_species.loc.sample.swp tool-data/.dada2_taxonomy.loc.sample.swp tool-data/dada2_species.loc.sample tool-data/dada2_taxonomy.loc.sample tool_data_table_conf.xml.sample |
diffstat | 11 files changed, 208 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>
<!--
    data_manager/dada_fetcher.xml

    Galaxy data manager tool: lets an admin pick one of the reference data
    sets known to data_manager.py and runs that script to download it.
-->
<tool id="dada_fetcher" name="dada2 dada manager" tool_type="manage_data" version="0.0.1">
    <description>Download reference data sets</description>
    <command detect_errors="exit_code">
    <![CDATA[
        python '$__tool_directory__/data_manager.py'
        --out '${out_file}'
        --dataset '$database_name'
    ]]>
    </command>
    <inputs>
        <!-- The option values must match the keys of FILE2NAME / FILE2TAXURL in data_manager.py. -->
        <param name="database_name" type="select" label="mapping data">
            <option value="silva132">Silva version 132</option>
            <option value="silva128">Silva version 128</option>
            <option value="rdp16">RDP trainset 16 + RDP database release 11.5</option>
            <option value="rdp14">RDP trainset 14</option>
            <option value="gg13.84">GreenGenes version 13.8</option>
<!--        <option value="unite8.0">UNITE: General Fasta release 8.0 </option>
            <option value="RefSeq_RDP">NCBI RefSeq 16S rRNA database supplemented by RDP</option>
            <option value="gtdb">GTDB: Genome Taxonomy Database (More info: http://gtdb.ecogenomic.org/)</option>
            <option value="hitdb1">HitDB version 1 (Human InTestinal 16S rRNA)</option>
            <option value="silva132_euk">Silva Eukaryotic 18S, v132 & v128</option>
            <option value="PR2v4.11.0">Protist Ribosomal Reference database 2 version 4.11.0</option>-->
        </param>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <tests>
        <!-- NOTE(review): no test-data/silva132_json is listed among the files of this
             changeset; confirm that the expected output file exists. -->
        <test>
            <param name="database_name" value="silva132"/>
            <output name="out_file" file="silva132_json"/>
        </test>
    </tests>
    <help>
http://www.arb-silva.de/silva-license-information
    </help>
</tool>
"""Galaxy data manager script that downloads dada2 reference data sets.

The script reads the JSON parameter file that Galaxy hands to a data manager
tool, downloads the taxonomy training set (and, where one is known, the
species assignment set) of the requested data set into the output's
``extra_files_path`` and then overwrites the JSON file with the data table
entries describing the downloaded files.
"""
import argparse
import json
import os
import shutil
import sys
import zipfile
try:
    # For Python 3.0 and later
    from urllib.request import Request, urlopen
except ImportError:
    # Fall back to Python 2 imports
    from urllib2 import Request, urlopen

# Taxonomy levels recorded for data sets without an entry in FILE2TAXLEVELS.
DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species"

# Data set key -> display name written to the data tables.
FILE2NAME = {
    "silva132": "Silva version 132",
    "silva128": "Silva version 128",
    "rdp16": "RDP trainset 16",
    "rdp14": "RDP trainset 14",
    "gg13.84": "GreenGenes version 13.8",
}

# Data set key -> URL of the taxonomy training set.
FILE2TAXURL = {
    "silva132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
    "silva128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
    "rdp16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
    "rdp14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
    "gg13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
}

# Data set key -> URL of the species assignment set (not every data set has one).
FILE2SPECIESURL = {
    "silva132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
    "silva128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
    "rdp16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
    "rdp14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1",
}

# Data set key -> comma separated taxonomy levels, for data sets that deviate
# from DEFAULT_TAXLEVELS (currently none).
FILE2TAXLEVELS = {
}


def url_download(url, fname, workdir):
    """
    download url to workdir/fname

    workdir is created (including missing parents) when it does not exist.

    return the path to the resulting file
    """
    file_path = os.path.join(workdir, fname)
    if not os.path.exists(workdir):
        os.makedirs(workdir)
    src = urlopen(Request(url))
    try:
        with open(file_path, 'wb') as dst:
            # Stream in 64 KiB chunks so large references never sit in memory.
            shutil.copyfileobj(src, dst, 64 * 1024)
    finally:
        # The Python 2 response object is no context manager: close explicitly.
        src.close()
    return file_path


def main(dataset, outjson):
    """
    download the reference files of dataset and write the data manager JSON

    dataset: key of FILE2NAME / FILE2TAXURL naming the data set to fetch
    outjson: path of the JSON file written by Galaxy; it is read for the
             extra_files_path of the first output and then overwritten with
             the resulting data table entries

    The 'path' column of each entry holds the file name relative to the
    extra_files_path, which is where the files are downloaded to.

    raises KeyError if dataset is not a known data set
    """
    # Look the data set up first so an unknown key fails before any I/O.
    taxonomy_url = FILE2TAXURL[dataset]
    display_name = FILE2NAME[dataset]
    species_url = FILE2SPECIESURL.get(dataset)

    with open(outjson) as handle:
        params = json.load(handle)
    target_directory = params['output_data'][0]['extra_files_path']

    data_manager_json = {"data_tables": {}}

    taxonomy_file = dataset + ".taxonomy"
    url_download(taxonomy_url, taxonomy_file, target_directory)
    data_manager_json["data_tables"]["dada2_taxonomy"] = {
        'value': dataset,
        'name': display_name,
        'path': taxonomy_file,
        'taxlevels': FILE2TAXLEVELS.get(dataset, DEFAULT_TAXLEVELS),
    }

    # Only some data sets ship a species assignment file.
    if species_url:
        species_file = dataset + ".species"
        url_download(species_url, species_file, target_directory)
        data_manager_json["data_tables"]["dada2_species"] = {
            'value': dataset,
            'name': display_name,
            'path': species_file,
        }

    with open(outjson, 'w') as handle:
        handle.write(json.dumps(data_manager_json))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Create data manager json.')
    parser.add_argument('--out', action='store', required=True,
                        help='JSON filename')
    # '--file' is accepted as an alias because the tool wrapper has used it.
    parser.add_argument('--dataset', '--file', dest='dataset', action='store',
                        required=True, choices=sorted(FILE2NAME),
                        help='Download data set name')
    args = parser.parse_args()

    main(args.dataset, args.out)
<?xml version="1.0"?>
<!--
    data_manager_conf.xml

    Declares the dada2 data manager: which tool produces the data manager
    JSON and how the entries of that JSON are turned into rows of the
    dada2_taxonomy and dada2_species data tables.
-->
<data_managers>
    <!-- tool_file must name the tool wrapper shipped in this repository. -->
    <data_manager tool_file="data_manager/dada_fetcher.xml" id="dada2_fetcher" version="0.0.1">
        <data_table name="dada2_taxonomy">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="out_file">
                    <!-- Move the downloaded file below the data manager data path ... -->
                    <move type="file" relativize_symlinks="True">
                        <source>${path}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
                    </move>
                    <!-- ... and record its final absolute location in the table. -->
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
                <column name="taxlevels" />
            </output>
        </data_table>
        <data_table name="dada2_species">
            <output>
                <column name="value" />
                <column name="name" />
                <column name="path" output_ref="out_file">
                    <!-- Same move / translation as for the taxonomy table. -->
                    <move type="file" relativize_symlinks="True">
                        <source>${path}</source>
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">dada2/${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/dada2/${path}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dada2_species.loc.sample Thu Mar 07 09:33:43 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for species assignment, using three +# tab separated columns: +# +# <unique_build_id> <display_name> <fasta_file_path> +# +# Datasets can be retrieved from http://busco.ezlab.org/frame_wget.html +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dada2_taxonomy.loc.sample Thu Mar 07 09:33:43 2019 -0500 @@ -0,0 +1,9 @@ +# This is a sample file distributed with Galaxy that is used to define a +# list of dada2 reference data sets for taxonomy assignment, using four +# tab separated columns: +# +# <unique_build_id> <display_name> <fasta_file_path> <taxlevels> +# +# Datasets can be retrieved from https://benjjneb.github.io/dada2/training.html +# +# taxlevels is a comma separated list of taxonomy levels
<?xml version="1.0"?>
<!--
    tool_data_table_conf.xml.sample

    Sample declaration of the two data tables used for dada2 reference data.
    Each table is backed by a .loc file in which lines starting with '#' are
    comments.
-->
<tables>
    <!-- Reference data for species assignment. -->
    <table name="dada2_species" comment_char="#">
        <columns>value, name, path</columns>
        <file path="tool-data/dada2_species.loc" />
    </table>

    <!-- Reference data for taxonomy assignment; additionally carries the taxonomy levels. -->
    <table name="dada2_taxonomy" comment_char="#">
        <columns>value, name, path, taxlevels</columns>
        <file path="tool-data/dada2_taxonomy.loc" />
    </table>
</tables>