Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq
changeset 19:d118e256faca draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit 120c6491f4b0888220e432693a9805d8198d7397"
author | sanbi-uwc |
---|---|
date | Thu, 16 Apr 2020 10:19:57 +0000 |
parents | 75c1817c2ecf |
children | |
files | data_manager/fetch_artic_primers.py data_manager/fetch_artic_primers.xml data_manager/fetch_refseq.py data_manager/fetch_refseq.xml data_manager_conf.xml test-data/artic.json test-data/plastid.json tool-data/artic_primers.loc.sample tool_data_table_conf.xml.sample |
diffstat | 9 files changed, 119 insertions(+), 288 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# data_manager/fetch_artic_primers.py
"""Galaxy data manager script that downloads ARTIC nCoV-2019 primer schemes.

For every requested primer set the BED file is downloaded from the
artic-network/artic-ncov2019 GitHub repository, saved in the output directory
and registered as a row of the ``artic_primers`` data table inside the Galaxy
data manager JSON file.
"""

from __future__ import print_function, division

import argparse
import json
import os
import os.path
import sys

DATA_TABLE_NAME = 'artic_primers'

# Known primer sets and the URL of the BED file that describes each of them.
PRIMER_SETS = {
    'ARTICv1': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed',
    'ARTICv2': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed',
    'ARTICv3': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed',
}

# Seconds to wait for the server before a download is abandoned.
DOWNLOAD_TIMEOUT = 60


def resolve_primers(primers=None):
    """Return the validated list of primer set names to fetch.

    :param primers: ``None`` (meaning every known primer set), a comma
        separated string of names, or an iterable of names.
    :returns: list of names, in the order given, without duplicates.
    :raises ValueError: if a name is not a key of ``PRIMER_SETS`` or if no
        name at all was given.
    """
    if primers is None:
        return sorted(PRIMER_SETS)
    if isinstance(primers, str):
        primers = primers.split(',')
    names = []
    for name in primers:
        name = name.strip()
        if name and name not in names:
            names.append(name)
    unknown = [name for name in names if name not in PRIMER_SETS]
    if unknown:
        raise ValueError('Unknown primer set(s): {}; valid choices are: {}'.format(
            ', '.join(unknown), ', '.join(sorted(PRIMER_SETS))))
    if not names:
        raise ValueError('No primer sets requested')
    return names


def fetch_artic_primers(output_filename, output_directory, primers=None):
    """Download the requested primer sets and record them in the Galaxy JSON.

    :param output_filename: path of the Galaxy data manager JSON file; it is
        read, extended with the new ``artic_primers`` rows and written back.
    :param output_directory: directory that receives the ``<name>.bed`` files
        (created if it does not exist).
    :param primers: primer set names to fetch, see :func:`resolve_primers`.
    :raises ValueError: if an unknown primer set is requested (checked before
        anything is downloaded or written).

    The process exits with status 1 if a download does not return HTTP 200.
    """
    names = resolve_primers(primers)

    # Imported here rather than at module level so that argument parsing and
    # validation work even where this third-party dependency is unavailable.
    import requests

    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
    with open(output_filename) as handle:
        data_manager_dict = json.load(handle)
    table = data_manager_dict.setdefault('data_tables', {}).setdefault(DATA_TABLE_NAME, [])

    for name in names:
        url = PRIMER_SETS[name]
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT)
        if response.status_code != 200:
            print('Error: download of', url, 'failed with code', response.status_code, file=sys.stderr)
            # An HTTP status is not a usable exit status: values above 255 are
            # truncated by the operating system (512 would read as success).
            sys.exit(1)
        bed_output_filename = os.path.join(output_directory, name + '.bed')
        with open(bed_output_filename, 'w') as bed_file:
            bed_file.write(response.text)
        # 'ARTICv1' -> 'ARTIC v1 primer set'
        description = name[:-2] + ' ' + name[-2:] + ' primer set'
        table.append(dict(value=name, path=bed_output_filename, description=description))

    print(data_manager_dict)
    with open(output_filename, 'w') as handle:
        json.dump(data_manager_dict, handle)


class SplitArgs(argparse.Action):
    """argparse action that stores a comma separated option value as a list."""

    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, values.split(','))


def build_parser():
    """Return the command line parser of this script."""
    parser = argparse.ArgumentParser(description='Fetch ARTIC primer files for Galaxy use')
    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
    # argparse does not run the action on a default value, so the default has
    # to be given as the list the action would have produced.
    parser.add_argument('--primers', default=sorted(PRIMER_SETS), action=SplitArgs,
                        help='Comma separated list of primers to fetch')
    return parser


if __name__ == '__main__':
    args = build_parser().parse_args()
    fetch_artic_primers(args.galaxy_datamanager_filename, args.output_directory, args.primers)
<!-- data_manager/fetch_artic_primers.xml -->
<tool id="fetch_artic_primers" name="ARTIC primer data manager" version="0.0.1" tool_type="manage_data" profile="19.05">
    <requirements>
        <requirement type="package">python</requirement>
        <requirement type="package" version="2.22.0">requests</requirement>
    </requirements>
    <!-- fetch all the selected primers in one go; the selection reaches the
         script as a comma separated list -->
    <command detect_errors="exit_code"><![CDATA[
        python '$__tool_directory__/fetch_artic_primers.py'
            --galaxy_datamanager_filename '${output_file}'
            --primers '${primers}'
    ]]></command>
    <inputs>
        <!-- optional="false": at least one primer set has to be selected -->
        <param name="primers" type="select" multiple="true" optional="false" label="SARS-CoV-2 Primers to fetch">
            <option value="ARTICv1" selected="true">ARTIC v1</option>
            <option value="ARTICv2" selected="true">ARTIC v2</option>
            <option value="ARTICv3" selected="true">ARTIC v3</option>
        </param>
    </inputs>
    <outputs>
        <data name="output_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <test>
            <param name="primers" value="ARTICv1,ARTICv2,ARTICv3"/>
            <output name="output_file">
                <assert_contents>
                    <has_text text="ARTIC"/>
                </assert_contents>
            </output>
        </test>
    </tests>
</tool>
#!/usr/bin/env python
# data_manager/fetch_refseq.py (the changeset shown on this page removes this file)
"""Galaxy data manager script that fetches FASTA data from NCBI RefSeq.

For each requested division the matching ``*.gz`` FASTA parts are downloaded
from the current RefSeq release and concatenated, one output file per molecule
type, by a helper process. When run by Galaxy the resulting files are
registered in the ``all_fasta`` data table.
"""

from __future__ import division, print_function

import argparse
import functools
import gzip
import json
import os
import os.path
import sys
from datetime import date
from multiprocessing import Process, Queue

try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO

# Refseq structure
# - Release number
# - Divisions
#   1. archaea (offered by this tool under the name "archea")
#   2. bacteria
#   3. fungi
#   4. invertebrate
#   5. mitochondrion
#   6. other
#   7. plant
#   8. plasmid
#   9. plastid
#  10. protozoa
#  11. vertebrate mammalian
#  12. vertebrate other
#  13. viral
# within each division
# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
# where fna and faa are FASTA, gbff and gpff are Genbank

BASE_URL = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'

VALID_DIVISIONS = frozenset([
    'archaea', 'archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
    'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral',
])

# Divisions whose name in this tool differs from the directory on the server.
# 'archea' is kept as an accepted name so that existing callers keep working.
DIVISION_DIRECTORIES = {'archea': 'archaea'}

# Filename ending of the FASTA parts of each molecule type.
ENDING_MAPPINGS = {
    'genomic': '.genomic.fna.gz',
    'protein': '.protein.faa.gz',
    'rna': '.rna.fna.gz',
}


def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
    """Append ``data_table_entry`` to the table ``data_table_name``.

    The ``data_tables`` mapping and the table itself are created on first use.
    ``data_manager_dict`` is modified in place and also returned.
    """
    tables = data_manager_dict.setdefault('data_tables', {})
    tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict


class _FastaLengthWriter(object):
    """Write ``record_id<TAB>length`` lines for FASTA data supplied in chunks.

    Chunks are arbitrary byte strings, so a line may be split between two
    chunks; the unfinished tail of each chunk is kept until its end arrives.
    """

    def __init__(self, handle):
        self._handle = handle  # text mode file receiving the length lines
        self._partial = b''  # bytes of a line whose end has not been seen yet
        self._record_id = None
        self._record_len = 0

    def feed(self, data):
        """Process the next chunk of FASTA bytes."""
        lines = (self._partial + data).split(b'\n')
        self._partial = lines.pop()
        for line in lines:
            self._consume(line)

    def finish(self):
        """Flush the record in progress; call at the end of each input file."""
        if self._partial:
            self._consume(self._partial)
            self._partial = b''
        self._flush_record()

    def _consume(self, line):
        if line.startswith(b'>'):
            self._flush_record()
            header = line[1:].split()
            self._record_id = header[0].decode('utf-8') if header else ''
        elif line.strip():
            if self._record_id is None:
                raise ValueError('FASTA data found before FASTA record ID known, data: {!r}'.format(line))
            self._record_len += len(line.strip())

    def _flush_record(self):
        if self._record_id is not None:
            self._handle.write('{}\t{}\n'.format(self._record_id, self._record_len))
        self._record_id = None
        self._record_len = 0


def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False):
    """Concatenate the gzipped files named on a queue into one output file.

    :param conn: queue-like object; ``conn.get()`` yields paths of gzipped
        input files and finally the string ``'STOP'``.
    :param out_dir: directory in which the output is written.
    :param output_filename: name of the concatenated output file.
    :param chunk_size: number of decompressed bytes read at a time.
    :param debug: report every input file on stderr.
    :param compress: gzip the output file.
    :param make_len_file: also write a ``.len`` file in ``out_dir`` holding
        ``record_id<TAB>sequence length`` for every FASTA record.

    Every input file is deleted once it has been copied.
    """
    input_filename = conn.get()
    open_output = gzip.open if compress else open
    len_output = None
    lengths = None
    if make_len_file:
        fa_pos = output_filename.find('.fa')
        # '.fa' should always be present; fall back to the full name if not
        len_stem = output_filename if fa_pos == -1 else output_filename[:fa_pos]
        len_output = open(os.path.join(out_dir, len_stem + '.len'), 'w')
        lengths = _FastaLengthWriter(len_output)
    try:
        with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
            while input_filename != 'STOP':
                if debug:
                    print('Reading', input_filename, file=sys.stderr)
                with gzip.open(input_filename, 'rb') as input_file:
                    read_chunk = functools.partial(input_file.read, chunk_size)
                    # b'' is the sentinel that ends the loop (note '' != b'' in Python 3)
                    for data in iter(read_chunk, b''):
                        if lengths is not None:
                            lengths.feed(data)
                        output_file.write(data)
                if lengths is not None:
                    lengths.finish()
                os.unlink(input_filename)
                input_filename = conn.get()
    finally:
        # the .len file only exists when it was requested
        if len_output is not None:
            len_output.close()


def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
    """Download one RefSeq division as one FASTA file per molecule type.

    :param division_name: a member of ``VALID_DIVISIONS``.
    :param mol_types: list of molecule types (keys of ``ENDING_MAPPINGS``).
    :param output_directory: directory for the downloads and the results
        (created if it does not exist).
    :param debug: print progress information on stderr.
    :param compress: gzip the resulting FASTA files.
    :returns: ``[release_num, final_output_filenames]``; the filenames are
        relative to ``output_directory`` and in the order of ``mol_types``.
    :raises ValueError: for an unknown division or molecule type, or for a
        directory listing line that is not in the expected format.
    :raises RuntimeError: if a helper process fails.
    """
    if division_name not in VALID_DIVISIONS:
        raise ValueError("Unknown division name ({})".format(division_name))
    for mol_type in mol_types:
        if mol_type not in ENDING_MAPPINGS:
            raise ValueError("Unknown molecule type ({})".format(mol_type))

    # Imported here rather than at module level so that the helpers of this
    # module stay usable where this third-party dependency is unavailable.
    import requests

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    r = requests.get(BASE_URL + 'RELEASE_NUMBER')
    r.raise_for_status()
    release_num = str(int(r.text.strip()))
    division_base_url = BASE_URL + DIVISION_DIRECTORIES.get(division_name, division_name)
    if debug:
        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
    r = requests.get(division_base_url)
    r.raise_for_status()
    listing_text = r.text

    # one queue and one concatenating helper process per molecule type
    unzip_queues = {}
    unzip_processes = []
    final_output_filenames = []
    for mol_type in mol_types:
        q = unzip_queues[mol_type] = Queue()
        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
        if compress:
            output_filename += '.gz'
        final_output_filenames.append(output_filename)
        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
                                       kwargs=dict(debug=debug, compress=compress)))
        unzip_processes[-1].start()

    try:
        # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M
        for line in StringIO(listing_text):
            if '.gz' not in line:
                continue
            parts = line.split('"')
            if len(parts) != 3:
                raise ValueError("Unexpected line format: {}".format(line.rstrip()))
            filename = parts[1]
            for mol_type in mol_types:
                ending = ENDING_MAPPINGS[mol_type]
                if not filename.endswith(ending):
                    continue
                if debug:
                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
                output_filename = os.path.join(output_directory, filename)
                # stream=True keeps a large download out of memory
                r = requests.get(division_base_url + '/' + filename, stream=True)
                r.raise_for_status()
                with open(output_filename, 'wb') as output_file:
                    for chunk in r.iter_content(chunk_size=4096):
                        output_file.write(chunk)
                unzip_queues[mol_type].put(output_filename)
    finally:
        # always release the helper processes, also when a download failed
        for mol_type in mol_types:
            unzip_queues[mol_type].put('STOP')
        for process in unzip_processes:
            process.join()

    # the results are only complete once every helper has finished cleanly
    for process in unzip_processes:
        if process.exitcode != 0:
            raise RuntimeError('Decompression process failed with exit code {}'.format(process.exitcode))

    return [release_num, final_output_filenames]


def main():
    """Parse the command line, fetch the divisions and update the Galaxy JSON."""
    parser = argparse.ArgumentParser(description='Download RefSeq databases')
    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
    parser.add_argument('--division_names', required=True, help='RefSeq divisions to download')
    parser.add_argument('--mol_types', required=True, help='Molecule types (genomic, rna, protein) to fetch')
    parser.add_argument('--pin_date', help='Force download date to this version string')
    args = parser.parse_args()

    division_names = args.division_names.split(',')
    mol_types = args.mol_types.split(',')
    run_by_galaxy = args.galaxy_datamanager_filename is not None
    data_manager_dict = {}
    if run_by_galaxy:
        with open(args.galaxy_datamanager_filename) as handle:
            dm_opts = json.load(handle)
        # take the extra_files_path of the first output parameter
        output_directory = dm_opts['output_data'][0]['extra_files_path']
    else:
        output_directory = args.output_directory
    if args.pin_date is not None:
        today_str = args.pin_date
    else:
        today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format

    for division_name in division_names:
        release_num, fasta_files = get_refseq_division(division_name, mol_types, output_directory,
                                                       args.debug, args.compress)
        if not run_by_galaxy:
            continue
        for mol_type, fasta_file in zip(mol_types, fasta_files):
            if mol_type not in fasta_file:
                raise ValueError("Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_file))
            unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now same as dbkey
            dbkey = unique_key
            desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
            path = os.path.join(output_directory, fasta_file)
            _add_data_table_entry(data_manager_dict=data_manager_dict,
                                  data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
                                  data_table_name='all_fasta')

    if run_by_galaxy:
        with open(args.galaxy_datamanager_filename, 'w') as handle:
            json.dump(data_manager_dict, handle)


if __name__ == '__main__':
    main()
<!-- data_manager/fetch_refseq.xml (the changeset shown on this page removes this file) -->
<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.19" tool_type="manage_data">
    <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description>
    <requirements>
        <requirement type="package" version="3">python</requirement>
        <!-- fetch_refseq.py imports requests -->
        <requirement type="package" version="2.22.0">requests</requirement>
    </requirements>
    <command detect_errors="aggressive"><![CDATA[
    python3 '$__tool_directory__/fetch_refseq.py'
      #if str( $advanced.advanced_selector ) == 'advanced':
        ## left unquoted on purpose: the false value is empty and must not
        ## reach the script as an empty argument
        ${advanced.compress}
      #end if
      --galaxy_datamanager_filename '${output_file}'
      --division_names '${division_names}'
      --mol_types '${mol_types}'
      #if str( $pin_date ) != 'NO':
        --pin_date '${pin_date}'
      #end if
    ]]></command>
    <inputs>
        <param argument="division_names" type="select" label="RefSeq division" multiple="true">
            <option value="archea">Archaea</option>
            <option value="bacteria">Bacteria</option>
            <option value="complete">Complete</option>
            <option value="fungi">Fungi</option>
            <option value="invertebrate">Invertebrate</option>
            <option value="mitochondrion">Mitochondrion</option>
            <option value="other">Other</option>
            <option value="plant">Plant</option>
            <option value="plasmid">Plasmid</option>
            <option value="plastid">Plastid</option>
            <option value="protozoa">Protozoa</option>
            <option value="vertebrate_mammalian">Mammalian Vertebrate</option>
            <option value="vertebrate_other">Other Vertebrate</option>
            <option value="viral">Viral</option>
        </param>
        <param argument="mol_types" type="select" multiple="true" label="Molecule type" help="Select at least one of genomic, protein or rna sequence">
            <option value="protein">Protein</option>
            <option value="genomic">Genomic (DNA)</option>
            <option value="rna">RNA</option>
        </param>
        <conditional name="advanced">
            <param name="advanced_selector" type="select" label="Advanced Options">
                <option value="basic" selected="True">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic">
            </when>
            <when value="advanced">
                <param type="boolean" argument="--compress" truevalue="--compress" falsevalue="" label="Compress FASTA files"
                    help="Compress downloaded FASTA files (with gzip). Limits compatibility with tools expecting uncompressed FASTA."/>
            </when>
        </conditional>
        <param argument="--pin_date" type="hidden" value="NO" help="Used for testing"/>
    </inputs>
    <outputs>
        <data name="output_file" format="data_manager_json"/>
    </outputs>
    <tests>
        <test>
            <param name="division_names" value="plastid"/>
            <param name="mol_types" value="protein"/>
            <param name="pin_date" value="2018-03-14"/>
            <param name="advanced_selector" value="basic"/>
            <output name="output_file">
                <assert_contents>
                    <has_text text="2018-03-14"/>
                    <has_text text="refseq_plastid"/>
                    <has_text text="/refseq_plastid."/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
This data manager fetches FASTA format collections of proteins, nucleotides (genomic DNA) and RNA
from NCBI's RefSeq_ data collection.

RefSeq is released every two months and consists of a number of divisions. Some sequences are shared
between multiple divisions. This data manager allows the Galaxy administrator to select which
divisions and which molecule types within each division to download. Once downloaded the
files are made accessible by adding an entry into the *all_fasta* data table.

.. _RefSeq: https://www.ncbi.nlm.nih.gov/refseq/
    ]]>
    </help>
    <citations>
        <citation type="doi">10.1093/nar/gkv1189</citation>
    </citations>
</tool>
--- a/data_manager_conf.xml Fri Sep 28 23:46:24 2018 -0400 +++ b/data_manager_conf.xml Thu Apr 16 10:19:57 2020 +0000 @@ -1,17 +1,16 @@ <?xml version="1.0"?> <data_managers> - <data_manager tool_file="data_manager/fetch_refseq.xml" id="fetch_genome_fetch_refseq"> - <data_table name="all_fasta"> + <data_manager tool_file="data_manager/fetch_artic_primers.xml" id="fetch_artic_primers"> + <data_table name="artic_primers"> <output> <column name="value" /> - <column name="dbkey" /> - <column name="name" /> + <column name="description" /> <column name="path" output_ref="output_file" > <move type="file"> <source>${path}</source> - <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</target> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">artic_primers/#echo str($name).bed#</target> </move> - <value_translation>refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</value_translation> + <value_translation>artic_primers/#echo str($name).bed#</value_translation> <value_translation type="function">abspath</value_translation> </column> </output>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/artic.json Thu Apr 16 10:19:57 2020 +0000 @@ -0,0 +1,21 @@ +{ + "data_tables": { + "artic_primers": [ + { + "value": "ARTICv1", + "description": "ARTIC v1 primer set", + "path": "tmp/ARTICv1.bed" + }, + { + "value": "ARTICv2", + "description": "ARTIC v2 primer set", + "path": "tmp/ARTICv2.bed" + }, + { + "value": "ARTICv3", + "description": "ARTIC v3 primer set", + "path": "tmp/ARTICv3.bed" + } + ] + } +}
--- a/test-data/plastid.json Fri Sep 28 23:46:24 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -{"data_tables": {"all_fasta": [{"path": "tmp/plastid.89.protein.fasta.gz", "dbkey": "plastid.89.protein", "name": "RefSeq plastid Release 89 protein (2018-09-07)", "value": "plastid.89.protein.2018-03-14"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/artic_primers.loc.sample Thu Apr 16 10:19:57 2020 +0000 @@ -0,0 +1,7 @@ +# this is a tab separated file describing the location of ARTIC primers for use in SARS-CoV-2 sequencing +# +# the columns are: +# value description path +# +# for example +# ARTICv1 ARTIC v1 primers /data/galaxy/tool_data/artic_primers/ARTICv1.bed \ No newline at end of file
--- a/tool_data_table_conf.xml.sample Fri Sep 28 23:46:24 2018 -0400 +++ b/tool_data_table_conf.xml.sample Thu Apr 16 10:19:57 2020 +0000 @@ -1,7 +1,7 @@ <tables> <!-- Locations of all fasta files under genome directory --> - <table name="all_fasta" comment_char="#"> - <columns>value, dbkey, name, path</columns> - <file path="tool-data/all_fasta.loc" /> + <table name="artic_primers" comment_char="#"> + <columns>value, description, path</columns> + <file path="tool-data/artic_primers.loc" /> </table> </tables> \ No newline at end of file