# HG changeset patch
# User sanbi-uwc
# Date 1587032397 0
# Node ID d118e256facad095d178b5f959acbeed957d75ac
# Parent 75c1817c2ecffda26218065623c2b4d52a1d1cab
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit 120c6491f4b0888220e432693a9805d8198d7397"
diff -r 75c1817c2ecf -r d118e256faca data_manager/fetch_artic_primers.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_artic_primers.py Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+from __future__ import division, print_function
+
+import argparse
+import json
+import os
+import os.path
+import sys
+
+import requests
+
+DATA_TABLE_NAME = 'artic_primers'
+
+
+def fetch_artic_primers(output_filename, output_directory, primers):
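+    # Map each primer set name (the data table 'value') to the URL of its
+    # BED file in the artic-network/artic-ncov2019 primer scheme repository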
+    primer_sets = {
+        'ARTICv1': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V1/nCoV-2019.bed',
+        'ARTICv2': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V2/nCoV-2019.bed',
+        'ARTICv3': 'https://raw.githubusercontent.com/artic-network/artic-ncov2019/master/primer_schemes/nCoV-2019/V3/nCoV-2019.bed'
+    }
+
+    if not os.path.isdir(output_directory):
+        os.makedirs(output_directory)
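+    # The Galaxy-supplied data manager JSON may already describe other data
+    # tables; keep any existing entries and append to this tool's table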
+    with open(output_filename) as input_fh:
+        data_manager_dict = json.load(input_fh)
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][DATA_TABLE_NAME] = data_manager_dict['data_tables'].get(DATA_TABLE_NAME, [])
+
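+    # Fetch each requested primer scheme, write it to <name>.bed and record
+    # a (value, path, description) row for the artic_primers data table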
+    data = []
+    for name in primers:
+        if name not in primer_sets:
+            print('Error: unknown primer set', name, file=sys.stderr)
+            sys.exit(1)
+        url = primer_sets[name]
+        response = requests.get(url)
+        if response.status_code != 200:
+            print('Error: download of', url, 'failed with code', response.status_code, file=sys.stderr)
+            sys.exit(response.status_code)
+        bed_output_filename = os.path.join(output_directory, name + '.bed')
+        with open(bed_output_filename, 'w') as bed_output:
+            bed_output.write(response.text)
+        description = name[:-2] + ' ' + name[-2:] + ' primer set'
+        data.append(dict(value=name, path=bed_output_filename, description=description))
+    data_manager_dict['data_tables'][DATA_TABLE_NAME].extend(data)
+    print(data_manager_dict)
+    with open(output_filename, 'w') as output_fh:
+        json.dump(data_manager_dict, output_fh)
+
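+
+# argparse Action that splits a comma separated option value such as
+# 'ARTICv1,ARTICv2' into a list of primer set names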
+class SplitArgs(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, values.split(','))
+
+
+if __name__ == '__main__':
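+    # Example invocation (paths are illustrative):
+    #   fetch_artic_primers.py --galaxy_datamanager_filename dm.json --primers ARTICv1,ARTICv3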
+    parser = argparse.ArgumentParser(description='Fetch ARTIC primer files for Galaxy use')
+    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
+    parser.add_argument('--galaxy_datamanager_filename', required=True, help='Galaxy JSON format file describing data manager inputs')
+    parser.add_argument('--primers', default=['ARTICv1', 'ARTICv2', 'ARTICv3'], action=SplitArgs, help='Comma separated list of primers to fetch')
+    args = parser.parse_args()
+    fetch_artic_primers(args.galaxy_datamanager_filename, args.output_directory, args.primers)
\ No newline at end of file
diff -r 75c1817c2ecf -r d118e256faca data_manager/fetch_artic_primers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/fetch_artic_primers.xml Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,31 @@
+<!-- Galaxy data manager tool wrapper; the XML element markup was stripped
+     in extraction. Recoverable content: requirements and command line. -->
+python
+requests
+python '$__tool_directory__/fetch_artic_primers.py'
+    --galaxy_datamanager_filename '${output_file}'
\ No newline at end of file
diff -r 75c1817c2ecf -r d118e256faca data_manager/fetch_refseq.py
--- a/data_manager/fetch_refseq.py Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-
-from __future__ import division, print_function
-
-import argparse
-import functools
-import gzip
-import json
-import os
-import os.path
-import sys
-from datetime import date
-from multiprocessing import Process, Queue
-
-import requests
-
-try:
-    from io import StringIO
-except ImportError:
-    from StringIO import StringIO
-# Refseq structure
-# - Release number
-# - Divisions
-# 1. archea
-# 2. bacteria
-# 3. fungi
-# 4. invertebrate
-# 5. mitochondrion
-# 6. other
-# 7. plant
-# 8. plasmid
-# 9. plastid
-# 10. protozoa
-# 11. vertebrate mammalian
-# 12. vertebrate other
-# 13. viral
-# within each division
-# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
-# where fna and faa are FASTA, gbff and gpff are Genbank
-
-
-def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
-    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
-    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', [])
-    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
-    return data_manager_dict
-
-
-def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False):
-    input_filename = conn.get()
-    if compress:
-        open_output = gzip.open
-    else:
-        open_output = open
-    if make_len_file:
-        fa_pos = output_filename.find('.fa')
-        if fa_pos == -1:
-            # this should not happen - filename does not contain '.fa'
-            len_filename = output_filename + '.len'
-        else:
-            len_filename = output_filename[:fa_pos] + '.len'
-        len_output = open(len_filename, 'wb')
-        record_len = 0
-        record_id = ''
-    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
-        while input_filename != 'STOP':
-            if debug:
-                print('Reading', input_filename, file=sys.stderr)
-            with gzip.open(input_filename, 'rb') as input_file:
-                read_chunk = functools.partial(input_file.read, (chunk_size))
-                for data in iter(read_chunk, b''):  # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3
-                    if make_len_file:
-                        # break data into lines and parse as FASTA, perhaps continuing from partial previous record
-                        for line in data.split('\n'):
-                            if line.startswith('>'):
-                                if record_id != '':
-                                    len_output.write('{}\t{}\n'.format(record_id, record_len))
-                                # update record ID of record we are processing, set length to 0
-                                record_len = 0
-                                record_id = line[1:].split()[0]
-                            else:
-                                assert record_id != '', "FASTA data found before FASTA record ID known in {}, data: {}".format(input_filename, line)
-                                record_len += len(line.strip())
-                    output_file.write(data)
-            if make_len_file:
-                # write last entry to .len file
-                len_output.write('{}\t{}\n'.format(record_id, record_len))
-            os.unlink(input_filename)
-            input_filename = conn.get()
-    len_output.close()
-
-
-def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
-    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
-    valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
-                           'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
-    ending_mappings = {
-        'genomic': '.genomic.fna.gz',
-        'protein': '.protein.faa.gz',
-        'rna': 'rna.fna.gz'
-    }
-    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
-    for mol_type in mol_types:
-        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
-    if not os.path.exists(output_directory):
-        os.mkdir(output_directory)
-    release_num_file = base_url + 'RELEASE_NUMBER'
-    r = requests.get(release_num_file)
-    release_num = str(int(r.text.strip()))
-    division_base_url = base_url + division_name
-    if debug:
-        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
-    r = requests.get(division_base_url)
-    listing_text = r.text
-
-    unzip_queues = {}
-    unzip_processes = []
-    final_output_filenames = []
-    for mol_type in mol_types:
-        q = unzip_queues[mol_type] = Queue()
-        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
-        if compress:
-            output_filename += '.gz'
-        final_output_filenames.append(output_filename)
-        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
-                                       kwargs=dict(debug=debug, compress=compress)))
-        unzip_processes[-1].start()
-
-    # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a>  2018-07-13 00:59  10M
-    for line in StringIO(listing_text):
-        if '.gz' not in line:
-            continue
-        parts = line.split('"')
-        assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
-        filename = parts[1]
-        for mol_type in mol_types:
-            ending = ending_mappings[mol_type]
-            if filename.endswith(ending):
-                if debug:
-                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
-                output_filename = os.path.join(output_directory, filename)
-                with open(output_filename, 'wb') as output_file:
-                    r = requests.get(division_base_url + '/' + filename)
-                    for chunk in r.iter_content(chunk_size=4096):
-                        output_file.write(chunk)
-                conn = unzip_queues[mol_type]
-                conn.put(output_filename)
-
-    for mol_type in mol_types:
-        conn = unzip_queues[mol_type]
-        conn.put('STOP')
-
-    return [release_num, final_output_filenames]
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Download RefSeq databases')
-    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
-    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
-    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
-    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
-    parser.add_argument('--division_names', help='RefSeq divisions to download')
-    parser.add_argument('--mol_types', help='Molecule types (genomic, rna, protein) to fetch')
-    parser.add_argument('--pin_date', help='Force download date to this version string')
-    args = parser.parse_args()
-
-    division_names = args.division_names.split(',')
-    mol_types = args.mol_types.split(',')
-    if args.galaxy_datamanager_filename is not None:
-        dm_opts = json.loads(open(args.galaxy_datamanager_filename).read())
-        output_directory = dm_opts['output_data'][0]['extra_files_path']  # take the extra_files_path of the first output parameter
-        data_manager_dict = {}
-    else:
-        output_directory = args.output_directory
-    for division_name in division_names:
-        if args.pin_date is not None:
-            today_str = args.pin_date
-        else:
-            today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format
-        [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
-        if args.galaxy_datamanager_filename is not None:
-            for i, mol_type in enumerate(mol_types):
-                assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i])
-                unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now same as dbkey
-                dbkey = unique_key
-                desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
-                path = os.path.join(output_directory, fasta_files[i])
-                _add_data_table_entry(data_manager_dict=data_manager_dict,
-                                      data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
-                                      data_table_name='all_fasta')
-            open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
diff -r 75c1817c2ecf -r d118e256faca data_manager/fetch_refseq.xml
--- a/data_manager/fetch_refseq.xml Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,87 +0,0 @@
-<!-- Galaxy data manager tool wrapper for fetch_refseq.py; the XML element
-     markup was stripped in extraction. Recoverable content follows. -->
-Fetch FASTA data from NCBI RefSeq and update all_fasta data table
-python
-10.1093/nar/gkv1189
\ No newline at end of file
diff -r 75c1817c2ecf -r d118e256faca data_manager_conf.xml
--- a/data_manager_conf.xml Fri Sep 28 23:46:24 2018 -0400
+++ b/data_manager_conf.xml Thu Apr 16 10:19:57 2020 +0000
@@ -1,17 +1,16 @@
-<!-- data manager registration for the fetch_refseq tool and the all_fasta
-     data table it populates (element markup stripped in extraction) -->
+<!-- data manager registration for the fetch_artic_primers tool and the
+     artic_primers data table it populates (element markup stripped in extraction) -->
diff -r 75c1817c2ecf -r d118e256faca test-data/artic.json
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/artic.json Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,21 @@
+{
+ "data_tables": {
+ "artic_primers": [
+ {
+ "value": "ARTICv1",
+ "description": "ARTIC v1 primer set",
+ "path": "tmp/ARTICv1.bed"
+ },
+ {
+ "value": "ARTICv2",
+ "description": "ARTIC v2 primer set",
+ "path": "tmp/ARTICv2.bed"
+ },
+ {
+ "value": "ARTICv3",
+ "description": "ARTIC v3 primer set",
+ "path": "tmp/ARTICv3.bed"
+ }
+ ]
+ }
+}
diff -r 75c1817c2ecf -r d118e256faca test-data/plastid.json
--- a/test-data/plastid.json Fri Sep 28 23:46:24 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-{"data_tables": {"all_fasta": [{"path": "tmp/plastid.89.protein.fasta.gz", "dbkey": "plastid.89.protein", "name": "RefSeq plastid Release 89 protein (2018-09-07)", "value": "plastid.89.protein.2018-03-14"}]}}
\ No newline at end of file
diff -r 75c1817c2ecf -r d118e256faca tool-data/artic_primers.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/artic_primers.loc.sample Thu Apr 16 10:19:57 2020 +0000
@@ -0,0 +1,7 @@
+# this is a tab separated file describing the location of ARTIC primers for use in SARS-CoV-2 sequencing
+#
+# the columns are:
+# value description path
+#
+# for example
+# ARTICv1 ARTIC v1 primers /data/galaxy/tool_data/artic_primers/ARTICv1.bed
\ No newline at end of file
diff -r 75c1817c2ecf -r d118e256faca tool_data_table_conf.xml.sample
--- a/tool_data_table_conf.xml.sample Fri Sep 28 23:46:24 2018 -0400
+++ b/tool_data_table_conf.xml.sample Thu Apr 16 10:19:57 2020 +0000
@@ -1,7 +1,7 @@
 <tables>
-    <table name="all_fasta" comment_char="#">
-        <columns>value, dbkey, name, path</columns>
-        <file path="tool-data/all_fasta.loc" />
+    <table name="artic_primers" comment_char="#">
+        <columns>value, description, path</columns>
+        <file path="tool-data/artic_primers.loc" />
     </table>
 </tables>
\ No newline at end of file