Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq
changeset 18:75c1817c2ecf draft
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit 6c7dfb528659d4413423377eced68ca360ed1316-dirty
author | sanbi-uwc |
---|---|
date | Fri, 28 Sep 2018 23:46:24 -0400 |
parents | 0347d4d8b436 |
children | d118e256faca |
files | data_manager/fetch_refseq.py data_manager/fetch_refseq.xml |
diffstat | 2 files changed, 51 insertions(+), 17 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/fetch_refseq.py Sat Sep 08 08:48:38 2018 -0400 +++ b/data_manager/fetch_refseq.py Fri Sep 28 23:46:24 2018 -0400 @@ -1,17 +1,19 @@ #!/usr/bin/env python -from __future__ import print_function, division +from __future__ import division, print_function + import argparse -from datetime import date import functools import gzip import json -from multiprocessing import Process, Queue import os import os.path -import re +import sys +from datetime import date +from multiprocessing import Process, Queue + import requests -import sys + try: from io import StringIO except ImportError: @@ -36,18 +38,30 @@ # DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz # where fna and faa are FASTA, gbff and gpff are Genbank + def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name): data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', []) data_manager_dict['data_tables'][data_table_name].append(data_table_entry) return data_manager_dict -def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False): + +def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False): input_filename = conn.get() if compress: open_output = gzip.open else: open_output = open + if make_len_file: + fa_pos = output_filename.find('.fa') + if fa_pos == -1: + # this should not happen - filename does not contain '.fa' + len_filename = output_filename + '.len' + else: + len_filename = output_filename[:fa_pos] + '.len' + len_output = open(len_filename, 'wb') + record_len = 0 + record_id = '' with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file: while input_filename != 'STOP': if debug: @@ -55,14 +69,31 @@ with gzip.open(input_filename, 'rb') as input_file: read_chunk = functools.partial(input_file.read, (chunk_size)) for data in iter(read_chunk, b''): # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3 + if make_len_file: + # break data into lines and parse as FASTA, perhaps continuing from partial previous record + for line in data.split('\n'): + if line.startswith('>'): + if record_id != '': + len_output.write('{}\t{}\n'.format(record_id, record_len)) + # update record ID of record we are processing, set length to 0 + record_len = 0 + record_id = line[1:].split()[0] + else: + assert record_id != '', "FASTA data found before FASTA record ID known in {}, data: {}".format(input_filename, line) + record_len += len(line.strip()) output_file.write(data) + if make_len_file: + # write last entry to .len file + len_output.write('{}\t{}\n'.format(record_id, record_len)) os.unlink(input_filename) input_filename = conn.get() + len_output.close() + def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False): base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/' valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other', - 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral']) + 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral']) ending_mappings = { 'genomic': '.genomic.fna.gz', 'protein': '.protein.faa.gz', @@ -81,7 +112,7 @@ print('Retrieving {}'.format(division_base_url), file=sys.stderr) r = requests.get(division_base_url) listing_text = r.text - + unzip_queues = {} unzip_processes = [] final_output_filenames = [] @@ -94,10 +125,10 @@ unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename), kwargs=dict(debug=debug, compress=compress))) unzip_processes[-1].start() - + # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M for line in StringIO(listing_text): - if not '.gz' in line: + if '.gz' not in line: continue parts = line.split('"') assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip()) @@ -121,6 +152,7 @@ return [release_num, final_output_filenames] + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Download RefSeq databases') parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)') @@ -136,7 +168,7 @@ mol_types = args.mol_types.split(',') if args.galaxy_datamanager_filename is not None: dm_opts = json.loads(open(args.galaxy_datamanager_filename).read()) - output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter + output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter data_manager_dict = {} else: output_directory = args.output_directory @@ -144,16 +176,16 @@ if args.pin_date is not None: today_str = args.pin_date else: - today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format + today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress) if args.galaxy_datamanager_filename is not None: for i, mol_type in enumerate(mol_types): assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i]) - unique_key = division_name + '.' + release_num + '.' + mol_type # note: this is now same as dbkey - dbkey = division_name + '.' + release_num + '.' + mol_type + unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type # note: this is now same as dbkey + dbkey = unique_key desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')' path = os.path.join(output_directory, fasta_files[i]) - _add_data_table_entry(data_manager_dict=data_manager_dict, + _add_data_table_entry(data_manager_dict=data_manager_dict, data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path), data_table_name='all_fasta') - open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode()) \ No newline at end of file + open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
--- a/data_manager/fetch_refseq.xml Sat Sep 08 08:48:38 2018 -0400 +++ b/data_manager/fetch_refseq.xml Fri Sep 28 23:46:24 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.18" tool_type="manage_data"> +<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.19" tool_type="manage_data"> <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description> <requirements> <requirement type="package" version="3">python</requirement> @@ -63,6 +63,8 @@ <output name="output_file"> <assert_contents> <has_text text="2018-03-14"/> + <has_text text="refseq_plastid"/> + <has_text text="/refseq_plastid."/> </assert_contents> </output> </test>