# HG changeset patch
# User sanbi-uwc
# Date 1538192784 14400
# Node ID 75c1817c2ecffda26218065623c2b4d52a1d1cab
# Parent 0347d4d8b436034ed13ce46fe7b6c9dd8e430d0d
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit 6c7dfb528659d4413423377eced68ca360ed1316-dirty
diff -r 0347d4d8b436 -r 75c1817c2ecf data_manager/fetch_refseq.py
--- a/data_manager/fetch_refseq.py Sat Sep 08 08:48:38 2018 -0400
+++ b/data_manager/fetch_refseq.py Fri Sep 28 23:46:24 2018 -0400
@@ -1,17 +1,19 @@
#!/usr/bin/env python
-from __future__ import print_function, division
+from __future__ import division, print_function
+
import argparse
-from datetime import date
import functools
import gzip
import json
-from multiprocessing import Process, Queue
import os
import os.path
-import re
+import sys
+from datetime import date
+from multiprocessing import Process, Queue
+
import requests
-import sys
+
try:
from io import StringIO
except ImportError:
@@ -36,18 +38,30 @@
# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
# where fna and faa are FASTA, gbff and gpff are Genbank
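+# e.g. viral.1.1.genomic.fna.gz (nucleotide FASTA) or viral.1.protein.faa.gz (protein FASTA)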
+
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', [])
data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
return data_manager_dict
-def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False):
+
+def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False):
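+    # make_len_file: when True, also write a '<output basename>.len' file containing one
+    # 'sequence_id<TAB>sequence_length' line per FASTA record in the output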
input_filename = conn.get()
if compress:
open_output = gzip.open
else:
open_output = open
+    if make_len_file:
+        fa_pos = output_filename.find('.fa')
+        if fa_pos == -1:
+            # should not happen: the output filename is expected to contain '.fa'
+            len_filename = output_filename + '.len'
+        else:
+            len_filename = output_filename[:fa_pos] + '.len'
+        len_output = open(len_filename, 'w')
+        record_len = 0
+        record_id = ''
+        partial_line = ''
with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
while input_filename != 'STOP':
if debug:
@@ -55,14 +69,31 @@
with gzip.open(input_filename, 'rb') as input_file:
read_chunk = functools.partial(input_file.read, (chunk_size))
for data in iter(read_chunk, b''): # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3
+                    if make_len_file:
+                        # decode the chunk and split it into lines, carrying any incomplete final
+                        # line over to the next chunk so records parse correctly across chunk boundaries
+                        lines = (partial_line + data.decode('utf-8')).split('\n')
+                        partial_line = lines.pop()
+                        for line in lines:
+                            if line.startswith('>'):
+                                if record_id != '':
+                                    len_output.write('{}\t{}\n'.format(record_id, record_len))
+                                # start a new record: remember its ID and reset its length
+                                record_len = 0
+                                record_id = line[1:].split()[0]
+                            else:
+                                assert record_id != '', "FASTA data found before FASTA record ID known in {}, data: {}".format(input_filename, line)
+                                record_len += len(line.strip())
output_file.write(data)
+            if make_len_file:
+                # count any unterminated final line, write this file's last record, and reset for the next input file
+                record_len += len(partial_line.strip())
+                if record_id != '':
+                    len_output.write('{}\t{}\n'.format(record_id, record_len))
+                record_id, record_len, partial_line = '', 0, ''
os.unlink(input_filename)
input_filename = conn.get()
+    if make_len_file:
+        len_output.close()
+
def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
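+    # scrape the HTML listing of one RefSeq release division and hand each matching *.gz file
+    # to an unzip_to() worker, producing one concatenated output file per requested mol_type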
base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
- 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
+ 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
ending_mappings = {
'genomic': '.genomic.fna.gz',
'protein': '.protein.faa.gz',
@@ -81,7 +112,7 @@
print('Retrieving {}'.format(division_base_url), file=sys.stderr)
r = requests.get(division_base_url)
listing_text = r.text
-
+
unzip_queues = {}
unzip_processes = []
final_output_filenames = []
@@ -94,10 +125,10 @@
unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
kwargs=dict(debug=debug, compress=compress)))
unzip_processes[-1].start()
-
+
# sample line: vertebrate_other.86.genomic.gbff.gz 2018-07-13 00:59 10M
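+    # listing_text is the HTML index page of the division directory; lines naming a .gz file
+    # contain exactly one quoted href, so splitting on '"' yields [prefix, filename, suffix]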
for line in StringIO(listing_text):
- if not '.gz' in line:
+ if '.gz' not in line:
continue
parts = line.split('"')
assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
@@ -121,6 +152,7 @@
return [release_num, final_output_filenames]
+
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Download RefSeq databases')
parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
@@ -136,7 +168,7 @@
mol_types = args.mol_types.split(',')
if args.galaxy_datamanager_filename is not None:
dm_opts = json.loads(open(args.galaxy_datamanager_filename).read())
- output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter
+ output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter
data_manager_dict = {}
else:
output_directory = args.output_directory
@@ -144,16 +176,16 @@
if args.pin_date is not None:
today_str = args.pin_date
else:
- today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format
+ today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format
[release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
if args.galaxy_datamanager_filename is not None:
for i, mol_type in enumerate(mol_types):
assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i])
- unique_key = division_name + '.' + release_num + '.' + mol_type # note: this is now same as dbkey
- dbkey = division_name + '.' + release_num + '.' + mol_type
+        unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now the same as the dbkey
+ dbkey = unique_key
desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
path = os.path.join(output_directory, fasta_files[i])
- _add_data_table_entry(data_manager_dict=data_manager_dict,
+ _add_data_table_entry(data_manager_dict=data_manager_dict,
data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
data_table_name='all_fasta')
- open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
\ No newline at end of file
+ open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
diff -r 0347d4d8b436 -r 75c1817c2ecf data_manager/fetch_refseq.xml
--- a/data_manager/fetch_refseq.xml Sat Sep 08 08:48:38 2018 -0400
+++ b/data_manager/fetch_refseq.xml Fri Sep 28 23:46:24 2018 -0400
@@ -1,4 +1,4 @@
-
+
     <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description>
python
@@ -63,6 +63,8 @@