comparison data_manager/fetch_refseq.py @ 2:a4ee45e7237b draft

planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit a0125981706495e0a8be4fafe2eb1af3f0cfdaa3-dirty
author sanbi-uwc
date Fri, 07 Sep 2018 18:08:23 -0400
parents cfe6cd521835
children 4852eb1a75e5
comparison
equal deleted inserted replaced
1:300562c726cc 2:a4ee45e7237b
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 from __future__ import print_function, division 3 from __future__ import print_function, division
4 import argparse 4 import argparse
5 from datetime import date 5 from datetime import date
6 import functools
6 import gzip 7 import gzip
7 import json 8 import json
8 from multiprocessing import Process, Queue 9 from multiprocessing import Process, Queue
9 import os 10 import os
10 import os.path 11 import os.path
50 with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file: 51 with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
51 while input_filename != 'STOP': 52 while input_filename != 'STOP':
52 if debug: 53 if debug:
53 print('Reading', input_filename, file=sys.stderr) 54 print('Reading', input_filename, file=sys.stderr)
54 with gzip.open(input_filename) as input_file: 55 with gzip.open(input_filename) as input_file:
55 data = input_file.read(chunk_size) 56 read_chunk = functools.partial(input_file.read, (chunk_size))
56 while data != '': 57 for data in iter(read_chunk, ''): # use '' as a sentinel to stop the loop
57 output_file.write(data) 58 output_file.write(data)
58 data = input_file.read(chunk_size) 59 os.unlink(input_filename)
59 # os.unlink(input_filename)
60 input_filename = conn.get() 60 input_filename = conn.get()
61 61
62 def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False): 62 def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
63 base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/' 63 base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
64 valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other', 64 valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',