Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq
comparison data_manager/fetch_refseq.py @ 2:a4ee45e7237b draft
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit a0125981706495e0a8be4fafe2eb1af3f0cfdaa3-dirty
author | sanbi-uwc |
---|---|
date | Fri, 07 Sep 2018 18:08:23 -0400 |
parents | cfe6cd521835 |
children | 4852eb1a75e5 |
comparison (legend: equal, deleted, inserted, replaced)
1:300562c726cc | 2:a4ee45e7237b |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 from __future__ import print_function, division | 3 from __future__ import print_function, division |
4 import argparse | 4 import argparse |
5 from datetime import date | 5 from datetime import date |
6 import functools | |
6 import gzip | 7 import gzip |
7 import json | 8 import json |
8 from multiprocessing import Process, Queue | 9 from multiprocessing import Process, Queue |
9 import os | 10 import os |
10 import os.path | 11 import os.path |
50 with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file: | 51 with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file: |
51 while input_filename != 'STOP': | 52 while input_filename != 'STOP': |
52 if debug: | 53 if debug: |
53 print('Reading', input_filename, file=sys.stderr) | 54 print('Reading', input_filename, file=sys.stderr) |
54 with gzip.open(input_filename) as input_file: | 55 with gzip.open(input_filename) as input_file: |
55 data = input_file.read(chunk_size) | 56 read_chunk = functools.partial(input_file.read, (chunk_size)) |
56 while data != '': | 57 for data in iter(read_chunk, ''): # use '' as a sentinel to stop the loop |
57 output_file.write(data) | 58 output_file.write(data) |
58 data = input_file.read(chunk_size) | 59 os.unlink(input_filename) |
59 # os.unlink(input_filename) | |
60 input_filename = conn.get() | 60 input_filename = conn.get() |
61 | 61 |
62 def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False): | 62 def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False): |
63 base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/' | 63 base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/' |
64 valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other', | 64 valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other', |