comparison: data_manager/fetch_refseq.py @ 19:d118e256faca (draft, default, tip)

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit 120c6491f4b0888220e432693a9805d8198d7397"
| author | sanbi-uwc |
|---|---|
| date | Thu, 16 Apr 2020 10:19:57 +0000 |
| parents | 75c1817c2ecf |
| children | |
Comparing revision 18:75c1817c2ecf with revision 19:d118e256faca:
```python
#!/usr/bin/env python

from __future__ import division, print_function

import argparse
import functools
import gzip
import json
import os
import os.path
import sys
from datetime import date
from multiprocessing import Process, Queue

import requests

try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO
# RefSeq structure
# - Release number
# - Divisions
#  1. archaea
#  2. bacteria
#  3. complete
#  4. fungi
#  5. invertebrate
#  6. mitochondrion
#  7. other
#  8. plant
#  9. plasmid
#  10. plastid
#  11. protozoa
#  12. vertebrate mammalian
#  13. vertebrate other
#  14. viral
# within each division, files are named
#  DIVNAME.\d+(\.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
# where fna and faa are FASTA, gbff and gpff are Genbank


def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
    return data_manager_dict


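# Illustrative shape (hypothetical values) of the dict built by
# _add_data_table_entry after one call with data_table_name='all_fasta':
#   {'data_tables': {'all_fasta': [{'value': 'refseq_viral.99.genomic',
#                                   'dbkey': 'refseq_viral.99.genomic',
#                                   'name': 'RefSeq viral Release 99 genomic (2020-04-16)',
#                                   'path': 'tmp/viral.99.genomic.fasta'}]}}

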
def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False, make_len_file=False):
    input_filename = conn.get()
    if compress:
        open_output = gzip.open
    else:
        open_output = open
    if make_len_file:
        fa_pos = output_filename.find('.fa')
        if fa_pos == -1:
            # this should not happen - filename does not contain '.fa'
            len_filename = output_filename + '.len'
        else:
            len_filename = output_filename[:fa_pos] + '.len'
        len_output = open(len_filename, 'wb')
        record_len = 0
        record_id = b''
        partial_line = b''  # trailing partial line carried over between chunks
    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
        while input_filename != 'STOP':
            if debug:
                print('Reading', input_filename, file=sys.stderr)
            with gzip.open(input_filename, 'rb') as input_file:
                read_chunk = functools.partial(input_file.read, chunk_size)
                for data in iter(read_chunk, b''):  # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3
                    if make_len_file:
                        # break data into lines and parse as FASTA; keep the last,
                        # possibly incomplete, line for the next chunk
                        lines = (partial_line + data).split(b'\n')
                        partial_line = lines.pop()
                        for line in lines:
                            if line.startswith(b'>'):
                                if record_id != b'':
                                    len_output.write('{}\t{}\n'.format(record_id.decode(), record_len).encode())
                                # update record ID of record we are processing, set length to 0
                                record_len = 0
                                record_id = line[1:].split()[0]
                            else:
                                assert record_id != b'', "FASTA data found before FASTA record ID known in {}, data: {}".format(input_filename, line)
                                record_len += len(line.strip())
                    output_file.write(data)
            if make_len_file:
                # write the last entry of this file to the .len file, then reset
                # state so the next input file does not repeat it
                record_len += len(partial_line.strip())
                if record_id != b'':
                    len_output.write('{}\t{}\n'.format(record_id.decode(), record_len).encode())
                record_id = b''
                record_len = 0
                partial_line = b''
            os.unlink(input_filename)
            input_filename = conn.get()
    if make_len_file:
        len_output.close()


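# Sketch of the producer/consumer protocol unzip_to expects (hypothetical paths):
# filenames of downloaded .gz parts arrive on the queue and a 'STOP' sentinel
# ends the worker.
#   q = Queue()
#   p = Process(target=unzip_to, args=(q, 'tmp', 'viral.99.genomic.fasta'))
#   p.start()
#   q.put('tmp/viral.1.1.genomic.fna.gz')  # once per downloaded part
#   q.put('STOP')
#   p.join()

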
def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
    valid_divisions = set(['archaea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
                           'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
    ending_mappings = {
        'genomic': '.genomic.fna.gz',
        'protein': '.protein.faa.gz',
        'rna': '.rna.fna.gz'
    }
    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
    for mol_type in mol_types:
        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    release_num_file = base_url + 'RELEASE_NUMBER'
    r = requests.get(release_num_file)
    release_num = str(int(r.text.strip()))
    division_base_url = base_url + division_name
    if debug:
        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
    r = requests.get(division_base_url)
    listing_text = r.text

    unzip_queues = {}
    unzip_processes = []
    final_output_filenames = []
    for mol_type in mol_types:
        q = unzip_queues[mol_type] = Queue()
        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
        if compress:
            output_filename += '.gz'
        final_output_filenames.append(output_filename)
        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
                                       kwargs=dict(debug=debug, compress=compress)))
        unzip_processes[-1].start()

    # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a>  2018-07-13 00:59  10M
    for line in StringIO(listing_text):
        if '.gz' not in line:
            continue
        parts = line.split('"')
        assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
        filename = parts[1]
        for mol_type in mol_types:
            ending = ending_mappings[mol_type]
            if filename.endswith(ending):
                if debug:
                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
                output_filename = os.path.join(output_directory, filename)
                with open(output_filename, 'wb') as output_file:
                    r = requests.get(division_base_url + '/' + filename)
                    for chunk in r.iter_content(chunk_size=4096):
                        output_file.write(chunk)
                conn = unzip_queues[mol_type]
                conn.put(output_filename)

    for mol_type in mol_types:
        conn = unzip_queues[mol_type]
        conn.put('STOP')
    # wait for the unzip workers to finish so the output FASTA files are
    # complete before their paths are returned
    for unzip_process in unzip_processes:
        unzip_process.join()

    return [release_num, final_output_filenames]

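# Hypothetical return value for get_refseq_division('viral', ['genomic', 'rna'], 'tmp'):
#   ['99', ['viral.99.genomic.fasta', 'viral.99.rna.fasta']]
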
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download RefSeq databases')
    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
    parser.add_argument('--division_names', help='RefSeq divisions to download')
    parser.add_argument('--mol_types', help='Molecule types (genomic, rna, protein) to fetch')
    parser.add_argument('--pin_date', help='Force download date to this version string')
    args = parser.parse_args()

    division_names = args.division_names.split(',')
    mol_types = args.mol_types.split(',')
    if args.galaxy_datamanager_filename is not None:
        dm_opts = json.loads(open(args.galaxy_datamanager_filename).read())
        output_directory = dm_opts['output_data'][0]['extra_files_path']  # take the extra_files_path of the first output parameter
        data_manager_dict = {}
    else:
        output_directory = args.output_directory
    for division_name in division_names:
        if args.pin_date is not None:
            today_str = args.pin_date
        else:
            today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format
        [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress)
        if args.galaxy_datamanager_filename is not None:
            for i, mol_type in enumerate(mol_types):
                assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i])
                unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type  # note: this is now same as dbkey
                dbkey = unique_key
                desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
                path = os.path.join(output_directory, fasta_files[i])
                _add_data_table_entry(data_manager_dict=data_manager_dict,
                                      data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
                                      data_table_name='all_fasta')
            open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode())
```
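A minimal sketch of driving the downloader directly from Python rather than through Galaxy (assumptions: this file is importable as `fetch_refseq`, the NCBI site is reachable, and the release number shown is illustrative):

```python
# Hypothetical interactive check; not part of the data manager itself.
from fetch_refseq import get_refseq_division

# downloads and decompresses the viral genomic FASTA into ./tmp
release_num, fasta_files = get_refseq_division('viral', ['genomic'], 'tmp', debug=True)
print(release_num, fasta_files)  # e.g. '99' ['viral.99.genomic.fasta']
```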
