data_manager_fetch_refseq: data_manager/fetch_refseq.py @ 0:cfe6cd521835 (draft)
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
author: sanbi-uwc
date: Fri, 07 Sep 2018 17:13:20 -0400
parents: (none)
children: a4ee45e7237b

#!/usr/bin/env python

from __future__ import print_function, division
import argparse
from datetime import date
import gzip
import json
from multiprocessing import Process, Queue
import os
import os.path
import re
import requests
import sys
try:
    from io import StringIO
except ImportError:
    from StringIO import StringIO
# RefSeq structure
# - Release number
# - Divisions
#   1. archaea
#   2. bacteria
#   3. fungi
#   4. invertebrate
#   5. mitochondrion
#   6. other
#   7. plant
#   8. plasmid
#   9. plastid
#   10. protozoa
#   11. vertebrate mammalian
#   12. vertebrate other
#   13. viral
# within each division
#   DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz
# where fna and faa are FASTA, gbff and gpff are GenBank
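# Example (illustrative): a viral division listing contains files such as
#   viral.1.genomic.fna.gz, viral.1.protein.faa.gz and viral.1.rna.fna.gz
# (the file numbers and how many files exist differ between divisions and releases).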

def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
    return data_manager_dict
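
# Sketch of the structure built above (for reference): after one call with
# data_table_name='all_fasta' the dict resembles
#   {'data_tables': {'all_fasta': [{'value': ..., 'dbkey': ..., 'name': ..., 'path': ...}]}}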

def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False):
    """Read gzip'd filenames from the queue and append their decompressed contents to one output file until the 'STOP' sentinel arrives."""
    input_filename = conn.get()
    if compress:
        open_output = gzip.open
    else:
        open_output = open
    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
        while input_filename != 'STOP':
            if debug:
                print('Reading', input_filename, file=sys.stderr)
            with gzip.open(input_filename) as input_file:
                data = input_file.read(chunk_size)
                while data:
                    output_file.write(data)
                    data = input_file.read(chunk_size)
            # os.unlink(input_filename)
            input_filename = conn.get()
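
# Illustrative use of the pattern above (not executed here; filenames are placeholders):
# a writer process is fed gzip'd files over a Queue and concatenates their decompressed
# contents into one output file, stopping at the 'STOP' sentinel.
#   q = Queue()
#   p = Process(target=unzip_to, args=(q, '/tmp', 'combined.fasta'))
#   p.start()
#   q.put('part1.fna.gz'); q.put('part2.fna.gz'); q.put('STOP')
#   p.join()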

def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
    valid_divisions = set(['archaea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
                           'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral'])
    ending_mappings = {
        'genomic': '.genomic.fna.gz',
        'protein': '.protein.faa.gz',
        'rna': '.rna.fna.gz'
    }
    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
    for mol_type in mol_types:
        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    release_num_file = base_url + 'RELEASE_NUMBER'
    r = requests.get(release_num_file)
    release_num = str(int(r.text.strip()))
    division_base_url = base_url + division_name
    if debug:
        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
    r = requests.get(division_base_url)
    listing_text = r.text

    unzip_queues = {}
    unzip_processes = []
    final_output_filenames = []
    for mol_type in mol_types:
        q = unzip_queues[mol_type] = Queue()
        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
        if compress:
            output_filename += '.gz'
        final_output_filenames.append(output_filename)
        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
                                       kwargs=dict(debug=debug, compress=compress)))
        unzip_processes[-1].start()

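    # Design note: one Queue and one writer process per molecule type, so the downloads
    # in the loop below overlap with the decompression/concatenation done by unzip_to().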
    # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M
    for line in StringIO(listing_text):
        if '.gz' not in line:
            continue
        parts = line.split('"')
        assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
        filename = parts[1]
        for mol_type in mol_types:
            ending = ending_mappings[mol_type]
            if filename.endswith(ending):
                if debug:
                    print('Downloading:', filename, ending, mol_type, file=sys.stderr)
                output_filename = os.path.join(output_directory, filename)
                with open(output_filename, 'wb') as output_file:
                    # stream the download so the whole file is not held in memory
                    r = requests.get(division_base_url + '/' + filename, stream=True)
                    for chunk in r.iter_content(chunk_size=4096):
                        output_file.write(chunk)
                conn = unzip_queues[mol_type]
                conn.put(output_filename)

    for mol_type in mol_types:
        conn = unzip_queues[mol_type]
        conn.put('STOP')
    # wait for the writer processes so the merged FASTA files are complete before returning
    for process in unzip_processes:
        process.join()

    return [release_num, final_output_filenames]
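
# Example use of get_refseq_division() (illustrative; the release number and the
# resulting filenames are hypothetical and depend on the current RefSeq release):
#   release_num, fasta_files = get_refseq_division('viral', ['genomic', 'rna'], 'refseq_out', compress=True)
#   # e.g. release_num == '90', fasta_files == ['viral.90.genomic.fasta.gz', 'viral.90.rna.fasta.gz']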

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download RefSeq databases')
    parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)')
    parser.add_argument('--compress', default=False, action='store_true', help='Compress output files')
    parser.add_argument('--output_directory', default='tmp', help='Directory to write output to')
    parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs')
    parser.add_argument('--division_names', nargs='+', help='RefSeq divisions to download')
    parser.add_argument('--mol_types', nargs='+', help='Molecule types (genomic, rna, protein) to fetch')
    parser.add_argument('--pin_date', help='Force download date to this version string')
    args = parser.parse_args()
    if args.galaxy_datamanager_filename is not None:
        dm_opts = json.loads(open(args.galaxy_datamanager_filename).read())
        output_directory = dm_opts['output_data'][0]['extra_files_path']  # take the extra_files_path of the first output parameter
        data_manager_dict = {}
    else:
        output_directory = args.output_directory
    for division_name in args.division_names:
        if args.pin_date is not None:
            today_str = args.pin_date
        else:
            today_str = date.today().strftime('%Y-%m-%d')  # ISO 8601 date format
        [release_num, fasta_files] = get_refseq_division(division_name, args.mol_types, output_directory, args.debug, args.compress)
        if args.galaxy_datamanager_filename is not None:
            for i, mol_type in enumerate(args.mol_types):
                assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i])
                unique_key = division_name + '.' + release_num + '.' + mol_type + '.' + today_str
                dbkey = division_name + '.' + release_num + '.' + mol_type
                desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')'
                path = os.path.join(output_directory, fasta_files[i])
                _add_data_table_entry(data_manager_dict=data_manager_dict,
                                      data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path),
                                      data_table_name='all_fasta')
            with open(args.galaxy_datamanager_filename, 'wb') as json_output:
                json_output.write(json.dumps(data_manager_dict).encode())
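
# Example invocation (illustrative; paths and values are placeholders):
#   python fetch_refseq.py --division_names viral --mol_types genomic rna \
#       --output_directory refseq_out --compress
# When run by Galaxy, --galaxy_datamanager_filename points at the data manager's JSON
# parameter file and the entries built above are recorded in the 'all_fasta' data table.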