Mercurial > repos > sanbi-uwc > data_manager_fetch_refseq
annotate data_manager/fetch_refseq.py @ 7:efdead58e937 draft
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit b682adad2c3c74567d23e1a5cf2bfcc3df1c96ae-dirty
author | sanbi-uwc |
---|---|
date | Fri, 07 Sep 2018 19:47:52 -0400 |
parents | 4852eb1a75e5 |
children | d878e492546c |
rev | line source |
---|---|
0
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
2 |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
3 from __future__ import print_function, division |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
4 import argparse |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
5 from datetime import date |
2
a4ee45e7237b
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit a0125981706495e0a8be4fafe2eb1af3f0cfdaa3-dirty
sanbi-uwc
parents:
0
diff
changeset
|
6 import functools |
0
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
7 import gzip |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
8 import json |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
9 from multiprocessing import Process, Queue |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
10 import os |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
11 import os.path |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
12 import re |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
13 import requests |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
14 import sys |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
15 try: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
16 from io import StringIO |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
17 except ImportError: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
18 from StringIO import StringIO |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
19 # Refseq structure |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
20 # - Release number |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
21 # - Divisions |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
22 # 1. archaea |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
23 # 2. bacteria |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
24 # 3. fungi |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
25 # 4. invertebrate |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
26 # 5. mitochondrion |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
27 # 6. other |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
28 # 7. plant |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
29 # 8. plasmid |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
30 # 9. plastid |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
31 # 10. protozoa |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
32 # 11. vertebrate mammalian |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
33 # 12. vertebrate other |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
34 # 13. viral |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
35 # within each division |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
36 # DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
37 # where fna and faa are FASTA, gbff and gpff are Genbank |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
38 |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
    """Append ``data_table_entry`` to the table ``data_table_name``.

    The ``'data_tables'`` mapping and the list for ``data_table_name`` are
    created on first use.  ``data_manager_dict`` is modified in place and is
    also returned.
    """
    data_tables = data_manager_dict.setdefault('data_tables', {})
    # Look the list up under data_table_name itself.  Seeding it from the
    # 'all_fasta' list would make every table share (and append to) that one
    # list object.
    data_tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
44 |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False):
    """Concatenate the decompressed contents of queued gzip files into one file.

    Filenames are taken from ``conn.get()`` one at a time until the sentinel
    string ``'STOP'`` is received.  Each named file is opened with gzip, its
    decompressed bytes are appended to ``out_dir/output_filename``, and the
    input file is then deleted.

    Parameters:
        conn: object with a blocking ``get()`` method returning filenames
            (the caller in this file passes a multiprocessing Queue).
        out_dir: directory the output file is written to.
        output_filename: name of the output file inside ``out_dir``.
        chunk_size: number of bytes copied per read.
        debug: when true, print each input filename to stderr.
        compress: when true, write the output through gzip.
    """
    import shutil  # local import: only this function needs it

    input_filename = conn.get()
    open_output = gzip.open if compress else open
    with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file:
        while input_filename != 'STOP':
            if debug:
                print('Reading', input_filename, file=sys.stderr)
            with gzip.open(input_filename, 'rb') as input_file:
                # Standard-library chunked copy instead of a hand-rolled
                # read/write loop.
                shutil.copyfileobj(input_file, output_file, chunk_size)
            # The compressed download is no longer needed once it has been
            # appended to the output.
            os.unlink(input_filename)
            input_filename = conn.get()
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
61 |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False):
    """Download one RefSeq division and merge its FASTA files per molecule type.

    The current release number is read from the RELEASE_NUMBER file on the
    NCBI server, the division's directory listing is fetched, and every file
    whose name ends with the FASTA suffix of a requested molecule type is
    downloaded into ``output_directory``.  One worker process per molecule
    type (running ``unzip_to``) concatenates the downloads into
    ``<division>.<release>.<mol_type>.fasta`` (plus ``.gz`` when ``compress``
    is true).

    Parameters:
        division_name: RefSeq division to fetch.  The legacy misspelling
            'archea' is accepted and treated as 'archaea'.
        mol_types: iterable of 'genomic', 'protein' and/or 'rna'.
        output_directory: directory for downloads and merged output; created
            if it does not exist.
        debug: when true, print progress messages to stderr.
        compress: when true, gzip the merged output files.

    Returns:
        ``[release_num, final_output_filenames]`` where ``release_num`` is a
        string and the filenames are relative to ``output_directory``.

    Raises:
        AssertionError: unknown division or molecule type, or an unexpected
            listing line format.
        requests.HTTPError: a request returned an HTTP error status.
    """
    base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/'
    valid_divisions = set(['archaea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other',
                           'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other',
                           'viral'])
    ending_mappings = {
        'genomic': '.genomic.fna.gz',
        'protein': '.protein.faa.gz',
        'rna': '.rna.fna.gz'
    }
    # The server directory is spelled 'archaea'; keep accepting the historical
    # misspelling so existing callers continue to work.
    if division_name == 'archea':
        division_name = 'archaea'
    assert division_name in valid_divisions, "Unknown division name ({})".format(division_name)
    for mol_type in mol_types:
        assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type)
    if not os.path.isdir(output_directory):
        os.makedirs(output_directory)
    release_num_file = base_url + 'RELEASE_NUMBER'
    r = requests.get(release_num_file)
    r.raise_for_status()
    release_num = str(int(r.text.strip()))
    division_base_url = base_url + division_name
    if debug:
        print('Retrieving {}'.format(division_base_url), file=sys.stderr)
    r = requests.get(division_base_url)
    # Fail loudly rather than parsing an error page as a directory listing.
    r.raise_for_status()
    listing_text = r.text

    unzip_queues = {}
    unzip_processes = []
    final_output_filenames = []
    for mol_type in mol_types:
        q = unzip_queues[mol_type] = Queue()
        output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta'
        if compress:
            output_filename += '.gz'
        final_output_filenames.append(output_filename)
        unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename),
                                       kwargs=dict(debug=debug, compress=compress)))
        unzip_processes[-1].start()

    try:
        # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M
        for line in StringIO(listing_text):
            if '.gz' not in line:
                continue
            parts = line.split('"')
            assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip())
            filename = parts[1]
            for mol_type in mol_types:
                ending = ending_mappings[mol_type]
                if filename.endswith(ending):
                    if debug:
                        print('Downloading:', filename, ending, mol_type, file=sys.stderr)
                    download_path = os.path.join(output_directory, filename)
                    # stream=True keeps large archives from being buffered
                    # entirely in memory before they are written to disk.
                    r = requests.get(division_base_url + '/' + filename, stream=True)
                    r.raise_for_status()
                    with open(download_path, 'wb') as output_file:
                        for chunk in r.iter_content(chunk_size=4096):
                            output_file.write(chunk)
                    unzip_queues[mol_type].put(download_path)
    finally:
        # Always release the workers, even when a download fails; otherwise
        # they would block on their queues forever.
        for conn in unzip_queues.values():
            conn.put('STOP')
        # Wait for the merged files to be complete before reporting them.
        for process in unzip_processes:
            process.join()

    return [release_num, final_output_filenames]
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
123 |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
124 if __name__ == '__main__': |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
125 parser = argparse.ArgumentParser(description='Download RefSeq databases') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
126 parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
127 parser.add_argument('--compress', default=False, action='store_true', help='Compress output files') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
128 parser.add_argument('--output_directory', default='tmp', help='Directory to write output to') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
129 parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
130 parser.add_argument('--division_names', nargs='+', help='RefSeq divisions to download') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
131 parser.add_argument('--mol_types', nargs='+', help='Molecule types (genomic, rna, protein) to fetch') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
132 parser.add_argument('--pin_date', help='Force download date to this version string') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
133 args = parser.parse_args() |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
134 if args.galaxy_datamanager_filename is not None: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
135 dm_opts = json.loads(open(args.galaxy_datamanager_filename).read()) |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
136 output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
137 data_manager_dict = {} |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
138 else: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
139 output_directory = args.output_directory |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
140 for division_name in args.division_names: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
141 if args.pin_date is not None: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
142 today_str = args.pin_date |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
143 else: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
144 today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
145 [release_num, fasta_files] = get_refseq_division(division_name, args.mol_types, output_directory, args.debug, args.compress) |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
146 if args.galaxy_datamanager_filename is not None: |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
147 for i, mol_type in enumerate(args.mol_types): |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
148 assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i]) |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
149 unique_key = division_name + '.' + release_num + '.' + mol_type + '.' + today_str |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
150 dbkey = division_name + '.' + release_num + '.' + mol_type |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
151 desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')' |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
152 path = os.path.join(output_directory, fasta_files[i]) |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
153 _add_data_table_entry(data_manager_dict=data_manager_dict, |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
154 data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path), |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
155 data_table_name='all_fasta') |
cfe6cd521835
planemo upload for repository https://github.com/pvanheus/refseq_fasta_data_manager commit cfd2aa18826b938402ccfc6003f1793886001202-dirty
sanbi-uwc
parents:
diff
changeset
|
156 open(args.galaxy_datamanager_filename, 'wb').write(json.dumps(data_manager_dict).encode()) |