annotate data_manager/fetch_ganon.py @ 1:9de84cd78a82 draft

Uploaded
author estrain
date Fri, 05 Jul 2019 07:56:23 -0400
parents a50614a513f3
children c220590bde7a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a50614a513f3 Uploaded
estrain
parents:
diff changeset
1 #!/usr/bin/env python
a50614a513f3 Uploaded
estrain
parents:
diff changeset
2
a50614a513f3 Uploaded
estrain
parents:
diff changeset
3 import argparse
a50614a513f3 Uploaded
estrain
parents:
diff changeset
4 import json
a50614a513f3 Uploaded
estrain
parents:
diff changeset
5 import os
a50614a513f3 Uploaded
estrain
parents:
diff changeset
6 import os.path
a50614a513f3 Uploaded
estrain
parents:
diff changeset
7 import sys
a50614a513f3 Uploaded
estrain
parents:
diff changeset
8 import ftplib
a50614a513f3 Uploaded
estrain
parents:
diff changeset
9 import socket
a50614a513f3 Uploaded
estrain
parents:
diff changeset
10
a50614a513f3 Uploaded
estrain
parents:
diff changeset
def get_refseq_rrna(rrna):
    """Download a RefSeq bacterial rRNA FASTA archive from the NCBI FTP site.

    Connects anonymously to ``ftp.ncbi.nlm.nih.gov``, changes into
    ``refseq/TargetedLoci/Bacteria/`` and retrieves
    ``bacteria.<rrna>rRNA.fna.gz`` into the current working directory.
    Progress and error messages are printed to stdout.

    Args:
        rrna: Label inserted into the remote file name, e.g. ``"16S"``.

    Returns:
        A one-element list holding the downloaded file name, or ``None``
        when connecting, logging in, changing directory or retrieving the
        file fails.
    """
    host = 'ftp.ncbi.nlm.nih.gov'
    folder_path = 'refseq/TargetedLoci/Bacteria/'
    file_name = "bacteria." + rrna + "rRNA.fna.gz"

    try:
        f = ftplib.FTP(host)
    except (socket.error, socket.gaierror):
        print('ERROR: cannot reach "%s"' % host)
        return
    print('*** Connected to host "%s"' % host)

    try:
        f.login()
    except ftplib.error_perm:
        print('ERROR: cannot login anonymously')
        f.quit()
        return
    print('*** Logged in as "anonymous"')

    try:
        f.cwd(folder_path)
    except ftplib.error_perm:
        print('ERROR: cannot CD to "%s"' % folder_path)
        f.quit()
        return
    print('*** Changed to "%s" folder' % folder_path)

    try:
        # The context manager closes (and flushes) the local file before it
        # is either returned to the caller or removed on failure.
        with open(file_name, 'wb') as local_file:
            f.retrbinary('RETR %s' % file_name, local_file.write)
    except ftplib.error_perm:
        print('ERROR: cannot read file "%s"' % file_name)
        # Remove the empty/partial file so nothing mistakes it for a download.
        os.unlink(file_name)
        f.quit()
        # Report failure like the other error paths instead of returning the
        # name of a file that no longer exists.
        return

    print('*** Downloaded "%s" to CWD' % file_name)
    f.quit()
    return [file_name]
a50614a513f3 Uploaded
estrain
parents:
diff changeset
51
a50614a513f3 Uploaded
estrain
parents:
diff changeset
52 def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name):
a50614a513f3 Uploaded
estrain
parents:
diff changeset
53 data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
a50614a513f3 Uploaded
estrain
parents:
diff changeset
54 data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('ganon_databases', [])
a50614a513f3 Uploaded
estrain
parents:
diff changeset
55 data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
a50614a513f3 Uploaded
estrain
parents:
diff changeset
56 return data_manager_dict
a50614a513f3 Uploaded
estrain
parents:
diff changeset
57
a50614a513f3 Uploaded
estrain
parents:
diff changeset
if __name__ == '__main__':
    # Script entry point: parse options, download the requested rRNA archive
    # and write a JSON description of the new 'ganon_databases' entry.
    parser = argparse.ArgumentParser(description='Download RefSeq rRNA bacterial databases')
    parser.add_argument('--output_directory', default='/tool-data/ganon', help='Directory to write output to')
    # Required: the value is concatenated into the remote file name, so a
    # missing option would otherwise fail later with a TypeError.
    parser.add_argument('--rrna', required=True, help='rRNA sequences to download (5S, 16S, or 23S)')
    args = parser.parse_args()

    output_directory = args.output_directory
    if not os.path.exists(output_directory):
        # makedirs also creates missing parents (the default path is nested).
        os.makedirs(output_directory)

    outfile = get_refseq_rrna(args.rrna)
    if not outfile:
        # Do not register a data table entry for a download that failed.
        sys.exit('ERROR: could not download %s rRNA sequences' % args.rrna)

    data_manager_dict = {}
    _add_data_table_entry(data_manager_dict=data_manager_dict,
                          data_table_entry=dict(value=args.rrna, dbkey=args.rrna, name=args.rrna, path=args.output_directory),
                          data_table_name='ganon_databases')
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open("output_file", 'w') as json_out:
        json_out.write(json.dumps(data_manager_dict, sort_keys=True))
a50614a513f3 Uploaded
estrain
parents:
diff changeset
76