Mercurial > repos > sanbi-uwc > data_manager_novocraft_index_builder
comparison data_manager/novocraft_index_builder.py @ 18:2b89ba1c0057 draft default tip
planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit c8c46a5600bb091d701b8cf78f80a50c6b6812f4
author | sanbi-uwc |
---|---|
date | Tue, 21 Jun 2016 10:12:02 -0400 |
parents | db293ee25be5 |
children |
comparison
equal
deleted
inserted
replaced
17:db293ee25be5 | 18:2b89ba1c0057 |
---|---|
2 # Z. Mashologu (SANBI-UWC) | 2 # Z. Mashologu (SANBI-UWC) |
3 # import dict as dict | 3 # import dict as dict |
4 from __future__ import print_function | 4 from __future__ import print_function |
5 import os | 5 import os |
6 import sys | 6 import sys |
7 import urllib2 | |
8 import logging | 7 import logging |
9 import argparse | 8 import argparse |
10 import shlex | 9 import shlex |
11 from subprocess import check_call, CalledProcessError | 10 from subprocess import check_call, CalledProcessError |
12 | 11 |
14 | 13 |
15 from json import loads, dumps | 14 from json import loads, dumps |
16 | 15 |
17 DEFAULT_DATA_TABLE_NAME = "novocraft_index" | 16 DEFAULT_DATA_TABLE_NAME = "novocraft_index" |
18 | 17 |
19 def get_dbkey_id_name(params, dbkey_description=None): | 18 def get_dbkey_id_name(params): |
20 # TODO: ensure sequence_id is unique and does not already appear in location file | 19 # TODO: ensure sequence_id is unique and does not already appear in location file |
21 sequence_id = params['param_dict']['sequence_id'] | 20 sequence_id = params['param_dict']['sequence_id'] |
22 sequence_name = params['param_dict']['sequence_name'] | 21 sequence_name = params['param_dict']['sequence_name'] |
23 if not sequence_name: | 22 sequence_desc = params['param_dict']['sequence_desc'] |
24 sequence_name = dbkey_description | 23 if not sequence_desc: |
25 return sequence_id, sequence_name | 24 sequence_desc = sequence_name |
| | 25 return sequence_id, sequence_name, sequence_desc |
26 | 26 |
27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME): | 27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME): |
28 if os.path.exists(target_directory) and not os.path.isdir(target_directory): | 28 if os.path.exists(target_directory) and not os.path.isdir(target_directory): |
29 print("Output directory path already exists but is not a directory: {}".format(target_directory), | 29 print("Output directory path already exists but is not a directory: {}".format(target_directory), |
30 file=sys.stderr) | 30 file=sys.stderr) |
48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) |
49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) | 49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) |
50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) | 50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) |
51 return data_manager_dict | 51 return data_manager_dict |
52 | 52 |
53 def download_from_url( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): | |
54 # TODO: we should automatically do decompression here | |
55 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) | |
56 fasta_reader = [urllib2.urlopen(url) for url in urls] | |
57 | |
58 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, sequence_id, sequence_name, data_table_name) | |
59 | |
60 def download_from_history( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): | |
61 # TODO: allow multiple FASTA input files | |
62 input_filename = params['param_dict']['reference_source']['input_fasta'] | |
63 | |
64 _make_novocraft_index(data_manager_dict, input_filename, target_directory, sequence_id, sequence_name, data_table_name ) | |
65 | |
66 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) | |
67 | |
68 def main(): | 53 def main(): |
69 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") | 54 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") |
70 parser.add_argument('output_filename') | 55 parser.add_argument('output_filename') |
71 parser.add_argument('--dbkey_description') | 56 parser.add_argument('--input_filename') |
72 parser.add_argument('--data_table_name', default='novocraft_index') | 57 parser.add_argument('--data_table_name', default='novocraft_index') |
73 args = parser.parse_args() | 58 args = parser.parse_args() |
74 | 59 |
75 filename = args.output_filename | 60 filename = args.output_filename |
76 | 61 |
77 params = loads(open(filename).read()) | 62 params = loads(open(filename).read()) |
78 target_directory = params['output_data'][0]['extra_files_path'] | 63 target_directory = params['output_data'][0]['extra_files_path'] |
79 os.makedirs(target_directory) | 64 os.makedirs(target_directory) |
80 data_manager_dict = {} | 65 data_manager_dict = {} |
81 | 66 |
82 sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) | 67 sequence_id, sequence_name, sequence_desc = get_dbkey_id_name(params) |
83 | 68 |
84 # Fetch the FASTA | 69 #Make novocraft index |
85 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ | 70 _make_novocraft_index(data_manager_dict, args.input_filename, target_directory, sequence_id, sequence_name, args.data_table_name or DEFAULT_DATA_TABLE_NAME ) |
86 (data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME ) | |
87 | 71 |
88 open(filename, 'wb').write(dumps( data_manager_dict )) | 72 open(filename, 'wb').write(dumps( data_manager_dict )) |
89 | 73 |
90 if __name__ == "__main__": main() | 74 if __name__ == "__main__": main() |