Mercurial > repos > sanbi-uwc > data_manager_novocraft_index_builder
comparison data_manager/novocraft_index_builder.py @ 4:c276a826fc4b draft
planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit 1e4e16c747ca6ef261d3307f47a09ff1d49756a1
author | sanbi-uwc |
---|---|
date | Sat, 05 Mar 2016 06:47:55 -0500 |
parents | 6cddc1a6e282 |
children | b7c01f1d6451 |
comparison (legend: equal, deleted, inserted, replaced)
3:38d9721a5251 | 4:c276a826fc4b |
---|---|
12 | 12 |
13 log = logging.getLogger(__name__) | 13 log = logging.getLogger(__name__) |
14 | 14 |
15 from json import loads, dumps | 15 from json import loads, dumps |
16 | 16 |
17 DEFAULT_DATA_TABLE_NAME = "novocraft_indexes" | |
18 | |
17 def get_dbkey_id_name(params, dbkey_description=None): | 19 def get_dbkey_id_name(params, dbkey_description=None): |
18 dbkey = params['param_dict']['dbkey'] | 20 dbkey = params['param_dict']['dbkey'] |
19 # TODO: ensure sequence_id is unique and does not already appear in location file | 21 # TODO: ensure sequence_id is unique and does not already appear in location file |
20 sequence_id = params['param_dict']['sequence_id'] | 22 sequence_id = params['param_dict']['sequence_id'] |
21 if not sequence_id: | 23 if not sequence_id: |
26 sequence_name = dbkey_description | 28 sequence_name = dbkey_description |
27 if not sequence_name: | 29 if not sequence_name: |
28 sequence_name = dbkey | 30 sequence_name = dbkey |
29 return dbkey, sequence_id, sequence_name | 31 return dbkey, sequence_id, sequence_name |
30 | 32 |
31 | 33 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME): |
32 def _make_novocraft_index(fasta_filename, target_directory): | |
33 if os.path.exists(target_directory) and not os.path.isdir(target_directory): | 34 if os.path.exists(target_directory) and not os.path.isdir(target_directory): |
34 print("Output directory path already exists but is not a directory: {}".format(target_directory), | 35 print("Output directory path already exists but is not a directory: {}".format(target_directory), |
35 file=sys.stderr) | 36 file=sys.stderr) |
36 elif not os.path.exists(target_directory): | 37 elif not os.path.exists(target_directory): |
37 os.mkdir(target_directory) | 38 os.mkdir(target_directory) |
49 cmdline = ('touch', '{}/foo.nix'.format(target_directory)) | 50 cmdline = ('touch', '{}/foo.nix'.format(target_directory)) |
50 try: | 51 try: |
51 check_call(cmdline) | 52 check_call(cmdline) |
52 except CalledProcessError: | 53 except CalledProcessError: |
53 print("Error building RNA STAR index", file=sys.stderr) | 54 print("Error building RNA STAR index", file=sys.stderr) |
54 return (target_directory) | |
55 | 55 |
56 data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory ) | |
57 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) | |
56 | 58 |
57 def download_from_url(params, target_directory): | 59 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): |
60 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | |
61 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) | |
62 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) | |
63 return data_manager_dict | |
64 | |
65 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): | |
58 # TODO: we should automatically do decompression here | 66 # TODO: we should automatically do decompression here |
59 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) | 67 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) |
60 fasta_reader = [urllib2.urlopen(url) for url in urls] | 68 fasta_reader = [urllib2.urlopen(url) for url in urls] |
61 | 69 |
62 _make_novocraft_index(fasta_reader, target_directory) | 70 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, dbkey, sequence_id, sequence_name, data_table_name) |
63 | 71 |
64 | 72 def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): |
65 def download_from_history( params, target_directory): | |
66 # TODO: allow multiple FASTA input files | 73 # TODO: allow multiple FASTA input files |
67 input_filename = params['param_dict']['reference_source']['input_fasta'] | 74 input_filename = params['param_dict']['reference_source']['input_fasta'] |
68 | 75 |
69 _make_novocraft_index(input_filename, target_directory) | 76 _make_novocraft_index(data_manager_dict, input_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name) |
70 | 77 |
71 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) | 78 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) |
72 | 79 |
73 def main(): | 80 def main(): |
74 parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this") | 81 parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this") |
80 filename = args.output_filename | 87 filename = args.output_filename |
81 | 88 |
82 params = loads(open(filename).read()) | 89 params = loads(open(filename).read()) |
83 target_directory = params['output_data'][0]['extra_files_path'] | 90 target_directory = params['output_data'][0]['extra_files_path'] |
84 os.makedirs(target_directory) | 91 os.makedirs(target_directory) |
92 data_manager_dict = {} | |
85 | 93 |
86 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) | 94 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) |
95 | |
87 if dbkey in [None, '', '?']: | 96 if dbkey in [None, '', '?']: |
88 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey)) | 97 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey)) |
89 | 98 |
90 # Fetch the FASTA | 99 # Fetch the FASTA |
91 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ | 100 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ |
92 (params, target_directory) | 101 (data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME) |
93 | 102 |
94 data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory) | 103 open(filename, 'wb').write(dumps( data_manager_dict )) |
95 | |
96 output_datatable_dict = dict(data_tables={args.data_table_name: [data_table_entry]}) | |
97 open(filename, 'wb').write(dumps(output_datatable_dict)) | |
98 | 104 |
99 if __name__ == "__main__": main() | 105 if __name__ == "__main__": main() |