Mercurial > repos > sanbi-uwc > data_manager_novocraft_index_builder
comparison data_manager/novocraft_index_builder.py @ 17:db293ee25be5 draft
planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit 0745f158bbb0d5d190cc4503157d2d3092ab8cc5
author | sanbi-uwc |
---|---|
date | Fri, 15 Apr 2016 09:03:40 -0400 |
parents | d053e7b179b5 |
children | 2b89ba1c0057 |
comparison
equal
deleted
inserted
replaced
16:6437b3c7bb84 | 17:db293ee25be5 |
---|---|
15 from json import loads, dumps | 15 from json import loads, dumps |
16 | 16 |
17 DEFAULT_DATA_TABLE_NAME = "novocraft_index" | 17 DEFAULT_DATA_TABLE_NAME = "novocraft_index" |
18 | 18 |
19 def get_dbkey_id_name(params, dbkey_description=None): | 19 def get_dbkey_id_name(params, dbkey_description=None): |
20 dbkey = params['param_dict']['dbkey'] | |
21 # TODO: ensure sequence_id is unique and does not already appear in location file | 20 # TODO: ensure sequence_id is unique and does not already appear in location file |
22 sequence_id = params['param_dict']['sequence_id'] | 21 sequence_id = params['param_dict']['sequence_id'] |
23 if not sequence_id: | |
24 sequence_id = dbkey # uuid.uuid4() generate and use an uuid instead? | |
25 | |
26 sequence_name = params['param_dict']['sequence_name'] | 22 sequence_name = params['param_dict']['sequence_name'] |
27 if not sequence_name: | 23 if not sequence_name: |
28 sequence_name = dbkey_description | 24 sequence_name = dbkey_description |
29 if not sequence_name: | 25 return sequence_id, sequence_name |
30 sequence_name = dbkey | |
31 return dbkey, sequence_id, sequence_name | |
32 | 26 |
33 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME): | 27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME): |
34 if os.path.exists(target_directory) and not os.path.isdir(target_directory): | 28 if os.path.exists(target_directory) and not os.path.isdir(target_directory): |
35 print("Output directory path already exists but is not a directory: {}".format(target_directory), | 29 print("Output directory path already exists but is not a directory: {}".format(target_directory), |
36 file=sys.stderr) | 30 file=sys.stderr) |
37 elif not os.path.exists(target_directory): | 31 elif not os.path.exists(target_directory): |
38 os.mkdir(target_directory) | 32 os.mkdir(target_directory) |
41 index_filename = os.path.join(target_directory, nix_file) | 35 index_filename = os.path.join(target_directory, nix_file) |
42 cmdline_str = 'novoindex {} {}'.format(index_filename, fasta_filename) | 36 cmdline_str = 'novoindex {} {}'.format(index_filename, fasta_filename) |
43 cmdline = shlex.split(cmdline_str) | 37 cmdline = shlex.split(cmdline_str) |
44 | 38 |
45 try: | 39 try: |
46 if add_system_module == 'true': | |
47 check_call(['module add novoindex']) | |
48 check_call(cmdline) | 40 check_call(cmdline) |
49 except CalledProcessError: | 41 except CalledProcessError: |
50 print("Error building RNA STAR index", file=sys.stderr) | 42 print("Error building RNA STAR index", file=sys.stderr) |
51 | 43 |
52 data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=index_filename ) | 44 data_table_entry = dict( value=sequence_id, dbkey=sequence_id, name=sequence_name, path=index_filename ) |
53 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) | 45 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) |
54 | 46 |
55 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): | 47 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): |
56 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) | 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) |
57 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) | 49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) |
58 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) | 50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) |
59 return data_manager_dict | 51 return data_manager_dict |
60 | 52 |
61 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME ): | 53 def download_from_url( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): |
62 # TODO: we should automatically do decompression here | 54 # TODO: we should automatically do decompression here |
63 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) | 55 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) |
64 fasta_reader = [urllib2.urlopen(url) for url in urls] | 56 fasta_reader = [urllib2.urlopen(url) for url in urls] |
65 | 57 |
66 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, dbkey, sequence_id, sequence_name, data_table_name, add_system_module) | 58 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, sequence_id, sequence_name, data_table_name) |
67 | 59 |
68 def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME ): | 60 def download_from_history( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ): |
69 # TODO: allow multiple FASTA input files | 61 # TODO: allow multiple FASTA input files |
70 input_filename = params['param_dict']['reference_source']['input_fasta'] | 62 input_filename = params['param_dict']['reference_source']['input_fasta'] |
71 | 63 |
72 _make_novocraft_index(data_manager_dict, input_filename, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name ) | 64 _make_novocraft_index(data_manager_dict, input_filename, target_directory, sequence_id, sequence_name, data_table_name ) |
73 | 65 |
74 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) | 66 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) |
75 | 67 |
76 def main(): | 68 def main(): |
77 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") | 69 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") |
78 parser.add_argument('output_filename') | 70 parser.add_argument('output_filename') |
79 parser.add_argument('--dbkey_description') | 71 parser.add_argument('--dbkey_description') |
80 parser.add_argument('--data_table_name', default='novocraft_index') | 72 parser.add_argument('--data_table_name', default='novocraft_index') |
81 parser.add_argument('--add_system_module', default=False) | |
82 args = parser.parse_args() | 73 args = parser.parse_args() |
83 | 74 |
84 filename = args.output_filename | 75 filename = args.output_filename |
85 | 76 |
86 params = loads(open(filename).read()) | 77 params = loads(open(filename).read()) |
87 target_directory = params['output_data'][0]['extra_files_path'] | 78 target_directory = params['output_data'][0]['extra_files_path'] |
88 os.makedirs(target_directory) | 79 os.makedirs(target_directory) |
89 data_manager_dict = {} | 80 data_manager_dict = {} |
90 | 81 |
91 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) | 82 sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) |
92 | |
93 if dbkey in [None, '', '?']: | |
94 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey)) | |
95 | 83 |
96 # Fetch the FASTA | 84 # Fetch the FASTA |
97 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ | 85 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ |
98 (data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, args.add_system_module, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME ) | 86 (data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME ) |
99 | 87 |
100 open(filename, 'wb').write(dumps( data_manager_dict )) | 88 open(filename, 'wb').write(dumps( data_manager_dict )) |
101 | 89 |
102 if __name__ == "__main__": main() | 90 if __name__ == "__main__": main() |