comparison data_manager/novocraft_index_builder.py @ 4:c276a826fc4b draft

planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit 1e4e16c747ca6ef261d3307f47a09ff1d49756a1
author sanbi-uwc
date Sat, 05 Mar 2016 06:47:55 -0500
parents 6cddc1a6e282
children b7c01f1d6451
comparison
equal deleted inserted replaced
3:38d9721a5251 4:c276a826fc4b
12 12
13 log = logging.getLogger(__name__) 13 log = logging.getLogger(__name__)
14 14
15 from json import loads, dumps 15 from json import loads, dumps
16 16
17 DEFAULT_DATA_TABLE_NAME = "novocraft_indexes"
18
17 def get_dbkey_id_name(params, dbkey_description=None): 19 def get_dbkey_id_name(params, dbkey_description=None):
18 dbkey = params['param_dict']['dbkey'] 20 dbkey = params['param_dict']['dbkey']
19 # TODO: ensure sequence_id is unique and does not already appear in location file 21 # TODO: ensure sequence_id is unique and does not already appear in location file
20 sequence_id = params['param_dict']['sequence_id'] 22 sequence_id = params['param_dict']['sequence_id']
21 if not sequence_id: 23 if not sequence_id:
26 sequence_name = dbkey_description 28 sequence_name = dbkey_description
27 if not sequence_name: 29 if not sequence_name:
28 sequence_name = dbkey 30 sequence_name = dbkey
29 return dbkey, sequence_id, sequence_name 31 return dbkey, sequence_id, sequence_name
30 32
31 33 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME):
32 def _make_novocraft_index(fasta_filename, target_directory):
33 if os.path.exists(target_directory) and not os.path.isdir(target_directory): 34 if os.path.exists(target_directory) and not os.path.isdir(target_directory):
34 print("Output directory path already exists but is not a directory: {}".format(target_directory), 35 print("Output directory path already exists but is not a directory: {}".format(target_directory),
35 file=sys.stderr) 36 file=sys.stderr)
36 elif not os.path.exists(target_directory): 37 elif not os.path.exists(target_directory):
37 os.mkdir(target_directory) 38 os.mkdir(target_directory)
49 cmdline = ('touch', '{}/foo.nix'.format(target_directory)) 50 cmdline = ('touch', '{}/foo.nix'.format(target_directory))
50 try: 51 try:
51 check_call(cmdline) 52 check_call(cmdline)
52 except CalledProcessError: 53 except CalledProcessError:
53 print("Error building RNA STAR index", file=sys.stderr) 54 print("Error building RNA STAR index", file=sys.stderr)
54 return (target_directory)
55 55
56 data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory )
57 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
56 58
57 def download_from_url(params, target_directory): 59 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
60 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
61 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] )
62 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry )
63 return data_manager_dict
64
65 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
58 # TODO: we should automatically do decompression here 66 # TODO: we should automatically do decompression here
59 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) 67 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n')))
60 fasta_reader = [urllib2.urlopen(url) for url in urls] 68 fasta_reader = [urllib2.urlopen(url) for url in urls]
61 69
62 _make_novocraft_index(fasta_reader, target_directory) 70 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, dbkey, sequence_id, sequence_name, data_table_name)
63 71
64 72 def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
65 def download_from_history( params, target_directory):
66 # TODO: allow multiple FASTA input files 73 # TODO: allow multiple FASTA input files
67 input_filename = params['param_dict']['reference_source']['input_fasta'] 74 input_filename = params['param_dict']['reference_source']['input_fasta']
68 75
69 _make_novocraft_index(input_filename, target_directory) 76 _make_novocraft_index(data_manager_dict, input_filename, target_directory, dbkey, sequence_id, sequence_name, data_table_name)
70 77
71 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) 78 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)
72 79
73 def main(): 80 def main():
74 parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this") 81 parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this")
80 filename = args.output_filename 87 filename = args.output_filename
81 88
82 params = loads(open(filename).read()) 89 params = loads(open(filename).read())
83 target_directory = params['output_data'][0]['extra_files_path'] 90 target_directory = params['output_data'][0]['extra_files_path']
84 os.makedirs(target_directory) 91 os.makedirs(target_directory)
92 data_manager_dict = {}
85 93
86 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) 94 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description)
95
87 if dbkey in [None, '', '?']: 96 if dbkey in [None, '', '?']:
88 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey)) 97 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))
89 98
90 # Fetch the FASTA 99 # Fetch the FASTA
91 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ 100 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\
92 (params, target_directory) 101 (data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME)
93 102
94 data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory) 103 open(filename, 'wb').write(dumps( data_manager_dict ))
95
96 output_datatable_dict = dict(data_tables={args.data_table_name: [data_table_entry]})
97 open(filename, 'wb').write(dumps(output_datatable_dict))
98 104
99 if __name__ == "__main__": main() 105 if __name__ == "__main__": main()