comparison data_manager/novocraft_index_builder.py @ 17:db293ee25be5 draft

planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit 0745f158bbb0d5d190cc4503157d2d3092ab8cc5
author sanbi-uwc
date Fri, 15 Apr 2016 09:03:40 -0400
parents d053e7b179b5
children 2b89ba1c0057
comparison
equal deleted inserted replaced
16:6437b3c7bb84 17:db293ee25be5
15 from json import loads, dumps 15 from json import loads, dumps
16 16
17 DEFAULT_DATA_TABLE_NAME = "novocraft_index" 17 DEFAULT_DATA_TABLE_NAME = "novocraft_index"
18 18
19 def get_dbkey_id_name(params, dbkey_description=None): 19 def get_dbkey_id_name(params, dbkey_description=None):
20 dbkey = params['param_dict']['dbkey']
21 # TODO: ensure sequence_id is unique and does not already appear in location file 20 # TODO: ensure sequence_id is unique and does not already appear in location file
22 sequence_id = params['param_dict']['sequence_id'] 21 sequence_id = params['param_dict']['sequence_id']
23 if not sequence_id:
24 sequence_id = dbkey # uuid.uuid4() generate and use an uuid instead?
25
26 sequence_name = params['param_dict']['sequence_name'] 22 sequence_name = params['param_dict']['sequence_name']
27 if not sequence_name: 23 if not sequence_name:
28 sequence_name = dbkey_description 24 sequence_name = dbkey_description
29 if not sequence_name: 25 return sequence_id, sequence_name
30 sequence_name = dbkey
31 return dbkey, sequence_id, sequence_name
32 26
33 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME): 27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME):
34 if os.path.exists(target_directory) and not os.path.isdir(target_directory): 28 if os.path.exists(target_directory) and not os.path.isdir(target_directory):
35 print("Output directory path already exists but is not a directory: {}".format(target_directory), 29 print("Output directory path already exists but is not a directory: {}".format(target_directory),
36 file=sys.stderr) 30 file=sys.stderr)
37 elif not os.path.exists(target_directory): 31 elif not os.path.exists(target_directory):
38 os.mkdir(target_directory) 32 os.mkdir(target_directory)
41 index_filename = os.path.join(target_directory, nix_file) 35 index_filename = os.path.join(target_directory, nix_file)
42 cmdline_str = 'novoindex {} {}'.format(index_filename, fasta_filename) 36 cmdline_str = 'novoindex {} {}'.format(index_filename, fasta_filename)
43 cmdline = shlex.split(cmdline_str) 37 cmdline = shlex.split(cmdline_str)
44 38
45 try: 39 try:
46 if add_system_module == 'true':
47 check_call(['module add novoindex'])
48 check_call(cmdline) 40 check_call(cmdline)
49 except CalledProcessError: 41 except CalledProcessError:
50 print("Error building RNA STAR index", file=sys.stderr) 42 print("Error building RNA STAR index", file=sys.stderr)
51 43
52 data_table_entry = dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=index_filename ) 44 data_table_entry = dict( value=sequence_id, dbkey=sequence_id, name=sequence_name, path=index_filename )
53 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ) 45 _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry )
54 46
55 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ): 47 def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
56 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
57 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) 49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] )
58 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) 50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry )
59 return data_manager_dict 51 return data_manager_dict
60 52
61 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME ): 53 def download_from_url( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
62 # TODO: we should automatically do decompression here 54 # TODO: we should automatically do decompression here
63 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n'))) 55 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n')))
64 fasta_reader = [urllib2.urlopen(url) for url in urls] 56 fasta_reader = [urllib2.urlopen(url) for url in urls]
65 57
66 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, dbkey, sequence_id, sequence_name, data_table_name, add_system_module) 58 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, sequence_id, sequence_name, data_table_name)
67 59
68 def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name=DEFAULT_DATA_TABLE_NAME ): 60 def download_from_history( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
69 # TODO: allow multiple FASTA input files 61 # TODO: allow multiple FASTA input files
70 input_filename = params['param_dict']['reference_source']['input_fasta'] 62 input_filename = params['param_dict']['reference_source']['input_fasta']
71 63
72 _make_novocraft_index(data_manager_dict, input_filename, target_directory, dbkey, sequence_id, sequence_name, add_system_module, data_table_name ) 64 _make_novocraft_index(data_manager_dict, input_filename, target_directory, sequence_id, sequence_name, data_table_name )
73 65
74 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history) 66 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)
75 67
76 def main(): 68 def main():
77 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") 69 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this")
78 parser.add_argument('output_filename') 70 parser.add_argument('output_filename')
79 parser.add_argument('--dbkey_description') 71 parser.add_argument('--dbkey_description')
80 parser.add_argument('--data_table_name', default='novocraft_index') 72 parser.add_argument('--data_table_name', default='novocraft_index')
81 parser.add_argument('--add_system_module', default=False)
82 args = parser.parse_args() 73 args = parser.parse_args()
83 74
84 filename = args.output_filename 75 filename = args.output_filename
85 76
86 params = loads(open(filename).read()) 77 params = loads(open(filename).read())
87 target_directory = params['output_data'][0]['extra_files_path'] 78 target_directory = params['output_data'][0]['extra_files_path']
88 os.makedirs(target_directory) 79 os.makedirs(target_directory)
89 data_manager_dict = {} 80 data_manager_dict = {}
90 81
91 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) 82 sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description)
92
93 if dbkey in [None, '', '?']:
94 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))
95 83
96 # Fetch the FASTA 84 # Fetch the FASTA
97 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ 85 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\
98 (data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name, args.add_system_module, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME ) 86 (data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME )
99 87
100 open(filename, 'wb').write(dumps( data_manager_dict )) 88 open(filename, 'wb').write(dumps( data_manager_dict ))
101 89
102 if __name__ == "__main__": main() 90 if __name__ == "__main__": main()