comparison data_manager/novocraft_index_builder.py @ 18:2b89ba1c0057 draft default tip

planemo upload for repository https://github.com/zipho/data_manager_novocraft_index_builder commit c8c46a5600bb091d701b8cf78f80a50c6b6812f4
author sanbi-uwc
date Tue, 21 Jun 2016 10:12:02 -0400
parents db293ee25be5
children
comparison
legend: equal | deleted | inserted | replaced
left column: 17:db293ee25be5 (parent) — right column: 18:2b89ba1c0057 (this changeset)
2 # Z. Mashologu (SANBI-UWC) 2 # Z. Mashologu (SANBI-UWC)
3 # import dict as dict 3 # import dict as dict
4 from __future__ import print_function 4 from __future__ import print_function
5 import os 5 import os
6 import sys 6 import sys
7 import urllib2
8 import logging 7 import logging
9 import argparse 8 import argparse
10 import shlex 9 import shlex
11 from subprocess import check_call, CalledProcessError 10 from subprocess import check_call, CalledProcessError
12 11
14 13
15 from json import loads, dumps 14 from json import loads, dumps
16 15
17 DEFAULT_DATA_TABLE_NAME = "novocraft_index" 16 DEFAULT_DATA_TABLE_NAME = "novocraft_index"
18 17
19 def get_dbkey_id_name(params, dbkey_description=None): 18 def get_dbkey_id_name(params):
20 # TODO: ensure sequence_id is unique and does not already appear in location file 19 # TODO: ensure sequence_id is unique and does not already appear in location file
21 sequence_id = params['param_dict']['sequence_id'] 20 sequence_id = params['param_dict']['sequence_id']
22 sequence_name = params['param_dict']['sequence_name'] 21 sequence_name = params['param_dict']['sequence_name']
23 if not sequence_name: 22 sequence_desc = params['param_dict']['sequence_desc']
24 sequence_name = dbkey_description 23 if not sequence_desc:
25 return sequence_id, sequence_name 24 sequence_desc = sequence_name
25 return sequence_id, sequence_name, sequence_desc
26 26
27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME): 27 def _make_novocraft_index(data_manager_dict, fasta_filename, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME):
28 if os.path.exists(target_directory) and not os.path.isdir(target_directory): 28 if os.path.exists(target_directory) and not os.path.isdir(target_directory):
29 print("Output directory path already exists but is not a directory: {}".format(target_directory), 29 print("Output directory path already exists but is not a directory: {}".format(target_directory),
30 file=sys.stderr) 30 file=sys.stderr)
48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) 48 data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] ) 49 data_manager_dict['data_tables'][ data_table_name ] = data_manager_dict['data_tables'].get( data_table_name, [] )
50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry ) 50 data_manager_dict['data_tables'][ data_table_name ].append( data_table_entry )
51 return data_manager_dict 51 return data_manager_dict
52 52
53 def download_from_url( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
54 # TODO: we should automatically do decompression here
55 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n')))
56 fasta_reader = [urllib2.urlopen(url) for url in urls]
57
58 _make_novocraft_index(data_manager_dict, fasta_reader, target_directory, sequence_id, sequence_name, data_table_name)
59
60 def download_from_history( data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=DEFAULT_DATA_TABLE_NAME ):
61 # TODO: allow multiple FASTA input files
62 input_filename = params['param_dict']['reference_source']['input_fasta']
63
64 _make_novocraft_index(data_manager_dict, input_filename, target_directory, sequence_id, sequence_name, data_table_name )
65
66 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)
67
68 def main(): 53 def main():
69 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this") 54 parser = argparse.ArgumentParser(description="Generate Novo-craft genome index and JSON describing this")
70 parser.add_argument('output_filename') 55 parser.add_argument('output_filename')
71 parser.add_argument('--dbkey_description') 56 parser.add_argument('--input_filename')
72 parser.add_argument('--data_table_name', default='novocraft_index') 57 parser.add_argument('--data_table_name', default='novocraft_index')
73 args = parser.parse_args() 58 args = parser.parse_args()
74 59
75 filename = args.output_filename 60 filename = args.output_filename
76 61
77 params = loads(open(filename).read()) 62 params = loads(open(filename).read())
78 target_directory = params['output_data'][0]['extra_files_path'] 63 target_directory = params['output_data'][0]['extra_files_path']
79 os.makedirs(target_directory) 64 os.makedirs(target_directory)
80 data_manager_dict = {} 65 data_manager_dict = {}
81 66
82 sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=args.dbkey_description) 67 sequence_id, sequence_name, sequence_desc = get_dbkey_id_name(params)
83 68
84 # Fetch the FASTA 69 #Make novocraft index
85 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\ 70 _make_novocraft_index(data_manager_dict, args.input_filename, target_directory, sequence_id, sequence_name, args.data_table_name or DEFAULT_DATA_TABLE_NAME )
86 (data_manager_dict, params, target_directory, sequence_id, sequence_name, data_table_name=args.data_table_name or DEFAULT_DATA_TABLE_NAME )
87 71
88 open(filename, 'wb').write(dumps( data_manager_dict )) 72 open(filename, 'wb').write(dumps( data_manager_dict ))
89 73
90 if __name__ == "__main__": main() 74 if __name__ == "__main__": main()