comparison data_manager/novoalign_index_builder.py @ 2:e51fb8188ed9 draft

planemo upload for repository https://github.com/zipho/data_manager_novoalign_index_builder commit 7c74dfc33ffac5de44aae81def9c374f5f5e2a20
author sanbi-uwc
date Thu, 03 Mar 2016 08:40:42 -0500
parents 85fbd52dbb36
children b63a406719c5
comparison
equal deleted inserted replaced
1:4d67344bdea7 2:e51fb8188ed9
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Z. Mashologu (SANBI-UWC) 2 # Z. Mashologu (SANBI-UWC)
3 #import dict as dict 3 # import dict as dict
4 import os 4 import os
5 import shutil 5 import sys
6 import optparse
7 import urllib2 6 import urllib2
8 import logging 7 import logging
9 log = logging.getLogger( __name__ ) 8 import argparse
9 import shlex
10 from subprocess import check_call, CalledProcessError
11 from __future__ import print_function
12
13 log = logging.getLogger(__name__)
10 14
11 from json import loads, dumps 15 from json import loads, dumps
12 16
13 def cleanup_before_exit( tmp_dir ): 17 def get_dbkey_id_name(params, dbkey_description=None):
14 if tmp_dir and os.path.exists( tmp_dir ): 18 dbkey = params['param_dict']['dbkey']
15 shutil.rmtree( tmp_dir ) 19 # TODO: ensure sequence_id is unique and does not already appear in location file
20 sequence_id = params['param_dict']['sequence_id']
21 if not sequence_id:
22 sequence_id = dbkey # uuid.uuid4() generate and use an uuid instead?
16 23
17 def _stream_fasta_to_file( fasta_stream, target_directory, params, close_stream=True ): 24 sequence_name = params['param_dict']['sequence_name']
18 fasta_base_filename = "%s.fa" % sequence_id 25 if not sequence_name:
19 fasta_filename = os.path.join( target_directory, fasta_base_filename ) 26 sequence_name = dbkey_description
20 fasta_writer = open( fasta_filename, 'wb+' ) 27 if not sequence_name:
28 sequence_name = dbkey
29 return dbkey, sequence_id, sequence_name
21 30
22 if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1:
23 fasta_stream = fasta_stream[0]
24 31
25 if isinstance( fasta_stream, list ): 32 def _make_novocraft_index(fasta_filename, target_directory):
26 last_char = None 33 if os.path.exists(target_directory) and not os.path.isdir(target_directory):
27 for fh in fasta_stream: 34 print("Output directory path already exists but is not a directory: {}".format(target_directory),
28 if last_char not in [ None, '\n', '\r' ]: 35 file=sys.stderr)
29 fasta_writer.write( '\n' ) 36 elif not os.path.exists(target_directory):
30 while True: 37 os.mkdir(target_directory)
31 data = fh.read( CHUNK_SIZE ) 38
32 if data: 39 if 'GALAXY_SLOTS' in os.environ:
33 fasta_writer.write( data ) 40 nslots = os.environ['GALAXY_SLOTS']
34 last_char = data[-1]
35 else:
36 break
37 if close_stream:
38 fh.close()
39 else: 41 else:
40 while True: 42 nslots = 1
41 data = fasta_stream.read( CHUNK_SIZE )
42 if data:
43 fasta_writer.write( data )
44 else:
45 break
46 if close_stream:
47 fasta_stream.close()
48 43
49 fasta_writer.close() 44 cmdline_str = 'STAR --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} --runThreadN {}'.format(
45 target_directory,
46 fasta_filename,
47 nslots)
48 cmdline = shlex.split(cmdline_str)
49 try:
50 check_call(cmdline)
51 except CalledProcessError:
52 print("Error building RNA STAR index", file=sys.stderr)
53 return (target_directory)
50 54
51 return dict( path=fasta_base_filename )
52 55
53 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): 56 def download_from_url(params, target_directory):
54 #TODO: we should automatically do decompression here 57 # TODO: we should automatically do decompression here
55 urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) 58 urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n')))
56 fasta_reader = [ urllib2.urlopen( url ) for url in urls ] 59 fasta_reader = [urllib2.urlopen(url) for url in urls]
57 60
58 data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params ) 61 _make_novocraft_index(fasta_reader, target_directory)
59 _add_data_table_entry( data_manager_dict, data_table_entry )
60 62
61 def download_from_history( data_manager_dict, params, target_directory): 63
62 #TODO: allow multiple FASTA input files 64 def download_from_history( params, target_directory):
65 # TODO: allow multiple FASTA input files
63 input_filename = params['param_dict']['reference_source']['input_fasta'] 66 input_filename = params['param_dict']['reference_source']['input_fasta']
64 if isinstance( input_filename, list ):
65 fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
66 else:
67 fasta_reader = open( input_filename )
68 67
69 data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params ) 68 _make_novocraft_index(input_filename, target_directory)
70 _add_data_table_entry( data_manager_dict, data_table_entry )
71 69
72 def copy_from_directory( data_manager_dict, params, target_directory ): 70 REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)
73 input_filename = params['param_dict']['reference_source']['fasta_filename']
74 create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink'
75 if create_symlink:
76 data_table_entry = _create_symlink( input_filename, target_directory )
77 else:
78 if isinstance( input_filename, list ):
79 fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
80 else:
81 fasta_reader = open( input_filename )
82 data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params )
83 _add_data_table_entry( data_manager_dict, data_table_entry )
84
85 def _create_symlink( input_filename, target_directory ):
86 fasta_base_filename = "%s.fa" % sequence_id
87 fasta_filename = os.path.join( target_directory, fasta_base_filename )
88 os.symlink( input_filename, fasta_filename )
89 return dict( path=fasta_base_filename )
90
91 REFERENCE_SOURCE_TO_DOWNLOAD = dict( url=download_from_url, history=download_from_history, directory=copy_from_directory )
92 71
93 def main(): 72 def main():
94 #Parse Command Line 73 parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this")
95 parser = optparse.OptionParser() 74 parser.add_argument('output_filename')
96 parser.add_option( '-d', '--data_table_name' ) 75 parser.add_argument('--data_table_name', default='novocraft_index')
97 (options, args) = parser.parse_args() 76 (options, args) = parser.parse_args()
98 77
99 filename = args[0] 78 filename = args.output_filename
100 79
101 params = loads( open( filename ).read() ) 80 params = loads(open(filename).read())
102 target_directory = params[ 'output_data' ][0]['extra_files_path'] 81 target_directory = params['output_data'][0]['extra_files_path']
103 os.mkdir( target_directory ) 82 os.mkdir(target_directory)
104 data_manager_dict = {}
105 83
106 #Fetch the FASTA 84 dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=options.dbkey_description)
107 REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory ) 85 if dbkey in [None, '', '?']:
86 raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))
108 87
109 #save info to json file 88 # Fetch the FASTA
110 open( filename, 'wb' ).write( dumps( data_manager_dict ) ) 89 REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\
90 (params, target_directory, dbkey, sequence_id, sequence_name)
91
92 data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory)
93
94 output_datatable_dict = dict(data_tables={args.data_table_name: [data_table_entry]})
95 open(filename, 'wb').write(dumps(output_datatable_dict))
111 96
112 if __name__ == "__main__": main() 97 if __name__ == "__main__": main()