comparison: data_manager/novoalign_index_builder.py @ 2:e51fb8188ed9 (draft)
planemo upload for repository https://github.com/zipho/data_manager_novoalign_index_builder commit 7c74dfc33ffac5de44aae81def9c374f5f5e2a20

author    sanbi-uwc
date      Thu, 03 Mar 2016 08:40:42 -0500
parents   85fbd52dbb36
children  b63a406719c5
--- data_manager/novoalign_index_builder.py    1:4d67344bdea7
+++ data_manager/novoalign_index_builder.py    2:e51fb8188ed9
@@ -1,12 +1,16 @@
 #!/usr/bin/env python
 # Z. Mashologu (SANBI-UWC)
-#import dict as dict
+# import dict as dict
 import os
-import shutil
-import optparse
+import sys
 import urllib2
 import logging
-log = logging.getLogger( __name__ )
+import argparse
+import shlex
+from subprocess import check_call, CalledProcessError
+from __future__ import print_function
+
+log = logging.getLogger(__name__)
 
 from json import loads, dumps
 
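Note on the hunk above: under Python 2, a `from __future__ import` must be the first statement in the module (only a docstring, comments or blank lines may precede it), so placing `from __future__ import print_function` after the other imports makes the committed script fail with a SyntaxError on import. A minimal reordering sketch, using only the modules the file already imports:

# Sketch: header reordered so the __future__ import comes first.
from __future__ import print_function

import argparse
import logging
import os
import shlex
import sys
import urllib2
from json import loads, dumps
from subprocess import CalledProcessError, check_call

log = logging.getLogger(__name__)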
@@ -13,9 +17,14 @@
-def cleanup_before_exit( tmp_dir ):
-    if tmp_dir and os.path.exists( tmp_dir ):
-        shutil.rmtree( tmp_dir )
+def get_dbkey_id_name(params, dbkey_description=None):
+    dbkey = params['param_dict']['dbkey']
+    # TODO: ensure sequence_id is unique and does not already appear in location file
+    sequence_id = params['param_dict']['sequence_id']
+    if not sequence_id:
+        sequence_id = dbkey  # uuid.uuid4() generate and use an uuid instead?
 
-def _stream_fasta_to_file( fasta_stream, target_directory, params, close_stream=True ):
-    fasta_base_filename = "%s.fa" % sequence_id
-    fasta_filename = os.path.join( target_directory, fasta_base_filename )
-    fasta_writer = open( fasta_filename, 'wb+' )
+    sequence_name = params['param_dict']['sequence_name']
+    if not sequence_name:
+        sequence_name = dbkey_description
+        if not sequence_name:
+            sequence_name = dbkey
+    return dbkey, sequence_id, sequence_name
 
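The new get_dbkey_id_name() helper reads dbkey, sequence_id and sequence_name from the Galaxy-supplied param_dict, falling back to the dbkey description and finally to the dbkey itself when the optional fields are empty. A small illustration with a hypothetical param_dict (the real dict is the JSON blob Galaxy writes for the data manager run):

# Hypothetical input, for illustration only.
params = {'param_dict': {'dbkey': 'hg19', 'sequence_id': '', 'sequence_name': ''}}
print(get_dbkey_id_name(params, dbkey_description='Human (hg19)'))
# -> ('hg19', 'hg19', 'Human (hg19)')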
@@ -22,31 +31,25 @@
-    if isinstance( fasta_stream, list ) and len( fasta_stream ) == 1:
-        fasta_stream = fasta_stream[0]
 
-    if isinstance( fasta_stream, list ):
-        last_char = None
-        for fh in fasta_stream:
-            if last_char not in [ None, '\n', '\r' ]:
-                fasta_writer.write( '\n' )
-            while True:
-                data = fh.read( CHUNK_SIZE )
-                if data:
-                    fasta_writer.write( data )
-                    last_char = data[-1]
-                else:
-                    break
-            if close_stream:
-                fh.close()
+def _make_novocraft_index(fasta_filename, target_directory):
+    if os.path.exists(target_directory) and not os.path.isdir(target_directory):
+        print("Output directory path already exists but is not a directory: {}".format(target_directory),
+              file=sys.stderr)
+    elif not os.path.exists(target_directory):
+        os.mkdir(target_directory)
+
+    if 'GALAXY_SLOTS' in os.environ:
+        nslots = os.environ['GALAXY_SLOTS']
     else:
-        while True:
-            data = fasta_stream.read( CHUNK_SIZE )
-            if data:
-                fasta_writer.write( data )
-            else:
-                break
-        if close_stream:
-            fasta_stream.close()
+        nslots = 1
 
-    fasta_writer.close()
+    cmdline_str = 'STAR --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} --runThreadN {}'.format(
+        target_directory,
+        fasta_filename,
+        nslots)
+    cmdline = shlex.split(cmdline_str)
+    try:
+        check_call(cmdline)
+    except CalledProcessError:
+        print("Error building RNA STAR index", file=sys.stderr)
+    return (target_directory)
 
-    return dict( path=fasta_base_filename )
 
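Note on the hunk above: despite its name, _make_novocraft_index() shells out to STAR --runMode genomeGenerate and reports "Error building RNA STAR index" on failure, rather than invoking Novocraft's novoindex; GALAXY_SLOTS, when set by the Galaxy job runner, supplies the thread count. For comparison, a sketch of what a Novocraft-based helper might look like, assuming novoindex is on $PATH and is invoked as "novoindex <index-file> <fasta>" (that invocation is an assumption, not taken from this repository):

# Sketch only: hypothetical novoindex-based variant of the helper.
def _make_novoindex(fasta_filename, target_directory, sequence_id):
    if not os.path.isdir(target_directory):
        os.mkdir(target_directory)
    # Assumed CLI: novoindex <output index> <input FASTA>
    index_path = os.path.join(target_directory, '{}.nix'.format(sequence_id))
    cmdline = ['novoindex', index_path, fasta_filename]
    try:
        check_call(cmdline)
    except CalledProcessError:
        print("Error building novoalign index", file=sys.stderr)
    return index_path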
@@ -53,19 +56,14 @@
-def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
-    #TODO: we should automatically do decompression here
-    urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) )
-    fasta_reader = [ urllib2.urlopen( url ) for url in urls ]
+def download_from_url(params, target_directory):
+    # TODO: we should automatically do decompression here
+    urls = filter(bool, map(lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split('\n')))
+    fasta_reader = [urllib2.urlopen(url) for url in urls]
 
-    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params )
-    _add_data_table_entry( data_manager_dict, data_table_entry )
+    _make_novocraft_index(fasta_reader, target_directory)
 
-def download_from_history( data_manager_dict, params, target_directory):
-    #TODO: allow multiple FASTA input files
+
+def download_from_history( params, target_directory):
+    # TODO: allow multiple FASTA input files
     input_filename = params['param_dict']['reference_source']['input_fasta']
-    if isinstance( input_filename, list ):
-        fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
-    else:
-        fasta_reader = open( input_filename )
 
-    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params )
-    _add_data_table_entry( data_manager_dict, data_table_entry )
+    _make_novocraft_index(input_filename, target_directory)
 
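Note on the hunk above: download_from_url() now passes a list of urllib2 response objects to _make_novocraft_index(), which substitutes its argument into a command line as if it were a FASTA path, so the URL case would need to be written to a local file first. A sketch of such a staging step, not present in the repository (the helper name and chunk size are illustrative):

# Sketch only: stream one URL to a local FASTA before indexing.
def _fetch_url_to_fasta(url, target_directory, sequence_id):
    fasta_path = os.path.join(target_directory, '{}.fa'.format(sequence_id))
    reader = urllib2.urlopen(url)
    with open(fasta_path, 'wb') as writer:
        while True:
            chunk = reader.read(64 * 1024)
            if not chunk:
                break
            writer.write(chunk)
    reader.close()
    return fasta_path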
@@ -72,21 +70,2 @@
-def copy_from_directory( data_manager_dict, params, target_directory ):
-    input_filename = params['param_dict']['reference_source']['fasta_filename']
-    create_symlink = params['param_dict']['reference_source']['create_symlink'] == 'create_symlink'
-    if create_symlink:
-        data_table_entry = _create_symlink( input_filename, target_directory )
-    else:
-        if isinstance( input_filename, list ):
-            fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
-        else:
-            fasta_reader = open( input_filename )
-        data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, params )
-    _add_data_table_entry( data_manager_dict, data_table_entry )
-
-def _create_symlink( input_filename, target_directory ):
-    fasta_base_filename = "%s.fa" % sequence_id
-    fasta_filename = os.path.join( target_directory, fasta_base_filename )
-    os.symlink( input_filename, fasta_filename )
-    return dict( path=fasta_base_filename )
-
-REFERENCE_SOURCE_TO_DOWNLOAD = dict( url=download_from_url, history=download_from_history, directory=copy_from_directory )
+REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)
 
@@ -93,20 +72,26 @@
 def main():
-    #Parse Command Line
-    parser = optparse.OptionParser()
-    parser.add_option( '-d', '--data_table_name' )
+    parser = argparse.ArgumentParser(description="Generate Novo-align genome index and JSON describing this")
+    parser.add_argument('output_filename')
+    parser.add_argument('--data_table_name', default='novocraft_index')
     (options, args) = parser.parse_args()
 
-    filename = args[0]
+    filename = args.output_filename
 
-    params = loads( open( filename ).read() )
-    target_directory = params[ 'output_data' ][0]['extra_files_path']
-    os.mkdir( target_directory )
-    data_manager_dict = {}
+    params = loads(open(filename).read())
+    target_directory = params['output_data'][0]['extra_files_path']
+    os.mkdir(target_directory)
 
-    #Fetch the FASTA
-    REFERENCE_SOURCE_TO_DOWNLOAD[ params['param_dict']['reference_source']['reference_source_selector'] ]( data_manager_dict, params, target_directory )
+    dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=options.dbkey_description)
+    if dbkey in [None, '', '?']:
+        raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))
 
-    #save info to json file
-    open( filename, 'wb' ).write( dumps( data_manager_dict ) )
+    # Fetch the FASTA
+    REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']]\
+        (params, target_directory, dbkey, sequence_id, sequence_name)
+
+    data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory)
+
+    output_datatable_dict = dict(data_tables={args.data_table_name: [data_table_entry]})
+    open(filename, 'wb').write(dumps(output_datatable_dict))
 
 if __name__ == "__main__": main()
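Note on the final hunk: as committed, main() mixes optparse and argparse idioms. argparse's parse_args() returns a single Namespace, so `(options, args) = parser.parse_args()` fails; `options.dbkey_description` is read but never defined as an argument; and the dispatch call passes five arguments to download helpers that accept two. A sketch of a main() consistent with the helpers shown in this revision (the --dbkey_description option is an assumption added for illustration, not part of the committed code):

# Sketch only: main() reconciled with the two-argument download helpers above.
def main():
    parser = argparse.ArgumentParser(description="Generate Novoalign genome index and JSON describing this")
    parser.add_argument('output_filename')
    parser.add_argument('--data_table_name', default='novocraft_index')
    parser.add_argument('--dbkey_description')  # assumed option, read below
    options = parser.parse_args()  # argparse returns a single Namespace

    # The output_filename Galaxy passes in is also the JSON params file.
    params = loads(open(options.output_filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.isdir(target_directory):
        os.mkdir(target_directory)

    dbkey, sequence_id, sequence_name = get_dbkey_id_name(params, dbkey_description=options.dbkey_description)
    if dbkey in [None, '', '?']:
        raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey))

    # Fetch the FASTA and build the index; the helpers take (params, target_directory).
    REFERENCE_SOURCE_TO_DOWNLOAD[params['param_dict']['reference_source']['reference_source_selector']](
        params, target_directory)

    # Describe the new index for the Galaxy data table.
    data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name, path=target_directory)
    output_datatable_dict = dict(data_tables={options.data_table_name: [data_table_entry]})
    open(options.output_filename, 'wb').write(dumps(output_datatable_dict))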