Mercurial > repos > sanbi-uwc > data_manager_novoalign_index_builder
changeset 2:e51fb8188ed9 draft
planemo upload for repository https://github.com/zipho/data_manager_novoalign_index_builder commit 7c74dfc33ffac5de44aae81def9c374f5f5e2a20
author | sanbi-uwc |
---|---|
date | Thu, 03 Mar 2016 08:40:42 -0500 |
parents | 4d67344bdea7 |
children | b63a406719c5 |
files | data_manager/novoalign_index_builder.py data_manager/novoalign_index_builder.xml |
diffstat | 2 files changed, 79 insertions(+), 98 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Z. Mashologu (SANBI-UWC)
"""Galaxy data manager: build a Novoalign genome index from a FASTA file
fetched from a URL or supplied from the user's history, then write the JSON
data-table entry describing the new index back to the data manager's output
file."""
# NOTE: __future__ imports must be the first statement in the file (after the
# docstring); the previous revision placed this after the other imports,
# which is a SyntaxError.
from __future__ import print_function

import argparse
import logging
import os
import shlex
import sys
from json import loads, dumps
from subprocess import check_call, CalledProcessError

try:
    # Python 2
    from urllib2 import urlopen
except ImportError:
    # Python 3
    from urllib.request import urlopen

log = logging.getLogger(__name__)

# Read size (bytes) used when streaming a remote FASTA file to disk.
CHUNK_SIZE = 2 ** 20


def get_dbkey_id_name(params, dbkey_description=None):
    """Derive (dbkey, sequence_id, sequence_name) from the data manager params.

    Falls back to the dbkey for a missing sequence_id, and to
    dbkey_description then dbkey for a missing sequence_name.
    """
    dbkey = params['param_dict']['dbkey']
    # TODO: ensure sequence_id is unique and does not already appear in the
    # location file
    sequence_id = params['param_dict']['sequence_id']
    if not sequence_id:
        sequence_id = dbkey  # uuid.uuid4(): generate and use a uuid instead?
    sequence_name = params['param_dict']['sequence_name']
    if not sequence_name:
        sequence_name = dbkey_description
    if not sequence_name:
        sequence_name = dbkey
    return dbkey, sequence_id, sequence_name


def _make_novocraft_index(fasta_filename, target_directory):
    """Build a Novoalign index for *fasta_filename* inside *target_directory*.

    Prints to stderr and returns without raising on failure, matching the
    best-effort behavior of the original.
    """
    if os.path.exists(target_directory) and not os.path.isdir(target_directory):
        print("Output directory path already exists but is not a directory: {}".format(target_directory),
              file=sys.stderr)
        return  # cannot proceed without a usable output directory
    elif not os.path.exists(target_directory):
        os.mkdir(target_directory)

    index_filename = os.path.join(target_directory, 'novoalign_index.nix')
    # The previous revision invoked "STAR --runMode genomeGenerate ..." here —
    # a copy-paste from the RNA STAR data manager. This tool builds a
    # *Novoalign* index, so invoke novoindex instead.
    # NOTE(review): novoindex takes "novoindex indexfile sequencefile"; confirm
    # any desired k-mer/step options (-k/-s) against the Novocraft docs.
    cmdline_str = 'novoindex {} {}'.format(index_filename, fasta_filename)
    cmdline = shlex.split(cmdline_str)
    try:
        check_call(cmdline)
    except CalledProcessError:
        print("Error building Novoalign index", file=sys.stderr)


def download_from_url(params, target_directory):
    """Download FASTA data from one or more URLs and build the index.

    The previous revision passed a list of open URL streams straight to the
    index builder, which expects a filename; stream the data to a single
    FASTA file on disk first.
    """
    # TODO: we should automatically do decompression here
    raw_urls = params['param_dict']['reference_source']['user_url'].split('\n')
    urls = [url.strip() for url in raw_urls if url.strip()]

    fasta_filename = os.path.join(target_directory, 'sequence.fa')
    fasta_writer = open(fasta_filename, 'wb')
    try:
        for url in urls:
            stream = urlopen(url)
            try:
                while True:
                    data = stream.read(CHUNK_SIZE)
                    if not data:
                        break
                    fasta_writer.write(data)
            finally:
                stream.close()
    finally:
        fasta_writer.close()

    _make_novocraft_index(fasta_filename, target_directory)


def download_from_history(params, target_directory):
    """Build the index from a FASTA dataset already in the user's history."""
    # TODO: allow multiple FASTA input files
    input_filename = params['param_dict']['reference_source']['input_fasta']

    _make_novocraft_index(input_filename, target_directory)


# Dispatch on the tool XML's reference_source_selector value.
REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url, history=download_from_history)


def main():
    parser = argparse.ArgumentParser(
        description="Generate Novoalign genome index and JSON describing this")
    parser.add_argument('output_filename')
    parser.add_argument('--data_table_name', default='novocraft_index')
    # Passed by the tool XML (dbkey.get_display_text()); the previous revision
    # read options.dbkey_description without ever declaring the argument.
    parser.add_argument('--dbkey_description', default=None)
    # argparse returns a single Namespace; the previous revision tried to
    # unpack it optparse-style as (options, args), which raises at runtime.
    args = parser.parse_args()
    filename = args.output_filename

    params = loads(open(filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.isdir(target_directory):
        os.mkdir(target_directory)

    dbkey, sequence_id, sequence_name = get_dbkey_id_name(
        params, dbkey_description=args.dbkey_description)
    if dbkey in [None, '', '?']:
        raise Exception('"%s" is not a valid dbkey. You must specify a valid dbkey.' % (dbkey,))

    # Fetch the FASTA and build the index. The download helpers take only
    # (params, target_directory); the previous revision passed five arguments.
    reference_source = params['param_dict']['reference_source']['reference_source_selector']
    REFERENCE_SOURCE_TO_DOWNLOAD[reference_source](params, target_directory)

    data_table_entry = dict(value=sequence_id, dbkey=dbkey, name=sequence_name,
                            path=target_directory)
    output_datatable_dict = dict(data_tables={args.data_table_name: [data_table_entry]})
    # json.dumps returns text; open in text mode ('wb' breaks on Python 3).
    open(filename, 'w').write(dumps(output_datatable_dict))


if __name__ == "__main__":
    main()
--- a/data_manager/novoalign_index_builder.xml Thu Mar 03 06:28:09 2016 -0500 +++ b/data_manager/novoalign_index_builder.xml Thu Mar 03 08:40:42 2016 -0500 @@ -1,22 +1,22 @@ <?xml version="1.0" encoding="utf-8" ?> <tool id="novoalign_index_builder" name="NOVO ALIGN index" tool_type="manage_data" version="0.0.1"> <description>Build an index for use by the Novo Align mapping tool</description> - <requirements> - <requirement type="package" version="0.0.1d">novoalign</requirement> - </requirements> <stdio> <exit_code range=":-1" /> <exit_code range="1:" /> </stdio> <command interpreter="python"> - novoalign_index_builder.py "${out_file}" --data_table_name "novo_index" + novoalign_index_builder.py "${out_file}" --dbkey_description ${ dbkey.get_display_text() } --data_table_name "novocraft_index" </command> <inputs> + <param name="dbkey" type="genomebuild" label="DBKEY to assign to data" /> + <param type="text" name="sequence_name" value="" label="Name of sequence" /> + <param type="text" name="sequence_desc" value="" label="Description of sequence" /> + <param type="text" name="sequence_id" value="" label="ID for sequence" /> <conditional name="reference_source"> <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> <option value="url">URL</option> <option value="history">History</option> - <option value="directory">Directory on Server</option> </param> <when value="url"> <param type="text" area="True" name="user_url" value="http://" label="URLs" optional="False" /> @@ -24,10 +24,6 @@ <when value="history"> <param name="input_fasta" type="data" format="fasta" label="FASTA File" multiple="False" optional="False" /> </when> - <when value="directory"> - <param type="text" name="fasta_filename" value="" label="Full path to FASTA File on disk" optional="False" /> - <param type="boolean" name="create_symlink" truevalue="create_symlink" falsevalue="copy_file" label="Create symlink to orignal data instead of copying" 
checked="False" /> - </when> </conditional> </inputs> <outputs>