Mercurial > repos > sanbi-uwc > data_manager_novoalign_index_builder
changeset 0:85fbd52dbb36 draft
planemo upload for repository https://github.com/zipho/data_manager_novoalign_index_builder commit d51fdc6291de173e829a839e98c6c3ae367d84bf
author | sanbi-uwc |
---|---|
date | Thu, 03 Mar 2016 05:59:41 -0500 |
parents | |
children | 4d67344bdea7 |
files | README.md data_manager/novoalign_index_builder.py data_manager/novoalign_index_builder.xml data_manager_conf.xml tool_dependencies.xml |
diffstat | 5 files changed, 188 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Mar 03 05:59:41 2016 -0500 @@ -0,0 +1,2 @@ +# Data Manager NovoAlign Index Builder +Data Manager to build Novo-Align index
#!/usr/bin/env python
# Z. Mashologu (SANBI-UWC)
"""Fetch a reference FASTA for the NovoAlign index data manager.

File: data_manager/novoalign_index_builder.py

The script takes the path of a JSON parameter file as its first positional
argument.  It reads ``param_dict`` and ``output_data`` from that file, places
the selected reference FASTA (from URLs, a history dataset, or a path on the
server) into the output's ``extra_files_path`` directory, and then overwrites
the same JSON file with the resulting data table entries.

NOTE(review): this script only fetches/copies/symlinks the FASTA file; nothing
here runs an indexer.
"""
import logging
import optparse
import os
import shutil
from json import dumps, loads

try:
    import urllib2  # Python 2
    urlopen = urllib2.urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen

log = logging.getLogger(__name__)

# Number of bytes read from a source stream per iteration.
CHUNK_SIZE = 2 ** 20
# Data table declared in data_manager_conf.xml.
DEFAULT_DATA_TABLE_NAME = 'novo_indexes'
# Identifier used when neither the parameters nor the source provide one.
DEFAULT_SEQUENCE_ID = 'reference'


def cleanup_before_exit(tmp_dir):
    """Remove *tmp_dir* and its contents if it is set and exists."""
    if tmp_dir and os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)


def _id_from_source(source):
    """Derive a sequence id from a path/URL (or the first of a list of them).

    The id is the last path component without its extension; any ``?query``
    suffix is dropped first.  Falls back to ``DEFAULT_SEQUENCE_ID``.
    """
    if isinstance(source, (list, tuple)):
        source = source[0] if source else None
    if source:
        base = os.path.basename(source.split('?', 1)[0].rstrip('/'))
        stem = os.path.splitext(base)[0]
        if stem:
            return stem
    return DEFAULT_SEQUENCE_ID


def _entry_fields(params, source=None, dbkey=None, sequence_id=None,
                  sequence_name=None):
    """Build the ``value``/``dbkey``/``name`` columns of a data table entry.

    Explicit arguments win; otherwise the values are looked up in
    ``params['param_dict']`` (keys ``sequence_id``, ``sequence_name``,
    ``dbkey``), and finally derived from *source*.
    """
    param_dict = params.get('param_dict', {})
    sequence_id = (sequence_id or param_dict.get('sequence_id')
                   or _id_from_source(source))
    sequence_name = (sequence_name or param_dict.get('sequence_name')
                     or sequence_id)
    # presumably param_dict may carry a 'dbkey'; the tool form does not define
    # one, so fall back to the sequence id -- TODO confirm against Galaxy.
    dbkey = dbkey or param_dict.get('dbkey') or sequence_id
    return dict(value=sequence_id, dbkey=dbkey, name=sequence_name)


def _add_data_table_entry(data_manager_dict, data_table_entry,
                          data_table_name=DEFAULT_DATA_TABLE_NAME):
    """Append *data_table_entry* under ``data_tables[data_table_name]``.

    Returns *data_manager_dict* (which is modified in place).
    """
    data_tables = data_manager_dict.setdefault('data_tables', {})
    data_tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict


def _stream_fasta_to_file(fasta_stream, target_directory, params,
                          close_stream=True, entry_fields=None):
    """Write one or more binary FASTA streams into ``<id>.fa``.

    :param fasta_stream: a file-like object opened in binary mode, or a list
        of them; a list is concatenated in order.
    :param target_directory: directory in which the FASTA file is created.
    :param params: parsed JSON parameters; used to resolve the sequence id
        when *entry_fields* is not given.
    :param close_stream: close every input stream when done (also on error).
    :param entry_fields: optional dict with ``value``, ``dbkey`` and ``name``.
    :returns: the data table entry: *entry_fields* plus ``path`` (the file
        name relative to *target_directory*).
    """
    if entry_fields is None:
        entry_fields = _entry_fields(params)
    fasta_base_filename = "%s.fa" % entry_fields['value']
    fasta_filename = os.path.join(target_directory, fasta_base_filename)

    if isinstance(fasta_stream, list):
        streams = fasta_stream
    else:
        streams = [fasta_stream]

    last_char = None
    try:
        with open(fasta_filename, 'wb') as fasta_writer:
            for fh in streams:
                # Keep records apart when the previous stream did not end
                # with a line terminator.
                if last_char not in (None, b'\n', b'\r'):
                    fasta_writer.write(b'\n')
                while True:
                    data = fh.read(CHUNK_SIZE)
                    if not data:
                        break
                    fasta_writer.write(data)
                    # Slice (not index) so the value is bytes on Python 3 too.
                    last_char = data[-1:]
    finally:
        if close_stream:
            for fh in streams:
                fh.close()

    return dict(entry_fields, path=fasta_base_filename)


def _open_fasta_readers(input_filename):
    """Open a path, or a list of paths, for binary reading."""
    if isinstance(input_filename, list):
        return [open(filename, 'rb') for filename in input_filename]
    return open(input_filename, 'rb')


def download_from_url(data_manager_dict, params, target_directory, dbkey=None,
                      sequence_id=None, sequence_name=None,
                      data_table_name=DEFAULT_DATA_TABLE_NAME):
    """Download the newline-separated URLs in ``user_url`` into one FASTA.

    *dbkey*, *sequence_id* and *sequence_name* are optional overrides; when
    omitted they are resolved from *params* or from the first URL.

    :raises ValueError: if ``user_url`` contains no non-blank line.
    """
    # TODO: we should automatically do decompression here
    user_url = params['param_dict']['reference_source']['user_url']
    urls = [url.strip() for url in user_url.split('\n') if url.strip()]
    if not urls:
        raise ValueError('No URL was provided for the reference FASTA')

    entry_fields = _entry_fields(params, urls[0], dbkey, sequence_id,
                                 sequence_name)
    fasta_reader = []
    try:
        for url in urls:
            fasta_reader.append(urlopen(url))
    except Exception:
        # Do not leak the connections that were already opened.
        for fh in fasta_reader:
            fh.close()
        raise

    data_table_entry = _stream_fasta_to_file(
        fasta_reader, target_directory, params, entry_fields=entry_fields)
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)


def download_from_history(data_manager_dict, params, target_directory,
                          data_table_name=DEFAULT_DATA_TABLE_NAME):
    """Copy the ``input_fasta`` file (or list of files) into the target."""
    input_filename = params['param_dict']['reference_source']['input_fasta']
    entry_fields = _entry_fields(params, input_filename)
    fasta_reader = _open_fasta_readers(input_filename)

    data_table_entry = _stream_fasta_to_file(
        fasta_reader, target_directory, params, entry_fields=entry_fields)
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)


def copy_from_directory(data_manager_dict, params, target_directory,
                        data_table_name=DEFAULT_DATA_TABLE_NAME):
    """Copy or symlink the server-side ``fasta_filename`` into the target.

    A symlink is created when the ``create_symlink`` parameter equals
    ``'create_symlink'``; otherwise the file content is copied.
    """
    reference_source = params['param_dict']['reference_source']
    input_filename = reference_source['fasta_filename']
    create_symlink = reference_source['create_symlink'] == 'create_symlink'
    entry_fields = _entry_fields(params, input_filename)

    if create_symlink:
        data_table_entry = dict(
            entry_fields,
            **_create_symlink(input_filename, target_directory,
                              entry_fields['value']))
    else:
        fasta_reader = _open_fasta_readers(input_filename)
        data_table_entry = _stream_fasta_to_file(
            fasta_reader, target_directory, params, entry_fields=entry_fields)
    _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name)


def _create_symlink(input_filename, target_directory, sequence_id=None):
    """Symlink *input_filename* as ``<sequence_id>.fa`` in the target.

    When *sequence_id* is omitted it is derived from *input_filename*.
    Returns ``dict(path=<link file name>)``.
    """
    if sequence_id is None:
        sequence_id = _id_from_source(input_filename)
    fasta_base_filename = "%s.fa" % sequence_id
    fasta_filename = os.path.join(target_directory, fasta_base_filename)
    os.symlink(input_filename, fasta_filename)
    return dict(path=fasta_base_filename)


# Maps the ``reference_source_selector`` value to the function that fetches it.
REFERENCE_SOURCE_TO_DOWNLOAD = dict(url=download_from_url,
                                    history=download_from_history,
                                    directory=copy_from_directory)


def main(argv=None):
    """Command-line entry point.

    :param argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    # Parse Command Line
    parser = optparse.OptionParser(usage='%prog [options] <params.json>')
    parser.add_option('-d', '--data_table_name',
                      default=DEFAULT_DATA_TABLE_NAME,
                      help='data table that receives the new entry')
    (options, args) = parser.parse_args(argv)
    if not args:
        parser.error('the path of the JSON parameter file is required')

    filename = args[0]

    with open(filename) as handle:
        params = loads(handle.read())
    target_directory = params['output_data'][0]['extra_files_path']
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)
    data_manager_dict = {}

    # Fetch the FASTA
    reference_source = params['param_dict']['reference_source']
    fetch = REFERENCE_SOURCE_TO_DOWNLOAD[
        reference_source['reference_source_selector']]
    fetch(data_manager_dict, params, target_directory,
          data_table_name=options.data_table_name or DEFAULT_DATA_TABLE_NAME)

    # save info to json file (the parameter file doubles as the output file)
    with open(filename, 'w') as handle:
        handle.write(dumps(data_manager_dict))


if __name__ == "__main__":
    main()
<?xml version="1.0" encoding="utf-8" ?>
<!-- File: data_manager/novoalign_index_builder.xml -->
<!-- Galaxy data manager tool: runs novoalign_index_builder.py on the
     data_manager_json output file to fetch a reference FASTA. -->
<tool id="novoalign_index_builder" name="NOVO ALIGN index" version="0.0.1">
    <description>Build an index for use by the Novo Align mapping tool</description>
    <requirements>
        <requirement type="package" version="0.0.1d">novoalign</requirement>
    </requirements>
    <stdio>
        <!-- Match every non-zero exit code (negative and positive). -->
        <exit_code range=":-1" />
        <exit_code range="1:" />
    </stdio>
    <!-- The data table name must match the <data_table name="..."> declared
         in data_manager_conf.xml ("novo_indexes"). -->
    <command interpreter="python">
        novoalign_index_builder.py "${out_file}" --data_table_name "novo_indexes"
    </command>
    <inputs>
        <!-- Each <when> branch supplies the parameters read by the matching
             fetch function of the script. -->
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
                <option value="url">URL</option>
                <option value="history">History</option>
                <option value="directory">Directory on Server</option>
            </param>
            <when value="url">
                <param type="text" area="True" name="user_url" value="http://" label="URLs" optional="False" />
            </when>
            <when value="history">
                <param name="input_fasta" type="data" format="fasta" label="FASTA File" multiple="False" optional="False" />
            </when>
            <when value="directory">
                <param type="text" name="fasta_filename" value="" label="Full path to FASTA File on disk" optional="False" />
                <param type="boolean" name="create_symlink" truevalue="create_symlink" falsevalue="copy_file" label="Create symlink to original data instead of copying" checked="False" />
            </when>
        </conditional>
        <!-- <param type="text" name="sequence_name" value="" label="Name of sequence" />
        <param type="text" name="sequence_id" value="" label="ID for sequence" /> -->
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json" />
    </outputs>
    <tests>
        <!-- NOTE(review): dbkey, sequence_name, sequence_id and sort_selector
             do not correspond to any input declared above; confirm whether
             this test (and its expected output file) is still valid. -->
        <test>
            <param name="dbkey" value="anoGam1"/>
            <param name="sequence_name" value=""/>
            <param name="sequence_id" value=""/>
            <param name="reference_source_selector" value="history"/>
            <param name="input_fasta" value="phiX174.fasta"/>
            <param name="sort_selector" value="as_is"/>
            <output name="out_file" file="phiX174_as_anoGam1.data_manager_json"/>
        </test>
    </tests>
    <help>Help!</help>
    <citations>
        <!-- TODO: add a typed citation, e.g. <citation type="doi">...</citation>;
             the previous empty, untyped <citation> element carried no information. -->
    </citations>
</tool>
<!-- File: data_manager_conf.xml -->
<!-- Registers the NovoAlign index data manager and describes how its JSON
     output populates the "novo_indexes" data table. -->
<data_managers>
    <data_manager tool_file="data_manager/novoalign_index_builder.xml" id="novoalign_index_builder" version="0.0.1">
        <data_table name="novo_indexes">
            <output>
                <column name="value" />
                <column name="dbkey" />
                <column name="name" />
                <!-- "path" is taken from the out_file output: the file is moved
                     under the data manager data path and the stored value is
                     rewritten to that final location. -->
                <column name="path" output_ref="out_file" >
                    <move type="file">
                        <source>${path}</source>
                        <!-- NOTE(review): GALAXY_DATA_MANAGER_DATA_PATH replaces
                             GALAXY_DATA_INDEX_DIR, which Galaxy does not appear to
                             provide to these templates; confirm against the
                             target Galaxy release. -->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">${path}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/${path}</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>