Mercurial > repos > blankenberg > data_manager_fetch_genome_all_fasta
changeset 7:079f737d675f default tip
Allow sorting FASTA by sequence identifier (chromosome).
author | Daniel Blankenberg <dan@bx.psu.edu> |
---|---|
date | Tue, 07 Jan 2014 14:49:01 -0500 |
parents | ca8b3709309e |
children | |
files | data_manager/data_manager_fetch_genome_all_fasta.py data_manager/data_manager_fetch_genome_all_fasta.xml |
diffstat | 2 files changed, 150 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/data_manager/data_manager_fetch_genome_all_fasta.py Tue Jun 04 14:56:54 2013 -0400 +++ b/data_manager/data_manager_fetch_genome_all_fasta.py Tue Jan 07 14:49:01 2014 -0500 @@ -64,6 +64,110 @@ def _get_stream_readers_for_bz2( file_obj, tmp_dir ): return [ bz2.BZ2File( file_obj.name, 'rb' ) ] +def sort_fasta( fasta_filename, sort_method, params ): + if sort_method is None: + return + assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method ) + return SORTING_METHODS[ sort_method ]( fasta_filename, params ) + +def _move_and_index_fasta_for_sorting( fasta_filename ): + unsorted_filename = tempfile.NamedTemporaryFile().name + shutil.move( fasta_filename, unsorted_filename ) + fasta_offsets = {} + unsorted_fh = open( unsorted_filename ) + while True: + offset = unsorted_fh.tell() + line = unsorted_fh.readline() + if not line: + break + if line.startswith( ">" ): + line = line.split( None, 1 )[0][1:] + fasta_offsets[ line ] = offset + unsorted_fh.close() + current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) ) + return ( unsorted_filename, fasta_offsets, current_order ) + +def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ): + unsorted_fh = open( unsorted_fasta_filename ) + sorted_fh = open( sorted_fasta_filename, 'wb+' ) + + for name in sorted_names: + offset = fasta_offsets[ name ] + unsorted_fh.seek( offset ) + sorted_fh.write( unsorted_fh.readline() ) + while True: + line = unsorted_fh.readline() + if not line or line.startswith( ">" ): + break + sorted_fh.write( line ) + unsorted_fh.close() + sorted_fh.close() + +def _sort_fasta_as_is( fasta_filename, params ): + return + +def _sort_fasta_lexicographical( fasta_filename, params ): + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = sorted( fasta_offsets.keys() ) + if sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + +def _sort_fasta_gatk( fasta_filename, params ): + #This method was added by reviewer request. + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = map( str, range( 1, 23 ) ) + [ 'X', 'Y' ] + #detect if we have chrN, or just N + has_chr = False + for chrom in sorted_names: + if "chr%s" % chrom in current_order: + has_chr = True + break + + if has_chr: + sorted_names = map( lambda x: "chr%s" % x, sorted_names) + sorted_names.insert( 0, "chrM" ) + else: + sorted_names.insert( 0, "MT" ) + sorted_names.extend( map( lambda x: "%s_random" % x, sorted_names ) ) + + existing_sorted_names = [] + for name in sorted_names: + if name in current_order: + existing_sorted_names.append( name ) + for name in current_order: + #TODO: confirm that non-canonical names do not need to be sorted specially + if name not in existing_sorted_names: + existing_sorted_names.append( name ) + + if existing_sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( existing_sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + +def _sort_fasta_custom( fasta_filename, params ): + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = [] + for id_repeat in params['param_dict']['sorting']['sequence_identifiers']: + sorted_names.append( id_repeat[ 'identifier' ] ) + handle_not_listed = params['param_dict']['sorting']['handle_not_listed']['handle_not_listed_selector'] + if handle_not_listed.startswith( 'keep' ): + add_list = [] + for name in current_order: + if name not in sorted_names: + add_list.append( name ) + if add_list: + if handle_not_listed == 'keep_append': + sorted_names.extend( add_list ) + else: + add_list.extend( sorted_names ) + sorted_names = add_list + if sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu' UCSC_CHROM_FA_FILENAME = 'chromFa' @@ -112,7 +216,7 @@ fasta_readers = get_stream_reader( tmp_fasta, tmp_extract_dir ) - data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) for fasta_reader in fasta_readers: @@ -127,7 +231,7 @@ url = NCBI_DOWNLOAD_URL % requested_identifier fasta_reader = urllib2.urlopen( url ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -135,7 +239,7 @@ urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) fasta_reader = [ urllib2.urlopen( url ) for url in urls ] - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -146,7 +250,7 @@ else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def copy_from_directory( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -159,7 +263,7 @@ fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ] else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def _add_data_table_entry( data_manager_dict, data_table_entry ): @@ -168,7 +272,7 @@ data_manager_dict['data_tables']['all_fasta'].append( data_table_entry ) return data_manager_dict -def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, close_stream=True ): +def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, params, close_stream=True ): fasta_base_filename = "%s.fa" % sequence_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) fasta_writer = open( fasta_filename, 'wb+' ) @@ -202,6 +306,8 @@ fasta_writer.close() + sort_fasta( fasta_filename, params['param_dict']['sorting']['sort_selector'], params ) + return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) def _create_symlink( input_filename, target_directory, dbkey, sequence_id, sequence_name ): @@ -210,8 +316,12 @@ os.symlink( input_filename, fasta_filename ) return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) + + + REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory ) +SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom ) def main(): #Parse Command Line
--- a/data_manager/data_manager_fetch_genome_all_fasta.xml Tue Jun 04 14:56:54 2013 -0400 +++ b/data_manager/data_manager_fetch_genome_all_fasta.xml Tue Jan 07 14:49:01 2014 -0500 @@ -8,7 +8,7 @@ <param type="text" name="sequence_id" value="" label="ID for sequence" /> <conditional name="reference_source"> <param name="reference_source_selector" type="select" label="Choose the source for the reference genome"> - <option value="ucsc">UCSC</option> + <option value="ucsc" selected="True">UCSC</option> <option value="ncbi">NCBI</option> <option value="url">URL</option> <option value="history">History</option> @@ -31,6 +31,38 @@ <param type="boolean" name="create_symlink" truevalue="create_symlink" falsevalue="copy_file" label="Create symlink to orignal data instead of copying" checked="False" /> </when> </conditional> + <conditional name="sorting"> + <param name="sort_selector" type="select" label="Choose the source for the reference genome"> + <option value="as_is" selected="True">As is</option> + <option value="lexicographical">Lexicographical</option> + <option value="gatk">GATK</option> + <option value="custom">Custom</option> + </param> + <when value="as_is"> + </when> + <when value="lexicographical"> + </when> + <when value="gatk"> + </when> + <when value="custom"> + <repeat name="sequence_identifiers" title="Sequence Identifiers" min="1" default="1"> + <param type="text" name="identifier" value="" label="Sequence Identifier" optional="False" /> + </repeat> + <conditional name="handle_not_listed"> + <param name="handle_not_listed_selector" type="select" label="How to handle non-specified Identifiers"> + <option value="discard" selected="True">Discard</option> + <option value="keep_append">Keep and Append</option> + <option value="keep_prepend">Keep and Prepend</option> + </param> + <when value="discard"> + </when> + <when value="keep_append"> + </when> + <when value="keep_prepend"> + </when> + </conditional> + </when> + </conditional> </inputs> <outputs> <data name="out_file" format="data_manager_json"/> @@ -44,6 +76,7 @@ <param name="sequence_id" value=""/> <param name="reference_source_selector" value="history"/> <param name="input_fasta" value="phiX174.fasta"/> + <param name="sort_selector" value="as_is"/> <output name="out_file" file="phiX174_as_anoGam1.data_manager_json"/> </test> </tests>