# HG changeset patch # User Daniel Blankenberg # Date 1389124141 18000 # Node ID 079f737d675f14861197cc4b07be14bc7d3a3afb # Parent ca8b3709309e76e247f62320c8ea5fd0bca922f9 Allow sorting FASTA by sequence identifier (chromosome). diff -r ca8b3709309e -r 079f737d675f data_manager/data_manager_fetch_genome_all_fasta.py --- a/data_manager/data_manager_fetch_genome_all_fasta.py Tue Jun 04 14:56:54 2013 -0400 +++ b/data_manager/data_manager_fetch_genome_all_fasta.py Tue Jan 07 14:49:01 2014 -0500 @@ -64,6 +64,110 @@ def _get_stream_readers_for_bz2( file_obj, tmp_dir ): return [ bz2.BZ2File( file_obj.name, 'rb' ) ] +def sort_fasta( fasta_filename, sort_method, params ): + if sort_method is None: + return + assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method ) + return SORTING_METHODS[ sort_method ]( fasta_filename, params ) + +def _move_and_index_fasta_for_sorting( fasta_filename ): + unsorted_filename = tempfile.NamedTemporaryFile().name + shutil.move( fasta_filename, unsorted_filename ) + fasta_offsets = {} + unsorted_fh = open( unsorted_filename ) + while True: + offset = unsorted_fh.tell() + line = unsorted_fh.readline() + if not line: + break + if line.startswith( ">" ): + line = line.split( None, 1 )[0][1:] + fasta_offsets[ line ] = offset + unsorted_fh.close() + current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) ) + return ( unsorted_filename, fasta_offsets, current_order ) + +def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ): + unsorted_fh = open( unsorted_fasta_filename ) + sorted_fh = open( sorted_fasta_filename, 'wb+' ) + + for name in sorted_names: + offset = fasta_offsets[ name ] + unsorted_fh.seek( offset ) + sorted_fh.write( unsorted_fh.readline() ) + while True: + line = unsorted_fh.readline() + if not line or line.startswith( ">" ): + break + sorted_fh.write( line ) + unsorted_fh.close() + sorted_fh.close() + +def _sort_fasta_as_is( fasta_filename, params ): + return + +def _sort_fasta_lexicographical( fasta_filename, params ): + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = sorted( fasta_offsets.keys() ) + if sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + +def _sort_fasta_gatk( fasta_filename, params ): + #This method was added by reviewer request. + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = map( str, range( 1, 23 ) ) + [ 'X', 'Y' ] + #detect if we have chrN, or just N + has_chr = False + for chrom in sorted_names: + if "chr%s" % chrom in current_order: + has_chr = True + break + + if has_chr: + sorted_names = map( lambda x: "chr%s" % x, sorted_names) + sorted_names.insert( 0, "chrM" ) + else: + sorted_names.insert( 0, "MT" ) + sorted_names.extend( map( lambda x: "%s_random" % x, sorted_names ) ) + + existing_sorted_names = [] + for name in sorted_names: + if name in current_order: + existing_sorted_names.append( name ) + for name in current_order: + #TODO: confirm that non-canonical names do not need to be sorted specially + if name not in existing_sorted_names: + existing_sorted_names.append( name ) + + if existing_sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( existing_sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + +def _sort_fasta_custom( fasta_filename, params ): + ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename ) + sorted_names = [] + for id_repeat in params['param_dict']['sorting']['sequence_identifiers']: + sorted_names.append( id_repeat[ 'identifier' ] ) + handle_not_listed = params['param_dict']['sorting']['handle_not_listed']['handle_not_listed_selector'] + if handle_not_listed.startswith( 'keep' ): + add_list = [] + for name in current_order: + if name not in sorted_names: + add_list.append( name ) + if add_list: + if handle_not_listed == 'keep_append': + sorted_names.extend( add_list ) + else: + add_list.extend( sorted_names ) + sorted_names = add_list + if sorted_names == current_order: + shutil.move( unsorted_filename, fasta_filename ) + else: + _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename ) + def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu' UCSC_CHROM_FA_FILENAME = 'chromFa' @@ -112,7 +216,7 @@ fasta_readers = get_stream_reader( tmp_fasta, tmp_extract_dir ) - data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) for fasta_reader in fasta_readers: @@ -127,7 +231,7 @@ url = NCBI_DOWNLOAD_URL % requested_identifier fasta_reader = urllib2.urlopen( url ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -135,7 +239,7 @@ urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) ) fasta_reader = [ urllib2.urlopen( url ) for url in urls ] - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -146,7 +250,7 @@ else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def copy_from_directory( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ): @@ -159,7 +263,7 @@ fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ] else: fasta_reader = open( input_filename ) - data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name ) + data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params ) _add_data_table_entry( data_manager_dict, data_table_entry ) def _add_data_table_entry( data_manager_dict, data_table_entry ): @@ -168,7 +272,7 @@ data_manager_dict['data_tables']['all_fasta'].append( data_table_entry ) return data_manager_dict -def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, close_stream=True ): +def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, params, close_stream=True ): fasta_base_filename = "%s.fa" % sequence_id fasta_filename = os.path.join( target_directory, fasta_base_filename ) fasta_writer = open( fasta_filename, 'wb+' ) @@ -202,6 +306,8 @@ fasta_writer.close() + sort_fasta( fasta_filename, params['param_dict']['sorting']['sort_selector'], params ) + return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) def _create_symlink( input_filename, target_directory, dbkey, sequence_id, sequence_name ): @@ -210,8 +316,12 @@ os.symlink( input_filename, fasta_filename ) return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename ) + + + REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory ) +SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom ) def main(): #Parse Command Line diff -r ca8b3709309e -r 079f737d675f data_manager/data_manager_fetch_genome_all_fasta.xml --- a/data_manager/data_manager_fetch_genome_all_fasta.xml Tue Jun 04 14:56:54 2013 -0400 +++ b/data_manager/data_manager_fetch_genome_all_fasta.xml Tue Jan 07 14:49:01 2014 -0500 @@ -8,7 +8,7 @@ - + @@ -31,6 +31,38 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -44,6 +76,7 @@ +