changeset 7:079f737d675f default tip

Allow sorting FASTA by sequence identifier (chromosome).
author Daniel Blankenberg <dan@bx.psu.edu>
date Tue, 07 Jan 2014 14:49:01 -0500
parents ca8b3709309e
children
files data_manager/data_manager_fetch_genome_all_fasta.py data_manager/data_manager_fetch_genome_all_fasta.xml
diffstat 2 files changed, 150 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/data_manager/data_manager_fetch_genome_all_fasta.py	Tue Jun 04 14:56:54 2013 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta.py	Tue Jan 07 14:49:01 2014 -0500
@@ -64,6 +64,110 @@
 def _get_stream_readers_for_bz2( file_obj, tmp_dir ):
     return [ bz2.BZ2File( file_obj.name, 'rb' ) ]
 
+def sort_fasta( fasta_filename, sort_method, params ):
+    if sort_method is None:
+        return
+    assert sort_method in SORTING_METHODS, ValueError( "%s is not a valid sorting option." % sort_method )
+    return SORTING_METHODS[ sort_method ]( fasta_filename, params )
+
+def _move_and_index_fasta_for_sorting( fasta_filename ):
+    unsorted_filename = tempfile.NamedTemporaryFile().name
+    shutil.move( fasta_filename, unsorted_filename )
+    fasta_offsets = {}
+    unsorted_fh = open( unsorted_filename )
+    while True:
+        offset = unsorted_fh.tell()
+        line = unsorted_fh.readline()
+        if not line:
+            break
+        if line.startswith( ">" ):
+            line = line.split( None, 1 )[0][1:]
+            fasta_offsets[ line ] = offset
+    unsorted_fh.close()
+    current_order = map( lambda x: x[1], sorted( map( lambda x: ( x[1], x[0] ), fasta_offsets.items() ) ) )
+    return ( unsorted_filename, fasta_offsets, current_order )
+
+def _write_sorted_fasta( sorted_names, fasta_offsets, sorted_fasta_filename, unsorted_fasta_filename ):
+    unsorted_fh = open( unsorted_fasta_filename )
+    sorted_fh = open( sorted_fasta_filename, 'wb+' )
+    
+    for name in sorted_names:
+        offset = fasta_offsets[ name ]
+        unsorted_fh.seek( offset )
+        sorted_fh.write( unsorted_fh.readline() )
+        while True:
+            line = unsorted_fh.readline()
+            if not line or line.startswith( ">" ):
+                break
+            sorted_fh.write( line )
+    unsorted_fh.close()
+    sorted_fh.close()
+
+def _sort_fasta_as_is( fasta_filename, params ):
+    return
+
+def _sort_fasta_lexicographical( fasta_filename, params ):
+    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
+    sorted_names = sorted( fasta_offsets.keys() )
+    if sorted_names == current_order:
+        shutil.move( unsorted_filename, fasta_filename )
+    else:
+        _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename )    
+
+def _sort_fasta_gatk( fasta_filename, params ):
+    #This method was added by reviewer request.
+    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
+    sorted_names = map( str, range( 1, 23 ) ) + [ 'X', 'Y' ]
+    #detect if we have chrN, or just N
+    has_chr = False
+    for chrom in sorted_names:
+        if "chr%s" % chrom in current_order:
+            has_chr = True
+            break
+    
+    if has_chr:
+        sorted_names = map( lambda x: "chr%s" % x, sorted_names)
+        sorted_names.insert( 0, "chrM" )
+    else:
+        sorted_names.insert( 0, "MT" )
+    sorted_names.extend( map( lambda x: "%s_random" % x, sorted_names ) )
+    
+    existing_sorted_names = []
+    for name in sorted_names:
+        if name in current_order:
+            existing_sorted_names.append( name )
+    for name in current_order:
+        #TODO: confirm that non-canonical names do not need to be sorted specially
+        if name not in existing_sorted_names:
+            existing_sorted_names.append( name )
+    
+    if existing_sorted_names == current_order:
+        shutil.move( unsorted_filename, fasta_filename )
+    else:
+        _write_sorted_fasta( existing_sorted_names, fasta_offsets, fasta_filename, unsorted_filename )
+
+def _sort_fasta_custom( fasta_filename, params ):
+    ( unsorted_filename, fasta_offsets, current_order ) = _move_and_index_fasta_for_sorting( fasta_filename )
+    sorted_names = []
+    for id_repeat in params['param_dict']['sorting']['sequence_identifiers']:
+        sorted_names.append( id_repeat[ 'identifier' ] )
+    handle_not_listed = params['param_dict']['sorting']['handle_not_listed']['handle_not_listed_selector']
+    if handle_not_listed.startswith( 'keep' ):
+        add_list = []
+        for name in current_order:
+            if name not in sorted_names:
+                add_list.append( name )
+        if add_list:
+            if handle_not_listed == 'keep_append':
+                sorted_names.extend( add_list )
+            else:
+                add_list.extend( sorted_names )
+                sorted_names = add_list
+    if sorted_names == current_order:
+        shutil.move( unsorted_filename, fasta_filename )
+    else:
+        _write_sorted_fasta( sorted_names, fasta_offsets, fasta_filename, unsorted_filename )
+
 def download_from_ucsc( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
     UCSC_FTP_SERVER = 'hgdownload.cse.ucsc.edu'
     UCSC_CHROM_FA_FILENAME = 'chromFa'
@@ -112,7 +216,7 @@
     
     fasta_readers = get_stream_reader( tmp_fasta, tmp_extract_dir )
     
-    data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name )
+    data_table_entry = _stream_fasta_to_file( fasta_readers, target_directory, dbkey, sequence_id, sequence_name, params )
     _add_data_table_entry( data_manager_dict, data_table_entry )
     
     for fasta_reader in fasta_readers:
@@ -127,7 +231,7 @@
     url = NCBI_DOWNLOAD_URL % requested_identifier
     fasta_reader = urllib2.urlopen( url )
     
-    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name )
+    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params )
     _add_data_table_entry( data_manager_dict, data_table_entry )
 
 def download_from_url( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
@@ -135,7 +239,7 @@
     urls = filter( bool, map( lambda x: x.strip(), params['param_dict']['reference_source']['user_url'].split( '\n' ) ) )
     fasta_reader = [ urllib2.urlopen( url ) for url in urls ]
     
-    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name )
+    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params )
     _add_data_table_entry( data_manager_dict, data_table_entry )
 
 def download_from_history( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
@@ -146,7 +250,7 @@
     else:
         fasta_reader = open( input_filename )
     
-    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name )
+    data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params )
     _add_data_table_entry( data_manager_dict, data_table_entry )
 
 def copy_from_directory( data_manager_dict, params, target_directory, dbkey, sequence_id, sequence_name ):
@@ -159,7 +263,7 @@
             fasta_reader = [ open( filename, 'rb' ) for filename in input_filename ]
         else:
             fasta_reader = open( input_filename )    
-        data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name )
+        data_table_entry = _stream_fasta_to_file( fasta_reader, target_directory, dbkey, sequence_id, sequence_name, params )
     _add_data_table_entry( data_manager_dict, data_table_entry )
 
 def _add_data_table_entry( data_manager_dict, data_table_entry ):
@@ -168,7 +272,7 @@
     data_manager_dict['data_tables']['all_fasta'].append( data_table_entry )
     return data_manager_dict
 
-def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, close_stream=True ):
+def _stream_fasta_to_file( fasta_stream, target_directory, dbkey, sequence_id, sequence_name, params, close_stream=True ):
     fasta_base_filename = "%s.fa" % sequence_id
     fasta_filename = os.path.join( target_directory, fasta_base_filename )
     fasta_writer = open( fasta_filename, 'wb+' )
@@ -202,6 +306,8 @@
     
     fasta_writer.close()
     
+    sort_fasta( fasta_filename, params['param_dict']['sorting']['sort_selector'], params )
+    
     return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename )
 
 def _create_symlink( input_filename, target_directory, dbkey, sequence_id, sequence_name ):
@@ -210,8 +316,12 @@
     os.symlink( input_filename, fasta_filename )
     return dict( value=sequence_id, dbkey=dbkey, name=sequence_name, path=fasta_base_filename )
 
+
+
+
 REFERENCE_SOURCE_TO_DOWNLOAD = dict( ucsc=download_from_ucsc, ncbi=download_from_ncbi, url=download_from_url, history=download_from_history, directory=copy_from_directory )
 
+SORTING_METHODS = dict( as_is=_sort_fasta_as_is, lexicographical=_sort_fasta_lexicographical, gatk=_sort_fasta_gatk, custom=_sort_fasta_custom )
 
 def main():
     #Parse Command Line
--- a/data_manager/data_manager_fetch_genome_all_fasta.xml	Tue Jun 04 14:56:54 2013 -0400
+++ b/data_manager/data_manager_fetch_genome_all_fasta.xml	Tue Jan 07 14:49:01 2014 -0500
@@ -8,7 +8,7 @@
         <param type="text" name="sequence_id" value="" label="ID for sequence" />
         <conditional name="reference_source">
           <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
-            <option value="ucsc">UCSC</option>
+            <option value="ucsc" selected="True">UCSC</option>
             <option value="ncbi">NCBI</option>
             <option value="url">URL</option>
             <option value="history">History</option>
@@ -31,6 +31,38 @@
             <param type="boolean" name="create_symlink" truevalue="create_symlink" falsevalue="copy_file" label="Create symlink to orignal data instead of copying" checked="False" />
           </when>
         </conditional>
+        <conditional name="sorting">
+          <param name="sort_selector" type="select" label="Choose the source for the reference genome">
+            <option value="as_is" selected="True">As is</option>
+            <option value="lexicographical">Lexicographical</option>
+            <option value="gatk">GATK</option>
+            <option value="custom">Custom</option>
+          </param>
+          <when value="as_is">
+          </when>
+          <when value="lexicographical">
+          </when>
+          <when value="gatk">
+          </when>
+          <when value="custom">
+            <repeat name="sequence_identifiers" title="Sequence Identifiers" min="1" default="1">
+                <param type="text" name="identifier" value="" label="Sequence Identifier" optional="False" />
+            </repeat>
+            <conditional name="handle_not_listed">
+                <param name="handle_not_listed_selector" type="select" label="How to handle non-specified Identifiers">
+                    <option value="discard" selected="True">Discard</option>
+                    <option value="keep_append">Keep and Append</option>
+                    <option value="keep_prepend">Keep and Prepend</option>
+                </param>
+                <when value="discard">
+                </when>
+                <when value="keep_append">
+                </when>
+                <when value="keep_prepend">
+                </when>
+            </conditional>
+          </when>
+        </conditional>
     </inputs>
     <outputs>
         <data name="out_file" format="data_manager_json"/>
@@ -44,6 +76,7 @@
             <param name="sequence_id" value=""/>
             <param name="reference_source_selector" value="history"/>
             <param name="input_fasta" value="phiX174.fasta"/>
+            <param name="sort_selector" value="as_is"/>
             <output name="out_file" file="phiX174_as_anoGam1.data_manager_json"/>
         </test>
     </tests>