view fasta_concatenate_by_species.py @ 1:d9f0a11824e9

Add bx-python dependency (for maf_utilities.py).
author Nate Coraor <nate@bx.psu.edu>
date Mon, 17 Nov 2014 10:08:37 -0500
parents a63b082a26eb
children c5311b7718d1
line wrap: on
line source

#!/usr/bin/env python
#Dan Blankenberg
"""
Takes a Multiple Alignment FASTA file and concatenates 
sequences for each species, resulting in one sequence 
alignment per species.
"""

import sys, tempfile
from utils.maf_utilities import iter_fasta_alignment
from utils.odict import odict

def __main__():
    """
    Concatenate the alignment blocks of a Multiple Alignment FASTA file
    into a single sequence per species.

    Command line arguments:
        sys.argv[1] -- filename of the input Multiple Alignment FASTA file
        sys.argv[2] -- filename of the FASTA file to write

    Each species' sequence is accumulated in its own temporary file so the
    concatenated sequences do not have to be held in memory. A species that
    is missing from a block is padded with "-" for the width of that block
    (taken from the text of the block's first component); a species first
    seen in a later block is padded with "-" for everything that came
    before it.

    Exits with a usage message when fewer than two arguments are given.
    """
    if len( sys.argv ) < 3:
        sys.exit( "Usage: %s <input_fasta> <output_fasta>" % sys.argv[0] )
    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    #copy the temporary files to the output in pieces of this size
    chunk_size = 1024 * 1024
    species = odict()
    cur_size = 0
    try:
        for components in iter_fasta_alignment( input_filename ):
            if not components:
                #an empty block has no width and nothing to append
                continue
            block_size = len( components[0].text )
            #work on our own collection, never on the odict's key list
            species_not_written = set( species.keys() )
            for component in components:
                if component.species not in species:
                    #this is a new species, pad it up to the current length
                    species[component.species] = tempfile.TemporaryFile()
                    species[component.species].write( "-" * cur_size )
                species[component.species].write( component.text )
                species_not_written.discard( component.species )
            #species absent from this block receive gaps instead
            for spec in species_not_written:
                species[spec].write( "-" * block_size )
            cur_size += block_size
        with open( output_filename, 'wb' ) as out:
            for spec, f in species.iteritems():
                f.seek( 0 )
                out.write( ">%s\n" % spec )
                #stream the sequence instead of reading it all at once
                while True:
                    chunk = f.read( chunk_size )
                    if not chunk:
                        break
                    out.write( chunk )
                out.write( "\n" )
    finally:
        #release the temporary files even if reading or writing failed
        for spec, f in species.iteritems():
            f.close()

#Only run the tool when this file is executed as a script.
if __name__ == "__main__":
    __main__()