annotate fasta_concatenate_by_species.py @ 1:c5630f2908e6 default tip

Add tool dep.
author Nate Coraor <nate@bx.psu.edu>
date Mon, 17 Nov 2014 10:03:28 -0500
parents d5e8786674c7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
1 #!/usr/bin/env python
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
2 #Dan Blankenberg
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
3 """
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
4 Takes a Multiple Alignment FASTA file and concatenates
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
5 sequences for each species, resulting in one sequence
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
6 alignment per species.
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
7 """
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
8
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
9 import sys, tempfile
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
10 from utils.maf_utilities import iter_fasta_alignment
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
11 from utils.odict import odict
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
12
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
def __main__():
    """Concatenate the sequences of a multiple-alignment FASTA file per species.

    The input path is read from ``sys.argv[1]`` and the output path from
    ``sys.argv[2]``. For every alignment block yielded by
    ``iter_fasta_alignment`` the text of each component is appended to a
    temporary file kept for that component's species. A species that is
    absent from a block is padded with ``-`` for the length of the block's
    first component, and a species first seen in a later block is back-filled
    with ``-`` for all columns written so far.

    The output contains one ``>species`` header line followed by the
    concatenated sequence for each species, in the iteration order of
    ``odict`` (presumably first-seen order -- confirm against utils.odict).

    NOTE(review): this code targets Python 2 (``iteritems``, ``str`` written
    to binary-mode files); it is intentionally not ported here.
    """
    # Local import: only needed here, for streaming temp files to the output.
    import shutil

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = odict()  # species name -> temporary file holding its sequence
    cur_size = 0  # number of alignment columns written so far
    try:
        for components in iter_fasta_alignment( input_filename ):
            # The first component's length is used as the width of the block.
            block_size = len( components[0].text )
            # Snapshot of the species known before this block; whatever is
            # left after the inner loop did not occur in this block.
            species_not_written = list( species.keys() )
            for component in components:
                if component.species not in species:
                    # New species: back-fill gaps for all earlier blocks.
                    species[component.species] = tempfile.TemporaryFile()
                    species[component.species].write( "-" * cur_size )
                species[component.species].write( component.text )
                try:
                    species_not_written.remove( component.species )
                except ValueError:
                    # This species is not in the snapshot (it is new in this
                    # block, or was already removed), so nothing to do.
                    pass
            for spec in species_not_written:
                species[spec].write( "-" * block_size )
            cur_size += block_size

        out = open( output_filename, 'wb' )
        try:
            for spec, f in species.iteritems():
                f.seek( 0 )
                out.write( ">%s\n" % spec )
                # Stream in chunks instead of loading the whole concatenated
                # sequence into memory with f.read().
                shutil.copyfileobj( f, out )
                out.write( "\n" )
        finally:
            # Close the output even if writing fails part-way through.
            out.close()
    finally:
        # Release every temporary file, on success and on error alike.
        for spec in species.keys():
            species[spec].close()
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
38
d5e8786674c7 From Main tool shed.
Nate Coraor <nate@bx.psu.edu>
parents:
diff changeset
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    __main__()