annotate abims_fasta2phylip.py @ 1:1c14c7a6539f draft default tip

Uploaded
author mish
date Wed, 24 Jul 2013 08:39:15 -0400
parents 851e52325ffc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
851e52325ffc Uploaded
mish
parents:
diff changeset
1 #!/usr/bin/env python
851e52325ffc Uploaded
mish
parents:
diff changeset
2
851e52325ffc Uploaded
mish
parents:
diff changeset
3 """
851e52325ffc Uploaded
mish
parents:
diff changeset
4 Convert fasta alignments to relaxed phylip ones in constant memory.
851e52325ffc Uploaded
mish
parents:
diff changeset
5 Written by Lucas Sinclair.
851e52325ffc Uploaded
mish
parents:
diff changeset
6 Kopimi.
851e52325ffc Uploaded
mish
parents:
diff changeset
7
851e52325ffc Uploaded
mish
parents:
diff changeset
8 You can use this script from the shell like this::
851e52325ffc Uploaded
mish
parents:
diff changeset
9 $ fasta_to_phylip seqs.fasta seqs.phylip
851e52325ffc Uploaded
mish
parents:
diff changeset
10 """
851e52325ffc Uploaded
mish
parents:
diff changeset
11
851e52325ffc Uploaded
mish
parents:
diff changeset
12 ###############################################################################
851e52325ffc Uploaded
mish
parents:
diff changeset
class Sequence(object):
    """One FASTA record: an identifier and its sequence string.

    Attributes:
        header: The identifier, i.e. the run of non-whitespace characters
            that immediately follows the leading ">" of the FASTA header
            line. Anything after the first whitespace is discarded.
        seq: The sequence string exactly as it was passed in.
    """

    def __init__(self, header, seq):
        """Build a record from a raw header line and a sequence string.

        Args:
            header: The full FASTA header line, starting with ">".
            seq: The sequence belonging to that header.

        Raises:
            IndexError: If *header* does not start with ">" directly
                followed by at least one non-whitespace character.
        """
        # Raw string: "\S" is not a valid string escape, so without the
        # r-prefix newer interpreters warn about the literal.
        match = re.match(r'>(\S+)', header)
        if match is None:
            # Same exception type as before, but say which header was bad
            # instead of failing with a bare "list index out of range".
            raise IndexError("Malformed FASTA header: %r" % (header,))
        self.header = match.group(1)
        self.seq = seq

    def __len__(self):
        """Return the number of characters in the sequence."""
        return len(self.seq)

    @property
    def phylip(self):
        """The record as one relaxed-phylip line: "<header> <seq>\\n".

        Dots in the sequence are written out as dashes.
        """
        return self.header + " " + self.seq.replace('.', '-') + "\n"

    @property
    def fasta(self):
        """The record as a two-line FASTA entry (sequence left untouched)."""
        return ">" + self.header + "\n" + self.seq + "\n"
851e52325ffc Uploaded
mish
parents:
diff changeset
31
851e52325ffc Uploaded
mish
parents:
diff changeset
def fasta_parse(path):
    """Lazily read the FASTA file at *path* and yield Sequence objects.

    Only one record is held in memory at a time. Sequence data that spans
    several lines is concatenated into a single string. Lines that appear
    before the first ">" header are ignored.

    Args:
        path: Path of the FASTA file to read.

    Yields:
        One Sequence per ">" header found in the file. Nothing is yielded
        for an empty file or a file that contains no header line.
    """
    header = ''
    # Collect the pieces in a list and join once per record instead of
    # growing a string with "+=" on every line.
    chunks = []
    with open(path) as f:
        for line in f:
            # Drop the line terminator, including the "\r" of CRLF files,
            # so that it does not end up inside the sequence.
            line = line.rstrip('\r\n')
            if line.startswith('>'):
                if header:
                    yield Sequence(header, ''.join(chunks))
                header = line
                chunks = []
                continue
            chunks.append(line)
    # Flush the last record, but only if a header was ever seen; otherwise
    # there is no record to build.
    if header:
        yield Sequence(header, ''.join(chunks))
851e52325ffc Uploaded
mish
parents:
diff changeset
47
851e52325ffc Uploaded
mish
parents:
diff changeset
48 ###############################################################################
851e52325ffc Uploaded
mish
parents:
diff changeset
49 # The libraries we need #
851e52325ffc Uploaded
mish
parents:
diff changeset
50 import sys, os, random, string, re
851e52325ffc Uploaded
mish
parents:
diff changeset
# Get the shell arguments #
fa_path = sys.argv[1]
ph_path = sys.argv[2]
# Check that the path is valid #
if not os.path.exists(fa_path): raise Exception("No file at %s." % fa_path)
# Use our two functions #
seqs = fasta_parse(fa_path)
# Write the records to a temporary file next to the output file. #
# mkstemp creates the file atomically with a unique name, so there is no
# risk of clashing with an existing file, and it works on Python 2 and 3.
import tempfile
tm_dir = os.path.dirname(os.path.abspath(ph_path))
tm_fd, tm_path = tempfile.mkstemp(prefix=os.path.basename(ph_path) + '.', dir=tm_dir)
try:
    # Count the sequences and remember the length of the last one #
    count = 0
    length = 0
    with os.fdopen(tm_fd, 'w') as f:
        for seq in seqs:
            f.write(seq.phylip)
            length = len(seq)
            count += 1
    # Without any record there is no alignment length to report; fail
    # with a clear message before the output file is created.
    if count == 0: raise Exception("No sequences found in %s." % fa_path)
    # Add number of entries and length at the top #
    # NOTE(review): the length written is that of the last sequence only;
    # the records are not checked for having equal lengths.
    with open(tm_path, 'r') as old, open(ph_path, 'w') as new:
        new.write(" " + str(count) + " " + str(length) + "\n")
        new.writelines(old)
finally:
    # Clean up, also when something above failed #
    os.remove(tm_path)