diff abims_fasta2phylip.py @ 0:851e52325ffc draft

Uploaded
author mish
date Wed, 24 Jul 2013 08:35:01 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/abims_fasta2phylip.py	Wed Jul 24 08:35:01 2013 -0400
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+
+"""
+Convert fasta alignemnts to relaxed phylip ones in constant memory.
+Written by Lucas Sinclair.
+Kopimi.
+
+You can use this script from the shell like this::
+$ fasta_to_phylip seqs.fasta seqs.phylip
+"""
+
+###############################################################################
+class Sequence(object):
+    """The Sequence object has a string *header* and
+    various representations."""
+
+    def __init__(self, header, seq):
+        self.header = re.findall('^>(\S+)', header)[0]
+        self.seq = seq
+
+    def __len__(self):
+        return len(self.seq)
+
+    @property
+    def phylip(self):
+        return self.header + " " + self.seq.replace('.','-') + "\n"
+
+    @property
+    def fasta(self):
+        return ">" + self.header + "\n" + self.seq + "\n"
+
+def fasta_parse(path):
+    """Reads the file at *path* and yields
+       Sequence objects in a lazy fashion"""
+    header = ''
+    seq = ''
+    with open(path) as f:
+        for line in f:
+            line = line.strip('\n')
+            if line.startswith('>'):
+                if header: yield Sequence(header, seq)
+                header = line
+                seq = ''
+                continue
+            seq += line
+    yield Sequence(header, seq)
+
+###############################################################################
+# The libraries we need #
+import sys, os, random, string, re
+# Get the shell arguments #
+fa_path = sys.argv[1]
+ph_path = sys.argv[2]
+# Check that the path is valid #
+if not os.path.exists(fa_path): raise Exception("No file at %s." % fa_path)
+# Use our two functions #
+seqs = fasta_parse(fa_path)
+# Write the output to temporary file #
+tm_path = ph_path + '.' + ''.join(random.choice(string.letters) for i in xrange(10))
+# Count the sequences #
+count = 0
+with open(tm_path, 'w') as f:
+    for seq in seqs:
+        f.write(seq.phylip)
+        count += 1
+# Add number of entries and length at the top #
+with open(tm_path, 'r') as old, open(ph_path, 'w') as new:
+    new.write(" " + str(count) + " " + str(len(seq)) + "\n")
+    new.writelines(old)
+# Clean up #
+os.remove(tm_path)
\ No newline at end of file