diff swalign.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/swalign.py	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,95 @@
+import os
+import ctypes
+import string
+
+script_dir = os.path.dirname(os.path.realpath(__file__))
+swalign = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'libswalign.so'))
+
+REVCOMP_TABLE = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')
+
+
+# C struct for ctypes
+class SeqPairC(ctypes.Structure):
+  _fields_ = [
+    ('a', ctypes.c_char_p),
+    ('alen', ctypes.c_uint),
+    ('b', ctypes.c_char_p),
+    ('blen', ctypes.c_uint),
+  ]
+
+
+# C struct for ctypes
+class AlignC(ctypes.Structure):
+  _fields_ = [
+    ('seqs', ctypes.POINTER(SeqPairC)),
+    ('start_a', ctypes.c_int),
+    ('start_b', ctypes.c_int),
+    ('end_a', ctypes.c_int),
+    ('end_b', ctypes.c_int),
+    ('matches', ctypes.c_int),
+    ('score', ctypes.c_double),
+  ]
+
+
+# The Python version
+class Align(object):
+  def __init__(self, align_c):
+    self.target = align_c.seqs.contents.a
+    self.query = align_c.seqs.contents.b
+    # Where the first base of the target aligns on the query, in query coordinates (or 1, if <= 0).
+    self.start_target = align_c.start_a
+    # Where the first base of the query aligns on the target, in target coordinates (or 1, if <= 0).
+    self.start_query = align_c.start_b
+    # Where the last base of the target aligns on the query, in query coordinates.
+    self.end_target = align_c.end_a
+    # Where the last base of the query aligns on the target, in target coordinates.
+    self.end_query = align_c.end_b
+    self.matches = align_c.matches
+    self.score = align_c.score
+
+  # Provide this common function.
+  def __str__(self):
+    """Print a human-readable representation of the alignment."""
+    start_query = str(self.start_query)
+    start_target = str(self.start_target)
+    start_width = str(max(len(start_query), len(start_target)))
+    line_format = '{:'+start_width+'} {} {}'
+    output = line_format.format(start_target, self.target, self.end_target) + '\n'
+    output += line_format.format(start_query, self.query, self.end_query)
+    return output
+
+
+# Initialize functions (define types).
+swalign.smith_waterman.restype = ctypes.POINTER(AlignC)
+swalign.revcomp.restype = ctypes.c_char_p
+
+
+def smith_waterman(target, query):
+  seq_pair = SeqPairC(target, len(target), query, len(query))
+  align_c = swalign.smith_waterman(ctypes.pointer(seq_pair), 1).contents
+  return Align(align_c)
+
+
+def smith_waterman_duplex(target, query):
+  """Smith-Waterman align query to target in both orientations and return the best.
+  Convenience function that calls smith_waterman() twice, and returns the
+  alignment with the highest score."""
+  align = smith_waterman(target, query)
+  query_rc = revcomp(query)
+  align_rc = smith_waterman(target, query_rc)
+  if align_rc.score > align.score:
+    return align_rc
+  else:
+    return align
+
+
+def revcomp(seq):
+  """Return the reverse complement of the input sequence.
+  Leaves the input string unaltered."""
+  return seq.translate(REVCOMP_TABLE)[::-1]
+
+
+def revcomp_inplace(seq):
+  """Convert the input sequence to its reverse complement.
+  WARNING: This will alter the input string in-place!"""
+  swalign.revcomp(seq)