view swalign.py @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line source

import os
import ctypes
import string

script_dir = os.path.dirname(os.path.realpath(__file__))
swalign = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'libswalign.so'))

REVCOMP_TABLE = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB')


# C struct for ctypes
class SeqPairC(ctypes.Structure):
  _fields_ = [
    ('a', ctypes.c_char_p),
    ('alen', ctypes.c_uint),
    ('b', ctypes.c_char_p),
    ('blen', ctypes.c_uint),
  ]


# C struct for ctypes
class AlignC(ctypes.Structure):
  _fields_ = [
    ('seqs', ctypes.POINTER(SeqPairC)),
    ('start_a', ctypes.c_int),
    ('start_b', ctypes.c_int),
    ('end_a', ctypes.c_int),
    ('end_b', ctypes.c_int),
    ('matches', ctypes.c_int),
    ('score', ctypes.c_double),
  ]


# The Python version
class Align(object):
  def __init__(self, align_c):
    self.target = align_c.seqs.contents.a
    self.query = align_c.seqs.contents.b
    # Where the first base of the target aligns on the query, in query coordinates (or 1, if <= 0).
    self.start_target = align_c.start_a
    # Where the first base of the query aligns on the target, in target coordinates (or 1, if <= 0).
    self.start_query = align_c.start_b
    # Where the last base of the target aligns on the query, in query coordinates.
    self.end_target = align_c.end_a
    # Where the last base of the query aligns on the target, in target coordinates.
    self.end_query = align_c.end_b
    self.matches = align_c.matches
    self.score = align_c.score

  # Provide this common function.
  def __str__(self):
    """Print a human-readable representation of the alignment."""
    start_query = str(self.start_query)
    start_target = str(self.start_target)
    start_width = str(max(len(start_query), len(start_target)))
    line_format = '{:'+start_width+'} {} {}'
    output = line_format.format(start_target, self.target, self.end_target) + '\n'
    output += line_format.format(start_query, self.query, self.end_query)
    return output


# Initialize functions (define types).
swalign.smith_waterman.restype = ctypes.POINTER(AlignC)
swalign.revcomp.restype = ctypes.c_char_p


def smith_waterman(target, query):
  seq_pair = SeqPairC(target, len(target), query, len(query))
  align_c = swalign.smith_waterman(ctypes.pointer(seq_pair), 1).contents
  return Align(align_c)


def smith_waterman_duplex(target, query):
  """Smith-Waterman align query to target in both orientations and return the best.
  Convenience function that calls smith_waterman() twice, and returns the
  alignment with the highest score."""
  align = smith_waterman(target, query)
  query_rc = revcomp(query)
  align_rc = smith_waterman(target, query_rc)
  if align_rc.score > align.score:
    return align_rc
  else:
    return align


def revcomp(seq):
  """Return the reverse complement of the input sequence.
  Leaves the input string unaltered."""
  return seq.translate(REVCOMP_TABLE)[::-1]


def revcomp_inplace(seq):
  """Convert the input sequence to its reverse complement.
  WARNING: This will alter the input string in-place!"""
  swalign.revcomp(seq)