Mercurial > repos > nick > duplex
diff swalign.py @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/swalign.py Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,95 @@ +import os +import ctypes +import string + +script_dir = os.path.dirname(os.path.realpath(__file__)) +swalign = ctypes.cdll.LoadLibrary(os.path.join(script_dir, 'libswalign.so')) + +REVCOMP_TABLE = string.maketrans('acgtrymkbdhvACGTRYMKBDHV', 'tgcayrkmvhdbTGCAYRKMVHDB') + + +# C struct for ctypes +class SeqPairC(ctypes.Structure): + _fields_ = [ + ('a', ctypes.c_char_p), + ('alen', ctypes.c_uint), + ('b', ctypes.c_char_p), + ('blen', ctypes.c_uint), + ] + + +# C struct for ctypes +class AlignC(ctypes.Structure): + _fields_ = [ + ('seqs', ctypes.POINTER(SeqPairC)), + ('start_a', ctypes.c_int), + ('start_b', ctypes.c_int), + ('end_a', ctypes.c_int), + ('end_b', ctypes.c_int), + ('matches', ctypes.c_int), + ('score', ctypes.c_double), + ] + + +# The Python version +class Align(object): + def __init__(self, align_c): + self.target = align_c.seqs.contents.a + self.query = align_c.seqs.contents.b + # Where the first base of the target aligns on the query, in query coordinates (or 1, if <= 0). + self.start_target = align_c.start_a + # Where the first base of the query aligns on the target, in target coordinates (or 1, if <= 0). + self.start_query = align_c.start_b + # Where the last base of the target aligns on the query, in query coordinates. + self.end_target = align_c.end_a + # Where the last base of the query aligns on the target, in target coordinates. + self.end_query = align_c.end_b + self.matches = align_c.matches + self.score = align_c.score + + # Provide this common function. + def __str__(self): + """Print a human-readable representation of the alignment.""" + start_query = str(self.start_query) + start_target = str(self.start_target) + start_width = str(max(len(start_query), len(start_target))) + line_format = '{:'+start_width+'} {} {}' + output = line_format.format(start_target, self.target, self.end_target) + '\n' + output += line_format.format(start_query, self.query, self.end_query) + return output + + +# Initialize functions (define types). +swalign.smith_waterman.restype = ctypes.POINTER(AlignC) +swalign.revcomp.restype = ctypes.c_char_p + + +def smith_waterman(target, query): + seq_pair = SeqPairC(target, len(target), query, len(query)) + align_c = swalign.smith_waterman(ctypes.pointer(seq_pair), 1).contents + return Align(align_c) + + +def smith_waterman_duplex(target, query): + """Smith-Waterman align query to target in both orientations and return the best. + Convenience function that calls smith_waterman() twice, and returns the + alignment with the highest score.""" + align = smith_waterman(target, query) + query_rc = revcomp(query) + align_rc = smith_waterman(target, query_rc) + if align_rc.score > align.score: + return align_rc + else: + return align + + +def revcomp(seq): + """Return the reverse complement of the input sequence. + Leaves the input string unaltered.""" + return seq.translate(REVCOMP_TABLE)[::-1] + + +def revcomp_inplace(seq): + """Convert the input sequence to its reverse complement. + WARNING: This will alter the input string in-place!""" + swalign.revcomp(seq)
