diff align.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/align.c	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,192 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define NAIVE_TEST_WINDOW 6
+#define NAIVE_TEST_THRES 0.80
+#define NAIVE_TEST_MIN 2
+#define NAIVE_WINDOW 10
+#define NAIVE_THRES 0.80
+
+typedef struct Gap {
+  int seq;
+  int coord;
+  int length;
+  struct Gap *next;
+} Gap;
+
+typedef struct Gaps {
+  int length;
+  struct Gap *root;
+  struct Gap *tip;
+} Gaps;
+
+int _test_match(char *seq1, int start1, char *seq2, int start2);
+void add_gap(Gaps *gaps, int seq, int coord, int length);
+Gaps *make_gaps();
+char *insert_gaps(Gaps *gaps, char *seq, int seq_num);
+
+
+// A naive algorithm for aligning two sequences which are expected to be very similar to each other
+// and already nearly aligned.
+void naive2(char *seq1, char *seq2) {
+  Gaps *gaps = make_gaps();
+  int i = 0;
+  int j = 0;
+  int matches = 0;
+  while (seq1[i] != 0 && seq2[j] != 0) {
+    // Match?
+    printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j);
+    if (seq1[i] == seq2[j]) {
+      matches++;
+      i++;
+      j++;
+      continue;
+    }
+    printf("mismatch!\n");
+    // Mismatch. Start adding gaps until the mismatches go away.
+    int new_i = i;
+    int new_j = j;
+    int gap_seq = 0;
+    int success;
+    while (1) {
+      if (seq1[new_i] == 0 && seq2[new_j] == 0) {
+        break;
+      }
+      success = _test_match(seq1, new_i, seq2, j);
+      if (success) {
+        gap_seq = 2;
+        break;
+      }
+      if (seq1[new_i] != 0) {
+        new_i++;
+      }
+      success = _test_match(seq1, i, seq2, new_j);
+      if (success) {
+        gap_seq = 1;
+        break;
+      }
+      if (seq2[new_j] != 0) {
+        new_j++;
+      }
+    }
+    // Which sequence are we putting the gap in?
+    if (gap_seq == 0) {
+      printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j);
+      // No good gap found.
+    } else if (i == new_i && j == new_j) {
+      printf("No gap required.\n");
+    } else if (gap_seq == 1) {
+      printf("%dbp gap in seq1 at base %d.\n", new_j-j, j);
+      add_gap(gaps, 1, j, new_j-j);
+      j = new_j;
+    } else if (gap_seq == 2) {
+      printf("%dbp gap in seq2 at base %d.\n", new_i-i, i);
+      add_gap(gaps, 2, i, new_i-i);
+      i = new_i;
+    }
+    i++;
+    j++;
+  }
+
+  char *new_seq1 = insert_gaps(gaps, seq1, 1);
+  char *new_seq2 = insert_gaps(gaps, seq2, 2);
+  printf("alignment:\n%s\n%s\n", new_seq1, new_seq2);
+}
+
+// Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with
+// few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match
+// percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be
+// more than NAIVE_TEST_MIN.
+int _test_match(char *seq1, int start1, char *seq2, int start2) {
+  int matches = 0;
+  int total = 0;
+  char base1, base2;
+  int i;
+  for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) {
+    base1 = seq1[start1+i];
+    base2 = seq2[start2+i];
+    if (base1 == 0 || base2 == 0) {
+      break;
+    }
+    if (base1 == base2) {
+      matches++;
+    }
+    total++;
+  }
+  return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES;
+}
+
+Gaps *make_gaps() {
+  Gaps *gaps = malloc(sizeof(Gaps));
+  gaps->root = 0;
+  gaps->tip = 0;
+  gaps->length = 0;
+  return gaps;
+}
+
+void add_gap(Gaps *gaps, int seq, int coord, int length) {
+  Gap *gap = malloc(sizeof(Gap));
+  gap->next = 0;
+  gap->seq = seq;
+  gap->coord = coord;
+  gap->length = length;
+  if (gaps->root == 0) {
+    gaps->root = gap;
+  } else {
+    gaps->tip->next = gap;
+  }
+  gaps->tip = gap;
+  gaps->length++;
+}
+
+// Take gap information from the aligner and put them into the sequence string as "-" characters.
+char *insert_gaps(Gaps *gaps, char *seq, int seq_num) {
+  if (gaps->root == 0) {
+    return seq;
+  }
+
+  // How long should the new sequence be?
+  int extra_len = 0;
+  Gap *gap = gaps->root;
+  while (gap) {
+    if (gap->seq == seq_num) {
+      extra_len += gap->length;
+    }
+    gap = gap->next;
+  }
+
+  //TODO: Handle a situation with no gaps.
+  int new_len = extra_len + strlen(seq) + 1;
+  char *new_seq = malloc(sizeof(char) * new_len);
+  int i = 0;
+  int j = 0;
+  gap = gaps->root;
+  while (gap) {
+    // Check that it's a gap in our sequence.
+    if (gap->seq != seq_num) {
+      gap = gap->next;
+      continue;
+    }
+    // Copy verbatim all the sequence until the gap.
+    while (i <= gap->coord) {
+      new_seq[j] = seq[i];
+      i++;
+      j++;
+    }
+    // Add -'s the whole length of the gap.
+    while (j < gap->coord + gap->length + 1) {
+      new_seq[j] = '-';
+      j++;
+    }
+    gap = gap->next;
+  }
+  // Fill in the end sequence.
+  while (seq[i]) {
+    new_seq[j] = seq[i];
+    i++;
+    j++;
+  }
+  new_seq[new_len-1] = 0;
+  return new_seq;
+}