Mercurial > repos > nick > duplex
diff align.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/align.c Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,192 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#define NAIVE_TEST_WINDOW 6 +#define NAIVE_TEST_THRES 0.80 +#define NAIVE_TEST_MIN 2 +#define NAIVE_WINDOW 10 +#define NAIVE_THRES 0.80 + +typedef struct Gap { + int seq; + int coord; + int length; + struct Gap *next; +} Gap; + +typedef struct Gaps { + int length; + struct Gap *root; + struct Gap *tip; +} Gaps; + +int _test_match(char *seq1, int start1, char *seq2, int start2); +void add_gap(Gaps *gaps, int seq, int coord, int length); +Gaps *make_gaps(); +char *insert_gaps(Gaps *gaps, char *seq, int seq_num); + + +// A naive algorithm for aligning two sequences which are expected to be very similar to each other +// and already nearly aligned. +void naive2(char *seq1, char *seq2) { + Gaps *gaps = make_gaps(); + int i = 0; + int j = 0; + int matches = 0; + while (seq1[i] != 0 && seq2[j] != 0) { + // Match? + printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j); + if (seq1[i] == seq2[j]) { + matches++; + i++; + j++; + continue; + } + printf("mismatch!\n"); + // Mismatch. Start adding gaps until the mismatches go away. + int new_i = i; + int new_j = j; + int gap_seq = 0; + int success; + while (1) { + if (seq1[new_i] == 0 && seq2[new_j] == 0) { + break; + } + success = _test_match(seq1, new_i, seq2, j); + if (success) { + gap_seq = 2; + break; + } + if (seq1[new_i] != 0) { + new_i++; + } + success = _test_match(seq1, i, seq2, new_j); + if (success) { + gap_seq = 1; + break; + } + if (seq2[new_j] != 0) { + new_j++; + } + } + // Which sequence are we putting the gap in? + if (gap_seq == 0) { + printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j); + // No good gap found. + } else if (i == new_i && j == new_j) { + printf("No gap required.\n"); + } else if (gap_seq == 1) { + printf("%dbp gap in seq1 at base %d.\n", new_j-j, j); + add_gap(gaps, 1, j, new_j-j); + j = new_j; + } else if (gap_seq == 2) { + printf("%dbp gap in seq2 at base %d.\n", new_i-i, i); + add_gap(gaps, 2, i, new_i-i); + i = new_i; + } + i++; + j++; + } + + char *new_seq1 = insert_gaps(gaps, seq1, 1); + char *new_seq2 = insert_gaps(gaps, seq2, 2); + printf("alignment:\n%s\n%s\n", new_seq1, new_seq2); +} + +// Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with +// few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match +// percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be +// more than NAIVE_TEST_MIN. +int _test_match(char *seq1, int start1, char *seq2, int start2) { + int matches = 0; + int total = 0; + char base1, base2; + int i; + for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) { + base1 = seq1[start1+i]; + base2 = seq2[start2+i]; + if (base1 == 0 || base2 == 0) { + break; + } + if (base1 == base2) { + matches++; + } + total++; + } + return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES; +} + +Gaps *make_gaps() { + Gaps *gaps = malloc(sizeof(Gaps)); + gaps->root = 0; + gaps->tip = 0; + gaps->length = 0; + return gaps; +} + +void add_gap(Gaps *gaps, int seq, int coord, int length) { + Gap *gap = malloc(sizeof(Gap)); + gap->next = 0; + gap->seq = seq; + gap->coord = coord; + gap->length = length; + if (gaps->root == 0) { + gaps->root = gap; + } else { + gaps->tip->next = gap; + } + gaps->tip = gap; + gaps->length++; +} + +// Take gap information from the aligner and put them into the sequence string as "-" characters. +char *insert_gaps(Gaps *gaps, char *seq, int seq_num) { + if (gaps->root == 0) { + return seq; + } + + // How long should the new sequence be? + int extra_len = 0; + Gap *gap = gaps->root; + while (gap) { + if (gap->seq == seq_num) { + extra_len += gap->length; + } + gap = gap->next; + } + + //TODO: Handle a situation with no gaps. + int new_len = extra_len + strlen(seq) + 1; + char *new_seq = malloc(sizeof(char) * new_len); + int i = 0; + int j = 0; + gap = gaps->root; + while (gap) { + // Check that it's a gap in our sequence. + if (gap->seq != seq_num) { + gap = gap->next; + continue; + } + // Copy verbatim all the sequence until the gap. + while (i <= gap->coord) { + new_seq[j] = seq[i]; + i++; + j++; + } + // Add -'s the whole length of the gap. + while (j < gap->coord + gap->length + 1) { + new_seq[j] = '-'; + j++; + } + gap = gap->next; + } + // Fill in the end sequence. + while (seq[i]) { + new_seq[j] = seq[i]; + i++; + j++; + } + new_seq[new_len-1] = 0; + return new_seq; +}