Mercurial > repos > nick > duplex
view align.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line source
#include <stdio.h> #include <stdlib.h> #include <string.h> #define NAIVE_TEST_WINDOW 6 #define NAIVE_TEST_THRES 0.80 #define NAIVE_TEST_MIN 2 #define NAIVE_WINDOW 10 #define NAIVE_THRES 0.80 typedef struct Gap { int seq; int coord; int length; struct Gap *next; } Gap; typedef struct Gaps { int length; struct Gap *root; struct Gap *tip; } Gaps; int _test_match(char *seq1, int start1, char *seq2, int start2); void add_gap(Gaps *gaps, int seq, int coord, int length); Gaps *make_gaps(); char *insert_gaps(Gaps *gaps, char *seq, int seq_num); // A naive algorithm for aligning two sequences which are expected to be very similar to each other // and already nearly aligned. void naive2(char *seq1, char *seq2) { Gaps *gaps = make_gaps(); int i = 0; int j = 0; int matches = 0; while (seq1[i] != 0 && seq2[j] != 0) { // Match? printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j); if (seq1[i] == seq2[j]) { matches++; i++; j++; continue; } printf("mismatch!\n"); // Mismatch. Start adding gaps until the mismatches go away. int new_i = i; int new_j = j; int gap_seq = 0; int success; while (1) { if (seq1[new_i] == 0 && seq2[new_j] == 0) { break; } success = _test_match(seq1, new_i, seq2, j); if (success) { gap_seq = 2; break; } if (seq1[new_i] != 0) { new_i++; } success = _test_match(seq1, i, seq2, new_j); if (success) { gap_seq = 1; break; } if (seq2[new_j] != 0) { new_j++; } } // Which sequence are we putting the gap in? if (gap_seq == 0) { printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j); // No good gap found. } else if (i == new_i && j == new_j) { printf("No gap required.\n"); } else if (gap_seq == 1) { printf("%dbp gap in seq1 at base %d.\n", new_j-j, j); add_gap(gaps, 1, j, new_j-j); j = new_j; } else if (gap_seq == 2) { printf("%dbp gap in seq2 at base %d.\n", new_i-i, i); add_gap(gaps, 2, i, new_i-i); i = new_i; } i++; j++; } char *new_seq1 = insert_gaps(gaps, seq1, 1); char *new_seq2 = insert_gaps(gaps, seq2, 2); printf("alignment:\n%s\n%s\n", new_seq1, new_seq2); } // Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with // few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match // percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be // more than NAIVE_TEST_MIN. int _test_match(char *seq1, int start1, char *seq2, int start2) { int matches = 0; int total = 0; char base1, base2; int i; for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) { base1 = seq1[start1+i]; base2 = seq2[start2+i]; if (base1 == 0 || base2 == 0) { break; } if (base1 == base2) { matches++; } total++; } return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES; } Gaps *make_gaps() { Gaps *gaps = malloc(sizeof(Gaps)); gaps->root = 0; gaps->tip = 0; gaps->length = 0; return gaps; } void add_gap(Gaps *gaps, int seq, int coord, int length) { Gap *gap = malloc(sizeof(Gap)); gap->next = 0; gap->seq = seq; gap->coord = coord; gap->length = length; if (gaps->root == 0) { gaps->root = gap; } else { gaps->tip->next = gap; } gaps->tip = gap; gaps->length++; } // Take gap information from the aligner and put them into the sequence string as "-" characters. char *insert_gaps(Gaps *gaps, char *seq, int seq_num) { if (gaps->root == 0) { return seq; } // How long should the new sequence be? int extra_len = 0; Gap *gap = gaps->root; while (gap) { if (gap->seq == seq_num) { extra_len += gap->length; } gap = gap->next; } //TODO: Handle a situation with no gaps. int new_len = extra_len + strlen(seq) + 1; char *new_seq = malloc(sizeof(char) * new_len); int i = 0; int j = 0; gap = gaps->root; while (gap) { // Check that it's a gap in our sequence. if (gap->seq != seq_num) { gap = gap->next; continue; } // Copy verbatim all the sequence until the gap. while (i <= gap->coord) { new_seq[j] = seq[i]; i++; j++; } // Add -'s the whole length of the gap. while (j < gap->coord + gap->length + 1) { new_seq[j] = '-'; j++; } gap = gap->next; } // Fill in the end sequence. while (seq[i]) { new_seq[j] = seq[i]; i++; j++; } new_seq[new_len-1] = 0; return new_seq; }