Mercurial > repos > nick > duplex
diff swalign.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/swalign.c Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2010 Nicolaus Lance Hepler + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +// Note: That's an MIT license. +// All double-commented comments below are from Nicolaus Lance Hepler. +// Original repository: https://code.google.com/archive/p/swalign/ + +#include "swalign.h" + +// /* reverse a string in place, return str */ +static char* reverse(char *str) { + char *left = str; + char *right = left + strlen(str) - 1; + char tmp; + + while (left < right) { + tmp = *left; + *(left++) = *right; + *(right--) = tmp; + } + + return str; +} + +// Return the reverse complement of a sequence. +char* revcomp(char *str) { + char *left = str; + char *right = left + strlen(str) - 1; + char tmp; + + while (left < right) { + tmp = get_char_comp(*left); + *(left++) = get_char_comp(*right); + *(right--) = tmp; + } + + return str; +} + +// Return the complement of a base. +// Uses a simple lookup table: a string with the complements of all possible sequence characters. +static char get_char_comp(char c) { + int i = c - TRANS_OFFSET; + if (i < 0 || i > 57) { + return c; + } else { + return TRANS[i]; + } +} + +// // works globally +// Note: Currently the "local" flag isn't functional. It seems to always do a local alignment. +static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local) { + align_t *result = malloc(sizeof(align_t)); + seq_pair_t *seqs = malloc(sizeof(seq_pair_t)); + unsigned int i = S->m - 1; + unsigned int j = S->n - 1; + unsigned int k = 0; + // Create output strings. Allocate maximum potential length. + char c[S->m + S->n + 1]; + char d[S->m + S->n + 1]; + + memset(c, '\0', sizeof(c)); + memset(d, '\0', sizeof(d)); + + // This wasn't finished by NLH. Not functioning correctly yet. + // It seems the purpose is to start the traceback from the place where the score reaches its + // maximum instead of the very end (set i and j to those coordinates). + if (local == true) { + unsigned int l, m; + double max = FLT_MIN; + + for (l = 0; l < S->m; l++) { + for (m = 0; m < S->n; m++) { + if (S->mat[l][m].score > max) { + i = l; + j = m; + max = S->mat[l][m].score; + } + } + } + } + + double score = DBL_MIN; + int matches = 0; + int start_a = 0; + int start_b = 0; + int end_a = 0; + int end_b = 0; + bool move_i = false; + bool move_j = false; + // Walk back through the matrix from the end, taking the path determined by the "prev" values of + // each cell. Assemble the sequence along the way. + if (S->mat[i][j].prev[0] != 0 && S->mat[i][j].prev[1] != 0) { + while (i > 0 || j > 0) { + unsigned int new_i = S->mat[i][j].prev[0]; + unsigned int new_j = S->mat[i][j].prev[1]; + + // If we've moved in the i axis, add the new base to the sequence. Otherwise, it's a gap. + if (new_i < i) { + *(c+k) = *(problem->a+i-1); + move_i = true; + } else { + *(c+k) = '-'; + move_i = false; + } + + // If we've moved in the j axis, add the new base to the sequence. Otherwise, it's a gap. + if (new_j < j) { + *(d+k) = *(problem->b+j-1); + move_j = true; + } else { + *(d+k) = '-'; + move_j = false; + } + + if (S->mat[i][j].score > score) { + score = S->mat[i][j].score; + } + + if (move_i && move_j) { + if (*(c+k) == *(d+k)) { + matches++; + } + // Start and end of each sequence are in the coordinates of the other sequence. + start_a = new_j + 1; + start_b = new_i + 1; + if (! end_a) { + end_a = new_j + 1; + } + if (! end_b) { + end_b = new_i + 1; + } + } + + k++; + + i = new_i; + j = new_j; + } + } + + seqs->a = malloc(sizeof(char) * k + 1); + seqs->b = malloc(sizeof(char) * k + 1); + + memset(seqs->a, '\0', sizeof(seqs->a)); + memset(seqs->b, '\0', sizeof(seqs->b)); + + reverse(c); + reverse(d); + + strcpy(seqs->a, c); + strcpy(seqs->b, d); + + seqs->alen = k; + seqs->blen = k; + + result->seqs = seqs; + result->score = score; + result->matches = matches; + result->start_a = start_a; + result->start_b = start_b; + result->end_a = end_a; + result->end_b = end_b; + + return result; +} + +static matrix_t *create_matrix(unsigned int m, unsigned int n) { + matrix_t *S = malloc(sizeof(matrix_t)); + unsigned int i; + + S->m = m; + S->n = n; + + S->mat = malloc(sizeof(entry_t) * m * n); + + for (i = 0; i < m; i++) { + S->mat[i] = malloc(sizeof(entry_t) * n); + } + + return S; +} + +void destroy_matrix(matrix_t *S) { + unsigned int i; + for (i = 0; i < S->m; i++) { + free(S->mat[i]); + } + free(S->mat); + free(S); + return; +} + +// Print a visual representation of the path through the matrix. +void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair) { + int i, j; + for (i = 0; i < matrix->m; i++) { + if (i == 0) { + printf("\t\t"); + for (j = 0; j < seq_pair->blen; j++) { + printf("%c\t", seq_pair->b[j]); + } + printf("\n"); + printf(" "); + for (j = 0; j < matrix->n; j++) { + printf("%d\t", j); + } + printf("\n"); + } + if (i == 0) { + printf(" 0 "); + } else { + printf("%c %4d ", seq_pair->a[i-1], i); + } + for (j = 0; j < matrix->n; j++) { + printf("%d,%d|%0.0f\t", matrix->mat[i][j].prev[0], matrix->mat[i][j].prev[1], matrix->mat[i][j].score); + } + printf("\n"); + } +} + +void destroy_seq_pair(seq_pair_t *pair) { + free(pair->a); + free(pair->b); + free(pair); + return; +} + +align_t *smith_waterman(seq_pair_t *problem, bool local) { + unsigned int m = problem->alen + 1; + unsigned int n = problem->blen + 1; + matrix_t *S = create_matrix(m, n); + align_t *result; + unsigned int i, j, k, l; + + S->mat[0][0].score = 0; + S->mat[0][0].prev[0] = 0; + S->mat[0][0].prev[1] = 0; + + for (i = 1; i <= problem->alen; i++) { + S->mat[i][0].score = 0.0; + S->mat[i][0].prev[0] = i-1; + S->mat[i][0].prev[1] = 0; + } + + for (j = 1; j <= problem->blen; j++) { + S->mat[0][j].score = 0.0; + S->mat[0][j].prev[0] = 0; + S->mat[0][j].prev[1] = j-1; + } + + for (i = 1; i <= problem->alen; i++) { + for (j = 1; j <= problem->blen; j++) { + int nw_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH; + + S->mat[i][j].score = DBL_MIN; + S->mat[i][j].prev[0] = 0; + S->mat[i][j].prev[1] = 0; + + for (k = 0; k <= 1; k++) { + for (l = 0; l <= 1; l++) { + int val; + + if (k == 0 && l == 0) { + continue; + } else if (k > 0 && l > 0) { + val = nw_score; + } else if (k > 0 || l > 0) { + if ((i == problem->alen && k == 0) || + (j == problem->blen && l == 0)) + val = 0.0; + else + val = GAP; + } else { + // do nothing.. + } + + val += S->mat[i-k][j-l].score; + + if (val > S->mat[i][j].score) { + S->mat[i][j].score = val; + S->mat[i][j].prev[0] = i-k; + S->mat[i][j].prev[1] = j-l; + } + } + } + } + } + + result = traceback(problem, S, local); + + // print_matrix(S, problem); + + destroy_matrix(S); + + return result; +} + +void print_alignment(align_t *result, int target_len, int query_len) { + printf("Score: %0.0f Matches: %d\n", result->score, result->matches); + printf("Target: %3d %s %-3d\n", result->start_a, result->seqs->a, result->end_a); + printf("Query: %3d %s %-3d\n", result->start_b, result->seqs->b, result->end_b); +} + +int main(int argc, const char **argv) { + + if (argc != 3) { + printf("usage: swalign TARGET_SEQ QUERY_SEQ\n"); + exit(1); + } + + { + seq_pair_t problem; + align_t *result; + char c[strlen(argv[1])], d[strlen(argv[2])]; + + strcpy(c, argv[1]); + strcpy(d, argv[2]); + + problem.a = c; + problem.alen = strlen(problem.a); + problem.b = d; + problem.blen = strlen(problem.b); + + result = smith_waterman(&problem, false); + + print_alignment(result, problem.alen, problem.blen); + } + + exit(0); +}
