Mercurial > repos > nick > duplex
view swalign.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
line wrap: on
line source
/* * Copyright (c) 2010 Nicolaus Lance Hepler * * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated documentation * files (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, * copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ // Note: That's an MIT license. // All double-commented comments below are from Nicolaus Lance Hepler. // Original repository: https://code.google.com/archive/p/swalign/ #include "swalign.h" // /* reverse a string in place, return str */ static char* reverse(char *str) { char *left = str; char *right = left + strlen(str) - 1; char tmp; while (left < right) { tmp = *left; *(left++) = *right; *(right--) = tmp; } return str; } // Return the reverse complement of a sequence. char* revcomp(char *str) { char *left = str; char *right = left + strlen(str) - 1; char tmp; while (left < right) { tmp = get_char_comp(*left); *(left++) = get_char_comp(*right); *(right--) = tmp; } return str; } // Return the complement of a base. // Uses a simple lookup table: a string with the complements of all possible sequence characters. static char get_char_comp(char c) { int i = c - TRANS_OFFSET; if (i < 0 || i > 57) { return c; } else { return TRANS[i]; } } // // works globally // Note: Currently the "local" flag isn't functional. It seems to always do a local alignment. static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local) { align_t *result = malloc(sizeof(align_t)); seq_pair_t *seqs = malloc(sizeof(seq_pair_t)); unsigned int i = S->m - 1; unsigned int j = S->n - 1; unsigned int k = 0; // Create output strings. Allocate maximum potential length. char c[S->m + S->n + 1]; char d[S->m + S->n + 1]; memset(c, '\0', sizeof(c)); memset(d, '\0', sizeof(d)); // This wasn't finished by NLH. Not functioning correctly yet. // It seems the purpose is to start the traceback from the place where the score reaches its // maximum instead of the very end (set i and j to those coordinates). if (local == true) { unsigned int l, m; double max = FLT_MIN; for (l = 0; l < S->m; l++) { for (m = 0; m < S->n; m++) { if (S->mat[l][m].score > max) { i = l; j = m; max = S->mat[l][m].score; } } } } double score = DBL_MIN; int matches = 0; int start_a = 0; int start_b = 0; int end_a = 0; int end_b = 0; bool move_i = false; bool move_j = false; // Walk back through the matrix from the end, taking the path determined by the "prev" values of // each cell. Assemble the sequence along the way. if (S->mat[i][j].prev[0] != 0 && S->mat[i][j].prev[1] != 0) { while (i > 0 || j > 0) { unsigned int new_i = S->mat[i][j].prev[0]; unsigned int new_j = S->mat[i][j].prev[1]; // If we've moved in the i axis, add the new base to the sequence. Otherwise, it's a gap. if (new_i < i) { *(c+k) = *(problem->a+i-1); move_i = true; } else { *(c+k) = '-'; move_i = false; } // If we've moved in the j axis, add the new base to the sequence. Otherwise, it's a gap. if (new_j < j) { *(d+k) = *(problem->b+j-1); move_j = true; } else { *(d+k) = '-'; move_j = false; } if (S->mat[i][j].score > score) { score = S->mat[i][j].score; } if (move_i && move_j) { if (*(c+k) == *(d+k)) { matches++; } // Start and end of each sequence are in the coordinates of the other sequence. start_a = new_j + 1; start_b = new_i + 1; if (! end_a) { end_a = new_j + 1; } if (! end_b) { end_b = new_i + 1; } } k++; i = new_i; j = new_j; } } seqs->a = malloc(sizeof(char) * k + 1); seqs->b = malloc(sizeof(char) * k + 1); memset(seqs->a, '\0', sizeof(seqs->a)); memset(seqs->b, '\0', sizeof(seqs->b)); reverse(c); reverse(d); strcpy(seqs->a, c); strcpy(seqs->b, d); seqs->alen = k; seqs->blen = k; result->seqs = seqs; result->score = score; result->matches = matches; result->start_a = start_a; result->start_b = start_b; result->end_a = end_a; result->end_b = end_b; return result; } static matrix_t *create_matrix(unsigned int m, unsigned int n) { matrix_t *S = malloc(sizeof(matrix_t)); unsigned int i; S->m = m; S->n = n; S->mat = malloc(sizeof(entry_t) * m * n); for (i = 0; i < m; i++) { S->mat[i] = malloc(sizeof(entry_t) * n); } return S; } void destroy_matrix(matrix_t *S) { unsigned int i; for (i = 0; i < S->m; i++) { free(S->mat[i]); } free(S->mat); free(S); return; } // Print a visual representation of the path through the matrix. void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair) { int i, j; for (i = 0; i < matrix->m; i++) { if (i == 0) { printf("\t\t"); for (j = 0; j < seq_pair->blen; j++) { printf("%c\t", seq_pair->b[j]); } printf("\n"); printf(" "); for (j = 0; j < matrix->n; j++) { printf("%d\t", j); } printf("\n"); } if (i == 0) { printf(" 0 "); } else { printf("%c %4d ", seq_pair->a[i-1], i); } for (j = 0; j < matrix->n; j++) { printf("%d,%d|%0.0f\t", matrix->mat[i][j].prev[0], matrix->mat[i][j].prev[1], matrix->mat[i][j].score); } printf("\n"); } } void destroy_seq_pair(seq_pair_t *pair) { free(pair->a); free(pair->b); free(pair); return; } align_t *smith_waterman(seq_pair_t *problem, bool local) { unsigned int m = problem->alen + 1; unsigned int n = problem->blen + 1; matrix_t *S = create_matrix(m, n); align_t *result; unsigned int i, j, k, l; S->mat[0][0].score = 0; S->mat[0][0].prev[0] = 0; S->mat[0][0].prev[1] = 0; for (i = 1; i <= problem->alen; i++) { S->mat[i][0].score = 0.0; S->mat[i][0].prev[0] = i-1; S->mat[i][0].prev[1] = 0; } for (j = 1; j <= problem->blen; j++) { S->mat[0][j].score = 0.0; S->mat[0][j].prev[0] = 0; S->mat[0][j].prev[1] = j-1; } for (i = 1; i <= problem->alen; i++) { for (j = 1; j <= problem->blen; j++) { int nw_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH; S->mat[i][j].score = DBL_MIN; S->mat[i][j].prev[0] = 0; S->mat[i][j].prev[1] = 0; for (k = 0; k <= 1; k++) { for (l = 0; l <= 1; l++) { int val; if (k == 0 && l == 0) { continue; } else if (k > 0 && l > 0) { val = nw_score; } else if (k > 0 || l > 0) { if ((i == problem->alen && k == 0) || (j == problem->blen && l == 0)) val = 0.0; else val = GAP; } else { // do nothing.. } val += S->mat[i-k][j-l].score; if (val > S->mat[i][j].score) { S->mat[i][j].score = val; S->mat[i][j].prev[0] = i-k; S->mat[i][j].prev[1] = j-l; } } } } } result = traceback(problem, S, local); // print_matrix(S, problem); destroy_matrix(S); return result; } void print_alignment(align_t *result, int target_len, int query_len) { printf("Score: %0.0f Matches: %d\n", result->score, result->matches); printf("Target: %3d %s %-3d\n", result->start_a, result->seqs->a, result->end_a); printf("Query: %3d %s %-3d\n", result->start_b, result->seqs->b, result->end_b); } int main(int argc, const char **argv) { if (argc != 3) { printf("usage: swalign TARGET_SEQ QUERY_SEQ\n"); exit(1); } { seq_pair_t problem; align_t *result; char c[strlen(argv[1])], d[strlen(argv[2])]; strcpy(c, argv[1]); strcpy(d, argv[2]); problem.a = c; problem.alen = strlen(problem.a); problem.b = d; problem.blen = strlen(problem.b); result = smith_waterman(&problem, false); print_alignment(result, problem.alen, problem.blen); } exit(0); }