comparison swalign.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 /*
2 * Copyright (c) 2010 Nicolaus Lance Hepler
3 *
4 * Permission is hereby granted, free of charge, to any person
5 * obtaining a copy of this software and associated documentation
6 * files (the "Software"), to deal in the Software without
7 * restriction, including without limitation the rights to use,
8 * copy, modify, merge, publish, distribute, sublicense, and/or sell
9 * copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following
11 * conditions:
12 *
13 * The above copyright notice and this permission notice shall be
14 * included in all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
18 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
23 * OTHER DEALINGS IN THE SOFTWARE.
24 */
25 // Note: That's an MIT license.
26 // All double-commented comments below are from Nicolaus Lance Hepler.
27 // Original repository: https://code.google.com/archive/p/swalign/
28
29 #include "swalign.h"
30
31 // /* reverse a string in place, return str */
32 static char* reverse(char *str) {
33 char *left = str;
34 char *right = left + strlen(str) - 1;
35 char tmp;
36
37 while (left < right) {
38 tmp = *left;
39 *(left++) = *right;
40 *(right--) = tmp;
41 }
42
43 return str;
44 }
45
46 // Return the reverse complement of a sequence.
47 char* revcomp(char *str) {
48 char *left = str;
49 char *right = left + strlen(str) - 1;
50 char tmp;
51
52 while (left < right) {
53 tmp = get_char_comp(*left);
54 *(left++) = get_char_comp(*right);
55 *(right--) = tmp;
56 }
57
58 return str;
59 }
60
61 // Return the complement of a base.
62 // Uses a simple lookup table: a string with the complements of all possible sequence characters.
63 static char get_char_comp(char c) {
64 int i = c - TRANS_OFFSET;
65 if (i < 0 || i > 57) {
66 return c;
67 } else {
68 return TRANS[i];
69 }
70 }
71
72 // // works globally
73 // Note: Currently the "local" flag isn't functional. It seems to always do a local alignment.
74 static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local) {
75 align_t *result = malloc(sizeof(align_t));
76 seq_pair_t *seqs = malloc(sizeof(seq_pair_t));
77 unsigned int i = S->m - 1;
78 unsigned int j = S->n - 1;
79 unsigned int k = 0;
80 // Create output strings. Allocate maximum potential length.
81 char c[S->m + S->n + 1];
82 char d[S->m + S->n + 1];
83
84 memset(c, '\0', sizeof(c));
85 memset(d, '\0', sizeof(d));
86
87 // This wasn't finished by NLH. Not functioning correctly yet.
88 // It seems the purpose is to start the traceback from the place where the score reaches its
89 // maximum instead of the very end (set i and j to those coordinates).
90 if (local == true) {
91 unsigned int l, m;
92 double max = FLT_MIN;
93
94 for (l = 0; l < S->m; l++) {
95 for (m = 0; m < S->n; m++) {
96 if (S->mat[l][m].score > max) {
97 i = l;
98 j = m;
99 max = S->mat[l][m].score;
100 }
101 }
102 }
103 }
104
105 double score = DBL_MIN;
106 int matches = 0;
107 int start_a = 0;
108 int start_b = 0;
109 int end_a = 0;
110 int end_b = 0;
111 bool move_i = false;
112 bool move_j = false;
113 // Walk back through the matrix from the end, taking the path determined by the "prev" values of
114 // each cell. Assemble the sequence along the way.
115 if (S->mat[i][j].prev[0] != 0 && S->mat[i][j].prev[1] != 0) {
116 while (i > 0 || j > 0) {
117 unsigned int new_i = S->mat[i][j].prev[0];
118 unsigned int new_j = S->mat[i][j].prev[1];
119
120 // If we've moved in the i axis, add the new base to the sequence. Otherwise, it's a gap.
121 if (new_i < i) {
122 *(c+k) = *(problem->a+i-1);
123 move_i = true;
124 } else {
125 *(c+k) = '-';
126 move_i = false;
127 }
128
129 // If we've moved in the j axis, add the new base to the sequence. Otherwise, it's a gap.
130 if (new_j < j) {
131 *(d+k) = *(problem->b+j-1);
132 move_j = true;
133 } else {
134 *(d+k) = '-';
135 move_j = false;
136 }
137
138 if (S->mat[i][j].score > score) {
139 score = S->mat[i][j].score;
140 }
141
142 if (move_i && move_j) {
143 if (*(c+k) == *(d+k)) {
144 matches++;
145 }
146 // Start and end of each sequence are in the coordinates of the other sequence.
147 start_a = new_j + 1;
148 start_b = new_i + 1;
149 if (! end_a) {
150 end_a = new_j + 1;
151 }
152 if (! end_b) {
153 end_b = new_i + 1;
154 }
155 }
156
157 k++;
158
159 i = new_i;
160 j = new_j;
161 }
162 }
163
164 seqs->a = malloc(sizeof(char) * k + 1);
165 seqs->b = malloc(sizeof(char) * k + 1);
166
167 memset(seqs->a, '\0', sizeof(seqs->a));
168 memset(seqs->b, '\0', sizeof(seqs->b));
169
170 reverse(c);
171 reverse(d);
172
173 strcpy(seqs->a, c);
174 strcpy(seqs->b, d);
175
176 seqs->alen = k;
177 seqs->blen = k;
178
179 result->seqs = seqs;
180 result->score = score;
181 result->matches = matches;
182 result->start_a = start_a;
183 result->start_b = start_b;
184 result->end_a = end_a;
185 result->end_b = end_b;
186
187 return result;
188 }
189
190 static matrix_t *create_matrix(unsigned int m, unsigned int n) {
191 matrix_t *S = malloc(sizeof(matrix_t));
192 unsigned int i;
193
194 S->m = m;
195 S->n = n;
196
197 S->mat = malloc(sizeof(entry_t) * m * n);
198
199 for (i = 0; i < m; i++) {
200 S->mat[i] = malloc(sizeof(entry_t) * n);
201 }
202
203 return S;
204 }
205
206 void destroy_matrix(matrix_t *S) {
207 unsigned int i;
208 for (i = 0; i < S->m; i++) {
209 free(S->mat[i]);
210 }
211 free(S->mat);
212 free(S);
213 return;
214 }
215
216 // Print a visual representation of the path through the matrix.
217 void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair) {
218 int i, j;
219 for (i = 0; i < matrix->m; i++) {
220 if (i == 0) {
221 printf("\t\t");
222 for (j = 0; j < seq_pair->blen; j++) {
223 printf("%c\t", seq_pair->b[j]);
224 }
225 printf("\n");
226 printf(" ");
227 for (j = 0; j < matrix->n; j++) {
228 printf("%d\t", j);
229 }
230 printf("\n");
231 }
232 if (i == 0) {
233 printf(" 0 ");
234 } else {
235 printf("%c %4d ", seq_pair->a[i-1], i);
236 }
237 for (j = 0; j < matrix->n; j++) {
238 printf("%d,%d|%0.0f\t", matrix->mat[i][j].prev[0], matrix->mat[i][j].prev[1], matrix->mat[i][j].score);
239 }
240 printf("\n");
241 }
242 }
243
244 void destroy_seq_pair(seq_pair_t *pair) {
245 free(pair->a);
246 free(pair->b);
247 free(pair);
248 return;
249 }
250
251 align_t *smith_waterman(seq_pair_t *problem, bool local) {
252 unsigned int m = problem->alen + 1;
253 unsigned int n = problem->blen + 1;
254 matrix_t *S = create_matrix(m, n);
255 align_t *result;
256 unsigned int i, j, k, l;
257
258 S->mat[0][0].score = 0;
259 S->mat[0][0].prev[0] = 0;
260 S->mat[0][0].prev[1] = 0;
261
262 for (i = 1; i <= problem->alen; i++) {
263 S->mat[i][0].score = 0.0;
264 S->mat[i][0].prev[0] = i-1;
265 S->mat[i][0].prev[1] = 0;
266 }
267
268 for (j = 1; j <= problem->blen; j++) {
269 S->mat[0][j].score = 0.0;
270 S->mat[0][j].prev[0] = 0;
271 S->mat[0][j].prev[1] = j-1;
272 }
273
274 for (i = 1; i <= problem->alen; i++) {
275 for (j = 1; j <= problem->blen; j++) {
276 int nw_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH;
277
278 S->mat[i][j].score = DBL_MIN;
279 S->mat[i][j].prev[0] = 0;
280 S->mat[i][j].prev[1] = 0;
281
282 for (k = 0; k <= 1; k++) {
283 for (l = 0; l <= 1; l++) {
284 int val;
285
286 if (k == 0 && l == 0) {
287 continue;
288 } else if (k > 0 && l > 0) {
289 val = nw_score;
290 } else if (k > 0 || l > 0) {
291 if ((i == problem->alen && k == 0) ||
292 (j == problem->blen && l == 0))
293 val = 0.0;
294 else
295 val = GAP;
296 } else {
297 // do nothing..
298 }
299
300 val += S->mat[i-k][j-l].score;
301
302 if (val > S->mat[i][j].score) {
303 S->mat[i][j].score = val;
304 S->mat[i][j].prev[0] = i-k;
305 S->mat[i][j].prev[1] = j-l;
306 }
307 }
308 }
309 }
310 }
311
312 result = traceback(problem, S, local);
313
314 // print_matrix(S, problem);
315
316 destroy_matrix(S);
317
318 return result;
319 }
320
321 void print_alignment(align_t *result, int target_len, int query_len) {
322 printf("Score: %0.0f Matches: %d\n", result->score, result->matches);
323 printf("Target: %3d %s %-3d\n", result->start_a, result->seqs->a, result->end_a);
324 printf("Query: %3d %s %-3d\n", result->start_b, result->seqs->b, result->end_b);
325 }
326
327 int main(int argc, const char **argv) {
328
329 if (argc != 3) {
330 printf("usage: swalign TARGET_SEQ QUERY_SEQ\n");
331 exit(1);
332 }
333
334 {
335 seq_pair_t problem;
336 align_t *result;
337 char c[strlen(argv[1])], d[strlen(argv[2])];
338
339 strcpy(c, argv[1]);
340 strcpy(d, argv[2]);
341
342 problem.a = c;
343 problem.alen = strlen(problem.a);
344 problem.b = d;
345 problem.blen = strlen(problem.b);
346
347 result = smith_waterman(&problem, false);
348
349 print_alignment(result, problem.alen, problem.blen);
350 }
351
352 exit(0);
353 }