Mercurial > repos > nick > duplex
comparison swalign.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 /* | |
2 * Copyright (c) 2010 Nicolaus Lance Hepler | |
3 * | |
4 * Permission is hereby granted, free of charge, to any person | |
5 * obtaining a copy of this software and associated documentation | |
6 * files (the "Software"), to deal in the Software without | |
7 * restriction, including without limitation the rights to use, | |
8 * copy, modify, merge, publish, distribute, sublicense, and/or sell | |
9 * copies of the Software, and to permit persons to whom the | |
10 * Software is furnished to do so, subject to the following | |
11 * conditions: | |
12 * | |
13 * The above copyright notice and this permission notice shall be | |
14 * included in all copies or substantial portions of the Software. | |
15 * | |
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
17 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
18 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
19 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
20 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
21 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
23 * OTHER DEALINGS IN THE SOFTWARE. | |
24 */ | |
25 // Note: That's an MIT license. | |
26 // All double-commented comments below are from Nicolaus Lance Hepler. | |
27 // Original repository: https://code.google.com/archive/p/swalign/ | |
28 | |
29 #include "swalign.h" | |
30 | |
31 // /* reverse a string in place, return str */ | |
32 static char* reverse(char *str) { | |
33 char *left = str; | |
34 char *right = left + strlen(str) - 1; | |
35 char tmp; | |
36 | |
37 while (left < right) { | |
38 tmp = *left; | |
39 *(left++) = *right; | |
40 *(right--) = tmp; | |
41 } | |
42 | |
43 return str; | |
44 } | |
45 | |
46 // Return the reverse complement of a sequence. | |
47 char* revcomp(char *str) { | |
48 char *left = str; | |
49 char *right = left + strlen(str) - 1; | |
50 char tmp; | |
51 | |
52 while (left < right) { | |
53 tmp = get_char_comp(*left); | |
54 *(left++) = get_char_comp(*right); | |
55 *(right--) = tmp; | |
56 } | |
57 | |
58 return str; | |
59 } | |
60 | |
61 // Return the complement of a base. | |
62 // Uses a simple lookup table: a string with the complements of all possible sequence characters. | |
63 static char get_char_comp(char c) { | |
64 int i = c - TRANS_OFFSET; | |
65 if (i < 0 || i > 57) { | |
66 return c; | |
67 } else { | |
68 return TRANS[i]; | |
69 } | |
70 } | |
71 | |
72 // // works globally | |
73 // Note: Currently the "local" flag isn't functional. It seems to always do a local alignment. | |
74 static align_t *traceback(seq_pair_t *problem, matrix_t *S, bool local) { | |
75 align_t *result = malloc(sizeof(align_t)); | |
76 seq_pair_t *seqs = malloc(sizeof(seq_pair_t)); | |
77 unsigned int i = S->m - 1; | |
78 unsigned int j = S->n - 1; | |
79 unsigned int k = 0; | |
80 // Create output strings. Allocate maximum potential length. | |
81 char c[S->m + S->n + 1]; | |
82 char d[S->m + S->n + 1]; | |
83 | |
84 memset(c, '\0', sizeof(c)); | |
85 memset(d, '\0', sizeof(d)); | |
86 | |
87 // This wasn't finished by NLH. Not functioning correctly yet. | |
88 // It seems the purpose is to start the traceback from the place where the score reaches its | |
89 // maximum instead of the very end (set i and j to those coordinates). | |
90 if (local == true) { | |
91 unsigned int l, m; | |
92 double max = FLT_MIN; | |
93 | |
94 for (l = 0; l < S->m; l++) { | |
95 for (m = 0; m < S->n; m++) { | |
96 if (S->mat[l][m].score > max) { | |
97 i = l; | |
98 j = m; | |
99 max = S->mat[l][m].score; | |
100 } | |
101 } | |
102 } | |
103 } | |
104 | |
105 double score = DBL_MIN; | |
106 int matches = 0; | |
107 int start_a = 0; | |
108 int start_b = 0; | |
109 int end_a = 0; | |
110 int end_b = 0; | |
111 bool move_i = false; | |
112 bool move_j = false; | |
113 // Walk back through the matrix from the end, taking the path determined by the "prev" values of | |
114 // each cell. Assemble the sequence along the way. | |
115 if (S->mat[i][j].prev[0] != 0 && S->mat[i][j].prev[1] != 0) { | |
116 while (i > 0 || j > 0) { | |
117 unsigned int new_i = S->mat[i][j].prev[0]; | |
118 unsigned int new_j = S->mat[i][j].prev[1]; | |
119 | |
120 // If we've moved in the i axis, add the new base to the sequence. Otherwise, it's a gap. | |
121 if (new_i < i) { | |
122 *(c+k) = *(problem->a+i-1); | |
123 move_i = true; | |
124 } else { | |
125 *(c+k) = '-'; | |
126 move_i = false; | |
127 } | |
128 | |
129 // If we've moved in the j axis, add the new base to the sequence. Otherwise, it's a gap. | |
130 if (new_j < j) { | |
131 *(d+k) = *(problem->b+j-1); | |
132 move_j = true; | |
133 } else { | |
134 *(d+k) = '-'; | |
135 move_j = false; | |
136 } | |
137 | |
138 if (S->mat[i][j].score > score) { | |
139 score = S->mat[i][j].score; | |
140 } | |
141 | |
142 if (move_i && move_j) { | |
143 if (*(c+k) == *(d+k)) { | |
144 matches++; | |
145 } | |
146 // Start and end of each sequence are in the coordinates of the other sequence. | |
147 start_a = new_j + 1; | |
148 start_b = new_i + 1; | |
149 if (! end_a) { | |
150 end_a = new_j + 1; | |
151 } | |
152 if (! end_b) { | |
153 end_b = new_i + 1; | |
154 } | |
155 } | |
156 | |
157 k++; | |
158 | |
159 i = new_i; | |
160 j = new_j; | |
161 } | |
162 } | |
163 | |
164 seqs->a = malloc(sizeof(char) * k + 1); | |
165 seqs->b = malloc(sizeof(char) * k + 1); | |
166 | |
167 memset(seqs->a, '\0', sizeof(seqs->a)); | |
168 memset(seqs->b, '\0', sizeof(seqs->b)); | |
169 | |
170 reverse(c); | |
171 reverse(d); | |
172 | |
173 strcpy(seqs->a, c); | |
174 strcpy(seqs->b, d); | |
175 | |
176 seqs->alen = k; | |
177 seqs->blen = k; | |
178 | |
179 result->seqs = seqs; | |
180 result->score = score; | |
181 result->matches = matches; | |
182 result->start_a = start_a; | |
183 result->start_b = start_b; | |
184 result->end_a = end_a; | |
185 result->end_b = end_b; | |
186 | |
187 return result; | |
188 } | |
189 | |
190 static matrix_t *create_matrix(unsigned int m, unsigned int n) { | |
191 matrix_t *S = malloc(sizeof(matrix_t)); | |
192 unsigned int i; | |
193 | |
194 S->m = m; | |
195 S->n = n; | |
196 | |
197 S->mat = malloc(sizeof(entry_t) * m * n); | |
198 | |
199 for (i = 0; i < m; i++) { | |
200 S->mat[i] = malloc(sizeof(entry_t) * n); | |
201 } | |
202 | |
203 return S; | |
204 } | |
205 | |
206 void destroy_matrix(matrix_t *S) { | |
207 unsigned int i; | |
208 for (i = 0; i < S->m; i++) { | |
209 free(S->mat[i]); | |
210 } | |
211 free(S->mat); | |
212 free(S); | |
213 return; | |
214 } | |
215 | |
216 // Print a visual representation of the path through the matrix. | |
217 void print_matrix(matrix_t *matrix, seq_pair_t *seq_pair) { | |
218 int i, j; | |
219 for (i = 0; i < matrix->m; i++) { | |
220 if (i == 0) { | |
221 printf("\t\t"); | |
222 for (j = 0; j < seq_pair->blen; j++) { | |
223 printf("%c\t", seq_pair->b[j]); | |
224 } | |
225 printf("\n"); | |
226 printf(" "); | |
227 for (j = 0; j < matrix->n; j++) { | |
228 printf("%d\t", j); | |
229 } | |
230 printf("\n"); | |
231 } | |
232 if (i == 0) { | |
233 printf(" 0 "); | |
234 } else { | |
235 printf("%c %4d ", seq_pair->a[i-1], i); | |
236 } | |
237 for (j = 0; j < matrix->n; j++) { | |
238 printf("%d,%d|%0.0f\t", matrix->mat[i][j].prev[0], matrix->mat[i][j].prev[1], matrix->mat[i][j].score); | |
239 } | |
240 printf("\n"); | |
241 } | |
242 } | |
243 | |
244 void destroy_seq_pair(seq_pair_t *pair) { | |
245 free(pair->a); | |
246 free(pair->b); | |
247 free(pair); | |
248 return; | |
249 } | |
250 | |
251 align_t *smith_waterman(seq_pair_t *problem, bool local) { | |
252 unsigned int m = problem->alen + 1; | |
253 unsigned int n = problem->blen + 1; | |
254 matrix_t *S = create_matrix(m, n); | |
255 align_t *result; | |
256 unsigned int i, j, k, l; | |
257 | |
258 S->mat[0][0].score = 0; | |
259 S->mat[0][0].prev[0] = 0; | |
260 S->mat[0][0].prev[1] = 0; | |
261 | |
262 for (i = 1; i <= problem->alen; i++) { | |
263 S->mat[i][0].score = 0.0; | |
264 S->mat[i][0].prev[0] = i-1; | |
265 S->mat[i][0].prev[1] = 0; | |
266 } | |
267 | |
268 for (j = 1; j <= problem->blen; j++) { | |
269 S->mat[0][j].score = 0.0; | |
270 S->mat[0][j].prev[0] = 0; | |
271 S->mat[0][j].prev[1] = j-1; | |
272 } | |
273 | |
274 for (i = 1; i <= problem->alen; i++) { | |
275 for (j = 1; j <= problem->blen; j++) { | |
276 int nw_score = (strncmp(problem->a+(i-1), problem->b+(j-1), 1) == 0) ? MATCH : MISMATCH; | |
277 | |
278 S->mat[i][j].score = DBL_MIN; | |
279 S->mat[i][j].prev[0] = 0; | |
280 S->mat[i][j].prev[1] = 0; | |
281 | |
282 for (k = 0; k <= 1; k++) { | |
283 for (l = 0; l <= 1; l++) { | |
284 int val; | |
285 | |
286 if (k == 0 && l == 0) { | |
287 continue; | |
288 } else if (k > 0 && l > 0) { | |
289 val = nw_score; | |
290 } else if (k > 0 || l > 0) { | |
291 if ((i == problem->alen && k == 0) || | |
292 (j == problem->blen && l == 0)) | |
293 val = 0.0; | |
294 else | |
295 val = GAP; | |
296 } else { | |
297 // do nothing.. | |
298 } | |
299 | |
300 val += S->mat[i-k][j-l].score; | |
301 | |
302 if (val > S->mat[i][j].score) { | |
303 S->mat[i][j].score = val; | |
304 S->mat[i][j].prev[0] = i-k; | |
305 S->mat[i][j].prev[1] = j-l; | |
306 } | |
307 } | |
308 } | |
309 } | |
310 } | |
311 | |
312 result = traceback(problem, S, local); | |
313 | |
314 // print_matrix(S, problem); | |
315 | |
316 destroy_matrix(S); | |
317 | |
318 return result; | |
319 } | |
320 | |
321 void print_alignment(align_t *result, int target_len, int query_len) { | |
322 printf("Score: %0.0f Matches: %d\n", result->score, result->matches); | |
323 printf("Target: %3d %s %-3d\n", result->start_a, result->seqs->a, result->end_a); | |
324 printf("Query: %3d %s %-3d\n", result->start_b, result->seqs->b, result->end_b); | |
325 } | |
326 | |
327 int main(int argc, const char **argv) { | |
328 | |
329 if (argc != 3) { | |
330 printf("usage: swalign TARGET_SEQ QUERY_SEQ\n"); | |
331 exit(1); | |
332 } | |
333 | |
334 { | |
335 seq_pair_t problem; | |
336 align_t *result; | |
337 char c[strlen(argv[1])], d[strlen(argv[2])]; | |
338 | |
339 strcpy(c, argv[1]); | |
340 strcpy(d, argv[2]); | |
341 | |
342 problem.a = c; | |
343 problem.alen = strlen(problem.a); | |
344 problem.b = d; | |
345 problem.blen = strlen(problem.b); | |
346 | |
347 result = smith_waterman(&problem, false); | |
348 | |
349 print_alignment(result, problem.alen, problem.blen); | |
350 } | |
351 | |
352 exit(0); | |
353 } |