Mercurial > repos > nick > duplex
comparison align.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #include <stdio.h> | |
2 #include <stdlib.h> | |
3 #include <string.h> | |
4 | |
5 #define NAIVE_TEST_WINDOW 6 | |
6 #define NAIVE_TEST_THRES 0.80 | |
7 #define NAIVE_TEST_MIN 2 | |
8 #define NAIVE_WINDOW 10 | |
9 #define NAIVE_THRES 0.80 | |
10 | |
11 typedef struct Gap { | |
12 int seq; | |
13 int coord; | |
14 int length; | |
15 struct Gap *next; | |
16 } Gap; | |
17 | |
18 typedef struct Gaps { | |
19 int length; | |
20 struct Gap *root; | |
21 struct Gap *tip; | |
22 } Gaps; | |
23 | |
24 int _test_match(char *seq1, int start1, char *seq2, int start2); | |
25 void add_gap(Gaps *gaps, int seq, int coord, int length); | |
26 Gaps *make_gaps(); | |
27 char *insert_gaps(Gaps *gaps, char *seq, int seq_num); | |
28 | |
29 | |
30 // A naive algorithm for aligning two sequences which are expected to be very similar to each other | |
31 // and already nearly aligned. | |
32 void naive2(char *seq1, char *seq2) { | |
33 Gaps *gaps = make_gaps(); | |
34 int i = 0; | |
35 int j = 0; | |
36 int matches = 0; | |
37 while (seq1[i] != 0 && seq2[j] != 0) { | |
38 // Match? | |
39 printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j); | |
40 if (seq1[i] == seq2[j]) { | |
41 matches++; | |
42 i++; | |
43 j++; | |
44 continue; | |
45 } | |
46 printf("mismatch!\n"); | |
47 // Mismatch. Start adding gaps until the mismatches go away. | |
48 int new_i = i; | |
49 int new_j = j; | |
50 int gap_seq = 0; | |
51 int success; | |
52 while (1) { | |
53 if (seq1[new_i] == 0 && seq2[new_j] == 0) { | |
54 break; | |
55 } | |
56 success = _test_match(seq1, new_i, seq2, j); | |
57 if (success) { | |
58 gap_seq = 2; | |
59 break; | |
60 } | |
61 if (seq1[new_i] != 0) { | |
62 new_i++; | |
63 } | |
64 success = _test_match(seq1, i, seq2, new_j); | |
65 if (success) { | |
66 gap_seq = 1; | |
67 break; | |
68 } | |
69 if (seq2[new_j] != 0) { | |
70 new_j++; | |
71 } | |
72 } | |
73 // Which sequence are we putting the gap in? | |
74 if (gap_seq == 0) { | |
75 printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j); | |
76 // No good gap found. | |
77 } else if (i == new_i && j == new_j) { | |
78 printf("No gap required.\n"); | |
79 } else if (gap_seq == 1) { | |
80 printf("%dbp gap in seq1 at base %d.\n", new_j-j, j); | |
81 add_gap(gaps, 1, j, new_j-j); | |
82 j = new_j; | |
83 } else if (gap_seq == 2) { | |
84 printf("%dbp gap in seq2 at base %d.\n", new_i-i, i); | |
85 add_gap(gaps, 2, i, new_i-i); | |
86 i = new_i; | |
87 } | |
88 i++; | |
89 j++; | |
90 } | |
91 | |
92 char *new_seq1 = insert_gaps(gaps, seq1, 1); | |
93 char *new_seq2 = insert_gaps(gaps, seq2, 2); | |
94 printf("alignment:\n%s\n%s\n", new_seq1, new_seq2); | |
95 } | |
96 | |
97 // Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with | |
98 // few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match | |
99 // percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be | |
100 // more than NAIVE_TEST_MIN. | |
101 int _test_match(char *seq1, int start1, char *seq2, int start2) { | |
102 int matches = 0; | |
103 int total = 0; | |
104 char base1, base2; | |
105 int i; | |
106 for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) { | |
107 base1 = seq1[start1+i]; | |
108 base2 = seq2[start2+i]; | |
109 if (base1 == 0 || base2 == 0) { | |
110 break; | |
111 } | |
112 if (base1 == base2) { | |
113 matches++; | |
114 } | |
115 total++; | |
116 } | |
117 return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES; | |
118 } | |
119 | |
120 Gaps *make_gaps() { | |
121 Gaps *gaps = malloc(sizeof(Gaps)); | |
122 gaps->root = 0; | |
123 gaps->tip = 0; | |
124 gaps->length = 0; | |
125 return gaps; | |
126 } | |
127 | |
128 void add_gap(Gaps *gaps, int seq, int coord, int length) { | |
129 Gap *gap = malloc(sizeof(Gap)); | |
130 gap->next = 0; | |
131 gap->seq = seq; | |
132 gap->coord = coord; | |
133 gap->length = length; | |
134 if (gaps->root == 0) { | |
135 gaps->root = gap; | |
136 } else { | |
137 gaps->tip->next = gap; | |
138 } | |
139 gaps->tip = gap; | |
140 gaps->length++; | |
141 } | |
142 | |
143 // Take gap information from the aligner and put them into the sequence string as "-" characters. | |
144 char *insert_gaps(Gaps *gaps, char *seq, int seq_num) { | |
145 if (gaps->root == 0) { | |
146 return seq; | |
147 } | |
148 | |
149 // How long should the new sequence be? | |
150 int extra_len = 0; | |
151 Gap *gap = gaps->root; | |
152 while (gap) { | |
153 if (gap->seq == seq_num) { | |
154 extra_len += gap->length; | |
155 } | |
156 gap = gap->next; | |
157 } | |
158 | |
159 //TODO: Handle a situation with no gaps. | |
160 int new_len = extra_len + strlen(seq) + 1; | |
161 char *new_seq = malloc(sizeof(char) * new_len); | |
162 int i = 0; | |
163 int j = 0; | |
164 gap = gaps->root; | |
165 while (gap) { | |
166 // Check that it's a gap in our sequence. | |
167 if (gap->seq != seq_num) { | |
168 gap = gap->next; | |
169 continue; | |
170 } | |
171 // Copy verbatim all the sequence until the gap. | |
172 while (i <= gap->coord) { | |
173 new_seq[j] = seq[i]; | |
174 i++; | |
175 j++; | |
176 } | |
177 // Add -'s the whole length of the gap. | |
178 while (j < gap->coord + gap->length + 1) { | |
179 new_seq[j] = '-'; | |
180 j++; | |
181 } | |
182 gap = gap->next; | |
183 } | |
184 // Fill in the end sequence. | |
185 while (seq[i]) { | |
186 new_seq[j] = seq[i]; | |
187 i++; | |
188 j++; | |
189 } | |
190 new_seq[new_len-1] = 0; | |
191 return new_seq; | |
192 } |