comparison align.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #define NAIVE_TEST_WINDOW 6
6 #define NAIVE_TEST_THRES 0.80
7 #define NAIVE_TEST_MIN 2
8 #define NAIVE_WINDOW 10
9 #define NAIVE_THRES 0.80
10
11 typedef struct Gap {
12 int seq;
13 int coord;
14 int length;
15 struct Gap *next;
16 } Gap;
17
18 typedef struct Gaps {
19 int length;
20 struct Gap *root;
21 struct Gap *tip;
22 } Gaps;
23
24 int _test_match(char *seq1, int start1, char *seq2, int start2);
25 void add_gap(Gaps *gaps, int seq, int coord, int length);
26 Gaps *make_gaps();
27 char *insert_gaps(Gaps *gaps, char *seq, int seq_num);
28
29
30 // A naive algorithm for aligning two sequences which are expected to be very similar to each other
31 // and already nearly aligned.
32 void naive2(char *seq1, char *seq2) {
33 Gaps *gaps = make_gaps();
34 int i = 0;
35 int j = 0;
36 int matches = 0;
37 while (seq1[i] != 0 && seq2[j] != 0) {
38 // Match?
39 printf("%c %c | i %d j %d\n", seq1[i], seq2[j], i, j);
40 if (seq1[i] == seq2[j]) {
41 matches++;
42 i++;
43 j++;
44 continue;
45 }
46 printf("mismatch!\n");
47 // Mismatch. Start adding gaps until the mismatches go away.
48 int new_i = i;
49 int new_j = j;
50 int gap_seq = 0;
51 int success;
52 while (1) {
53 if (seq1[new_i] == 0 && seq2[new_j] == 0) {
54 break;
55 }
56 success = _test_match(seq1, new_i, seq2, j);
57 if (success) {
58 gap_seq = 2;
59 break;
60 }
61 if (seq1[new_i] != 0) {
62 new_i++;
63 }
64 success = _test_match(seq1, i, seq2, new_j);
65 if (success) {
66 gap_seq = 1;
67 break;
68 }
69 if (seq2[new_j] != 0) {
70 new_j++;
71 }
72 }
73 // Which sequence are we putting the gap in?
74 if (gap_seq == 0) {
75 printf("No good gap found. new_i: %d, new_j: %d\n", new_i, new_j);
76 // No good gap found.
77 } else if (i == new_i && j == new_j) {
78 printf("No gap required.\n");
79 } else if (gap_seq == 1) {
80 printf("%dbp gap in seq1 at base %d.\n", new_j-j, j);
81 add_gap(gaps, 1, j, new_j-j);
82 j = new_j;
83 } else if (gap_seq == 2) {
84 printf("%dbp gap in seq2 at base %d.\n", new_i-i, i);
85 add_gap(gaps, 2, i, new_i-i);
86 i = new_i;
87 }
88 i++;
89 j++;
90 }
91
92 char *new_seq1 = insert_gaps(gaps, seq1, 1);
93 char *new_seq2 = insert_gaps(gaps, seq2, 2);
94 printf("alignment:\n%s\n%s\n", new_seq1, new_seq2);
95 }
96
97 // Check if the few bases starting at start1 and start2 in seq1 and seq2, respectively, align with
98 // few mismatches. The number of bases checked is NAIVE_TEST_WINDOW, and they must have a match
99 // percentage greater than NAIVE_TEST_THRES. Also, the amount of sequence left to compare must be
100 // more than NAIVE_TEST_MIN.
101 int _test_match(char *seq1, int start1, char *seq2, int start2) {
102 int matches = 0;
103 int total = 0;
104 char base1, base2;
105 int i;
106 for (i = 0; i < NAIVE_TEST_WINDOW-1; i++) {
107 base1 = seq1[start1+i];
108 base2 = seq2[start2+i];
109 if (base1 == 0 || base2 == 0) {
110 break;
111 }
112 if (base1 == base2) {
113 matches++;
114 }
115 total++;
116 }
117 return total > NAIVE_TEST_MIN && (double)matches/total > NAIVE_TEST_THRES;
118 }
119
120 Gaps *make_gaps() {
121 Gaps *gaps = malloc(sizeof(Gaps));
122 gaps->root = 0;
123 gaps->tip = 0;
124 gaps->length = 0;
125 return gaps;
126 }
127
128 void add_gap(Gaps *gaps, int seq, int coord, int length) {
129 Gap *gap = malloc(sizeof(Gap));
130 gap->next = 0;
131 gap->seq = seq;
132 gap->coord = coord;
133 gap->length = length;
134 if (gaps->root == 0) {
135 gaps->root = gap;
136 } else {
137 gaps->tip->next = gap;
138 }
139 gaps->tip = gap;
140 gaps->length++;
141 }
142
143 // Take gap information from the aligner and put them into the sequence string as "-" characters.
144 char *insert_gaps(Gaps *gaps, char *seq, int seq_num) {
145 if (gaps->root == 0) {
146 return seq;
147 }
148
149 // How long should the new sequence be?
150 int extra_len = 0;
151 Gap *gap = gaps->root;
152 while (gap) {
153 if (gap->seq == seq_num) {
154 extra_len += gap->length;
155 }
156 gap = gap->next;
157 }
158
159 //TODO: Handle a situation with no gaps.
160 int new_len = extra_len + strlen(seq) + 1;
161 char *new_seq = malloc(sizeof(char) * new_len);
162 int i = 0;
163 int j = 0;
164 gap = gaps->root;
165 while (gap) {
166 // Check that it's a gap in our sequence.
167 if (gap->seq != seq_num) {
168 gap = gap->next;
169 continue;
170 }
171 // Copy verbatim all the sequence until the gap.
172 while (i <= gap->coord) {
173 new_seq[j] = seq[i];
174 i++;
175 j++;
176 }
177 // Add -'s the whole length of the gap.
178 while (j < gap->coord + gap->length + 1) {
179 new_seq[j] = '-';
180 j++;
181 }
182 gap = gap->next;
183 }
184 // Fill in the end sequence.
185 while (seq[i]) {
186 new_seq[j] = seq[i];
187 i++;
188 j++;
189 }
190 new_seq[new_len-1] = 0;
191 return new_seq;
192 }