comparison seqtools.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #include <stdlib.h>
2 #include <string.h>
3 #include <ctype.h>
4 #include <math.h>
5
6 // ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
7 #define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz"
8 #define TRANS_OFFSET 65
9 #define TRANS_LEN 57
10
11 char* get_revcomp(char *input);
12 char get_char_comp(char c);
13 int *get_diffs_simple(char *cons, char *seqs[], int n_seqs);
14 double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs);
15 double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins);
16 char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2);
17 char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2);
18
19
20 // Return the reverse complement of a sequence.
21 // Makes a new copy of the string, so the original is not modified.
22 char* get_revcomp(char *input) {
23 int length = strlen(input);
24 char *output = malloc(sizeof(char) * length + 1);
25 int i, j;
26 for (i = 0, j = length - 1; i < length && j >= 0; i++, j--) {
27 output[j] = get_char_comp(input[i]);
28 }
29 output[length] = '\0';
30 return output;
31 }
32
33
34 // Return the complement of a base.
35 // Uses a simple lookup table: a string with the complements of all possible sequence characters.
36 char get_char_comp(char c) {
37 int i = c - TRANS_OFFSET;
38 if (i < 0 || i > TRANS_LEN) {
39 return c;
40 } else {
41 return TRANS[i];
42 }
43 }
44
45
46 /* Take an existing alignment and consensus and compute the number of differences between each
47 * sequence and the consensus.
48 * Known bugs:
49 * 1. Counts no differences in the following sequences:
50 * consensus: GA---CA
51 * seq 1: GA----A
52 * seq 2: GA--ACA
53 * 2. If a sequence starts with a gap, each base in the gap will be counted as a diff.
54 */
55 int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) {
56 int *diffs = malloc(sizeof(int) * n_seqs);
57 int i = 0;
58 // Uppercase the consensus.
59 while (cons[i] != 0) {
60 cons[i] = toupper(cons[i]);
61 i++;
62 }
63 // Loop through the sequences in the alignment.
64 for (i = 0; i < n_seqs; i++) {
65 int in_gap;
66 diffs[i] = 0;
67 int j = 0;
68 // Compare each base of the sequence to the consensus.
69 while (seqs[i][j] != 0 && cons[j] != 0) {
70 if (cons[j] != '-' && seqs[i][j] != '-') {
71 in_gap = 0;
72 }
73 if (toupper(seqs[i][j]) != cons[j]) {
74 if (!in_gap) {
75 diffs[i]++;
76 }
77 }
78 if (cons[j] == '-' || seqs[i][j] == '-') {
79 in_gap = 1;
80 }
81 j++;
82 }
83 }
84 return diffs;
85 }
86
87
88 // Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence
89 // lengths.
90 //TODO: Don't count gaps in sequence length.
91 double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) {
92 int *diffs = get_diffs_simple(cons, seqs, n_seqs);
93 double *fracs = malloc(sizeof(double) * n_seqs);
94 int i;
95 for (i = 0; i < n_seqs; i++) {
96 int j = 0;
97 while (seqs[i][j] != 0 && cons[j] != 0) {
98 j++;
99 }
100 fracs[i] = (double)diffs[i]/j;
101 }
102 return fracs;
103 }
104
105
106 /* Take an existing alignment and consensus and compute the number of differences between each
107 * sequence and the consensus. Break each sequence into bins and tally the differences in each bin.
108 * Known bugs:
109 * 1. counts no differences in the following sequences:
110 * consensus: GA---CA
111 * seq 1: GA----A
112 * seq 2: GA--ACA
113 * 2. If a bin starts with a gap, each base in the gap will be counted as a diff.
114 */
115 int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
116 int bin_size = (int)round((float)seq_len/bins);
117 // Initialize the diffs 2d array.
118 int **diffs = malloc(sizeof(int*) * n_seqs);
119 int i, j;
120 for (i = 0; i < n_seqs; i++) {
121 diffs[i] = malloc(bins * sizeof(int));
122 for (j = 0; j < bins; j++) {
123 diffs[i][j] = 0;
124 }
125 }
126 // Uppercase the consensus.
127 while (cons[i] != 0) {
128 cons[i] = toupper(cons[i]);
129 i++;
130 }
131 int bin, in_gap;
132 // Loop through the sequences in the alignment.
133 for (i = 0; i < n_seqs; i++) {
134 j = 0;
135 // Compare each base of the sequence to the consensus.
136 while (seqs[i][j] != 0 && cons[j] != 0) {
137 bin = j/bin_size;
138 if (bin >= bins) {
139 break;
140 }
141 if (cons[j] != '-' && seqs[i][j] != '-') {
142 in_gap = 0;
143 }
144 if (toupper(seqs[i][j]) != cons[j]) {
145 if (!in_gap) {
146 diffs[i][bin]++;
147 }
148 }
149 if (cons[j] == '-' || seqs[i][j] == '-') {
150 in_gap = 1;
151 }
152 j++;
153 }
154 }
155 return diffs;
156 }
157
158
159 // Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin
160 // lengths.
161 //TODO: Don't count gaps in bin length.
162 double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
163 int bin_size = (int)round((float)seq_len/bins);
164 int **diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins);
165 double **fracs = malloc(sizeof(double*) * n_seqs);
166 int i;
167 for (i = 0; i < n_seqs; i++) {
168 fracs[i] = malloc(sizeof(double) * bins);
169 // Create and init array of lengths of the bins.
170 int bin_lengths[bins];
171 int bin;
172 for (bin = 0; bin < bins; bin++) {
173 bin_lengths[bin] = 0;
174 }
175 // Tally size of each bin.
176 int j = 0;
177 while (seqs[i][j] != 0 && cons[j] != 0) {
178 int bin = j/bin_size;
179 if (bin >= bins) {
180 break;
181 }
182 bin_lengths[bin]++;
183 j++;
184 }
185 // For each bin, calculate the diff fraction = diffs / bin_length.
186 for (bin = 0; bin < bins; bin++) {
187 fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin];
188 // printf("bin %d: %d / %d = %f\t", bin, diffs[i][bin], bin_lengths[bin], fracs[i][bin]);
189 }
190 // printf("\n");
191 }
192 return fracs;
193 }
194
195
196 // Take an input sequence and insert gaps according to another, already-aligned sequence with gaps.
197 // Input strings must be null-terminated. "gap_char1" is the character used for gaps in
198 // "gapped_seq", and "gap_char2" is the gap character in "inseq".
199 // N.B.: The ungapped length of "gapped_seq" must be equal to the length of "inseq".
200 char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) {
201 if (gap_char1 == 0) {
202 gap_char1 = '-';
203 }
204 if (gap_char2 == 0) {
205 gap_char2 = '-';
206 }
207 int gapped_len = strlen(gapped_seq);
208 char *outseq = malloc(sizeof(char) * gapped_len + 1);
209
210 // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot
211 // (insert a gap there instead).
212 int g, o, i;
213 for (g = 0, o = 0, i = 0; g < gapped_len; g++, o++) {
214 if (gapped_seq[g] == gap_char1) {
215 outseq[o] = gap_char2;
216 } else {
217 outseq[o] = inseq[i];
218 i++;
219 }
220 }
221 outseq[gapped_len] = '\0';
222
223 return outseq;
224 }
225
226
227 // Wrapper for transfer_gaps() when operating on a set of sequences at once.
228 char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1,
229 char gap_char2) {
230 char **outseqs = malloc(sizeof(char *) * n_seqs);
231 int i;
232 for (i = 0; i < n_seqs; i++) {
233 outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2);
234 }
235 return outseqs;
236 }