Mercurial > repos > nick > duplex
comparison seqtools.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:836fa4fe9494 | 18:e4d75f9efb90 |
|---|---|
| 1 #include <stdlib.h> | |
| 2 #include <string.h> | |
| 3 #include <ctype.h> | |
| 4 #include <math.h> | |
| 5 | |
| 6 // ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz | |
| 7 #define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz" | |
| 8 #define TRANS_OFFSET 65 | |
| 9 #define TRANS_LEN 57 | |
| 10 | |
| 11 char* get_revcomp(char *input); | |
| 12 char get_char_comp(char c); | |
| 13 int *get_diffs_simple(char *cons, char *seqs[], int n_seqs); | |
| 14 double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs); | |
| 15 double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins); | |
| 16 char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2); | |
| 17 char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2); | |
| 18 | |
| 19 | |
| 20 // Return the reverse complement of a sequence. | |
| 21 // Makes a new copy of the string, so the original is not modified. | |
| 22 char* get_revcomp(char *input) { | |
| 23 int length = strlen(input); | |
| 24 char *output = malloc(sizeof(char) * length + 1); | |
| 25 int i, j; | |
| 26 for (i = 0, j = length - 1; i < length && j >= 0; i++, j--) { | |
| 27 output[j] = get_char_comp(input[i]); | |
| 28 } | |
| 29 output[length] = '\0'; | |
| 30 return output; | |
| 31 } | |
| 32 | |
| 33 | |
| 34 // Return the complement of a base. | |
| 35 // Uses a simple lookup table: a string with the complements of all possible sequence characters. | |
| 36 char get_char_comp(char c) { | |
| 37 int i = c - TRANS_OFFSET; | |
| 38 if (i < 0 || i > TRANS_LEN) { | |
| 39 return c; | |
| 40 } else { | |
| 41 return TRANS[i]; | |
| 42 } | |
| 43 } | |
| 44 | |
| 45 | |
| 46 /* Take an existing alignment and consensus and compute the number of differences between each | |
| 47 * sequence and the consensus. | |
| 48 * Known bugs: | |
| 49 * 1. Counts no differences in the following sequences: | |
| 50 * consensus: GA---CA | |
| 51 * seq 1: GA----A | |
| 52 * seq 2: GA--ACA | |
| 53 * 2. If a sequence starts with a gap, each base in the gap will be counted as a diff. | |
| 54 */ | |
| 55 int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) { | |
| 56 int *diffs = malloc(sizeof(int) * n_seqs); | |
| 57 int i = 0; | |
| 58 // Uppercase the consensus. | |
| 59 while (cons[i] != 0) { | |
| 60 cons[i] = toupper(cons[i]); | |
| 61 i++; | |
| 62 } | |
| 63 // Loop through the sequences in the alignment. | |
| 64 for (i = 0; i < n_seqs; i++) { | |
| 65 int in_gap; | |
| 66 diffs[i] = 0; | |
| 67 int j = 0; | |
| 68 // Compare each base of the sequence to the consensus. | |
| 69 while (seqs[i][j] != 0 && cons[j] != 0) { | |
| 70 if (cons[j] != '-' && seqs[i][j] != '-') { | |
| 71 in_gap = 0; | |
| 72 } | |
| 73 if (toupper(seqs[i][j]) != cons[j]) { | |
| 74 if (!in_gap) { | |
| 75 diffs[i]++; | |
| 76 } | |
| 77 } | |
| 78 if (cons[j] == '-' || seqs[i][j] == '-') { | |
| 79 in_gap = 1; | |
| 80 } | |
| 81 j++; | |
| 82 } | |
| 83 } | |
| 84 return diffs; | |
| 85 } | |
| 86 | |
| 87 | |
| 88 // Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence | |
| 89 // lengths. | |
| 90 //TODO: Don't count gaps in sequence length. | |
| 91 double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) { | |
| 92 int *diffs = get_diffs_simple(cons, seqs, n_seqs); | |
| 93 double *fracs = malloc(sizeof(double) * n_seqs); | |
| 94 int i; | |
| 95 for (i = 0; i < n_seqs; i++) { | |
| 96 int j = 0; | |
| 97 while (seqs[i][j] != 0 && cons[j] != 0) { | |
| 98 j++; | |
| 99 } | |
| 100 fracs[i] = (double)diffs[i]/j; | |
| 101 } | |
| 102 return fracs; | |
| 103 } | |
| 104 | |
| 105 | |
| 106 /* Take an existing alignment and consensus and compute the number of differences between each | |
| 107 * sequence and the consensus. Break each sequence into bins and tally the differences in each bin. | |
| 108 * Known bugs: | |
| 109 * 1. counts no differences in the following sequences: | |
| 110 * consensus: GA---CA | |
| 111 * seq 1: GA----A | |
| 112 * seq 2: GA--ACA | |
| 113 * 2. If a bin starts with a gap, each base in the gap will be counted as a diff. | |
| 114 */ | |
| 115 int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { | |
| 116 int bin_size = (int)round((float)seq_len/bins); | |
| 117 // Initialize the diffs 2d array. | |
| 118 int **diffs = malloc(sizeof(int*) * n_seqs); | |
| 119 int i, j; | |
| 120 for (i = 0; i < n_seqs; i++) { | |
| 121 diffs[i] = malloc(bins * sizeof(int)); | |
| 122 for (j = 0; j < bins; j++) { | |
| 123 diffs[i][j] = 0; | |
| 124 } | |
| 125 } | |
| 126 // Uppercase the consensus. | |
| 127 while (cons[i] != 0) { | |
| 128 cons[i] = toupper(cons[i]); | |
| 129 i++; | |
| 130 } | |
| 131 int bin, in_gap; | |
| 132 // Loop through the sequences in the alignment. | |
| 133 for (i = 0; i < n_seqs; i++) { | |
| 134 j = 0; | |
| 135 // Compare each base of the sequence to the consensus. | |
| 136 while (seqs[i][j] != 0 && cons[j] != 0) { | |
| 137 bin = j/bin_size; | |
| 138 if (bin >= bins) { | |
| 139 break; | |
| 140 } | |
| 141 if (cons[j] != '-' && seqs[i][j] != '-') { | |
| 142 in_gap = 0; | |
| 143 } | |
| 144 if (toupper(seqs[i][j]) != cons[j]) { | |
| 145 if (!in_gap) { | |
| 146 diffs[i][bin]++; | |
| 147 } | |
| 148 } | |
| 149 if (cons[j] == '-' || seqs[i][j] == '-') { | |
| 150 in_gap = 1; | |
| 151 } | |
| 152 j++; | |
| 153 } | |
| 154 } | |
| 155 return diffs; | |
| 156 } | |
| 157 | |
| 158 | |
| 159 // Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin | |
| 160 // lengths. | |
| 161 //TODO: Don't count gaps in bin length. | |
| 162 double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { | |
| 163 int bin_size = (int)round((float)seq_len/bins); | |
| 164 int **diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins); | |
| 165 double **fracs = malloc(sizeof(double*) * n_seqs); | |
| 166 int i; | |
| 167 for (i = 0; i < n_seqs; i++) { | |
| 168 fracs[i] = malloc(sizeof(double) * bins); | |
| 169 // Create and init array of lengths of the bins. | |
| 170 int bin_lengths[bins]; | |
| 171 int bin; | |
| 172 for (bin = 0; bin < bins; bin++) { | |
| 173 bin_lengths[bin] = 0; | |
| 174 } | |
| 175 // Tally size of each bin. | |
| 176 int j = 0; | |
| 177 while (seqs[i][j] != 0 && cons[j] != 0) { | |
| 178 int bin = j/bin_size; | |
| 179 if (bin >= bins) { | |
| 180 break; | |
| 181 } | |
| 182 bin_lengths[bin]++; | |
| 183 j++; | |
| 184 } | |
| 185 // For each bin, calculate the diff fraction = diffs / bin_length. | |
| 186 for (bin = 0; bin < bins; bin++) { | |
| 187 fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin]; | |
| 188 // printf("bin %d: %d / %d = %f\t", bin, diffs[i][bin], bin_lengths[bin], fracs[i][bin]); | |
| 189 } | |
| 190 // printf("\n"); | |
| 191 } | |
| 192 return fracs; | |
| 193 } | |
| 194 | |
| 195 | |
| 196 // Take an input sequence and insert gaps according to another, already-aligned sequence with gaps. | |
| 197 // Input strings must be null-terminated. "gap_char1" is the character used for gaps in | |
| 198 // "gapped_seq", and "gap_char2" is the gap character in "inseq". | |
| 199 // N.B.: The ungapped length of "gapped_seq" must be equal to the length of "inseq". | |
| 200 char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) { | |
| 201 if (gap_char1 == 0) { | |
| 202 gap_char1 = '-'; | |
| 203 } | |
| 204 if (gap_char2 == 0) { | |
| 205 gap_char2 = '-'; | |
| 206 } | |
| 207 int gapped_len = strlen(gapped_seq); | |
| 208 char *outseq = malloc(sizeof(char) * gapped_len + 1); | |
| 209 | |
| 210 // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot | |
| 211 // (insert a gap there instead). | |
| 212 int g, o, i; | |
| 213 for (g = 0, o = 0, i = 0; g < gapped_len; g++, o++) { | |
| 214 if (gapped_seq[g] == gap_char1) { | |
| 215 outseq[o] = gap_char2; | |
| 216 } else { | |
| 217 outseq[o] = inseq[i]; | |
| 218 i++; | |
| 219 } | |
| 220 } | |
| 221 outseq[gapped_len] = '\0'; | |
| 222 | |
| 223 return outseq; | |
| 224 } | |
| 225 | |
| 226 | |
| 227 // Wrapper for transfer_gaps() when operating on a set of sequences at once. | |
| 228 char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, | |
| 229 char gap_char2) { | |
| 230 char **outseqs = malloc(sizeof(char *) * n_seqs); | |
| 231 int i; | |
| 232 for (i = 0; i < n_seqs; i++) { | |
| 233 outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2); | |
| 234 } | |
| 235 return outseqs; | |
| 236 } |
