Mercurial > repos > nick > duplex
diff consensus.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.c Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,614 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <limits.h> + +// N.B. This defines the valid bases, but it's also effectively defined in the switches in +// get_votes_simple(), get_votes_qual(), and get_base_prime(), and in the constant IUPAC_BASES. +#define N_BASES 6 +const char *BASES = "ACGTN-"; +/* A C G T N - A: 2 Compute IUPAC ambiguous base character by representing each base +A 4 6 10 14 22 26 C: 3 with a prime and multiplying. Then use a lookup table (an array +C 9 15 21 33 39 G: 5 where the index is the product of the two primes). +G 25 35 55 65 T: 7 +T 49 77 91 N: 11 +N 121 143 -: 13 1 2 3 4 5 6 7 +- 169 01234567890123456789012345678901234567890123456789012345678901234567890*/ +const char *IUPAC_BASES = "N...A.M..CR...WS.....YN..GN......N.K...N.........T.....N.........N....." +// 8 9 10 11 12 13 14 + "......N.............N.............................N..................." +// 15 16 17 + "..N.........................-"; +#define THRES_DEFAULT 0.5 +#define WIN_LEN 4 +#define GAP_CHAR ' ' + +int **get_votes_simple(char *align[], int n_seqs, int seq_len); +int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres); +int init_gap_qual_window(int *window, char *quals, int seq_len); +char get_gap_qual(int *window); +int push_qual(int *window, int win_edge, char *quals, int seq_len); +void print_window(int *window, int win_edge); +int **init_votes(int seq_len); +void free_votes(int *votes[], int seq_len); +void print_votes(char *consensus, int *votes[], int seq_len); +char *rm_gaps(char *consensus, int cons_len); +char *build_consensus(int *votes[], int seq_len, double thres); +char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres); +char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped); +int get_base_prime(char base); +char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double thres, + char qual_thres, int gapped); +char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[], + int n_seqs1, int n_seqs2, int seq_len, double cons_thres, + char qual_thres, int gapped, char *method); + + +// Tally the different bases at each position in an alignment. +// Returns an array of arrays: for each position in the alignment, an array of the number of times +// each base occurs at that position. The order of bases is as in the "BASES" constant. +int **get_votes_simple(char *align[], int n_seqs, int seq_len) { + int **votes = init_votes(seq_len); + + // Tally votes for each base. + int i, j; + for (i = 0; i < n_seqs; i++) { + for (j = 0; j < seq_len; j++) { + // N.B.: Could write this without hardcoded literals, but it's about 40% slower. + switch (toupper(align[i][j])) { + case 'A': + votes[j][0]++; + break; + case 'C': + votes[j][1]++; + break; + case 'G': + votes[j][2]++; + break; + case 'T': + votes[j][3]++; + break; + case 'N': + votes[j][4]++; + break; + case '-': + votes[j][5]++; + break; + } + } + } + + return votes; +} + + +// Tally votes for each base, ignoring bases with a quality score below "thres". +int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres) { + int **votes = init_votes(seq_len); + int *window = malloc(sizeof(int) * WIN_LEN * 2); + int win_edge; + + // Tally votes for each base. + char qual; + int i, j; + for (i = 0; i < n_seqs; i++) { + win_edge = init_gap_qual_window(window, quals[i], seq_len); + for (j = 0; j < seq_len; j++) { + // Figure out the quality score of the base (or gap). + if (align[i][j] == '-') { + qual = get_gap_qual(window); + } else { + win_edge = push_qual(window, win_edge, quals[i], seq_len); + qual = quals[i][j]; + } + // Don't count bases whose quality is less than the threshold. + if (qual < thres) { + continue; + } + // N.B.: Could write this without hardcoded literals, but it's about 40% slower. + switch (toupper(align[i][j])) { + case 'A': + votes[j][0]++; + break; + case 'C': + votes[j][1]++; + break; + case 'G': + votes[j][2]++; + break; + case 'T': + votes[j][3]++; + break; + case 'N': + votes[j][4]++; + break; + case '-': + votes[j][5]++; + break; + } + } + } + + free(window); + return votes; +} + + +/* Tally votes for each base, weighting by the PHRED score of the base. + * This is based on the theory of PHRED scores representing the literal probability of the base call + * being erroneous. Thus, if two reads show a C at a position, both with PHRED 20 (1/100 chance of + * error), then the chances of them both being wrong are 1/100 * 1/100 = 1/10000 (PHRED 40). + * So, theoretically and intuitively, it makes sense to trust two PHRED 20 C's over one PHRED 30 A. + * This seems better than arbitrarily deciding not to consider bases below a PHRED score threshold. + * How to decide when not to call the base? We could just say that if no base's score total is above + * a certain threshold, we call it an N. Theoretically, this threshold is the confidence we want in + * our final base calls. This could even replace the arbitrary 3 reads for a consensus threshold. + */ +int **get_votes_weighted(char *align[], char *quals[], int n_seqs, int seq_len) { + int **votes = init_votes(seq_len); + int *window = malloc(sizeof(int) * WIN_LEN * 2); + int win_edge; + + // Tally votes for each base. + char qual; + int i, j; + for (i = 0; i < n_seqs; i++) { + win_edge = init_gap_qual_window(window, quals[i], seq_len); + for (j = 0; j < seq_len; j++) { + // Figure out the quality score of the base (or gap). + if (align[i][j] == '-') { + qual = get_gap_qual(window); + } else { + win_edge = push_qual(window, win_edge, quals[i], seq_len); + qual = quals[i][j]; + } + // N.B.: Could write this without hardcoded literals, but it's about 40% slower. + switch (toupper(align[i][j])) { + case 'A': + votes[j][0] += qual; + break; + case 'C': + votes[j][1] += qual; + break; + case 'G': + votes[j][2] += qual; + break; + case 'T': + votes[j][3] += qual; + break; + case 'N': + votes[j][4] += qual; + break; + case '-': + votes[j][5] += qual; + break; + } + } + } + + free(window); + return votes; +} + + +/* Calculation of gap quality scores: + * "window" is an array of length 2*WIN_LEN, holding the quality scores of the non-gap bases WIN_LEN + * from the current one in both directions. When we're at the start or end, fill the slots beyond + * the edge of the sequence with -1 as a sentinel for "N/A". For example, after the 2nd base in the + * sequence, if WIN_LEN is 4 and there are no gaps yet, "window" should look something like this: + * base coordinates 0 1 2 3 4 5 + * quality scores = 9 A > E B + * array values [-1|-1|28|24||32|29|36|33] + * Usage: + * Allocate "window" and call init_gap_qual_window() to initialize it by filling its right side with + * the first WIN_LEN non-gap quality scores in the sequence. It will also return "win_edge", which + * tracks the coordinate of the rightmost quality score in "window". + * Then, start traversing the array of quality scores. For each score, if it's a gap character, call + * get_gap_qual() to get the computed quality score at the gap. Otherwise, call push_qual() to add + * another quality score to the end of the window. This will keep the window filled with all non-gap + * quality scores, and each time you call get_gap_qual(), you should be at the gap between the two + * quality scores at the center of the window. + */ + +// This does the initial fill of the window array, adding the first WIN_LEN bases to the right side +// and filling the left side with -1's. +int init_gap_qual_window(int *window, char *quals, int seq_len) { + // Fill left side with -1's (no quality information). + int i; + for (i = 0; i < WIN_LEN; i++) { + window[i] = -1; + } + // Fill right side with first WIN_LEN quality scores. Skip gaps, and if you run out of quality + // scores (if seq_len < WIN_LEN), fill the rest with -1. Leave win_edge at the last base we added. + i = WIN_LEN; + int win_edge = -1; + int quals_added = 0; + while (quals_added < WIN_LEN) { + win_edge++; + if (win_edge >= seq_len) { + win_edge = seq_len; + window[i] = -1; + i++; + quals_added++; + } else if (quals[win_edge] != GAP_CHAR) { + window[i] = quals[win_edge]; + i++; + quals_added++; + } + } + return win_edge; +} + + +// Push the next non-gap quality score onto the right side of the window. +int push_qual(int *window, int win_edge, char *quals, int seq_len) { + // Find the next quality score that's not a gap. + char next_qual = GAP_CHAR; + while (next_qual == GAP_CHAR) { + win_edge++; + if (win_edge < seq_len) { + next_qual = quals[win_edge]; + } else { + win_edge = seq_len; + next_qual = -1; + } + } + // Shift all the quality scores left add the new one. + int last_qual; + int i; + for (i = WIN_LEN*2 - 1; i >= 0; i--) { + last_qual = window[i]; + window[i] = next_qual; + next_qual = last_qual; + } + return win_edge; +} + + +// Compute the quality of the gap based on a weighted average of the quality scores in the window. +// The scores near the center of the window are weighted higher than the ones further away. +char get_gap_qual(int *window) { + int score_sum = 0; + int weight_sum = 0; + int weight = 1; + int i; + for (i = 0; i < WIN_LEN*2; i++) { + if (window[i] != -1) { + score_sum += window[i] * weight; + weight_sum += weight; + } + // Increase the weight until we get to the middle of the window (at WIN_LEN), then decrease it. + if (i < WIN_LEN - 1) { + weight++; + } else if (i > WIN_LEN - 1) { + weight--; + } + } + if (weight_sum > 0) { + // Divide by the sum of the weights to get the final quality score. + return (char) (score_sum/weight_sum); + } else { + return '\0'; + } +} + + +void print_window(int *window, int win_edge) { + printf("["); + int i; + for (i = 0; i < WIN_LEN*2; i++) { + printf("%c", window[i]); + if (i == WIN_LEN - 1) { + printf("||"); + } else if (i == WIN_LEN*2 - 1) { + printf("] %-2d\n", win_edge); + } else { + printf("|"); + } + } +} + + +int **init_votes(int seq_len) { + int **votes = malloc(sizeof(int *) * seq_len); + int i, j; + for (i = 0; i < seq_len; i++) { + votes[i] = malloc(sizeof(int) * N_BASES); + for (j = 0; j < N_BASES; j++) { + votes[i][j] = 0; + } + } + return votes; +} + + +void free_votes(int *votes[], int seq_len) { + int i; + for (i = 0; i < seq_len; i++) { + free(votes[i]); + } + free(votes); +} + + +void print_votes(char *consensus, int *votes[], int seq_len) { + int i, j; + printf(" "); + for (j = 0; j < N_BASES; j++) { + printf(" %c ", BASES[j]); + } + printf("\n"); + for (i = 0; i < seq_len; i++) { + printf("%c: ", consensus[i]); + for (j = 0; j < N_BASES; j++) { + if (votes[i][j]) { + printf("%2d ", votes[i][j]); + } else { + printf(" "); + } + } + printf("\n"); + } +} + + +// Take a consensus sequence which may have gaps ('-' characters) and remove them to produce the +// actual final sequence. "cons_len" should be the length of the original, gapped, sequence. +char *rm_gaps(char *consensus, int cons_len) { + char *output = malloc(sizeof(char) * cons_len + 1); + int i; + int j = 0; + for (i = 0; i < cons_len; i++) { + if (consensus[i] != '-') { + output[j] = consensus[i]; + j++; + } + } + output[j] = '\0'; + return output; +} + + +char *build_consensus(int *votes[], int seq_len, double thres) { + char *consensus = malloc(sizeof(char) * seq_len + 1); + + int i, j; + for (i = 0; i < seq_len; i++) { + int total = 0; + int max_vote = 0; + char max_base = 'N'; + for (j = 0; j < N_BASES; j++) { + total += votes[i][j]; + if (votes[i][j] > max_vote) { + max_vote = votes[i][j]; + max_base = BASES[j]; + } + if (total == 0) { + consensus[i] = 'N'; + } else if ((double)max_vote/total > thres) { + consensus[i] = max_base; + } else { + consensus[i] = 'N'; + } + } + } + + consensus[seq_len] = '\0'; + return consensus; +} + + +// Build a consensus sequence from two alignments by weighting each equally and considering only +// the frequency of each base in each alignment. +char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres) { + char *consensus = malloc(sizeof(char) * seq_len + 1); + + int i, j; + for (i = 0; i < seq_len; i++) { + // Sum the total votes at this position. + /*TODO: This does an extra loop through the votes to get the total so it can calculate actual + * frequencies in the second pass. Technically, this information could be gathered when + * originally tallying the votes in the get_votes functions. Or, the total could be assumed + * to be n_seqs if every base always contributes a vote (even when it's not in "ACGTN-"). + */ + int total1 = 0; + for (j = 0; j < N_BASES; j++) { + total1 += votes1[i][j]; + } + int total2 = 0; + for (j = 0; j < N_BASES; j++) { + total2 += votes2[i][j]; + } + double max_freq = 0.0; + char max_base = 'N'; + for (j = 0; j < N_BASES; j++) { + // Get the frequency of each base. + double freq1; + if (total1 > 0) { + freq1 = (double)votes1[i][j]/total1; + } + double freq2; + if (total2 > 0) { + freq2 = (double)votes2[i][j]/total2; + } + // frequency of the base = average of frequencies in the two sequences + double avg_freq; + if (total1 == 0 && total2 == 0) { + avg_freq = -1.0; + } else if (total1 == 0) { + avg_freq = freq2; + } else if (total2 == 0) { + avg_freq = freq1; + } else { + avg_freq = (freq1 + freq2) / 2; + } + // Track the highest frequency seen. + if (avg_freq > max_freq) { + max_freq = avg_freq; + max_base = BASES[j]; + } + } + if (max_freq > thres) { + consensus[i] = max_base; + } else { + consensus[i] = 'N'; + } + } + + consensus[seq_len] = '\0'; + return consensus; +} + + +// "cons1" and "cons2" must be null-terminated strings of equal lengths. +char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped) { + int seq_len = strlen(cons1); + char *cons = malloc(sizeof(char) * seq_len + 1); + int i = 0; + int base_prime1, base_prime2; + while (cons1[i] != '\0' && cons2[i] != '\0') { + base_prime1 = get_base_prime(cons1[i]); + base_prime2 = get_base_prime(cons2[i]); + cons[i] = IUPAC_BASES[base_prime1*base_prime2]; + i++; + } + cons[seq_len] = '\0'; + if (gapped) { + return cons; + } else { + return rm_gaps(cons, seq_len); + } +} + + +int get_base_prime(char base) { + switch (base) { + case 'A': + return 2; + case 'C': + return 3; + case 'G': + return 5; + case 'T': + return 7; + case 'N': + return 11; + case '-': + return 13; + default: + return 0; + } +} + + +// Convenience function to create a consensus in one step. +// Give 0 as "quals" to not use quality scores, and -1.0 as "cons_thres" to use the default +// consensus threshold when evaluating base votes. +char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double cons_thres, + char qual_thres, int gapped) { + if (cons_thres == -1.0) { + cons_thres = THRES_DEFAULT; + } + int **votes; + if (quals == 0) { + votes = get_votes_simple(align, n_seqs, seq_len); + } else { + votes = get_votes_qual(align, quals, n_seqs, seq_len, qual_thres); + } + char *consensus_gapped = build_consensus(votes, seq_len, cons_thres); + char *consensus; + if (gapped) { + consensus = consensus_gapped; + } else { + consensus = rm_gaps(consensus_gapped, seq_len); + } + free_votes(votes, seq_len); + return consensus; +} + + +char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[], + int n_seqs1, int n_seqs2, int seq_len, double cons_thres, + char qual_thres, int gapped, char *method) { + if (cons_thres == -1.0) { + cons_thres = THRES_DEFAULT; + } + int **votes1; + int **votes2; + if (quals1 == 0 || quals2 == 0) { + votes1 = get_votes_simple(align1, n_seqs1, seq_len); + votes2 = get_votes_simple(align2, n_seqs2, seq_len); + } else { + votes1 = get_votes_qual(align1, quals1, n_seqs1, seq_len, qual_thres); + votes2 = get_votes_qual(align2, quals2, n_seqs2, seq_len, qual_thres); + } + char *consensus_gapped; + if (!strncmp(method, "freq", 4)) { + consensus_gapped = build_consensus_duplex(votes1, votes2, seq_len, cons_thres); + } else if (!strncmp(method, "iupac", 5)) { + char *cons1 = build_consensus(votes1, seq_len, cons_thres); + char *cons2 = build_consensus(votes2, seq_len, cons_thres); + consensus_gapped = build_consensus_duplex_simple(cons1, cons2, 1); + } else { + return ""; + } + char *consensus; + if (gapped) { + consensus = consensus_gapped; + } else { + consensus = rm_gaps(consensus_gapped, seq_len); + } + free_votes(votes1, seq_len); + free_votes(votes2, seq_len); + return consensus; +} + + +void get_gap_quals(char *quals) { + int seq_len = strlen(quals); + int *window = malloc(sizeof(int) * WIN_LEN * 2); + int win_edge = init_gap_qual_window(window, quals, seq_len); + print_window(window, win_edge); + + int i; + char gap_qual; + for (i = 0; i < seq_len; i++) { + if (quals[i] == GAP_CHAR) { + gap_qual = get_gap_qual(window); + printf("gap %2d: %2d\n", i, gap_qual); + } else { + win_edge = push_qual(window, win_edge, quals, seq_len); + print_window(window, win_edge); + } + } +} + + +int main(int argc, char *argv[]) { + char **align = malloc(sizeof(char *) * (argc-1)); + + int seq_len = INT_MAX; + int i; + for (i = 1; i < argc; i++) { + if (strlen(argv[i]) < seq_len) { + seq_len = strlen(argv[i]); + } + align[i-1] = argv[i]; + } + + if (argc <= 1) { + return 1; + } + + get_gap_quals(align[0]); + return 0; + + int **votes = get_votes_simple(align, argc-1, seq_len); + char *consensus = build_consensus(votes, seq_len, THRES_DEFAULT); + print_votes(consensus, votes, seq_len); + printf("%s\n", consensus); + free_votes(votes, seq_len); + + return 0; +}
