Mercurial > repos > nick > duplex

diff consensus.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author: nick
date: Thu, 02 Feb 2017 18:44:31 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/consensus.c	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,614 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h>
+
+// N.B. This defines the valid bases, but it's also effectively defined in the switches in
+// get_votes_simple(), get_votes_qual(), and get_base_prime(), and in the constant IUPAC_BASES.
+#define N_BASES 6
+const char *BASES = "ACGTN-";
+/* A  C   G   T   N   -     A: 2    Compute IUPAC ambiguous base character by representing each base
+A  4  6  10  14  22  26     C: 3    with a prime and multiplying. Then use a lookup table (an array
+C     9  15  21  33  39     G: 5    where the index is the product of the two primes).
+G        25  35  55  65     T: 7
+T            49  77  91     N: 11
+N               121 143     -: 13    1         2         3         4         5         6         7
+-                   169    01234567890123456789012345678901234567890123456789012345678901234567890*/
+const char *IUPAC_BASES = "N...A.M..CR...WS.....YN..GN......N.K...N.........T.....N.........N....."
+//                                   8         9        10        11        12        13        14
+                           "......N.............N.............................N..................."
+//                                  15        16        17
+                           "..N.........................-";
+#define THRES_DEFAULT 0.5
+#define WIN_LEN 4
+#define GAP_CHAR ' '
+
+int **get_votes_simple(char *align[], int n_seqs, int seq_len);
+int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres);
+int init_gap_qual_window(int *window, char *quals, int seq_len);
+char get_gap_qual(int *window);
+int push_qual(int *window, int win_edge, char *quals, int seq_len);
+void print_window(int *window, int win_edge);
+int **init_votes(int seq_len);
+void free_votes(int *votes[], int seq_len);
+void print_votes(char *consensus, int *votes[], int seq_len);
+char *rm_gaps(char *consensus, int cons_len);
+char *build_consensus(int *votes[], int seq_len, double thres);
+char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres);
+char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped);
+int get_base_prime(char base);
+char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double thres,
+                    char qual_thres, int gapped);
+char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],
+                           int n_seqs1, int n_seqs2, int seq_len, double cons_thres,
+                           char qual_thres, int gapped, char *method);
+
+
+// Tally the different bases at each position in an alignment.
+// Returns an array of arrays: for each position in the alignment, an array of the number of times
+// each base occurs at that position. The order of bases is as in the "BASES" constant.
+int **get_votes_simple(char *align[], int n_seqs, int seq_len) {
+  int **votes = init_votes(seq_len);
+
+  // Tally votes for each base.
+  int i, j;
+  for (i = 0; i < n_seqs; i++) {
+    for (j = 0; j < seq_len; j++) {
+      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
+      switch (toupper(align[i][j])) {
+        case 'A':
+          votes[j][0]++;
+          break;
+        case 'C':
+          votes[j][1]++;
+          break;
+        case 'G':
+          votes[j][2]++;
+          break;
+        case 'T':
+          votes[j][3]++;
+          break;
+        case 'N':
+          votes[j][4]++;
+          break;
+        case '-':
+          votes[j][5]++;
+          break;
+      }
+    }
+  }
+
+  return votes;
+}
+
+
+//  Tally votes for each base, ignoring bases with a quality score below "thres".
+int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres) {
+  int **votes = init_votes(seq_len);
+  int *window = malloc(sizeof(int) * WIN_LEN * 2);
+  int win_edge;
+
+  // Tally votes for each base.
+  char qual;
+  int i, j;
+  for (i = 0; i < n_seqs; i++) {
+    win_edge = init_gap_qual_window(window, quals[i], seq_len);
+    for (j = 0; j < seq_len; j++) {
+      // Figure out the quality score of the base (or gap).
+      if (align[i][j] == '-') {
+        qual = get_gap_qual(window);
+      } else {
+        win_edge = push_qual(window, win_edge, quals[i], seq_len);
+        qual = quals[i][j];
+      }
+      // Don't count bases whose quality is less than the threshold.
+      if (qual < thres) {
+        continue;
+      }
+      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
+      switch (toupper(align[i][j])) {
+        case 'A':
+          votes[j][0]++;
+          break;
+        case 'C':
+          votes[j][1]++;
+          break;
+        case 'G':
+          votes[j][2]++;
+          break;
+        case 'T':
+          votes[j][3]++;
+          break;
+        case 'N':
+          votes[j][4]++;
+          break;
+        case '-':
+          votes[j][5]++;
+          break;
+      }
+    }
+  }
+
+  free(window);
+  return votes;
+}
+
+
+/* Tally votes for each base, weighting by the PHRED score of the base.
+ * This is based on the theory of PHRED scores representing the literal probability of the base call
+ * being erroneous. Thus, if two reads show a C at a position, both with PHRED 20 (1/100 chance of
+ * error), then the chances of them both being wrong are 1/100 * 1/100 = 1/10000 (PHRED 40).
+ * So, theoretically and intuitively, it makes sense to trust two PHRED 20 C's over one PHRED 30 A.
+ * This seems better than arbitrarily deciding not to consider bases below a PHRED score threshold.
+ * How to decide when not to call the base? We could just say that if no base's score total is above
+ * a certain threshold, we call it an N. Theoretically, this threshold is the confidence we want in
+ * our final base calls. This could even replace the arbitrary 3 reads for a consensus threshold.
+ */
+int **get_votes_weighted(char *align[], char *quals[], int n_seqs, int seq_len) {
+  int **votes = init_votes(seq_len);
+  int *window = malloc(sizeof(int) * WIN_LEN * 2);
+  int win_edge;
+
+  // Tally votes for each base.
+  char qual;
+  int i, j;
+  for (i = 0; i < n_seqs; i++) {
+    win_edge = init_gap_qual_window(window, quals[i], seq_len);
+    for (j = 0; j < seq_len; j++) {
+      // Figure out the quality score of the base (or gap).
+      if (align[i][j] == '-') {
+        qual = get_gap_qual(window);
+      } else {
+        win_edge = push_qual(window, win_edge, quals[i], seq_len);
+        qual = quals[i][j];
+      }
+      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
+      switch (toupper(align[i][j])) {
+        case 'A':
+          votes[j][0] += qual;
+          break;
+        case 'C':
+          votes[j][1] += qual;
+          break;
+        case 'G':
+          votes[j][2] += qual;
+          break;
+        case 'T':
+          votes[j][3] += qual;
+          break;
+        case 'N':
+          votes[j][4] += qual;
+          break;
+        case '-':
+          votes[j][5] += qual;
+          break;
+      }
+    }
+  }
+
+  free(window);
+  return votes;
+}
+
+
+/* Calculation of gap quality scores:
+ * "window" is an array of length 2*WIN_LEN, holding the quality scores of the non-gap bases WIN_LEN
+ * from the current one in both directions. When we're at the start or end, fill the slots beyond
+ * the edge of the sequence with -1 as a sentinel for "N/A". For example, after the 2nd base in the
+ * sequence, if WIN_LEN is 4 and there are no gaps yet, "window" should look something like this:
+ *   base coordinates         0  1   2  3  4  5
+ *   quality scores           =  9   A  >  E  B
+ *   array values     [-1|-1|28|24||32|29|36|33]
+ * Usage:
+ * Allocate "window" and call init_gap_qual_window() to initialize it by filling its right side with
+ * the first WIN_LEN non-gap quality scores in the sequence. It will also return "win_edge", which
+ * tracks the coordinate of the rightmost quality score in "window".
+ * Then, start traversing the array of quality scores. For each score, if it's a gap character, call
+ * get_gap_qual() to get the computed quality score at the gap. Otherwise, call push_qual() to add
+ * another quality score to the end of the window. This will keep the window filled with all non-gap
+ * quality scores, and each time you call get_gap_qual(), you should be at the gap between the two
+ * quality scores at the center of the window.
+ */
+
+// This does the initial fill of the window array, adding the first WIN_LEN bases to the right side
+// and filling the left side with -1's.
+int init_gap_qual_window(int *window, char *quals, int seq_len) {
+  // Fill left side with -1's (no quality information).
+  int i;
+  for (i = 0; i < WIN_LEN; i++) {
+    window[i] = -1;
+  }
+  // Fill right side with first WIN_LEN quality scores. Skip gaps, and if you run out of quality
+  // scores (if seq_len < WIN_LEN), fill the rest with -1. Leave win_edge at the last base we added.
+  i = WIN_LEN;
+  int win_edge = -1;
+  int quals_added = 0;
+  while (quals_added < WIN_LEN) {
+    win_edge++;
+    if (win_edge >= seq_len) {
+      win_edge = seq_len;
+      window[i] = -1;
+      i++;
+      quals_added++;
+    } else if (quals[win_edge] != GAP_CHAR) {
+      window[i] = quals[win_edge];
+      i++;
+      quals_added++;
+    }
+  }
+  return win_edge;
+}
+
+
+// Push the next non-gap quality score onto the right side of the window.
+int push_qual(int *window, int win_edge, char *quals, int seq_len) {
+  // Find the next quality score that's not a gap.
+  char next_qual = GAP_CHAR;
+  while (next_qual == GAP_CHAR) {
+    win_edge++;
+    if (win_edge < seq_len) {
+      next_qual = quals[win_edge];
+    } else {
+      win_edge = seq_len;
+      next_qual = -1;
+    }
+  }
+  // Shift all the quality scores left add the new one.
+  int last_qual;
+  int i;
+  for (i = WIN_LEN*2 - 1; i >= 0; i--) {
+    last_qual = window[i];
+    window[i] = next_qual;
+    next_qual = last_qual;
+  }
+  return win_edge;
+}
+
+
+// Compute the quality of the gap based on a weighted average of the quality scores in the window.
+// The scores near the center of the window are weighted higher than the ones further away.
+char get_gap_qual(int *window) {
+  int score_sum = 0;
+  int weight_sum = 0;
+  int weight = 1;
+  int i;
+  for (i = 0; i < WIN_LEN*2; i++) {
+    if (window[i] != -1) {
+      score_sum += window[i] * weight;
+      weight_sum += weight;
+    }
+    // Increase the weight until we get to the middle of the window (at WIN_LEN), then decrease it.
+    if (i < WIN_LEN - 1) {
+      weight++;
+    } else if (i > WIN_LEN - 1) {
+      weight--;
+    }
+  }
+  if (weight_sum > 0) {
+    // Divide by the sum of the weights to get the final quality score.
+    return (char) (score_sum/weight_sum);
+  } else {
+    return '\0';
+  }
+}
+
+
+void print_window(int *window, int win_edge) {
+  printf("[");
+  int i;
+  for (i = 0; i < WIN_LEN*2; i++) {
+    printf("%c", window[i]);
+    if (i == WIN_LEN - 1) {
+      printf("||");
+    } else if (i == WIN_LEN*2 - 1) {
+      printf("] %-2d\n", win_edge);
+    } else {
+      printf("|");
+    }
+  }
+}
+
+
+int **init_votes(int seq_len) {
+  int **votes = malloc(sizeof(int *) * seq_len);
+  int i, j;
+  for (i = 0; i < seq_len; i++) {
+    votes[i] = malloc(sizeof(int) * N_BASES);
+    for (j = 0; j < N_BASES; j++) {
+      votes[i][j] = 0;
+    }
+  }
+  return votes;
+}
+
+
+void free_votes(int *votes[], int seq_len) {
+  int i;
+  for (i = 0; i < seq_len; i++) {
+    free(votes[i]);
+  }
+  free(votes);
+}
+
+
+void print_votes(char *consensus, int *votes[], int seq_len) {
+  int i, j;
+  printf("   ");
+  for (j = 0; j < N_BASES; j++) {
+    printf(" %c ", BASES[j]);
+  }
+  printf("\n");
+  for (i = 0; i < seq_len; i++) {
+    printf("%c: ", consensus[i]);
+    for (j = 0; j < N_BASES; j++) {
+      if (votes[i][j]) {
+        printf("%2d ", votes[i][j]);
+      } else {
+        printf("   ");
+      }
+    }
+    printf("\n");
+  }
+}
+
+
+// Take a consensus sequence which may have gaps ('-' characters) and remove them to produce the
+// actual final sequence. "cons_len" should be the length of the original, gapped, sequence.
+char *rm_gaps(char *consensus, int cons_len) {
+  char *output = malloc(sizeof(char) * cons_len + 1);
+  int i;
+  int j = 0;
+  for (i = 0; i < cons_len; i++) {
+    if (consensus[i] != '-') {
+      output[j] = consensus[i];
+      j++;
+    }
+  }
+  output[j] = '\0';
+  return output;
+}
+
+
+char *build_consensus(int *votes[], int seq_len, double thres) {
+  char *consensus = malloc(sizeof(char) * seq_len + 1);
+  
+  int i, j;
+  for (i = 0; i < seq_len; i++) {
+    int total = 0;
+    int max_vote = 0;
+    char max_base = 'N';
+    for (j = 0; j < N_BASES; j++) {
+      total += votes[i][j];
+      if (votes[i][j] > max_vote) {
+        max_vote = votes[i][j];
+        max_base = BASES[j];
+      }
+      if (total == 0) {
+        consensus[i] = 'N';
+      } else if ((double)max_vote/total > thres) {
+        consensus[i] = max_base;
+      } else {
+        consensus[i] = 'N';
+      }
+    }
+  }
+
+  consensus[seq_len] = '\0';
+  return consensus;
+}
+
+
+// Build a consensus sequence from two alignments by weighting each equally and considering only
+// the frequency of each base in each alignment.
+char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres) {
+  char *consensus = malloc(sizeof(char) * seq_len + 1);
+
+  int i, j;
+  for (i = 0; i < seq_len; i++) {
+    // Sum the total votes at this position.
+    /*TODO: This does an extra loop through the votes to get the total so it can calculate actual
+     *      frequencies in the second pass. Technically, this information could be gathered when
+     *      originally tallying the votes in the get_votes functions. Or, the total could be assumed
+     *      to be n_seqs if every base always contributes a vote (even when it's not in "ACGTN-").
+     */
+    int total1 = 0;
+    for (j = 0; j < N_BASES; j++) {
+      total1 += votes1[i][j];
+    }
+    int total2 = 0;
+    for (j = 0; j < N_BASES; j++) {
+      total2 += votes2[i][j];
+    }
+    double max_freq = 0.0;
+    char max_base = 'N';
+    for (j = 0; j < N_BASES; j++) {
+      // Get the frequency of each base.
+      double freq1;
+      if (total1 > 0) {
+        freq1 = (double)votes1[i][j]/total1;
+      }
+      double freq2;
+      if (total2 > 0) {
+        freq2 = (double)votes2[i][j]/total2;
+      }
+      // frequency of the base = average of frequencies in the two sequences
+      double avg_freq;
+      if (total1 == 0 && total2 == 0) {
+        avg_freq = -1.0;
+      } else if (total1 == 0) {
+        avg_freq = freq2;
+      } else if (total2 == 0) {
+        avg_freq = freq1;
+      } else {
+        avg_freq = (freq1 + freq2) / 2;
+      }
+      // Track the highest frequency seen.
+      if (avg_freq > max_freq) {
+        max_freq = avg_freq;
+        max_base = BASES[j];
+      }
+    }
+    if (max_freq > thres) {
+      consensus[i] = max_base;
+    } else {
+      consensus[i] = 'N';
+    }
+  }
+
+  consensus[seq_len] = '\0';
+  return consensus;
+}
+
+
+// "cons1" and "cons2" must be null-terminated strings of equal lengths.
+char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped) {
+  int seq_len = strlen(cons1);
+  char *cons = malloc(sizeof(char) * seq_len + 1);
+  int i = 0;
+  int base_prime1, base_prime2;
+  while (cons1[i] != '\0' && cons2[i] != '\0') {
+    base_prime1 = get_base_prime(cons1[i]);
+    base_prime2 = get_base_prime(cons2[i]);
+    cons[i] = IUPAC_BASES[base_prime1*base_prime2];
+    i++;
+  }
+  cons[seq_len] = '\0';
+  if (gapped) {
+    return cons;
+  } else {
+    return rm_gaps(cons, seq_len);
+  }
+}
+
+
+int get_base_prime(char base) {
+  switch (base) {
+    case 'A':
+      return 2;
+    case 'C':
+      return 3;
+    case 'G':
+      return 5;
+    case 'T':
+      return 7;
+    case 'N':
+      return 11;
+    case '-':
+      return 13;
+    default:
+      return 0;
+  }
+}
+
+
+// Convenience function to create a consensus in one step.
+// Give 0 as "quals" to not use quality scores, and -1.0 as "cons_thres" to use the default
+// consensus threshold when evaluating base votes.
+char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double cons_thres,
+                    char qual_thres, int gapped) {
+  if (cons_thres == -1.0) {
+    cons_thres = THRES_DEFAULT;
+  }
+  int **votes;
+  if (quals == 0) {
+    votes = get_votes_simple(align, n_seqs, seq_len);
+  } else {
+    votes = get_votes_qual(align, quals, n_seqs, seq_len, qual_thres);
+  }
+  char *consensus_gapped = build_consensus(votes, seq_len, cons_thres);
+  char *consensus;
+  if (gapped) {
+    consensus = consensus_gapped;
+  } else {
+    consensus = rm_gaps(consensus_gapped, seq_len);
+  }
+  free_votes(votes, seq_len);
+  return consensus;
+}
+
+
+char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],
+                           int n_seqs1, int n_seqs2, int seq_len, double cons_thres,
+                           char qual_thres, int gapped, char *method) {
+  if (cons_thres == -1.0) {
+    cons_thres = THRES_DEFAULT;
+  }
+  int **votes1;
+  int **votes2;
+  if (quals1 == 0 || quals2 == 0) {
+    votes1 = get_votes_simple(align1, n_seqs1, seq_len);
+    votes2 = get_votes_simple(align2, n_seqs2, seq_len);
+  } else {
+    votes1 = get_votes_qual(align1, quals1, n_seqs1, seq_len, qual_thres);
+    votes2 = get_votes_qual(align2, quals2, n_seqs2, seq_len, qual_thres);
+  }
+  char *consensus_gapped;
+  if (!strncmp(method, "freq", 4)) {
+    consensus_gapped = build_consensus_duplex(votes1, votes2, seq_len, cons_thres);
+  } else if (!strncmp(method, "iupac", 5)) {
+    char *cons1 = build_consensus(votes1, seq_len, cons_thres);
+    char *cons2 = build_consensus(votes2, seq_len, cons_thres);
+    consensus_gapped = build_consensus_duplex_simple(cons1, cons2, 1);
+  } else {
+    return "";
+  }
+  char *consensus;
+  if (gapped) {
+    consensus = consensus_gapped;
+  } else {
+    consensus = rm_gaps(consensus_gapped, seq_len);
+  }
+  free_votes(votes1, seq_len);
+  free_votes(votes2, seq_len);
+  return consensus;
+}
+
+
+void get_gap_quals(char *quals) {
+  int seq_len = strlen(quals);
+  int *window = malloc(sizeof(int) * WIN_LEN * 2);
+  int win_edge = init_gap_qual_window(window, quals, seq_len);
+  print_window(window, win_edge);
+
+  int i;
+  char gap_qual;
+  for (i = 0; i < seq_len; i++) {
+    if (quals[i] == GAP_CHAR) {
+      gap_qual = get_gap_qual(window);
+      printf("gap %2d: %2d\n", i, gap_qual);
+    } else {
+      win_edge = push_qual(window, win_edge, quals, seq_len);
+      print_window(window, win_edge);
+    }
+  }
+}
+
+
+int main(int argc, char *argv[]) {
+  char **align = malloc(sizeof(char *) * (argc-1));
+
+  int seq_len = INT_MAX;
+  int i;
+  for (i = 1; i < argc; i++) {
+    if (strlen(argv[i]) < seq_len) {
+      seq_len = strlen(argv[i]);
+    }
+    align[i-1] = argv[i];
+  }
+
+  if (argc <= 1) {
+    return 1;
+  }
+
+  get_gap_quals(align[0]);
+  return 0;
+
+  int **votes = get_votes_simple(align, argc-1, seq_len);
+  char *consensus = build_consensus(votes, seq_len, THRES_DEFAULT);
+  print_votes(consensus, votes, seq_len);
+  printf("%s\n", consensus);
+  free_votes(votes, seq_len);
+
+  return 0;
+}
author	nick
date	Thu, 02 Feb 2017 18:44:31 -0500
parents
children