view consensus.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line source

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <limits.h>

// N.B. This defines the valid bases, but it's also effectively defined in the switches in
// get_votes_simple(), get_votes_qual(), and get_base_prime(), and in the constant IUPAC_BASES.
#define N_BASES 6
const char *BASES = "ACGTN-";
/* A  C   G   T   N   -     A: 2    Compute IUPAC ambiguous base character by representing each base
A  4  6  10  14  22  26     C: 3    with a prime and multiplying. Then use a lookup table (an array
C     9  15  21  33  39     G: 5    where the index is the product of the two primes).
G        25  35  55  65     T: 7
T            49  77  91     N: 11
N               121 143     -: 13    1         2         3         4         5         6         7
-                   169    01234567890123456789012345678901234567890123456789012345678901234567890*/
const char *IUPAC_BASES = "N...A.M..CR...WS.....YN..GN......N.K...N.........T.....N.........N....."
//                                   8         9        10        11        12        13        14
                           "......N.............N.............................N..................."
//                                  15        16        17
                           "..N.........................-";
#define THRES_DEFAULT 0.5
#define WIN_LEN 4
#define GAP_CHAR ' '

int **get_votes_simple(char *align[], int n_seqs, int seq_len);
int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres);
int init_gap_qual_window(int *window, char *quals, int seq_len);
char get_gap_qual(int *window);
int push_qual(int *window, int win_edge, char *quals, int seq_len);
void print_window(int *window, int win_edge);
int **init_votes(int seq_len);
void free_votes(int *votes[], int seq_len);
void print_votes(char *consensus, int *votes[], int seq_len);
char *rm_gaps(char *consensus, int cons_len);
char *build_consensus(int *votes[], int seq_len, double thres);
char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres);
char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped);
int get_base_prime(char base);
char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double thres,
                    char qual_thres, int gapped);
char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],
                           int n_seqs1, int n_seqs2, int seq_len, double cons_thres,
                           char qual_thres, int gapped, char *method);


// Tally the different bases at each position in an alignment.
// Returns an array of arrays: for each position in the alignment, an array of the number of times
// each base occurs at that position. The order of bases is as in the "BASES" constant.
int **get_votes_simple(char *align[], int n_seqs, int seq_len) {
  int **votes = init_votes(seq_len);

  // Tally votes for each base.
  int i, j;
  for (i = 0; i < n_seqs; i++) {
    for (j = 0; j < seq_len; j++) {
      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
      switch (toupper(align[i][j])) {
        case 'A':
          votes[j][0]++;
          break;
        case 'C':
          votes[j][1]++;
          break;
        case 'G':
          votes[j][2]++;
          break;
        case 'T':
          votes[j][3]++;
          break;
        case 'N':
          votes[j][4]++;
          break;
        case '-':
          votes[j][5]++;
          break;
      }
    }
  }

  return votes;
}


//  Tally votes for each base, ignoring bases with a quality score below "thres".
int **get_votes_qual(char *align[], char *quals[], int n_seqs, int seq_len, char thres) {
  int **votes = init_votes(seq_len);
  int *window = malloc(sizeof(int) * WIN_LEN * 2);
  int win_edge;

  // Tally votes for each base.
  char qual;
  int i, j;
  for (i = 0; i < n_seqs; i++) {
    win_edge = init_gap_qual_window(window, quals[i], seq_len);
    for (j = 0; j < seq_len; j++) {
      // Figure out the quality score of the base (or gap).
      if (align[i][j] == '-') {
        qual = get_gap_qual(window);
      } else {
        win_edge = push_qual(window, win_edge, quals[i], seq_len);
        qual = quals[i][j];
      }
      // Don't count bases whose quality is less than the threshold.
      if (qual < thres) {
        continue;
      }
      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
      switch (toupper(align[i][j])) {
        case 'A':
          votes[j][0]++;
          break;
        case 'C':
          votes[j][1]++;
          break;
        case 'G':
          votes[j][2]++;
          break;
        case 'T':
          votes[j][3]++;
          break;
        case 'N':
          votes[j][4]++;
          break;
        case '-':
          votes[j][5]++;
          break;
      }
    }
  }

  free(window);
  return votes;
}


/* Tally votes for each base, weighting by the PHRED score of the base.
 * This is based on the theory of PHRED scores representing the literal probability of the base call
 * being erroneous. Thus, if two reads show a C at a position, both with PHRED 20 (1/100 chance of
 * error), then the chances of them both being wrong are 1/100 * 1/100 = 1/10000 (PHRED 40).
 * So, theoretically and intuitively, it makes sense to trust two PHRED 20 C's over one PHRED 30 A.
 * This seems better than arbitrarily deciding not to consider bases below a PHRED score threshold.
 * How to decide when not to call the base? We could just say that if no base's score total is above
 * a certain threshold, we call it an N. Theoretically, this threshold is the confidence we want in
 * our final base calls. This could even replace the arbitrary 3 reads for a consensus threshold.
 */
int **get_votes_weighted(char *align[], char *quals[], int n_seqs, int seq_len) {
  int **votes = init_votes(seq_len);
  int *window = malloc(sizeof(int) * WIN_LEN * 2);
  int win_edge;

  // Tally votes for each base.
  char qual;
  int i, j;
  for (i = 0; i < n_seqs; i++) {
    win_edge = init_gap_qual_window(window, quals[i], seq_len);
    for (j = 0; j < seq_len; j++) {
      // Figure out the quality score of the base (or gap).
      if (align[i][j] == '-') {
        qual = get_gap_qual(window);
      } else {
        win_edge = push_qual(window, win_edge, quals[i], seq_len);
        qual = quals[i][j];
      }
      // N.B.: Could write this without hardcoded literals, but it's about 40% slower.
      switch (toupper(align[i][j])) {
        case 'A':
          votes[j][0] += qual;
          break;
        case 'C':
          votes[j][1] += qual;
          break;
        case 'G':
          votes[j][2] += qual;
          break;
        case 'T':
          votes[j][3] += qual;
          break;
        case 'N':
          votes[j][4] += qual;
          break;
        case '-':
          votes[j][5] += qual;
          break;
      }
    }
  }

  free(window);
  return votes;
}


/* Calculation of gap quality scores:
 * "window" is an array of length 2*WIN_LEN, holding the quality scores of the non-gap bases WIN_LEN
 * from the current one in both directions. When we're at the start or end, fill the slots beyond
 * the edge of the sequence with -1 as a sentinel for "N/A". For example, after the 2nd base in the
 * sequence, if WIN_LEN is 4 and there are no gaps yet, "window" should look something like this:
 *   base coordinates         0  1   2  3  4  5
 *   quality scores           =  9   A  >  E  B
 *   array values     [-1|-1|28|24||32|29|36|33]
 * Usage:
 * Allocate "window" and call init_gap_qual_window() to initialize it by filling its right side with
 * the first WIN_LEN non-gap quality scores in the sequence. It will also return "win_edge", which
 * tracks the coordinate of the rightmost quality score in "window".
 * Then, start traversing the array of quality scores. For each score, if it's a gap character, call
 * get_gap_qual() to get the computed quality score at the gap. Otherwise, call push_qual() to add
 * another quality score to the end of the window. This will keep the window filled with all non-gap
 * quality scores, and each time you call get_gap_qual(), you should be at the gap between the two
 * quality scores at the center of the window.
 */

// This does the initial fill of the window array, adding the first WIN_LEN bases to the right side
// and filling the left side with -1's.
int init_gap_qual_window(int *window, char *quals, int seq_len) {
  // Fill left side with -1's (no quality information).
  int i;
  for (i = 0; i < WIN_LEN; i++) {
    window[i] = -1;
  }
  // Fill right side with first WIN_LEN quality scores. Skip gaps, and if you run out of quality
  // scores (if seq_len < WIN_LEN), fill the rest with -1. Leave win_edge at the last base we added.
  i = WIN_LEN;
  int win_edge = -1;
  int quals_added = 0;
  while (quals_added < WIN_LEN) {
    win_edge++;
    if (win_edge >= seq_len) {
      win_edge = seq_len;
      window[i] = -1;
      i++;
      quals_added++;
    } else if (quals[win_edge] != GAP_CHAR) {
      window[i] = quals[win_edge];
      i++;
      quals_added++;
    }
  }
  return win_edge;
}


// Push the next non-gap quality score onto the right side of the window.
int push_qual(int *window, int win_edge, char *quals, int seq_len) {
  // Find the next quality score that's not a gap.
  char next_qual = GAP_CHAR;
  while (next_qual == GAP_CHAR) {
    win_edge++;
    if (win_edge < seq_len) {
      next_qual = quals[win_edge];
    } else {
      win_edge = seq_len;
      next_qual = -1;
    }
  }
  // Shift all the quality scores left add the new one.
  int last_qual;
  int i;
  for (i = WIN_LEN*2 - 1; i >= 0; i--) {
    last_qual = window[i];
    window[i] = next_qual;
    next_qual = last_qual;
  }
  return win_edge;
}


// Compute the quality of the gap based on a weighted average of the quality scores in the window.
// The scores near the center of the window are weighted higher than the ones further away.
char get_gap_qual(int *window) {
  int score_sum = 0;
  int weight_sum = 0;
  int weight = 1;
  int i;
  for (i = 0; i < WIN_LEN*2; i++) {
    if (window[i] != -1) {
      score_sum += window[i] * weight;
      weight_sum += weight;
    }
    // Increase the weight until we get to the middle of the window (at WIN_LEN), then decrease it.
    if (i < WIN_LEN - 1) {
      weight++;
    } else if (i > WIN_LEN - 1) {
      weight--;
    }
  }
  if (weight_sum > 0) {
    // Divide by the sum of the weights to get the final quality score.
    return (char) (score_sum/weight_sum);
  } else {
    return '\0';
  }
}


void print_window(int *window, int win_edge) {
  printf("[");
  int i;
  for (i = 0; i < WIN_LEN*2; i++) {
    printf("%c", window[i]);
    if (i == WIN_LEN - 1) {
      printf("||");
    } else if (i == WIN_LEN*2 - 1) {
      printf("] %-2d\n", win_edge);
    } else {
      printf("|");
    }
  }
}


int **init_votes(int seq_len) {
  int **votes = malloc(sizeof(int *) * seq_len);
  int i, j;
  for (i = 0; i < seq_len; i++) {
    votes[i] = malloc(sizeof(int) * N_BASES);
    for (j = 0; j < N_BASES; j++) {
      votes[i][j] = 0;
    }
  }
  return votes;
}


void free_votes(int *votes[], int seq_len) {
  int i;
  for (i = 0; i < seq_len; i++) {
    free(votes[i]);
  }
  free(votes);
}


void print_votes(char *consensus, int *votes[], int seq_len) {
  int i, j;
  printf("   ");
  for (j = 0; j < N_BASES; j++) {
    printf(" %c ", BASES[j]);
  }
  printf("\n");
  for (i = 0; i < seq_len; i++) {
    printf("%c: ", consensus[i]);
    for (j = 0; j < N_BASES; j++) {
      if (votes[i][j]) {
        printf("%2d ", votes[i][j]);
      } else {
        printf("   ");
      }
    }
    printf("\n");
  }
}


// Take a consensus sequence which may have gaps ('-' characters) and remove them to produce the
// actual final sequence. "cons_len" should be the length of the original, gapped, sequence.
char *rm_gaps(char *consensus, int cons_len) {
  char *output = malloc(sizeof(char) * cons_len + 1);
  int i;
  int j = 0;
  for (i = 0; i < cons_len; i++) {
    if (consensus[i] != '-') {
      output[j] = consensus[i];
      j++;
    }
  }
  output[j] = '\0';
  return output;
}


char *build_consensus(int *votes[], int seq_len, double thres) {
  char *consensus = malloc(sizeof(char) * seq_len + 1);
  
  int i, j;
  for (i = 0; i < seq_len; i++) {
    int total = 0;
    int max_vote = 0;
    char max_base = 'N';
    for (j = 0; j < N_BASES; j++) {
      total += votes[i][j];
      if (votes[i][j] > max_vote) {
        max_vote = votes[i][j];
        max_base = BASES[j];
      }
      if (total == 0) {
        consensus[i] = 'N';
      } else if ((double)max_vote/total > thres) {
        consensus[i] = max_base;
      } else {
        consensus[i] = 'N';
      }
    }
  }

  consensus[seq_len] = '\0';
  return consensus;
}


// Build a consensus sequence from two alignments by weighting each equally and considering only
// the frequency of each base in each alignment.
char *build_consensus_duplex(int *votes1[], int *votes2[], int seq_len, double thres) {
  char *consensus = malloc(sizeof(char) * seq_len + 1);

  int i, j;
  for (i = 0; i < seq_len; i++) {
    // Sum the total votes at this position.
    /*TODO: This does an extra loop through the votes to get the total so it can calculate actual
     *      frequencies in the second pass. Technically, this information could be gathered when
     *      originally tallying the votes in the get_votes functions. Or, the total could be assumed
     *      to be n_seqs if every base always contributes a vote (even when it's not in "ACGTN-").
     */
    int total1 = 0;
    for (j = 0; j < N_BASES; j++) {
      total1 += votes1[i][j];
    }
    int total2 = 0;
    for (j = 0; j < N_BASES; j++) {
      total2 += votes2[i][j];
    }
    double max_freq = 0.0;
    char max_base = 'N';
    for (j = 0; j < N_BASES; j++) {
      // Get the frequency of each base.
      double freq1;
      if (total1 > 0) {
        freq1 = (double)votes1[i][j]/total1;
      }
      double freq2;
      if (total2 > 0) {
        freq2 = (double)votes2[i][j]/total2;
      }
      // frequency of the base = average of frequencies in the two sequences
      double avg_freq;
      if (total1 == 0 && total2 == 0) {
        avg_freq = -1.0;
      } else if (total1 == 0) {
        avg_freq = freq2;
      } else if (total2 == 0) {
        avg_freq = freq1;
      } else {
        avg_freq = (freq1 + freq2) / 2;
      }
      // Track the highest frequency seen.
      if (avg_freq > max_freq) {
        max_freq = avg_freq;
        max_base = BASES[j];
      }
    }
    if (max_freq > thres) {
      consensus[i] = max_base;
    } else {
      consensus[i] = 'N';
    }
  }

  consensus[seq_len] = '\0';
  return consensus;
}


// "cons1" and "cons2" must be null-terminated strings of equal lengths.
char *build_consensus_duplex_simple(char *cons1, char *cons2, int gapped) {
  int seq_len = strlen(cons1);
  char *cons = malloc(sizeof(char) * seq_len + 1);
  int i = 0;
  int base_prime1, base_prime2;
  while (cons1[i] != '\0' && cons2[i] != '\0') {
    base_prime1 = get_base_prime(cons1[i]);
    base_prime2 = get_base_prime(cons2[i]);
    cons[i] = IUPAC_BASES[base_prime1*base_prime2];
    i++;
  }
  cons[seq_len] = '\0';
  if (gapped) {
    return cons;
  } else {
    return rm_gaps(cons, seq_len);
  }
}


int get_base_prime(char base) {
  switch (base) {
    case 'A':
      return 2;
    case 'C':
      return 3;
    case 'G':
      return 5;
    case 'T':
      return 7;
    case 'N':
      return 11;
    case '-':
      return 13;
    default:
      return 0;
  }
}


// Convenience function to create a consensus in one step.
// Give 0 as "quals" to not use quality scores, and -1.0 as "cons_thres" to use the default
// consensus threshold when evaluating base votes.
char *get_consensus(char *align[], char *quals[], int n_seqs, int seq_len, double cons_thres,
                    char qual_thres, int gapped) {
  if (cons_thres == -1.0) {
    cons_thres = THRES_DEFAULT;
  }
  int **votes;
  if (quals == 0) {
    votes = get_votes_simple(align, n_seqs, seq_len);
  } else {
    votes = get_votes_qual(align, quals, n_seqs, seq_len, qual_thres);
  }
  char *consensus_gapped = build_consensus(votes, seq_len, cons_thres);
  char *consensus;
  if (gapped) {
    consensus = consensus_gapped;
  } else {
    consensus = rm_gaps(consensus_gapped, seq_len);
  }
  free_votes(votes, seq_len);
  return consensus;
}


char *get_consensus_duplex(char *align1[], char *align2[], char *quals1[], char *quals2[],
                           int n_seqs1, int n_seqs2, int seq_len, double cons_thres,
                           char qual_thres, int gapped, char *method) {
  if (cons_thres == -1.0) {
    cons_thres = THRES_DEFAULT;
  }
  int **votes1;
  int **votes2;
  if (quals1 == 0 || quals2 == 0) {
    votes1 = get_votes_simple(align1, n_seqs1, seq_len);
    votes2 = get_votes_simple(align2, n_seqs2, seq_len);
  } else {
    votes1 = get_votes_qual(align1, quals1, n_seqs1, seq_len, qual_thres);
    votes2 = get_votes_qual(align2, quals2, n_seqs2, seq_len, qual_thres);
  }
  char *consensus_gapped;
  if (!strncmp(method, "freq", 4)) {
    consensus_gapped = build_consensus_duplex(votes1, votes2, seq_len, cons_thres);
  } else if (!strncmp(method, "iupac", 5)) {
    char *cons1 = build_consensus(votes1, seq_len, cons_thres);
    char *cons2 = build_consensus(votes2, seq_len, cons_thres);
    consensus_gapped = build_consensus_duplex_simple(cons1, cons2, 1);
  } else {
    return "";
  }
  char *consensus;
  if (gapped) {
    consensus = consensus_gapped;
  } else {
    consensus = rm_gaps(consensus_gapped, seq_len);
  }
  free_votes(votes1, seq_len);
  free_votes(votes2, seq_len);
  return consensus;
}


void get_gap_quals(char *quals) {
  int seq_len = strlen(quals);
  int *window = malloc(sizeof(int) * WIN_LEN * 2);
  int win_edge = init_gap_qual_window(window, quals, seq_len);
  print_window(window, win_edge);

  int i;
  char gap_qual;
  for (i = 0; i < seq_len; i++) {
    if (quals[i] == GAP_CHAR) {
      gap_qual = get_gap_qual(window);
      printf("gap %2d: %2d\n", i, gap_qual);
    } else {
      win_edge = push_qual(window, win_edge, quals, seq_len);
      print_window(window, win_edge);
    }
  }
}


int main(int argc, char *argv[]) {
  char **align = malloc(sizeof(char *) * (argc-1));

  int seq_len = INT_MAX;
  int i;
  for (i = 1; i < argc; i++) {
    if (strlen(argv[i]) < seq_len) {
      seq_len = strlen(argv[i]);
    }
    align[i-1] = argv[i];
  }

  if (argc <= 1) {
    return 1;
  }

  get_gap_quals(align[0]);
  return 0;

  int **votes = get_votes_simple(align, argc-1, seq_len);
  char *consensus = build_consensus(votes, seq_len, THRES_DEFAULT);
  print_votes(consensus, votes, seq_len);
  printf("%s\n", consensus);
  free_votes(votes, seq_len);

  return 0;
}