diff seqtools.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seqtools.c	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,236 @@
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <math.h>
+
+//             ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz
+#define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz"
+#define TRANS_OFFSET 65
+#define TRANS_LEN 57
+
+char* get_revcomp(char *input);
+char get_char_comp(char c);
+int *get_diffs_simple(char *cons, char *seqs[], int n_seqs);
+double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs);
+double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins);
+char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2);
+char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2);
+
+
+// Return the reverse complement of a sequence.
+// Makes a new copy of the string, so the original is not modified.
+char* get_revcomp(char *input) {
+  int length = strlen(input);
+  char *output = malloc(sizeof(char) * length + 1);
+  int i, j;
+  for (i = 0, j = length - 1; i < length && j >= 0; i++, j--) {
+    output[j] = get_char_comp(input[i]);
+  }
+  output[length] = '\0';
+  return output;
+}
+
+
+// Return the complement of a base.
+// Uses a simple lookup table: a string with the complements of all possible sequence characters.
+char get_char_comp(char c) {
+  int i = c - TRANS_OFFSET;
+  if (i < 0 || i > TRANS_LEN) {
+    return c;
+  } else {
+    return TRANS[i];
+  }
+}
+
+
+/* Take an existing alignment and consensus and compute the number of differences between each
+ * sequence and the consensus.
+ * Known bugs:
+ * 1. Counts no differences in the following sequences:
+ *   consensus: GA---CA
+ *   seq 1:     GA----A
+ *   seq 2:     GA--ACA
+ * 2. If a sequence starts with a gap, each base in the gap will be counted as a diff.
+ */
+int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) {
+  int *diffs = malloc(sizeof(int) * n_seqs);
+  int i = 0;
+  // Uppercase the consensus.
+  while (cons[i] != 0) {
+    cons[i] = toupper(cons[i]);
+    i++;
+  }
+  // Loop through the sequences in the alignment.
+  for (i = 0; i < n_seqs; i++) {
+    int in_gap;
+    diffs[i] = 0;
+    int j = 0;
+    // Compare each base of the sequence to the consensus.
+    while (seqs[i][j] != 0 && cons[j] != 0) {
+      if (cons[j] != '-' && seqs[i][j] != '-') {
+        in_gap = 0;
+      }
+      if (toupper(seqs[i][j]) != cons[j]) {
+        if (!in_gap) {
+          diffs[i]++;
+        }
+      }
+      if (cons[j] == '-' || seqs[i][j] == '-') {
+        in_gap = 1;
+      }
+      j++;
+    }
+  }
+  return diffs;
+}
+
+
+// Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence
+// lengths.
+//TODO: Don't count gaps in sequence length.
+double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) {
+  int *diffs = get_diffs_simple(cons, seqs, n_seqs);
+  double *fracs = malloc(sizeof(double) * n_seqs);
+  int i;
+  for (i = 0; i < n_seqs; i++) {
+    int j = 0;
+    while (seqs[i][j] != 0 && cons[j] != 0) {
+      j++;
+    }
+    fracs[i] = (double)diffs[i]/j;
+  }
+  return fracs;
+}
+
+
+/* Take an existing alignment and consensus and compute the number of differences between each
+ * sequence and the consensus. Break each sequence into bins and tally the differences in each bin.
+ * Known bugs:
+ * 1. counts no differences in the following sequences:
+ *   consensus: GA---CA
+ *   seq 1:     GA----A
+ *   seq 2:     GA--ACA
+ * 2. If a bin starts with a gap, each base in the gap will be counted as a diff.
+ */
+int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
+  int bin_size = (int)round((float)seq_len/bins);
+  // Initialize the diffs 2d array.
+  int **diffs = malloc(sizeof(int*) * n_seqs);
+  int i, j;
+  for (i = 0; i < n_seqs; i++) {
+    diffs[i] = malloc(bins * sizeof(int));
+    for (j = 0; j < bins; j++) {
+      diffs[i][j] = 0;
+    }
+  }
+  // Uppercase the consensus.
+  while (cons[i] != 0) {
+    cons[i] = toupper(cons[i]);
+    i++;
+  }
+  int bin, in_gap;
+  // Loop through the sequences in the alignment.
+  for (i = 0; i < n_seqs; i++) {
+    j = 0;
+    // Compare each base of the sequence to the consensus.
+    while (seqs[i][j] != 0 && cons[j] != 0) {
+      bin = j/bin_size;
+      if (bin >= bins) {
+        break;
+      }
+      if (cons[j] != '-' && seqs[i][j] != '-') {
+        in_gap = 0;
+      }
+      if (toupper(seqs[i][j]) != cons[j]) {
+        if (!in_gap) {
+          diffs[i][bin]++;
+        }
+      }
+      if (cons[j] == '-' || seqs[i][j] == '-') {
+        in_gap = 1;
+      }
+      j++;
+    }
+  }
+  return diffs;
+}
+
+
+// Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin
+// lengths.
+//TODO: Don't count gaps in bin length.
+double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) {
+  int bin_size = (int)round((float)seq_len/bins);
+  int **diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins);
+  double **fracs = malloc(sizeof(double*) * n_seqs);
+  int i;
+  for (i = 0; i < n_seqs; i++) {
+    fracs[i] = malloc(sizeof(double) * bins);
+    // Create and init array of lengths of the bins.
+    int bin_lengths[bins];
+    int bin;
+    for (bin = 0; bin < bins; bin++) {
+      bin_lengths[bin] = 0;
+    }
+    // Tally size of each bin.
+    int j = 0;
+    while (seqs[i][j] != 0 && cons[j] != 0) {
+      int bin = j/bin_size;
+      if (bin >= bins) {
+        break;
+      }
+      bin_lengths[bin]++;
+      j++;
+    }
+    // For each bin, calculate the diff fraction = diffs / bin_length.
+    for (bin = 0; bin < bins; bin++) {
+      fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin];
+      // printf("bin %d: %d / %d = %f\t", bin, diffs[i][bin], bin_lengths[bin], fracs[i][bin]);
+    }
+    // printf("\n");
+  }
+  return fracs;
+}
+
+
+// Take an input sequence and insert gaps according to another, already-aligned sequence with gaps.
+// Input strings must be null-terminated. "gap_char1" is the character used for gaps in
+// "gapped_seq", and "gap_char2" is the gap character in "inseq".
+// N.B.: The ungapped length of "gapped_seq" must be equal to the length of "inseq".
+char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) {
+  if (gap_char1 == 0) {
+    gap_char1 = '-';
+  }
+  if (gap_char2 == 0) {
+    gap_char2 = '-';
+  }
+  int gapped_len = strlen(gapped_seq);
+  char *outseq = malloc(sizeof(char) * gapped_len + 1);
+
+  // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot
+  // (insert a gap there instead).
+  int g, o, i;
+  for (g = 0, o = 0, i = 0; g < gapped_len; g++, o++) {
+    if (gapped_seq[g] == gap_char1) {
+      outseq[o] = gap_char2;
+    } else {
+      outseq[o] = inseq[i];
+      i++;
+    }
+  }
+  outseq[gapped_len] = '\0';
+
+  return outseq;
+}
+
+
+// Wrapper for transfer_gaps() when operating on a set of sequences at once.
+char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1,
+                           char gap_char2) {
+  char **outseqs = malloc(sizeof(char *) * n_seqs);
+  int i;
+  for (i = 0; i < n_seqs; i++) {
+    outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2);
+  }
+  return outseqs;
+}