Mercurial > repos > nick > duplex
diff seqtools.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/seqtools.c Thu Feb 02 18:44:31 2017 -0500 @@ -0,0 +1,236 @@ +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <math.h> + +// ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz +#define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz" +#define TRANS_OFFSET 65 +#define TRANS_LEN 57 + +char* get_revcomp(char *input); +char get_char_comp(char c); +int *get_diffs_simple(char *cons, char *seqs[], int n_seqs); +double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs); +double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins); +char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2); +char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2); + + +// Return the reverse complement of a sequence. +// Makes a new copy of the string, so the original is not modified. +char* get_revcomp(char *input) { + int length = strlen(input); + char *output = malloc(sizeof(char) * length + 1); + int i, j; + for (i = 0, j = length - 1; i < length && j >= 0; i++, j--) { + output[j] = get_char_comp(input[i]); + } + output[length] = '\0'; + return output; +} + + +// Return the complement of a base. +// Uses a simple lookup table: a string with the complements of all possible sequence characters. +char get_char_comp(char c) { + int i = c - TRANS_OFFSET; + if (i < 0 || i > TRANS_LEN) { + return c; + } else { + return TRANS[i]; + } +} + + +/* Take an existing alignment and consensus and compute the number of differences between each + * sequence and the consensus. + * Known bugs: + * 1. Counts no differences in the following sequences: + * consensus: GA---CA + * seq 1: GA----A + * seq 2: GA--ACA + * 2. If a sequence starts with a gap, each base in the gap will be counted as a diff. + */ +int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) { + int *diffs = malloc(sizeof(int) * n_seqs); + int i = 0; + // Uppercase the consensus. + while (cons[i] != 0) { + cons[i] = toupper(cons[i]); + i++; + } + // Loop through the sequences in the alignment. + for (i = 0; i < n_seqs; i++) { + int in_gap; + diffs[i] = 0; + int j = 0; + // Compare each base of the sequence to the consensus. + while (seqs[i][j] != 0 && cons[j] != 0) { + if (cons[j] != '-' && seqs[i][j] != '-') { + in_gap = 0; + } + if (toupper(seqs[i][j]) != cons[j]) { + if (!in_gap) { + diffs[i]++; + } + } + if (cons[j] == '-' || seqs[i][j] == '-') { + in_gap = 1; + } + j++; + } + } + return diffs; +} + + +// Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence +// lengths. +//TODO: Don't count gaps in sequence length. +double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) { + int *diffs = get_diffs_simple(cons, seqs, n_seqs); + double *fracs = malloc(sizeof(double) * n_seqs); + int i; + for (i = 0; i < n_seqs; i++) { + int j = 0; + while (seqs[i][j] != 0 && cons[j] != 0) { + j++; + } + fracs[i] = (double)diffs[i]/j; + } + return fracs; +} + + +/* Take an existing alignment and consensus and compute the number of differences between each + * sequence and the consensus. Break each sequence into bins and tally the differences in each bin. + * Known bugs: + * 1. counts no differences in the following sequences: + * consensus: GA---CA + * seq 1: GA----A + * seq 2: GA--ACA + * 2. If a bin starts with a gap, each base in the gap will be counted as a diff. + */ +int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { + int bin_size = (int)round((float)seq_len/bins); + // Initialize the diffs 2d array. + int **diffs = malloc(sizeof(int*) * n_seqs); + int i, j; + for (i = 0; i < n_seqs; i++) { + diffs[i] = malloc(bins * sizeof(int)); + for (j = 0; j < bins; j++) { + diffs[i][j] = 0; + } + } + // Uppercase the consensus. + while (cons[i] != 0) { + cons[i] = toupper(cons[i]); + i++; + } + int bin, in_gap; + // Loop through the sequences in the alignment. + for (i = 0; i < n_seqs; i++) { + j = 0; + // Compare each base of the sequence to the consensus. + while (seqs[i][j] != 0 && cons[j] != 0) { + bin = j/bin_size; + if (bin >= bins) { + break; + } + if (cons[j] != '-' && seqs[i][j] != '-') { + in_gap = 0; + } + if (toupper(seqs[i][j]) != cons[j]) { + if (!in_gap) { + diffs[i][bin]++; + } + } + if (cons[j] == '-' || seqs[i][j] == '-') { + in_gap = 1; + } + j++; + } + } + return diffs; +} + + +// Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin +// lengths. +//TODO: Don't count gaps in bin length. +double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { + int bin_size = (int)round((float)seq_len/bins); + int **diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins); + double **fracs = malloc(sizeof(double*) * n_seqs); + int i; + for (i = 0; i < n_seqs; i++) { + fracs[i] = malloc(sizeof(double) * bins); + // Create and init array of lengths of the bins. + int bin_lengths[bins]; + int bin; + for (bin = 0; bin < bins; bin++) { + bin_lengths[bin] = 0; + } + // Tally size of each bin. + int j = 0; + while (seqs[i][j] != 0 && cons[j] != 0) { + int bin = j/bin_size; + if (bin >= bins) { + break; + } + bin_lengths[bin]++; + j++; + } + // For each bin, calculate the diff fraction = diffs / bin_length. + for (bin = 0; bin < bins; bin++) { + fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin]; + // printf("bin %d: %d / %d = %f\t", bin, diffs[i][bin], bin_lengths[bin], fracs[i][bin]); + } + // printf("\n"); + } + return fracs; +} + + +// Take an input sequence and insert gaps according to another, already-aligned sequence with gaps. +// Input strings must be null-terminated. "gap_char1" is the character used for gaps in +// "gapped_seq", and "gap_char2" is the gap character in "inseq". +// N.B.: The ungapped length of "gapped_seq" must be equal to the length of "inseq". +char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) { + if (gap_char1 == 0) { + gap_char1 = '-'; + } + if (gap_char2 == 0) { + gap_char2 = '-'; + } + int gapped_len = strlen(gapped_seq); + char *outseq = malloc(sizeof(char) * gapped_len + 1); + + // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot + // (insert a gap there instead). + int g, o, i; + for (g = 0, o = 0, i = 0; g < gapped_len; g++, o++) { + if (gapped_seq[g] == gap_char1) { + outseq[o] = gap_char2; + } else { + outseq[o] = inseq[i]; + i++; + } + } + outseq[gapped_len] = '\0'; + + return outseq; +} + + +// Wrapper for transfer_gaps() when operating on a set of sequences at once. +char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, + char gap_char2) { + char **outseqs = malloc(sizeof(char *) * n_seqs); + int i; + for (i = 0; i < n_seqs; i++) { + outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2); + } + return outseqs; +}
