Mercurial > repos > nick > duplex
view seqtools.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
| author | nick |
|---|---|
| date | Thu, 02 Feb 2017 18:44:31 -0500 |
| parents | |
| children |
line wrap: on
line source
#include <stdlib.h> #include <string.h> #include <ctype.h> #include <math.h> // ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz #define TRANS "TVGHEFCDIJMLKNOPQYWAABSXRZ[\\]^_`tvghefcdijmlknopqywaabsxrz" #define TRANS_OFFSET 65 #define TRANS_LEN 57 char* get_revcomp(char *input); char get_char_comp(char c); int *get_diffs_simple(char *cons, char *seqs[], int n_seqs); double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs); double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins); char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2); char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2); // Return the reverse complement of a sequence. // Makes a new copy of the string, so the original is not modified. char* get_revcomp(char *input) { int length = strlen(input); char *output = malloc(sizeof(char) * length + 1); int i, j; for (i = 0, j = length - 1; i < length && j >= 0; i++, j--) { output[j] = get_char_comp(input[i]); } output[length] = '\0'; return output; } // Return the complement of a base. // Uses a simple lookup table: a string with the complements of all possible sequence characters. char get_char_comp(char c) { int i = c - TRANS_OFFSET; if (i < 0 || i > TRANS_LEN) { return c; } else { return TRANS[i]; } } /* Take an existing alignment and consensus and compute the number of differences between each * sequence and the consensus. * Known bugs: * 1. Counts no differences in the following sequences: * consensus: GA---CA * seq 1: GA----A * seq 2: GA--ACA * 2. If a sequence starts with a gap, each base in the gap will be counted as a diff. */ int *get_diffs_simple(char *cons, char *seqs[], int n_seqs) { int *diffs = malloc(sizeof(int) * n_seqs); int i = 0; // Uppercase the consensus. while (cons[i] != 0) { cons[i] = toupper(cons[i]); i++; } // Loop through the sequences in the alignment. for (i = 0; i < n_seqs; i++) { int in_gap; diffs[i] = 0; int j = 0; // Compare each base of the sequence to the consensus. while (seqs[i][j] != 0 && cons[j] != 0) { if (cons[j] != '-' && seqs[i][j] != '-') { in_gap = 0; } if (toupper(seqs[i][j]) != cons[j]) { if (!in_gap) { diffs[i]++; } } if (cons[j] == '-' || seqs[i][j] == '-') { in_gap = 1; } j++; } } return diffs; } // Convert the output of get_diffs_simple() from raw diff counts to fractions of the total sequence // lengths. //TODO: Don't count gaps in sequence length. double *get_diffs_frac_simple(char *cons, char *seqs[], int n_seqs) { int *diffs = get_diffs_simple(cons, seqs, n_seqs); double *fracs = malloc(sizeof(double) * n_seqs); int i; for (i = 0; i < n_seqs; i++) { int j = 0; while (seqs[i][j] != 0 && cons[j] != 0) { j++; } fracs[i] = (double)diffs[i]/j; } return fracs; } /* Take an existing alignment and consensus and compute the number of differences between each * sequence and the consensus. Break each sequence into bins and tally the differences in each bin. * Known bugs: * 1. counts no differences in the following sequences: * consensus: GA---CA * seq 1: GA----A * seq 2: GA--ACA * 2. If a bin starts with a gap, each base in the gap will be counted as a diff. */ int **get_diffs_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { int bin_size = (int)round((float)seq_len/bins); // Initialize the diffs 2d array. int **diffs = malloc(sizeof(int*) * n_seqs); int i, j; for (i = 0; i < n_seqs; i++) { diffs[i] = malloc(bins * sizeof(int)); for (j = 0; j < bins; j++) { diffs[i][j] = 0; } } // Uppercase the consensus. while (cons[i] != 0) { cons[i] = toupper(cons[i]); i++; } int bin, in_gap; // Loop through the sequences in the alignment. for (i = 0; i < n_seqs; i++) { j = 0; // Compare each base of the sequence to the consensus. while (seqs[i][j] != 0 && cons[j] != 0) { bin = j/bin_size; if (bin >= bins) { break; } if (cons[j] != '-' && seqs[i][j] != '-') { in_gap = 0; } if (toupper(seqs[i][j]) != cons[j]) { if (!in_gap) { diffs[i][bin]++; } } if (cons[j] == '-' || seqs[i][j] == '-') { in_gap = 1; } j++; } } return diffs; } // Convert the output of get_diffs_binned() from raw diff counts to fractions of the total bin // lengths. //TODO: Don't count gaps in bin length. double **get_diffs_frac_binned(char *cons, char *seqs[], int n_seqs, int seq_len, int bins) { int bin_size = (int)round((float)seq_len/bins); int **diffs = get_diffs_binned(cons, seqs, n_seqs, seq_len, bins); double **fracs = malloc(sizeof(double*) * n_seqs); int i; for (i = 0; i < n_seqs; i++) { fracs[i] = malloc(sizeof(double) * bins); // Create and init array of lengths of the bins. int bin_lengths[bins]; int bin; for (bin = 0; bin < bins; bin++) { bin_lengths[bin] = 0; } // Tally size of each bin. int j = 0; while (seqs[i][j] != 0 && cons[j] != 0) { int bin = j/bin_size; if (bin >= bins) { break; } bin_lengths[bin]++; j++; } // For each bin, calculate the diff fraction = diffs / bin_length. for (bin = 0; bin < bins; bin++) { fracs[i][bin] = (double)diffs[i][bin]/bin_lengths[bin]; // printf("bin %d: %d / %d = %f\t", bin, diffs[i][bin], bin_lengths[bin], fracs[i][bin]); } // printf("\n"); } return fracs; } // Take an input sequence and insert gaps according to another, already-aligned sequence with gaps. // Input strings must be null-terminated. "gap_char1" is the character used for gaps in // "gapped_seq", and "gap_char2" is the gap character in "inseq". // N.B.: The ungapped length of "gapped_seq" must be equal to the length of "inseq". char *transfer_gaps(char *gapped_seq, char *inseq, char gap_char1, char gap_char2) { if (gap_char1 == 0) { gap_char1 = '-'; } if (gap_char2 == 0) { gap_char2 = '-'; } int gapped_len = strlen(gapped_seq); char *outseq = malloc(sizeof(char) * gapped_len + 1); // Transfer characters from inseq to outseq, except when gapped_seq has a gap at that spot // (insert a gap there instead). int g, o, i; for (g = 0, o = 0, i = 0; g < gapped_len; g++, o++) { if (gapped_seq[g] == gap_char1) { outseq[o] = gap_char2; } else { outseq[o] = inseq[i]; i++; } } outseq[gapped_len] = '\0'; return outseq; } // Wrapper for transfer_gaps() when operating on a set of sequences at once. char **transfer_gaps_multi(int n_seqs, char *gapped_seqs[], char *inseqs[], char gap_char1, char gap_char2) { char **outseqs = malloc(sizeof(char *) * n_seqs); int i; for (i = 0; i < n_seqs; i++) { outseqs[i] = transfer_gaps(gapped_seqs[i], inseqs[i], gap_char1, gap_char2); } return outseqs; }
