view genome_diversity/src/dpmix.c @ 0:73648da53556 default tip

Uploaded
author rico
date Mon, 09 Apr 2012 11:55:36 -0400
parents
children
line wrap: on
line source

/* dpmix -- admixture using dynamic programming
*
*    argv{1] = a Galaxy SNP table. For each of several individuals, the table
*              has four columns (#A, #B, genotype, quality) -- SNPs on the same
*	       chromosome must appear together, and in order of position
*    argv[2] = column with the chromosome name (position is the next column)
*    argv[3] = "all" or e.g., "chr20"
*    argv[4] = 1 if ancestral allele frequencies are estimated from SAMtools
*		genotypes; 0 means use read-coverage data.
*    argv[5] = 1 to add logarithms of probabilities, allowing unobserve alleles,
*	       0 to simply add probabilities
*    argv[6] = switch penalty (>= 0)
*    argv[7] = file giving heterochromatic intervals ('-' means that no file is
*	       given)
*    argv[8] = file name to write additional output
*    argv[9], argv[10], ...,  have the form "13:1:paul", "13:2:fred" or
*	      "13:0:Mary", meaning that the 13th and 14th columns (base 1)
*	      give the allele counts for an individual that is in ancestral
*	      population 1, ancestral population 2, or is a potentially admixed
*	      individual, resp.

What it does on Galaxy
The user specifies two "ancestral" populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the two classes of ancestral chromosomes. The user also picks a "switch penalty", typically between 10 and 100. For each potentially admixed individual, the program divides the genome into three "genotypes": (0) homozygous for the second ancestral population (i.e., both chromosomes from that population), (1) heterozygous, or (2) homozygous for the second ancestral population. Parts of a chromosome that are labeled as "heterochromatic" via argv[7] are given the "non-genotype, 3. Smaller values of the switch penalty (corresponding to more ancient admixture events) generally lead to the reconstruction of more frequent changes between genotypes.
*/

#include "lib.h"
//#include <math.h>

// maximum length of a line from the table
#define MOST 5000

struct snp {
	double F1, F2;	// reference allele frequencies in the two populations
	int pos, het_end, *g; // position and an array of admixed genotypes
	struct snp *prev;	// we keep the list in order of decreasing pos
} *last;

struct admixed {
	char *name;
	int gcol, ge20, gt02;
	long long x[4];
} A[MOST];

// information about the specified individuals, namely column and population
// for the ancestral populations and genotype columns for the admixed.
struct ances {
	int col, pop;
	char *name;
} C[MOST];

// heterochromatic intervals
struct het {
	char *chr;
	int b, e;
} H[MOST];

// global variables
double *S[4];	// S[s][i] = best score ending in state s at SNP i
int *B[4],	// backpointer to state at the previous SNP
    *P;		// chromosome position
int nH, nI, nG, genotypes, nsnp, debug, chr_col, logs;
char this_chr[100];
double switch_penalty;
char buf[MOST], *status;
FILE *fp, *out;

// probability of producing genotype g in admixture state s
// given reference allele frequencies f1 and f2 in the ancestral populations
double score (double f1, double f2, int g, int s) {
	double p;

	if (s == 2) { // homozygous for the first ancestral population
		if (g == 2)
			p = f1*f1;
		else if (g == 0)
			p = (1.0-f1)*(1.0-f1);
		else
			p = 2.0*f1*(1.0-f1);
	} else if (s == 0) { // homozygous for the second ancestral population
		if (g == 2)
			p = f2*f2;
		else if (g == 0)
			p = (1.0-f2)*(1.0-f2);
		else
			p = 2.0*f2*(1.0-f2);
	} else { // one chromosome from each ancestral population
		if (s != 1)
			fatalf("bad state %d", s);
		if (g == 2)
			p = f1*f2;
		else if (g == 0)
			p = (1.0-f1)*(1.0-f2);
		else
			p = f1*(1.0-f2)  + (1.0-f1)*f2;
	}
	
//fprintf(stderr, "%f %f %d %d => %f\n", f1, f2, g, s, p);
	if (p < 0.0)
		fatalf("%f %f %d %d => %f", f1, f2, g, s, p);
	if (!logs)
		return p;
#ifdef NEVER
	if (p == 0.0)
		return -5.0;
	p = log(p);
	if (p < -5.0)
		p = -5.0;
	return p;
#endif
	fatal("dpmix: cannot happen");
}

char *get_chr_name() {
	static char tmp[MOST];
	char *s, *z = "\t\n";
	int i = chr_col;

	strcpy(tmp, buf);
	s = strtok(tmp, z);
	while (--i > 0)
		s = strtok(NULL, z);
	return s;
}

// process one potentially admixed individual
void one_admix(int a) {
	int i, j, m, state, prev_pos, b;
	double from[4];
	struct snp *p;

	for (i = nsnp-1, p = last; i >= 0 && p != NULL; --i, p = p->prev) {
		for (state = 0; state < 4; ++state) {
			// end (start, rather) in this state
			for (j = 0; j < 4; ++j) {
				// preceding state is j
				from[j] = S[j][i+1];
				if (j != state)
					from[j] -= switch_penalty;
				//if (abs(j-state) == 2)
					//from[j] -= switch_penalty;
			}
			for (m = 0, j = 1; j < 4; ++j)
				if (from[j] > from[m])
					m = j;
			S[state][i] = from[m];
			B[state][i] = m;
			if (state < 3)
				S[state][i] +=
				    score(p->F1, p->F2, p->g[a], state);
		}
		if (p->het_end == 1) {
			S[3][i] = 0;
			S[0][i] = S[1][i] = S[2][i] = -1000000;
		} else
			S[3][i] = -1000000;
		if (debug)
			fprintf(stderr, "%d: %f(%d) %f(%d) %f(%d)\n",
			  i, S[0][i], B[0][i], S[1][i], B[1][i], S[2][i],
			  B[2][i]);
	}

	// find the best initial state
	for (state = 0, j = 1; j < 4; ++j)
		if (S[j][0] > S[state][0])
			state = j;

	// trace back to find the switch points
	// A[a].x[state] records the total length of intervals in each state
	for (prev_pos = i = 0; i < nsnp; ++i) {
		if ((b = B[state][i]) != state) {
			if (prev_pos < P[i+1]-1)
				printf("%s\t%d\t%d\t%d\t%s\n",
				  this_chr, prev_pos, P[i+1], state, A[a].name);
			A[a].x[state] += (P[i+1]-prev_pos);
			prev_pos = P[i+1];
			state = b;
		}
	}
/*
	printf("%s\t%d\t%d\t%d\t%s\n",
	  this_chr, prev_pos, P[nsnp-1], state, A[a].name);
	A[a].x[state] += (P[nsnp-1]-prev_pos);
*/
} 

// add a heterochromatic interval to the SNP list
void add_het(int b, int e) {
	struct snp *new = ckalloc(sizeof(struct snp));
	int i;

	new->F1 = new->F2 = 0.0;
	new->pos = b;
	new->het_end = e;
	new->g = ckalloc(nG*sizeof(int));
	for (i = 0; i < nG; ++i)
		new->g[i] = 0;
	new->prev = last;
	last = new;
}

// Process one chromosome. Read the SNPs on the chromosome (the first one is
// already in the buf). For each SNP, determine the frequencies of the
// reference allele in the two ancestral populations, and put them in the
// linked list. Then call the dynamic-programming routine.
void one_chr() {
	char *s, *z = "\t\n";
	int X[MOST], n, i, g, A1, B1, A2, B2, a, do_read, p, pos, het;
	struct snp *new;
	double F1, F2;

	strcpy(this_chr, get_chr_name());
	nsnp = 0;
	last = NULL;
	for (het = 0; het < nH && !same_string(this_chr, H[het].chr); ++het)
		;
	// loop over the SNPs on the current chromosome
	for (do_read = 0; ; do_read = 1) {
		if (do_read && (status = fgets(buf, MOST, fp)) == NULL)
			break; 
		if (!same_string(get_chr_name(), this_chr))
			break;
		
		// set X[i] = atoi(i-th word of buf), i is base 1
		for (i = 1, s = strtok(buf, z); s != NULL;
		  ++i, s = strtok(NULL, z))
			X[i] = atoi(s);

		// insert heterochomatin intervals coming before the SNP
		pos = X[chr_col+1];
		while (het < nH && same_string(this_chr, H[het].chr) &&
		   H[het].b < pos) {
			add_het(H[het].b, 1);
			add_het(H[het].e, 2);
			nsnp+= 2;
			++het;
		}
			
		// should we discard this SNP?
		if ((pos = X[chr_col+1]) == -1)	// SNP not mapped to the reference
			continue;
		for (i = 0; i < nG && X[A[i].gcol] >= 0; ++i)
			;
		if (i < nG)	// genotype of admixed individual not called
			continue;

		// add SNP to a "backward pointing" linked list, recording the
		// "major" allele frequencies in the two reference populations
		// and genotypes in the potential admixed individuals
		for (i = A1 = B1 = A2 = B2 = 0; i < nI; ++i) {
			n = C[i].col;
			p = C[i].pop;
			if (genotypes) {
				g = X[n+2];
				if (g == -1)
					continue;
				if (g < 0 || g > 2)
					fatalf("invalid genotype %d", g);
				if (p == 1) {
					A1 += g;
					B1 += (2 - g);
				} else if (p == 2) {
					A2 += g;
					B2 += (2 - g);
				}
			} else {	// use read counts
				if (p == 1) {
					A1 += X[n];
					B1 += X[n+1];
				} else if (p == 2) {
					A2 += X[n];
					B2 += X[n+1];
				}
			}
		}
		if (A1+B1 == 0 || A2+B2 == 0)
			continue;
		++nsnp;
		new = ckalloc(sizeof(struct snp));
		new->pos = X[chr_col+1];
		new->F1 = F1 = (double)A1/(double)(A1+B1);
		new->F2 = F2 = (double)A2/(double)(A2+B2);
		new->het_end = 0;
		new->g = ckalloc(nG*sizeof(int));
		for (i = 0; i < nG; ++i) {
			g = new->g[i] = X[A[i].gcol];
			if (score(F1, F2, g, 2) >= score(F1, F2, g, 0))
				A[i].ge20++;
			else 
				A[i].gt02++;
		}
		if (F1 < 0.0 || F1 > 1.0)
			fatalf("F1 = %f (A1 = %d, B1 = %d) at snp %d",
			  F1, A1, B1, nsnp);
		if (F2 < 0.0 || F2 > 1.0)
			fatalf("F2 = %f (A2 = %d, B2 = %d) at snp %d",
			  F2, A2, B2, nsnp);
		new->prev = last;
		last = new;
	}
	// insert heterochomatin intervals that follow all SN
	while (het < nH && same_string(this_chr, H[het].chr)) {
		add_het(H[het].b, 1);
		add_het(H[het].e, 2);
		nsnp += 2;
		++het;
	}
/*
printf("nsnp = %d\n", nsnp);
for (i = nsnp-1, new = last; i >= 0 && new != NULL; --i, new = new->prev) {
printf("%d %d ", new->pos, new->het_end);
printf("%g %g ", new->F1, new->F2);
for (a = 0; a < nG; ++a)
printf("%d", new->g[a]);
putchar('\n');
}
//exit(0);
printf("\nbacktrace\n");
*/

	// allocate arrays for the DP analysis
	P = ckalloc(nsnp*sizeof(int));
	for (i = nsnp-1, new = last; i >= 0 && new != NULL;
	     --i, new = new->prev)
		P[i] = new->pos;

	for (i = 0; i < 4; ++i) {
		S[i] = ckalloc((nsnp+1)*sizeof(double));
		S[i][nsnp] = 0.0;
		B[i] = ckalloc((nsnp+1)*sizeof(int));
		B[i][nsnp] = 0;
	}
	
	// loop over possibly admixed individuals
	for (a = 0; a < nG; ++a)
		one_admix(a);

	// free the allocated storage
	while (last != NULL) {
		new = last;
		last = last->prev;
		free(new->g);
		free(new);
	}
	free(P);
	for (i = 0; i < 4; ++i) {
		free(S[i]);
		free(B[i]);
	}
}

int main(int argc, char **argv) {
	int n, i, j, k, saw[3];
	long long het_len;
	float N;
	char nam[100], *chr;

	if (argc < 8)
		fatal("args: table chr-col chr data-source switch heterochrom outfile n:1:name1 m:2:name2 ...");
	if (same_string(argv[argc-1], "debug")) {
		debug = 1;
		--argc;
	}

	// handle command-line arguments
	chr_col = atoi(argv[2]);
	chr = argv[3];
	genotypes = atoi(argv[4]);
	logs = atoi(argv[5]);
	if (logs)
		fatal("logarithms of probabilities -- under development");
	switch_penalty = atof(argv[6]);
/*
	if (logs)
		switch_penalty = log(switch_penalty);
*/
	if (switch_penalty < 0.0)
		fatal("negative switch penalty");
	out = ckopen(argv[8], "w");

	het_len = 0;
	if (!same_string(argv[7], "-")) {
		fp = ckopen(argv[7], "r");
		while (fgets(buf, MOST, fp)) {
			if (nH >= MOST)
				fatal("Too many heterochromatic intervals");
			if (sscanf(buf, "%s %d %d", nam, &i, &j) != 3)
				fatalf("gagging: %s", buf);
			H[nH].chr = copy_string(nam);
			H[nH].b = i;
			H[nH].e = j;
			het_len += (j - i);
			++nH;
		}
		fclose(fp);
	}
/*
for (i = 0; i < nH; ++i)
printf("%s %d %d\n", H[i].chr, H[i].b, H[i].e);
exit(0);
*/

	saw[1] = saw[2] = 0;
	// populations must be disjoint
	for (i = 9; i < argc; ++i) {
		if (sscanf(argv[i], "%d:%d:%s", &j, &k, nam) != 3)
			fatalf("not like 13:2:fred : %s", argv[i]);
		if (k < 0 || k > 2)
			fatalf("not population 0, 1 or 2: %s", argv[i]);
		saw[k] = 1;

		// seen this individual (i.e., column) before??
		for (n = 0; n < nI && C[n].col != j; ++n)
			;
		if (n < nI)
			fatal("populations are not disjoint");
		if (k == 0) {	// admixed individuals
			if (nG >= MOST)
				fatal("Too many admixed individuals");
			A[nG].name = copy_string(nam);
			A[nG++].gcol = j+2;
		} else {	// ancestral populations
			if (nI >= MOST)
				fatal("Too many ancestral individuals");
			C[nI].col = j;
			C[nI].pop = k;
			C[nI++].name = copy_string(nam);
		}
	}
		
	if (saw[0] == 0)
		fatal("no admixed individual is specified");
	if (saw[1] == 0)
		fatal("first reference population is empty");
	if (saw[2] == 0)
		fatal("second reference population is empty");

	// start the output file of text
	for (k = 1; k <= 2; ++k) {
		fprintf(out, "state %d agrees with:", k == 1 ? 2 : 0);
		for (i = 0; i < nI; ++i)
			if (C[i].pop == k)
				fprintf(out, " %s", C[i].name);
		putc('\n', out);
	}
	putc('\n', out);

	fp = ckopen(argv[1], "r");
	while ((status = fgets(buf, MOST, fp)) != NULL && buf[0] == '#')
		;
	if (same_string(chr, "all"))
		while (status != NULL)
			one_chr();
	else {	// skip to the specified chromosome
		while (!same_string(chr, get_chr_name()) &&
		       (status = fgets(buf, MOST, fp)) != NULL)
			;
		if (status != NULL)
			one_chr();
	}
	for (i = 0; i < nG; ++i) {
		fprintf(out,
		  "%s: %d SNPs where state 2 is at least as likely as state 0\n",
		  A[i].name, A[i].ge20);
		fprintf(out,
		  "%s: %d SNPs where state 0 is more likely than state 2\n\n",
		  A[i].name, A[i].gt02);
	}
	// write fractions in each state to the output text file
	for (i = 0; i < nG; ++i) {
		N = (float)(A[i].x[0] + A[i].x[1] + A[i].x[2])/100.0;
		fprintf(out, "%s: 0 = %1.1f%%, 1 = %1.1f%%, 2 = %1.1f%%\n",
		  A[i].name, (float)A[i].x[0]/N, (float)A[i].x[1]/N,
		  (float)A[i].x[2]/N); 
	}

	return 0;
}