view genome_diversity/src/Fst_ave.c @ 0:73648da53556 default tip

Uploaded
author rico
date Mon, 09 Apr 2012 11:55:36 -0400
parents
children
line wrap: on
line source

/* Fst_ave -- determine the average Fst values between two specified populations
*  and between two random populations
*
*    argv[1] = a Galaxy SNP table. For each of several individuals, the table
*              has four columns (#A, #B, genotype, quality).
*    argv[2] = 1 if Fst is estimated from SAMtools genotypes; 0 means use
*	        read-coverage data.
*    argv[3] = lower bound, for individual quality value if argv[2] = 1
*	       or for total number of reads per population if argv[2] = 0.
*	       SNPs not satisfying these lower bounds are ignored.
*    argv[4] = 1 to discard SNPs that appear fixed in the two populations
*    argv[5] = 1 for unbiased estimator, else 0 for the original Wright form.
*    argv[6] = k => 0 says report the average Fst and the largest average over k
*              randomly chosen splits into two populations of those sizes
*    argv[7], argv[8], ...,  have the form "13:1", "13:2" or "13:0", meaning
*             that the 13th and 14th columns (base 1) give the allele counts
*             for an individual that is in population 1, in population 2,
*             or in neither population.

What it does on Galaxy

The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to select individuals from a SNP table. No individual can be in both populations. Other choices are as follows.

Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations.

After specifying the data source, the user sets lower bounds on the amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SNPs not meeting these lower bounds are ignored.

The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded.

The user chooses which definition of Fst to use: Wright's original definition or Weir's unbiased estimator.

Finally, the user decides whether to use randomizations. If so, then the user specifies how many randomly generated population pairs (retaining the numbers of individuals of the originals) to generate, as well as the "population" of additional individuals (not in the first two populations) that can be used in the randomization process.

The program prints the average Fst for the original populations and the number of SNPs used to compute it. If randomizations were requested, it prints the average Fst for each randomly generated population pair, ending with a summary that includes the maximum and average value, and the highest-scoring population pair.
*/

#include "lib.h"
#include "Fst_lib.h"

// maximum length of a line from the table; also sizes the arrays below
#define MOST 5000

// Per-individual information, indexed 0 .. nI-1:
//   col[i]    first table column (base 1) of individual i
//   x[i]      population label of individual i: 0, 1, or 2
//   best_x[i] copy of x[] saved for the highest-scoring grouping
// Shuffling x creates random "populations".
int col[MOST];
int x[MOST];
int best_x[MOST];

// number of individuals named on the command line
int nI;
// settings taken from argv[3], argv[5], argv[4] and argv[2], respectively
int lower_bound;
int unbiased;
int discard;
int genotypes;
// number of SNPs that contributed to the most recent ave_Fst() result
int nsnp;

// allele counts of one individual at one SNP;
// every SNP owns an array of these, one entry per individual
struct count {
	int A;
	int B;
};

// the Galaxy table is summarized as a singly linked list with one node
// per SNP; c points to that SNP's array of per-individual counts
struct snp {
	struct count *c;
	struct snp *next;
};

// first and last nodes of the list
struct snp *start;
struct snp *last;

/* Given the two populations specified by x[], return the average Fst.
*
*  Walks the SNP list starting at "start". For each SNP the A and B counts
*  of the individuals labelled 1 and 2 in x[] are summed separately
*  (individuals labelled 0, or whose counts are negative, are skipped) and
*  passed to Fst(). SNPs that look fixed in both populations are skipped
*  when "discard" is set, as are SNPs where either population has fewer
*  than the required number of counts.
*
*  Side effect: nsnp is set to the number of SNPs that contributed.
*  Returns the mean of the per-SNP Fst values, or 0.0 if no SNP qualified
*  (the caller can detect that case through nsnp == 0).
*/
double ave_Fst(void) {
	double tot_Fst;
	struct snp *s;
	int i, A1, B1, A2, B2, too_few;

	// Minimum total count needed in each population. In genotype mode a
	// single usable genotype suffices; otherwise lower_bound applies.
	too_few = (genotypes ? 1 : lower_bound);

	// scan the SNPs
	tot_Fst = 0.0;
	nsnp = 0;
	for (s = start; s != NULL; s = s->next) {
		// get counts for the two populations at this SNP
		for (A1 = B1 = A2 = B2 = i = 0; i < nI; ++i) {
			if (s->c[i].A < 0) // no genotypes
				continue;
			if (x[i] == 1) {
				A1 += s->c[i].A;
				B1 += s->c[i].B;
			} else if (x[i] == 2) {
				A2 += s->c[i].A;
				B2 += s->c[i].B;
			}
		}
		if (discard && ((A1 == 0 && A2 == 0) || (B1 == 0 && B2 == 0)))
			continue;	// fixed in these two populations
		if (A1+B1 >= too_few && A2+B2 >= too_few) {
			++nsnp;
			tot_Fst += Fst(A1, B1, A2, B2, unbiased);
		}
	}

	// With no qualifying SNP the division below would be 0.0/0, i.e. NaN,
	// which would then contaminate every comparison and total in the caller.
	if (nsnp == 0)
		return 0.0;
	return tot_Fst/nsnp;
}

/* Randomly rearrange x[0], x[1], ... , x[nI-1] in place.
*  This is Algorithm P, page 125 of "The Art of Computer Programming (Vol II)
*  Seminumerical Programming", by Donald Knuth, Addison-Wesley, 1971:
*  working down from the last position, each entry is exchanged with an
*  entry picked at random from those at or below it.
*/
void shuffle() {
	int pos = nI - 1;

	while (pos > 0) {
		// pick satisfies 0 <= pick <= pos, so an entry may stay put
		int pick = random() % (pos + 1);
		int held = x[pick];

		x[pick] = x[pos];
		x[pos] = held;
		--pos;
	}
}

/* Program entry point; the arguments are described in the comment at the
*  top of this file.
*
*  Steps:
*   1. Parse the options and the "column:population" arguments into the
*      globals (genotypes, lower_bound, discard, unbiased, col[], x[], nI).
*   2. Read the SNP table, keeping for every data line the A/B counts of
*      each named individual in a linked list (start .. last).
*   3. Print the average Fst of the specified populations and, if shuffles
*      were requested, the value for every shuffled grouping followed by a
*      summary.
*
*  Malformed arguments, out-of-range columns, overlong table lines and
*  table lines with too few columns are reported through fatal()/fatalf().
*  Blank table lines are skipped. Returns 0 on success.
*/
int main(int argc, char **argv) {
	FILE *fp;
	char *p, *z = "\t\n", buf[MOST];
	int X[MOST], nshuff, n, i, j, k, saw[3], larger, all = 1;
	int nfields, need, c, too_long;
	struct snp *new;
	double F, F1, largest_F, tot_F;

	if (argc < 7)
		fatal("args: table data-source lower_bound discard? unbiased? #shuffles n:1 m:2 ...");

	// handle command-line arguments
	genotypes = atoi(argv[2]);
	lower_bound = atoi(argv[3]);
	if (!genotypes && lower_bound <= 0)
		fatal("minimum coverage should exceed 0");
	discard = atoi(argv[4]);
	unbiased = atoi(argv[5]);
	nshuff = atoi(argv[6]);
	saw[0] = saw[1] = saw[2] = 0;
	// populations 1 and 2 must be disjoint
	// population 0 can be replaced by population 1 or 2 
	for (i = 7; i < argc; ++i) {
		if (sscanf(argv[i], "%d:%d", &j, &k) != 2)
			fatalf("not like 13:2 : %s", argv[i]);
		if (k < 0 || k > 2)
			fatalf("not population 0, 1 or 2: %s", argv[i]);
		// columns are base 1, and X[j] .. X[j+3] must lie inside X[]
		if (j < 1 || j > MOST - 4)
			fatalf("column number out of range: %s", argv[i]);
		saw[k] = 1;
		// seen this individual (i.e., column) before??
		for (n = 0; n < nI && col[n] != j; ++n)
			;
		if (n < nI) { // OK if one of the populations is 0
			if (k > 0) {
				if (x[n] > 0 && x[n] != k)
				  fatalf("column %d is in both populations", j);
				x[n] = k;
			}
		} else {
			// col[], x[] and best_x[] hold at most MOST individuals
			if (nI >= MOST)
				fatalf("more than %d individuals", MOST);
			col[nI] = j;
			x[nI] = k;
			++nI;
		}
	}
	if (saw[1] == 0)
		fatal("population 1 is empty");
	if (saw[2] == 0)
		fatal("population 2 is empty");

	// read the table of SNPs and store the essential allele counts
	fp = ckopen(argv[1], "r");
	while (fgets(buf, MOST, fp)) {
		// fgets() stops after MOST-1 characters, so a longer line would
		// otherwise be seen as several lines; if buf holds no newline,
		// consume whatever is left of the current line
		for (p = buf; *p != '\0' && *p != '\n'; ++p)
			;
		too_long = 0;
		if (*p == '\0')
			while ((c = getc(fp)) != EOF && c != '\n')
				too_long = 1;
		if (buf[0] == '#')
			continue;
		if (too_long)
			fatalf("table line exceeds %d characters", MOST - 1);

		// set X[i] = atoi(i-th word of buf), i is base 1
		nfields = 0;
		for (p = strtok(buf, z); p != NULL && nfields < MOST - 1;
		  p = strtok(NULL, z))
			X[++nfields] = atoi(p);
		if (nfields == 0)	// blank line
			continue;

		new = ckalloc(sizeof(*new));
		new->next = NULL;
		new->c = ckalloc(nI*sizeof(struct count));
		for (i = 0; i < nI; ++i) {
			n = col[i];
			// last column read for this individual; without this
			// check a short line would reuse values left in X[]
			// by an earlier line
			need = n + (genotypes ? 3 : 1);
			if (need > nfields)
				fatalf("table line has %d columns, but column %d is needed",
				  nfields, need);
			if (genotypes) {
				k = X[n+2];
				if (k == -1 || X[n+3] < lower_bound)
					new->c[i].A = new->c[i].B = -1;
				else {
					new->c[i].A = k;
					new->c[i].B = 2 - k;
				}
			} else {
				new->c[i].A = X[n];
				new->c[i].B = X[n+1];
			}
		}
		if (start == NULL)
			start = new;
		else
			last->next = new;
		last = new;
	}
	fclose(fp);

	F1 = ave_Fst();
	printf("average Fst is %5.5f, using %d SNPs\n", F1, nsnp);
	for (j = 0; j < nI; ++j)
		best_x[j] = x[j];
	// largest_F starts at 0.0, so only shuffles scoring above zero can
	// replace the saved grouping
	for (tot_F = largest_F = 0.0, larger = i = 0; i < nshuff; ++i) {
		shuffle();
		if ((F = ave_Fst()) > F1)
			++larger;
		if (F > largest_F) {
			largest_F = F;
			for (j = 0; j < nI; ++j)
				best_x[j] = x[j];
		}
		tot_F += F;
		if (all)	// make this optional?
			printf("%d: %f\n", i+1, F);
	}
	if (nshuff > 0) {
		printf("%d of %d random groupings had a larger average Fst\n",
		  larger, nshuff);
		printf("largest = %5.5f, mean = %5.5f\n", largest_F,
		  tot_F/nshuff);
		if (largest_F > F1) {
			printf("first columns for the best two populations:\n");
			for (i = 0; i < nI; ++i)
				if (best_x[i] == 1)
					printf("%d ", col[i]);
			printf("and\n");
			for (i = 0; i < nI; ++i)
				if (best_x[i] == 2)
					printf("%d ", col[i]);
			putchar('\n');
		}
	}

	return 0;
}