Mercurial > repos > melissacline > ucsc_cancer_browser_stats
diff ttest/stats.py @ 0:12bb38e187b9
Uploaded, initial check-in
author | melissacline |
---|---|
date | Mon, 28 Jul 2014 20:12:17 -0400 (2014-07-29) |
parents | |
children | f1929875b1b3 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/stats.py Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + + + +import argparse +import math +import re +import sys +try: + import numpy +except: + from galaxy import eggs + eggs.require("numpy") + import numpy +try: + import scipy +except: + from galaxy import eggs + eggs.require( "scipy" ) + import scipy + import scipy.stats + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("genomicData", type=str) + parser.add_argument("clinicalVector", type=str) + parser.add_argument("outfile", type=str) + parser.add_argument('-a', type=str, default=None) + parser.add_argument('-b', type=str, default=None) + args = parser.parse_args() + clinicalVector = readClinicalFile(args.clinicalVector) + genomicFp = open(args.genomicData) + headerTokens = genomicFp.readline().rstrip().split("\t") + (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, + clinicalVector, + args.a, args.b) + print columnsGroupA, columnsGroupB + outfile = open(args.outfile, "w") + outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) + for row in genomicFp: + tokens = row.rstrip().split("\t") + symbol = tokens[0] + groupATokens = [tokens[ii] for ii in columnsGroupA] + groupBTokens = [tokens[ii] for ii in columnsGroupB] + groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] + groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] + (ttest, pvalue) = performTTest(groupAValues, groupBValues) + if ttest == None or pvalue == None: + outputLine = "%s\tNA\tNA" % (symbol) + else: + outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) + if len(groupAValues) == 0: + outputLine = outputLine + "\tNA" + else: + medianA = numpy.median(groupAValues) + outputLine = "%s\t%f" % (outputLine, medianA) + if len(groupBValues) == 0: + outputLine = outputLine + "\tNA" + else: + medianB = numpy.median(groupBValues) + outputLine = "%s\t%f" % (outputLine, medianB) + if len(groupAValues) == 0 or len(groupBValues) == 0: + outputLine = outputLine + "\tNA" + else: + outputLine = "%s\t%f" % (outputLine, medianA - medianB) + outfile.write("%s\n" % (outputLine)) + outfile.close() + +def readClinicalFile(clinicalFile): + clinicalVector = dict() + fp = open(clinicalFile) + fp.readline() + for row in fp: + tokens = row.rstrip().split("\t") + clinicalVector[tokens[0]] = tokens[1] + fp.close() + return(clinicalVector) + +def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): + columnsGroupA = list() + columnsGroupB = list() + for ii in range(len(headerTokens)): + id = headerTokens[ii] + if clinicalVector.has_key(id): + if clinicalVector[id] == optionA: + columnsGroupA.append(ii) + elif clinicalVector[id] == optionB: + columnsGroupB.append(ii) + return((columnsGroupA, columnsGroupB)) + +def performTTest(groupA, groupB): + from scipy import special + + epsilon = 1E-06 + countA = len(groupA) + countB = len(groupB) + meanA = numpy.mean(groupA) + meanB = numpy.mean(groupB) + varA = numpy.var(groupA) + varB = numpy.var(groupB) + degreesOfFreedom = countA + countB - 2 + sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom + if sVar == 0: + tStat = None + prob = None + else: + tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) + + 1.0/(countB + epsilon))) + bottom = degreesOfFreedom + tStat * tStat + prob = special.betainc(0.5*degreesOfFreedom, 0.5, + degreesOfFreedom/bottom) + return(tStat, prob) + + + + +if __name__ == "__main__": + main()