Mercurial > repos > melissacline > ucsc_cancer_browser_stats
annotate ttest/stats.py @ 7:14c406f8fca6
Removed an extraneous print message
author | melissacline |
---|---|
date | Wed, 22 Oct 2014 18:14:35 -0700 |
parents | 6ca836b7e0b4 |
children | f55d3b781adb |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
3 | |
4 | |
5 import argparse | |
6 import math | |
7 import re | |
8 import sys | |
6 | 9 import numpy |
10 import scipy | |
11 import scipy.stats | |
0 | 12 |
13 def main(): | |
14 parser = argparse.ArgumentParser() | |
15 parser.add_argument("genomicData", type=str) | |
16 parser.add_argument("clinicalVector", type=str) | |
17 parser.add_argument("outfile", type=str) | |
18 parser.add_argument('-a', type=str, default=None) | |
19 parser.add_argument('-b', type=str, default=None) | |
20 args = parser.parse_args() | |
21 clinicalVector = readClinicalFile(args.clinicalVector) | |
22 genomicFp = open(args.genomicData) | |
23 headerTokens = genomicFp.readline().rstrip().split("\t") | |
24 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, | |
25 clinicalVector, | |
26 args.a, args.b) | |
27 outfile = open(args.outfile, "w") | |
28 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) | |
29 for row in genomicFp: | |
30 tokens = row.rstrip().split("\t") | |
31 symbol = tokens[0] | |
32 groupATokens = [tokens[ii] for ii in columnsGroupA] | |
33 groupBTokens = [tokens[ii] for ii in columnsGroupB] | |
34 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] | |
35 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] | |
36 (ttest, pvalue) = performTTest(groupAValues, groupBValues) | |
37 if ttest == None or pvalue == None: | |
38 outputLine = "%s\tNA\tNA" % (symbol) | |
39 else: | |
40 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) | |
41 if len(groupAValues) == 0: | |
42 outputLine = outputLine + "\tNA" | |
43 else: | |
44 medianA = numpy.median(groupAValues) | |
45 outputLine = "%s\t%f" % (outputLine, medianA) | |
46 if len(groupBValues) == 0: | |
47 outputLine = outputLine + "\tNA" | |
48 else: | |
49 medianB = numpy.median(groupBValues) | |
50 outputLine = "%s\t%f" % (outputLine, medianB) | |
51 if len(groupAValues) == 0 or len(groupBValues) == 0: | |
52 outputLine = outputLine + "\tNA" | |
53 else: | |
54 outputLine = "%s\t%f" % (outputLine, medianA - medianB) | |
55 outfile.write("%s\n" % (outputLine)) | |
56 outfile.close() | |
57 | |
58 def readClinicalFile(clinicalFile): | |
59 clinicalVector = dict() | |
60 fp = open(clinicalFile) | |
61 fp.readline() | |
62 for row in fp: | |
63 tokens = row.rstrip().split("\t") | |
2
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
64 if len(tokens) > 1: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
65 clinicalVector[tokens[0]] = tokens[1] |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
66 else: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
67 clinicalVector[tokens[0]] = None |
0 | 68 fp.close() |
69 return(clinicalVector) | |
70 | |
71 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): | |
72 columnsGroupA = list() | |
73 columnsGroupB = list() | |
74 for ii in range(len(headerTokens)): | |
75 id = headerTokens[ii] | |
76 if clinicalVector.has_key(id): | |
77 if clinicalVector[id] == optionA: | |
78 columnsGroupA.append(ii) | |
79 elif clinicalVector[id] == optionB: | |
80 columnsGroupB.append(ii) | |
81 return((columnsGroupA, columnsGroupB)) | |
82 | |
83 def performTTest(groupA, groupB): | |
84 from scipy import special | |
85 | |
86 epsilon = 1E-06 | |
87 countA = len(groupA) | |
88 countB = len(groupB) | |
89 meanA = numpy.mean(groupA) | |
90 meanB = numpy.mean(groupB) | |
91 varA = numpy.var(groupA) | |
92 varB = numpy.var(groupB) | |
93 degreesOfFreedom = countA + countB - 2 | |
94 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom | |
95 if sVar == 0: | |
96 tStat = None | |
97 prob = None | |
98 else: | |
99 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) | |
100 + 1.0/(countB + epsilon))) | |
101 bottom = degreesOfFreedom + tStat * tStat | |
102 prob = special.betainc(0.5*degreesOfFreedom, 0.5, | |
103 degreesOfFreedom/bottom) | |
104 return(tStat, prob) | |
105 | |
106 | |
107 | |
108 | |
109 if __name__ == "__main__": | |
110 main() |