Mercurial > repos > melissacline > ucsc_cancer_browser_stats
annotate ttest/stats.py @ 6:6ca836b7e0b4
Changed the importing for stats.py
author | melissacline |
---|---|
date | Mon, 13 Oct 2014 18:52:54 -0700 |
parents | f1929875b1b3 |
children | 14c406f8fca6 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
3 | |
4 | |
5 import argparse | |
6 import math | |
7 import re | |
8 import sys | |
6 | 9 import numpy |
10 import scipy | |
11 import scipy.stats | |
0 | 12 |
13 def main(): | |
14 parser = argparse.ArgumentParser() | |
15 parser.add_argument("genomicData", type=str) | |
16 parser.add_argument("clinicalVector", type=str) | |
17 parser.add_argument("outfile", type=str) | |
18 parser.add_argument('-a', type=str, default=None) | |
19 parser.add_argument('-b', type=str, default=None) | |
20 args = parser.parse_args() | |
21 clinicalVector = readClinicalFile(args.clinicalVector) | |
22 genomicFp = open(args.genomicData) | |
23 headerTokens = genomicFp.readline().rstrip().split("\t") | |
24 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, | |
25 clinicalVector, | |
26 args.a, args.b) | |
27 print columnsGroupA, columnsGroupB | |
28 outfile = open(args.outfile, "w") | |
29 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) | |
30 for row in genomicFp: | |
31 tokens = row.rstrip().split("\t") | |
32 symbol = tokens[0] | |
33 groupATokens = [tokens[ii] for ii in columnsGroupA] | |
34 groupBTokens = [tokens[ii] for ii in columnsGroupB] | |
35 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] | |
36 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] | |
37 (ttest, pvalue) = performTTest(groupAValues, groupBValues) | |
38 if ttest == None or pvalue == None: | |
39 outputLine = "%s\tNA\tNA" % (symbol) | |
40 else: | |
41 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) | |
42 if len(groupAValues) == 0: | |
43 outputLine = outputLine + "\tNA" | |
44 else: | |
45 medianA = numpy.median(groupAValues) | |
46 outputLine = "%s\t%f" % (outputLine, medianA) | |
47 if len(groupBValues) == 0: | |
48 outputLine = outputLine + "\tNA" | |
49 else: | |
50 medianB = numpy.median(groupBValues) | |
51 outputLine = "%s\t%f" % (outputLine, medianB) | |
52 if len(groupAValues) == 0 or len(groupBValues) == 0: | |
53 outputLine = outputLine + "\tNA" | |
54 else: | |
55 outputLine = "%s\t%f" % (outputLine, medianA - medianB) | |
56 outfile.write("%s\n" % (outputLine)) | |
57 outfile.close() | |
58 | |
59 def readClinicalFile(clinicalFile): | |
60 clinicalVector = dict() | |
61 fp = open(clinicalFile) | |
62 fp.readline() | |
63 for row in fp: | |
64 tokens = row.rstrip().split("\t") | |
2
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
65 if len(tokens) > 1: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
66 clinicalVector[tokens[0]] = tokens[1] |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
67 else: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
68 clinicalVector[tokens[0]] = None |
0 | 69 fp.close() |
70 return(clinicalVector) | |
71 | |
72 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): | |
73 columnsGroupA = list() | |
74 columnsGroupB = list() | |
75 for ii in range(len(headerTokens)): | |
76 id = headerTokens[ii] | |
77 if clinicalVector.has_key(id): | |
78 if clinicalVector[id] == optionA: | |
79 columnsGroupA.append(ii) | |
80 elif clinicalVector[id] == optionB: | |
81 columnsGroupB.append(ii) | |
82 return((columnsGroupA, columnsGroupB)) | |
83 | |
84 def performTTest(groupA, groupB): | |
85 from scipy import special | |
86 | |
87 epsilon = 1E-06 | |
88 countA = len(groupA) | |
89 countB = len(groupB) | |
90 meanA = numpy.mean(groupA) | |
91 meanB = numpy.mean(groupB) | |
92 varA = numpy.var(groupA) | |
93 varB = numpy.var(groupB) | |
94 degreesOfFreedom = countA + countB - 2 | |
95 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom | |
96 if sVar == 0: | |
97 tStat = None | |
98 prob = None | |
99 else: | |
100 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) | |
101 + 1.0/(countB + epsilon))) | |
102 bottom = degreesOfFreedom + tStat * tStat | |
103 prob = special.betainc(0.5*degreesOfFreedom, 0.5, | |
104 degreesOfFreedom/bottom) | |
105 return(tStat, prob) | |
106 | |
107 | |
108 | |
109 | |
110 if __name__ == "__main__": | |
111 main() |