annotate ttest/stats.py @ 7:14c406f8fca6

Removed an extraneous print message
author melissacline
date Wed, 22 Oct 2014 18:14:35 -0700
parents 6ca836b7e0b4
children f55d3b781adb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
1 #!/usr/bin/env python
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
2
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
3
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
4
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
5 import argparse
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
6 import math
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
7 import re
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
8 import sys
6
6ca836b7e0b4 Changed the importing for stats.py
melissacline
parents: 2
diff changeset
9 import numpy
6ca836b7e0b4 Changed the importing for stats.py
melissacline
parents: 2
diff changeset
10 import scipy
6ca836b7e0b4 Changed the importing for stats.py
melissacline
parents: 2
diff changeset
11 import scipy.stats
0
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
12
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
13 def main():
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
14 parser = argparse.ArgumentParser()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
15 parser.add_argument("genomicData", type=str)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
16 parser.add_argument("clinicalVector", type=str)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
17 parser.add_argument("outfile", type=str)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
18 parser.add_argument('-a', type=str, default=None)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
19 parser.add_argument('-b', type=str, default=None)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
20 args = parser.parse_args()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
21 clinicalVector = readClinicalFile(args.clinicalVector)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
22 genomicFp = open(args.genomicData)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
23 headerTokens = genomicFp.readline().rstrip().split("\t")
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
24 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens,
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
25 clinicalVector,
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
26 args.a, args.b)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
27 outfile = open(args.outfile, "w")
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
28 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0]))
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
29 for row in genomicFp:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
30 tokens = row.rstrip().split("\t")
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
31 symbol = tokens[0]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
32 groupATokens = [tokens[ii] for ii in columnsGroupA]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
33 groupBTokens = [tokens[ii] for ii in columnsGroupB]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
34 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
35 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
36 (ttest, pvalue) = performTTest(groupAValues, groupBValues)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
37 if ttest == None or pvalue == None:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
38 outputLine = "%s\tNA\tNA" % (symbol)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
39 else:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
40 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
41 if len(groupAValues) == 0:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
42 outputLine = outputLine + "\tNA"
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
43 else:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
44 medianA = numpy.median(groupAValues)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
45 outputLine = "%s\t%f" % (outputLine, medianA)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
46 if len(groupBValues) == 0:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
47 outputLine = outputLine + "\tNA"
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
48 else:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
49 medianB = numpy.median(groupBValues)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
50 outputLine = "%s\t%f" % (outputLine, medianB)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
51 if len(groupAValues) == 0 or len(groupBValues) == 0:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
52 outputLine = outputLine + "\tNA"
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
53 else:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
54 outputLine = "%s\t%f" % (outputLine, medianA - medianB)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
55 outfile.write("%s\n" % (outputLine))
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
56 outfile.close()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
57
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
58 def readClinicalFile(clinicalFile):
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
59 clinicalVector = dict()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
60 fp = open(clinicalFile)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
61 fp.readline()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
62 for row in fp:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
63 tokens = row.rstrip().split("\t")
2
f1929875b1b3 Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents: 0
diff changeset
64 if len(tokens) > 1:
f1929875b1b3 Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents: 0
diff changeset
65 clinicalVector[tokens[0]] = tokens[1]
f1929875b1b3 Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents: 0
diff changeset
66 else:
f1929875b1b3 Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents: 0
diff changeset
67 clinicalVector[tokens[0]] = None
0
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
68 fp.close()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
69 return(clinicalVector)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
70
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
71 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB):
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
72 columnsGroupA = list()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
73 columnsGroupB = list()
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
74 for ii in range(len(headerTokens)):
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
75 id = headerTokens[ii]
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
76 if clinicalVector.has_key(id):
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
77 if clinicalVector[id] == optionA:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
78 columnsGroupA.append(ii)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
79 elif clinicalVector[id] == optionB:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
80 columnsGroupB.append(ii)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
81 return((columnsGroupA, columnsGroupB))
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
82
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
83 def performTTest(groupA, groupB):
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
84 from scipy import special
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
85
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
86 epsilon = 1E-06
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
87 countA = len(groupA)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
88 countB = len(groupB)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
89 meanA = numpy.mean(groupA)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
90 meanB = numpy.mean(groupB)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
91 varA = numpy.var(groupA)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
92 varB = numpy.var(groupB)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
93 degreesOfFreedom = countA + countB - 2
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
94 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
95 if sVar == 0:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
96 tStat = None
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
97 prob = None
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
98 else:
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
99 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
100 + 1.0/(countB + epsilon)))
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
101 bottom = degreesOfFreedom + tStat * tStat
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
102 prob = special.betainc(0.5*degreesOfFreedom, 0.5,
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
103 degreesOfFreedom/bottom)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
104 return(tStat, prob)
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
105
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
106
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
107
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
108
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
109 if __name__ == "__main__":
12bb38e187b9 Uploaded, initial check-in
melissacline
parents:
diff changeset
110 main()