comparison ttest/stats.py @ 0:12bb38e187b9

Uploaded, initial check-in
author melissacline
date Mon, 28 Jul 2014 20:12:17 -0400 (2014-07-29)
parents
children f1929875b1b3
comparison
equal deleted inserted replaced
-1:000000000000 0:12bb38e187b9
1 #!/usr/bin/env python
2
3
4
5 import argparse
6 import math
7 import re
8 import sys
9 try:
10 import numpy
11 except:
12 from galaxy import eggs
13 eggs.require("numpy")
14 import numpy
15 try:
16 import scipy
17 except:
18 from galaxy import eggs
19 eggs.require( "scipy" )
20 import scipy
21 import scipy.stats
22
23 def main():
24 parser = argparse.ArgumentParser()
25 parser.add_argument("genomicData", type=str)
26 parser.add_argument("clinicalVector", type=str)
27 parser.add_argument("outfile", type=str)
28 parser.add_argument('-a', type=str, default=None)
29 parser.add_argument('-b', type=str, default=None)
30 args = parser.parse_args()
31 clinicalVector = readClinicalFile(args.clinicalVector)
32 genomicFp = open(args.genomicData)
33 headerTokens = genomicFp.readline().rstrip().split("\t")
34 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens,
35 clinicalVector,
36 args.a, args.b)
37 print columnsGroupA, columnsGroupB
38 outfile = open(args.outfile, "w")
39 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0]))
40 for row in genomicFp:
41 tokens = row.rstrip().split("\t")
42 symbol = tokens[0]
43 groupATokens = [tokens[ii] for ii in columnsGroupA]
44 groupBTokens = [tokens[ii] for ii in columnsGroupB]
45 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"]
46 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"]
47 (ttest, pvalue) = performTTest(groupAValues, groupBValues)
48 if ttest == None or pvalue == None:
49 outputLine = "%s\tNA\tNA" % (symbol)
50 else:
51 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue)
52 if len(groupAValues) == 0:
53 outputLine = outputLine + "\tNA"
54 else:
55 medianA = numpy.median(groupAValues)
56 outputLine = "%s\t%f" % (outputLine, medianA)
57 if len(groupBValues) == 0:
58 outputLine = outputLine + "\tNA"
59 else:
60 medianB = numpy.median(groupBValues)
61 outputLine = "%s\t%f" % (outputLine, medianB)
62 if len(groupAValues) == 0 or len(groupBValues) == 0:
63 outputLine = outputLine + "\tNA"
64 else:
65 outputLine = "%s\t%f" % (outputLine, medianA - medianB)
66 outfile.write("%s\n" % (outputLine))
67 outfile.close()
68
69 def readClinicalFile(clinicalFile):
70 clinicalVector = dict()
71 fp = open(clinicalFile)
72 fp.readline()
73 for row in fp:
74 tokens = row.rstrip().split("\t")
75 clinicalVector[tokens[0]] = tokens[1]
76 fp.close()
77 return(clinicalVector)
78
79 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB):
80 columnsGroupA = list()
81 columnsGroupB = list()
82 for ii in range(len(headerTokens)):
83 id = headerTokens[ii]
84 if clinicalVector.has_key(id):
85 if clinicalVector[id] == optionA:
86 columnsGroupA.append(ii)
87 elif clinicalVector[id] == optionB:
88 columnsGroupB.append(ii)
89 return((columnsGroupA, columnsGroupB))
90
91 def performTTest(groupA, groupB):
92 from scipy import special
93
94 epsilon = 1E-06
95 countA = len(groupA)
96 countB = len(groupB)
97 meanA = numpy.mean(groupA)
98 meanB = numpy.mean(groupB)
99 varA = numpy.var(groupA)
100 varB = numpy.var(groupB)
101 degreesOfFreedom = countA + countB - 2
102 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom
103 if sVar == 0:
104 tStat = None
105 prob = None
106 else:
107 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon)
108 + 1.0/(countB + epsilon)))
109 bottom = degreesOfFreedom + tStat * tStat
110 prob = special.betainc(0.5*degreesOfFreedom, 0.5,
111 degreesOfFreedom/bottom)
112 return(tStat, prob)
113
114
115
116
117 if __name__ == "__main__":
118 main()