Mercurial > repos > melissacline > ucsc_cancer_browser_stats
annotate ttest/stats.py @ 8:f55d3b781adb
check file input ignore bad lines`
author | jingchunzhu@gmail.com |
---|---|
date | Thu, 30 Jul 2015 21:42:15 -0700 |
parents | 14c406f8fca6 |
children | fd8529cd1564 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/env python |
2 | |
3 | |
4 | |
5 import argparse | |
6 import math | |
7 import re | |
8 import sys | |
6 | 9 import numpy |
10 import scipy | |
11 import scipy.stats | |
0 | 12 |
13 def main(): | |
14 parser = argparse.ArgumentParser() | |
15 parser.add_argument("genomicData", type=str) | |
16 parser.add_argument("clinicalVector", type=str) | |
17 parser.add_argument("outfile", type=str) | |
18 parser.add_argument('-a', type=str, default=None) | |
19 parser.add_argument('-b', type=str, default=None) | |
20 args = parser.parse_args() | |
21 clinicalVector = readClinicalFile(args.clinicalVector) | |
22 genomicFp = open(args.genomicData) | |
23 headerTokens = genomicFp.readline().rstrip().split("\t") | |
24 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, | |
25 clinicalVector, | |
26 args.a, args.b) | |
27 outfile = open(args.outfile, "w") | |
28 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) | |
29 for row in genomicFp: | |
30 tokens = row.rstrip().split("\t") | |
8 | 31 if len(tokens)!= len(headerTokens): |
32 continue | |
0 | 33 symbol = tokens[0] |
34 groupATokens = [tokens[ii] for ii in columnsGroupA] | |
35 groupBTokens = [tokens[ii] for ii in columnsGroupB] | |
36 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] | |
37 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] | |
38 (ttest, pvalue) = performTTest(groupAValues, groupBValues) | |
39 if ttest == None or pvalue == None: | |
40 outputLine = "%s\tNA\tNA" % (symbol) | |
41 else: | |
42 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) | |
43 if len(groupAValues) == 0: | |
44 outputLine = outputLine + "\tNA" | |
45 else: | |
46 medianA = numpy.median(groupAValues) | |
47 outputLine = "%s\t%f" % (outputLine, medianA) | |
48 if len(groupBValues) == 0: | |
49 outputLine = outputLine + "\tNA" | |
50 else: | |
51 medianB = numpy.median(groupBValues) | |
52 outputLine = "%s\t%f" % (outputLine, medianB) | |
53 if len(groupAValues) == 0 or len(groupBValues) == 0: | |
54 outputLine = outputLine + "\tNA" | |
55 else: | |
56 outputLine = "%s\t%f" % (outputLine, medianA - medianB) | |
57 outfile.write("%s\n" % (outputLine)) | |
58 outfile.close() | |
59 | |
60 def readClinicalFile(clinicalFile): | |
61 clinicalVector = dict() | |
62 fp = open(clinicalFile) | |
63 fp.readline() | |
64 for row in fp: | |
65 tokens = row.rstrip().split("\t") | |
2
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
66 if len(tokens) > 1: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
67 clinicalVector[tokens[0]] = tokens[1] |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
68 else: |
f1929875b1b3
Bug fix: added support for the condition that the clinical vector (from which the two categorical values are drawn) might be empty for some rows
melissacline
parents:
0
diff
changeset
|
69 clinicalVector[tokens[0]] = None |
0 | 70 fp.close() |
71 return(clinicalVector) | |
72 | |
73 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): | |
74 columnsGroupA = list() | |
75 columnsGroupB = list() | |
76 for ii in range(len(headerTokens)): | |
77 id = headerTokens[ii] | |
78 if clinicalVector.has_key(id): | |
79 if clinicalVector[id] == optionA: | |
80 columnsGroupA.append(ii) | |
81 elif clinicalVector[id] == optionB: | |
82 columnsGroupB.append(ii) | |
83 return((columnsGroupA, columnsGroupB)) | |
84 | |
85 def performTTest(groupA, groupB): | |
86 from scipy import special | |
87 | |
88 epsilon = 1E-06 | |
89 countA = len(groupA) | |
90 countB = len(groupB) | |
91 meanA = numpy.mean(groupA) | |
92 meanB = numpy.mean(groupB) | |
93 varA = numpy.var(groupA) | |
94 varB = numpy.var(groupB) | |
95 degreesOfFreedom = countA + countB - 2 | |
96 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom | |
97 if sVar == 0: | |
98 tStat = None | |
99 prob = None | |
100 else: | |
101 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) | |
102 + 1.0/(countB + epsilon))) | |
103 bottom = degreesOfFreedom + tStat * tStat | |
104 prob = special.betainc(0.5*degreesOfFreedom, 0.5, | |
105 degreesOfFreedom/bottom) | |
106 return(tStat, prob) | |
107 | |
108 | |
109 | |
110 | |
111 if __name__ == "__main__": | |
112 main() |