Mercurial > repos > melissacline > ucsc_cancer_browser_stats
comparison ttest/stats.py @ 0:12bb38e187b9
Uploaded, initial check-in
author | melissacline |
---|---|
date | Mon, 28 Jul 2014 20:12:17 -0400 (2014-07-29) |
parents | |
children | f1929875b1b3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:12bb38e187b9 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 | |
4 | |
5 import argparse | |
6 import math | |
7 import re | |
8 import sys | |
9 try: | |
10 import numpy | |
11 except: | |
12 from galaxy import eggs | |
13 eggs.require("numpy") | |
14 import numpy | |
15 try: | |
16 import scipy | |
17 except: | |
18 from galaxy import eggs | |
19 eggs.require( "scipy" ) | |
20 import scipy | |
21 import scipy.stats | |
22 | |
23 def main(): | |
24 parser = argparse.ArgumentParser() | |
25 parser.add_argument("genomicData", type=str) | |
26 parser.add_argument("clinicalVector", type=str) | |
27 parser.add_argument("outfile", type=str) | |
28 parser.add_argument('-a', type=str, default=None) | |
29 parser.add_argument('-b', type=str, default=None) | |
30 args = parser.parse_args() | |
31 clinicalVector = readClinicalFile(args.clinicalVector) | |
32 genomicFp = open(args.genomicData) | |
33 headerTokens = genomicFp.readline().rstrip().split("\t") | |
34 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, | |
35 clinicalVector, | |
36 args.a, args.b) | |
37 print columnsGroupA, columnsGroupB | |
38 outfile = open(args.outfile, "w") | |
39 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) | |
40 for row in genomicFp: | |
41 tokens = row.rstrip().split("\t") | |
42 symbol = tokens[0] | |
43 groupATokens = [tokens[ii] for ii in columnsGroupA] | |
44 groupBTokens = [tokens[ii] for ii in columnsGroupB] | |
45 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] | |
46 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] | |
47 (ttest, pvalue) = performTTest(groupAValues, groupBValues) | |
48 if ttest == None or pvalue == None: | |
49 outputLine = "%s\tNA\tNA" % (symbol) | |
50 else: | |
51 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) | |
52 if len(groupAValues) == 0: | |
53 outputLine = outputLine + "\tNA" | |
54 else: | |
55 medianA = numpy.median(groupAValues) | |
56 outputLine = "%s\t%f" % (outputLine, medianA) | |
57 if len(groupBValues) == 0: | |
58 outputLine = outputLine + "\tNA" | |
59 else: | |
60 medianB = numpy.median(groupBValues) | |
61 outputLine = "%s\t%f" % (outputLine, medianB) | |
62 if len(groupAValues) == 0 or len(groupBValues) == 0: | |
63 outputLine = outputLine + "\tNA" | |
64 else: | |
65 outputLine = "%s\t%f" % (outputLine, medianA - medianB) | |
66 outfile.write("%s\n" % (outputLine)) | |
67 outfile.close() | |
68 | |
69 def readClinicalFile(clinicalFile): | |
70 clinicalVector = dict() | |
71 fp = open(clinicalFile) | |
72 fp.readline() | |
73 for row in fp: | |
74 tokens = row.rstrip().split("\t") | |
75 clinicalVector[tokens[0]] = tokens[1] | |
76 fp.close() | |
77 return(clinicalVector) | |
78 | |
79 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): | |
80 columnsGroupA = list() | |
81 columnsGroupB = list() | |
82 for ii in range(len(headerTokens)): | |
83 id = headerTokens[ii] | |
84 if clinicalVector.has_key(id): | |
85 if clinicalVector[id] == optionA: | |
86 columnsGroupA.append(ii) | |
87 elif clinicalVector[id] == optionB: | |
88 columnsGroupB.append(ii) | |
89 return((columnsGroupA, columnsGroupB)) | |
90 | |
91 def performTTest(groupA, groupB): | |
92 from scipy import special | |
93 | |
94 epsilon = 1E-06 | |
95 countA = len(groupA) | |
96 countB = len(groupB) | |
97 meanA = numpy.mean(groupA) | |
98 meanB = numpy.mean(groupB) | |
99 varA = numpy.var(groupA) | |
100 varB = numpy.var(groupB) | |
101 degreesOfFreedom = countA + countB - 2 | |
102 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom | |
103 if sVar == 0: | |
104 tStat = None | |
105 prob = None | |
106 else: | |
107 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) | |
108 + 1.0/(countB + epsilon))) | |
109 bottom = degreesOfFreedom + tStat * tStat | |
110 prob = special.betainc(0.5*degreesOfFreedom, 0.5, | |
111 degreesOfFreedom/bottom) | |
112 return(tStat, prob) | |
113 | |
114 | |
115 | |
116 | |
117 if __name__ == "__main__": | |
118 main() |