0
|
1 #!/usr/bin/env python
|
|
2
|
|
3
|
|
4
|
|
5 import argparse
|
|
6 import math
|
|
7 import re
|
|
8 import sys
|
|
9 try:
|
|
10 import numpy
|
|
11 except:
|
|
12 from galaxy import eggs
|
|
13 eggs.require("numpy")
|
|
14 import numpy
|
|
15 try:
|
|
16 import scipy
|
|
17 except:
|
|
18 from galaxy import eggs
|
|
19 eggs.require( "scipy" )
|
|
20 import scipy
|
|
21 import scipy.stats
|
|
22
|
|
23 def main():
|
|
24 parser = argparse.ArgumentParser()
|
|
25 parser.add_argument("genomicData", type=str)
|
|
26 parser.add_argument("clinicalVector", type=str)
|
|
27 parser.add_argument("outfile", type=str)
|
|
28 parser.add_argument('-a', type=str, default=None)
|
|
29 parser.add_argument('-b', type=str, default=None)
|
|
30 args = parser.parse_args()
|
|
31 clinicalVector = readClinicalFile(args.clinicalVector)
|
|
32 genomicFp = open(args.genomicData)
|
|
33 headerTokens = genomicFp.readline().rstrip().split("\t")
|
|
34 (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens,
|
|
35 clinicalVector,
|
|
36 args.a, args.b)
|
|
37 print columnsGroupA, columnsGroupB
|
|
38 outfile = open(args.outfile, "w")
|
|
39 outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0]))
|
|
40 for row in genomicFp:
|
|
41 tokens = row.rstrip().split("\t")
|
|
42 symbol = tokens[0]
|
|
43 groupATokens = [tokens[ii] for ii in columnsGroupA]
|
|
44 groupBTokens = [tokens[ii] for ii in columnsGroupB]
|
|
45 groupAValues = [float(xx) for xx in groupATokens if xx != "NA"]
|
|
46 groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"]
|
|
47 (ttest, pvalue) = performTTest(groupAValues, groupBValues)
|
|
48 if ttest == None or pvalue == None:
|
|
49 outputLine = "%s\tNA\tNA" % (symbol)
|
|
50 else:
|
|
51 outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue)
|
|
52 if len(groupAValues) == 0:
|
|
53 outputLine = outputLine + "\tNA"
|
|
54 else:
|
|
55 medianA = numpy.median(groupAValues)
|
|
56 outputLine = "%s\t%f" % (outputLine, medianA)
|
|
57 if len(groupBValues) == 0:
|
|
58 outputLine = outputLine + "\tNA"
|
|
59 else:
|
|
60 medianB = numpy.median(groupBValues)
|
|
61 outputLine = "%s\t%f" % (outputLine, medianB)
|
|
62 if len(groupAValues) == 0 or len(groupBValues) == 0:
|
|
63 outputLine = outputLine + "\tNA"
|
|
64 else:
|
|
65 outputLine = "%s\t%f" % (outputLine, medianA - medianB)
|
|
66 outfile.write("%s\n" % (outputLine))
|
|
67 outfile.close()
|
|
68
|
|
69 def readClinicalFile(clinicalFile):
|
|
70 clinicalVector = dict()
|
|
71 fp = open(clinicalFile)
|
|
72 fp.readline()
|
|
73 for row in fp:
|
|
74 tokens = row.rstrip().split("\t")
|
|
75 clinicalVector[tokens[0]] = tokens[1]
|
|
76 fp.close()
|
|
77 return(clinicalVector)
|
|
78
|
|
79 def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB):
|
|
80 columnsGroupA = list()
|
|
81 columnsGroupB = list()
|
|
82 for ii in range(len(headerTokens)):
|
|
83 id = headerTokens[ii]
|
|
84 if clinicalVector.has_key(id):
|
|
85 if clinicalVector[id] == optionA:
|
|
86 columnsGroupA.append(ii)
|
|
87 elif clinicalVector[id] == optionB:
|
|
88 columnsGroupB.append(ii)
|
|
89 return((columnsGroupA, columnsGroupB))
|
|
90
|
|
91 def performTTest(groupA, groupB):
|
|
92 from scipy import special
|
|
93
|
|
94 epsilon = 1E-06
|
|
95 countA = len(groupA)
|
|
96 countB = len(groupB)
|
|
97 meanA = numpy.mean(groupA)
|
|
98 meanB = numpy.mean(groupB)
|
|
99 varA = numpy.var(groupA)
|
|
100 varB = numpy.var(groupB)
|
|
101 degreesOfFreedom = countA + countB - 2
|
|
102 sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom
|
|
103 if sVar == 0:
|
|
104 tStat = None
|
|
105 prob = None
|
|
106 else:
|
|
107 tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon)
|
|
108 + 1.0/(countB + epsilon)))
|
|
109 bottom = degreesOfFreedom + tStat * tStat
|
|
110 prob = special.betainc(0.5*degreesOfFreedom, 0.5,
|
|
111 degreesOfFreedom/bottom)
|
|
112 return(tStat, prob)
|
|
113
|
|
114
|
|
115
|
|
116
|
|
117 if __name__ == "__main__":
|
|
118 main()
|