diff ttest/stats.py @ 0:12bb38e187b9

Uploaded, initial check-in
author melissacline
date Mon, 28 Jul 2014 20:12:17 -0400
parents
children f1929875b1b3
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/stats.py	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+
+
+import argparse
+import math
+import re
+import sys
+try:
+    import numpy
+except:
+    from galaxy import eggs
+    eggs.require("numpy")
+    import numpy
+try:
+    import scipy
+except:
+    from galaxy import eggs
+    eggs.require( "scipy" )
+    import scipy
+    import scipy.stats
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("genomicData", type=str)
+    parser.add_argument("clinicalVector", type=str)
+    parser.add_argument("outfile", type=str)
+    parser.add_argument('-a', type=str, default=None)
+    parser.add_argument('-b', type=str, default=None)
+    args = parser.parse_args()
+    clinicalVector = readClinicalFile(args.clinicalVector)
+    genomicFp = open(args.genomicData)
+    headerTokens = genomicFp.readline().rstrip().split("\t")
+    (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, 
+                                                        clinicalVector,
+                                                        args.a, args.b)
+    print columnsGroupA, columnsGroupB
+    outfile = open(args.outfile, "w")
+    outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0]))
+    for row in genomicFp:
+        tokens = row.rstrip().split("\t")
+        symbol = tokens[0]
+        groupATokens = [tokens[ii] for ii in columnsGroupA]
+        groupBTokens = [tokens[ii] for ii in columnsGroupB]
+        groupAValues = [float(xx) for xx in groupATokens if xx != "NA"]
+        groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"]
+        (ttest, pvalue) = performTTest(groupAValues, groupBValues)
+        if ttest == None or pvalue == None:
+            outputLine = "%s\tNA\tNA" % (symbol)
+        else:
+            outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue)
+        if len(groupAValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            medianA = numpy.median(groupAValues)
+            outputLine = "%s\t%f" % (outputLine, medianA)
+        if len(groupBValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            medianB = numpy.median(groupBValues)
+            outputLine = "%s\t%f" % (outputLine, medianB)
+        if len(groupAValues) == 0 or len(groupBValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            outputLine = "%s\t%f" % (outputLine, medianA - medianB)
+        outfile.write("%s\n" % (outputLine))
+    outfile.close()
+
+def readClinicalFile(clinicalFile):
+    clinicalVector = dict()
+    fp = open(clinicalFile)
+    fp.readline()
+    for row in fp:
+        tokens = row.rstrip().split("\t")
+        clinicalVector[tokens[0]] = tokens[1]
+    fp.close()
+    return(clinicalVector)
+
+def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB):
+    columnsGroupA = list()
+    columnsGroupB = list()
+    for ii in range(len(headerTokens)):
+        id = headerTokens[ii]
+        if clinicalVector.has_key(id):
+            if clinicalVector[id] == optionA:
+                columnsGroupA.append(ii)
+            elif clinicalVector[id] == optionB:
+                columnsGroupB.append(ii)
+    return((columnsGroupA, columnsGroupB))
+
+def performTTest(groupA, groupB):
+    from scipy import special
+    
+    epsilon = 1E-06
+    countA = len(groupA)
+    countB = len(groupB)
+    meanA = numpy.mean(groupA)
+    meanB = numpy.mean(groupB)
+    varA = numpy.var(groupA)
+    varB = numpy.var(groupB)
+    degreesOfFreedom = countA + countB - 2
+    sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom
+    if sVar == 0:
+        tStat = None
+        prob = None
+    else:
+        tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon)     
+                                                  + 1.0/(countB + epsilon)))  
+        bottom = degreesOfFreedom + tStat * tStat
+        prob = special.betainc(0.5*degreesOfFreedom, 0.5, 
+                               degreesOfFreedom/bottom)
+    return(tStat, prob)
+
+    
+
+
+if __name__ == "__main__":
+    main()