Mercurial > repos > melissacline > ucsc_cancer_browser_stats

Binary file ttest/._sample.stats.output.txt has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/sample.clinical.matrix.txt	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,11 @@
+sample_id	Value
+1	A
+2	A
+3	B
+4	C
+5	B
+6	B
+7	A
+8	A
+9	B
+10	A
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/sample.genomic.matrix.txt	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,3 @@
+Gene	1	2	3	4	5	6	7	8	9	10
+G1	2.0	2.2	3.2	1.1	5.1	8.1	3.2	1.1	8.1	0.2
+G2	0.1	8.2	9.1	4.2	6.1	4.9	3.9	2.3	1.1	0.2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/sample.stats.output.txt	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,3 @@
+Gene	Statistic	pValue	Median1	Median2	Delta
+G1	-4.168999	0.004194	2.000000	6.600000	-4.600000
+G2	-1.198486	0.269724	2.300000	5.500000	-3.200000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/stats.py	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,118 @@
+#!/usr/bin/env python
+
+
+
+import argparse
+import math
+import re
+import sys
+try:
+    import numpy
+except:
+    from galaxy import eggs
+    eggs.require("numpy")
+    import numpy
+try:
+    import scipy
+except:
+    from galaxy import eggs
+    eggs.require( "scipy" )
+    import scipy
+    import scipy.stats
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("genomicData", type=str)
+    parser.add_argument("clinicalVector", type=str)
+    parser.add_argument("outfile", type=str)
+    parser.add_argument('-a', type=str, default=None)
+    parser.add_argument('-b', type=str, default=None)
+    args = parser.parse_args()
+    clinicalVector = readClinicalFile(args.clinicalVector)
+    genomicFp = open(args.genomicData)
+    headerTokens = genomicFp.readline().rstrip().split("\t")
+    (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens,
+                                                        clinicalVector,
+                                                        args.a, args.b)
+    print columnsGroupA, columnsGroupB
+    outfile = open(args.outfile, "w")
+    outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0]))
+    for row in genomicFp:
+        tokens = row.rstrip().split("\t")
+        symbol = tokens[0]
+        groupATokens = [tokens[ii] for ii in columnsGroupA]
+        groupBTokens = [tokens[ii] for ii in columnsGroupB]
+        groupAValues = [float(xx) for xx in groupATokens if xx != "NA"]
+        groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"]
+        (ttest, pvalue) = performTTest(groupAValues, groupBValues)
+        if ttest == None or pvalue == None:
+            outputLine = "%s\tNA\tNA" % (symbol)
+        else:
+            outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue)
+        if len(groupAValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            medianA = numpy.median(groupAValues)
+            outputLine = "%s\t%f" % (outputLine, medianA)
+        if len(groupBValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            medianB = numpy.median(groupBValues)
+            outputLine = "%s\t%f" % (outputLine, medianB)
+        if len(groupAValues) == 0 or len(groupBValues) == 0:
+            outputLine = outputLine + "\tNA"
+        else:
+            outputLine = "%s\t%f" % (outputLine, medianA - medianB)
+        outfile.write("%s\n" % (outputLine))
+    outfile.close()
+
+def readClinicalFile(clinicalFile):
+    clinicalVector = dict()
+    fp = open(clinicalFile)
+    fp.readline()
+    for row in fp:
+        tokens = row.rstrip().split("\t")
+        clinicalVector[tokens[0]] = tokens[1]
+    fp.close()
+    return(clinicalVector)
+
+def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB):
+    columnsGroupA = list()
+    columnsGroupB = list()
+    for ii in range(len(headerTokens)):
+        id = headerTokens[ii]
+        if clinicalVector.has_key(id):
+            if clinicalVector[id] == optionA:
+                columnsGroupA.append(ii)
+            elif clinicalVector[id] == optionB:
+                columnsGroupB.append(ii)
+    return((columnsGroupA, columnsGroupB))
+
+def performTTest(groupA, groupB):
+    from scipy import special
+
+    epsilon = 1E-06
+    countA = len(groupA)
+    countB = len(groupB)
+    meanA = numpy.mean(groupA)
+    meanB = numpy.mean(groupB)
+    varA = numpy.var(groupA)
+    varB = numpy.var(groupB)
+    degreesOfFreedom = countA + countB - 2
+    sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom
+    if sVar == 0:
+        tStat = None
+        prob = None
+    else:
+        tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon)
+                                                  + 1.0/(countB + epsilon)))
+        bottom = degreesOfFreedom + tStat * tStat
+        prob = special.betainc(0.5*degreesOfFreedom, 0.5,
+                               degreesOfFreedom/bottom)
+    return(tStat, prob)
+
+
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ttest/stats.xml	Mon Jul 28 20:12:17 2014 -0400
@@ -0,0 +1,104 @@
+<tool id="ucscCancerBrowserStats" description="Statistical Tests of Difference" name="UCSC Cancer Browser Stats" version="0.0.1">
+  <description>Apply statistical tests of difference to the rows in a genomic matrix, where the columns are categorized by a second (clinical) matrix</description>
+  <command interpreter="python">
+    stats.py  $genomicMatrix $clinicalFeatures $outFile -a="${category1}" -b="${category2}"
+  </command>
+  <inputs>
+    <param format="tabular" name="genomicMatrix" type="data" label="Genomic Matrix"/>
+    <param format="tabular" name="clinicalFeatures" type="data" label="Clinical Matrix"/>
+    <param type="text" name="category1" label="Category 1" optional="false"/>
+    <param type="text" name="category2" label="Category 2" optional="false"/>
+  </inputs>
+  <outputs>
+    <data format="tabular" name="outFile" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">numpy</requirement>
+  </requirements>
+  <tests>
+    <param name="genomicMatrix" value="sample.genomic.matrix.txt"/>
+    <param name="clinicalMatrix" value="sample.clinical.matrix.txt"/>
+    <param name="category1" value="A"/>
+    <param name="category2" value="B"/>
+    <output name="outFile" value="sample.stats.output.txt"/>
+  </tests>
+  <help>
+
+This tool performs statistical tests found in the UCSC Cancer Genomics
+Browser.  The input data is a genomic matrix (containing genomic data,
+with rows representing genes or probes and columns representing
+samples or patients), a clinical matrix of two (or more) columns
+assigning categorical values to the samples, and two categorical
+values of interest.  The tool identifies the samples corresponding to
+each categorical value, then identifies the columns in the genomic
+matrix corresponding to those sets of samples, which identifies two
+groups of columns.  For each row in the genomic matrix, it extracts
+the value for those two sets of columns, performs a t-test on the two
+sets of values, and returns the result for the row.  Any values for
+any columns NOT pertaining to one of the categorical values of
+interest are ignored.
+
+The user runs this tool with th following steps:
+
+
+1. Specify a genomic matrix.  The expected format is with rows representing
+genes and columns representing samples, and the first line contains sample
+names.
+
+2. Specify a clinical matrix.  Here, rows indicate samples, columns
+indicate clinical features, and the header row contains feature names.
+The first column MUST indicate the sample names, and MUST correspond
+to the column names of the genomic matrix.  The clinical feature of
+interest MUST be in the second column.  Any other columns will be
+ignored.
+
+
+3. Indicate two clinical values that you want to use for defining the
+two groups.  For example, the two groups could be "Red group" and
+"Green group", 0 and 1, or whatever.
+
+The output indicates, for each row, the t-statistic reporting on the
+difference between the two groups of columns (as specified by the two
+clinical values), the p-value corresponding to that t-statistic, the
+median value for each group, and the difference between the medians.  If it
+cannot calculate these values, it returns a vector of NAs.
+
+For example, given the following genomic matrix for (1)::
+
+    Gene  1    2    3    4    5    6    7    8    9    10
+    G1    2.0  2.2  3.2  1.1  5.1  8.1  3.2  1.1  8.1  0.2
+    G2    0.1  8.2  9.1  4.2  6.1  4.9  3.9  2.3  1.1  0.2
+
+and given the following clinical matrix for (2)::
+
+    sample_id Value
+    1         A
+    2         A
+    3         B
+    4         C
+    5         B
+    6         B
+    7         A
+    8         A
+    9         B
+    10        A
+
+and given A for Category 1 and B for Category 2
+
+the tool will assemble the following two groups of values::
+
+    G1 A:(2.0, 2.2, 3.2, 1.1, 0.2) B:(3.2, 5.1, 8.1, 8.1)
+    G2 A:(0.1, 8.2, 3.9, 2.3, 0.2) B:(9.1, 6.1, 4.9, 1.1)
+
+Note that the values for sample_id 4 do not appear, because it has a Value
+of C in the second column, which is neither A nor B.
+
+And it will return the output::
+
+    Gene Statistic  pValue    Median1   Median2   Delta
+    G1   -4.168999  0.004194  2.000000  6.600000  -4.600000
+    G2   -1.198486  0.269724  2.300000  5.500000  -3.200000
+
+
+  </help>
+</tool>