Mercurial > repos > melissacline > ucsc_cancer_browser_stats
changeset 0:12bb38e187b9
Uploaded, initial check-in
author | melissacline |
---|---|
date | Mon, 28 Jul 2014 20:12:17 -0400 |
parents | |
children | 895f0a636f37 |
files | ttest/._sample.stats.output.txt ttest/sample.clinical.matrix.txt ttest/sample.genomic.matrix.txt ttest/sample.stats.output.txt ttest/stats.py ttest/stats.xml |
diffstat | 6 files changed, 239 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/sample.clinical.matrix.txt Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,11 @@ +sample_id Value +1 A +2 A +3 B +4 C +5 B +6 B +7 A +8 A +9 B +10 A
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/sample.genomic.matrix.txt Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,3 @@ +Gene 1 2 3 4 5 6 7 8 9 10 +G1 2.0 2.2 3.2 1.1 5.1 8.1 3.2 1.1 8.1 0.2 +G2 0.1 8.2 9.1 4.2 6.1 4.9 3.9 2.3 1.1 0.2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/sample.stats.output.txt Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,3 @@ +Gene Statistic pValue Median1 Median2 Delta +G1 -4.168999 0.004194 2.000000 6.600000 -4.600000 +G2 -1.198486 0.269724 2.300000 5.500000 -3.200000
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/stats.py Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,118 @@ +#!/usr/bin/env python + + + +import argparse +import math +import re +import sys +try: + import numpy +except: + from galaxy import eggs + eggs.require("numpy") + import numpy +try: + import scipy +except: + from galaxy import eggs + eggs.require( "scipy" ) + import scipy + import scipy.stats + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("genomicData", type=str) + parser.add_argument("clinicalVector", type=str) + parser.add_argument("outfile", type=str) + parser.add_argument('-a', type=str, default=None) + parser.add_argument('-b', type=str, default=None) + args = parser.parse_args() + clinicalVector = readClinicalFile(args.clinicalVector) + genomicFp = open(args.genomicData) + headerTokens = genomicFp.readline().rstrip().split("\t") + (columnsGroupA, columnsGroupB) = headerLineToGroups(headerTokens, + clinicalVector, + args.a, args.b) + print columnsGroupA, columnsGroupB + outfile = open(args.outfile, "w") + outfile.write("%s\tStatistic\tpValue\tMedian1\tMedian2\tDelta\n" % (headerTokens[0])) + for row in genomicFp: + tokens = row.rstrip().split("\t") + symbol = tokens[0] + groupATokens = [tokens[ii] for ii in columnsGroupA] + groupBTokens = [tokens[ii] for ii in columnsGroupB] + groupAValues = [float(xx) for xx in groupATokens if xx != "NA"] + groupBValues = [float(xx) for xx in groupBTokens if xx != "NA"] + (ttest, pvalue) = performTTest(groupAValues, groupBValues) + if ttest == None or pvalue == None: + outputLine = "%s\tNA\tNA" % (symbol) + else: + outputLine= "%s\t%f\t%f" % (symbol, ttest, pvalue) + if len(groupAValues) == 0: + outputLine = outputLine + "\tNA" + else: + medianA = numpy.median(groupAValues) + outputLine = "%s\t%f" % (outputLine, medianA) + if len(groupBValues) == 0: + outputLine = outputLine + "\tNA" + else: + medianB = numpy.median(groupBValues) + outputLine = "%s\t%f" % (outputLine, medianB) + if len(groupAValues) == 0 or len(groupBValues) == 0: + outputLine = outputLine + "\tNA" + else: + outputLine = "%s\t%f" % (outputLine, medianA - medianB) + outfile.write("%s\n" % (outputLine)) + outfile.close() + +def readClinicalFile(clinicalFile): + clinicalVector = dict() + fp = open(clinicalFile) + fp.readline() + for row in fp: + tokens = row.rstrip().split("\t") + clinicalVector[tokens[0]] = tokens[1] + fp.close() + return(clinicalVector) + +def headerLineToGroups(headerTokens, clinicalVector, optionA, optionB): + columnsGroupA = list() + columnsGroupB = list() + for ii in range(len(headerTokens)): + id = headerTokens[ii] + if clinicalVector.has_key(id): + if clinicalVector[id] == optionA: + columnsGroupA.append(ii) + elif clinicalVector[id] == optionB: + columnsGroupB.append(ii) + return((columnsGroupA, columnsGroupB)) + +def performTTest(groupA, groupB): + from scipy import special + + epsilon = 1E-06 + countA = len(groupA) + countB = len(groupB) + meanA = numpy.mean(groupA) + meanB = numpy.mean(groupB) + varA = numpy.var(groupA) + varB = numpy.var(groupB) + degreesOfFreedom = countA + countB - 2 + sVar = ((countA - 1) * varA + (countB - 1) * varB) / degreesOfFreedom + if sVar == 0: + tStat = None + prob = None + else: + tStat = (meanA - meanB)/math.sqrt(sVar * (1.0/(countA + epsilon) + + 1.0/(countB + epsilon))) + bottom = degreesOfFreedom + tStat * tStat + prob = special.betainc(0.5*degreesOfFreedom, 0.5, + degreesOfFreedom/bottom) + return(tStat, prob) + + + + +if __name__ == "__main__": + main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ttest/stats.xml Mon Jul 28 20:12:17 2014 -0400 @@ -0,0 +1,104 @@ +<tool id="ucscCancerBrowserStats" description="Statistical Tests of Difference" name="UCSC Cancer Browser Stats" version="0.0.1"> + <description>Apply statistical tests of difference to the rows in a genomic matrix, where the columns are categorized by a second (clinical) matrix</description> + <command interpreter="python"> + stats.py $genomicMatrix $clinicalFeatures $outFile -a="${category1}" -b="${category2}" + </command> + <inputs> + <param format="tabular" name="genomicMatrix" type="data" label="Genomic Matrix"/> + <param format="tabular" name="clinicalFeatures" type="data" label="Clinical Matrix"/> + <param type="text" name="category1" label="Category 1" optional="false"/> + <param type="text" name="category2" label="Category 2" optional="false"/> + </inputs> + <outputs> + <data format="tabular" name="outFile" /> + </outputs> + <requirements> + <requirement type="python-module">numpy</requirement> + </requirements> + <tests> + <param name="genomicMatrix" value="sample.genomic.matrix.txt"/> + <param name="clinicalMatrix" value="sample.clinical.matrix.txt"/> + <param name="category1" value="A"/> + <param name="category2" value="B"/> + <output name="outFile" value="sample.stats.output.txt"/> + </tests> + <help> + +This tool performs statistical tests found in the UCSC Cancer Genomics +Browser. The input data is a genomic matrix (containing genomic data, +with rows representing genes or probes and columns representing +samples or patients), a clinical matrix of two (or more) columns +assigning categorical values to the samples, and two categorical +values of interest. The tool identifies the samples corresponding to +each categorical value, then identifies the columns in the genomic +matrix corresponding to those sets of samples, which identifies two +groups of columns. For each row in the genomic matrix, it extracts +the value for those two sets of columns, performs a t-test on the two +sets of values, and returns the result for the row. Any values for +any columns NOT pertaining to one of the categorical values of +interest are ignored. + +The user runs this tool with th following steps: + + +1. Specify a genomic matrix. The expected format is with rows representing +genes and columns representing samples, and the first line contains sample +names. + +2. Specify a clinical matrix. Here, rows indicate samples, columns +indicate clinical features, and the header row contains feature names. +The first column MUST indicate the sample names, and MUST correspond +to the column names of the genomic matrix. The clinical feature of +interest MUST be in the second column. Any other columns will be +ignored. + + +3. Indicate two clinical values that you want to use for defining the +two groups. For example, the two groups could be "Red group" and +"Green group", 0 and 1, or whatever. + +The output indicates, for each row, the t-statistic reporting on the +difference between the two groups of columns (as specified by the two +clinical values), the p-value corresponding to that t-statistic, the +median value for each group, and the difference between the medians. If it +cannot calculate these values, it returns a vector of NAs. + +For example, given the following genomic matrix for (1):: + + Gene 1 2 3 4 5 6 7 8 9 10 + G1 2.0 2.2 3.2 1.1 5.1 8.1 3.2 1.1 8.1 0.2 + G2 0.1 8.2 9.1 4.2 6.1 4.9 3.9 2.3 1.1 0.2 + +and given the following clinical matrix for (2):: + + sample_id Value + 1 A + 2 A + 3 B + 4 C + 5 B + 6 B + 7 A + 8 A + 9 B + 10 A + +and given A for Category 1 and B for Category 2 + +the tool will assemble the following two groups of values:: + + G1 A:(2.0, 2.2, 3.2, 1.1, 0.2) B:(3.2, 5.1, 8.1, 8.1) + G2 A:(0.1, 8.2, 3.9, 2.3, 0.2) B:(9.1, 6.1, 4.9, 1.1) + +Note that the values for sample_id 4 do not appear, because it has a Value +of C in the second column, which is neither A nor B. + +And it will return the output:: + + Gene Statistic pValue Median1 Median2 Delta + G1 -4.168999 0.004194 2.000000 6.600000 -4.600000 + G2 -1.198486 0.269724 2.300000 5.500000 -3.200000 + + + </help> +</tool>