changeset 1:ec5ade08ac8a draft

Hopefully fixed dependency XML.
author adam-novak
date Thu, 13 Jun 2013 16:47:13 -0400
parents 95ff566506f4
children d9bcb6bc9e47
files hexagram/hexagram.py hexagram/hexagram.xml hexagram/layers.tab hexagram/matrices.tab hexagram/matrix_0.tab hexagram/statistics.js hexagram/tool_dependencies.xml hexagram/tool_dependency.xml hexagram/tsv.pyc
diffstat 9 files changed, 230 insertions(+), 65 deletions(-) [+]
line wrap: on
line diff
--- a/hexagram/hexagram.py	Tue Jun 11 18:26:25 2013 -0400
+++ b/hexagram/hexagram.py	Thu Jun 13 16:47:13 2013 -0400
@@ -59,9 +59,14 @@
         help="Galaxy-escaped name of the query signature")
     parser.add_argument("--window_size", type=int, default=20,
         help="size of the window to use when looking for clusters")
+    parser.add_argument("--truncation_edges", type=int, default=10,
+        help="number of edges for DrL truncate to pass per node")
     parser.add_argument("--no-stats", dest="stats", action="store_false", 
         default=True,
         help="disable cluster-finding statistics")
+    parser.add_argument("--include-singletons", dest="singletons", 
+        action="store_true", default=False,
+        help="add self-edges to retain unconnected points")
         
     return parser.parse_args(args)
 
@@ -677,8 +682,23 @@
     print "Regularizing similarity matrix..."
     sys.stdout.flush()
     
+    # This holds a list of all unique signature names in the similarity matrix.
+    # We can use it to add edges to keep singletons.
+    signatures = set()
+    
     for parts in sim_reader:
+        # Keep the signature names used
+        signatures.add(parts[0])
+        signatures.add(parts[1])
+        
+        # Save the line to the regularized file
         sim_writer.list_line(parts)
+    
+    if options.singletons:    
+        # Now add a self-edge on every node, so we don't drop nodes with no
+        # other strictly positive edges
+        for signature in signatures:
+            sim_writer.line(signature, signature, 1)
         
     sim_reader.close()
     sim_writer.close()
@@ -689,7 +709,8 @@
     # TODO: pass a truncation level
     print "DrL: Truncating..."
     sys.stdout.flush()
-    subprocess.check_call(["truncate", drl_basename]) 
+    subprocess.check_call(["truncate", "-t", str(options.truncation_edges), 
+        drl_basename]) 
         
     # Run the DrL layout engine.
     print "DrL: Doing layout..."
@@ -708,7 +729,9 @@
     # This holds a reader for the DrL output
     coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r"))
     
-    # This holds a dict from signature name string to (x, y) float tuple
+    # This holds a dict from signature name string to (x, y) float tuple. It is
+    # also our official collection of node names that made it through DrL, and
+    # therefore need their score data sent to the client.
     nodes = {}
     
     print "Reading DrL output..."
@@ -840,6 +863,13 @@
                 # This is the signature that this line is about
                 signature_name = parts[0]
                 
+                if signature_name not in nodes:
+                    # This signature wasn't in our DrL output. Don't bother
+                    # putting its layer data in our visualization. This saves
+                    # space and makes the client-side layer counts accurate for
+                    # the data actually displayable.
+                    continue
+                
                 # These are the scores for all the layers for this signature
                 layer_scores = parts[1:]
                 
--- a/hexagram/hexagram.xml	Tue Jun 11 18:26:25 2013 -0400
+++ b/hexagram/hexagram.xml	Thu Jun 13 16:47:13 2013 -0400
@@ -1,7 +1,17 @@
+<?xml version="1.0"?>
 <tool id="hexagram" name="Hexagram Visualization" version="0.1">
     <description>Interactive hex grid clustering visualization</description>
     <requirements>
+        <!--
+            Go get the drl-graph-layout package as defined in
+            tool_dependencies.xml
+        -->
         <requirement type="package" version="1.1">drl-graph-layout</requirement>
+        <!--
+            And go get some Python modules that aren't standard.
+        -->
+        <requirement type="python-module">numpy</requirement>
+        <requirement type="python-module">scipy</requirement>
     </requirements> 
     <!-- 
         This is the command to run as a Cheetah template.
@@ -20,6 +30,10 @@
         #end if
         --html "$output"
         --directory "$output.files_path"
+        --truncation_edges $edges
+        #if $singletons
+            --include-singletons
+        #end if
         #if $nostats
             --no-stats
         #end if
@@ -33,9 +47,13 @@
         </repeat>       
         <param name="colormaps" type="data" format="text" optional="true" 
             label="Colormap configuration file"/>
+        <param name="edges" type="integer" value="10" 
+            label="Number of edges to use per node"/>
         <param name="query" type="text" 
             label="Name of query signature"
-            help="A signature name, or empty for no query."/>
+            help="A signature name, or empty for no query"/>
+        <param name="singletons" type="boolean" 
+            label="Keep unconnected singleton signatures"/>
         <param name="nostats" type="boolean" 
             label="Skip calculation of heatmap clumpiness statistics"/>
     </inputs>
--- a/hexagram/layers.tab	Tue Jun 11 18:26:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-metformin_0.00001	layer_0.tab	10	86	
-a_very long_long_long_name_with_no convenient_text_wrapping_breakpoints <br/> <br/> <br/>now with some extra spaced text that should already wrap fine	layer_1.tab	11	86	
-tissue	layer_2.tab	1	20	
-iCluster.k25	layer_3.tab	2	20	
-what 1	layer_4.tab	2	20	12
-zap 2	layer_5.tab	2	20	11
--- a/hexagram/matrices.tab	Tue Jun 11 18:26:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1 +0,0 @@
-matrix_0.tab
--- a/hexagram/matrix_0.tab	Tue Jun 11 18:26:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-id	metformin_0.00001	a_very long_long_long_name_with_no convenient_text_wrapping_breakpoints <br/> <br/> <br/>now with some extra spaced text that should already wrap fine	tissue	iCluster.k25	binary 1	binary 2
-GSE14206_GPL8_stage=T3A_PHENOTYPE.tab	0.103656268715241	0.0662108455001999	0	1	1	0
-GSE18655_GPL5858_recurrence=No_Rec_PHENOTYPE.tab	0.00736496699455639	-0.0030872481570366	2	2	0	0
-GSE21034_GPL10264_pathological_stage=T4_PHENOTYPE.tab	0.212395889719257	0.104946746175943	3	3	1	1
-TCGA_tumor_level=Middle_PHENOTYPE.tab	-0.0750289367770828	-0.0160926008102573	4	4	0	1
-GSE14206_GPL8_stage=T2A_PHENOTYPE.tab	0.101591559700421	0.0581690019909618	5	5	1	1
-TCGA_clinical_spread_ct2=Induration_and+or_Nodularity_Involves___or_=__?_of_one_lobe__cT2a__PHENOTYPE.tab	-0.0448303097941127	-0.0555992149962499	6	6	1	0
-GSE21034_GPL10264_ERG-fusion_gex=True_PHENOTYPE.tab	0.0292437277400893	-0.0210673135177115	7	7	1	1
-GSE14206_GPL8_stage=T2C_PHENOTYPE.tab	-0.0942458205785588	-0.0632178924636636	8	8	1	0
-GSE18655_GPL5858_age_MEAN.tab	-0.0398766576526588	0.00359207190540213	9	9	0	1
-GSE18655_GPL5858_psa_MEAN.tab	-0.0959320347649498	-0.00882728114771138	11	10	0	1
-GSE21034_GPL10264_Gene_fusion=True_PHENOTYPE.tab	0.0292437277400893	-0.0210673135177115	1	11	0	0
-GSE14206_GPL8_stage=T3B_PHENOTYPE.tab	-0.105814702233279	-0.0740118918016848	2	12	0	0
-GSE21034_GPL10264_pathological_stage=NA_PHENOTYPE.tab	0.112017018347965	0.0251898331610073	3	13	1	0
-TCGA_zone_of_origin=Peripheral_Zone_PHENOTYPE.tab	-0.00304197273959563	-0.0207284395193551	4	14	1	1
-TCGA_diagnostic_mri_results=Extraprostatic_Extension_Localized__e.g._seminal_vesicles__PHENOTYPE.tab	0.00993944807969242	-0.0317703371649353	5	15	0	0
-TCGA_pathologic_spread_pt4=YES_PHENOTYPE.tab	-0.0724829088312745	-0.0274093321577233	6	16	0	1
-TCGA_shortest_dimension_MEAN.tab	-0.0777725626701397	-0.0166257461335536	7	17	1	1
-GSE21034_GPL10264_clint_stage=T2C_PHENOTYPE.tab	0.192148495282519	0.077324537538078	8	18	1	1
-TCGA_diagnostic_ct_abd_pelvis_performed=YES_PHENOTYPE.tab	0.098060120151555	0.0451944074068774	9	19	1	1
-GSE14206_GPL887_ets_group=ESE3_Low_PHENOTYPE.tab	-0.0777826036647061	-0.028060859132074	10	20	1	1
--- a/hexagram/statistics.js	Tue Jun 11 18:26:25 2013 -0400
+++ b/hexagram/statistics.js	Thu Jun 13 16:47:13 2013 -0400
@@ -189,9 +189,13 @@
     if(is_binary) {
         // This is a binary/dichotomous layer, so run a binomial test.
         return binomial_compare(layer_data, in_list, out_list, all_list);
+    } else if (is_discrete) {
+        // This is a multinomial/categorical layer    
+        // TODO: statistics for discrete non-binary layers
+        return NaN;
     } else {
-        // TODO: statistics that aren't on binary layers
-        return NaN;
+        // This is a continuous layer, so run a t test
+        return t_compare(layer_data, in_list, out_list, all_list);
     }
 
 }
@@ -243,6 +247,147 @@
     return statistics_for_layer(layer_data, in_list, out_list, all_list);
 }
 
+function t_compare(layer_data, in_list, out_list, all_list) {
+    // Given the data of a continuous layer object (an object from signature
+    // name to float (or undefined)), and arrays of the names of "in" and "out"
+    // signatures, do a t test for whether the in signatures differ from
+    // the out signatures. Returns the p value of the test (two-tailed), or NaN 
+    // if the test cannot be performed (due to, e.g. fewer than 2 samples in one
+    // category).
+    
+    // Go through the in list and calculate all the summary statistics
+    // How many non-NaN values?
+    var number_in = 0;
+    // What is the sum?
+    var sum_in = 0;
+    
+    for(var i = 0; i < in_list.length; i++) {
+        if(!isNaN(layer_data[in_list[i]])) {
+            number_in++;
+            sum_in += layer_data[in_list[i]];
+        }
+    }
+    
+    // We've done one pass, so we know if we have any in list data actually
+    if(number_in < 2) {
+        // Not enough to run the t test
+        return NaN;
+    }
+    
+    // What is the mean?
+    var mean_in = sum_in / number_in;
+    
+    // What is the second moment (sum of squares of differences from the mean)
+    var second_moment_in = 0;
+    for(var i = 0; i < in_list.length; i++) {
+        if(!isNaN(layer_data[in_list[i]])) {
+            second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2);
+        }
+    }
+    
+    // What is the unbiased variance?
+    unbiased_variance_in = second_moment_in / (number_in - 1);
+    
+    // Now go through the same process for the out list
+    // How many non-NaN values?
+    var number_out = 0;
+    // What is the sum?
+    var sum_out = 0;
+    
+    for(var i = 0; i < out_list.length; i++) {
+        if(!isNaN(layer_data[out_list[i]])) {
+            number_out++;
+            sum_out += layer_data[out_list[i]];
+        }
+    }
+    
+    // We've done one pass, so we know if we have any out list data actually
+    if(number_out < 2) {
+        // Not enough to run the t test
+        return NaN;
+    }
+    
+    // What is the mean?
+    var mean_out = sum_out / number_out;
+    
+    // What is the second moment (sum of squares of differences from the mean)
+    var second_moment_out = 0;
+    for(var i = 0; i < out_list.length; i++) {
+        if(!isNaN(layer_data[out_list[i]])) {
+            second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out, 
+                2);
+        }
+    }
+    
+    // What is the unbiased variance?
+    unbiased_variance_out = second_moment_out / (number_out - 1);
+    
+    // We can't do the test if both variances are 0
+    if(unbiased_variance_in == 0 && unbiased_variance_out == 0) {
+        return NaN;
+    }
+    
+    // Now we can calculate the t test two-tailed p value
+    var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out, 
+        unbiased_variance_out, number_out);
+        
+    print("p=" + p_value);
+        
+    // And return it
+    return p_value;
+}
+
+function t_test(mean_in, unbiased_variance_in, number_in, mean_out, 
+    unbiased_variance_out, number_out) {
+
+    // Given the mean, unbiased variance, and number of samples for both the in
+    // group and the out group, compute the p value for the t test with unequal
+    // sample sizes and unequal variances, testing to see whether the means
+    // differ (a two-tailed "Welch's" t test). See
+    // https://en.wikipedia.org/wiki/Student%27s_t-test
+    // Assumes we have enough samples to actually perform the test.
+    
+    print("Running t test with mean " + mean_in + " and variance " + 
+        unbiased_variance_in + " at n=" + number_in + " versus mean " + 
+        mean_out + " and variance " + unbiased_variance_out + " at n=" + 
+        number_out);
+    
+    // First, calculate the t statistic, which is where our observations fall on
+    // the t distribution.
+    var t_statistic = (mean_in - mean_out) / Math.sqrt((unbiased_variance_in /
+        number_in) + (unbiased_variance_out / number_out));
+        
+        
+    // Calculate the degrees of freedom for the particular t distribution that
+    // we ought to compare the statistic against
+    var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) + 
+        (unbiased_variance_out / number_out), 2) / 
+        ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) + 
+        (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1)));
+
+    print("t = " + t_statistic + ", DoF = " + degrees_of_freedom); 
+
+    // Now we have to compare the t statistic to the t test CDF available via
+    // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log)
+    // where:
+    // q is the t statistic value to calculate the cdf at
+    // df is the degrees of freedom
+    // ncp is the "mu" parameter for the t distribution. I think this sets the 
+    // mean, and it's OK to leave blank.
+    // lower_tail presumably specifies if we want the lower or upper tail of the
+    // CDF. Defaults to true.
+    // Log specifies if we want the log probability. Defaults to false.
+    
+    // Make the t statistic be on the low side of the distribution, and
+    // calculate the lower tail's area using the CDF.
+    var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic), 
+        degrees_of_freedom);
+        
+    // Return the two-tailed p value, which, since the t distribution is
+    // symmetric, is just twice the single-tail probability
+    return 2 * one_tail_probability;
+     
+}
 
 function binomial_compare(layer_data, in_list, out_list, all_list) {
     // Given the data of a binary layer object (an object from signature name to
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hexagram/tool_dependencies.xml	Thu Jun 13 16:47:13 2013 -0400
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<!--
+Defines how to install the binaries that this tool depends on (in this case, DrL).
+Based on the examples at http://wiki.galaxyproject.org/ToolShedToolFeatures
+and http://toolshed.g2.bx.psu.edu/repos/jjohnson/defuse/file/f65857c1b92e/tool_dependencies.xml
+-->
+<tool_dependency>
+    <package name="drl-graph-layout" version="1.1">
+        <install version="1.0"><!-- This is the install tag version, not the package version -->
+            <actions>
+                <action type="shell_command">hg clone https://bitbucket.org/adam_novak/drl-graph-layout</action>
+                <!-- 
+                TODO: We're supposed to copy the right Configuration.mk file. 
+                Not doing so assumes our system is GNU 
+                -->
+                <action type="shell_command">hg up -r drl-graph-layout-1.1</action>
+                <action type="shell_command">make</action>
+                <action type="move_directory_files">
+                    <source_directory>bin</source_directory>
+                    <destination_directory>$INSTALL_DIR/bin</destination_directory>
+                </action>
+                <action type="set_environment">
+                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
+                    <!-- Now we can access DrL tools like truncate (at the expense of GNU truncate) -->
+                </action>
+            </actions>
+        </install>
+        <readme>
+        This installs the latest DrL Graph Layout tool from Adam Novak's Bitbucket, because Shawn Martin has stopped maintaining it.
+        </readme>
+    </package>
+</tool_dependency>
--- a/hexagram/tool_dependency.xml	Tue Jun 11 18:26:25 2013 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,32 +0,0 @@
-<?xml version="1.0"?>
-<!--
-Defines how to install the binaries that this tool depends on (in this case, DrL).
-Based on the examples at http://wiki.galaxyproject.org/ToolShedToolFeatures
-and http://toolshed.g2.bx.psu.edu/repos/jjohnson/defuse/file/f65857c1b92e/tool_dependencies.xml
--->
-<tool_dependency>
-    <package name="drl-graph-layout" version="1.1">
-        <install version="1.0"><!-- This is the install tag version, not the package version -->
-            <actions>
-                <action type="shell_command">hg clone https://bitbucket.org/adam_novak/drl-graph-layout</action>
-                <!-- 
-                TODO: We're supposed to copy the right Configuration.mk file. 
-                Not doing so assumes our system is GNU 
-                -->
-                <action type="shell_command">hg up -r drl-graph-layout-1.1</action>
-                <action type="shell_command">make</action>
-                <action type="move_directory_files">
-                    <source_directory>bin</source_directory>
-                    <destination_directory>$INSTALL_DIR/bin</destination_directory>
-                </action>
-                <action type="set_environment">
-                    <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable>
-                    <!-- Now we can access DrL tools like truncate (at the expense of GNU truncate) -->
-                </action>
-            </actions>
-        </install>
-        <readme>
-        This installs the latest DrL Graph Layout tool from Adam Novak's Bitbucket, because Shawn Martin has stopped maintaining it.
-        </readme>
-    </package>
-</tool_dependency>
Binary file hexagram/tsv.pyc has changed