Mercurial > repos > adam-novak > hexagram
changeset 1:ec5ade08ac8a draft
Hopefully fixed dependency XML.
author | adam-novak |
---|---|
date | Thu, 13 Jun 2013 16:47:13 -0400 |
parents | 95ff566506f4 |
children | d9bcb6bc9e47 |
files | hexagram/hexagram.py hexagram/hexagram.xml hexagram/layers.tab hexagram/matrices.tab hexagram/matrix_0.tab hexagram/statistics.js hexagram/tool_dependencies.xml hexagram/tool_dependency.xml hexagram/tsv.pyc |
diffstat | 9 files changed, 230 insertions(+), 65 deletions(-) [+] |
line wrap: on
line diff
--- a/hexagram/hexagram.py Tue Jun 11 18:26:25 2013 -0400 +++ b/hexagram/hexagram.py Thu Jun 13 16:47:13 2013 -0400 @@ -59,9 +59,14 @@ help="Galaxy-escaped name of the query signature") parser.add_argument("--window_size", type=int, default=20, help="size of the window to use when looking for clusters") + parser.add_argument("--truncation_edges", type=int, default=10, + help="number of edges for DrL truncate to pass per node") parser.add_argument("--no-stats", dest="stats", action="store_false", default=True, help="disable cluster-finding statistics") + parser.add_argument("--include-singletons", dest="singletons", + action="store_true", default=False, + help="add self-edges to retain unconnected points") return parser.parse_args(args) @@ -677,8 +682,23 @@ print "Regularizing similarity matrix..." sys.stdout.flush() + # This holds a list of all unique signature names in the similarity matrix. + # We can use it to add edges to keep singletons. + signatures = set() + for parts in sim_reader: + # Keep the signature names used + signatures.add(parts[0]) + signatures.add(parts[1]) + + # Save the line to the regularized file sim_writer.list_line(parts) + + if options.singletons: + # Now add a self-edge on every node, so we don't drop nodes with no + # other strictly positive edges + for signature in signatures: + sim_writer.line(signature, signature, 1) sim_reader.close() sim_writer.close() @@ -689,7 +709,8 @@ # TODO: pass a truncation level print "DrL: Truncating..." sys.stdout.flush() - subprocess.check_call(["truncate", drl_basename]) + subprocess.check_call(["truncate", "-t", str(options.truncation_edges), + drl_basename]) # Run the DrL layout engine. print "DrL: Doing layout..." @@ -708,7 +729,9 @@ # This holds a reader for the DrL output coord_reader = tsv.TsvReader(open(drl_basename + ".coord", "r")) - # This holds a dict from signature name string to (x, y) float tuple + # This holds a dict from signature name string to (x, y) float tuple. It is + # also our official collection of node names that made it through DrL, and + # therefore need their score data sent to the client. nodes = {} print "Reading DrL output..." @@ -840,6 +863,13 @@ # This is the signature that this line is about signature_name = parts[0] + if signature_name not in nodes: + # This signature wasn't in our DrL output. Don't bother + # putting its layer data in our visualization. This saves + # space and makes the client-side layer counts accurate for + # the data actually displayable. + continue + # These are the scores for all the layers for this signature layer_scores = parts[1:]
--- a/hexagram/hexagram.xml Tue Jun 11 18:26:25 2013 -0400 +++ b/hexagram/hexagram.xml Thu Jun 13 16:47:13 2013 -0400 @@ -1,7 +1,17 @@ +<?xml version="1.0"?> <tool id="hexagram" name="Hexagram Visualization" version="0.1"> <description>Interactive hex grid clustering visualization</description> <requirements> + <!-- + Go get the drl-graph-layout package as defined in + tool_dependencies.xml + --> <requirement type="package" version="1.1">drl-graph-layout</requirement> + <!-- + And go get some Python modules that aren't standard. + --> + <requirement type="python-module">numpy</requirement> + <requirement type="python-module">scipy</requirement> </requirements> <!-- This is the command to run as a Cheetah template. @@ -20,6 +30,10 @@ #end if --html "$output" --directory "$output.files_path" + --truncation_edges $edges + #if $singletons + --include-singletons + #end if #if $nostats --no-stats #end if @@ -33,9 +47,13 @@ </repeat> <param name="colormaps" type="data" format="text" optional="true" label="Colormap configuration file"/> + <param name="edges" type="integer" value="10" + label="Number of edges to use per node"/> <param name="query" type="text" label="Name of query signature" - help="A signature name, or empty for no query."/> + help="A signature name, or empty for no query"/> + <param name="singletons" type="boolean" + label="Keep unconnected singleton signatures"/> <param name="nostats" type="boolean" label="Skip calculation of heatmap clumpiness statistics"/> </inputs>
--- a/hexagram/layers.tab Tue Jun 11 18:26:25 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -metformin_0.00001 layer_0.tab 10 86 -a_very long_long_long_name_with_no convenient_text_wrapping_breakpoints <br/> <br/> <br/>now with some extra spaced text that should already wrap fine layer_1.tab 11 86 -tissue layer_2.tab 1 20 -iCluster.k25 layer_3.tab 2 20 -what 1 layer_4.tab 2 20 12 -zap 2 layer_5.tab 2 20 11
--- a/hexagram/matrices.tab Tue Jun 11 18:26:25 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1 +0,0 @@ -matrix_0.tab
--- a/hexagram/matrix_0.tab Tue Jun 11 18:26:25 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -id metformin_0.00001 a_very long_long_long_name_with_no convenient_text_wrapping_breakpoints <br/> <br/> <br/>now with some extra spaced text that should already wrap fine tissue iCluster.k25 binary 1 binary 2 -GSE14206_GPL8_stage=T3A_PHENOTYPE.tab 0.103656268715241 0.0662108455001999 0 1 1 0 -GSE18655_GPL5858_recurrence=No_Rec_PHENOTYPE.tab 0.00736496699455639 -0.0030872481570366 2 2 0 0 -GSE21034_GPL10264_pathological_stage=T4_PHENOTYPE.tab 0.212395889719257 0.104946746175943 3 3 1 1 -TCGA_tumor_level=Middle_PHENOTYPE.tab -0.0750289367770828 -0.0160926008102573 4 4 0 1 -GSE14206_GPL8_stage=T2A_PHENOTYPE.tab 0.101591559700421 0.0581690019909618 5 5 1 1 -TCGA_clinical_spread_ct2=Induration_and+or_Nodularity_Involves___or_=__?_of_one_lobe__cT2a__PHENOTYPE.tab -0.0448303097941127 -0.0555992149962499 6 6 1 0 -GSE21034_GPL10264_ERG-fusion_gex=True_PHENOTYPE.tab 0.0292437277400893 -0.0210673135177115 7 7 1 1 -GSE14206_GPL8_stage=T2C_PHENOTYPE.tab -0.0942458205785588 -0.0632178924636636 8 8 1 0 -GSE18655_GPL5858_age_MEAN.tab -0.0398766576526588 0.00359207190540213 9 9 0 1 -GSE18655_GPL5858_psa_MEAN.tab -0.0959320347649498 -0.00882728114771138 11 10 0 1 -GSE21034_GPL10264_Gene_fusion=True_PHENOTYPE.tab 0.0292437277400893 -0.0210673135177115 1 11 0 0 -GSE14206_GPL8_stage=T3B_PHENOTYPE.tab -0.105814702233279 -0.0740118918016848 2 12 0 0 -GSE21034_GPL10264_pathological_stage=NA_PHENOTYPE.tab 0.112017018347965 0.0251898331610073 3 13 1 0 -TCGA_zone_of_origin=Peripheral_Zone_PHENOTYPE.tab -0.00304197273959563 -0.0207284395193551 4 14 1 1 -TCGA_diagnostic_mri_results=Extraprostatic_Extension_Localized__e.g._seminal_vesicles__PHENOTYPE.tab 0.00993944807969242 -0.0317703371649353 5 15 0 0 -TCGA_pathologic_spread_pt4=YES_PHENOTYPE.tab -0.0724829088312745 -0.0274093321577233 6 16 0 1 -TCGA_shortest_dimension_MEAN.tab -0.0777725626701397 -0.0166257461335536 7 17 1 1 -GSE21034_GPL10264_clint_stage=T2C_PHENOTYPE.tab 0.192148495282519 0.077324537538078 8 18 1 1 -TCGA_diagnostic_ct_abd_pelvis_performed=YES_PHENOTYPE.tab 0.098060120151555 0.0451944074068774 9 19 1 1 -GSE14206_GPL887_ets_group=ESE3_Low_PHENOTYPE.tab -0.0777826036647061 -0.028060859132074 10 20 1 1
--- a/hexagram/statistics.js Tue Jun 11 18:26:25 2013 -0400 +++ b/hexagram/statistics.js Thu Jun 13 16:47:13 2013 -0400 @@ -189,9 +189,13 @@ if(is_binary) { // This is a binary/dichotomous layer, so run a binomial test. return binomial_compare(layer_data, in_list, out_list, all_list); + } else if (is_discrete) { + // This is a multinomial/categorical layer + // TODO: statistics for discrete non-binary layers + return NaN; } else { - // TODO: statistics that aren't on binary layers - return NaN; + // This is a continuous layer, so run a t test + return t_compare(layer_data, in_list, out_list, all_list); } } @@ -243,6 +247,147 @@ return statistics_for_layer(layer_data, in_list, out_list, all_list); } +function t_compare(layer_data, in_list, out_list, all_list) { + // Given the data of a continuous layer object (an object from signature + // name to float (or undefined)), and arrays of the names of "in" and "out" + // signatures, do a t test test for whether the in signatures differ from + // the out signatures. Returns the p value of the test (two-tailed), or NaN + // if the test cannot be performed (due to, e.g. fewer than 2 samples in one + // category). + + // Go through the in list and calculate all the summary statistics + // How many non-NaN values? + var number_in = 0; + // What is the sum? + var sum_in = 0; + + for(var i = 0; i < in_list.length; i++) { + if(!isNaN(layer_data[in_list[i]])) { + number_in++; + sum_in += layer_data[in_list[i]]; + } + } + + // We've done one pass, so we know if we have any in list data actually + if(number_in < 2) { + // Not enough to run the t test + return NaN; + } + + // What is the mean? + var mean_in = sum_in / number_in; + + // What is the second moment (sum of squares of differences from the mean) + var second_moment_in = 0; + for(var i = 0; i < in_list.length; i++) { + if(!isNaN(layer_data[in_list[i]])) { + second_moment_in += Math.pow(layer_data[in_list[i]] - mean_in, 2); + } + } + + // What is the unbiased variance? + unbiased_variance_in = second_moment_in / (number_in - 1); + + // Now go through the same process for the out list + // How many non-NaN values? + var number_out = 0; + // What is the sum? + var sum_out = 0; + + for(var i = 0; i < out_list.length; i++) { + if(!isNaN(layer_data[out_list[i]])) { + number_out++; + sum_out += layer_data[out_list[i]]; + } + } + + // We've done one pass, so we know if we have any out list data actually + if(number_out < 2) { + // Not enough to run the t test + return NaN; + } + + // What is the mean? + var mean_out = sum_out / number_out; + + // What is the second moment (sum of squares of differences from the mean) + var second_moment_out = 0; + for(var i = 0; i < out_list.length; i++) { + if(!isNaN(layer_data[out_list[i]])) { + second_moment_out += Math.pow(layer_data[out_list[i]] - mean_out, + 2); + } + } + + // What is the unbiased variance? + unbiased_variance_out = second_moment_out / (number_out - 1); + + // We can't do the test if both variances are 0 + if(unbiased_variance_in == 0 && unbiased_variance_out == 0) { + return NaN; + } + + // Now we can calculate the t test two-tailed p value + var p_value = t_test(mean_in, unbiased_variance_in, number_in, mean_out, + unbiased_variance_out, number_out); + + print("p=" + p_value); + + // And return it + return p_value; +} + +function t_test(mean_in, unbiased_variance_in, number_in, mean_out, + unbiased_variance_out, number_out) { + + // Given the mean, unbiased variance, and number of samples for both the in + // group and the out group, compute the p value for the t test with unequal + // sample sizes and unequal variances, testing to see whether the means + // differ (a two-tailed "Welch's" t test). See + // https://en.wikipedia.org/wiki/Student%27s_t-test + // Assumes we have enough samples to actually perform the test. + + print("Running t test with mean " + mean_in + " and variance " + + unbiased_variance_in + " at n=" + number_in + " versus mean " + + mean_out + " and variance " + unbiased_variance_out + " at n=" + + number_out); + + // First, calculate the t statistic, which is where our observations fall on + // the t distribution. + var t_statistic = (mean_in - mean_out) / Math.sqrt((unbiased_variance_in / + number_in) + (unbiased_variance_out / number_out)); + + + // Calculate the degrees of freedom for the particular t distribution that + // we ought to compare the statistic against + var degrees_of_freedom = Math.pow((unbiased_variance_in / number_in) + + (unbiased_variance_out / number_out), 2) / + ((Math.pow(unbiased_variance_in / number_in, 2) / (number_in - 1)) + + (Math.pow(unbiased_variance_out / number_out, 2) / (number_out - 1))); + + print("t = " + t_statistic + ", DoF = " + degrees_of_freedom); + + // Now we have to compare the t statistic to the t test CDF available via + // the totally undocumented jstat.pt = function(q, df, ncp, lower_tail, log) + // where: + // q is the t statistic value to calculate the cdf at + // df is the degrees of freedom + // ncp is the "mu" parameter for the t distributiuon. I think this sets the + // mean, and it's OK to leave blank. + // lower_tail presumably specifies if we want the lower or upper tail of the + // CDF. Defaults to true. + // Log specifies if we want the log probability. Defaults to false. + + // Make the t statistic be on the low side of the distribution, and + // calculate the lower tail's area using the CDF. + var one_tail_probability = jstat.pt(0 - Math.abs(t_statistic), + degrees_of_freedom); + + // Return the two-tailed p value, which, since the t distribution is + // symmetric, is just twice the single-tail probability + return 2 * one_tail_probability; + +} function binomial_compare(layer_data, in_list, out_list, all_list) { // Given the data of a binary layer object (an object from signature name to
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hexagram/tool_dependencies.xml Thu Jun 13 16:47:13 2013 -0400 @@ -0,0 +1,32 @@ +<?xml version="1.0"?> +<!-- +Defines how to install the binaries that this tool depends on (in this case, DrL). +Based on the examples at http://wiki.galaxyproject.org/ToolShedToolFeatures +and http://toolshed.g2.bx.psu.edu/repos/jjohnson/defuse/file/f65857c1b92e/tool_dependencies.xml +--> +<tool_dependency> + <package name="drl-graph-layout" version="1.1"> + <install version="1.0"><!-- This is the install tag version, not the package version --> + <actions> + <action type="shell_command">hg clone https://bitbucket.org/adam_novak/drl-graph-layout</action> + <!-- + TODO: We're supposed to copy the right Configuration.mk file. + Not doing so assumes our system is GNU + --> + <action type="shell_command">hg up -r drl-graph-layout-1.1</action> + <action type="shell_command">make</action> + <action type="move_directory_files"> + <source_directory>bin</source_directory> + <destination_directory>$INSTALL_DIR/bin</destination_directory> + </action> + <action type="set_environment"> + <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> + <!-- Now we can access DrL tools like truncate (at the expense of GNU truncate) --> + </action> + </actions> + </install> + <readme> + This installs the latest DrL Graph Layout tool from Adam Novak's Bitbucket, because Shawn Martin has stopped maintaining it. + </readme> + </package> +</tool_dependency>
--- a/hexagram/tool_dependency.xml Tue Jun 11 18:26:25 2013 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -<?xml version="1.0"?> -<!-- -Defines how to install the binaries that this tool depends on (in this case, DrL). -Based on the examples at http://wiki.galaxyproject.org/ToolShedToolFeatures -and http://toolshed.g2.bx.psu.edu/repos/jjohnson/defuse/file/f65857c1b92e/tool_dependencies.xml ---> -<tool_dependency> - <package name="drl-graph-layout" version="1.1"> - <install version="1.0"><!-- This is the install tag version, not the package version --> - <actions> - <action type="shell_command">hg clone https://bitbucket.org/adam_novak/drl-graph-layout</action> - <!-- - TODO: We're supposed to copy the right Configuration.mk file. - Not doing so assumes our system is GNU - --> - <action type="shell_command">hg up -r drl-graph-layout-1.1</action> - <action type="shell_command">make</action> - <action type="move_directory_files"> - <source_directory>bin</source_directory> - <destination_directory>$INSTALL_DIR/bin</destination_directory> - </action> - <action type="set_environment"> - <environment_variable name="PATH" action="prepend_to">$INSTALL_DIR/bin</environment_variable> - <!-- Now we can access DrL tools like truncate (at the expense of GNU truncate) --> - </action> - </actions> - </install> - <readme> - This installs the latest DrL Graph Layout tool from Adam Novak's Bitbucket, because Shawn Martin has stopped maintaining it. - </readme> - </package> -</tool_dependency>