Mercurial > repos > hackdna > pathprint
changeset 0:0cebe436a553 draft default tip
Initial upload.
author | hackdna |
---|---|
date | Fri, 17 May 2013 14:29:33 -0400 |
parents | |
children | |
files | README consensus.r consensus.xml distance.r distance.xml fingerprint.r fingerprint.xml galaxy-pathprint.r heatmap.r heatmap.xml histogram.r histogram.xml similar-experiments.r similar-experiments.xml |
diffstat | 14 files changed, 1045 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,21 @@ +Each of these files describes the interface for one of the tools in Galaxy: + + * fingerprint.xml - Create Pathway Fingerprint + * distance.xml - Calculate Distance to Consensus + * similar-experiments.xml - Find Similar Experiments in GEO + * histogram.xml - Generate a Histogram + * heatmap.xml - Generate a Heatmap + +Each interface calls a corresponding R file that performs the actions: + + * fingerprint.r + * distance.r + * similar-experiments.r + * histogram.r + * heatmap.r + +Shared fuctions are kept in this file: + + * galaxy-pathprint.r + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,32 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + stop("Usage: consensus.r <input> <threshold> <output>", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +threshold <- ifelse(! is.na(args[2]), as.numeric(args[2]), usage()) +output <- ifelse(! is.na(args[3]), args[3], usage()) + +fingerprint <- loadFingerprint(input); +consensus <- consensusFingerprint(fingerprint, threshold); + +saveConsensus(consensus, output); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,43 @@ +<tool id="pathprint-consensus" name="Create a Consensus"> + <command>consensus.r $input $threshold $output</command> + + <inputs> + <param name="input" type="data" label="Pathway Fingerprint" help="Submit a Pathway Fingerprint file from your history"/> + + <param name="threshold" type="select" label="Threshold" help="Select a threshold value"> + <option value="0.9">0.9</option> + <option value="0.8" selected="true">0.8 (default)</option> + <option value="0.7">0.7</option> + <option value="0.6">0.6</option> + <option value="0.5">0.5</option> + </param> + </inputs> + + <outputs> + <data format="tabular" name="output" label="Consensus Fingerprint"/> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + +**What it does** + +This tool identifies the most prevalent PathPrint value for each pathway across the submitted samples and produces a consensus PathPrint. + +The method calculates the mean value of the pathway activities across samples, and then applies the threshold such that if the mean is greater than threshold, the pathway is assigned a consensus of +1. If the mean is less than -threshold, a value of -1 is assigned. Otherwise, the consensus is assigned a value of 0. + +----- + +**Input:** PathPrint fingerprint for multiple samples and a threshold value. + +**Output:** A single consensus PathPrint fingerprint representing the most commonly occurring value + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/distance.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,33 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + stop("Usage: distance.r <input> <output> <output2>", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) +output2 <- ifelse(! is.na(args[3]), args[3], usage()) + +fingerprint <- loadFingerprint(input); +distance <- calculateDistanceToPluripotent(fingerprint, verbose = 0); +saveDistance(distance, output); + +generateHistograms(distance, output2); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/distance.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,44 @@ +<tool id="pathprint-distance" name="Find Distance to Pluripotent Signature"> + +<!-- +<description>Find Distance to Pluripotent Signature</description> +--> + + <command>distance.r $input $output $output2</command> + + <inputs> + <param name="input" type="data" label="Pathway Fingerprint" help="Submit a PathPrint Fingerprint file from your history to determine how similar your experiment is to pluripotent samples in GEO"/> + </inputs> + + <outputs> + <data format="tabular" name="output" label="Distance from pluripotent consensus"/> + <data format="pdf" name="output2" label="Histogram"/> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + +**What it does** + +This tool takes the computed PathPrint Fingerprint as input and uses it to determine how similar +your data are to a consensus signature generated from a set of pluripotent samples in GEO. + +----- + +**Input**: A PathPrint Fingerprint file from your history + +**Output**: + +1. Distance from pluripotent consensus: tab delimited file containing distance and p-value for each sample from the pluripotent consensus signature. + +2. Histogram: PDF file containing two histograms: (1) distribution of distances for ~180,000 GEO samples from the pluripotent signature; (2) distribution of distances for the input samples from the pluripotent signature. + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + </help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fingerprint.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(affy) +library(GEOquery); +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + sink(stderr(), type = "message"); + stop("Usage: fingerprint.r [ARGS]", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +type <- ifelse(! is.na(args[1]), args[1], usage()) + +if (type == "geo") { + geoID <- ifelse(! is.na(args[2]), args[2], usage()); + output <- ifelse(! is.na(args[3]), args[3], usage()); + + fingerprint <- generateFingerprint(geoID); +} else if (type == "cel") { + input <- ifelse(! is.na(args[2]), args[2], usage()); + output <- ifelse(! is.na(args[3]), args[3], usage()); + + fingerprint <- loadFingerprintFromCELFile(input); +} else if (type == "expr") { + input <- ifelse(! is.na(args[2]), args[2], usage()); + platform <- ifelse(! is.na(args[3]), args[3], usage()); + output <- ifelse(! is.na(args[4]), args[4], usage()); + + fingerprint <- loadFingerprintFromExprsFile(input, platform); +} else { + usage(); +}; + +saveFingerprint(fingerprint, output); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fingerprint.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,126 @@ +<tool id="pathprint-fingerprint" name="Create Pathway Fingerprint"> + +<!-- +<description>Create Pathway Fingerprint.</description> +--> + +<command> +fingerprint.r +#if $query.type == "geo" +geo $query.input $output +#elif $query.type == "cel" +cel $query.input $output +#elif $query.type == "expr" +expr $query.input $query.platform $output +#end if +<!-- +> /dev/null 2>&1 +--> +</command> + +<inputs> + <conditional name="query"> + <param name="type" type="select" label="Query source" help="Select an expression matrix from your User History, or enter a GEO identifier to automatically download an experiment from GEO."> + <option value="geo">GEO ID</option> +<!-- <option value="cel">CEL File</option> --> + <option value="expr" selected="true">Expression Matrix</option> + </param> + + <when value="geo"> + <param name="input" size="10" type="text" value="GSM99618" label="GEO ID" help="GSM and GSE identifiers are supported. E.g. GSM99618 or GSE4416. Please see below for the list of supported platforms."> + <validator type="regex" message="The provided GEO ID does not appear to be valid.">^(GSM|GSE|gsm|gse)\d+$</validator> + </param> + </when> + +<!-- shs 042412: no affycel file type available in Upload File --> +<!-- <when value="cel"> + <param name="input" type="data" label="CEL file" help="Submit an Affymetrix CEL file from your history"/> + </when> --> + + <when value="expr"> + <param name="input" type="data" label="Expression Matrix" help="The expression matrix is required to have the following format: columns = samples; rows = normalized expression values." /> + + <param name="platform" type="select" label="Platform" help="Select the array platform. Only those in the dropdown list are supported."> + <option value="GPL72">GPL72 - Affymetrix Drosophila Genome Array</option> + <option value="GPL81">GPL81 - Affymetrix Mouse Genome U74A Version 2 Array</option> + <option value="GPL85">GPL85 - Affymetrix Rat Genome U34 Array</option> + <option value="GPL91">GPL91 - Affymetrix Human Genome U95A Array</option> + <option value="GPL96">GPL96 - Affymetrix Human Genome U133A Array</option> + <option value="GPL200">GPL200 - Affymetrix C. elegans Genome Array</option> + <option value="GPL339">GPL339 - Affymetrix Mouse Expression 430A Array</option> + <option value="GPL341">GPL341 - Affymetrix Rat Expression 230A Array</option> + <option value="GPL570">GPL570 - Affymetrix Human Genome U133 Plus 2.0 Array</option> + <option value="GPL571">GPL571 - Affymetrix Human Genome U133A 2.0 Array</option> + <option value="GPL1261">GPL1261 - Affymetrix Mouse Genome 430 2.0 Array</option> + <option value="GPL1319">GPL1319 - Affymetrix Zebrafish Genome Array</option> + <option value="GPL1322">GPL1322 - Affymetrix Drosophila Genome 2.0 Array</option> + <option value="GPL1355">GPL1355 - Affymetrix Rat Genome 230 2.0 Array</option> + <option value="GPL2700">GPL2700 - Sentrix HumanRef-8 Expression BeadChip</option> + <option value="GPL2986">GPL2986 - ABI Human Genome Survey Microarray Version 2</option> + <option value="GPL2995">GPL2995 - ABI Mouse Genome Survey Microarray</option> + <option value="GPL3921">GPL3921 - Affymetrix HT Human Genome U133A Array</option> + <option value="GPL4685">GPL4685 - Affymetrix GeneChip HT-HG_U133A Early Access Array</option> + <option value="GPL6102">GPL6102 - Illumina human-6 v2.0 expression beadchip</option> + <option value="GPL6103">GPL6103 - Illumina mouseRef-8 v1.1 expression beadchip</option> + <option value="GPL6104">GPL6104 - Illumina humanRef-8 v2.0 expression beadchip</option> + <option value="GPL6105">GPL6105 - Illumina mouse-6 v1.1 expression beadchip</option> + <option value="GPL6244">GPL6244 - Affymetrix Human Gene 1.0 ST Array [transcript (gene) version]</option> + <option value="GPL6246">GPL6246 - Affymetrix Mouse Gene 1.0 ST Array [transcript (gene) version]</option> + <option value="GPL6333">GPL6333 - Illumina Mouse Ref-6 V1</option> + <option value="GPL6883">GPL6883 - Illumina HumanRef-8 v3.0 expression beadchip</option> + <option value="GPL6884">GPL6884 - Illumina HumanWG-6 v3.0 expression beadchip</option> + <option value="GPL6885">GPL6885 - Illumina MouseRef-8 v2.0 expression beadchip</option> + <option value="GPL6887">GPL6887 - Illumina MouseWG-6 v2.0 expression beadchip</option> + <option value="GPL6947">GPL6947 - Illumina HumanHT-12 v3.0 expression beadchip</option> + <option value="GPL8300">GPL8300 - Affymetrix Human Genome U95 Version 2 Array</option> + <option value="GPL8321">GPL8321 - Affymetrix Mouse Genome 430A 2.0 Array</option> + </param> + </when> + </conditional> +</inputs> + +<outputs> + <data format="tabular" name="output" label="PathPrint fingerprint"/> +</outputs> + +<stdio> + <exit_code range="1:" level="fatal" /> +</stdio> + +<help> + +**What it does** + +This tool computes a functional pathway profile for the input data. Gene expression values +for each sample are mapped to a set of 633 pathways that have been curated from KEGG, +Reactome, WikiPathways and Netpath. Pathways are assigned a value of [-1,0,1], where -1 +indicates low expression, 0 indicates average expression, and +1 indicates high expression +of the pathway relative to a background comprised of 188,502 arrays from the Gene +Expression Omnibus (GEO). + +.. class:: infomark + +Be sure to select the appropriate platform for your experiment or you may get spurious results. A list of supported platforms is available at the http://compbio.sph.harvard.edu/hidelab/pathprint/Platform_coverage.html. + +----- + +**Input** + +1. Expression Matrix (tabular format): columns represent samples and rows are normalized expression values for each sample. Also specify the appropriate platform (see note above). + +OR + +2. GEO series ID (GSE) or sample ID (GSM) + + +**Output** + +PathPrint Fingerprint: a tabular format file where columns represent samples and +row values are pathway activities + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. +</help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy-pathprint.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,492 @@ + +## List of arrays supported by PathPrint as of version 1.2.2 This will need to +## be updated/replaced with a better version as PathPrint's support changes. +supportedArrays <- function() { + return(matrix(c("GPL72", "DrosGenome1", "Affymetrix Drosophila Genome Array", "drosophila", + "GPL85", "RG_U34A", "Affymetrix Rat Genome U34 Array", "rat", + "GPL91", "HG_U95A", "Affymetrix Human Genome U95A Array", "human", + "GPL96", "HG-U133A", "Affymetrix Human Genome U133A Array", "human", + "GPL200", "Celegans", "Affymetrix C. elegans Genome Array", "C.elegans", + "GPL339", "MOE430A", "Affymetrix Mouse Expression 430A Array", "mouse", + "GPL341", "RAE230A", "Affymetrix Rat Expression 230A Array", "rat", + "GPL570", "HG-U133_Plus_2", "Affymetrix Human Genome U133 Plus 2.0 Array", "human", + "GPL571", "HG-U133A_2", "Affymetrix Human Genome U133A 2.0 Array", "human", + "GPL1261", "Mouse430_2", "Affymetrix Mouse Genome 430 2.0 Array", "mouse", + "GPL1319", "Zebrafish", "Affymetrix Zebrafish Genome Array", "zebrafish", + "GPL1322", "Drosophila_2", "Affymetrix Drosophila Genome 2.0 Array", "drosophila", + "GPL1355", "Rat230_2", "Affymetrix Rat Genome 230 2.0 Array", "rat", + "GPL2700", "", "Sentrix HumanRef-8 Expression BeadChip", "human", + "GPL2986", "", "ABI Human Genome Survey Microarray Version 2", "human", + "GPL2995", "", "ABI Mouse Genome Survey Microarray", "mouse", + "GPL3921", "HT_HG-U133A", "Affymetrix HT Human Genome U133A Array", "human", + "GPL4685", "U133AAofAv2", "Affymetrix GeneChip HT-HG_U133A Early Access Array", "human", + "GPL6102", "", "Illumina human-6 v2.0 expression beadchip", "human", + "GPL6103", "", "Illumina mouseRef-8 v1.1 expression beadchip", "mouse", + "GPL6104", "", "Illumina humanRef-8 v2.0 expression beadchip", "human", + "GPL6105", "", "Illumina mouse-6 v1.1 expression beadchip", "mouse", + "GPL6333", "", "Illumina Mouse Ref-6 V1", "mouse", + "GPL6883", "", "Illumina HumanRef-8 v3.0 expression beadchip", "human", + "GPL6884", "", "Illumina HumanWG-6 v3.0 expression beadchip", "human", + "GPL6885", "", "Illumina MouseRef-8 v2.0 expression beadchip", "mouse", + "GPL6887", "", "Illumina MouseWG-6 v2.0 expression beadchip", "mouse", + "GPL6947", "", "Illumina HumanHT-12 V3.0 expression beadchip", "human", + "GPL8300", "HG_U95Av2", "Affymetrix Human Genome U95 Version 2 Array", "human", + "GPL8321", "Mouse430A_2", "Affymetrix Mouse Genome 430A 2.0 Array", "mouse"), + ncol = 4, byrow = TRUE)) +} + +fingerprintGSM <- function(geoID) { + if (is.existingGeoID(geoID)) { + ## If we've already got the GEO ID in the matrix, don't bother loading it + ## again, just pull out the data we need. + gsm.fingerprint <- GEO.fingerprint.matrix[,geoID] + } else { + ## We weren't able to find the GEO ID in the matrix so try to load all the + ## data we need from NCBI. + try(gsm <- getGEO(geoID, GSElimits = NULL), silent = FALSE) + if (! exists("gsm")) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID)) + } + + gsm.exprs <- Table(gsm) + gsm.platform <- Meta(gsm)$platform_id + gsm.species <- Meta(gsm)$organism_ch1 + + ## Compute the fingerprint for the GSM id. + gsm.fingerprint <- exprs2fingerprint(exprs = gsm.exprs, + platform = gsm.platform, + species = gsm.species, + progressBar = FALSE) + } + + ## Convert the fingerprint to a data frame. + gsm.fingerprint <- as.data.frame(gsm.fingerprint) + + ## Tag the fingerprint as having been calculated on a GSM id. + attr(gsm.fingerprint, 'fingerprintType') <- 'GSM' + attr(gsm.fingerprint, 'fingerprintGEO') <- geoID + + return(gsm.fingerprint) +} + +fingerprintGSE <- function(geoID) { + if (is.existingGeoID(geoID) && 0) { + ## If we've already got the GEO ID in the matrix, don't bother loading it + ## again, just pull out the data we need. + gse <- na.omit(GEO.metadata.matrix[GEO.metadata.matrix$GSE == geoID,]) + gse.gsm <- gse[,"GSM"] + + gse.fingerprint <- GEO.fingerprint.matrix[,gse.gsm] + } else { + try(gse <- getGEO(geoID), silent = FALSE) + if (! exists("gse")) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID)) + } else { + print(gse) + } + + gse.exprs <- exprs(gse[[1]]) + gse.platform <- annotation(gse[[1]]) + gse.species <- as.character(unique(phenoData(gse[[1]])$organism_ch1)) + + ## Compute the fingerprint for the GSE id. + gse.fingerprint <- exprs2fingerprint(exprs = gse.exprs, + platform = gse.platform, + species = gse.species, + progressBar = FALSE) + } + + ## Tag the fingerprint as having been calculated on a GSE id. + attr(gse.fingerprint, 'fingerprintType') <- 'GSE' + attr(gse.fingerprint, 'fingerprintGEO') <- geoID + + return(gse.fingerprint) +} + +generateFingerprint <- function(geoID) { + if (is.geoID(geoID)) { + if (is.geoGSM(geoID)) { + return(fingerprintGSM(geoID)) + } else { + return(fingerprintGSE(geoID)) + } + } else { + sink(stderr(), type = "message"); + stop(paste("not a GEO ID:", geoID)) + } +} + +loadFingerprint <- function(file) { + fingerprint <- read.delim(file, header = TRUE, sep = "\t"); + return(fingerprint); +} + +saveFingerprint <- function(fingerprint, file) { + if (! is.null(attr(fingerprint, 'fingerprintType'))) { + if ((attr(fingerprint, 'fingerprintType') == 'GSE')) { + data <- fingerprint[,1:ncol(fingerprint)] + cols <- colnames(fingerprint); + + write.table(data, row.names = TRUE, + col.names = cols, + file = file, + quote = FALSE, + sep = "\t") + } else { + data <- data.frame(fingerprint[,1]) + rownames(data) <- rownames(fingerprint) + write.table(data, row.names = TRUE, + col.names = attr(fingerprint, 'fingerprintGEO'), + file = file, + quote = FALSE, + sep = "\t") + } + } else { + sink(stderr(), type = "message"); + stop("Unable to save fingerprint: unknown fingerprint type") + } +} + +saveConsensus <- function(consensus, file) { + write.table(consensus, row.names = TRUE, + col.names = TRUE, + file = file, + quote = FALSE, + sep = "\t") +} + +calculateDistanceToGEO <- function(consensus) { +# sample from matrix to speed it up +# sample <- sample(dim(GEO.fingerprint.matrix)[2], 10000, replace=FALSE) +# GEO.distance <- consensusDistance(consensus, GEO.fingerprint.matrix[,sample]) + GEO.distance <- consensusDistance(consensus, GEO.fingerprint.matrix) + + similar.GEO <- GEO.metadata.matrix[ + match(rownames(GEO.distance), GEO.metadata.matrix$GSM), + c("GSM", "GSE", "GPL", "Source")] + + similar.GEO <- cbind(similar.GEO[1:sum(GEO.distance$pvalue < 0.01),], + GEO.distance[1:sum(GEO.distance$pvalue < 0.01),]) + return(similar.GEO) + +} + +calculateDistanceToPluripotent <- function(fingerprint, verbose = 0) { + #if (ncol(fingerprint) > 1) { + if (verbose) print("Calculating distances from pluripotent consensus for multiple sample") + ## Calculate the distance from the pluripotent consensus for a multiple sample. + pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9) + fingerprint.distance <- consensusDistance(pluripotent.consensus, fingerprint) + if(ncol(fingerprint)==1) { + rownames(fingerprint.distance) <- colnames(fingerprint) + } + #} else { + # if (verbose) print("Calculating distances from pluripotent consensus for single sample") + # ## Calculate the distance from the pluripotent consensus for a single sample. + # pluripotent <- GEO.fingerprint.matrix[,pluripotents.frame$GSM] + # fingerprint.distance <- data.frame(consensusDistance(fingerprint, pluripotent)) + #} + + return(fingerprint.distance) +} + +saveDistance <- function(distance, file, pvalue = 1) { + write.table(distance, row.names = TRUE, + col.names = TRUE, + file = file, + quote = FALSE, + sep = "\t") +} + +loadDistance <- function(file, pvalue = 1) { + distance <- read.delim(file, header = TRUE, sep = "\t"); + return(distance); +} + +generateHistograms <- function(distance, filename) { + if (nrow(distance) == 0) { + sink(stderr(), type = "message"); + stop("Unable to generate histogram: no distances to plot - try increasing p-value?") + } + + pdf(filename, width = 8, height = 8) + + xlab1 <- "Distance of GEO records from pluripotent consensus" + xlab2 <- "Distance of submitted data from pluripotent consensus" + + ## Calculate the distance of the pluripotent to GEO. + pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9) + geo.distance = calculateDistanceToGEO(pluripotent.consensus) + + ## Output the first histogram. + par(oma = c(5, 2, 2, 2), mfcol = c(2,1), mar = c(0, 4, 4, 2)) + hist(geo.distance[,"distance"], col ="grey", + main = "", + nclass = 50, + xlab = xlab1, + xlim = c(0,1)) + + ## Output the second histogram. + par(mar = c(7, 4, 4, 2)) + hist(distance[,"distance"], cex.lab = 0.8, + col = "green", + main = "", + xlab = xlab2, + xlim = c(0,1)) + mtext(cex = 0.8, + line = 0.5, + side = 3, + text = "Distance of GEO records from pluripotent consensus") + + invisible(dev.off()) +} + +generateHeatmap <- function(fingerprint, sdev, filename) { + + if(ncol(fingerprint) <= 1) { + sink(stderr(), type = "message"); + stop("unable to generate heatmap on fingerprint with single column. Requires a dataset with multiple samples.", + call. = FALSE) + } + + heatmap.data <- fingerprint[apply(fingerprint, 1, sd) > sdev, ] + if(dim(heatmap.data)[1] < 2) { + sink(stderr(), type="message"); + stop("unable to generate heatmap - try lowering the standard deviation cutoff", + call. = FALSE) + } + + library(pheatmap) + pdf(filename, width = 8, height = 10) + # make sure row and column names are readable + fontsize_row = 10 - nrow(heatmap.data) / 15 + if (fontsize_row < 2) { + show_rownames = FALSE + } + else { + show_rownames = TRUE + } + fontsize_col = 10 - ncol(heatmap.data) / 15 + if (fontsize_row < 2) { + show_colnames = FALSE + } + else { + show_colnames = TRUE + } + pheatmap(as.matrix(heatmap.data), + col = c("blue", "white", "red"), + legend_breaks=c(-1,0,1), + fontsize_row=fontsize_row, + fontsize_col=fontsize_col, + cluster_cols=TRUE, + main=paste("All samples, sd=", sdev, sep=""), + show_rownames=show_rownames, + show_colnames=show_colnames + ) + invisible(dev.off()) +} + +loadFingerprintFromCELFile <- function(filename) { + ## Load the data from the provided CEL file. + tryCatch({ data <- ReadAffy(filenames = c(filename)) }, + error = function(err) { + sink(stderr(), type = "message"); + stop("unable to parse CEL file - ensure provided file is valid", + call. = FALSE) + }) + + ## Get the reported platform, if any, from the CEL file. + platform <- getPlatformFromArrayName(cdfName(data)) + if (is.null(platform)) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to determine from platform from CEL file")); + }; + if (! is.supportedPlatform(platform)) { + sink(stderr(), type = "message"); + stop(sprintf("The '%s' platform is not supported by PathPrint", platform)) + } + + ## Create the fingerprint based on the CEL file data. + cel.fingerprint <- exprs2fingerprint(exprs(rma(data)), platform, getPlatformSpecies(platform)) + + ## Convert the fingerprint to a data frame. + cel.fingerprint <- as.data.frame(cel.fingerprint) + + ## Tag the fingerprint as having been calculated on either a GSM or GSE id. + if (ncol(cel.fingerprint) == 1) { + attr(cel.fingerprint, 'fingerprintType') <- 'GSM' + } else { + attr(cel.fingerprint, 'fingerprintType') <- 'GSE' + } + attr(cel.fingerprint, 'fingerprintGEO') <- 'Unknown GEO ID' + + ## Return the fingerprint + return(cel.fingerprint) +} + +loadFingerprintFromExprsFile <- function(filename, platform) { + if (! is.supportedPlatform(platform)) + stop(sprintf("The '%s' platform is not supported by PathPrint", platform)) + + ## Load the data from the provided expression set file. + tryCatch({ data <- read.delim(filename, sep=("\t")) }, + error = function(err) { + stop(sprintf("Unable to parse expression set file: %s", err), + call. = FALSE) + }) + + species <- getPlatformSpecies(platform); + print(platform) + print(species) + + ## Create the fingerprint based on the expression set file data. + tryCatch({ exprs.fingerprint <- exprs2fingerprint(as.data.frame(data), platform, getPlatformSpecies(platform)) }, + error = function(err) { + print(err); + stop(sprintf("Expression set to fingerprint conversion failed. Please ensure the platform is correct.")) + }) + + ## Convert the fingerprint to a data frame. + exprs.fingerprint <- as.data.frame(exprs.fingerprint) + + ## Tag the fingerprint as having been calculated on either a GSM or GSE id. + if (ncol(exprs.fingerprint) == 1) { + attr(exprs.fingerprint, 'fingerprintType') <- 'GSM' + } else { + attr(exprs.fingerprint, 'fingerprintType') <- 'GSE' + } + + ## Return the fingerprint + return(exprs.fingerprint) +} + +## +## Helper functions for validating if a CEL file is supported by PathPrint +## and for accessing data about the platform or species it supports. +## + +is.existingGeoID <- function(id) { + if (is.geoID(id)) { + if (is.geoGSM(id)) { + id %in% GEO.metadata.matrix[,"GSM"] + } else { + id %in% GEO.metadata.matrix[,"GSE"] + } + } else { + NULL + } +} + +## Check to see if the provided platform is supported by PathPrint. +is.supportedPlatform <- function(platform) { + platform %in% supportedArrays()[,1] +} + +## Check to see if the provided species is supported by PathPrint. +is.supportedSpecies <- function(species) { + species %in% supportedArrays()[,4] +} + +## Given the array name, return the corresponding species. +getPlatformSpecies <- function(platform) { + if (is.supportedPlatform(platform)) { + return(supportedArrays()[grep(sprintf("^%s$", platform), (supportedArrays()[,1])), 4]) + } else { + return(NULL) + } +} + +## Lookup the platform using the array name. +getPlatformFromArrayName <- function(array) { + platform <- supportedArrays()[grep(sprintf("^%s$", array), (supportedArrays()[,2])), 1] + + if (length(platform)) + return(platform) + else + return(NULL) +} + +generateSimilarExperiments <- function(data, filename) { + sink(filename, append=FALSE, split=FALSE) + + cat("<html>\n") + cat("<head>\n") + cat("<title>Similar Experiments in GEO</title>\n") + cat(" + <style type='text/css'> + + + .data-table { + border-collapse: collapse; + width: 100%; + } + + .data-table td, th { + border: 1px solid lightslategrey; + color: #000080; + font-family: Verdana,Geneva,Arial,sans-serif; + padding: 4px; + font-size: 75%; + overflow: hidden; + text-overflow: ellipsis; + } + + .data-table th { + font-weight: bold; + } + </style>\n"); + cat("<head>\n") + cat("<body>\n") + cat("<table class='data-table'>\n") + cat("<tr>") + cat("<th>GSM ID</th>") + cat("<th>GSE ID</th>") + cat("<th>GPL ID</th>") + cat("<th>Source</th>") + cat("<th>distance</th>") + cat("<th>p-value</th>") + cat("</tr>\n") +for(i in 1:nrow(data)) { + cat("<tr>", + "<td><a href='", createGEOLink(data$GSM[i]), "' target=0>", data$GSM[i], "</a></td>", + "<td><a href='", createGEOLink(data$GSE[i]), "' target=0>", data$GSE[i], "</a></td>", + "<td><a href='", createGEOLink(data$GPL[i]), "' target=0>", data$GPL[i], "</a></td>", + "<td>", data$Source[i], "</td>", + "<td>", data$distance[i], "</td>", + "<td>", data$pvalue[i], "</td>", + "</tr>\n", sep = '') + } + + cat("</table>\n") + cat("</body\n") + cat("</html>\n") + + sink() +} + +## +## Helper functions +## + +## Create a link to the GEO ID. +createGEOLink <- function(id) { + sprintf("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s", id) +} + +## Check if the provided ID corresponds to the GEO GSM format. +is.geoGSM <- function(id) { + length(grep("^GSM\\d+$", id, ignore.case = TRUE)) > 0 +} + +## Check if the provided ID corresponds to the GEO GSE format. +is.geoGSE <- function(id) { + length(grep("^GSE\\d+$", id, ignore.case = TRUE)) > 0 +} + +## Check if the provided ID corresponds to the GEO format. +is.geoID <- function(id) { + return(is.geoGSM(id) | is.geoGSE(id)) +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/heatmap.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: heatmap.r <fingerprint> <sdev> <output>", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +sdev <- ifelse(! is.na(args[2]), as.numeric(args[2]), usage()) +output <- ifelse(! is.na(args[3]), args[3], usage()) + + +fingerprint <- loadFingerprint(input); +fingerprint <- data.frame(fingerprint[complete.cases(fingerprint),]); + +generateHeatmap(fingerprint, sdev, output); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/heatmap.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,52 @@ +<tool id="pathprint-heatmap" name="Generate a Heatmap"> + +<!-- +<description>Generate a Heatmap</description> +--> + +<command>heatmap.r $input $sdev $output</command> + +<inputs> + <param name="input" type="data" label="Pathway Fingerprint" help="Submit a Pathway Fingerprint file from your history. Note that your file must contain multiple samples"/> + <param name="sdev" type="select" label="Standard deviation" help="Display only pathways with the selected standard deviation across samples"> + <option value="0.9">0.9</option> + <option value="0.8">0.8</option> + <option value="0.7" selected="true">0.7 (default)</option> + <option value="0.6">0.6</option> + <option value="0.5">0.5</option> + </param> +</inputs> + +<outputs> + <data format="pdf" name="output" label="Heatmap"/> +</outputs> + +<stdio> + <exit_code range="1:" level="fatal" /> +</stdio> + +<help> +**What it does** + +This tool creates a heatmap showing pathways with highest variability across samples (i.e. above the selected standard deviation threshold). + +.. class:: infomark + +You must have multiple samples in your PathPrint fingerprint file to generate a heatmap. + +----- + +**Input**: A PathPrint fingerprint file containing multiple samples from your History and a standard deviation cutoff. + +Samples that are very similar to each other may not have any pathways meeting the standard deviation threhold, which will result in an error. If this occurs, try lowering the threshold. + +If too many pathways are returned, labels will not be shown. Try increasing the standard deviation cutoff. + +**Output**: PDF file showing a heatmap of the most variable pathway activities + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. +</help> +</tool> +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/histogram.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: histogram.r <input> <output>", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) + +distance <- loadDistance(input); +generateHistograms(distance, output); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/histogram.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,21 @@ +<tool id="pathprint-histogram" name="Generate a Histogram"> + + <command>histogram.r $input $output</command> + + <inputs> + <param name="input" type="data" label="Consensus Distance" help="Submit a Consensus Distance file from your history"/> + </inputs> + + <outputs> + <data format="pdf" name="output" label="Histogram"/> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/similar-experiments.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: similar_experiments.r <input> <output>", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) + +threshold <- 0.8 + +fingerprint <- loadFingerprint(input); +consensus <- consensusFingerprint(fingerprint, threshold); +distance <- calculateDistanceToGEO(consensus); + +# Display only the first 50 closest experiments +#distance <- distance[1:50,] + +generateSimilarExperiments(distance, output); + +quit("no", 0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/similar-experiments.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,38 @@ +<tool id="pathprint-similar" name="Find Similar Experiments in GEO"> + +<!-- +<description>Find Similar Experiments in GEO</description> +--> + + <command>similar-experiments.r $input $output</command> + + <inputs> + <param name="input" type="data" label="Pathway Fingerprint" help="Submit a PathPrint Fingerprint from your history"/> + </inputs> + + <outputs> + <data format="html" name="output" label="Similar Experiments in GEO"/> + </outputs> + + <stdio> + <exit_code range="1:" level="fatal" /> + </stdio> + + <help> + +**What it does** + +This tool takes a PathPrint Fingerprint as input and identifies experiments in GEO that are most similar based on a Manhattan distance. + +----- + +**Input:** PathPrint Fingerprint file from your History + +**Output:** Html page showing the most similar GEO experiments, their distance and associated p-value. Entries are linked to GEO. + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + </help> +</tool> +