# HG changeset patch # User hackdna # Date 1368815373 14400 # Node ID 0cebe436a5531c592f4501e72c9d44ab46925c04 Initial upload. diff -r 000000000000 -r 0cebe436a553 README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,21 @@ +Each of these files describes the interface for one of the tools in Galaxy: + + * fingerprint.xml - Create Pathway Fingerprint + * distance.xml - Calculate Distance to Consensus + * similar-experiments.xml - Find Similar Experiments in GEO + * histogram.xml - Generate a Histogram + * heatmap.xml - Generate a Heatmap + +Each interface calls a corresponding R file that performs the actions: + + * fingerprint.r + * distance.r + * similar-experiments.r + * histogram.r + * heatmap.r + +Shared fuctions are kept in this file: + + * galaxy-pathprint.r + + diff -r 000000000000 -r 0cebe436a553 consensus.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,32 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + stop("Usage: consensus.r ", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +threshold <- ifelse(! is.na(args[2]), as.numeric(args[2]), usage()) +output <- ifelse(! is.na(args[3]), args[3], usage()) + +fingerprint <- loadFingerprint(input); +consensus <- consensusFingerprint(fingerprint, threshold); + +saveConsensus(consensus, output); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 consensus.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/consensus.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,43 @@ + + consensus.r $input $threshold $output + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool identifies the most prevalent PathPrint value for each pathway across the submitted samples and produces a consensus PathPrint. + +The method calculates the mean value of the pathway activities across samples, and then applies the threshold such that if the mean is greater than threshold, the pathway is assigned a consensus of +1. If the mean is less than -threshold, a value of -1 is assigned. Otherwise, the consensus is assigned a value of 0. + +----- + +**Input:** PathPrint fingerprint for multiple samples and a threshold value. + +**Output:** A single consensus PathPrint fingerprint representing the most commonly occurring value + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + + diff -r 000000000000 -r 0cebe436a553 distance.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/distance.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,33 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + stop("Usage: distance.r ", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) +output2 <- ifelse(! is.na(args[3]), args[3], usage()) + +fingerprint <- loadFingerprint(input); +distance <- calculateDistanceToPluripotent(fingerprint, verbose = 0); +saveDistance(distance, output); + +generateHistograms(distance, output2); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 distance.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/distance.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,44 @@ + + + + + distance.r $input $output $output2 + + + + + + + + + + + + + + + + +**What it does** + +This tool takes the computed PathPrint Fingerprint as input and uses it to determine how similar +your data are to a consensus signature generated from a set of pluripotent samples in GEO. + +----- + +**Input**: A PathPrint Fingerprint file from your history + +**Output**: + +1. Distance from pluripotent consensus: tab delimited file containing distance and p-value for each sample from the pluripotent consensus signature. + +2. Histogram: PDF file containing two histograms: (1) distribution of distances for ~180,000 GEO samples from the pluripotent signature; (2) distribution of distances for the input samples from the pluripotent signature. + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + + diff -r 000000000000 -r 0cebe436a553 fingerprint.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fingerprint.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(affy) +library(GEOquery); +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +data(GEO.metadata.matrix); + +usage <- function() { + sink(stderr(), type = "message"); + stop("Usage: fingerprint.r [ARGS]", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +type <- ifelse(! is.na(args[1]), args[1], usage()) + +if (type == "geo") { + geoID <- ifelse(! is.na(args[2]), args[2], usage()); + output <- ifelse(! is.na(args[3]), args[3], usage()); + + fingerprint <- generateFingerprint(geoID); +} else if (type == "cel") { + input <- ifelse(! is.na(args[2]), args[2], usage()); + output <- ifelse(! is.na(args[3]), args[3], usage()); + + fingerprint <- loadFingerprintFromCELFile(input); +} else if (type == "expr") { + input <- ifelse(! is.na(args[2]), args[2], usage()); + platform <- ifelse(! is.na(args[3]), args[3], usage()); + output <- ifelse(! is.na(args[4]), args[4], usage()); + + fingerprint <- loadFingerprintFromExprsFile(input, platform); +} else { + usage(); +}; + +saveFingerprint(fingerprint, output); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 fingerprint.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fingerprint.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,126 @@ + + + + + +fingerprint.r +#if $query.type == "geo" +geo $query.input $output +#elif $query.type == "cel" +cel $query.input $output +#elif $query.type == "expr" +expr $query.input $query.platform $output +#end if + + + + + + + + + + + + + + ^(GSM|GSE|gsm|gse)\d+$ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool computes a functional pathway profile for the input data. Gene expression values +for each sample are mapped to a set of 633 pathways that have been curated from KEGG, +Reactome, WikiPathways and Netpath. Pathways are assigned a value of [-1,0,1], where -1 +indicates low expression, 0 indicates average expression, and +1 indicates high expression +of the pathway relative to a background comprised of 188,502 arrays from the Gene +Expression Omnibus (GEO). + +.. class:: infomark + +Be sure to select the appropriate platform for your experiment or you may get spurious results. A list of supported platforms is available at the http://compbio.sph.harvard.edu/hidelab/pathprint/Platform_coverage.html. + +----- + +**Input** + +1. Expression Matrix (tabular format): columns represent samples and rows are normalized expression values for each sample. Also specify the appropriate platform (see note above). + +OR + +2. GEO series ID (GSE) or sample ID (GSM) + + +**Output** + +PathPrint Fingerprint: a tabular format file where columns represent samples and +row values are pathway activities + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + + diff -r 000000000000 -r 0cebe436a553 galaxy-pathprint.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/galaxy-pathprint.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,492 @@ + +## List of arrays supported by PathPrint as of version 1.2.2 This will need to +## be updated/replaced with a better version as PathPrint's support changes. +supportedArrays <- function() { + return(matrix(c("GPL72", "DrosGenome1", "Affymetrix Drosophila Genome Array", "drosophila", + "GPL85", "RG_U34A", "Affymetrix Rat Genome U34 Array", "rat", + "GPL91", "HG_U95A", "Affymetrix Human Genome U95A Array", "human", + "GPL96", "HG-U133A", "Affymetrix Human Genome U133A Array", "human", + "GPL200", "Celegans", "Affymetrix C. elegans Genome Array", "C.elegans", + "GPL339", "MOE430A", "Affymetrix Mouse Expression 430A Array", "mouse", + "GPL341", "RAE230A", "Affymetrix Rat Expression 230A Array", "rat", + "GPL570", "HG-U133_Plus_2", "Affymetrix Human Genome U133 Plus 2.0 Array", "human", + "GPL571", "HG-U133A_2", "Affymetrix Human Genome U133A 2.0 Array", "human", + "GPL1261", "Mouse430_2", "Affymetrix Mouse Genome 430 2.0 Array", "mouse", + "GPL1319", "Zebrafish", "Affymetrix Zebrafish Genome Array", "zebrafish", + "GPL1322", "Drosophila_2", "Affymetrix Drosophila Genome 2.0 Array", "drosophila", + "GPL1355", "Rat230_2", "Affymetrix Rat Genome 230 2.0 Array", "rat", + "GPL2700", "", "Sentrix HumanRef-8 Expression BeadChip", "human", + "GPL2986", "", "ABI Human Genome Survey Microarray Version 2", "human", + "GPL2995", "", "ABI Mouse Genome Survey Microarray", "mouse", + "GPL3921", "HT_HG-U133A", "Affymetrix HT Human Genome U133A Array", "human", + "GPL4685", "U133AAofAv2", "Affymetrix GeneChip HT-HG_U133A Early Access Array", "human", + "GPL6102", "", "Illumina human-6 v2.0 expression beadchip", "human", + "GPL6103", "", "Illumina mouseRef-8 v1.1 expression beadchip", "mouse", + "GPL6104", "", "Illumina humanRef-8 v2.0 expression beadchip", "human", + "GPL6105", "", "Illumina mouse-6 v1.1 expression beadchip", "mouse", + "GPL6333", "", "Illumina Mouse Ref-6 V1", "mouse", + "GPL6883", "", "Illumina HumanRef-8 v3.0 expression beadchip", "human", + "GPL6884", "", "Illumina HumanWG-6 v3.0 expression beadchip", "human", + "GPL6885", "", "Illumina MouseRef-8 v2.0 expression beadchip", "mouse", + "GPL6887", "", "Illumina MouseWG-6 v2.0 expression beadchip", "mouse", + "GPL6947", "", "Illumina HumanHT-12 V3.0 expression beadchip", "human", + "GPL8300", "HG_U95Av2", "Affymetrix Human Genome U95 Version 2 Array", "human", + "GPL8321", "Mouse430A_2", "Affymetrix Mouse Genome 430A 2.0 Array", "mouse"), + ncol = 4, byrow = TRUE)) +} + +fingerprintGSM <- function(geoID) { + if (is.existingGeoID(geoID)) { + ## If we've already got the GEO ID in the matrix, don't bother loading it + ## again, just pull out the data we need. + gsm.fingerprint <- GEO.fingerprint.matrix[,geoID] + } else { + ## We weren't able to find the GEO ID in the matrix so try to load all the + ## data we need from NCBI. + try(gsm <- getGEO(geoID, GSElimits = NULL), silent = FALSE) + if (! exists("gsm")) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID)) + } + + gsm.exprs <- Table(gsm) + gsm.platform <- Meta(gsm)$platform_id + gsm.species <- Meta(gsm)$organism_ch1 + + ## Compute the fingerprint for the GSM id. + gsm.fingerprint <- exprs2fingerprint(exprs = gsm.exprs, + platform = gsm.platform, + species = gsm.species, + progressBar = FALSE) + } + + ## Convert the fingerprint to a data frame. + gsm.fingerprint <- as.data.frame(gsm.fingerprint) + + ## Tag the fingerprint as having been calculated on a GSM id. + attr(gsm.fingerprint, 'fingerprintType') <- 'GSM' + attr(gsm.fingerprint, 'fingerprintGEO') <- geoID + + return(gsm.fingerprint) +} + +fingerprintGSE <- function(geoID) { + if (is.existingGeoID(geoID) && 0) { + ## If we've already got the GEO ID in the matrix, don't bother loading it + ## again, just pull out the data we need. + gse <- na.omit(GEO.metadata.matrix[GEO.metadata.matrix$GSE == geoID,]) + gse.gsm <- gse[,"GSM"] + + gse.fingerprint <- GEO.fingerprint.matrix[,gse.gsm] + } else { + try(gse <- getGEO(geoID), silent = FALSE) + if (! exists("gse")) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID)) + } else { + print(gse) + } + + gse.exprs <- exprs(gse[[1]]) + gse.platform <- annotation(gse[[1]]) + gse.species <- as.character(unique(phenoData(gse[[1]])$organism_ch1)) + + ## Compute the fingerprint for the GSE id. + gse.fingerprint <- exprs2fingerprint(exprs = gse.exprs, + platform = gse.platform, + species = gse.species, + progressBar = FALSE) + } + + ## Tag the fingerprint as having been calculated on a GSE id. + attr(gse.fingerprint, 'fingerprintType') <- 'GSE' + attr(gse.fingerprint, 'fingerprintGEO') <- geoID + + return(gse.fingerprint) +} + +generateFingerprint <- function(geoID) { + if (is.geoID(geoID)) { + if (is.geoGSM(geoID)) { + return(fingerprintGSM(geoID)) + } else { + return(fingerprintGSE(geoID)) + } + } else { + sink(stderr(), type = "message"); + stop(paste("not a GEO ID:", geoID)) + } +} + +loadFingerprint <- function(file) { + fingerprint <- read.delim(file, header = TRUE, sep = "\t"); + return(fingerprint); +} + +saveFingerprint <- function(fingerprint, file) { + if (! is.null(attr(fingerprint, 'fingerprintType'))) { + if ((attr(fingerprint, 'fingerprintType') == 'GSE')) { + data <- fingerprint[,1:ncol(fingerprint)] + cols <- colnames(fingerprint); + + write.table(data, row.names = TRUE, + col.names = cols, + file = file, + quote = FALSE, + sep = "\t") + } else { + data <- data.frame(fingerprint[,1]) + rownames(data) <- rownames(fingerprint) + write.table(data, row.names = TRUE, + col.names = attr(fingerprint, 'fingerprintGEO'), + file = file, + quote = FALSE, + sep = "\t") + } + } else { + sink(stderr(), type = "message"); + stop("Unable to save fingerprint: unknown fingerprint type") + } +} + +saveConsensus <- function(consensus, file) { + write.table(consensus, row.names = TRUE, + col.names = TRUE, + file = file, + quote = FALSE, + sep = "\t") +} + +calculateDistanceToGEO <- function(consensus) { +# sample from matrix to speed it up +# sample <- sample(dim(GEO.fingerprint.matrix)[2], 10000, replace=FALSE) +# GEO.distance <- consensusDistance(consensus, GEO.fingerprint.matrix[,sample]) + GEO.distance <- consensusDistance(consensus, GEO.fingerprint.matrix) + + similar.GEO <- GEO.metadata.matrix[ + match(rownames(GEO.distance), GEO.metadata.matrix$GSM), + c("GSM", "GSE", "GPL", "Source")] + + similar.GEO <- cbind(similar.GEO[1:sum(GEO.distance$pvalue < 0.01),], + GEO.distance[1:sum(GEO.distance$pvalue < 0.01),]) + return(similar.GEO) + +} + +calculateDistanceToPluripotent <- function(fingerprint, verbose = 0) { + #if (ncol(fingerprint) > 1) { + if (verbose) print("Calculating distances from pluripotent consensus for multiple sample") + ## Calculate the distance from the pluripotent consensus for a multiple sample. + pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9) + fingerprint.distance <- consensusDistance(pluripotent.consensus, fingerprint) + if(ncol(fingerprint)==1) { + rownames(fingerprint.distance) <- colnames(fingerprint) + } + #} else { + # if (verbose) print("Calculating distances from pluripotent consensus for single sample") + # ## Calculate the distance from the pluripotent consensus for a single sample. + # pluripotent <- GEO.fingerprint.matrix[,pluripotents.frame$GSM] + # fingerprint.distance <- data.frame(consensusDistance(fingerprint, pluripotent)) + #} + + return(fingerprint.distance) +} + +saveDistance <- function(distance, file, pvalue = 1) { + write.table(distance, row.names = TRUE, + col.names = TRUE, + file = file, + quote = FALSE, + sep = "\t") +} + +loadDistance <- function(file, pvalue = 1) { + distance <- read.delim(file, header = TRUE, sep = "\t"); + return(distance); +} + +generateHistograms <- function(distance, filename) { + if (nrow(distance) == 0) { + sink(stderr(), type = "message"); + stop("Unable to generate histogram: no distances to plot - try increasing p-value?") + } + + pdf(filename, width = 8, height = 8) + + xlab1 <- "Distance of GEO records from pluripotent consensus" + xlab2 <- "Distance of submitted data from pluripotent consensus" + + ## Calculate the distance of the pluripotent to GEO. + pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9) + geo.distance = calculateDistanceToGEO(pluripotent.consensus) + + ## Output the first histogram. + par(oma = c(5, 2, 2, 2), mfcol = c(2,1), mar = c(0, 4, 4, 2)) + hist(geo.distance[,"distance"], col ="grey", + main = "", + nclass = 50, + xlab = xlab1, + xlim = c(0,1)) + + ## Output the second histogram. + par(mar = c(7, 4, 4, 2)) + hist(distance[,"distance"], cex.lab = 0.8, + col = "green", + main = "", + xlab = xlab2, + xlim = c(0,1)) + mtext(cex = 0.8, + line = 0.5, + side = 3, + text = "Distance of GEO records from pluripotent consensus") + + invisible(dev.off()) +} + +generateHeatmap <- function(fingerprint, sdev, filename) { + + if(ncol(fingerprint) <= 1) { + sink(stderr(), type = "message"); + stop("unable to generate heatmap on fingerprint with single column. Requires a dataset with multiple samples.", + call. = FALSE) + } + + heatmap.data <- fingerprint[apply(fingerprint, 1, sd) > sdev, ] + if(dim(heatmap.data)[1] < 2) { + sink(stderr(), type="message"); + stop("unable to generate heatmap - try lowering the standard deviation cutoff", + call. = FALSE) + } + + library(pheatmap) + pdf(filename, width = 8, height = 10) + # make sure row and column names are readable + fontsize_row = 10 - nrow(heatmap.data) / 15 + if (fontsize_row < 2) { + show_rownames = FALSE + } + else { + show_rownames = TRUE + } + fontsize_col = 10 - ncol(heatmap.data) / 15 + if (fontsize_row < 2) { + show_colnames = FALSE + } + else { + show_colnames = TRUE + } + pheatmap(as.matrix(heatmap.data), + col = c("blue", "white", "red"), + legend_breaks=c(-1,0,1), + fontsize_row=fontsize_row, + fontsize_col=fontsize_col, + cluster_cols=TRUE, + main=paste("All samples, sd=", sdev, sep=""), + show_rownames=show_rownames, + show_colnames=show_colnames + ) + invisible(dev.off()) +} + +loadFingerprintFromCELFile <- function(filename) { + ## Load the data from the provided CEL file. + tryCatch({ data <- ReadAffy(filenames = c(filename)) }, + error = function(err) { + sink(stderr(), type = "message"); + stop("unable to parse CEL file - ensure provided file is valid", + call. = FALSE) + }) + + ## Get the reported platform, if any, from the CEL file. + platform <- getPlatformFromArrayName(cdfName(data)) + if (is.null(platform)) { + sink(stderr(), type = "message"); + stop(sprintf("Unable to determine from platform from CEL file")); + }; + if (! is.supportedPlatform(platform)) { + sink(stderr(), type = "message"); + stop(sprintf("The '%s' platform is not supported by PathPrint", platform)) + } + + ## Create the fingerprint based on the CEL file data. + cel.fingerprint <- exprs2fingerprint(exprs(rma(data)), platform, getPlatformSpecies(platform)) + + ## Convert the fingerprint to a data frame. + cel.fingerprint <- as.data.frame(cel.fingerprint) + + ## Tag the fingerprint as having been calculated on either a GSM or GSE id. + if (ncol(cel.fingerprint) == 1) { + attr(cel.fingerprint, 'fingerprintType') <- 'GSM' + } else { + attr(cel.fingerprint, 'fingerprintType') <- 'GSE' + } + attr(cel.fingerprint, 'fingerprintGEO') <- 'Unknown GEO ID' + + ## Return the fingerprint + return(cel.fingerprint) +} + +loadFingerprintFromExprsFile <- function(filename, platform) { + if (! is.supportedPlatform(platform)) + stop(sprintf("The '%s' platform is not supported by PathPrint", platform)) + + ## Load the data from the provided expression set file. + tryCatch({ data <- read.delim(filename, sep=("\t")) }, + error = function(err) { + stop(sprintf("Unable to parse expression set file: %s", err), + call. = FALSE) + }) + + species <- getPlatformSpecies(platform); + print(platform) + print(species) + + ## Create the fingerprint based on the expression set file data. + tryCatch({ exprs.fingerprint <- exprs2fingerprint(as.data.frame(data), platform, getPlatformSpecies(platform)) }, + error = function(err) { + print(err); + stop(sprintf("Expression set to fingerprint conversion failed. Please ensure the platform is correct.")) + }) + + ## Convert the fingerprint to a data frame. + exprs.fingerprint <- as.data.frame(exprs.fingerprint) + + ## Tag the fingerprint as having been calculated on either a GSM or GSE id. + if (ncol(exprs.fingerprint) == 1) { + attr(exprs.fingerprint, 'fingerprintType') <- 'GSM' + } else { + attr(exprs.fingerprint, 'fingerprintType') <- 'GSE' + } + + ## Return the fingerprint + return(exprs.fingerprint) +} + +## +## Helper functions for validating if a CEL file is supported by PathPrint +## and for accessing data about the platform or species it supports. +## + +is.existingGeoID <- function(id) { + if (is.geoID(id)) { + if (is.geoGSM(id)) { + id %in% GEO.metadata.matrix[,"GSM"] + } else { + id %in% GEO.metadata.matrix[,"GSE"] + } + } else { + NULL + } +} + +## Check to see if the provided platform is supported by PathPrint. +is.supportedPlatform <- function(platform) { + platform %in% supportedArrays()[,1] +} + +## Check to see if the provided species is supported by PathPrint. +is.supportedSpecies <- function(species) { + species %in% supportedArrays()[,4] +} + +## Given the array name, return the corresponding species. +getPlatformSpecies <- function(platform) { + if (is.supportedPlatform(platform)) { + return(supportedArrays()[grep(sprintf("^%s$", platform), (supportedArrays()[,1])), 4]) + } else { + return(NULL) + } +} + +## Lookup the platform using the array name. +getPlatformFromArrayName <- function(array) { + platform <- supportedArrays()[grep(sprintf("^%s$", array), (supportedArrays()[,2])), 1] + + if (length(platform)) + return(platform) + else + return(NULL) +} + +generateSimilarExperiments <- function(data, filename) { + sink(filename, append=FALSE, split=FALSE) + + cat("\n") + cat("\n") + cat("Similar Experiments in GEO\n") + cat(" + \n"); + cat("\n") + cat("\n") + cat("\n") + cat("") + cat("") + cat("") + cat("") + cat("") + cat("") + cat("") + cat("\n") +for(i in 1:nrow(data)) { + cat("", + "", + "", + "", + "", + "", + "", + "\n", sep = '') + } + + cat("
GSM IDGSE IDGPL IDSourcedistancep-value
", data$GSM[i], "", data$GSE[i], "", data$GPL[i], "", data$Source[i], "", data$distance[i], "", data$pvalue[i], "
\n") + cat("\n") + + sink() +} + +## +## Helper functions +## + +## Create a link to the GEO ID. +createGEOLink <- function(id) { + sprintf("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s", id) +} + +## Check if the provided ID corresponds to the GEO GSM format. +is.geoGSM <- function(id) { + length(grep("^GSM\\d+$", id, ignore.case = TRUE)) > 0 +} + +## Check if the provided ID corresponds to the GEO GSE format. +is.geoGSE <- function(id) { + length(grep("^GSE\\d+$", id, ignore.case = TRUE)) > 0 +} + +## Check if the provided ID corresponds to the GEO format. +is.geoID <- function(id) { + return(is.geoGSM(id) | is.geoGSE(id)) +} + diff -r 000000000000 -r 0cebe436a553 heatmap.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/heatmap.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,31 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: heatmap.r ", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +sdev <- ifelse(! is.na(args[2]), as.numeric(args[2]), usage()) +output <- ifelse(! is.na(args[3]), args[3], usage()) + + +fingerprint <- loadFingerprint(input); +fingerprint <- data.frame(fingerprint[complete.cases(fingerprint),]); + +generateHeatmap(fingerprint, sdev, output); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 heatmap.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/heatmap.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,52 @@ + + + + +heatmap.r $input $sdev $output + + + + + + + + + + + + + + + + + + + + + +**What it does** + +This tool creates a heatmap showing pathways with highest variability across samples (i.e. above the selected standard deviation threshold). + +.. class:: infomark + +You must have multiple samples in your PathPrint fingerprint file to generate a heatmap. + +----- + +**Input**: A PathPrint fingerprint file containing multiple samples from your History and a standard deviation cutoff. + +Samples that are very similar to each other may not have any pathways meeting the standard deviation threhold, which will result in an error. If this occurs, try lowering the threshold. + +If too many pathways are returned, labels will not be shown. Try increasing the standard deviation cutoff. + +**Output**: PDF file showing a heatmap of the most variable pathway activities + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + + diff -r 000000000000 -r 0cebe436a553 histogram.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/histogram.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,27 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: histogram.r ", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) + +distance <- loadDistance(input); +generateHistograms(distance, output); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 histogram.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/histogram.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,21 @@ + + + histogram.r $input $output + + + + + + + + + + + + + + + Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + + diff -r 000000000000 -r 0cebe436a553 similar-experiments.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/similar-experiments.r Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript + +#sink(file("/dev/null", "w"), type = "message"); + +library(pathprint); + +# Figure out the relative path to the galaxy-pathprint.r library. +script.args <- commandArgs(trailingOnly = FALSE); +script.name <- sub("--file=", "", script.args[grep("--file=", script.args)]) +script.base <- dirname(script.name) +library.path <- file.path(script.base, "galaxy-pathprint.r"); +source(library.path) + +usage <- function() { + stop("Usage: similar_experiments.r ", call. = FALSE) +} + +## Get the command line arguments. +args <- commandArgs(trailingOnly = TRUE) + +input <- ifelse(! is.na(args[1]), args[1], usage()) +output <- ifelse(! is.na(args[2]), args[2], usage()) + +threshold <- 0.8 + +fingerprint <- loadFingerprint(input); +consensus <- consensusFingerprint(fingerprint, threshold); +distance <- calculateDistanceToGEO(consensus); + +# Display only the first 50 closest experiments +#distance <- distance[1:50,] + +generateSimilarExperiments(distance, output); + +quit("no", 0) diff -r 000000000000 -r 0cebe436a553 similar-experiments.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/similar-experiments.xml Fri May 17 14:29:33 2013 -0400 @@ -0,0 +1,38 @@ + + + + + similar-experiments.r $input $output + + + + + + + + + + + + + + + +**What it does** + +This tool takes a PathPrint Fingerprint as input and identifies experiments in GEO that are most similar based on a Manhattan distance. + +----- + +**Input:** PathPrint Fingerprint file from your History + +**Output:** Html page showing the most similar GEO experiments, their distance and associated p-value. Entries are linked to GEO. + +----- + +Feedback is welcome. Please send your questions, comments and suggestions to scde@hsci.harvard.edu. + + +