diff galaxy-pathprint.r @ 0:0cebe436a553 draft default tip

Initial upload.
author hackdna
date Fri, 17 May 2013 14:29:33 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/galaxy-pathprint.r	Fri May 17 14:29:33 2013 -0400
@@ -0,0 +1,492 @@
+
+## List of arrays supported by PathPrint as of version 1.2.2  This will need to
+## be updated/replaced with a better version as PathPrint's support changes.
+supportedArrays <- function() {
+  return(matrix(c("GPL72",   "DrosGenome1",    "Affymetrix Drosophila Genome Array",                 "drosophila",
+                  "GPL85",   "RG_U34A",        "Affymetrix Rat Genome U34 Array",                    "rat",
+                  "GPL91",   "HG_U95A",        "Affymetrix Human Genome U95A Array",                 "human",
+                  "GPL96",   "HG-U133A",       "Affymetrix Human Genome U133A Array",                "human",
+                  "GPL200",  "Celegans",       "Affymetrix C. elegans Genome Array",                 "C.elegans",
+                  "GPL339",  "MOE430A",        "Affymetrix Mouse Expression 430A Array",             "mouse",
+                  "GPL341",  "RAE230A",        "Affymetrix Rat Expression 230A Array",               "rat",
+                  "GPL570",  "HG-U133_Plus_2", "Affymetrix Human Genome U133 Plus 2.0 Array",        "human",
+                  "GPL571",  "HG-U133A_2",     "Affymetrix Human Genome U133A 2.0 Array",            "human",
+                  "GPL1261", "Mouse430_2",     "Affymetrix Mouse Genome 430 2.0 Array",              "mouse",
+                  "GPL1319", "Zebrafish",      "Affymetrix Zebrafish Genome Array",                  "zebrafish",
+                  "GPL1322", "Drosophila_2",   "Affymetrix Drosophila Genome 2.0 Array",             "drosophila",
+                  "GPL1355", "Rat230_2",       "Affymetrix Rat Genome 230 2.0 Array",                "rat",
+                  "GPL2700", "",               "Sentrix HumanRef-8 Expression BeadChip",             "human",
+                  "GPL2986", "",               "ABI Human Genome Survey Microarray Version 2",       "human",
+                  "GPL2995", "",               "ABI Mouse Genome Survey Microarray",                 "mouse",
+                  "GPL3921", "HT_HG-U133A",    "Affymetrix HT Human Genome U133A Array",             "human",
+                  "GPL4685", "U133AAofAv2",    "Affymetrix GeneChip HT-HG_U133A Early Access Array", "human",
+                  "GPL6102", "",               "Illumina human-6 v2.0 expression beadchip",          "human",
+                  "GPL6103", "",               "Illumina mouseRef-8 v1.1 expression beadchip",       "mouse",
+                  "GPL6104", "",               "Illumina humanRef-8 v2.0 expression beadchip",       "human",
+                  "GPL6105", "",               "Illumina mouse-6 v1.1 expression beadchip",          "mouse",
+                  "GPL6333", "",               "Illumina Mouse Ref-6 V1",                            "mouse",
+                  "GPL6883", "",               "Illumina HumanRef-8 v3.0 expression beadchip",       "human",
+                  "GPL6884", "",               "Illumina HumanWG-6 v3.0 expression beadchip",        "human",
+                  "GPL6885", "",               "Illumina MouseRef-8 v2.0 expression beadchip",       "mouse",
+                  "GPL6887", "",               "Illumina MouseWG-6 v2.0 expression beadchip",        "mouse",
+                  "GPL6947", "",               "Illumina HumanHT-12 V3.0 expression beadchip",       "human",
+                  "GPL8300", "HG_U95Av2",      "Affymetrix Human Genome U95 Version 2 Array",        "human",
+                  "GPL8321", "Mouse430A_2",    "Affymetrix Mouse Genome 430A 2.0 Array",             "mouse"),
+                ncol = 4, byrow = TRUE))
+}
+
+fingerprintGSM <- function(geoID) {
+  if (is.existingGeoID(geoID)) {
+    ## If we've already got the GEO ID in the matrix, don't bother loading it
+    ## again, just pull out the data we need.
+    gsm.fingerprint <- GEO.fingerprint.matrix[,geoID]
+  } else {
+    ## We weren't able to find the GEO ID in the matrix so try to load all the
+    ## data we need from NCBI.
+    try(gsm <- getGEO(geoID, GSElimits = NULL), silent = FALSE)
+    if (! exists("gsm")) {
+      sink(stderr(), type = "message");
+      stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID))
+    }
+    
+    gsm.exprs       <- Table(gsm)
+    gsm.platform    <- Meta(gsm)$platform_id
+    gsm.species     <- Meta(gsm)$organism_ch1
+
+    ## Compute the fingerprint for the GSM id.
+    gsm.fingerprint <- exprs2fingerprint(exprs       = gsm.exprs,
+                                         platform    = gsm.platform,
+                                         species     = gsm.species,
+                                         progressBar = FALSE)
+  }
+
+  ## Convert the fingerprint to a data frame.
+  gsm.fingerprint <- as.data.frame(gsm.fingerprint)
+  
+  ## Tag the fingerprint as having been calculated on a GSM id.
+  attr(gsm.fingerprint, 'fingerprintType') <- 'GSM'
+  attr(gsm.fingerprint, 'fingerprintGEO')  <- geoID
+  
+  return(gsm.fingerprint)
+}
+
+fingerprintGSE <- function(geoID) {
+  if (is.existingGeoID(geoID) && 0) {
+    ## If we've already got the GEO ID in the matrix, don't bother loading it
+    ## again, just pull out the data we need.
+    gse             <- na.omit(GEO.metadata.matrix[GEO.metadata.matrix$GSE == geoID,])
+    gse.gsm         <- gse[,"GSM"]
+
+    gse.fingerprint <- GEO.fingerprint.matrix[,gse.gsm]
+  } else {
+    try(gse <- getGEO(geoID), silent = FALSE)
+    if (! exists("gse")) {
+      sink(stderr(), type = "message");
+      stop(sprintf("Unable to load GEO id '%s': is the GEO id valid?\n", geoID))
+    } else {
+      print(gse)
+    }
+    
+    gse.exprs       <- exprs(gse[[1]])
+    gse.platform    <- annotation(gse[[1]])
+    gse.species     <- as.character(unique(phenoData(gse[[1]])$organism_ch1))
+
+    ## Compute the fingerprint for the GSE id.
+    gse.fingerprint <- exprs2fingerprint(exprs       = gse.exprs,
+                                         platform    = gse.platform,
+                                         species     = gse.species,
+                                         progressBar = FALSE)
+  }
+
+  ## Tag the fingerprint as having been calculated on a GSE id.
+  attr(gse.fingerprint, 'fingerprintType') <- 'GSE'
+  attr(gse.fingerprint, 'fingerprintGEO')  <- geoID
+  
+  return(gse.fingerprint)
+}
+
+generateFingerprint <- function(geoID) {
+  if (is.geoID(geoID)) {
+    if (is.geoGSM(geoID)) {
+      return(fingerprintGSM(geoID))
+    } else {
+      return(fingerprintGSE(geoID))
+    }
+  } else {
+    sink(stderr(), type = "message");
+    stop(paste("not a GEO ID:", geoID))
+  }
+}
+
+loadFingerprint <- function(file) {
+  fingerprint <- read.delim(file, header = TRUE, sep = "\t");
+  return(fingerprint);
+}
+
+saveFingerprint <- function(fingerprint, file) {
+  if (! is.null(attr(fingerprint, 'fingerprintType'))) {
+    if ((attr(fingerprint, 'fingerprintType') == 'GSE')) {
+      data <- fingerprint[,1:ncol(fingerprint)]
+      cols <- colnames(fingerprint);
+
+      write.table(data, row.names = TRUE,
+      			col.names = cols,
+                        file      = file,
+                        quote     = FALSE,
+                        sep       = "\t")
+    } else {
+      	   data <- data.frame(fingerprint[,1])
+      	   rownames(data) <- rownames(fingerprint)
+      	   write.table(data, row.names = TRUE,
+                               col.names = attr(fingerprint, 'fingerprintGEO'),
+                               file      = file,
+                               quote     = FALSE,
+                               sep       = "\t")
+    }     
+  } else {
+    sink(stderr(), type = "message");
+    stop("Unable to save fingerprint: unknown fingerprint type")
+  }
+}
+
+saveConsensus <- function(consensus, file) {
+  write.table(consensus, row.names = TRUE,
+                       	 col.names = TRUE,
+                         file      = file,
+                         quote     = FALSE,
+                         sep       = "\t")
+}
+
+calculateDistanceToGEO <- function(consensus) {
+# sample from matrix to	speed it up
+# sample <- sample(dim(GEO.fingerprint.matrix)[2], 10000, replace=FALSE)
+# GEO.distance  <- consensusDistance(consensus, GEO.fingerprint.matrix[,sample])
+  GEO.distance  <- consensusDistance(consensus, GEO.fingerprint.matrix)
+
+  similar.GEO <- GEO.metadata.matrix[
+        match(rownames(GEO.distance), GEO.metadata.matrix$GSM),
+        c("GSM", "GSE", "GPL", "Source")]
+                                             
+  similar.GEO <- cbind(similar.GEO[1:sum(GEO.distance$pvalue < 0.01),], 
+                         GEO.distance[1:sum(GEO.distance$pvalue < 0.01),])
+  return(similar.GEO)
+
+}
+
+calculateDistanceToPluripotent <- function(fingerprint, verbose = 0) {
+  #if (ncol(fingerprint) > 1) {
+    if (verbose) print("Calculating distances from pluripotent consensus for multiple sample")
+    ## Calculate the distance from the pluripotent consensus for a multiple sample.
+    pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9)
+    fingerprint.distance  <- consensusDistance(pluripotent.consensus, fingerprint)
+    if(ncol(fingerprint)==1) {
+	rownames(fingerprint.distance) <- colnames(fingerprint) 
+    }    
+  #} else {
+  #  if (verbose) print("Calculating distances from pluripotent consensus for single sample")
+  #  ## Calculate the distance from the pluripotent consensus for a single sample.
+  #  pluripotent <- GEO.fingerprint.matrix[,pluripotents.frame$GSM]
+  #  fingerprint.distance <- data.frame(consensusDistance(fingerprint, pluripotent))
+  #}
+
+  return(fingerprint.distance)
+}
+
+saveDistance <- function(distance, file, pvalue = 1) {
+  write.table(distance, row.names = TRUE,
+                        col.names = TRUE,
+                        file      = file,
+                        quote     = FALSE,
+                        sep       = "\t")
+}
+
+loadDistance <- function(file, pvalue = 1) {
+  distance <- read.delim(file, header = TRUE, sep = "\t");
+  return(distance);
+}
+
+generateHistograms <- function(distance, filename) {
+  if (nrow(distance) == 0) {
+    sink(stderr(), type = "message");
+    stop("Unable to generate histogram: no distances to plot - try increasing p-value?")
+  }
+
+  pdf(filename, width = 8, height = 8)
+
+  xlab1 <- "Distance of GEO records from pluripotent consensus"
+  xlab2 <- "Distance of submitted data from pluripotent consensus"
+
+  ## Calculate the distance of the pluripotent to GEO.
+  pluripotent.consensus <- consensusFingerprint(GEO.fingerprint.matrix[,pluripotents.frame$GSM], 0.9)
+  geo.distance = calculateDistanceToGEO(pluripotent.consensus)
+  
+  ## Output the first histogram.
+  par(oma = c(5, 2, 2, 2), mfcol = c(2,1), mar = c(0, 4, 4, 2))
+  hist(geo.distance[,"distance"], col    ="grey", 
+                                  main   = "",
+                                  nclass = 50,
+                                  xlab   = xlab1,
+                                  xlim   = c(0,1))
+
+  ## Output the second histogram.
+  par(mar = c(7, 4, 4, 2))
+  hist(distance[,"distance"], cex.lab = 0.8,
+                              col     = "green",
+                              main    = "", 
+                              xlab    = xlab2,
+                              xlim    = c(0,1)) 
+  mtext(cex  = 0.8,
+        line = 0.5,
+        side = 3,
+        text = "Distance of GEO records from pluripotent consensus")
+
+  invisible(dev.off())
+}
+
+generateHeatmap <- function(fingerprint, sdev, filename) {
+
+  if(ncol(fingerprint) <= 1) {
+    sink(stderr(), type = "message");
+    stop("unable to generate heatmap on fingerprint with single column. Requires a dataset with multiple samples.",
+          call. = FALSE)
+  }
+
+  heatmap.data <- fingerprint[apply(fingerprint, 1, sd) > sdev, ]
+  if(dim(heatmap.data)[1] < 2) {
+    sink(stderr(), type="message");
+    stop("unable to generate heatmap - try lowering the standard deviation cutoff",
+	call. = FALSE)
+  }  
+
+  library(pheatmap)
+  pdf(filename, width = 8, height = 10)
+  # make sure row and column names are readable
+  fontsize_row = 10 - nrow(heatmap.data) / 15
+  if (fontsize_row < 2) {
+    show_rownames = FALSE
+  }
+  else {
+    show_rownames = TRUE
+  }
+  fontsize_col = 10 - ncol(heatmap.data) / 15
+  if (fontsize_row < 2) {
+    show_colnames = FALSE
+  }
+  else {
+    show_colnames = TRUE
+  }
+  pheatmap(as.matrix(heatmap.data),
+           col = c("blue", "white", "red"),
+        	 legend_breaks=c(-1,0,1),
+           fontsize_row=fontsize_row,
+           fontsize_col=fontsize_col,
+           cluster_cols=TRUE,
+           main=paste("All samples, sd=", sdev, sep=""),
+           show_rownames=show_rownames,
+           show_colnames=show_colnames
+  )
+  invisible(dev.off())
+}
+
+loadFingerprintFromCELFile <- function(filename) {
+  ## Load the data from the provided CEL file.
+  tryCatch({ data <- ReadAffy(filenames = c(filename)) },
+             error = function(err) {
+                       sink(stderr(), type = "message");
+                       stop("unable to parse CEL file - ensure provided file is valid",
+                            call. = FALSE)
+                     })
+
+  ## Get the reported platform, if any, from the CEL file.
+  platform <- getPlatformFromArrayName(cdfName(data))
+  if (is.null(platform)) {
+    sink(stderr(), type = "message");
+    stop(sprintf("Unable to determine from platform from CEL file"));
+  };
+  if (! is.supportedPlatform(platform)) {
+    sink(stderr(), type = "message");
+    stop(sprintf("The '%s' platform is not supported by PathPrint", platform))
+  }
+
+  ## Create the fingerprint based on the CEL file data.
+  cel.fingerprint <- exprs2fingerprint(exprs(rma(data)), platform, getPlatformSpecies(platform))
+
+  ## Convert the fingerprint to a data frame.
+  cel.fingerprint <- as.data.frame(cel.fingerprint)
+  
+  ## Tag the fingerprint as having been calculated on either a GSM or GSE id.
+  if (ncol(cel.fingerprint) == 1) {
+    attr(cel.fingerprint, 'fingerprintType') <- 'GSM'
+  } else {
+    attr(cel.fingerprint, 'fingerprintType') <- 'GSE'
+  }
+  attr(cel.fingerprint, 'fingerprintGEO') <- 'Unknown GEO ID'
+  
+  ## Return the fingerprint
+  return(cel.fingerprint)
+}
+
+loadFingerprintFromExprsFile <- function(filename, platform) {
+  if (! is.supportedPlatform(platform))
+    stop(sprintf("The '%s' platform is not supported by PathPrint", platform))
+  
+  ## Load the data from the provided expression set file.
+  tryCatch({ data <- read.delim(filename, sep=("\t")) },
+             error = function(err) {
+                       stop(sprintf("Unable to parse expression set file: %s", err),
+                            call. = FALSE)
+                     })
+
+  species <- getPlatformSpecies(platform);
+  print(platform)
+  print(species)
+
+  ## Create the fingerprint based on the expression set file data.
+  tryCatch({ exprs.fingerprint <- exprs2fingerprint(as.data.frame(data), platform, getPlatformSpecies(platform)) },
+           error = function(err) {
+	   	   		 print(err);
+                       stop(sprintf("Expression set to fingerprint conversion failed.  Please ensure the platform is correct."))
+                     })
+
+  ## Convert the fingerprint to a data frame.
+  exprs.fingerprint <- as.data.frame(exprs.fingerprint)
+
+  ## Tag the fingerprint as having been calculated on either a GSM or GSE id.
+  if (ncol(exprs.fingerprint) == 1) {
+    attr(exprs.fingerprint, 'fingerprintType') <- 'GSM'
+  } else {
+    attr(exprs.fingerprint, 'fingerprintType') <- 'GSE'
+  }
+  
+  ## Return the fingerprint
+  return(exprs.fingerprint)
+}
+
+##
+## Helper functions for validating if a CEL file is supported by PathPrint
+## and for accessing data about the platform or species it supports.
+##
+
+is.existingGeoID <- function(id) {
+  if (is.geoID(id)) {
+    if (is.geoGSM(id)) {
+      id %in% GEO.metadata.matrix[,"GSM"]
+    } else {
+      id %in% GEO.metadata.matrix[,"GSE"]
+    }
+  } else {
+    NULL
+  }
+}
+
+## Check to see if the provided platform is supported by PathPrint.
+is.supportedPlatform <- function(platform) {
+  platform %in% supportedArrays()[,1]
+}
+
+## Check to see if the provided species is supported by PathPrint.
+is.supportedSpecies <- function(species) {
+  species %in% supportedArrays()[,4]
+}
+ 
+## Given the array name, return the corresponding species.
+getPlatformSpecies <- function(platform) {
+  if (is.supportedPlatform(platform)) {
+    return(supportedArrays()[grep(sprintf("^%s$", platform), (supportedArrays()[,1])), 4])
+  } else {
+    return(NULL)
+  }
+}
+
+## Lookup the platform using the array name.
+getPlatformFromArrayName <- function(array) {
+  platform <- supportedArrays()[grep(sprintf("^%s$", array), (supportedArrays()[,2])), 1]
+
+  if (length(platform))
+    return(platform)
+  else
+    return(NULL)
+}
+
+generateSimilarExperiments <- function(data, filename) {
+  sink(filename, append=FALSE, split=FALSE)
+
+  cat("<html>\n")
+  cat("<head>\n")
+  cat("<title>Similar Experiments in GEO</title>\n")
+  cat("
+    <style type='text/css'>
+
+
+      .data-table {
+        border-collapse:       collapse;
+        width:                 100%;
+      }
+
+      .data-table td, th {
+        border:                1px solid lightslategrey;
+        color:                 #000080;
+        font-family:           Verdana,Geneva,Arial,sans-serif;
+        padding:               4px;
+        font-size:             75%;
+        overflow:              hidden;
+        text-overflow:         ellipsis;
+      }
+
+      .data-table th {
+        font-weight:           bold;
+      }
+    </style>\n");
+  cat("<head>\n")
+  cat("<body>\n")
+  cat("<table class='data-table'>\n")
+  cat("<tr>")
+  cat("<th>GSM ID</th>")
+  cat("<th>GSE ID</th>")
+  cat("<th>GPL ID</th>")
+  cat("<th>Source</th>")
+  cat("<th>distance</th>")
+  cat("<th>p-value</th>")
+  cat("</tr>\n")
+for(i in 1:nrow(data)) {
+    cat("<tr>",
+	"<td><a href='", createGEOLink(data$GSM[i]), "' target=0>", data$GSM[i], "</a></td>",
+        "<td><a href='", createGEOLink(data$GSE[i]), "' target=0>", data$GSE[i], "</a></td>",
+        "<td><a href='", createGEOLink(data$GPL[i]), "' target=0>", data$GPL[i], "</a></td>",
+        "<td>", data$Source[i], "</td>",
+        "<td>", data$distance[i], "</td>",
+        "<td>", data$pvalue[i], "</td>",
+        "</tr>\n", sep = '')
+  }
+
+  cat("</table>\n")
+  cat("</body\n")
+  cat("</html>\n")
+
+  sink()
+}
+
+##
+## Helper functions
+##
+
+## Create a link to the GEO ID.
+createGEOLink <- function(id) {
+  sprintf("http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s", id)
+}
+
+## Check if the provided ID corresponds to the GEO GSM format.
+is.geoGSM <- function(id) {
+  length(grep("^GSM\\d+$", id, ignore.case = TRUE)) > 0
+}
+
+## Check if the provided ID corresponds to the GEO GSE format.
+is.geoGSE <- function(id) {
+  length(grep("^GSE\\d+$", id, ignore.case = TRUE)) > 0
+}
+
+## Check if the provided ID corresponds to the GEO format.
+is.geoID <- function(id) {
+  return(is.geoGSM(id) | is.geoGSE(id))
+}
+