changeset 2:0b79cac1bbaa draft

Uploaded
author melpetera
date Thu, 23 Nov 2017 09:37:50 -0500
parents 53f3f87ece44
children 8fb560e858ab
files Testtest/GCMS-test_output.R pouik.txt
diffstat 1 files changed, 0 insertions(+), 450 deletions(-) [+]
line wrap: on
line diff
--- a/Testtest/GCMS-test_output.R	Thu Nov 23 09:35:25 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,450 +0,0 @@
-# author: Pauline Ribeyre
-
-
-#####################
-# required packages #
-#####################
-
-library("parallel") # provides cluster methods and "parLapplyLB" function
-library("grDevices")  # provides jpeg handling methods
-
-
-############
-#   data   #
-############
-
-update_list_of_file_names <- function() {
-  # Writes in a file the list of file names using the files created previously in ./RDatas.
-  
-  file.create(source_file_names, showWarnings = FALSE) # erase the file
-  directory <- "RDatas"
-  files <- list.files(directory, full.names = TRUE)
-  
-  for (f in files) {
-    f <- substr(f, 8, nchar(f) - 6)
-    write(f, file = source_file_names, append = TRUE)
-  }
-  
-}
-
-
-titles_to_columns <- function(indicators) {
-  # Parse the "title" column of the indicator dataframe to separate the different parameters and their values.
-  #
-  # Args:
-  #   indicators: dataframe (one row per test) containing the results.
-  #
-  # Returns:
-  #   new_indicators: copy of "indicators" with one column added for each parameter that varied.
-  
-  default_settings <- FALSE
-  
-  # parse the title
-  param_names <- c()
-  for (title in indicators$title) {
-    
-    if (title == "default_settings") {
-      default_settings <- TRUE
-      break
-    }
-    
-    else {
-      params <- strsplit(title, "_")[[1]]
-      for (param in params) {
-        name <- strsplit(param, "=")[[1]][1]
-        
-        # some lowercase values create errors because they are primitive functions
-        substr(name, 1, 1) <- toupper(substr(name, 1, 1))
-        
-        if (!name %in% param_names)
-          param_names <- c(param_names, name)
-        value <- strsplit(param, "=")[[1]][2]
-        if (!exists(name))
-          assign(name, c())
-        assign(name, c(get(name), value))
-      }
-    }
-    
-  }
-  
-  new_indicators <- indicators
-  
-  # add the columns to the dataframe
-  if (!default_settings) {
-    for (name in param_names) {
-      new_indicators <- cbind(new_indicators, get(name))
-      names(new_indicators)[ncol(new_indicators)] <- name
-    }
-    
-    # order the dataframe's columns
-    order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names)))
-    new_indicators <- new_indicators[, order]
-  }
-  
-  return (new_indicators)
-  
-}
-
-
-create_summary <- function(nb_cores, count_duplicates = FALSE) {
-  # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
-  #
-  # Args:
-  #   nb_cores: maximum number of cores to use.
-  #   count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
-  #
-  # Returns:
-  #   A dataframe (one row per test) containing the results:
-  #     title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
-# 
-#   # Returns:
-#   #   A list containing:
-#   #   1: dataframe (one row per test) containing the results:
-#   #     title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
-#   #   2: list of the settings sets for each test.
-  
-  time.start <- Sys.time() # start the timer
-  
-  file_names <- readLines(source_file_names)
-  
-  dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved
-  
-  # summ <- create_summary_parallel(1, file_names)
-  # cat("indic:\n",summary[1][[1]],"\n")
-  
-  # run the function on several cores
-  if (length(file_names) != 0) {
-    if (length(file_names) < nb_cores)
-      nb_cores <- length(file_names)
-    cluster <- makeCluster(nb_cores) #, outfile = "")
-    summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb)
-    stopCluster(cluster)
-  }
-  else
-    stop("There are no files to generate the output from.")
-  
-  # concatenate the results obtained by all the cores
-  indicators <- NULL
-  for (summary in summaries)
-    indicators <- rbind(indicators, t(summary))
-  # settings_list <- c()
-  # for (summary in summaries) {
-  #   settings <- summary[2][[1]]
-  #   indic <- summary[1][[1]]
-  # 
-  #   indicators <- rbind(indicators, t(indic))
-  #   settings_list <- c(settings_list, settings)
-  # }
-  indicators <- data.frame(indicators)
-
-  col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
-                 "f10E5to10E6", "f10E6to10E7", "f10E7")
-  
-  if (check_ions)
-    col_names <- c(col_names, ions_name)
-  
-  names(indicators)[1:length(col_names)] <- col_names
-  if (count_duplicates)
-    names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions")
-  
-  indicators <- titles_to_columns(indicators)
-
-  time.end <- Sys.time() # stop the timer
-  Tdiff <- difftime(time.end, time.start)
-  print(Tdiff)
-  
-  # return (list(indicators, settings_list))
-  return (indicators)
-  
-}
-
-
-create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) {
-  # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
-  #
-  # Args:
-  #   n: index of the current test, to select the corresponding title.
-  #   file_names:  list of titles (one for each test) (concatenation of the values taken by the varied parameters).
-  #   nb_cores: maximum number of cores to use.
-  #   count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
-  #
-  # Returns:
-  #   A dataframe (one row per test) containing the results:
-  #     title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
-  # 
-  # # Returns:
-  # #   A list containing:
-  # #   1: dataframe (one row per test) containing the results:
-  # #     title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
-  # #   2: list of the settings sets for each test.
-  
-  source(source_spectrum, environment())
-  
-  intensities_x <- c(1000, 10000, 100000, 1000000, 10000000)
-  
-  this_title <- file_names[n]
-  
-  # calculate the number of zeros
-  file_title <- paste0("Peak_tables/", this_title, ".tsv")
-  peak_table <- read.table(file_title, sep="\t", header=TRUE)
-  peak_table_values <- peak_table[,5:ncol(peak_table)]
-  zeros <- sum(peak_table$nb_zeros)
-  nb_lines <- nrow(peak_table)
-  zeros_per_line <- round(zeros/nb_lines, 4)
-  
-  # count the number of ions by intensity range
-  intensities_y <- c()
-  intensities_y[1] <- length(rowMeans(
-    peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]])
-  for (i in 2:length(intensities_x))
-    intensities_y[i] <- length(rowMeans(
-      peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] &
-                                       rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]])
-  intensities_y[i + 1] <- length(rowMeans(
-    peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]])
-  
-  # load the settings
-  file_title <- paste0("RDatas/", this_title, ".RData")
-  load(file_title)
-
-  # count the number of count_duplicates and record in a file
-  if (count_duplicates) {
-    file_title <- paste0("count_duplicates/", this_title, ".tsv")
-    nb_count_duplicates <- data.frame(count_duplicates_function(GC_results))
-    names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates")
-    write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE)
-    count_duplicates <- nrow(nb_count_duplicates)
-    
-    nb_ions <- 0
-    pseudospectra <- GC_results$PseudoSpectra
-    for (ps in pseudospectra) {
-      nb_ions <- nb_ions + nrow(ps$pspectrum)
-    }
-  }
-  
-  summary <- c(this_title, nb_lines, zeros_per_line, intensities_y)
-  
-  # check the presence of ions
-  if (nb_ions_to_check > 0) {
-    for (i in 1:nb_ions_to_check) {
-      name <- ions_name[[i]]
-      rt <- ions_rt[[i]]
-      mzs <- ions_mzs[[i]]
-      cat("Check:", name, rt, mzs, "\n")
-      
-      value <- is_ion_present(GC_results, rt = rt, mz_list = mzs)
-      assign(name, value)
-      summary <- c(summary, get(name))
-    }
-  }
-  
-  # check the presence of ion valine 12C and 13C
-  # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146))
-
-  # summary <- c(summary, valine)
-  
-  if (count_duplicates)
-    summary <- c(summary, count_duplicates, nb_ions)
-
-  # return (list(summary, settings))
-  return (summary)
-  
-}
-
-
-############
-#  graphs  #
-############
-
-ions_per_intensity <- function(indicators) {
-  # For each test, plots the number of ions for each range of intensity.
-  #
-  # Args:
-  #   indicators: dataframe (one row per test) containing the results. 
-  
-  pdf(intensity_graph_out)
-  par(mar = c(5.1,4.1,5,2.1))
-  
-  for (i in 1:nrow(indicators)) {
-    
-    indic <- indicators[i,]
-    
-    names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7")
-    intensities_y <- c()
-    
-    for (colname in names) {
-      intensities_y <- c(intensities_y, indic[colname])
-    }
-    intensities_y <- unlist(intensities_y)
-    intensities_y <- as.numeric(levels(intensities_y))[intensities_y]
-    
-    title <- as.character(indic$title)
-    title <- strsplit(title, "_")[[1]]
-    plot_title <- ""
-    for (i in 1:length(title)) {
-      plot_title <- paste(plot_title, title[[i]])
-      if (i %% 2 == 0)
-        plot_title <- paste(plot_title, "\n")
-      else
-        plot_title <- paste(plot_title, " ")
-    }
-    
-    barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions",
-            main = plot_title, cex.main = 0.8)
-  
-  }
-  
-  dev.off()
-  
-}
-
-
-graph_results <- function(indicators, criteria) {
-  # Plots the results.
-  #
-  # Args:
-  #   indicators: dataframe (one row per test) containing the results.
-  
-  first_criteria <- grep("nb_pseudospectra", names(indicators))
-  nb_params <- first_criteria - 2
-  this_criteria <- grep(criteria, names(indicators))
-  
-  # values taken by each parameter
-  values <- list()
-  for (i in 2:(1 + nb_params)) {
-    lev <- unique(indicators[,i])
-    values[[i - 1]] <- lev
-  }
-  
-  length <- lapply(values, length)
-  
-  # indexes of the parameters taking the most values
-  longest_1 <- which.max(length); length[longest_1] <- -1
-  longest_2 <- which.max(length); length[longest_2] <- -1
-  longest_3 <- which.max(length); length[longest_3] <- -1
-  
-  # indexes of the other parameters
-  shortest <- which(length != -1)
-  
-  # all combinations of the values taken by these parameters
-  combinations <- list()
-  for (s in shortest) {
-    page <- indicators[,s + 1]
-    combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page))))
-  }
-  names(combinations) <- names(indicators)[shortest + 1]
-  combinations <- expand.grid(combinations)
-  
-  # save the plots in a pdf file
-  pdf(compare_graph_out)
-  nb_rows <- length(values[longest_3][[1]])
-  nb_cols <- length(values[longest_2][[1]])
-  par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1))
-  
-  # plot parameters
-  x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1]))))
-  min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line)))
-  max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line)))
-  
-  for (rowi in 1:nrow(combinations)) {
-    row <- combinations[rowi,]
-    title <- ""
-    lines <- indicators
-    
-    for (coli in 1:length(row)) {
-      value <- as.numeric(as.character(row[coli]))
-      title <- paste(title, names(row[coli]), "=", value)
-      if (coli %% 2 == 0)
-        title <- paste(title, "\n")
-      else
-        title <- paste(title, " ")
-      lines <- lines[lines[names(row[coli])] == value,]
-    }
-    
-    # values taken by the parameters
-    in_plot <- lines[,longest_1 + 1]
-    vertical <- lines[,longest_2 + 1]
-    horizontal <-  lines[,longest_3 + 1]
-    
-    # for each horizontal value
-    for (horiz in sort(unique(horizontal))) {
-      
-      # for each vertical value
-      for (vertic in sort(unique(vertical))) {
-        
-        y <- c()
-        for (this_y in sort(unique(in_plot))) {
-          # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,]
-          line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,]
-          if (nrow(line) != 1)
-            stop("To plot the results, each set of the parameters' values must be represented exactly 1 time")
-          
-          value <- line[,this_criteria]
-          value <- as.numeric(as.character(value))
-          y <- c(y, value)
-        }
-        
-        this_title <- paste(title,
-                       names(indicators)[longest_3 + 1], "=", horiz, "\n",
-                       names(indicators)[longest_2 + 1], "=", vertic)
-        
-        # plot this graph
-        plot(x, y, ylim = c(min_zeros, max_zeros),
-             type = "b",
-             xlab = names(indicators)[longest_1 + 1],
-             ylab = criteria,
-             main = this_title,
-             cex.main = 0.8)
-        
-      }
-      
-    }
-  
-  }
-  
-  par(mfrow = c(1,1))
-  dev.off()
-  
-}
-
-
-############
-#   main   #
-############
-
-update_list_of_file_names()
-summary <- create_summary(nb_cores, count_duplicates = count_duplicates)
-
-# settings_list <- summary[2][[1]]
-# indicators_ <- summary[1][[1]]
-indicators_ <- summary
-
-indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line
-# indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers
-
-# record the summary in a file
-indicators_to_write <- indicators[, !names(indicators) %in%
-                                    c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
-                                      "f10E5to10E6", "f10E6to10E7", "f10E7")]
-write.table(indicators_to_write,
-            file = summary_out,
-            quote = FALSE,
-            row.names = FALSE,
-            sep = "\t")
-
-# ions per intensity graph
-ions_per_intensity(indicators_)
-
-# plots to compare each set of parameters
-graph_results(indicators_, criteria = "zeros_per_line")
-
-# zip of pseudospectra .msp and .tsv files
-system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"'))
-system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out))
-
-# zip of count_duplicates .tsv files
-if (count_duplicates) {
-  system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"'))
-  system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out))
-}
-