Mercurial > repos > melpetera > testtest
changeset 2:0b79cac1bbaa draft
Uploaded
author | melpetera |
---|---|
date | Thu, 23 Nov 2017 09:37:50 -0500 |
parents | 53f3f87ece44 |
children | 8fb560e858ab |
files | Testtest/GCMS-test_output.R pouik.txt |
diffstat | 1 files changed, 0 insertions(+), 450 deletions(-) [+] |
line wrap: on
line diff
--- a/Testtest/GCMS-test_output.R Thu Nov 23 09:35:25 2017 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,450 +0,0 @@ -# author: Pauline Ribeyre - - -##################### -# required packages # -##################### - -library("parallel") # provides cluster methods and "parLapplyLB" function -library("grDevices") # provides jpeg handling methods - - -############ -# data # -############ - -update_list_of_file_names <- function() { - # Writes in a file the list of file names using the files created previously in ./RDatas. - - file.create(source_file_names, showWarnings = FALSE) # erase the file - directory <- "RDatas" - files <- list.files(directory, full.names = TRUE) - - for (f in files) { - f <- substr(f, 8, nchar(f) - 6) - write(f, file = source_file_names, append = TRUE) - } - -} - - -titles_to_columns <- function(indicators) { - # Parse the "title" column of the indicator dataframe to separate the different parameters and their values. - # - # Args: - # indicators: dataframe (one row per test) containing the results. - # - # Returns: - # new_indicators: copy of "indicators" with one column added for each parameter that varied. - - default_settings <- FALSE - - # parse the title - param_names <- c() - for (title in indicators$title) { - - if (title == "default_settings") { - default_settings <- TRUE - break - } - - else { - params <- strsplit(title, "_")[[1]] - for (param in params) { - name <- strsplit(param, "=")[[1]][1] - - # some lowercase values create errors because they are primitive functions - substr(name, 1, 1) <- toupper(substr(name, 1, 1)) - - if (!name %in% param_names) - param_names <- c(param_names, name) - value <- strsplit(param, "=")[[1]][2] - if (!exists(name)) - assign(name, c()) - assign(name, c(get(name), value)) - } - } - - } - - new_indicators <- indicators - - # add the columns to the dataframe - if (!default_settings) { - for (name in param_names) { - new_indicators <- cbind(new_indicators, get(name)) - names(new_indicators)[ncol(new_indicators)] <- name - } - - # order the dataframe's columns - order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names))) - new_indicators <- new_indicators[, order] - } - - return (new_indicators) - -} - - -create_summary <- function(nb_cores, count_duplicates = FALSE) { - # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. - # - # Args: - # nb_cores: maximum number of cores to use. - # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). - # - # Returns: - # A dataframe (one row per test) containing the results: - # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). -# -# # Returns: -# # A list containing: -# # 1: dataframe (one row per test) containing the results: -# # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). -# # 2: list of the settings sets for each test. - - time.start <- Sys.time() # start the timer - - file_names <- readLines(source_file_names) - - dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved - - # summ <- create_summary_parallel(1, file_names) - # cat("indic:\n",summary[1][[1]],"\n") - - # run the function on several cores - if (length(file_names) != 0) { - if (length(file_names) < nb_cores) - nb_cores <- length(file_names) - cluster <- makeCluster(nb_cores) #, outfile = "") - summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb) - stopCluster(cluster) - } - else - stop("There are no files to generate the output from.") - - # concatenate the results obtained by all the cores - indicators <- NULL - for (summary in summaries) - indicators <- rbind(indicators, t(summary)) - # settings_list <- c() - # for (summary in summaries) { - # settings <- summary[2][[1]] - # indic <- summary[1][[1]] - # - # indicators <- rbind(indicators, t(indic)) - # settings_list <- c(settings_list, settings) - # } - indicators <- data.frame(indicators) - - col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5", - "f10E5to10E6", "f10E6to10E7", "f10E7") - - if (check_ions) - col_names <- c(col_names, ions_name) - - names(indicators)[1:length(col_names)] <- col_names - if (count_duplicates) - names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions") - - indicators <- titles_to_columns(indicators) - - time.end <- Sys.time() # stop the timer - Tdiff <- difftime(time.end, time.start) - print(Tdiff) - - # return (list(indicators, settings_list)) - return (indicators) - -} - - -create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) { - # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. - # - # Args: - # n: index of the current test, to select the corresponding title. - # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters). - # nb_cores: maximum number of cores to use. - # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). - # - # Returns: - # A dataframe (one row per test) containing the results: - # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. - # - # # Returns: - # # A list containing: - # # 1: dataframe (one row per test) containing the results: - # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. - # # 2: list of the settings sets for each test. - - source(source_spectrum, environment()) - - intensities_x <- c(1000, 10000, 100000, 1000000, 10000000) - - this_title <- file_names[n] - - # calculate the number of zeros - file_title <- paste0("Peak_tables/", this_title, ".tsv") - peak_table <- read.table(file_title, sep="\t", header=TRUE) - peak_table_values <- peak_table[,5:ncol(peak_table)] - zeros <- sum(peak_table$nb_zeros) - nb_lines <- nrow(peak_table) - zeros_per_line <- round(zeros/nb_lines, 4) - - # count the number of ions by intensity range - intensities_y <- c() - intensities_y[1] <- length(rowMeans( - peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]]) - for (i in 2:length(intensities_x)) - intensities_y[i] <- length(rowMeans( - peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] & - rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]]) - intensities_y[i + 1] <- length(rowMeans( - peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]]) - - # load the settings - file_title <- paste0("RDatas/", this_title, ".RData") - load(file_title) - - # count the number of count_duplicates and record in a file - if (count_duplicates) { - file_title <- paste0("count_duplicates/", this_title, ".tsv") - nb_count_duplicates <- data.frame(count_duplicates_function(GC_results)) - names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates") - write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE) - count_duplicates <- nrow(nb_count_duplicates) - - nb_ions <- 0 - pseudospectra <- GC_results$PseudoSpectra - for (ps in pseudospectra) { - nb_ions <- nb_ions + nrow(ps$pspectrum) - } - } - - summary <- c(this_title, nb_lines, zeros_per_line, intensities_y) - - # check the presence of ions - if (nb_ions_to_check > 0) { - for (i in 1:nb_ions_to_check) { - name <- ions_name[[i]] - rt <- ions_rt[[i]] - mzs <- ions_mzs[[i]] - cat("Check:", name, rt, mzs, "\n") - - value <- is_ion_present(GC_results, rt = rt, mz_list = mzs) - assign(name, value) - summary <- c(summary, get(name)) - } - } - - # check the presence of ion valine 12C and 13C - # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146)) - - # summary <- c(summary, valine) - - if (count_duplicates) - summary <- c(summary, count_duplicates, nb_ions) - - # return (list(summary, settings)) - return (summary) - -} - - -############ -# graphs # -############ - -ions_per_intensity <- function(indicators) { - # For each test, plots the number of ions for each range of intensity. - # - # Args: - # indicators: dataframe (one row per test) containing the results. - - pdf(intensity_graph_out) - par(mar = c(5.1,4.1,5,2.1)) - - for (i in 1:nrow(indicators)) { - - indic <- indicators[i,] - - names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7") - intensities_y <- c() - - for (colname in names) { - intensities_y <- c(intensities_y, indic[colname]) - } - intensities_y <- unlist(intensities_y) - intensities_y <- as.numeric(levels(intensities_y))[intensities_y] - - title <- as.character(indic$title) - title <- strsplit(title, "_")[[1]] - plot_title <- "" - for (i in 1:length(title)) { - plot_title <- paste(plot_title, title[[i]]) - if (i %% 2 == 0) - plot_title <- paste(plot_title, "\n") - else - plot_title <- paste(plot_title, " ") - } - - barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions", - main = plot_title, cex.main = 0.8) - - } - - dev.off() - -} - - -graph_results <- function(indicators, criteria) { - # Plots the results. - # - # Args: - # indicators: dataframe (one row per test) containing the results. - - first_criteria <- grep("nb_pseudospectra", names(indicators)) - nb_params <- first_criteria - 2 - this_criteria <- grep(criteria, names(indicators)) - - # values taken by each parameter - values <- list() - for (i in 2:(1 + nb_params)) { - lev <- unique(indicators[,i]) - values[[i - 1]] <- lev - } - - length <- lapply(values, length) - - # indexes of the parameters taking the most values - longest_1 <- which.max(length); length[longest_1] <- -1 - longest_2 <- which.max(length); length[longest_2] <- -1 - longest_3 <- which.max(length); length[longest_3] <- -1 - - # indexes of the other parameters - shortest <- which(length != -1) - - # all combinations of the values taken by these parameters - combinations <- list() - for (s in shortest) { - page <- indicators[,s + 1] - combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page)))) - } - names(combinations) <- names(indicators)[shortest + 1] - combinations <- expand.grid(combinations) - - # save the plots in a pdf file - pdf(compare_graph_out) - nb_rows <- length(values[longest_3][[1]]) - nb_cols <- length(values[longest_2][[1]]) - par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1)) - - # plot parameters - x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1])))) - min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line))) - max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line))) - - for (rowi in 1:nrow(combinations)) { - row <- combinations[rowi,] - title <- "" - lines <- indicators - - for (coli in 1:length(row)) { - value <- as.numeric(as.character(row[coli])) - title <- paste(title, names(row[coli]), "=", value) - if (coli %% 2 == 0) - title <- paste(title, "\n") - else - title <- paste(title, " ") - lines <- lines[lines[names(row[coli])] == value,] - } - - # values taken by the parameters - in_plot <- lines[,longest_1 + 1] - vertical <- lines[,longest_2 + 1] - horizontal <- lines[,longest_3 + 1] - - # for each horizontal value - for (horiz in sort(unique(horizontal))) { - - # for each vertical value - for (vertic in sort(unique(vertical))) { - - y <- c() - for (this_y in sort(unique(in_plot))) { - # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,] - line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,] - if (nrow(line) != 1) - stop("To plot the results, each set of the parameters' values must be represented exactly 1 time") - - value <- line[,this_criteria] - value <- as.numeric(as.character(value)) - y <- c(y, value) - } - - this_title <- paste(title, - names(indicators)[longest_3 + 1], "=", horiz, "\n", - names(indicators)[longest_2 + 1], "=", vertic) - - # plot this graph - plot(x, y, ylim = c(min_zeros, max_zeros), - type = "b", - xlab = names(indicators)[longest_1 + 1], - ylab = criteria, - main = this_title, - cex.main = 0.8) - - } - - } - - } - - par(mfrow = c(1,1)) - dev.off() - -} - - -############ -# main # -############ - -update_list_of_file_names() -summary <- create_summary(nb_cores, count_duplicates = count_duplicates) - -# settings_list <- summary[2][[1]] -# indicators_ <- summary[1][[1]] -indicators_ <- summary - -indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line -# indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers - -# record the summary in a file -indicators_to_write <- indicators[, !names(indicators) %in% - c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5", - "f10E5to10E6", "f10E6to10E7", "f10E7")] -write.table(indicators_to_write, - file = summary_out, - quote = FALSE, - row.names = FALSE, - sep = "\t") - -# ions per intensity graph -ions_per_intensity(indicators_) - -# plots to compare each set of parameters -graph_results(indicators_, criteria = "zeros_per_line") - -# zip of pseudospectra .msp and .tsv files -system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"')) -system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out)) - -# zip of count_duplicates .tsv files -if (count_duplicates) { - system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"')) - system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out)) -} -