Mercurial > repos > melpetera > testtest
diff Testtest/GCMS-test_output.R @ 0:40de28c7d3fb draft
Uploaded
author | melpetera |
---|---|
date | Thu, 23 Nov 2017 08:50:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test_output.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,450 @@ +# author: Pauline Ribeyre + + +##################### +# required packages # +##################### + +library("parallel") # provides cluster methods and "parLapplyLB" function +library("grDevices") # provides jpeg handling methods + + +############ +# data # +############ + +update_list_of_file_names <- function() { + # Writes in a file the list of file names using the files created previously in ./RDatas. + + file.create(source_file_names, showWarnings = FALSE) # erase the file + directory <- "RDatas" + files <- list.files(directory, full.names = TRUE) + + for (f in files) { + f <- substr(f, 8, nchar(f) - 6) + write(f, file = source_file_names, append = TRUE) + } + +} + + +titles_to_columns <- function(indicators) { + # Parse the "title" column of the indicator dataframe to separate the different parameters and their values. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + # + # Returns: + # new_indicators: copy of "indicators" with one column added for each parameter that varied. + + default_settings <- FALSE + + # parse the title + param_names <- c() + for (title in indicators$title) { + + if (title == "default_settings") { + default_settings <- TRUE + break + } + + else { + params <- strsplit(title, "_")[[1]] + for (param in params) { + name <- strsplit(param, "=")[[1]][1] + + # some lowercase values create errors because they are primitive functions + substr(name, 1, 1) <- toupper(substr(name, 1, 1)) + + if (!name %in% param_names) + param_names <- c(param_names, name) + value <- strsplit(param, "=")[[1]][2] + if (!exists(name)) + assign(name, c()) + assign(name, c(get(name), value)) + } + } + + } + + new_indicators <- indicators + + # add the columns to the dataframe + if (!default_settings) { + for (name in param_names) { + new_indicators <- cbind(new_indicators, get(name)) + names(new_indicators)[ncol(new_indicators)] <- name + } + + # order the dataframe's columns + order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names))) + new_indicators <- new_indicators[, order] + } + + return (new_indicators) + +} + + +create_summary <- function(nb_cores, count_duplicates = FALSE) { + # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. + # + # Args: + # nb_cores: maximum number of cores to use. + # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). + # + # Returns: + # A dataframe (one row per test) containing the results: + # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). +# +# # Returns: +# # A list containing: +# # 1: dataframe (one row per test) containing the results: +# # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). +# # 2: list of the settings sets for each test. + + time.start <- Sys.time() # start the timer + + file_names <- readLines(source_file_names) + + dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved + + # summ <- create_summary_parallel(1, file_names) + # cat("indic:\n",summary[1][[1]],"\n") + + # run the function on several cores + if (length(file_names) != 0) { + if (length(file_names) < nb_cores) + nb_cores <- length(file_names) + cluster <- makeCluster(nb_cores) #, outfile = "") + summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb) + stopCluster(cluster) + } + else + stop("There are no files to generate the output from.") + + # concatenate the results obtained by all the cores + indicators <- NULL + for (summary in summaries) + indicators <- rbind(indicators, t(summary)) + # settings_list <- c() + # for (summary in summaries) { + # settings <- summary[2][[1]] + # indic <- summary[1][[1]] + # + # indicators <- rbind(indicators, t(indic)) + # settings_list <- c(settings_list, settings) + # } + indicators <- data.frame(indicators) + + col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5", + "f10E5to10E6", "f10E6to10E7", "f10E7") + + if (check_ions) + col_names <- c(col_names, ions_name) + + names(indicators)[1:length(col_names)] <- col_names + if (count_duplicates) + names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions") + + indicators <- titles_to_columns(indicators) + + time.end <- Sys.time() # stop the timer + Tdiff <- difftime(time.end, time.start) + print(Tdiff) + + # return (list(indicators, settings_list)) + return (indicators) + +} + + +create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) { + # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. + # + # Args: + # n: index of the current test, to select the corresponding title. + # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters). + # nb_cores: maximum number of cores to use. + # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). + # + # Returns: + # A dataframe (one row per test) containing the results: + # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. + # + # # Returns: + # # A list containing: + # # 1: dataframe (one row per test) containing the results: + # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. + # # 2: list of the settings sets for each test. + + source(source_spectrum, environment()) + + intensities_x <- c(1000, 10000, 100000, 1000000, 10000000) + + this_title <- file_names[n] + + # calculate the number of zeros + file_title <- paste0("Peak_tables/", this_title, ".tsv") + peak_table <- read.table(file_title, sep="\t", header=TRUE) + peak_table_values <- peak_table[,5:ncol(peak_table)] + zeros <- sum(peak_table$nb_zeros) + nb_lines <- nrow(peak_table) + zeros_per_line <- round(zeros/nb_lines, 4) + + # count the number of ions by intensity range + intensities_y <- c() + intensities_y[1] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]]) + for (i in 2:length(intensities_x)) + intensities_y[i] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] & + rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]]) + intensities_y[i + 1] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]]) + + # load the settings + file_title <- paste0("RDatas/", this_title, ".RData") + load(file_title) + + # count the number of count_duplicates and record in a file + if (count_duplicates) { + file_title <- paste0("count_duplicates/", this_title, ".tsv") + nb_count_duplicates <- data.frame(count_duplicates_function(GC_results)) + names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates") + write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE) + count_duplicates <- nrow(nb_count_duplicates) + + nb_ions <- 0 + pseudospectra <- GC_results$PseudoSpectra + for (ps in pseudospectra) { + nb_ions <- nb_ions + nrow(ps$pspectrum) + } + } + + summary <- c(this_title, nb_lines, zeros_per_line, intensities_y) + + # check the presence of ions + if (nb_ions_to_check > 0) { + for (i in 1:nb_ions_to_check) { + name <- ions_name[[i]] + rt <- ions_rt[[i]] + mzs <- ions_mzs[[i]] + cat("Check:", name, rt, mzs, "\n") + + value <- is_ion_present(GC_results, rt = rt, mz_list = mzs) + assign(name, value) + summary <- c(summary, get(name)) + } + } + + # check the presence of ion valine 12C and 13C + # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146)) + + # summary <- c(summary, valine) + + if (count_duplicates) + summary <- c(summary, count_duplicates, nb_ions) + + # return (list(summary, settings)) + return (summary) + +} + + +############ +# graphs # +############ + +ions_per_intensity <- function(indicators) { + # For each test, plots the number of ions for each range of intensity. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + + pdf(intensity_graph_out) + par(mar = c(5.1,4.1,5,2.1)) + + for (i in 1:nrow(indicators)) { + + indic <- indicators[i,] + + names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7") + intensities_y <- c() + + for (colname in names) { + intensities_y <- c(intensities_y, indic[colname]) + } + intensities_y <- unlist(intensities_y) + intensities_y <- as.numeric(levels(intensities_y))[intensities_y] + + title <- as.character(indic$title) + title <- strsplit(title, "_")[[1]] + plot_title <- "" + for (i in 1:length(title)) { + plot_title <- paste(plot_title, title[[i]]) + if (i %% 2 == 0) + plot_title <- paste(plot_title, "\n") + else + plot_title <- paste(plot_title, " ") + } + + barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions", + main = plot_title, cex.main = 0.8) + + } + + dev.off() + +} + + +graph_results <- function(indicators, criteria) { + # Plots the results. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + + first_criteria <- grep("nb_pseudospectra", names(indicators)) + nb_params <- first_criteria - 2 + this_criteria <- grep(criteria, names(indicators)) + + # values taken by each parameter + values <- list() + for (i in 2:(1 + nb_params)) { + lev <- unique(indicators[,i]) + values[[i - 1]] <- lev + } + + length <- lapply(values, length) + + # indexes of the parameters taking the most values + longest_1 <- which.max(length); length[longest_1] <- -1 + longest_2 <- which.max(length); length[longest_2] <- -1 + longest_3 <- which.max(length); length[longest_3] <- -1 + + # indexes of the other parameters + shortest <- which(length != -1) + + # all combinations of the values taken by these parameters + combinations <- list() + for (s in shortest) { + page <- indicators[,s + 1] + combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page)))) + } + names(combinations) <- names(indicators)[shortest + 1] + combinations <- expand.grid(combinations) + + # save the plots in a pdf file + pdf(compare_graph_out) + nb_rows <- length(values[longest_3][[1]]) + nb_cols <- length(values[longest_2][[1]]) + par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1)) + + # plot parameters + x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1])))) + min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line))) + max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line))) + + for (rowi in 1:nrow(combinations)) { + row <- combinations[rowi,] + title <- "" + lines <- indicators + + for (coli in 1:length(row)) { + value <- as.numeric(as.character(row[coli])) + title <- paste(title, names(row[coli]), "=", value) + if (coli %% 2 == 0) + title <- paste(title, "\n") + else + title <- paste(title, " ") + lines <- lines[lines[names(row[coli])] == value,] + } + + # values taken by the parameters + in_plot <- lines[,longest_1 + 1] + vertical <- lines[,longest_2 + 1] + horizontal <- lines[,longest_3 + 1] + + # for each horizontal value + for (horiz in sort(unique(horizontal))) { + + # for each vertical value + for (vertic in sort(unique(vertical))) { + + y <- c() + for (this_y in sort(unique(in_plot))) { + # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,] + line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,] + if (nrow(line) != 1) + stop("To plot the results, each set of the parameters' values must be represented exactly 1 time") + + value <- line[,this_criteria] + value <- as.numeric(as.character(value)) + y <- c(y, value) + } + + this_title <- paste(title, + names(indicators)[longest_3 + 1], "=", horiz, "\n", + names(indicators)[longest_2 + 1], "=", vertic) + + # plot this graph + plot(x, y, ylim = c(min_zeros, max_zeros), + type = "b", + xlab = names(indicators)[longest_1 + 1], + ylab = criteria, + main = this_title, + cex.main = 0.8) + + } + + } + + } + + par(mfrow = c(1,1)) + dev.off() + +} + + +############ +# main # +############ + +update_list_of_file_names() +summary <- create_summary(nb_cores, count_duplicates = count_duplicates) + +# settings_list <- summary[2][[1]] +# indicators_ <- summary[1][[1]] +indicators_ <- summary + +indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line +# indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers + +# record the summary in a file +indicators_to_write <- indicators[, !names(indicators) %in% + c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5", + "f10E5to10E6", "f10E6to10E7", "f10E7")] +write.table(indicators_to_write, + file = summary_out, + quote = FALSE, + row.names = FALSE, + sep = "\t") + +# ions per intensity graph +ions_per_intensity(indicators_) + +# plots to compare each set of parameters +graph_results(indicators_, criteria = "zeros_per_line") + +# zip of pseudospectra .msp and .tsv files +system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"')) +system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out)) + +# zip of count_duplicates .tsv files +if (count_duplicates) { + system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"')) + system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out)) +} +