# HG changeset patch # User melpetera # Date 1511445014 18000 # Node ID 40de28c7d3fbbf9f72e08ec90ecd8087bbcd4bcc Uploaded diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,220 @@ +# author: Pauline Ribeyre + + +##################### +# required packages # +##################### + +library(batch) # provides "parseCommandargv" function + + +############ +# init # +############ + +# read the parameters +argv_list <- parseCommandArgs(evaluate = FALSE) #, trailingOnly = TRUE) +argv <- unlist(argv_list) + +# common environment to share variables between files +env <- new.env() + +# timestamp made unique by adding a random element +env$timestamp <- round(as.numeric(Sys.time()) + runif(1, 0, 100)) + +# set working directory +tmpwd <- paste0("/galaxy/data/", env$timestamp) +# system("chmod -R 0777 .") +# dir.create(tmpwd, mode = "0777") +dir.create(tmpwd) +setwd(tmpwd) + +# readme file +readme_msg <- paste0("\"", env$timestamp, "\"", + " is a temporary folder storing data for an instance of Galaxy tool \"GCMS-test\" launched on ", + Sys.time(), ". ", + "This folder should be deleted automatically at the end of the tool's execution. ", + "If the instance ended or has been aborted and this folder still exists, it can be deleted manually.") +write(readme_msg, file = "README.txt") + +################ +# source files # +################ + +source_local <- function(fname){ + # Gets a file's full name in Galaxy's directories. + # + # Args: + # fname: the name of the file. + # + # Returns: + # The full name of the file. + + argv <- commandArgs(trailingOnly = FALSE) + base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) + return (paste(base_dir, fname, sep="/")) + +} + +env$source_settings <- source_local("GCMS-test_settings.R") +env$source_analyze <- source_local("GCMS-test_analyze.R") +env$source_output <- source_local("GCMS-test_output.R") +env$source_spectrum <- source_local("GCMS-test_spectrum.R") + +env$source_file_names <- "file_names.txt" + + +############# +# functions # +############# + +args_to_vary_list <- function() { + # Creates the list of settings variations using the arguments. + # + # Returns: + # The list of settings variations. + + param <- c("step", "steps", "mzdiff", "fwhm", "simthresh", "snthresh", "max", + "minclassfraction", "minclasssize", "rtdiff") #, "minfeat") + + PeakPicking_param <- c("step", "steps", "mzdiff", "fwhm", "snthresh", "max") + betweenSamples_param <- c("min.class.fraction", "min.class.size", "rtdiff", "simthresh") + + vary <- list() + + for (p in param) { + + min <- as.numeric(argv[[paste0(p, "_min")]]) + max <- as.numeric(argv[[paste0(p, "_max")]]) + step <- as.numeric(argv[[paste0(p, "_step")]]) + + if (p == "minclassfraction") + p <- "min.class.fraction" + else if (p == "minclasssize") + p <- "min.class.size" + + if (p %in% PeakPicking_param) + name <- paste0("PeakPicking$", p) + else if (p %in% betweenSamples_param) + name <- paste0("betweenSamples.", p) + else + stop("\"", p, "\" is not a valid parameter to vary.") + + # prevent seq error + if (min > max) + max <- min + + range <- seq(min, max, step) + vary[[length(vary) + 1]] <- c(p, name, range) + + } + + return (vary) + +} + + +read_ions_to_check <- function() { + # Reads the arguments concerning the ions to check. + + env$ions_name <- list() + env$ions_rt <- list() + env$ions_mzs <- list() + + env$nb_ions_to_check <- length(grep("ions_name_", names(argv))) + + if (env$nb_ions_to_check > 0) { + for (i in 1:env$nb_ions_to_check) { + env$ions_name[length(env$ions_name) + 1] <- argv[[paste0("ions_name_", i)]] + env$ions_rt[length(env$ions_rt) + 1] <- as.numeric(argv[[paste0("ions_rt_", i)]]) + mzs <- argv[[paste0("ions_mzs_", i)]] + mzs <- as.numeric(strsplit(mzs, ",")[[1]]) + env$ions_mzs[length(env$ions_mzs) + 1] <- list(mzs) + } + } + +} + + +############ +# inputs # +############ + +env$nb_cores <- as.numeric(argv[["nb_cores"]]) +cat("Nb cores:", env$nb_cores, "\n") + +# get data from zip file +if (!is.null(argv[["zip_file"]])) { + env$cdf_files <- unzip(argv[["zip_file"]]) +} else { + stop("No files have been provided.") +} + +# vary parameters +env$vary_list <- args_to_vary_list() + +# count duplicate ions +env$count_duplicates <- argv[["count_duplicates"]] == "true" +if (env$count_duplicates) { + + # delta rt + env$duplicates_delta_rt <- as.numeric(argv[["duplicates_delta_rt"]]) + + # delta mz + env$duplicates_delta_mz <- as.numeric(argv[["duplicates_delta_mz"]]) + +} + +# check the presence of ions +env$nb_ions_to_check <- 0 +env$check_ions <- argv[["check_ions"]] == "true" +if (env$check_ions) { + + read_ions_to_check() + + # delta rt + env$ions_delta_rt <- as.numeric(argv[["ions_delta_rt"]]) + + # delta mz + env$ions_delta_mz <- as.numeric(argv[["ions_delta_mz"]]) + +} + + +############# +# outputs # +############# + +env$summary_out <- argv[["summary_out"]] +env$peakspectra_out <- argv[["peakspectra_out"]] +env$intensity_graph_out <- argv[["intensity_graph_out"]] + +if (env$count_duplicates) + env$count_duplicates_out <- argv[["duplicates_out"]] + +env$compare_graph_out <- argv[["compare_graph_out"]] + + +########## +# run # +########## + +cat("---------- Settings ----------\n") +sys.source(env$source_settings, env) + + +cat("---------- Analysis ----------\n") +sys.source(env$source_analyze, env) + + +cat("---------- Output ----------\n") +sys.source(env$source_output, env) + + +########## +# end # +########## + +# delete temporary files +setwd("..") +unlink(tmpwd, recursive = TRUE) diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test.xml Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,263 @@ + + Test parameters for GC-MS data preprocessing using metaMS package + + + bioconductor-metams + bioconductor-xcms + r-batch + + + + + + + + Rscript $__tool_directory__/GCMS-test.R + + + + + nb_cores \${GALAXY_SLOTS} + + zip_file $zip_file + + step_min $step.step_min + step_max $step.step_max + step_step $step.step_step + + steps_min $steps.steps_min + steps_max $steps.steps_max + steps_step $steps.steps_step + + mzdiff_min $mzdiff.mzdiff_min + mzdiff_max $mzdiff.mzdiff_max + mzdiff_step $mzdiff.mzdiff_step + + fwhm_min $fwhm.fwhm_min + fwhm_max $fwhm.fwhm_max + fwhm_step $fwhm.fwhm_step + + simthresh_min $simthresh.simthresh_min + simthresh_max $simthresh.simthresh_max + simthresh_step $simthresh.simthresh_step + + snthresh_min $snthresh.snthresh_min + snthresh_max $snthresh.snthresh_max + snthresh_step $snthresh.snthresh_step + + max_min $max.max_min + max_max $max.max_max + max_step $max.max_step + + minclassfraction_min $minclassfraction.minclassfraction_min + minclassfraction_max $minclassfraction.minclassfraction_max + minclassfraction_step $minclassfraction.minclassfraction_step + + minclasssize_min $minclasssize.minclasssize_min + minclasssize_max $minclasssize.minclasssize_max + minclasssize_step $minclasssize.minclasssize_step + + rtdiff_min $rtdiff.rtdiff_min + rtdiff_max $rtdiff.rtdiff_max + rtdiff_step $rtdiff.rtdiff_step + + + + count_duplicates $outputs_options.count_duplicates.count_duplicates_bool + #if $outputs_options.count_duplicates.count_duplicates_bool: + duplicates_delta_rt $outputs_options.count_duplicates.duplicates_delta_rt + duplicates_delta_mz $outputs_options.count_duplicates.duplicates_delta_mz + #end if + + check_ions $outputs_options.check_ions.check_ions_bool + #if $outputs_options.check_ions.check_ions_bool: + ions_delta_rt $outputs_options.check_ions.ions_delta_rt + ions_delta_mz $outputs_options.check_ions.ions_delta_mz + + #for $i, $s in enumerate($outputs_options.check_ions.ions_to_check) + ions_name_${i+1} $s.name + ions_rt_${i+1} $s.rt + ions_mzs_${i+1} $s.mzs + #end for + #end if + + + + summary_out $summary_out + peakspectra_out $peakspectra_out + intensity_graph_out $intensity_graph_out + + #if $outputs_options.count_duplicates.count_duplicates_bool: + duplicates_out $duplicates_out + #end if + + compare_graph_out $compare_graph_out + + + + + +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ + + +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ +
+ + + +
+ +
+ + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + outputs_options['count_duplicates']['count_duplicates_bool'] + + + + + + +**Author** Pauline Ribeyre + +========= +GCMS-test +========= + +----------- +Description +----------- + +This tool allows the user to run the method metaMS.runGC() with several sets of parameters and to compare the results of each run. + +----------------- +Workflow position +----------------- + +**Upstream tools** + +You must start from here. + +**Downstream tools** + ++---------------------------+---------------------------------------+--------+ +| Name | Output file | Format | ++===========================+=======================================+========+ +| | | | ++---------------------------+---------------------------------------+--------+ + +---------- +Parameters +---------- + +The **zip file** contains the data (the chromatograms to analyse with metaMS.runGC). + +Each section allows the user to vary one parameter by choosing a start value, a stop value and a step for this parameter. + +The last section allows the user to set output preferences. + +------- +Outputs +------- + +The output file **summary.tsv** is a tabular file. It contains the summary of the tests. + +The output file **peakspectra.zip** is a zip file. It contains the .msp and .tsv peakspectra files for each test. + +The output file **intensity_graph.zip** is a pdf file. It contains the "number of ions vs intensity range" graph for each test. + +The output file **duplicates.zip** is a zip file. It contains the .tsv duplicates file for each test. + +--------- +Changelog +--------- + +**Version 1.0 - 21/07/2017** + + +
\ No newline at end of file diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test_analyze.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test_analyze.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,149 @@ +# author: Pauline Ribeyre + + +##################### +# required packages # +##################### + +library("metaMS") # provides "runGC" function + + +############ +# analysis # +############ + +peakspectra_table <- function(GC_results, file_title) { + # Saves the pseudospectra in 2 files (text and tabular formats). + + names <- c() + rts <- c() + rt.sds <- c() + mzs_df <- list() + for (ps in GC_results$PseudoSpectra) { + + name <- ps$Name + rt <- ps$rt + rt.sd <- ps$rt.sd + + names <- c(names, name) + rts <- c(rts, rt) + rt.sds <- c(rt.sds, rt.sd) + + spectrum <- data.frame(ps$pspectrum) + mz <- c() + maxo <- c() + for (i in 1:nrow(spectrum)) { + ion <- spectrum[i,] + mz <- c(mz, ion$mz) + maxo <- c(maxo, ion$maxo) + } + df <- data.frame(name, rt, rt.sd, mz, maxo) + mzs_df[[length(mzs_df) + 1]] <- df + + } + + df = do.call(rbind, mzs_df) + + write.table(df, + file = file_title, + quote = FALSE, + row.names = FALSE, + sep = "\t") + +} + + +my_runGC <- function(n, cdf_files, titles_to_test, settings_to_test) { + # Runs the data analysis and records the results. + # + # Args: + # n: index of the current test, to select the corresponding title and settings set. + # cdf_files: list of the data files' names. + # titles_to_test: list of titles (one for each settings set) (concatenation of the values taken by the varied parameters). + # settings_to_test: list of settings sets for runGC. + + library("metaMS") + + settings <- settings_to_test[n][[1]] + title <- titles_to_test[n] + + print(title) + + if (!file.exists(paste0("Peak_tables/", title, ".tsv"))) { + + # run + GC_results <- runGC(files = cdf_files, settings = settings, returnXset = TRUE, nSlaves = 20) + + # order the result table by retention time + peak_table <- GC_results$PeakTable <- GC_results$PeakTable[order(GC_results$PeakTable[,"rt"]),] + peak_table_values <- peak_table[,5:(5 + length(cdf_files) - 1)] + peak_table$nb_zeros <- apply(peak_table_values, 1, function(x) sum(x == 0)) + zeros_per_l <- sum(peak_table$nb_zeros)/nrow(peak_table) + + # record the table in a file + file_title <- paste0("Peak_tables/", title, ".tsv") + write.table(peak_table, file = file_title, sep = "\t", row.names = FALSE) # /!\ title length -> cannot open connexion + + # record the RData + file_title <- paste0("RDatas/", title, ".RData") + # save(GC_results, settings, file = file_title) + save(GC_results, file = file_title) + + # record the pseudospectra in files (.msp and .tsv) + file_title <- paste0("Pseudospectra/", title, ".msp") + write.msp(GC_results$PseudoSpectra, file = file_title, newFile = TRUE) + file_title <- paste0("Pseudospectra/", title, ".tsv") + peakspectra_table(GC_results, file_title) + + cat(paste(zeros_per_l, "zeros per line.\n\n")) + + } # end if + +} + + +runGC_vary_parameters_parallel <- function(nb_cores, cdf_files, settings, vary) { + # Calculates the number of sets of paramaters and runs the analysis on several cores. + # + # Args: + # nb_cores: maximum number of cores to use. + # cdf_files: list of the data files' names. + # settings: default settings for runGC(). + # vary: list of parameters to vary and the values each parameter must take. + + # calculate the number of possibilities with the parameters' ranges + nb_possibilites <- 1 + for (param in vary) { + range_param <- param[3:length(param)] + nb_possibilites <- nb_possibilites * length(range_param) + } + cat("Settings variations:", nb_possibilites, "combinations.\n") + + dir.create("RDatas", showWarnings = FALSE) #create the folder where the RDatas will be saved + dir.create("Peak_tables", showWarnings = FALSE) #create the folder where the peak tables will be saved + # dir.create("Ions_per_intensity", showWarnings = FALSE) #create the folder where the nb of ions per intensity range will be saved + dir.create("Pseudospectra", showWarnings = FALSE) #create the folder where the pseudospectra will be saved + + time.start <- Sys.time() # start the timer + + # run the function on several cores + if (length(titles_to_test) < nb_cores) + nb_cores <- length(titles_to_test) + cluster <- makeCluster(nb_cores)#, outfile = "") + parLapplyLB(cluster, 1:length(titles_to_test), my_runGC, cdf_files, titles_to_test, settings_to_test) + print(titles_to_test) + stopCluster(cluster) + + time.end <- Sys.time() # stop the timer + Tdiff <- difftime(time.end, time.start) + print(Tdiff) + +} + + +############ +# main # +############ + +runGC_vary_parameters_parallel(nb_cores, cdf_files, settings, vary_list) + diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test_output.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test_output.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,450 @@ +# author: Pauline Ribeyre + + +##################### +# required packages # +##################### + +library("parallel") # provides cluster methods and "parLapplyLB" function +library("grDevices") # provides jpeg handling methods + + +############ +# data # +############ + +update_list_of_file_names <- function() { + # Writes in a file the list of file names using the files created previously in ./RDatas. + + file.create(source_file_names, showWarnings = FALSE) # erase the file + directory <- "RDatas" + files <- list.files(directory, full.names = TRUE) + + for (f in files) { + f <- substr(f, 8, nchar(f) - 6) + write(f, file = source_file_names, append = TRUE) + } + +} + + +titles_to_columns <- function(indicators) { + # Parse the "title" column of the indicator dataframe to separate the different parameters and their values. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + # + # Returns: + # new_indicators: copy of "indicators" with one column added for each parameter that varied. + + default_settings <- FALSE + + # parse the title + param_names <- c() + for (title in indicators$title) { + + if (title == "default_settings") { + default_settings <- TRUE + break + } + + else { + params <- strsplit(title, "_")[[1]] + for (param in params) { + name <- strsplit(param, "=")[[1]][1] + + # some lowercase values create errors because they are primitive functions + substr(name, 1, 1) <- toupper(substr(name, 1, 1)) + + if (!name %in% param_names) + param_names <- c(param_names, name) + value <- strsplit(param, "=")[[1]][2] + if (!exists(name)) + assign(name, c()) + assign(name, c(get(name), value)) + } + } + + } + + new_indicators <- indicators + + # add the columns to the dataframe + if (!default_settings) { + for (name in param_names) { + new_indicators <- cbind(new_indicators, get(name)) + names(new_indicators)[ncol(new_indicators)] <- name + } + + # order the dataframe's columns + order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names))) + new_indicators <- new_indicators[, order] + } + + return (new_indicators) + +} + + +create_summary <- function(nb_cores, count_duplicates = FALSE) { + # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. + # + # Args: + # nb_cores: maximum number of cores to use. + # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). + # + # Returns: + # A dataframe (one row per test) containing the results: + # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). +# +# # Returns: +# # A list containing: +# # 1: dataframe (one row per test) containing the results: +# # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). +# # 2: list of the settings sets for each test. + + time.start <- Sys.time() # start the timer + + file_names <- readLines(source_file_names) + + dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved + + # summ <- create_summary_parallel(1, file_names) + # cat("indic:\n",summary[1][[1]],"\n") + + # run the function on several cores + if (length(file_names) != 0) { + if (length(file_names) < nb_cores) + nb_cores <- length(file_names) + cluster <- makeCluster(nb_cores) #, outfile = "") + summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb) + stopCluster(cluster) + } + else + stop("There are no files to generate the output from.") + + # concatenate the results obtained by all the cores + indicators <- NULL + for (summary in summaries) + indicators <- rbind(indicators, t(summary)) + # settings_list <- c() + # for (summary in summaries) { + # settings <- summary[2][[1]] + # indic <- summary[1][[1]] + # + # indicators <- rbind(indicators, t(indic)) + # settings_list <- c(settings_list, settings) + # } + indicators <- data.frame(indicators) + + col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5", + "f10E5to10E6", "f10E6to10E7", "f10E7") + + if (check_ions) + col_names <- c(col_names, ions_name) + + names(indicators)[1:length(col_names)] <- col_names + if (count_duplicates) + names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions") + + indicators <- titles_to_columns(indicators) + + time.end <- Sys.time() # stop the timer + Tdiff <- difftime(time.end, time.start) + print(Tdiff) + + # return (list(indicators, settings_list)) + return (indicators) + +} + + +create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) { + # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. + # + # Args: + # n: index of the current test, to select the corresponding title. + # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters). + # nb_cores: maximum number of cores to use. + # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). + # + # Returns: + # A dataframe (one row per test) containing the results: + # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. + # + # # Returns: + # # A list containing: + # # 1: dataframe (one row per test) containing the results: + # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. + # # 2: list of the settings sets for each test. + + source(source_spectrum, environment()) + + intensities_x <- c(1000, 10000, 100000, 1000000, 10000000) + + this_title <- file_names[n] + + # calculate the number of zeros + file_title <- paste0("Peak_tables/", this_title, ".tsv") + peak_table <- read.table(file_title, sep="\t", header=TRUE) + peak_table_values <- peak_table[,5:ncol(peak_table)] + zeros <- sum(peak_table$nb_zeros) + nb_lines <- nrow(peak_table) + zeros_per_line <- round(zeros/nb_lines, 4) + + # count the number of ions by intensity range + intensities_y <- c() + intensities_y[1] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]]) + for (i in 2:length(intensities_x)) + intensities_y[i] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] & + rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]]) + intensities_y[i + 1] <- length(rowMeans( + peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]]) + + # load the settings + file_title <- paste0("RDatas/", this_title, ".RData") + load(file_title) + + # count the number of count_duplicates and record in a file + if (count_duplicates) { + file_title <- paste0("count_duplicates/", this_title, ".tsv") + nb_count_duplicates <- data.frame(count_duplicates_function(GC_results)) + names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates") + write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE) + count_duplicates <- nrow(nb_count_duplicates) + + nb_ions <- 0 + pseudospectra <- GC_results$PseudoSpectra + for (ps in pseudospectra) { + nb_ions <- nb_ions + nrow(ps$pspectrum) + } + } + + summary <- c(this_title, nb_lines, zeros_per_line, intensities_y) + + # check the presence of ions + if (nb_ions_to_check > 0) { + for (i in 1:nb_ions_to_check) { + name <- ions_name[[i]] + rt <- ions_rt[[i]] + mzs <- ions_mzs[[i]] + cat("Check:", name, rt, mzs, "\n") + + value <- is_ion_present(GC_results, rt = rt, mz_list = mzs) + assign(name, value) + summary <- c(summary, get(name)) + } + } + + # check the presence of ion valine 12C and 13C + # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146)) + + # summary <- c(summary, valine) + + if (count_duplicates) + summary <- c(summary, count_duplicates, nb_ions) + + # return (list(summary, settings)) + return (summary) + +} + + +############ +# graphs # +############ + +ions_per_intensity <- function(indicators) { + # For each test, plots the number of ions for each range of intensity. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + + pdf(intensity_graph_out) + par(mar = c(5.1,4.1,5,2.1)) + + for (i in 1:nrow(indicators)) { + + indic <- indicators[i,] + + names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7") + intensities_y <- c() + + for (colname in names) { + intensities_y <- c(intensities_y, indic[colname]) + } + intensities_y <- unlist(intensities_y) + intensities_y <- as.numeric(levels(intensities_y))[intensities_y] + + title <- as.character(indic$title) + title <- strsplit(title, "_")[[1]] + plot_title <- "" + for (i in 1:length(title)) { + plot_title <- paste(plot_title, title[[i]]) + if (i %% 2 == 0) + plot_title <- paste(plot_title, "\n") + else + plot_title <- paste(plot_title, " ") + } + + barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions", + main = plot_title, cex.main = 0.8) + + } + + dev.off() + +} + + +graph_results <- function(indicators, criteria) { + # Plots the results. + # + # Args: + # indicators: dataframe (one row per test) containing the results. + + first_criteria <- grep("nb_pseudospectra", names(indicators)) + nb_params <- first_criteria - 2 + this_criteria <- grep(criteria, names(indicators)) + + # values taken by each parameter + values <- list() + for (i in 2:(1 + nb_params)) { + lev <- unique(indicators[,i]) + values[[i - 1]] <- lev + } + + length <- lapply(values, length) + + # indexes of the parameters taking the most values + longest_1 <- which.max(length); length[longest_1] <- -1 + longest_2 <- which.max(length); length[longest_2] <- -1 + longest_3 <- which.max(length); length[longest_3] <- -1 + + # indexes of the other parameters + shortest <- which(length != -1) + + # all combinations of the values taken by these parameters + combinations <- list() + for (s in shortest) { + page <- indicators[,s + 1] + combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page)))) + } + names(combinations) <- names(indicators)[shortest + 1] + combinations <- expand.grid(combinations) + + # save the plots in a pdf file + pdf(compare_graph_out) + nb_rows <- length(values[longest_3][[1]]) + nb_cols <- length(values[longest_2][[1]]) + par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1)) + + # plot parameters + x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1])))) + min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line))) + max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line))) + + for (rowi in 1:nrow(combinations)) { + row <- combinations[rowi,] + title <- "" + lines <- indicators + + for (coli in 1:length(row)) { + value <- as.numeric(as.character(row[coli])) + title <- paste(title, names(row[coli]), "=", value) + if (coli %% 2 == 0) + title <- paste(title, "\n") + else + title <- paste(title, " ") + lines <- lines[lines[names(row[coli])] == value,] + } + + # values taken by the parameters + in_plot <- lines[,longest_1 + 1] + vertical <- lines[,longest_2 + 1] + horizontal <- lines[,longest_3 + 1] + + # for each horizontal value + for (horiz in sort(unique(horizontal))) { + + # for each vertical value + for (vertic in sort(unique(vertical))) { + + y <- c() + for (this_y in sort(unique(in_plot))) { + # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,] + line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,] + if (nrow(line) != 1) + stop("To plot the results, each set of the parameters' values must be represented exactly 1 time") + + value <- line[,this_criteria] + value <- as.numeric(as.character(value)) + y <- c(y, value) + } + + this_title <- paste(title, + names(indicators)[longest_3 + 1], "=", horiz, "\n", + names(indicators)[longest_2 + 1], "=", vertic) + + # plot this graph + plot(x, y, ylim = c(min_zeros, max_zeros), + type = "b", + xlab = names(indicators)[longest_1 + 1], + ylab = criteria, + main = this_title, + cex.main = 0.8) + + } + + } + + } + + par(mfrow = c(1,1)) + dev.off() + +} + + +############ +# main # +############ + +update_list_of_file_names() +summary <- create_summary(nb_cores, count_duplicates = count_duplicates) + +# settings_list <- summary[2][[1]] +# indicators_ <- summary[1][[1]] +indicators_ <- summary + +indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line +# indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers + +# record the summary in a file +indicators_to_write <- indicators[, !names(indicators) %in% + c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5", + "f10E5to10E6", "f10E6to10E7", "f10E7")] +write.table(indicators_to_write, + file = summary_out, + quote = FALSE, + row.names = FALSE, + sep = "\t") + +# ions per intensity graph +ions_per_intensity(indicators_) + +# plots to compare each set of parameters +graph_results(indicators_, criteria = "zeros_per_line") + +# zip of pseudospectra .msp and .tsv files +system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"')) +system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out)) + +# zip of count_duplicates .tsv files +if (count_duplicates) { + system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"')) + system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out)) +} + diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test_settings.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test_settings.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,227 @@ +# author: Pauline Ribeyre + + +##################### +# required packages # +##################### + +library("metaMS") # provides "metaMSsettings" class + + +#################### +# default settings # +#################### + +# PeakPicking +PeakPicking.method = "matchedFilter" +PeakPicking.step = 0.05 +PeakPicking.steps = 2 +PeakPicking.mzdiff = 0.5 +PeakPicking.fwhm = 5 +PeakPicking.snthresh = 2 +PeakPicking.max = 500 + +# Alignment +# ? + +# CAMERA +CAMERA.perfwhm = 1 + +# DBconstruction +DBconstruction.minintens = 0.0 +DBconstruction.rttol = 0.1 +DBconstruction.intensityMeasure = "maxo" +DBconstruction.DBthreshold = .80 +DBconstruction.minfeat = 5 + +# match2DB +match2DB.rtdiff = 0.5 +match2DB.minfeat = 5 +# match2DB.rtval = ? +# match2DB.mzdiff = ? +# match2DB.ppm = ? +match2DB.simthresh = 0.8 +match2DB.timeComparison = "rt" +match2DB.RIdiff = 5 + +# betweenSamples +betweenSamples.min.class.fraction = 0.8 # pools +# betweenSamples.min.class.fraction = 0.5 # samples +betweenSamples.min.class.size = 5 +betweenSamples.timeComparison = "rt" +betweenSamples.rtdiff = 0.5 +betweenSamples.RIdiff = 2 # default setting +# betweenSamples.RIdiff = 0.05 # w4m setting +betweenSamples.simthresh = 0.8 + +# matchIrrelevants +# matchIrrelevants.irrelevantClasses = c("Bleeding", "Plasticizers") +# matchIrrelevants.RIdiff = ? +# matchIrrelevants.rtdiff = ? +# matchIrrelevants.simthresh = ? +# matchIrrelevants.timeComparison = "RI" + +# TODO RI option + + +################### +# settings object # +################### + +# settings <- metaMSsettings("protocolName" = "settings", "chrom" = "GC") + +settings <- metaMSsettings("protocolName" = "settings", + "chrom" = "GC", + + PeakPicking = list( + method = PeakPicking.method, + step = PeakPicking.step, + steps = PeakPicking.steps, + mzdiff = PeakPicking.mzdiff, + fwhm = PeakPicking.fwhm, + snthresh = PeakPicking.snthresh, + max = PeakPicking.max), + + CAMERA = list(perfwhm = CAMERA.perfwhm)) + +metaSetting(settings, "DBconstruction") <- list( + minintens = DBconstruction.minintens, + rttol = DBconstruction.rttol, + intensityMeasure = DBconstruction.intensityMeasure, + DBthreshold = DBconstruction.DBthreshold, + minfeat = DBconstruction.minfeat) + +metaSetting(settings, "match2DB") <- list( + simthresh = match2DB.simthresh, + timeComparison = match2DB.timeComparison, + rtdiff = match2DB.rtdiff, + RIdiff = match2DB.RIdiff, + minfeat = match2DB.minfeat) + +metaSetting(settings, "betweenSamples") <- list( + min.class.fraction = betweenSamples.min.class.fraction, + min.class.size = betweenSamples.min.class.size, + timeComparison = betweenSamples.timeComparison, + rtdiff = betweenSamples.rtdiff, + RIdiff = betweenSamples.RIdiff, + simthresh = betweenSamples.simthresh) + +# metaSetting(GALAXY.GC, "matchIrrelevants") <- list( +# irrelevantClasses = matchIrrelevants.irrelevantClasses, +# RIdiff = matchIrrelevants.RIdiff, +# rtdiff = matchIrrelevants.rtdiff, +# simthresh = matchIrrelevants.simthresh, +# timeComparison = matchIrrelevants.timeComparison) + + +# best settings pool +settings@betweenSamples.min.class.size = 5 +settings@PeakPicking$fwhm = 6 +settings@PeakPicking$mzdiff = -0.5 +settings@betweenSamples.min.class.fraction = 0.8 +settings@betweenSamples.simthresh = 0.7 +settings@betweenSamples.rtdiff = 0.05 + + +# best settings samples +settings@betweenSamples.min.class.size = 5 +settings@PeakPicking$fwhm = 6 +settings@PeakPicking$mzdiff = 0.5 +settings@betweenSamples.min.class.fraction = 0.5 +settings@betweenSamples.simthresh = 0.7 +settings@betweenSamples.rtdiff = 0.05 + + +################# +# vary settings # +################# + +pop <- function(list) { + # Removes the last item from a list. + # + # Args: + # list: the list to pop. + # + # Returns: + # The last item of the list before it was removed. + + val <- list[length(list)] + assign(as.character(substitute(list)), list[-length(list)], parent.frame()) + return (val) + +} + + +vary_list_to_settings <- function(settings, vary_list, title = "", titles_to_test = c(), settings_to_test = c()) { + # Reads the list of parameters to vary and their ranges and builds a list of file titles and settings from it. + # + # Args: + # settings: default settings for runGC. + # vary_list: list of parameters to vary and the values each parameter must take. + # title: title of the current file (concatenation of the values taken by the varied parameters). + # titles_to_test: list of titles (one for each settings set) (concatenation of the values taken by the varied parameters). + # settings_to_test: list of settings sets for runGC. + # + # Returns: + # A list containing: + # 1: the list of file titles ; + # 2: the list of runGC() settings. + + if (length(vary_list) == 0) { # if there are no parameters to vary + + titles_to_test <- c(titles_to_test, "default_settings") + settings_to_test <- c(settings_to_test, settings) + + } + + else { + + param <- pop(vary_list)[[1]] + + title <- paste0(title, param[1]) + setting <- param[2] + range <- as.numeric(param[3:length(param)]) + + # for each possible value of the parameter + for (p in range) { + + # change the settings + isSlot <- regexpr("\\$", setting)[1] + + if (isSlot == -1) { # the parameter is a slot in class metaMSsettings + slot(settings, setting) <- p + } + else { # the parameter is not a slot in class metaMSsettings + slot(settings, substr(setting, 1, isSlot - 1))[substr(setting, isSlot + 1, nchar(setting))] <- p + } + + new_title <- paste0(title, "=", p, "_") + + # save the settings in a list + if (length(vary_list) == 0) { + titles_to_test <- c(titles_to_test, new_title) + settings_to_test <- c(settings_to_test, settings) + } + else { + res <- vary_list_to_settings(settings, vary_list, new_title, titles_to_test, settings_to_test) + titles_to_test <- res[[1]] + settings_to_test <- res[[2]] + } + + } # end for + + } # end else + + return (list(titles_to_test, settings_to_test)) + +} + + +############ +# main # +############ + +# settings to vary and ranges of values +res <- vary_list_to_settings(settings, vary_list) +titles_to_test <- res[[1]] +settings_to_test <- res[[2]] diff -r 000000000000 -r 40de28c7d3fb Testtest/GCMS-test_spectrum.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Testtest/GCMS-test_spectrum.R Thu Nov 23 08:50:14 2017 -0500 @@ -0,0 +1,135 @@ +# author: Pauline Ribeyre + + +############# +# find ions # +############# + +is_ion_present <- function(GC_results, rt, mz_list) { + # Looks for a ion in the pseudospectra. + # + # Args: + # GC_results: result of a test by runGC(). + # rt: retention time of the ion to look for. + # mz_list: list of the ion's isotopes' m/z. + # + # Returns: + # The number of times the ion was found in the pseudospectra. + + delta_rt <- ions_delta_rt + delta_mz <- ions_delta_mz + + res = 0 + + pseudospectra <- GC_results$PseudoSpectra + for (peak in pseudospectra) { # check all the peaks + if (peak$rt >= rt - delta_rt && peak$rt <= rt + delta_rt) { # check if the retention time matches + + ps <- peak$pspectrum + found_ion <- rep(NA, length(mz_list)) + + for (i in 1:length(mz_list)) { # check the presence of all the ions given in parameter + + ion <- mz_list[i][[1]] + found_ion[i] <- sum((ps[,"mz"] >= ion - delta_mz) & (ps[,"mz"] <= ion + delta_mz)) + + } # end for + + if (length(which(found_ion > 1)) > 0) { + stop("The ion was found more than 1 time in the same pseudospectrum.") + } + + if (sum(found_ion) == length(mz_list)) { + res <- res + 1 + } + + } # end if + } # end for + + return (res) + +} + + +############## +# duplicates # +############## + +count_duplicates_function <- function(GC_results) { + # Counts the number of ions that have been found in more than one pseudospectra (slow). + # + # Args: + # GC_results: result of a test by runGC(). + # + # Returns: + # The number of duplicates. + + delta_rt <- duplicates_delta_rt + delta_mz <- duplicates_delta_mz + + pseudospectra <- GC_results$PseudoSpectra + + # order by pseudospectrum rt + rts <- sapply(pseudospectra, function(x) x$rt) + rts_order <- order(rts) + + duplicates <- NULL # record all the duplicates we find + already_counted <- NULL # record which duplicates have already been counted + + for (pspec in 1:length(rts)) { # for each pseudospectrum + + i <- rts_order[pspec] + this_rt <- rts[i] + + # number of following pseudospectra to consider + k <- 0 + while (pspec + k + 1 < length(rts_order) && rts[rts_order[pspec + k + 1]] < this_rt + delta_rt) { # check if the retention time matches + k <- k + 1 + } + + mzs <- lapply(pseudospectra[rts_order[pspec:(pspec + k)]], function(x) x$pspectrum) + mzs_sorted <- NULL + for (j in 1:(k + 1)) { + mzs_sorted <- rbind(mzs_sorted, t(sapply(1:length(mzs[[j]][,1]), function(n) c(rts[rts_order[pspec + j - 1]], mzs[[j]][,1][n])))) + } + mzs_sorted <- mzs_sorted[order(mzs_sorted[,2]),] # column 1: rt ; column 2: m/z + + i <- 1 + while (i <= nrow(mzs_sorted)) { # for each m/z + + j <- 1 + this_already_counted <- nrow(merge(data.frame(t(mzs_sorted[i,])), data.frame(already_counted))) > 0 + + if (is.null(already_counted) || !this_already_counted) { + this_mz <- mzs_sorted[i,2] + already_counted <- rbind(already_counted, mzs_sorted[i,]) + same_ion <- 1 # count how many times this m/z appears + + while (j <= nrow(mzs_sorted) - i && mzs_sorted[i+j,2] >= this_mz - delta_mz && mzs_sorted[i+j,2] <= this_mz + delta_mz) { # check if the m/z matches + this_already_counted <- nrow(merge(data.frame(t(mzs_sorted[i+j,])), data.frame(already_counted))) > 0 + + if (!this_already_counted) { # do not count a duplicate more than once + same_ion <- same_ion + 1 + already_counted <- rbind(already_counted, mzs_sorted[i+j,]) # record which duplicates have already been counted + this_mz <- mzs_sorted[i+j,2] + } + + j <- j + 1 + } + + # first/min m/z, last/max m/z, rt where we started, number of duplicates + if (same_ion > 1) + duplicates <- rbind(duplicates, c(mzs_sorted[i,2], this_mz, mzs_sorted[i,1], same_ion)) + } + i <- i + j + } + + } + + if (is.null(duplicates)) + duplicates <- c(0, 0, 0, 0) + + return (duplicates) + +} +