comparison Testtest/GCMS-test_output.R @ 0:40de28c7d3fb draft

Uploaded
author melpetera
date Thu, 23 Nov 2017 08:50:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:40de28c7d3fb
1 # author: Pauline Ribeyre
2
3
4 #####################
5 # required packages #
6 #####################
7
8 library("parallel") # provides cluster methods and "parLapplyLB" function
9 library("grDevices") # provides jpeg handling methods
10
11
12 ############
13 # data #
14 ############
15
16 update_list_of_file_names <- function() {
17 # Writes in a file the list of file names using the files created previously in ./RDatas.
18
19 file.create(source_file_names, showWarnings = FALSE) # erase the file
20 directory <- "RDatas"
21 files <- list.files(directory, full.names = TRUE)
22
23 for (f in files) {
24 f <- substr(f, 8, nchar(f) - 6)
25 write(f, file = source_file_names, append = TRUE)
26 }
27
28 }
29
30
31 titles_to_columns <- function(indicators) {
32 # Parse the "title" column of the indicator dataframe to separate the different parameters and their values.
33 #
34 # Args:
35 # indicators: dataframe (one row per test) containing the results.
36 #
37 # Returns:
38 # new_indicators: copy of "indicators" with one column added for each parameter that varied.
39
40 default_settings <- FALSE
41
42 # parse the title
43 param_names <- c()
44 for (title in indicators$title) {
45
46 if (title == "default_settings") {
47 default_settings <- TRUE
48 break
49 }
50
51 else {
52 params <- strsplit(title, "_")[[1]]
53 for (param in params) {
54 name <- strsplit(param, "=")[[1]][1]
55
56 # some lowercase values create errors because they are primitive functions
57 substr(name, 1, 1) <- toupper(substr(name, 1, 1))
58
59 if (!name %in% param_names)
60 param_names <- c(param_names, name)
61 value <- strsplit(param, "=")[[1]][2]
62 if (!exists(name))
63 assign(name, c())
64 assign(name, c(get(name), value))
65 }
66 }
67
68 }
69
70 new_indicators <- indicators
71
72 # add the columns to the dataframe
73 if (!default_settings) {
74 for (name in param_names) {
75 new_indicators <- cbind(new_indicators, get(name))
76 names(new_indicators)[ncol(new_indicators)] <- name
77 }
78
79 # order the dataframe's columns
80 order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names)))
81 new_indicators <- new_indicators[, order]
82 }
83
84 return (new_indicators)
85
86 }
87
88
89 create_summary <- function(nb_cores, count_duplicates = FALSE) {
90 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
91 #
92 # Args:
93 # nb_cores: maximum number of cores to use.
94 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
95 #
96 # Returns:
97 # A dataframe (one row per test) containing the results:
98 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
99 #
100 # # Returns:
101 # # A list containing:
102 # # 1: dataframe (one row per test) containing the results:
103 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
104 # # 2: list of the settings sets for each test.
105
106 time.start <- Sys.time() # start the timer
107
108 file_names <- readLines(source_file_names)
109
110 dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved
111
112 # summ <- create_summary_parallel(1, file_names)
113 # cat("indic:\n",summary[1][[1]],"\n")
114
115 # run the function on several cores
116 if (length(file_names) != 0) {
117 if (length(file_names) < nb_cores)
118 nb_cores <- length(file_names)
119 cluster <- makeCluster(nb_cores) #, outfile = "")
120 summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb)
121 stopCluster(cluster)
122 }
123 else
124 stop("There are no files to generate the output from.")
125
126 # concatenate the results obtained by all the cores
127 indicators <- NULL
128 for (summary in summaries)
129 indicators <- rbind(indicators, t(summary))
130 # settings_list <- c()
131 # for (summary in summaries) {
132 # settings <- summary[2][[1]]
133 # indic <- summary[1][[1]]
134 #
135 # indicators <- rbind(indicators, t(indic))
136 # settings_list <- c(settings_list, settings)
137 # }
138 indicators <- data.frame(indicators)
139
140 col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
141 "f10E5to10E6", "f10E6to10E7", "f10E7")
142
143 if (check_ions)
144 col_names <- c(col_names, ions_name)
145
146 names(indicators)[1:length(col_names)] <- col_names
147 if (count_duplicates)
148 names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions")
149
150 indicators <- titles_to_columns(indicators)
151
152 time.end <- Sys.time() # stop the timer
153 Tdiff <- difftime(time.end, time.start)
154 print(Tdiff)
155
156 # return (list(indicators, settings_list))
157 return (indicators)
158
159 }
160
161
162 create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) {
163 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
164 #
165 # Args:
166 # n: index of the current test, to select the corresponding title.
167 # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters).
168 # nb_cores: maximum number of cores to use.
169 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
170 #
171 # Returns:
172 # A dataframe (one row per test) containing the results:
173 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
174 #
175 # # Returns:
176 # # A list containing:
177 # # 1: dataframe (one row per test) containing the results:
178 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
179 # # 2: list of the settings sets for each test.
180
181 source(source_spectrum, environment())
182
183 intensities_x <- c(1000, 10000, 100000, 1000000, 10000000)
184
185 this_title <- file_names[n]
186
187 # calculate the number of zeros
188 file_title <- paste0("Peak_tables/", this_title, ".tsv")
189 peak_table <- read.table(file_title, sep="\t", header=TRUE)
190 peak_table_values <- peak_table[,5:ncol(peak_table)]
191 zeros <- sum(peak_table$nb_zeros)
192 nb_lines <- nrow(peak_table)
193 zeros_per_line <- round(zeros/nb_lines, 4)
194
195 # count the number of ions by intensity range
196 intensities_y <- c()
197 intensities_y[1] <- length(rowMeans(
198 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]])
199 for (i in 2:length(intensities_x))
200 intensities_y[i] <- length(rowMeans(
201 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] &
202 rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]])
203 intensities_y[i + 1] <- length(rowMeans(
204 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]])
205
206 # load the settings
207 file_title <- paste0("RDatas/", this_title, ".RData")
208 load(file_title)
209
210 # count the number of count_duplicates and record in a file
211 if (count_duplicates) {
212 file_title <- paste0("count_duplicates/", this_title, ".tsv")
213 nb_count_duplicates <- data.frame(count_duplicates_function(GC_results))
214 names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates")
215 write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE)
216 count_duplicates <- nrow(nb_count_duplicates)
217
218 nb_ions <- 0
219 pseudospectra <- GC_results$PseudoSpectra
220 for (ps in pseudospectra) {
221 nb_ions <- nb_ions + nrow(ps$pspectrum)
222 }
223 }
224
225 summary <- c(this_title, nb_lines, zeros_per_line, intensities_y)
226
227 # check the presence of ions
228 if (nb_ions_to_check > 0) {
229 for (i in 1:nb_ions_to_check) {
230 name <- ions_name[[i]]
231 rt <- ions_rt[[i]]
232 mzs <- ions_mzs[[i]]
233 cat("Check:", name, rt, mzs, "\n")
234
235 value <- is_ion_present(GC_results, rt = rt, mz_list = mzs)
236 assign(name, value)
237 summary <- c(summary, get(name))
238 }
239 }
240
241 # check the presence of ion valine 12C and 13C
242 # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146))
243
244 # summary <- c(summary, valine)
245
246 if (count_duplicates)
247 summary <- c(summary, count_duplicates, nb_ions)
248
249 # return (list(summary, settings))
250 return (summary)
251
252 }
253
254
255 ############
256 # graphs #
257 ############
258
259 ions_per_intensity <- function(indicators) {
260 # For each test, plots the number of ions for each range of intensity.
261 #
262 # Args:
263 # indicators: dataframe (one row per test) containing the results.
264
265 pdf(intensity_graph_out)
266 par(mar = c(5.1,4.1,5,2.1))
267
268 for (i in 1:nrow(indicators)) {
269
270 indic <- indicators[i,]
271
272 names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7")
273 intensities_y <- c()
274
275 for (colname in names) {
276 intensities_y <- c(intensities_y, indic[colname])
277 }
278 intensities_y <- unlist(intensities_y)
279 intensities_y <- as.numeric(levels(intensities_y))[intensities_y]
280
281 title <- as.character(indic$title)
282 title <- strsplit(title, "_")[[1]]
283 plot_title <- ""
284 for (i in 1:length(title)) {
285 plot_title <- paste(plot_title, title[[i]])
286 if (i %% 2 == 0)
287 plot_title <- paste(plot_title, "\n")
288 else
289 plot_title <- paste(plot_title, " ")
290 }
291
292 barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions",
293 main = plot_title, cex.main = 0.8)
294
295 }
296
297 dev.off()
298
299 }
300
301
302 graph_results <- function(indicators, criteria) {
303 # Plots the results.
304 #
305 # Args:
306 # indicators: dataframe (one row per test) containing the results.
307
308 first_criteria <- grep("nb_pseudospectra", names(indicators))
309 nb_params <- first_criteria - 2
310 this_criteria <- grep(criteria, names(indicators))
311
312 # values taken by each parameter
313 values <- list()
314 for (i in 2:(1 + nb_params)) {
315 lev <- unique(indicators[,i])
316 values[[i - 1]] <- lev
317 }
318
319 length <- lapply(values, length)
320
321 # indexes of the parameters taking the most values
322 longest_1 <- which.max(length); length[longest_1] <- -1
323 longest_2 <- which.max(length); length[longest_2] <- -1
324 longest_3 <- which.max(length); length[longest_3] <- -1
325
326 # indexes of the other parameters
327 shortest <- which(length != -1)
328
329 # all combinations of the values taken by these parameters
330 combinations <- list()
331 for (s in shortest) {
332 page <- indicators[,s + 1]
333 combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page))))
334 }
335 names(combinations) <- names(indicators)[shortest + 1]
336 combinations <- expand.grid(combinations)
337
338 # save the plots in a pdf file
339 pdf(compare_graph_out)
340 nb_rows <- length(values[longest_3][[1]])
341 nb_cols <- length(values[longest_2][[1]])
342 par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1))
343
344 # plot parameters
345 x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1]))))
346 min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line)))
347 max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line)))
348
349 for (rowi in 1:nrow(combinations)) {
350 row <- combinations[rowi,]
351 title <- ""
352 lines <- indicators
353
354 for (coli in 1:length(row)) {
355 value <- as.numeric(as.character(row[coli]))
356 title <- paste(title, names(row[coli]), "=", value)
357 if (coli %% 2 == 0)
358 title <- paste(title, "\n")
359 else
360 title <- paste(title, " ")
361 lines <- lines[lines[names(row[coli])] == value,]
362 }
363
364 # values taken by the parameters
365 in_plot <- lines[,longest_1 + 1]
366 vertical <- lines[,longest_2 + 1]
367 horizontal <- lines[,longest_3 + 1]
368
369 # for each horizontal value
370 for (horiz in sort(unique(horizontal))) {
371
372 # for each vertical value
373 for (vertic in sort(unique(vertical))) {
374
375 y <- c()
376 for (this_y in sort(unique(in_plot))) {
377 # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,]
378 line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,]
379 if (nrow(line) != 1)
380 stop("To plot the results, each set of the parameters' values must be represented exactly 1 time")
381
382 value <- line[,this_criteria]
383 value <- as.numeric(as.character(value))
384 y <- c(y, value)
385 }
386
387 this_title <- paste(title,
388 names(indicators)[longest_3 + 1], "=", horiz, "\n",
389 names(indicators)[longest_2 + 1], "=", vertic)
390
391 # plot this graph
392 plot(x, y, ylim = c(min_zeros, max_zeros),
393 type = "b",
394 xlab = names(indicators)[longest_1 + 1],
395 ylab = criteria,
396 main = this_title,
397 cex.main = 0.8)
398
399 }
400
401 }
402
403 }
404
405 par(mfrow = c(1,1))
406 dev.off()
407
408 }
409
410
411 ############
412 # main #
413 ############
414
415 update_list_of_file_names()
416 summary <- create_summary(nb_cores, count_duplicates = count_duplicates)
417
418 # settings_list <- summary[2][[1]]
419 # indicators_ <- summary[1][[1]]
420 indicators_ <- summary
421
422 indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line
423 # indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers
424
425 # record the summary in a file
426 indicators_to_write <- indicators[, !names(indicators) %in%
427 c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
428 "f10E5to10E6", "f10E6to10E7", "f10E7")]
429 write.table(indicators_to_write,
430 file = summary_out,
431 quote = FALSE,
432 row.names = FALSE,
433 sep = "\t")
434
435 # ions per intensity graph
436 ions_per_intensity(indicators_)
437
438 # plots to compare each set of parameters
439 graph_results(indicators_, criteria = "zeros_per_line")
440
441 # zip of pseudospectra .msp and .tsv files
442 system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"'))
443 system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out))
444
445 # zip of count_duplicates .tsv files
446 if (count_duplicates) {
447 system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"'))
448 system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out))
449 }
450