0
|
1 # author: Pauline Ribeyre
|
|
2
|
|
3
|
|
4 #####################
|
|
5 # required packages #
|
|
6 #####################
|
|
7
|
|
8 library("parallel") # provides cluster methods and "parLapplyLB" function
|
|
9 library("grDevices") # provides jpeg handling methods
|
|
10
|
|
11
|
|
12 ############
|
|
13 # data #
|
|
14 ############
|
|
15
|
|
16 update_list_of_file_names <- function() {
|
|
17 # Writes in a file the list of file names using the files created previously in ./RDatas.
|
|
18
|
|
19 file.create(source_file_names, showWarnings = FALSE) # erase the file
|
|
20 directory <- "RDatas"
|
|
21 files <- list.files(directory, full.names = TRUE)
|
|
22
|
|
23 for (f in files) {
|
|
24 f <- substr(f, 8, nchar(f) - 6)
|
|
25 write(f, file = source_file_names, append = TRUE)
|
|
26 }
|
|
27
|
|
28 }
|
|
29
|
|
30
|
|
31 titles_to_columns <- function(indicators) {
|
|
32 # Parse the "title" column of the indicator dataframe to separate the different parameters and their values.
|
|
33 #
|
|
34 # Args:
|
|
35 # indicators: dataframe (one row per test) containing the results.
|
|
36 #
|
|
37 # Returns:
|
|
38 # new_indicators: copy of "indicators" with one column added for each parameter that varied.
|
|
39
|
|
40 default_settings <- FALSE
|
|
41
|
|
42 # parse the title
|
|
43 param_names <- c()
|
|
44 for (title in indicators$title) {
|
|
45
|
|
46 if (title == "default_settings") {
|
|
47 default_settings <- TRUE
|
|
48 break
|
|
49 }
|
|
50
|
|
51 else {
|
|
52 params <- strsplit(title, "_")[[1]]
|
|
53 for (param in params) {
|
|
54 name <- strsplit(param, "=")[[1]][1]
|
|
55
|
|
56 # some lowercase values create errors because they are primitive functions
|
|
57 substr(name, 1, 1) <- toupper(substr(name, 1, 1))
|
|
58
|
|
59 if (!name %in% param_names)
|
|
60 param_names <- c(param_names, name)
|
|
61 value <- strsplit(param, "=")[[1]][2]
|
|
62 if (!exists(name))
|
|
63 assign(name, c())
|
|
64 assign(name, c(get(name), value))
|
|
65 }
|
|
66 }
|
|
67
|
|
68 }
|
|
69
|
|
70 new_indicators <- indicators
|
|
71
|
|
72 # add the columns to the dataframe
|
|
73 if (!default_settings) {
|
|
74 for (name in param_names) {
|
|
75 new_indicators <- cbind(new_indicators, get(name))
|
|
76 names(new_indicators)[ncol(new_indicators)] <- name
|
|
77 }
|
|
78
|
|
79 # order the dataframe's columns
|
|
80 order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names)))
|
|
81 new_indicators <- new_indicators[, order]
|
|
82 }
|
|
83
|
|
84 return (new_indicators)
|
|
85
|
|
86 }
|
|
87
|
|
88
|
|
89 create_summary <- function(nb_cores, count_duplicates = FALSE) {
|
|
90 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
|
|
91 #
|
|
92 # Args:
|
|
93 # nb_cores: maximum number of cores to use.
|
|
94 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
|
|
95 #
|
|
96 # Returns:
|
|
97 # A dataframe (one row per test) containing the results:
|
|
98 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
|
|
99 #
|
|
100 # # Returns:
|
|
101 # # A list containing:
|
|
102 # # 1: dataframe (one row per test) containing the results:
|
|
103 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt).
|
|
104 # # 2: list of the settings sets for each test.
|
|
105
|
|
106 time.start <- Sys.time() # start the timer
|
|
107
|
|
108 file_names <- readLines(source_file_names)
|
|
109
|
|
110 dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved
|
|
111
|
|
112 # summ <- create_summary_parallel(1, file_names)
|
|
113 # cat("indic:\n",summary[1][[1]],"\n")
|
|
114
|
|
115 # run the function on several cores
|
|
116 if (length(file_names) != 0) {
|
|
117 if (length(file_names) < nb_cores)
|
|
118 nb_cores <- length(file_names)
|
|
119 cluster <- makeCluster(nb_cores) #, outfile = "")
|
|
120 summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb)
|
|
121 stopCluster(cluster)
|
|
122 }
|
|
123 else
|
|
124 stop("There are no files to generate the output from.")
|
|
125
|
|
126 # concatenate the results obtained by all the cores
|
|
127 indicators <- NULL
|
|
128 for (summary in summaries)
|
|
129 indicators <- rbind(indicators, t(summary))
|
|
130 # settings_list <- c()
|
|
131 # for (summary in summaries) {
|
|
132 # settings <- summary[2][[1]]
|
|
133 # indic <- summary[1][[1]]
|
|
134 #
|
|
135 # indicators <- rbind(indicators, t(indic))
|
|
136 # settings_list <- c(settings_list, settings)
|
|
137 # }
|
|
138 indicators <- data.frame(indicators)
|
|
139
|
|
140 col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
|
|
141 "f10E5to10E6", "f10E6to10E7", "f10E7")
|
|
142
|
|
143 if (check_ions)
|
|
144 col_names <- c(col_names, ions_name)
|
|
145
|
|
146 names(indicators)[1:length(col_names)] <- col_names
|
|
147 if (count_duplicates)
|
|
148 names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions")
|
|
149
|
|
150 indicators <- titles_to_columns(indicators)
|
|
151
|
|
152 time.end <- Sys.time() # stop the timer
|
|
153 Tdiff <- difftime(time.end, time.start)
|
|
154 print(Tdiff)
|
|
155
|
|
156 # return (list(indicators, settings_list))
|
|
157 return (indicators)
|
|
158
|
|
159 }
|
|
160
|
|
161
|
|
162 create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) {
|
|
163 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test.
|
|
164 #
|
|
165 # Args:
|
|
166 # n: index of the current test, to select the corresponding title.
|
|
167 # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters).
|
|
168 # nb_cores: maximum number of cores to use.
|
|
169 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow).
|
|
170 #
|
|
171 # Returns:
|
|
172 # A dataframe (one row per test) containing the results:
|
|
173 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
|
|
174 #
|
|
175 # # Returns:
|
|
176 # # A list containing:
|
|
177 # # 1: dataframe (one row per test) containing the results:
|
|
178 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine.
|
|
179 # # 2: list of the settings sets for each test.
|
|
180
|
|
181 source(source_spectrum, environment())
|
|
182
|
|
183 intensities_x <- c(1000, 10000, 100000, 1000000, 10000000)
|
|
184
|
|
185 this_title <- file_names[n]
|
|
186
|
|
187 # calculate the number of zeros
|
|
188 file_title <- paste0("Peak_tables/", this_title, ".tsv")
|
|
189 peak_table <- read.table(file_title, sep="\t", header=TRUE)
|
|
190 peak_table_values <- peak_table[,5:ncol(peak_table)]
|
|
191 zeros <- sum(peak_table$nb_zeros)
|
|
192 nb_lines <- nrow(peak_table)
|
|
193 zeros_per_line <- round(zeros/nb_lines, 4)
|
|
194
|
|
195 # count the number of ions by intensity range
|
|
196 intensities_y <- c()
|
|
197 intensities_y[1] <- length(rowMeans(
|
|
198 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]])
|
|
199 for (i in 2:length(intensities_x))
|
|
200 intensities_y[i] <- length(rowMeans(
|
|
201 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] &
|
|
202 rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]])
|
|
203 intensities_y[i + 1] <- length(rowMeans(
|
|
204 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]])
|
|
205
|
|
206 # load the settings
|
|
207 file_title <- paste0("RDatas/", this_title, ".RData")
|
|
208 load(file_title)
|
|
209
|
|
210 # count the number of count_duplicates and record in a file
|
|
211 if (count_duplicates) {
|
|
212 file_title <- paste0("count_duplicates/", this_title, ".tsv")
|
|
213 nb_count_duplicates <- data.frame(count_duplicates_function(GC_results))
|
|
214 names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates")
|
|
215 write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE)
|
|
216 count_duplicates <- nrow(nb_count_duplicates)
|
|
217
|
|
218 nb_ions <- 0
|
|
219 pseudospectra <- GC_results$PseudoSpectra
|
|
220 for (ps in pseudospectra) {
|
|
221 nb_ions <- nb_ions + nrow(ps$pspectrum)
|
|
222 }
|
|
223 }
|
|
224
|
|
225 summary <- c(this_title, nb_lines, zeros_per_line, intensities_y)
|
|
226
|
|
227 # check the presence of ions
|
|
228 if (nb_ions_to_check > 0) {
|
|
229 for (i in 1:nb_ions_to_check) {
|
|
230 name <- ions_name[[i]]
|
|
231 rt <- ions_rt[[i]]
|
|
232 mzs <- ions_mzs[[i]]
|
|
233 cat("Check:", name, rt, mzs, "\n")
|
|
234
|
|
235 value <- is_ion_present(GC_results, rt = rt, mz_list = mzs)
|
|
236 assign(name, value)
|
|
237 summary <- c(summary, get(name))
|
|
238 }
|
|
239 }
|
|
240
|
|
241 # check the presence of ion valine 12C and 13C
|
|
242 # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146))
|
|
243
|
|
244 # summary <- c(summary, valine)
|
|
245
|
|
246 if (count_duplicates)
|
|
247 summary <- c(summary, count_duplicates, nb_ions)
|
|
248
|
|
249 # return (list(summary, settings))
|
|
250 return (summary)
|
|
251
|
|
252 }
|
|
253
|
|
254
|
|
255 ############
|
|
256 # graphs #
|
|
257 ############
|
|
258
|
|
259 ions_per_intensity <- function(indicators) {
|
|
260 # For each test, plots the number of ions for each range of intensity.
|
|
261 #
|
|
262 # Args:
|
|
263 # indicators: dataframe (one row per test) containing the results.
|
|
264
|
|
265 pdf(intensity_graph_out)
|
|
266 par(mar = c(5.1,4.1,5,2.1))
|
|
267
|
|
268 for (i in 1:nrow(indicators)) {
|
|
269
|
|
270 indic <- indicators[i,]
|
|
271
|
|
272 names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7")
|
|
273 intensities_y <- c()
|
|
274
|
|
275 for (colname in names) {
|
|
276 intensities_y <- c(intensities_y, indic[colname])
|
|
277 }
|
|
278 intensities_y <- unlist(intensities_y)
|
|
279 intensities_y <- as.numeric(levels(intensities_y))[intensities_y]
|
|
280
|
|
281 title <- as.character(indic$title)
|
|
282 title <- strsplit(title, "_")[[1]]
|
|
283 plot_title <- ""
|
|
284 for (i in 1:length(title)) {
|
|
285 plot_title <- paste(plot_title, title[[i]])
|
|
286 if (i %% 2 == 0)
|
|
287 plot_title <- paste(plot_title, "\n")
|
|
288 else
|
|
289 plot_title <- paste(plot_title, " ")
|
|
290 }
|
|
291
|
|
292 barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions",
|
|
293 main = plot_title, cex.main = 0.8)
|
|
294
|
|
295 }
|
|
296
|
|
297 dev.off()
|
|
298
|
|
299 }
|
|
300
|
|
301
|
|
302 graph_results <- function(indicators, criteria) {
|
|
303 # Plots the results.
|
|
304 #
|
|
305 # Args:
|
|
306 # indicators: dataframe (one row per test) containing the results.
|
|
307
|
|
308 first_criteria <- grep("nb_pseudospectra", names(indicators))
|
|
309 nb_params <- first_criteria - 2
|
|
310 this_criteria <- grep(criteria, names(indicators))
|
|
311
|
|
312 # values taken by each parameter
|
|
313 values <- list()
|
|
314 for (i in 2:(1 + nb_params)) {
|
|
315 lev <- unique(indicators[,i])
|
|
316 values[[i - 1]] <- lev
|
|
317 }
|
|
318
|
|
319 length <- lapply(values, length)
|
|
320
|
|
321 # indexes of the parameters taking the most values
|
|
322 longest_1 <- which.max(length); length[longest_1] <- -1
|
|
323 longest_2 <- which.max(length); length[longest_2] <- -1
|
|
324 longest_3 <- which.max(length); length[longest_3] <- -1
|
|
325
|
|
326 # indexes of the other parameters
|
|
327 shortest <- which(length != -1)
|
|
328
|
|
329 # all combinations of the values taken by these parameters
|
|
330 combinations <- list()
|
|
331 for (s in shortest) {
|
|
332 page <- indicators[,s + 1]
|
|
333 combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page))))
|
|
334 }
|
|
335 names(combinations) <- names(indicators)[shortest + 1]
|
|
336 combinations <- expand.grid(combinations)
|
|
337
|
|
338 # save the plots in a pdf file
|
|
339 pdf(compare_graph_out)
|
|
340 nb_rows <- length(values[longest_3][[1]])
|
|
341 nb_cols <- length(values[longest_2][[1]])
|
|
342 par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1))
|
|
343
|
|
344 # plot parameters
|
|
345 x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1]))))
|
|
346 min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line)))
|
|
347 max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line)))
|
|
348
|
|
349 for (rowi in 1:nrow(combinations)) {
|
|
350 row <- combinations[rowi,]
|
|
351 title <- ""
|
|
352 lines <- indicators
|
|
353
|
|
354 for (coli in 1:length(row)) {
|
|
355 value <- as.numeric(as.character(row[coli]))
|
|
356 title <- paste(title, names(row[coli]), "=", value)
|
|
357 if (coli %% 2 == 0)
|
|
358 title <- paste(title, "\n")
|
|
359 else
|
|
360 title <- paste(title, " ")
|
|
361 lines <- lines[lines[names(row[coli])] == value,]
|
|
362 }
|
|
363
|
|
364 # values taken by the parameters
|
|
365 in_plot <- lines[,longest_1 + 1]
|
|
366 vertical <- lines[,longest_2 + 1]
|
|
367 horizontal <- lines[,longest_3 + 1]
|
|
368
|
|
369 # for each horizontal value
|
|
370 for (horiz in sort(unique(horizontal))) {
|
|
371
|
|
372 # for each vertical value
|
|
373 for (vertic in sort(unique(vertical))) {
|
|
374
|
|
375 y <- c()
|
|
376 for (this_y in sort(unique(in_plot))) {
|
|
377 # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,]
|
|
378 line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,]
|
|
379 if (nrow(line) != 1)
|
|
380 stop("To plot the results, each set of the parameters' values must be represented exactly 1 time")
|
|
381
|
|
382 value <- line[,this_criteria]
|
|
383 value <- as.numeric(as.character(value))
|
|
384 y <- c(y, value)
|
|
385 }
|
|
386
|
|
387 this_title <- paste(title,
|
|
388 names(indicators)[longest_3 + 1], "=", horiz, "\n",
|
|
389 names(indicators)[longest_2 + 1], "=", vertic)
|
|
390
|
|
391 # plot this graph
|
|
392 plot(x, y, ylim = c(min_zeros, max_zeros),
|
|
393 type = "b",
|
|
394 xlab = names(indicators)[longest_1 + 1],
|
|
395 ylab = criteria,
|
|
396 main = this_title,
|
|
397 cex.main = 0.8)
|
|
398
|
|
399 }
|
|
400
|
|
401 }
|
|
402
|
|
403 }
|
|
404
|
|
405 par(mfrow = c(1,1))
|
|
406 dev.off()
|
|
407
|
|
408 }
|
|
409
|
|
410
|
|
411 ############
|
|
412 # main #
|
|
413 ############
|
|
414
|
|
415 update_list_of_file_names()
|
|
416 summary <- create_summary(nb_cores, count_duplicates = count_duplicates)
|
|
417
|
|
418 # settings_list <- summary[2][[1]]
|
|
419 # indicators_ <- summary[1][[1]]
|
|
420 indicators_ <- summary
|
|
421
|
|
422 indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line
|
|
423 # indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers
|
|
424
|
|
425 # record the summary in a file
|
|
426 indicators_to_write <- indicators[, !names(indicators) %in%
|
|
427 c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5",
|
|
428 "f10E5to10E6", "f10E6to10E7", "f10E7")]
|
|
429 write.table(indicators_to_write,
|
|
430 file = summary_out,
|
|
431 quote = FALSE,
|
|
432 row.names = FALSE,
|
|
433 sep = "\t")
|
|
434
|
|
435 # ions per intensity graph
|
|
436 ions_per_intensity(indicators_)
|
|
437
|
|
438 # plots to compare each set of parameters
|
|
439 graph_results(indicators_, criteria = "zeros_per_line")
|
|
440
|
|
441 # zip of pseudospectra .msp and .tsv files
|
|
442 system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"'))
|
|
443 system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out))
|
|
444
|
|
445 # zip of count_duplicates .tsv files
|
|
446 if (count_duplicates) {
|
|
447 system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"'))
|
|
448 system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out))
|
|
449 }
|
|
450
|