Mercurial > repos > melpetera > testtest
comparison Testtest/GCMS-test_output.R @ 0:40de28c7d3fb draft
Uploaded
author | melpetera |
---|---|
date | Thu, 23 Nov 2017 08:50:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:40de28c7d3fb |
---|---|
1 # author: Pauline Ribeyre | |
2 | |
3 | |
4 ##################### | |
5 # required packages # | |
6 ##################### | |
7 | |
8 library("parallel") # provides cluster methods and "parLapplyLB" function | |
9 library("grDevices") # provides jpeg handling methods | |
10 | |
11 | |
12 ############ | |
13 # data # | |
14 ############ | |
15 | |
16 update_list_of_file_names <- function() { | |
17 # Writes in a file the list of file names using the files created previously in ./RDatas. | |
18 | |
19 file.create(source_file_names, showWarnings = FALSE) # erase the file | |
20 directory <- "RDatas" | |
21 files <- list.files(directory, full.names = TRUE) | |
22 | |
23 for (f in files) { | |
24 f <- substr(f, 8, nchar(f) - 6) | |
25 write(f, file = source_file_names, append = TRUE) | |
26 } | |
27 | |
28 } | |
29 | |
30 | |
31 titles_to_columns <- function(indicators) { | |
32 # Parse the "title" column of the indicator dataframe to separate the different parameters and their values. | |
33 # | |
34 # Args: | |
35 # indicators: dataframe (one row per test) containing the results. | |
36 # | |
37 # Returns: | |
38 # new_indicators: copy of "indicators" with one column added for each parameter that varied. | |
39 | |
40 default_settings <- FALSE | |
41 | |
42 # parse the title | |
43 param_names <- c() | |
44 for (title in indicators$title) { | |
45 | |
46 if (title == "default_settings") { | |
47 default_settings <- TRUE | |
48 break | |
49 } | |
50 | |
51 else { | |
52 params <- strsplit(title, "_")[[1]] | |
53 for (param in params) { | |
54 name <- strsplit(param, "=")[[1]][1] | |
55 | |
56 # some lowercase values create errors because they are primitive functions | |
57 substr(name, 1, 1) <- toupper(substr(name, 1, 1)) | |
58 | |
59 if (!name %in% param_names) | |
60 param_names <- c(param_names, name) | |
61 value <- strsplit(param, "=")[[1]][2] | |
62 if (!exists(name)) | |
63 assign(name, c()) | |
64 assign(name, c(get(name), value)) | |
65 } | |
66 } | |
67 | |
68 } | |
69 | |
70 new_indicators <- indicators | |
71 | |
72 # add the columns to the dataframe | |
73 if (!default_settings) { | |
74 for (name in param_names) { | |
75 new_indicators <- cbind(new_indicators, get(name)) | |
76 names(new_indicators)[ncol(new_indicators)] <- name | |
77 } | |
78 | |
79 # order the dataframe's columns | |
80 order <- c(1, (ncol(new_indicators) - length(param_names) + 1) : ncol(new_indicators), 2 : (ncol(new_indicators) - length(param_names))) | |
81 new_indicators <- new_indicators[, order] | |
82 } | |
83 | |
84 return (new_indicators) | |
85 | |
86 } | |
87 | |
88 | |
89 create_summary <- function(nb_cores, count_duplicates = FALSE) { | |
90 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. | |
91 # | |
92 # Args: | |
93 # nb_cores: maximum number of cores to use. | |
94 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). | |
95 # | |
96 # Returns: | |
97 # A dataframe (one row per test) containing the results: | |
98 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). | |
99 # | |
100 # # Returns: | |
101 # # A list containing: | |
102 # # 1: dataframe (one row per test) containing the results: | |
103 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine, nb count_duplicates (opt). | |
104 # # 2: list of the settings sets for each test. | |
105 | |
106 time.start <- Sys.time() # start the timer | |
107 | |
108 file_names <- readLines(source_file_names) | |
109 | |
110 dir.create("count_duplicates", showWarnings = FALSE) #create the folder where the details of the count_duplicates will be saved | |
111 | |
112 # summ <- create_summary_parallel(1, file_names) | |
113 # cat("indic:\n",summary[1][[1]],"\n") | |
114 | |
115 # run the function on several cores | |
116 if (length(file_names) != 0) { | |
117 if (length(file_names) < nb_cores) | |
118 nb_cores <- length(file_names) | |
119 cluster <- makeCluster(nb_cores) #, outfile = "") | |
120 summaries <- parLapplyLB(cluster, 1:length(file_names), create_summary_parallel, file_names = file_names, count_duplicates = count_duplicates) #, pb = pb) | |
121 stopCluster(cluster) | |
122 } | |
123 else | |
124 stop("There are no files to generate the output from.") | |
125 | |
126 # concatenate the results obtained by all the cores | |
127 indicators <- NULL | |
128 for (summary in summaries) | |
129 indicators <- rbind(indicators, t(summary)) | |
130 # settings_list <- c() | |
131 # for (summary in summaries) { | |
132 # settings <- summary[2][[1]] | |
133 # indic <- summary[1][[1]] | |
134 # | |
135 # indicators <- rbind(indicators, t(indic)) | |
136 # settings_list <- c(settings_list, settings) | |
137 # } | |
138 indicators <- data.frame(indicators) | |
139 | |
140 col_names <- c("title", "nb_pseudospectra", "zeros_per_line", "f0to10E3", "f10E3to10E4", "f10E4to10E5", | |
141 "f10E5to10E6", "f10E6to10E7", "f10E7") | |
142 | |
143 if (check_ions) | |
144 col_names <- c(col_names, ions_name) | |
145 | |
146 names(indicators)[1:length(col_names)] <- col_names | |
147 if (count_duplicates) | |
148 names(indicators)[(ncol(indicators) - 1):ncol(indicators)] <- c("count_duplicates", "nb_ions") | |
149 | |
150 indicators <- titles_to_columns(indicators) | |
151 | |
152 time.end <- Sys.time() # stop the timer | |
153 Tdiff <- difftime(time.end, time.start) | |
154 print(Tdiff) | |
155 | |
156 # return (list(indicators, settings_list)) | |
157 return (indicators) | |
158 | |
159 } | |
160 | |
161 | |
162 create_summary_parallel <- function(n, file_names, count_duplicates = FALSE) { #}, pb) { | |
163 # Reads the files created by runGC_vary_parameters() and calculates the quality criteria for each test. | |
164 # | |
165 # Args: | |
166 # n: index of the current test, to select the corresponding title. | |
167 # file_names: list of titles (one for each test) (concatenation of the values taken by the varied parameters). | |
168 # nb_cores: maximum number of cores to use. | |
169 # count_duplicates: calculate the number of count_duplicates obtained by each test (slow). | |
170 # | |
171 # Returns: | |
172 # A dataframe (one row per test) containing the results: | |
173 # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. | |
174 # | |
175 # # Returns: | |
176 # # A list containing: | |
177 # # 1: dataframe (one row per test) containing the results: | |
178 # # title, nb ions, nb zeros/line, nb ions/intensity range, presence of valine. | |
179 # # 2: list of the settings sets for each test. | |
180 | |
181 source(source_spectrum, environment()) | |
182 | |
183 intensities_x <- c(1000, 10000, 100000, 1000000, 10000000) | |
184 | |
185 this_title <- file_names[n] | |
186 | |
187 # calculate the number of zeros | |
188 file_title <- paste0("Peak_tables/", this_title, ".tsv") | |
189 peak_table <- read.table(file_title, sep="\t", header=TRUE) | |
190 peak_table_values <- peak_table[,5:ncol(peak_table)] | |
191 zeros <- sum(peak_table$nb_zeros) | |
192 nb_lines <- nrow(peak_table) | |
193 zeros_per_line <- round(zeros/nb_lines, 4) | |
194 | |
195 # count the number of ions by intensity range | |
196 intensities_y <- c() | |
197 intensities_y[1] <- length(rowMeans( | |
198 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[1]]) | |
199 for (i in 2:length(intensities_x)) | |
200 intensities_y[i] <- length(rowMeans( | |
201 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) < intensities_x[i] & | |
202 rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i - 1]]) | |
203 intensities_y[i + 1] <- length(rowMeans( | |
204 peak_table_values, na.rm = TRUE)[rowMeans(peak_table_values, na.rm = TRUE) > intensities_x[i]]) | |
205 | |
206 # load the settings | |
207 file_title <- paste0("RDatas/", this_title, ".RData") | |
208 load(file_title) | |
209 | |
210 # count the number of count_duplicates and record in a file | |
211 if (count_duplicates) { | |
212 file_title <- paste0("count_duplicates/", this_title, ".tsv") | |
213 nb_count_duplicates <- data.frame(count_duplicates_function(GC_results)) | |
214 names(nb_count_duplicates) <- c("mz_min", "mz_max", "rt", "count_duplicates") | |
215 write.table(nb_count_duplicates, file = file_title, sep = "\t", row.names = FALSE) | |
216 count_duplicates <- nrow(nb_count_duplicates) | |
217 | |
218 nb_ions <- 0 | |
219 pseudospectra <- GC_results$PseudoSpectra | |
220 for (ps in pseudospectra) { | |
221 nb_ions <- nb_ions + nrow(ps$pspectrum) | |
222 } | |
223 } | |
224 | |
225 summary <- c(this_title, nb_lines, zeros_per_line, intensities_y) | |
226 | |
227 # check the presence of ions | |
228 if (nb_ions_to_check > 0) { | |
229 for (i in 1:nb_ions_to_check) { | |
230 name <- ions_name[[i]] | |
231 rt <- ions_rt[[i]] | |
232 mzs <- ions_mzs[[i]] | |
233 cat("Check:", name, rt, mzs, "\n") | |
234 | |
235 value <- is_ion_present(GC_results, rt = rt, mz_list = mzs) | |
236 assign(name, value) | |
237 summary <- c(summary, get(name)) | |
238 } | |
239 } | |
240 | |
241 # check the presence of ion valine 12C and 13C | |
242 # valine <- is_ion_present(GC_results, rt = 9.67, mz_list = list(218.1105, 219.1146)) | |
243 | |
244 # summary <- c(summary, valine) | |
245 | |
246 if (count_duplicates) | |
247 summary <- c(summary, count_duplicates, nb_ions) | |
248 | |
249 # return (list(summary, settings)) | |
250 return (summary) | |
251 | |
252 } | |
253 | |
254 | |
255 ############ | |
256 # graphs # | |
257 ############ | |
258 | |
259 ions_per_intensity <- function(indicators) { | |
260 # For each test, plots the number of ions for each range of intensity. | |
261 # | |
262 # Args: | |
263 # indicators: dataframe (one row per test) containing the results. | |
264 | |
265 pdf(intensity_graph_out) | |
266 par(mar = c(5.1,4.1,5,2.1)) | |
267 | |
268 for (i in 1:nrow(indicators)) { | |
269 | |
270 indic <- indicators[i,] | |
271 | |
272 names <- c("f0to10E3", "f10E3to10E4", "f10E4to10E5", "f10E5to10E6", "f10E6to10E7", "f10E7") | |
273 intensities_y <- c() | |
274 | |
275 for (colname in names) { | |
276 intensities_y <- c(intensities_y, indic[colname]) | |
277 } | |
278 intensities_y <- unlist(intensities_y) | |
279 intensities_y <- as.numeric(levels(intensities_y))[intensities_y] | |
280 | |
281 title <- as.character(indic$title) | |
282 title <- strsplit(title, "_")[[1]] | |
283 plot_title <- "" | |
284 for (i in 1:length(title)) { | |
285 plot_title <- paste(plot_title, title[[i]]) | |
286 if (i %% 2 == 0) | |
287 plot_title <- paste(plot_title, "\n") | |
288 else | |
289 plot_title <- paste(plot_title, " ") | |
290 } | |
291 | |
292 barplot(intensities_y, names.arg = names, xlab = "intensity", ylab = "number of ions", | |
293 main = plot_title, cex.main = 0.8) | |
294 | |
295 } | |
296 | |
297 dev.off() | |
298 | |
299 } | |
300 | |
301 | |
302 graph_results <- function(indicators, criteria) { | |
303 # Plots the results. | |
304 # | |
305 # Args: | |
306 # indicators: dataframe (one row per test) containing the results. | |
307 | |
308 first_criteria <- grep("nb_pseudospectra", names(indicators)) | |
309 nb_params <- first_criteria - 2 | |
310 this_criteria <- grep(criteria, names(indicators)) | |
311 | |
312 # values taken by each parameter | |
313 values <- list() | |
314 for (i in 2:(1 + nb_params)) { | |
315 lev <- unique(indicators[,i]) | |
316 values[[i - 1]] <- lev | |
317 } | |
318 | |
319 length <- lapply(values, length) | |
320 | |
321 # indexes of the parameters taking the most values | |
322 longest_1 <- which.max(length); length[longest_1] <- -1 | |
323 longest_2 <- which.max(length); length[longest_2] <- -1 | |
324 longest_3 <- which.max(length); length[longest_3] <- -1 | |
325 | |
326 # indexes of the other parameters | |
327 shortest <- which(length != -1) | |
328 | |
329 # all combinations of the values taken by these parameters | |
330 combinations <- list() | |
331 for (s in shortest) { | |
332 page <- indicators[,s + 1] | |
333 combinations[[length(combinations) + 1]] <- sort(as.numeric(as.character(unique(page)))) | |
334 } | |
335 names(combinations) <- names(indicators)[shortest + 1] | |
336 combinations <- expand.grid(combinations) | |
337 | |
338 # save the plots in a pdf file | |
339 pdf(compare_graph_out) | |
340 nb_rows <- length(values[longest_3][[1]]) | |
341 nb_cols <- length(values[longest_2][[1]]) | |
342 par(mfrow = c(nb_rows, nb_cols), mar = c(4.5,4.5,5,1)) | |
343 | |
344 # plot parameters | |
345 x <- sort(as.numeric(as.character(unique(indicators[,longest_1 + 1])))) | |
346 min_zeros <- min(as.numeric(as.character(indicators$zeros_per_line))) | |
347 max_zeros <- max(as.numeric(as.character(indicators$zeros_per_line))) | |
348 | |
349 for (rowi in 1:nrow(combinations)) { | |
350 row <- combinations[rowi,] | |
351 title <- "" | |
352 lines <- indicators | |
353 | |
354 for (coli in 1:length(row)) { | |
355 value <- as.numeric(as.character(row[coli])) | |
356 title <- paste(title, names(row[coli]), "=", value) | |
357 if (coli %% 2 == 0) | |
358 title <- paste(title, "\n") | |
359 else | |
360 title <- paste(title, " ") | |
361 lines <- lines[lines[names(row[coli])] == value,] | |
362 } | |
363 | |
364 # values taken by the parameters | |
365 in_plot <- lines[,longest_1 + 1] | |
366 vertical <- lines[,longest_2 + 1] | |
367 horizontal <- lines[,longest_3 + 1] | |
368 | |
369 # for each horizontal value | |
370 for (horiz in sort(unique(horizontal))) { | |
371 | |
372 # for each vertical value | |
373 for (vertic in sort(unique(vertical))) { | |
374 | |
375 y <- c() | |
376 for (this_y in sort(unique(in_plot))) { | |
377 # line <- line[page == p & horizontal == horiz & vertical == vertic & in_plot == this_y,] | |
378 line <- lines[horizontal == horiz & vertical == vertic & in_plot == this_y,] | |
379 if (nrow(line) != 1) | |
380 stop("To plot the results, each set of the parameters' values must be represented exactly 1 time") | |
381 | |
382 value <- line[,this_criteria] | |
383 value <- as.numeric(as.character(value)) | |
384 y <- c(y, value) | |
385 } | |
386 | |
387 this_title <- paste(title, | |
388 names(indicators)[longest_3 + 1], "=", horiz, "\n", | |
389 names(indicators)[longest_2 + 1], "=", vertic) | |
390 | |
391 # plot this graph | |
392 plot(x, y, ylim = c(min_zeros, max_zeros), | |
393 type = "b", | |
394 xlab = names(indicators)[longest_1 + 1], | |
395 ylab = criteria, | |
396 main = this_title, | |
397 cex.main = 0.8) | |
398 | |
399 } | |
400 | |
401 } | |
402 | |
403 } | |
404 | |
405 par(mfrow = c(1,1)) | |
406 dev.off() | |
407 | |
408 } | |
409 | |
410 | |
411 ############ | |
412 # main # | |
413 ############ | |
414 | |
415 update_list_of_file_names() | |
416 summary <- create_summary(nb_cores, count_duplicates = count_duplicates) | |
417 | |
418 # settings_list <- summary[2][[1]] | |
419 # indicators_ <- summary[1][[1]] | |
420 indicators_ <- summary | |
421 | |
422 indicators <- indicators_[order(indicators_$zeros_per_line),] # sort by number of zeros per line | |
423 # indicators_ <- indicators[order(as.numeric(row.names(indicators))),] # order back to row numbers | |
424 | |
425 # record the summary in a file | |
426 indicators_to_write <- indicators[, !names(indicators) %in% | |
427 c("title", "f0to10E3", "f10E3to10E4", "f10E4to10E5", | |
428 "f10E5to10E6", "f10E6to10E7", "f10E7")] | |
429 write.table(indicators_to_write, | |
430 file = summary_out, | |
431 quote = FALSE, | |
432 row.names = FALSE, | |
433 sep = "\t") | |
434 | |
435 # ions per intensity graph | |
436 ions_per_intensity(indicators_) | |
437 | |
438 # plots to compare each set of parameters | |
439 graph_results(indicators_, criteria = "zeros_per_line") | |
440 | |
441 # zip of pseudospectra .msp and .tsv files | |
442 system(paste0('cd Pseudospectra ; ls . | grep -e "msp$" -e "tsv$" | zip -r -@ "peakspectra_out.zip"')) | |
443 system(paste("cd Pseudospectra ; mv peakspectra_out.zip", peakspectra_out)) | |
444 | |
445 # zip of count_duplicates .tsv files | |
446 if (count_duplicates) { | |
447 system(paste0('cd count_duplicates ; ls . | grep "tsv$" | zip -r -@ "count_duplicates_out.zip"')) | |
448 system(paste("cd count_duplicates ; mv count_duplicates_out.zip", count_duplicates_out)) | |
449 } | |
450 |