comparison lib.r @ 7:dca722aecb67 draft

planemo upload for repository https://github.com/workflow4metabolomics/xcms commit e384d6dd5f410799ec211f73bca0b5d5d7bc651e
author lecorguille
date Thu, 01 Mar 2018 04:15:19 -0500
parents
children 6b5504f877ff
comparison
equal deleted inserted replaced
6:5c7a7484dc51 7:dca722aecb67
1 #@authors ABiMS TEAM, Y. Guitton
2 # lib.r for Galaxy Workflow4Metabolomics xcms tools
3
4 #@author G. Le Corguille
5 # solve an issue with batch if arguments are logical TRUE/FALSE
6 parseCommandArgs <- function(...) {
7 args <- batch::parseCommandArgs(...)
8 for (key in names(args)) {
9 if (args[key] %in% c("TRUE","FALSE"))
10 args[key] = as.logical(args[key])
11 }
12 return(args)
13 }
14
15 #@author G. Le Corguille
16 # This function will
17 # - load the packages
18 # - display the sessionInfo
19 loadAndDisplayPackages <- function(pkgs) {
20 for(pkg in pkgs) suppressPackageStartupMessages( stopifnot( library(pkg, quietly=TRUE, logical.return=TRUE, character.only=TRUE)))
21
22 sessioninfo = sessionInfo()
23 cat(sessioninfo$R.version$version.string,"\n")
24 cat("Main packages:\n")
25 for (pkg in names(sessioninfo$otherPkgs)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n")
26 cat("Other loaded packages:\n")
27 for (pkg in names(sessioninfo$loadedOnly)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n")
28 }
29
30 #@author G. Le Corguille
31 # This function convert if it is required the Retention Time in minutes
32 RTSecondToMinute <- function(variableMetadata, convertRTMinute) {
33 if (convertRTMinute){
34 #converting the retention times (seconds) into minutes
35 print("converting the retention times into minutes in the variableMetadata")
36 variableMetadata[,"rt"] <- variableMetadata[,"rt"]/60
37 variableMetadata[,"rtmin"] <- variableMetadata[,"rtmin"]/60
38 variableMetadata[,"rtmax"] <- variableMetadata[,"rtmax"]/60
39 }
40 return (variableMetadata)
41 }
42
43 #@author G. Le Corguille
44 # This function format ions identifiers
45 formatIonIdentifiers <- function(variableMetadata, numDigitsRT=0, numDigitsMZ=0) {
46 splitDeco <- strsplit(as.character(variableMetadata$name),"_")
47 idsDeco <- sapply(splitDeco, function(x) { deco=unlist(x)[2]; if (is.na(deco)) return ("") else return(paste0("_",deco)) })
48 namecustom <- make.unique(paste0("M",round(variableMetadata[,"mz"],numDigitsMZ),"T",round(variableMetadata[,"rt"],numDigitsRT),idsDeco))
49 variableMetadata <- cbind(name=variableMetadata$name, namecustom=namecustom, variableMetadata[,!(colnames(variableMetadata) %in% c("name"))])
50 return(variableMetadata)
51 }
52
53 #@author G. Le Corguille
54 # Draw the plotChromPeakDensity 3 per page in a pdf file
55 getPlotChromPeakDensity <- function(xdata) {
56 pdf(file="plotChromPeakDensity.pdf", width=16, height=12)
57
58 par(mfrow = c(3, 1), mar = c(4, 4, 1, 0.5))
59
60 group_colors <- brewer.pal(3, "Set1")[1:length(unique(xdata$sample_group))]
61 names(group_colors) <- unique(xdata$sample_group)
62
63 xlim <- c(min(featureDefinitions(xdata)$rtmin), max(featureDefinitions(xdata)$rtmax))
64 for (i in 1:nrow(featureDefinitions(xdata))) {
65 plotChromPeakDensity(xdata, mz=c(featureDefinitions(xdata)[i,]$mzmin,featureDefinitions(xdata)[i,]$mzmax), col=group_colors, pch=16, xlim=xlim)
66 legend("topright", legend=names(group_colors), col=group_colors, cex=0.8, lty=1)
67 }
68
69 dev.off()
70 }
71
72 #@author G. Le Corguille
73 # Draw the plotChromPeakDensity 3 per page in a pdf file
74 getPlotAdjustedRtime <- function(xdata) {
75 pdf(file="raw_vs_adjusted_rt.pdf", width=16, height=12)
76 # Color by group
77 group_colors <- brewer.pal(3, "Set1")[1:length(unique(xdata$sample_group))]
78 names(group_colors) <- unique(xdata$sample_group)
79 plotAdjustedRtime(xdata, col = group_colors[xdata$sample_group])
80 legend("topright", legend=names(group_colors), col=group_colors, cex=0.8, lty=1)
81 # Color by sample
82 plotAdjustedRtime(xdata, col = rainbow(length(xdata@phenoData@data$sample_name)))
83 legend("topright", legend=xdata@phenoData@data$sample_name, col=rainbow(length(xdata@phenoData@data$sample_name)), cex=0.8, lty=1)
84 dev.off()
85 }
86
87 #@author G. Le Corguille
88 # value: intensity values to be used into, maxo or intb
89 getPeaklistW4M <- function(xdata, intval="into", convertRTMinute=F, numDigitsMZ=4, numDigitsRT=0, variableMetadataOutput, dataMatrixOutput) {
90 dataMatrix <- featureValues(xdata, method="medret", value=intval)
91 colnames(dataMatrix) <- tools::file_path_sans_ext(colnames(dataMatrix))
92 dataMatrix = cbind(name=groupnamesW4M(xdata), dataMatrix)
93 variableMetadata <- featureDefinitions(xdata)
94 colnames(variableMetadata)[1] = "mz"; colnames(variableMetadata)[4] = "rt"
95 variableMetadata = data.frame(name=groupnamesW4M(xdata), variableMetadata)
96
97 variableMetadata <- RTSecondToMinute(variableMetadata, convertRTMinute)
98 variableMetadata <- formatIonIdentifiers(variableMetadata, numDigitsRT=numDigitsRT, numDigitsMZ=numDigitsMZ)
99
100 write.table(variableMetadata, file=variableMetadataOutput,sep="\t",quote=F,row.names=F)
101 write.table(dataMatrix, file=dataMatrixOutput,sep="\t",quote=F,row.names=F)
102
103 }
104
105 #@author Y. Guitton
106 getBPC <- function(file,rtcor=NULL, ...) {
107 object <- xcmsRaw(file)
108 sel <- profRange(object, ...)
109 cbind(if (is.null(rtcor)) object@scantime[sel$scanidx] else rtcor ,xcms:::colMax(object@env$profile[sel$massidx,sel$scanidx,drop=FALSE]))
110 #plotChrom(xcmsRaw(file), base=T)
111 }
112
113 #@author Y. Guitton
114 getBPCs <- function (xcmsSet=NULL, pdfname="BPCs.pdf",rt=c("raw","corrected"), scanrange=NULL) {
115 cat("Creating BIC pdf...\n")
116
117 if (is.null(xcmsSet)) {
118 cat("Enter an xcmsSet \n")
119 stop()
120 } else {
121 files <- filepaths(xcmsSet)
122 }
123
124 phenoDataClass <- as.vector(levels(xcmsSet@phenoData[,"class"])) #sometime phenoData have more than 1 column use first as class
125
126 classnames <- vector("list",length(phenoDataClass))
127 for (i in 1:length(phenoDataClass)){
128 classnames[[i]] <- which( xcmsSet@phenoData[,"class"]==phenoDataClass[i])
129 }
130
131 N <- dim(phenoData(xcmsSet))[1]
132
133 TIC <- vector("list",N)
134
135
136 for (j in 1:N) {
137
138 TIC[[j]] <- getBPC(files[j])
139 #good for raw
140 # seems strange for corrected
141 #errors if scanrange used in xcmsSetgeneration
142 if (!is.null(xcmsSet) && rt == "corrected")
143 rtcor <- xcmsSet@rt$corrected[[j]]
144 else
145 rtcor <- NULL
146
147 TIC[[j]] <- getBPC(files[j],rtcor=rtcor)
148 # TIC[[j]][,1]<-rtcor
149 }
150
151
152
153 pdf(pdfname,w=16,h=10)
154 cols <- rainbow(N)
155 lty <- 1:N
156 pch <- 1:N
157 #search for max x and max y in BPCs
158 xlim <- range(sapply(TIC, function(x) range(x[,1])))
159 ylim <- range(sapply(TIC, function(x) range(x[,2])))
160 ylim <- c(-ylim[2], ylim[2])
161
162
163 ##plot start
164
165 if (length(phenoDataClass)>2){
166 for (k in 1:(length(phenoDataClass)-1)){
167 for (l in (k+1):length(phenoDataClass)){
168 #print(paste(phenoDataClass[k],"vs",phenoDataClass[l],sep=" "))
169 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k]," vs ",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="BPC")
170 colvect <- NULL
171 for (j in 1:length(classnames[[k]])) {
172 tic <- TIC[[classnames[[k]][j]]]
173 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
174 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
175 colvect <- append(colvect,cols[classnames[[k]][j]])
176 }
177 for (j in 1:length(classnames[[l]])) {
178 # i <- class2names[j]
179 tic <- TIC[[classnames[[l]][j]]]
180 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l")
181 colvect <- append(colvect,cols[classnames[[l]][j]])
182 }
183 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch)
184 }
185 }
186 }#end if length >2
187
188 if (length(phenoDataClass)==2){
189 k <- 1
190 l <- 2
191 colvect <- NULL
192 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k],"vs",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="BPC")
193
194 for (j in 1:length(classnames[[k]])) {
195
196 tic <- TIC[[classnames[[k]][j]]]
197 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
198 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
199 colvect<-append(colvect,cols[classnames[[k]][j]])
200 }
201 for (j in 1:length(classnames[[l]])) {
202 # i <- class2names[j]
203 tic <- TIC[[classnames[[l]][j]]]
204 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l")
205 colvect <- append(colvect,cols[classnames[[l]][j]])
206 }
207 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch)
208
209 }#end length ==2
210
211 #case where only one class
212 if (length(phenoDataClass)==1){
213 k <- 1
214 ylim <- range(sapply(TIC, function(x) range(x[,2])))
215 colvect <- NULL
216 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k], sep=""), xlab="Retention Time (min)", ylab="BPC")
217
218 for (j in 1:length(classnames[[k]])) {
219 tic <- TIC[[classnames[[k]][j]]]
220 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
221 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
222 colvect <- append(colvect,cols[classnames[[k]][j]])
223 }
224
225 legend("topright",paste(basename(files[c(classnames[[k]])])), col=colvect, lty=lty, pch=pch)
226
227 }#end length ==1
228
229 dev.off() #pdf(pdfname,w=16,h=10)
230
231 invisible(TIC)
232 }
233
234
235
236 #@author Y. Guitton
237 getTIC <- function(file, rtcor=NULL) {
238 object <- xcmsRaw(file)
239 cbind(if (is.null(rtcor)) object@scantime else rtcor, rawEIC(object, mzrange=range(object@env$mz))$intensity)
240 }
241
242 #overlay TIC from all files in current folder or from xcmsSet, create pdf
243 #@author Y. Guitton
244 getTICs <- function(xcmsSet=NULL,files=NULL, pdfname="TICs.pdf", rt=c("raw","corrected")) {
245 cat("Creating TIC pdf...\n")
246
247 if (is.null(xcmsSet)) {
248 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", "[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
249 filepattern <- paste(paste("\\.", filepattern, "$", sep=""), collapse="|")
250 if (is.null(files))
251 files <- getwd()
252 info <- file.info(files)
253 listed <- list.files(files[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE)
254 files <- c(files[!info$isdir], listed)
255 } else {
256 files <- filepaths(xcmsSet)
257 }
258
259 phenoDataClass <- as.vector(levels(xcmsSet@phenoData[,"class"])) #sometime phenoData have more than 1 column use first as class
260 classnames <- vector("list",length(phenoDataClass))
261 for (i in 1:length(phenoDataClass)){
262 classnames[[i]] <- which( xcmsSet@phenoData[,"class"]==phenoDataClass[i])
263 }
264
265 N <- length(files)
266 TIC <- vector("list",N)
267
268 for (i in 1:N) {
269 if (!is.null(xcmsSet) && rt == "corrected")
270 rtcor <- xcmsSet@rt$corrected[[i]] else
271 rtcor <- NULL
272 TIC[[i]] <- getTIC(files[i], rtcor=rtcor)
273 }
274
275 pdf(pdfname, w=16, h=10)
276 cols <- rainbow(N)
277 lty <- 1:N
278 pch <- 1:N
279 #search for max x and max y in TICs
280 xlim <- range(sapply(TIC, function(x) range(x[,1])))
281 ylim <- range(sapply(TIC, function(x) range(x[,2])))
282 ylim <- c(-ylim[2], ylim[2])
283
284
285 ##plot start
286 if (length(phenoDataClass)>2){
287 for (k in 1:(length(phenoDataClass)-1)){
288 for (l in (k+1):length(phenoDataClass)){
289 #print(paste(phenoDataClass[k],"vs",phenoDataClass[l],sep=" "))
290 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k]," vs ",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="TIC")
291 colvect <- NULL
292 for (j in 1:length(classnames[[k]])) {
293 tic <- TIC[[classnames[[k]][j]]]
294 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
295 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
296 colvect <- append(colvect,cols[classnames[[k]][j]])
297 }
298 for (j in 1:length(classnames[[l]])) {
299 # i=class2names[j]
300 tic <- TIC[[classnames[[l]][j]]]
301 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l")
302 colvect <- append(colvect,cols[classnames[[l]][j]])
303 }
304 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch)
305 }
306 }
307 }#end if length >2
308 if (length(phenoDataClass)==2){
309 k <- 1
310 l <- 2
311
312 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k],"vs",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="TIC")
313 colvect <- NULL
314 for (j in 1:length(classnames[[k]])) {
315 tic <- TIC[[classnames[[k]][j]]]
316 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
317 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
318 colvect <- append(colvect,cols[classnames[[k]][j]])
319 }
320 for (j in 1:length(classnames[[l]])) {
321 # i <- class2names[j]
322 tic <- TIC[[classnames[[l]][j]]]
323 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l")
324 colvect <- append(colvect,cols[classnames[[l]][j]])
325 }
326 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch)
327
328 }#end length ==2
329
330 #case where only one class
331 if (length(phenoDataClass)==1){
332 k <- 1
333 ylim <- range(sapply(TIC, function(x) range(x[,2])))
334
335 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k], sep=""), xlab="Retention Time (min)", ylab="TIC")
336 colvect <- NULL
337 for (j in 1:length(classnames[[k]])) {
338 tic <- TIC[[classnames[[k]][j]]]
339 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l")
340 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l")
341 colvect <- append(colvect,cols[classnames[[k]][j]])
342 }
343
344 legend("topright",paste(basename(files[c(classnames[[k]])])), col=colvect, lty=lty, pch=pch)
345
346 }#end length ==1
347
348 dev.off() #pdf(pdfname,w=16,h=10)
349
350 invisible(TIC)
351 }
352
353
354
355 # Get the polarities from all the samples of a condition
356 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
357 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
358 getSampleMetadata <- function(xdata=NULL, sampleMetadataOutput="sampleMetadata.tsv") {
359 cat("Creating the sampleMetadata file...\n")
360
361 #Create the sampleMetada dataframe
362 sampleMetadata <- xdata@phenoData@data
363 rownames(sampleMetadata) <- NULL
364 colnames(sampleMetadata) <- c("sampleMetadata", "class")
365
366 sampleNamesOrigin <- sampleMetadata$sampleMetadata
367 sampleNamesMakeNames <- make.names(sampleNamesOrigin)
368
369 if (any(duplicated(sampleNamesMakeNames))) {
370 write("\n\nERROR: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names().\nIn your case, at least two columns after the renaming obtain the same name, thus XCMS will collapse those columns per name.", stderr())
371 for (sampleName in sampleNamesOrigin) {
372 write(paste(sampleName,"\t->\t",make.names(sampleName)),stderr())
373 }
374 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.")
375 }
376
377 if (!all(sampleNamesOrigin == sampleNamesMakeNames)) {
378 cat("\n\nWARNING: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names()\nIn your case, one or more sample names will be renamed in the sampleMetadata and dataMatrix files:\n")
379 for (sampleName in sampleNamesOrigin) {
380 cat(paste(sampleName,"\t->\t",make.names(sampleName),"\n"))
381 }
382 }
383
384 sampleMetadata$sampleMetadata <- sampleNamesMakeNames
385
386
387 #For each sample file, the following actions are done
388 for (fileIdx in 1:length(fileNames(xdata))) {
389 #Check if the file is in the CDF format
390 if (!mzR:::netCDFIsFile(fileNames(xdata))) {
391
392 # If the column isn't exist, with add one filled with NA
393 if (is.null(sampleMetadata$polarity)) sampleMetadata$polarity <- NA
394
395 #Extract the polarity (a list of polarities)
396 polarity <- fData(xdata)[fData(xdata)$fileIdx == fileIdx,"polarity"]
397 #Verify if all the scans have the same polarity
398 uniq_list <- unique(polarity)
399 if (length(uniq_list)>1){
400 polarity <- "mixed"
401 } else {
402 polarity <- as.character(uniq_list)
403 }
404
405 #Set the polarity attribute
406 sampleMetadata$polarity[fileIdx] <- polarity
407 }
408
409 }
410
411 write.table(sampleMetadata, sep="\t", quote=FALSE, row.names=FALSE, file=sampleMetadataOutput)
412
413 return(list("sampleNamesOrigin"=sampleNamesOrigin, "sampleNamesMakeNames"=sampleNamesMakeNames))
414
415 }
416
417
418 # This function check if xcms will found all the files
419 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
420 checkFilesCompatibilityWithXcms <- function(directory) {
421 cat("Checking files filenames compatibilities with xmcs...\n")
422 # WHAT XCMS WILL FIND
423 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
424 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|")
425 info <- file.info(directory)
426 listed <- list.files(directory[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE)
427 files <- c(directory[!info$isdir], listed)
428 files_abs <- file.path(getwd(), files)
429 exists <- file.exists(files_abs)
430 files[exists] <- files_abs[exists]
431 files[exists] <- sub("//","/",files[exists])
432
433 # WHAT IS ON THE FILESYSTEM
434 filesystem_filepaths <- system(paste("find $PWD/",directory," -not -name '\\.*' -not -path '*conda-env*' -type f -name \"*\"", sep=""), intern=T)
435 filesystem_filepaths <- filesystem_filepaths[grep(filepattern, filesystem_filepaths, perl=T)]
436
437 # COMPARISON
438 if (!is.na(table(filesystem_filepaths %in% files)["FALSE"])) {
439 write("\n\nERROR: List of the files which will not be imported by xcmsSet",stderr())
440 write(filesystem_filepaths[!(filesystem_filepaths %in% files)],stderr())
441 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.")
442 }
443 }
444
445
446 #This function list the compatible files within the directory as xcms did
447 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM
448 getMSFiles <- function (directory) {
449 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
450 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|")
451 info <- file.info(directory)
452 listed <- list.files(directory[info$isdir], pattern=filepattern,recursive=TRUE, full.names=TRUE)
453 files <- c(directory[!info$isdir], listed)
454 exists <- file.exists(files)
455 files <- files[exists]
456 return(files)
457 }
458
459 # This function check if XML contains special caracters. It also checks integrity and completness.
460 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
461 checkXmlStructure <- function (directory) {
462 cat("Checking XML structure...\n")
463
464 cmd <- paste("IFS=$'\n'; for xml in $(find",directory,"-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'); do if [ $(xmllint --nonet --noout \"$xml\" 2> /dev/null; echo $?) -gt 0 ]; then echo $xml;fi; done;")
465 capture <- system(cmd, intern=TRUE)
466
467 if (length(capture)>0){
468 #message=paste("The following mzXML or mzML file is incorrect, please check these files first:",capture)
469 write("\n\nERROR: The following mzXML or mzML file(s) are incorrect, please check these files first:", stderr())
470 write(capture, stderr())
471 stop("ERROR: xcmsSet cannot continue with incorrect mzXML or mzML files")
472 }
473
474 }
475
476
477 # This function check if XML contain special characters
478 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM
479 deleteXmlBadCharacters<- function (directory) {
480 cat("Checking Non ASCII characters in the XML...\n")
481
482 processed <- F
483 l <- system( paste("find",directory, "-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'"), intern=TRUE)
484 for (i in l){
485 cmd <- paste("LC_ALL=C grep '[^ -~]' \"", i, "\"", sep="")
486 capture <- suppressWarnings(system(cmd, intern=TRUE))
487 if (length(capture)>0){
488 cmd <- paste("perl -i -pe 's/[^[:ascii:]]//g;'",i)
489 print( paste("WARNING: Non ASCII characters have been removed from the ",i,"file") )
490 c <- system(cmd, intern=TRUE)
491 capture <- ""
492 processed <- T
493 }
494 }
495 if (processed) cat("\n\n")
496 return(processed)
497 }
498
499
500 # This function will compute MD5 checksum to check the data integrity
501 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr
502 getMd5sum <- function (directory) {
503 cat("Compute md5 checksum...\n")
504 # WHAT XCMS WILL FIND
505 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]")
506 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|")
507 info <- file.info(directory)
508 listed <- list.files(directory[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE)
509 files <- c(directory[!info$isdir], listed)
510 exists <- file.exists(files)
511 files <- files[exists]
512
513 library(tools)
514
515 #cat("\n\n")
516
517 return(as.matrix(md5sum(files)))
518 }
519
520
521 # This function get the raw file path from the arguments
522 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr
523 getRawfilePathFromArguments <- function(singlefile, zipfile, args) {
524 if (!is.null(args$zipfile)) zipfile <- args$zipfile
525 if (!is.null(args$zipfilePositive)) zipfile <- args$zipfilePositive
526 if (!is.null(args$zipfileNegative)) zipfile <- args$zipfileNegative
527
528 if (!is.null(args$singlefile_galaxyPath)) {
529 singlefile_galaxyPaths <- args$singlefile_galaxyPath;
530 singlefile_sampleNames <- args$singlefile_sampleName
531 }
532 if (!is.null(args$singlefile_galaxyPathPositive)) {
533 singlefile_galaxyPaths <- args$singlefile_galaxyPathPositive;
534 singlefile_sampleNames <- args$singlefile_sampleNamePositive
535 }
536 if (!is.null(args$singlefile_galaxyPathNegative)) {
537 singlefile_galaxyPaths <- args$singlefile_galaxyPathNegative;
538 singlefile_sampleNames <- args$singlefile_sampleNameNegative
539 }
540 if (exists("singlefile_galaxyPaths")){
541 singlefile_galaxyPaths <- unlist(strsplit(singlefile_galaxyPaths,","))
542 singlefile_sampleNames <- unlist(strsplit(singlefile_sampleNames,","))
543
544 singlefile <- NULL
545 for (singlefile_galaxyPath_i in seq(1:length(singlefile_galaxyPaths))) {
546 singlefile_galaxyPath <- singlefile_galaxyPaths[singlefile_galaxyPath_i]
547 singlefile_sampleName <- singlefile_sampleNames[singlefile_galaxyPath_i]
548 singlefile[[singlefile_sampleName]] <- singlefile_galaxyPath
549 }
550 }
551 for (argument in c("zipfile","zipfilePositive","zipfileNegative","singlefile_galaxyPath","singlefile_sampleName","singlefile_galaxyPathPositive","singlefile_sampleNamePositive","singlefile_galaxyPathNegative","singlefile_sampleNameNegative")) {
552 args[[argument]] <- NULL
553 }
554 return(list(zipfile=zipfile, singlefile=singlefile, args=args))
555 }
556
557
558 # This function retrieve the raw file in the working directory
559 # - if zipfile: unzip the file with its directory tree
560 # - if singlefiles: set symlink with the good filename
561 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr
562 retrieveRawfileInTheWorkingDirectory <- function(singlefile, zipfile) {
563 if(!is.null(singlefile) && (length("singlefile")>0)) {
564 for (singlefile_sampleName in names(singlefile)) {
565 singlefile_galaxyPath <- singlefile[[singlefile_sampleName]]
566 if(!file.exists(singlefile_galaxyPath)){
567 error_message <- paste("Cannot access the sample:",singlefile_sampleName,"located:",singlefile_galaxyPath,". Please, contact your administrator ... if you have one!")
568 print(error_message); stop(error_message)
569 }
570
571 if (!suppressWarnings( try (file.link(singlefile_galaxyPath, singlefile_sampleName), silent=T)))
572 file.copy(singlefile_galaxyPath, singlefile_sampleName)
573
574 }
575 directory <- "."
576
577 }
578 if(!is.null(zipfile) && (zipfile != "")) {
579 if(!file.exists(zipfile)){
580 error_message <- paste("Cannot access the Zip file:",zipfile,". Please, contact your administrator ... if you have one!")
581 print(error_message)
582 stop(error_message)
583 }
584
585 #list all file in the zip file
586 #zip_files <- unzip(zipfile,list=T)[,"Name"]
587
588 #unzip
589 suppressWarnings(unzip(zipfile, unzip="unzip"))
590
591 #get the directory name
592 suppressWarnings(filesInZip <- unzip(zipfile, list=T))
593 directories <- unique(unlist(lapply(strsplit(filesInZip$Name,"/"), function(x) x[1])))
594 directories <- directories[!(directories %in% c("__MACOSX")) & file.info(directories)$isdir]
595 directory <- "."
596 if (length(directories) == 1) directory <- directories
597
598 cat("files_root_directory\t",directory,"\n")
599
600 }
601 return (directory)
602 }
603
604
605 # This function retrieve a xset like object
606 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr
607 getxcmsSetObject <- function(xobject) {
608 # XCMS 1.x
609 if (class(xobject) == "xcmsSet")
610 return (xobject)
611 # XCMS 3.x
612 if (class(xobject) == "XCMSnExp") {
613 # Get the legacy xcmsSet object
614 suppressWarnings(xset <- as(xobject, 'xcmsSet'))
615 sampclass(xset) <- xset@phenoData$sample_group
616 return (xset)
617 }
618 }
619
620
621 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7
622 # https://github.com/sneumann/xcms/issues/250
623 groupnamesW4M <- function(xdata, mzdec = 0, rtdec = 0) {
624 mzfmt <- paste("%.", mzdec, "f", sep = "")
625 rtfmt <- paste("%.", rtdec, "f", sep = "")
626
627 gnames <- paste("M", sprintf(mzfmt, featureDefinitions(xdata)[,"mzmed"]), "T",
628 sprintf(rtfmt, featureDefinitions(xdata)[,"rtmed"]), sep = "")
629
630 if (any(dup <- duplicated(gnames)))
631 for (dupname in unique(gnames[dup])) {
632 dupidx <- which(gnames == dupname)
633 gnames[dupidx] <- paste(gnames[dupidx], seq(along = dupidx), sep = "_")
634 }
635
636 return (gnames)
637 }
638
639 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7
640 # https://github.com/sneumann/xcms/issues/247
641 .concatenate_XCMSnExp <- function(...) {
642 x <- list(...)
643 if (length(x) == 0)
644 return(NULL)
645 if (length(x) == 1)
646 return(x[[1]])
647 ## Check that all are XCMSnExp objects.
648 if (!all(unlist(lapply(x, function(z) is(z, "XCMSnExp")))))
649 stop("All passed objects should be 'XCMSnExp' objects")
650 new_x <- as(.concatenate_OnDiskMSnExp(...), "XCMSnExp")
651 ## If any of the XCMSnExp has alignment results or detected features drop
652 ## them!
653 x <- lapply(x, function(z) {
654 if (hasAdjustedRtime(z)) {
655 z <- dropAdjustedRtime(z)
656 warning("Adjusted retention times found, had to drop them.")
657 }
658 if (hasFeatures(z)) {
659 z <- dropFeatureDefinitions(z)
660 warning("Feature definitions found, had to drop them.")
661 }
662 z
663 })
664 ## Combine peaks
665 fls <- lapply(x, fileNames)
666 startidx <- cumsum(lengths(fls))
667 pks <- lapply(x, chromPeaks)
668 procH <- lapply(x, processHistory)
669 for (i in 2:length(fls)) {
670 pks[[i]][, "sample"] <- pks[[i]][, "sample"] + startidx[i - 1]
671 procH[[i]] <- lapply(procH[[i]], function(z) {
672 z@fileIndex <- as.integer(z@fileIndex + startidx[i - 1])
673 z
674 })
675 }
676 pks <- do.call(rbind, pks)
677 new_x@.processHistory <- unlist(procH)
678 chromPeaks(new_x) <- pks
679 if (validObject(new_x))
680 new_x
681 }
682
683 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7
684 # https://github.com/sneumann/xcms/issues/247
685 .concatenate_OnDiskMSnExp <- function(...) {
686 x <- list(...)
687 if (length(x) == 0)
688 return(NULL)
689 if (length(x) == 1)
690 return(x[[1]])
691 ## Check that all are XCMSnExp objects.
692 if (!all(unlist(lapply(x, function(z) is(z, "OnDiskMSnExp")))))
693 stop("All passed objects should be 'OnDiskMSnExp' objects")
694 ## Check processingQueue
695 procQ <- lapply(x, function(z) z@spectraProcessingQueue)
696 new_procQ <- procQ[[1]]
697 is_ok <- unlist(lapply(procQ, function(z)
698 !is.character(all.equal(new_procQ, z))
699 ))
700 if (any(!is_ok)) {
701 warning("Processing queues from the submitted objects differ! ",
702 "Dropping the processing queue.")
703 new_procQ <- list()
704 }
705 ## processingData
706 fls <- lapply(x, function(z) z@processingData@files)
707 startidx <- cumsum(lengths(fls))
708 ## featureData
709 featd <- lapply(x, fData)
710 ## Have to update the file index and the spectrum names.
711 for (i in 2:length(featd)) {
712 featd[[i]]$fileIdx <- featd[[i]]$fileIdx + startidx[i - 1]
713 rownames(featd[[i]]) <- MSnbase:::formatFileSpectrumNames(
714 fileIds = featd[[i]]$fileIdx,
715 spectrumIds = featd[[i]]$spIdx,
716 nSpectra = nrow(featd[[i]]),
717 nFiles = length(unlist(fls))
718 )
719 }
720 featd <- do.call(rbind, featd)
721 featd$spectrum <- 1:nrow(featd)
722 ## experimentData
723 expdata <- lapply(x, function(z) {
724 ed <- z@experimentData
725 data.frame(instrumentManufacturer = ed@instrumentManufacturer,
726 instrumentModel = ed@instrumentModel,
727 ionSource = ed@ionSource,
728 analyser = ed@analyser,
729 detectorType = ed@detectorType,
730 stringsAsFactors = FALSE)
731 })
732 expdata <- do.call(rbind, expdata)
733 expdata <- new("MIAPE",
734 instrumentManufacturer = expdata$instrumentManufacturer,
735 instrumentModel = expdata$instrumentModel,
736 ionSource = expdata$ionSource,
737 analyser = expdata$analyser,
738 detectorType = expdata$detectorType)
739
740 ## protocolData
741 protodata <- lapply(x, function(z) z@protocolData)
742 if (any(unlist(lapply(protodata, nrow)) > 0))
743 warning("Found non-empty protocol data, but merging protocol data is",
744 " currently not supported. Skipped.")
745 ## phenoData
746 pdata <- do.call(rbind, lapply(x, pData))
747 res <- new(
748 "OnDiskMSnExp",
749 phenoData = new("NAnnotatedDataFrame", data = pdata),
750 featureData = new("AnnotatedDataFrame", featd),
751 processingData = new("MSnProcess",
752 processing = paste0("Concatenated [", date(), "]"),
753 files = unlist(fls), smoothed = NA),
754 experimentData = expdata,
755 spectraProcessingQueue = new_procQ)
756 if (validObject(res))
757 res
758 }
759
760 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7
761 # https://github.com/sneumann/xcms/issues/247
762 c.XCMSnExp <- function(...) {
763 .concatenate_XCMSnExp(...)
764 }