Mercurial > repos > lecorguille > xcms_merge
comparison lib.r @ 7:dca722aecb67 draft
planemo upload for repository https://github.com/workflow4metabolomics/xcms commit e384d6dd5f410799ec211f73bca0b5d5d7bc651e
author | lecorguille |
---|---|
date | Thu, 01 Mar 2018 04:15:19 -0500 |
parents | |
children | 6b5504f877ff |
comparison
equal
deleted
inserted
replaced
6:5c7a7484dc51 | 7:dca722aecb67 |
---|---|
1 #@authors ABiMS TEAM, Y. Guitton | |
2 # lib.r for Galaxy Workflow4Metabolomics xcms tools | |
3 | |
4 #@author G. Le Corguille | |
5 # solve an issue with batch if arguments are logical TRUE/FALSE | |
6 parseCommandArgs <- function(...) { | |
7 args <- batch::parseCommandArgs(...) | |
8 for (key in names(args)) { | |
9 if (args[key] %in% c("TRUE","FALSE")) | |
10 args[key] = as.logical(args[key]) | |
11 } | |
12 return(args) | |
13 } | |
14 | |
15 #@author G. Le Corguille | |
16 # This function will | |
17 # - load the packages | |
18 # - display the sessionInfo | |
19 loadAndDisplayPackages <- function(pkgs) { | |
20 for(pkg in pkgs) suppressPackageStartupMessages( stopifnot( library(pkg, quietly=TRUE, logical.return=TRUE, character.only=TRUE))) | |
21 | |
22 sessioninfo = sessionInfo() | |
23 cat(sessioninfo$R.version$version.string,"\n") | |
24 cat("Main packages:\n") | |
25 for (pkg in names(sessioninfo$otherPkgs)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n") | |
26 cat("Other loaded packages:\n") | |
27 for (pkg in names(sessioninfo$loadedOnly)) { cat(paste(pkg,packageVersion(pkg)),"\t") }; cat("\n") | |
28 } | |
29 | |
30 #@author G. Le Corguille | |
31 # This function convert if it is required the Retention Time in minutes | |
32 RTSecondToMinute <- function(variableMetadata, convertRTMinute) { | |
33 if (convertRTMinute){ | |
34 #converting the retention times (seconds) into minutes | |
35 print("converting the retention times into minutes in the variableMetadata") | |
36 variableMetadata[,"rt"] <- variableMetadata[,"rt"]/60 | |
37 variableMetadata[,"rtmin"] <- variableMetadata[,"rtmin"]/60 | |
38 variableMetadata[,"rtmax"] <- variableMetadata[,"rtmax"]/60 | |
39 } | |
40 return (variableMetadata) | |
41 } | |
42 | |
43 #@author G. Le Corguille | |
44 # This function format ions identifiers | |
45 formatIonIdentifiers <- function(variableMetadata, numDigitsRT=0, numDigitsMZ=0) { | |
46 splitDeco <- strsplit(as.character(variableMetadata$name),"_") | |
47 idsDeco <- sapply(splitDeco, function(x) { deco=unlist(x)[2]; if (is.na(deco)) return ("") else return(paste0("_",deco)) }) | |
48 namecustom <- make.unique(paste0("M",round(variableMetadata[,"mz"],numDigitsMZ),"T",round(variableMetadata[,"rt"],numDigitsRT),idsDeco)) | |
49 variableMetadata <- cbind(name=variableMetadata$name, namecustom=namecustom, variableMetadata[,!(colnames(variableMetadata) %in% c("name"))]) | |
50 return(variableMetadata) | |
51 } | |
52 | |
53 #@author G. Le Corguille | |
54 # Draw the plotChromPeakDensity 3 per page in a pdf file | |
55 getPlotChromPeakDensity <- function(xdata) { | |
56 pdf(file="plotChromPeakDensity.pdf", width=16, height=12) | |
57 | |
58 par(mfrow = c(3, 1), mar = c(4, 4, 1, 0.5)) | |
59 | |
60 group_colors <- brewer.pal(3, "Set1")[1:length(unique(xdata$sample_group))] | |
61 names(group_colors) <- unique(xdata$sample_group) | |
62 | |
63 xlim <- c(min(featureDefinitions(xdata)$rtmin), max(featureDefinitions(xdata)$rtmax)) | |
64 for (i in 1:nrow(featureDefinitions(xdata))) { | |
65 plotChromPeakDensity(xdata, mz=c(featureDefinitions(xdata)[i,]$mzmin,featureDefinitions(xdata)[i,]$mzmax), col=group_colors, pch=16, xlim=xlim) | |
66 legend("topright", legend=names(group_colors), col=group_colors, cex=0.8, lty=1) | |
67 } | |
68 | |
69 dev.off() | |
70 } | |
71 | |
72 #@author G. Le Corguille | |
73 # Draw the plotChromPeakDensity 3 per page in a pdf file | |
74 getPlotAdjustedRtime <- function(xdata) { | |
75 pdf(file="raw_vs_adjusted_rt.pdf", width=16, height=12) | |
76 # Color by group | |
77 group_colors <- brewer.pal(3, "Set1")[1:length(unique(xdata$sample_group))] | |
78 names(group_colors) <- unique(xdata$sample_group) | |
79 plotAdjustedRtime(xdata, col = group_colors[xdata$sample_group]) | |
80 legend("topright", legend=names(group_colors), col=group_colors, cex=0.8, lty=1) | |
81 # Color by sample | |
82 plotAdjustedRtime(xdata, col = rainbow(length(xdata@phenoData@data$sample_name))) | |
83 legend("topright", legend=xdata@phenoData@data$sample_name, col=rainbow(length(xdata@phenoData@data$sample_name)), cex=0.8, lty=1) | |
84 dev.off() | |
85 } | |
86 | |
87 #@author G. Le Corguille | |
88 # value: intensity values to be used into, maxo or intb | |
89 getPeaklistW4M <- function(xdata, intval="into", convertRTMinute=F, numDigitsMZ=4, numDigitsRT=0, variableMetadataOutput, dataMatrixOutput) { | |
90 dataMatrix <- featureValues(xdata, method="medret", value=intval) | |
91 colnames(dataMatrix) <- tools::file_path_sans_ext(colnames(dataMatrix)) | |
92 dataMatrix = cbind(name=groupnamesW4M(xdata), dataMatrix) | |
93 variableMetadata <- featureDefinitions(xdata) | |
94 colnames(variableMetadata)[1] = "mz"; colnames(variableMetadata)[4] = "rt" | |
95 variableMetadata = data.frame(name=groupnamesW4M(xdata), variableMetadata) | |
96 | |
97 variableMetadata <- RTSecondToMinute(variableMetadata, convertRTMinute) | |
98 variableMetadata <- formatIonIdentifiers(variableMetadata, numDigitsRT=numDigitsRT, numDigitsMZ=numDigitsMZ) | |
99 | |
100 write.table(variableMetadata, file=variableMetadataOutput,sep="\t",quote=F,row.names=F) | |
101 write.table(dataMatrix, file=dataMatrixOutput,sep="\t",quote=F,row.names=F) | |
102 | |
103 } | |
104 | |
105 #@author Y. Guitton | |
106 getBPC <- function(file,rtcor=NULL, ...) { | |
107 object <- xcmsRaw(file) | |
108 sel <- profRange(object, ...) | |
109 cbind(if (is.null(rtcor)) object@scantime[sel$scanidx] else rtcor ,xcms:::colMax(object@env$profile[sel$massidx,sel$scanidx,drop=FALSE])) | |
110 #plotChrom(xcmsRaw(file), base=T) | |
111 } | |
112 | |
113 #@author Y. Guitton | |
114 getBPCs <- function (xcmsSet=NULL, pdfname="BPCs.pdf",rt=c("raw","corrected"), scanrange=NULL) { | |
115 cat("Creating BIC pdf...\n") | |
116 | |
117 if (is.null(xcmsSet)) { | |
118 cat("Enter an xcmsSet \n") | |
119 stop() | |
120 } else { | |
121 files <- filepaths(xcmsSet) | |
122 } | |
123 | |
124 phenoDataClass <- as.vector(levels(xcmsSet@phenoData[,"class"])) #sometime phenoData have more than 1 column use first as class | |
125 | |
126 classnames <- vector("list",length(phenoDataClass)) | |
127 for (i in 1:length(phenoDataClass)){ | |
128 classnames[[i]] <- which( xcmsSet@phenoData[,"class"]==phenoDataClass[i]) | |
129 } | |
130 | |
131 N <- dim(phenoData(xcmsSet))[1] | |
132 | |
133 TIC <- vector("list",N) | |
134 | |
135 | |
136 for (j in 1:N) { | |
137 | |
138 TIC[[j]] <- getBPC(files[j]) | |
139 #good for raw | |
140 # seems strange for corrected | |
141 #errors if scanrange used in xcmsSetgeneration | |
142 if (!is.null(xcmsSet) && rt == "corrected") | |
143 rtcor <- xcmsSet@rt$corrected[[j]] | |
144 else | |
145 rtcor <- NULL | |
146 | |
147 TIC[[j]] <- getBPC(files[j],rtcor=rtcor) | |
148 # TIC[[j]][,1]<-rtcor | |
149 } | |
150 | |
151 | |
152 | |
153 pdf(pdfname,w=16,h=10) | |
154 cols <- rainbow(N) | |
155 lty <- 1:N | |
156 pch <- 1:N | |
157 #search for max x and max y in BPCs | |
158 xlim <- range(sapply(TIC, function(x) range(x[,1]))) | |
159 ylim <- range(sapply(TIC, function(x) range(x[,2]))) | |
160 ylim <- c(-ylim[2], ylim[2]) | |
161 | |
162 | |
163 ##plot start | |
164 | |
165 if (length(phenoDataClass)>2){ | |
166 for (k in 1:(length(phenoDataClass)-1)){ | |
167 for (l in (k+1):length(phenoDataClass)){ | |
168 #print(paste(phenoDataClass[k],"vs",phenoDataClass[l],sep=" ")) | |
169 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k]," vs ",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="BPC") | |
170 colvect <- NULL | |
171 for (j in 1:length(classnames[[k]])) { | |
172 tic <- TIC[[classnames[[k]][j]]] | |
173 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
174 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
175 colvect <- append(colvect,cols[classnames[[k]][j]]) | |
176 } | |
177 for (j in 1:length(classnames[[l]])) { | |
178 # i <- class2names[j] | |
179 tic <- TIC[[classnames[[l]][j]]] | |
180 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l") | |
181 colvect <- append(colvect,cols[classnames[[l]][j]]) | |
182 } | |
183 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch) | |
184 } | |
185 } | |
186 }#end if length >2 | |
187 | |
188 if (length(phenoDataClass)==2){ | |
189 k <- 1 | |
190 l <- 2 | |
191 colvect <- NULL | |
192 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k],"vs",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="BPC") | |
193 | |
194 for (j in 1:length(classnames[[k]])) { | |
195 | |
196 tic <- TIC[[classnames[[k]][j]]] | |
197 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
198 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
199 colvect<-append(colvect,cols[classnames[[k]][j]]) | |
200 } | |
201 for (j in 1:length(classnames[[l]])) { | |
202 # i <- class2names[j] | |
203 tic <- TIC[[classnames[[l]][j]]] | |
204 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l") | |
205 colvect <- append(colvect,cols[classnames[[l]][j]]) | |
206 } | |
207 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch) | |
208 | |
209 }#end length ==2 | |
210 | |
211 #case where only one class | |
212 if (length(phenoDataClass)==1){ | |
213 k <- 1 | |
214 ylim <- range(sapply(TIC, function(x) range(x[,2]))) | |
215 colvect <- NULL | |
216 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Base Peak Chromatograms \n","BPCs_",phenoDataClass[k], sep=""), xlab="Retention Time (min)", ylab="BPC") | |
217 | |
218 for (j in 1:length(classnames[[k]])) { | |
219 tic <- TIC[[classnames[[k]][j]]] | |
220 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
221 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
222 colvect <- append(colvect,cols[classnames[[k]][j]]) | |
223 } | |
224 | |
225 legend("topright",paste(basename(files[c(classnames[[k]])])), col=colvect, lty=lty, pch=pch) | |
226 | |
227 }#end length ==1 | |
228 | |
229 dev.off() #pdf(pdfname,w=16,h=10) | |
230 | |
231 invisible(TIC) | |
232 } | |
233 | |
234 | |
235 | |
236 #@author Y. Guitton | |
237 getTIC <- function(file, rtcor=NULL) { | |
238 object <- xcmsRaw(file) | |
239 cbind(if (is.null(rtcor)) object@scantime else rtcor, rawEIC(object, mzrange=range(object@env$mz))$intensity) | |
240 } | |
241 | |
242 #overlay TIC from all files in current folder or from xcmsSet, create pdf | |
243 #@author Y. Guitton | |
244 getTICs <- function(xcmsSet=NULL,files=NULL, pdfname="TICs.pdf", rt=c("raw","corrected")) { | |
245 cat("Creating TIC pdf...\n") | |
246 | |
247 if (is.null(xcmsSet)) { | |
248 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", "[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
249 filepattern <- paste(paste("\\.", filepattern, "$", sep=""), collapse="|") | |
250 if (is.null(files)) | |
251 files <- getwd() | |
252 info <- file.info(files) | |
253 listed <- list.files(files[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE) | |
254 files <- c(files[!info$isdir], listed) | |
255 } else { | |
256 files <- filepaths(xcmsSet) | |
257 } | |
258 | |
259 phenoDataClass <- as.vector(levels(xcmsSet@phenoData[,"class"])) #sometime phenoData have more than 1 column use first as class | |
260 classnames <- vector("list",length(phenoDataClass)) | |
261 for (i in 1:length(phenoDataClass)){ | |
262 classnames[[i]] <- which( xcmsSet@phenoData[,"class"]==phenoDataClass[i]) | |
263 } | |
264 | |
265 N <- length(files) | |
266 TIC <- vector("list",N) | |
267 | |
268 for (i in 1:N) { | |
269 if (!is.null(xcmsSet) && rt == "corrected") | |
270 rtcor <- xcmsSet@rt$corrected[[i]] else | |
271 rtcor <- NULL | |
272 TIC[[i]] <- getTIC(files[i], rtcor=rtcor) | |
273 } | |
274 | |
275 pdf(pdfname, w=16, h=10) | |
276 cols <- rainbow(N) | |
277 lty <- 1:N | |
278 pch <- 1:N | |
279 #search for max x and max y in TICs | |
280 xlim <- range(sapply(TIC, function(x) range(x[,1]))) | |
281 ylim <- range(sapply(TIC, function(x) range(x[,2]))) | |
282 ylim <- c(-ylim[2], ylim[2]) | |
283 | |
284 | |
285 ##plot start | |
286 if (length(phenoDataClass)>2){ | |
287 for (k in 1:(length(phenoDataClass)-1)){ | |
288 for (l in (k+1):length(phenoDataClass)){ | |
289 #print(paste(phenoDataClass[k],"vs",phenoDataClass[l],sep=" ")) | |
290 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k]," vs ",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="TIC") | |
291 colvect <- NULL | |
292 for (j in 1:length(classnames[[k]])) { | |
293 tic <- TIC[[classnames[[k]][j]]] | |
294 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
295 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
296 colvect <- append(colvect,cols[classnames[[k]][j]]) | |
297 } | |
298 for (j in 1:length(classnames[[l]])) { | |
299 # i=class2names[j] | |
300 tic <- TIC[[classnames[[l]][j]]] | |
301 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l") | |
302 colvect <- append(colvect,cols[classnames[[l]][j]]) | |
303 } | |
304 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch) | |
305 } | |
306 } | |
307 }#end if length >2 | |
308 if (length(phenoDataClass)==2){ | |
309 k <- 1 | |
310 l <- 2 | |
311 | |
312 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k],"vs",phenoDataClass[l], sep=""), xlab="Retention Time (min)", ylab="TIC") | |
313 colvect <- NULL | |
314 for (j in 1:length(classnames[[k]])) { | |
315 tic <- TIC[[classnames[[k]][j]]] | |
316 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
317 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
318 colvect <- append(colvect,cols[classnames[[k]][j]]) | |
319 } | |
320 for (j in 1:length(classnames[[l]])) { | |
321 # i <- class2names[j] | |
322 tic <- TIC[[classnames[[l]][j]]] | |
323 points(tic[,1]/60, -tic[,2], col=cols[classnames[[l]][j]], pch=pch[classnames[[l]][j]], type="l") | |
324 colvect <- append(colvect,cols[classnames[[l]][j]]) | |
325 } | |
326 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col=colvect, lty=lty, pch=pch) | |
327 | |
328 }#end length ==2 | |
329 | |
330 #case where only one class | |
331 if (length(phenoDataClass)==1){ | |
332 k <- 1 | |
333 ylim <- range(sapply(TIC, function(x) range(x[,2]))) | |
334 | |
335 plot(0, 0, type="n", xlim=xlim/60, ylim=ylim, main=paste("Total Ion Chromatograms \n","TICs_",phenoDataClass[k], sep=""), xlab="Retention Time (min)", ylab="TIC") | |
336 colvect <- NULL | |
337 for (j in 1:length(classnames[[k]])) { | |
338 tic <- TIC[[classnames[[k]][j]]] | |
339 # points(tic[,1]/60, tic[,2], col=cols[i], pch=pch[i], type="l") | |
340 points(tic[,1]/60, tic[,2], col=cols[classnames[[k]][j]], pch=pch[classnames[[k]][j]], type="l") | |
341 colvect <- append(colvect,cols[classnames[[k]][j]]) | |
342 } | |
343 | |
344 legend("topright",paste(basename(files[c(classnames[[k]])])), col=colvect, lty=lty, pch=pch) | |
345 | |
346 }#end length ==1 | |
347 | |
348 dev.off() #pdf(pdfname,w=16,h=10) | |
349 | |
350 invisible(TIC) | |
351 } | |
352 | |
353 | |
354 | |
355 # Get the polarities from all the samples of a condition | |
356 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
357 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM | |
358 getSampleMetadata <- function(xdata=NULL, sampleMetadataOutput="sampleMetadata.tsv") { | |
359 cat("Creating the sampleMetadata file...\n") | |
360 | |
361 #Create the sampleMetada dataframe | |
362 sampleMetadata <- xdata@phenoData@data | |
363 rownames(sampleMetadata) <- NULL | |
364 colnames(sampleMetadata) <- c("sampleMetadata", "class") | |
365 | |
366 sampleNamesOrigin <- sampleMetadata$sampleMetadata | |
367 sampleNamesMakeNames <- make.names(sampleNamesOrigin) | |
368 | |
369 if (any(duplicated(sampleNamesMakeNames))) { | |
370 write("\n\nERROR: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names().\nIn your case, at least two columns after the renaming obtain the same name, thus XCMS will collapse those columns per name.", stderr()) | |
371 for (sampleName in sampleNamesOrigin) { | |
372 write(paste(sampleName,"\t->\t",make.names(sampleName)),stderr()) | |
373 } | |
374 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.") | |
375 } | |
376 | |
377 if (!all(sampleNamesOrigin == sampleNamesMakeNames)) { | |
378 cat("\n\nWARNING: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names()\nIn your case, one or more sample names will be renamed in the sampleMetadata and dataMatrix files:\n") | |
379 for (sampleName in sampleNamesOrigin) { | |
380 cat(paste(sampleName,"\t->\t",make.names(sampleName),"\n")) | |
381 } | |
382 } | |
383 | |
384 sampleMetadata$sampleMetadata <- sampleNamesMakeNames | |
385 | |
386 | |
387 #For each sample file, the following actions are done | |
388 for (fileIdx in 1:length(fileNames(xdata))) { | |
389 #Check if the file is in the CDF format | |
390 if (!mzR:::netCDFIsFile(fileNames(xdata))) { | |
391 | |
392 # If the column isn't exist, with add one filled with NA | |
393 if (is.null(sampleMetadata$polarity)) sampleMetadata$polarity <- NA | |
394 | |
395 #Extract the polarity (a list of polarities) | |
396 polarity <- fData(xdata)[fData(xdata)$fileIdx == fileIdx,"polarity"] | |
397 #Verify if all the scans have the same polarity | |
398 uniq_list <- unique(polarity) | |
399 if (length(uniq_list)>1){ | |
400 polarity <- "mixed" | |
401 } else { | |
402 polarity <- as.character(uniq_list) | |
403 } | |
404 | |
405 #Set the polarity attribute | |
406 sampleMetadata$polarity[fileIdx] <- polarity | |
407 } | |
408 | |
409 } | |
410 | |
411 write.table(sampleMetadata, sep="\t", quote=FALSE, row.names=FALSE, file=sampleMetadataOutput) | |
412 | |
413 return(list("sampleNamesOrigin"=sampleNamesOrigin, "sampleNamesMakeNames"=sampleNamesMakeNames)) | |
414 | |
415 } | |
416 | |
417 | |
418 # This function check if xcms will found all the files | |
419 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM | |
420 checkFilesCompatibilityWithXcms <- function(directory) { | |
421 cat("Checking files filenames compatibilities with xmcs...\n") | |
422 # WHAT XCMS WILL FIND | |
423 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
424 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|") | |
425 info <- file.info(directory) | |
426 listed <- list.files(directory[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE) | |
427 files <- c(directory[!info$isdir], listed) | |
428 files_abs <- file.path(getwd(), files) | |
429 exists <- file.exists(files_abs) | |
430 files[exists] <- files_abs[exists] | |
431 files[exists] <- sub("//","/",files[exists]) | |
432 | |
433 # WHAT IS ON THE FILESYSTEM | |
434 filesystem_filepaths <- system(paste("find $PWD/",directory," -not -name '\\.*' -not -path '*conda-env*' -type f -name \"*\"", sep=""), intern=T) | |
435 filesystem_filepaths <- filesystem_filepaths[grep(filepattern, filesystem_filepaths, perl=T)] | |
436 | |
437 # COMPARISON | |
438 if (!is.na(table(filesystem_filepaths %in% files)["FALSE"])) { | |
439 write("\n\nERROR: List of the files which will not be imported by xcmsSet",stderr()) | |
440 write(filesystem_filepaths[!(filesystem_filepaths %in% files)],stderr()) | |
441 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.") | |
442 } | |
443 } | |
444 | |
445 | |
446 #This function list the compatible files within the directory as xcms did | |
447 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM | |
448 getMSFiles <- function (directory) { | |
449 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
450 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|") | |
451 info <- file.info(directory) | |
452 listed <- list.files(directory[info$isdir], pattern=filepattern,recursive=TRUE, full.names=TRUE) | |
453 files <- c(directory[!info$isdir], listed) | |
454 exists <- file.exists(files) | |
455 files <- files[exists] | |
456 return(files) | |
457 } | |
458 | |
459 # This function check if XML contains special caracters. It also checks integrity and completness. | |
460 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
461 checkXmlStructure <- function (directory) { | |
462 cat("Checking XML structure...\n") | |
463 | |
464 cmd <- paste("IFS=$'\n'; for xml in $(find",directory,"-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'); do if [ $(xmllint --nonet --noout \"$xml\" 2> /dev/null; echo $?) -gt 0 ]; then echo $xml;fi; done;") | |
465 capture <- system(cmd, intern=TRUE) | |
466 | |
467 if (length(capture)>0){ | |
468 #message=paste("The following mzXML or mzML file is incorrect, please check these files first:",capture) | |
469 write("\n\nERROR: The following mzXML or mzML file(s) are incorrect, please check these files first:", stderr()) | |
470 write(capture, stderr()) | |
471 stop("ERROR: xcmsSet cannot continue with incorrect mzXML or mzML files") | |
472 } | |
473 | |
474 } | |
475 | |
476 | |
477 # This function check if XML contain special characters | |
478 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
479 deleteXmlBadCharacters<- function (directory) { | |
480 cat("Checking Non ASCII characters in the XML...\n") | |
481 | |
482 processed <- F | |
483 l <- system( paste("find",directory, "-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'"), intern=TRUE) | |
484 for (i in l){ | |
485 cmd <- paste("LC_ALL=C grep '[^ -~]' \"", i, "\"", sep="") | |
486 capture <- suppressWarnings(system(cmd, intern=TRUE)) | |
487 if (length(capture)>0){ | |
488 cmd <- paste("perl -i -pe 's/[^[:ascii:]]//g;'",i) | |
489 print( paste("WARNING: Non ASCII characters have been removed from the ",i,"file") ) | |
490 c <- system(cmd, intern=TRUE) | |
491 capture <- "" | |
492 processed <- T | |
493 } | |
494 } | |
495 if (processed) cat("\n\n") | |
496 return(processed) | |
497 } | |
498 | |
499 | |
500 # This function will compute MD5 checksum to check the data integrity | |
501 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr | |
502 getMd5sum <- function (directory) { | |
503 cat("Compute md5 checksum...\n") | |
504 # WHAT XCMS WILL FIND | |
505 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
506 filepattern <- paste(paste("\\.", filepattern, "$", sep=""),collapse="|") | |
507 info <- file.info(directory) | |
508 listed <- list.files(directory[info$isdir], pattern=filepattern, recursive=TRUE, full.names=TRUE) | |
509 files <- c(directory[!info$isdir], listed) | |
510 exists <- file.exists(files) | |
511 files <- files[exists] | |
512 | |
513 library(tools) | |
514 | |
515 #cat("\n\n") | |
516 | |
517 return(as.matrix(md5sum(files))) | |
518 } | |
519 | |
520 | |
521 # This function get the raw file path from the arguments | |
522 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr | |
523 getRawfilePathFromArguments <- function(singlefile, zipfile, args) { | |
524 if (!is.null(args$zipfile)) zipfile <- args$zipfile | |
525 if (!is.null(args$zipfilePositive)) zipfile <- args$zipfilePositive | |
526 if (!is.null(args$zipfileNegative)) zipfile <- args$zipfileNegative | |
527 | |
528 if (!is.null(args$singlefile_galaxyPath)) { | |
529 singlefile_galaxyPaths <- args$singlefile_galaxyPath; | |
530 singlefile_sampleNames <- args$singlefile_sampleName | |
531 } | |
532 if (!is.null(args$singlefile_galaxyPathPositive)) { | |
533 singlefile_galaxyPaths <- args$singlefile_galaxyPathPositive; | |
534 singlefile_sampleNames <- args$singlefile_sampleNamePositive | |
535 } | |
536 if (!is.null(args$singlefile_galaxyPathNegative)) { | |
537 singlefile_galaxyPaths <- args$singlefile_galaxyPathNegative; | |
538 singlefile_sampleNames <- args$singlefile_sampleNameNegative | |
539 } | |
540 if (exists("singlefile_galaxyPaths")){ | |
541 singlefile_galaxyPaths <- unlist(strsplit(singlefile_galaxyPaths,",")) | |
542 singlefile_sampleNames <- unlist(strsplit(singlefile_sampleNames,",")) | |
543 | |
544 singlefile <- NULL | |
545 for (singlefile_galaxyPath_i in seq(1:length(singlefile_galaxyPaths))) { | |
546 singlefile_galaxyPath <- singlefile_galaxyPaths[singlefile_galaxyPath_i] | |
547 singlefile_sampleName <- singlefile_sampleNames[singlefile_galaxyPath_i] | |
548 singlefile[[singlefile_sampleName]] <- singlefile_galaxyPath | |
549 } | |
550 } | |
551 for (argument in c("zipfile","zipfilePositive","zipfileNegative","singlefile_galaxyPath","singlefile_sampleName","singlefile_galaxyPathPositive","singlefile_sampleNamePositive","singlefile_galaxyPathNegative","singlefile_sampleNameNegative")) { | |
552 args[[argument]] <- NULL | |
553 } | |
554 return(list(zipfile=zipfile, singlefile=singlefile, args=args)) | |
555 } | |
556 | |
557 | |
558 # This function retrieve the raw file in the working directory | |
559 # - if zipfile: unzip the file with its directory tree | |
560 # - if singlefiles: set symlink with the good filename | |
561 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr | |
562 retrieveRawfileInTheWorkingDirectory <- function(singlefile, zipfile) { | |
563 if(!is.null(singlefile) && (length("singlefile")>0)) { | |
564 for (singlefile_sampleName in names(singlefile)) { | |
565 singlefile_galaxyPath <- singlefile[[singlefile_sampleName]] | |
566 if(!file.exists(singlefile_galaxyPath)){ | |
567 error_message <- paste("Cannot access the sample:",singlefile_sampleName,"located:",singlefile_galaxyPath,". Please, contact your administrator ... if you have one!") | |
568 print(error_message); stop(error_message) | |
569 } | |
570 | |
571 if (!suppressWarnings( try (file.link(singlefile_galaxyPath, singlefile_sampleName), silent=T))) | |
572 file.copy(singlefile_galaxyPath, singlefile_sampleName) | |
573 | |
574 } | |
575 directory <- "." | |
576 | |
577 } | |
578 if(!is.null(zipfile) && (zipfile != "")) { | |
579 if(!file.exists(zipfile)){ | |
580 error_message <- paste("Cannot access the Zip file:",zipfile,". Please, contact your administrator ... if you have one!") | |
581 print(error_message) | |
582 stop(error_message) | |
583 } | |
584 | |
585 #list all file in the zip file | |
586 #zip_files <- unzip(zipfile,list=T)[,"Name"] | |
587 | |
588 #unzip | |
589 suppressWarnings(unzip(zipfile, unzip="unzip")) | |
590 | |
591 #get the directory name | |
592 suppressWarnings(filesInZip <- unzip(zipfile, list=T)) | |
593 directories <- unique(unlist(lapply(strsplit(filesInZip$Name,"/"), function(x) x[1]))) | |
594 directories <- directories[!(directories %in% c("__MACOSX")) & file.info(directories)$isdir] | |
595 directory <- "." | |
596 if (length(directories) == 1) directory <- directories | |
597 | |
598 cat("files_root_directory\t",directory,"\n") | |
599 | |
600 } | |
601 return (directory) | |
602 } | |
603 | |
604 | |
605 # This function retrieve a xset like object | |
606 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr | |
607 getxcmsSetObject <- function(xobject) { | |
608 # XCMS 1.x | |
609 if (class(xobject) == "xcmsSet") | |
610 return (xobject) | |
611 # XCMS 3.x | |
612 if (class(xobject) == "XCMSnExp") { | |
613 # Get the legacy xcmsSet object | |
614 suppressWarnings(xset <- as(xobject, 'xcmsSet')) | |
615 sampclass(xset) <- xset@phenoData$sample_group | |
616 return (xset) | |
617 } | |
618 } | |
619 | |
620 | |
621 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7 | |
622 # https://github.com/sneumann/xcms/issues/250 | |
623 groupnamesW4M <- function(xdata, mzdec = 0, rtdec = 0) { | |
624 mzfmt <- paste("%.", mzdec, "f", sep = "") | |
625 rtfmt <- paste("%.", rtdec, "f", sep = "") | |
626 | |
627 gnames <- paste("M", sprintf(mzfmt, featureDefinitions(xdata)[,"mzmed"]), "T", | |
628 sprintf(rtfmt, featureDefinitions(xdata)[,"rtmed"]), sep = "") | |
629 | |
630 if (any(dup <- duplicated(gnames))) | |
631 for (dupname in unique(gnames[dup])) { | |
632 dupidx <- which(gnames == dupname) | |
633 gnames[dupidx] <- paste(gnames[dupidx], seq(along = dupidx), sep = "_") | |
634 } | |
635 | |
636 return (gnames) | |
637 } | |
638 | |
639 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7 | |
640 # https://github.com/sneumann/xcms/issues/247 | |
641 .concatenate_XCMSnExp <- function(...) { | |
642 x <- list(...) | |
643 if (length(x) == 0) | |
644 return(NULL) | |
645 if (length(x) == 1) | |
646 return(x[[1]]) | |
647 ## Check that all are XCMSnExp objects. | |
648 if (!all(unlist(lapply(x, function(z) is(z, "XCMSnExp"))))) | |
649 stop("All passed objects should be 'XCMSnExp' objects") | |
650 new_x <- as(.concatenate_OnDiskMSnExp(...), "XCMSnExp") | |
651 ## If any of the XCMSnExp has alignment results or detected features drop | |
652 ## them! | |
653 x <- lapply(x, function(z) { | |
654 if (hasAdjustedRtime(z)) { | |
655 z <- dropAdjustedRtime(z) | |
656 warning("Adjusted retention times found, had to drop them.") | |
657 } | |
658 if (hasFeatures(z)) { | |
659 z <- dropFeatureDefinitions(z) | |
660 warning("Feature definitions found, had to drop them.") | |
661 } | |
662 z | |
663 }) | |
664 ## Combine peaks | |
665 fls <- lapply(x, fileNames) | |
666 startidx <- cumsum(lengths(fls)) | |
667 pks <- lapply(x, chromPeaks) | |
668 procH <- lapply(x, processHistory) | |
669 for (i in 2:length(fls)) { | |
670 pks[[i]][, "sample"] <- pks[[i]][, "sample"] + startidx[i - 1] | |
671 procH[[i]] <- lapply(procH[[i]], function(z) { | |
672 z@fileIndex <- as.integer(z@fileIndex + startidx[i - 1]) | |
673 z | |
674 }) | |
675 } | |
676 pks <- do.call(rbind, pks) | |
677 new_x@.processHistory <- unlist(procH) | |
678 chromPeaks(new_x) <- pks | |
679 if (validObject(new_x)) | |
680 new_x | |
681 } | |
682 | |
683 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7 | |
684 # https://github.com/sneumann/xcms/issues/247 | |
685 .concatenate_OnDiskMSnExp <- function(...) { | |
686 x <- list(...) | |
687 if (length(x) == 0) | |
688 return(NULL) | |
689 if (length(x) == 1) | |
690 return(x[[1]]) | |
691 ## Check that all are XCMSnExp objects. | |
692 if (!all(unlist(lapply(x, function(z) is(z, "OnDiskMSnExp"))))) | |
693 stop("All passed objects should be 'OnDiskMSnExp' objects") | |
694 ## Check processingQueue | |
695 procQ <- lapply(x, function(z) z@spectraProcessingQueue) | |
696 new_procQ <- procQ[[1]] | |
697 is_ok <- unlist(lapply(procQ, function(z) | |
698 !is.character(all.equal(new_procQ, z)) | |
699 )) | |
700 if (any(!is_ok)) { | |
701 warning("Processing queues from the submitted objects differ! ", | |
702 "Dropping the processing queue.") | |
703 new_procQ <- list() | |
704 } | |
705 ## processingData | |
706 fls <- lapply(x, function(z) z@processingData@files) | |
707 startidx <- cumsum(lengths(fls)) | |
708 ## featureData | |
709 featd <- lapply(x, fData) | |
710 ## Have to update the file index and the spectrum names. | |
711 for (i in 2:length(featd)) { | |
712 featd[[i]]$fileIdx <- featd[[i]]$fileIdx + startidx[i - 1] | |
713 rownames(featd[[i]]) <- MSnbase:::formatFileSpectrumNames( | |
714 fileIds = featd[[i]]$fileIdx, | |
715 spectrumIds = featd[[i]]$spIdx, | |
716 nSpectra = nrow(featd[[i]]), | |
717 nFiles = length(unlist(fls)) | |
718 ) | |
719 } | |
720 featd <- do.call(rbind, featd) | |
721 featd$spectrum <- 1:nrow(featd) | |
722 ## experimentData | |
723 expdata <- lapply(x, function(z) { | |
724 ed <- z@experimentData | |
725 data.frame(instrumentManufacturer = ed@instrumentManufacturer, | |
726 instrumentModel = ed@instrumentModel, | |
727 ionSource = ed@ionSource, | |
728 analyser = ed@analyser, | |
729 detectorType = ed@detectorType, | |
730 stringsAsFactors = FALSE) | |
731 }) | |
732 expdata <- do.call(rbind, expdata) | |
733 expdata <- new("MIAPE", | |
734 instrumentManufacturer = expdata$instrumentManufacturer, | |
735 instrumentModel = expdata$instrumentModel, | |
736 ionSource = expdata$ionSource, | |
737 analyser = expdata$analyser, | |
738 detectorType = expdata$detectorType) | |
739 | |
740 ## protocolData | |
741 protodata <- lapply(x, function(z) z@protocolData) | |
742 if (any(unlist(lapply(protodata, nrow)) > 0)) | |
743 warning("Found non-empty protocol data, but merging protocol data is", | |
744 " currently not supported. Skipped.") | |
745 ## phenoData | |
746 pdata <- do.call(rbind, lapply(x, pData)) | |
747 res <- new( | |
748 "OnDiskMSnExp", | |
749 phenoData = new("NAnnotatedDataFrame", data = pdata), | |
750 featureData = new("AnnotatedDataFrame", featd), | |
751 processingData = new("MSnProcess", | |
752 processing = paste0("Concatenated [", date(), "]"), | |
753 files = unlist(fls), smoothed = NA), | |
754 experimentData = expdata, | |
755 spectraProcessingQueue = new_procQ) | |
756 if (validObject(res)) | |
757 res | |
758 } | |
759 | |
760 #@TODO: remove this function as soon as we can use xcms 3.x.x from Bioconductor 3.7 | |
761 # https://github.com/sneumann/xcms/issues/247 | |
762 c.XCMSnExp <- function(...) { | |
763 .concatenate_XCMSnExp(...) | |
764 } |