# HG changeset patch # User prog # Date 1489655155 14400 # Node ID 1ba222315fd5985ff3db4fb86470e41e54f26d74 # Parent abfba8eb1c8dfe8262302108869db81860c577ae planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit 947b8707b06176a4801de64a71c8771617311ffb diff -r abfba8eb1c8d -r 1ba222315fd5 MsDb.R --- a/MsDb.R Tue Mar 14 12:35:23 2017 -0400 +++ b/MsDb.R Thu Mar 16 05:05:55 2017 -0400 @@ -9,21 +9,22 @@ # CLASS DECLARATION # ##################### - MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character")) + MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character")) ############### # CONSTRUCTOR # ############### MsDb$methods( initialize = function(...) { + + callSuper(...) .observers <<- NULL .output.streams <<- NULL .input.stream <<- NULL .prec <<- MSDB.DFT.PREC .mz.tol.unit <<- MSDB.DFT.MZTOLUNIT - - callSuper(...) + .rt.unit <<- MSDB.RTUNIT.SEC }) #################### @@ -126,6 +127,10 @@ stop("Method setDbMsModes() not implemented in concrete class.") }) + ################### + # SET MZ TOL UNIT # + ################### + MsDb$methods( setMzTolUnit = function(mztolunit) { if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS) @@ -134,6 +139,18 @@ .mz.tol.unit <<- mztolunit }) + ############### + # SET RT UNIT # + ############### + + MsDb$methods( setRtUnit = function(unit) { + + if ( ! unit %in% MSDB.RTUNIT.VALS) + stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), ".")) + + .rt.unit <<- unit + }) + #################### # HANDLE COMPOUNDS # #################### diff -r abfba8eb1c8d -r 1ba222315fd5 MsDbInputDataFrameStream.R --- a/MsDbInputDataFrameStream.R Tue Mar 14 12:35:23 2017 -0400 +++ b/MsDbInputDataFrameStream.R Thu Mar 16 05:05:55 2017 -0400 @@ -7,18 +7,19 @@ # CLASS DECLARATION # ##################### - MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer")) + MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer", .rtunit = 'character')) ############### # CONSTRUCTOR # ############### - MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), ...) { + MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), rtunit = MSDB.RTUNIT.SEC, ...) { + + callSuper(input.fields = input.fields, ...) .df <<- df .i <<- 0L - - callSuper(input.fields = input.fields, ...) + .rtunit <<- rtunit }) ########## @@ -39,10 +40,15 @@ MsDbInputDataFrameStream$methods( getRt = function() { - if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]])) - return(.self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]]) + rt <- NULL - return(NULL) + if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]])) { + rt <- .self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]] + if (.self$.rtunit == MSDB.RTUNIT.MIN) + rt <- rt * 60 + } + + return(rt) }) ########### diff -r abfba8eb1c8d -r 1ba222315fd5 MsFileDb.R --- a/MsFileDb.R Tue Mar 14 12:35:23 2017 -0400 +++ b/MsFileDb.R Thu Mar 16 05:05:55 2017 -0400 @@ -395,8 +395,10 @@ db <- db[db[[MSDB.TAG.COL]] %in% col,] # Filter on retention time - if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) - db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] + if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) { + scale <- if (.self$.rt.unit == MSDB.RTUNIT.MIN) 60 else 1 + db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ] + } # Remove retention times and column information if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) { @@ -487,6 +489,9 @@ rt[col] <- list(colrts) } + if (.self$.rt.unit == MSDB.RTUNIT.MIN) + rt <- 60 * rt + return(rt) }) diff -r abfba8eb1c8d -r 1ba222315fd5 MsPeakForestDb.R --- a/MsPeakForestDb.R Tue Mar 14 12:35:23 2017 -0400 +++ b/MsPeakForestDb.R Thu Mar 16 05:05:55 2017 -0400 @@ -16,6 +16,8 @@ MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) { + callSuper(...) + # Check URL if (is.null(url) || is.na(url)) stop("No URL defined for new MsPeakForestDb instance.") @@ -26,8 +28,7 @@ .url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent) .self$.url.scheduler$setVerbose(1L) .token <<- token - - callSuper(...) + .rt.unit <<- MSDB.RTUNIT.MIN }) ########### @@ -53,11 +54,9 @@ if (ret.type == 'json') { - library(RJSONIO) + res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE) - res <- fromJSON(content, nullValue = NA) - - if (class(res) == 'list' && 'success' %in% names(res) && res$success == FALSE) { + if (is.null(res)) { param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '') stop(paste0("Failed to run web service. URL was \"", url, param.str, "\".")) } @@ -66,8 +65,7 @@ if (grepl('^[0-9]+$', content, perl = TRUE)) res <- as.integer(content) else { - library(RJSONIO) - res <- fromJSON(content, nullValue = NA) + res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE) } } } @@ -141,6 +139,7 @@ for (s in spectra) if (is.na(col) || s$liquidChromatography$columnCode %in% col) { ret.time <- (s$RTmin + s$RTmax) / 2 + ret.time <- ret.time * 60 # Retention time are in minutes in Peakforest, but we want them in seconds c <- s$liquidChromatography$columnCode if (c %in% names(rt)) { if ( ! ret.time %in% rt[[c]]) @@ -262,21 +261,21 @@ results <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.MOLNAMES = character(), MSDB.TAG.MOLMASS = numeric(), MSDB.TAG.MZTHEO = numeric(), MSDB.TAG.COMP = character(), MSDB.TAG.ATTR = character(), MSDB.TAG.INCHI = character(), MSDB.TAG.INCHIKEY = character(), MSDB.TAG.CHEBI = character(), MSDB.TAG.HMDB = character(), MSDB.TAG.KEGG = character(), MSDB.TAG.PUBCHEM = character()) for (x in spectra) { if ('source' %in% names(x) && is.list(x$source)) - mztheo <- if ('theoricalMass' %in% names(x)) as.numeric(x$theoricalMass) else NA_real_ - comp <- if ('composition' %in% names(x)) x$composition else NA_character_ - attr <- if ('attribution' %in% names(x)) x$attribution else NA_character_ + mztheo <- if ('mz' %in% names(x) && ! is.null(x$mz)) as.numeric(x$mz) else NA_real_ + comp <- if ('composition' %in% names(x) && ! is.null(x$composition)) x$composition else NA_character_ + attr <- if ('attribution' %in% names(x) && ! is.null(x$attribution)) x$attribution else NA_character_ if ('listOfCompounds' %in% names(x$source)) { - molids <- vapply(x$source$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = '') - molnames <- vapply(x$source$listOfCompounds, function(c) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP), FUN.VALUE = '') - mass <- vapply(x$source$listOfCompounds, function(c) as.character(c$averageMass), FUN.VALUE = '') - inchi <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChI), FUN.VALUE = '') - inchikey <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChIKey), FUN.VALUE = '') - chebi <- vapply(x$source$listOfCompounds, function(c) as.character(c$ChEBI), FUN.VALUE = '') + molids <- vapply(x$source$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '') + molnames <- vapply(x$source$listOfCompounds, function(c) if ('names' %in% names(c) && ! is.null(c$names)) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP) else NA_character_, FUN.VALUE = '') + mass <- vapply(x$source$listOfCompounds, function(c) if ( ! 'averageMass' %in% names(c) || is.null(c$averageMass)) NA_real_ else as.double(c$averageMass), FUN.VALUE = 0.0) + inchi <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChI' %in% names(c) || is.null(c$inChI)) NA_character_ else as.character(c$inChI), FUN.VALUE = '') + inchikey <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChIKey' %in% names(c) || is.null(c$inChIKey)) NA_character_ else as.character(c$inChIKey), FUN.VALUE = '') + chebi <- vapply(x$source$listOfCompounds, function(c) if ('ChEBI' %in% names(c) && ! is.null(c$ChEBI)) as.character(c$ChEBI) else NA_character_, FUN.VALUE = '') chebi[chebi == 'CHEBI:null'] <- NA_character_ - hmdb <- vapply(x$source$listOfCompounds, function(c) as.character(c$HMDB), FUN.VALUE = '') + hmdb <- vapply(x$source$listOfCompounds, function(c) if ('HMDB' %in% names(c) && ! is.null(c$HMDB)) as.character(c$HMDB) else NA_character_, FUN.VALUE = '') hmdb[hmdb == 'HMDBnull'] <- NA_character_ - kegg <- vapply(x$source$listOfCompounds, function(c) as.character(c$KEGG), FUN.VALUE = '') - pubchem <- vapply(x$source$listOfCompounds, function(c) as.character(c$PubChemCID), FUN.VALUE = '') + kegg <- vapply(x$source$listOfCompounds, function(c) if ( ! 'KEGG' %in% names(c) || is.null(c$KEGG)) NA_character_ else as.character(c$KEGG), FUN.VALUE = '') + pubchem <- vapply(x$source$listOfCompounds, function(c) if ( ! 'PubChemCID' %in% names(c) || is.null(c$PubChemCID)) NA_character_ else as.character(c$PubChemCID), FUN.VALUE = '') if (length(molids) > 0 && length(molids) == length(molnames)) results <- rbind(results, data.frame(MSDB.TAG.MOLID = molids, MSDB.TAG.MOLNAMES = molnames, MSDB.TAG.MOLMASS = mass, MSDB.TAG.MZTHEO = mztheo, MSDB.TAG.COMP = comp, MSDB.TAG.ATTR = attr, MSDB.TAG.INCHI = inchi, MSDB.TAG.INCHIKEY = inchikey, MSDB.TAG.CHEBI = chebi, MSDB.TAG.HMDB = hmdb, MSDB.TAG.KEGG = kegg, MSDB.TAG.PUBCHEM = pubchem, stringsAsFactors = FALSE)) } @@ -292,17 +291,27 @@ url <- paste0('spectra/lcms/range-rt-min/', rt.low, '/', rt.high) params <- NULL if ( ! is.null(col)) - params <- c(columns = paste(col, collapse = ',')) + params <- c(columns = paste(col[['id']], collapse = ',')) # Run query rtspectra <- .self$.get.url(url = url, params = params) + # Get compound/molecule IDs - for (x in spectra) - rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = vapply(x$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = ''), - MSDB.TAG.COL = as.character(x$liquidChromatography$columnCode), - MSDB.TAG.COLRT = (as.numeric(x$RTmin) + as.numeric(x$RTmax)) / 2, - stringsAsFactors = FALSE)) + for (x in rtspectra) + if (all(c('listOfCompounds', 'liquidChromatography') %in% names(x))) { + molids <- vapply(x$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '') + if (length(molids) > 0) { + col <- if ('columnCode' %in% names(x$liquidChromatography) && ! is.null(x$liquidChromatography$columnCode)) as.character(x$liquidChromatography$columnCode) else NA_character_ + rtmin <- if ('RTmin' %in% names(x) && ! is.null(x$RTmin)) as.double(x$RTmin) else NA_real_ + rtmax <- if ('RTmax' %in% names(x) && ! is.null(x$RTmax)) as.double(x$RTmax) else NA_real_ + colrt <- (rtmin + rtmax) / 2 + rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = molids, + MSDB.TAG.COL = col, + MSDB.TAG.COLRT = colrt, + stringsAsFactors = FALSE)) + } + } } # Add retention times and column info diff -r abfba8eb1c8d -r 1ba222315fd5 lcmsmatching.xml --- a/lcmsmatching.xml Tue Mar 14 12:35:23 2017 -0400 +++ b/lcmsmatching.xml Thu Mar 16 05:05:55 2017 -0400 @@ -1,4 +1,4 @@ - + Annotation of MS peaks using matching on a spectra database. @@ -9,7 +9,7 @@ r-xml r-bitops r-rcurl - r-rjsonio + r-jsonlite diff -r abfba8eb1c8d -r 1ba222315fd5 msdb-common.R --- a/msdb-common.R Tue Mar 14 12:35:23 2017 -0400 +++ b/msdb-common.R Thu Mar 16 05:05:55 2017 -0400 @@ -46,6 +46,11 @@ MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN) + # Authorized rt units + MSDB.RTUNIT.SEC <- 's' + MSDB.RTUNIT.MIN <- 'min' + MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN) + # Default values MSDB.DFT.PREC <- list() MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+") diff -r abfba8eb1c8d -r 1ba222315fd5 search-mz --- a/search-mz Tue Mar 14 12:35:23 2017 -0400 +++ b/search-mz Thu Mar 16 05:05:55 2017 -0400 @@ -51,6 +51,8 @@ MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') +MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC +MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC DEFAULT.ARG.VALUES <- MSDB.DFT DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields()) @@ -181,6 +183,7 @@ 'rttolx', 'x', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), 'rttoly', 'y', 1, 'numeric', paste0('Tolerance on retention times. Unset by default.'), 'rtcol', 'c', 1, 'character', paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'), + 'rtunit', NA_character_, 1, 'character', paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), 'all-cols', NA_character_, 0, 'logical', 'Use all available chromatographic columns to match retention times.', 'check-cols', NA_character_, 0, 'logical', 'Check that the chromatographic column names specified with option -c really exist.', 'list-cols', NA_character_, 0, 'logical', 'List all chromatographic columns present in the database. Write list inside the file specified by -o option.', @@ -206,8 +209,9 @@ 'db-name', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', 'db-user', NA_character_, 1, 'character', 'User of the database. Used by the "4tabsql" database.', 'db-password', NA_character_, 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', - 'db-fields', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'), - 'db-ms-modes', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-fields', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".'), + 'db-ms-modes', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-rt-unit', NA_character_, 1, 'character', paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'), 'db-token', NA_character_, 1, 'character', 'Database token. Used by Peakforest database.', 'debug', NA_character_, 0, 'logical', 'Set debug mode.' ) @@ -542,14 +546,16 @@ stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file.")) # Set streams -input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']]) +input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']]) main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']])) peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']])) invisible(db$setInputStream(input.stream)) db$addOutputStreams(c(main.output, peaks.output)) -# Set M/Z tolerance unit +# Set database units db$setMzTolUnit(opt$mztolunit) +if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file') + db$setRtUnit(opt[['db-rt-unit']]) # Search database mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG