Mercurial > repos > prog > lcmsmatching

--- a/MsDb.R	Tue Mar 14 12:35:23 2017 -0400
+++ b/MsDb.R	Thu Mar 16 05:05:55 2017 -0400
@@ -9,21 +9,22 @@
 	# CLASS DECLARATION #
 	#####################

-	MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character"))
+	MsDb <- setRefClass("MsDb", fields = list(.observers = "ANY", .prec = "list", .output.streams = "ANY", .input.stream = "ANY", .mz.tol.unit = "character", .rt.unit = "character"))

 	###############
 	# CONSTRUCTOR #
 	###############

 	MsDb$methods( initialize = function(...) {
+
+		callSuper(...)

 		.observers <<- NULL
 		.output.streams <<- NULL
 		.input.stream <<- NULL
 		.prec <<- MSDB.DFT.PREC
 		.mz.tol.unit <<- MSDB.DFT.MZTOLUNIT
-
-		callSuper(...)
+		.rt.unit <<- MSDB.RTUNIT.SEC
 	})

 	####################
@@ -126,6 +127,10 @@
 		stop("Method setDbMsModes() not implemented in concrete class.")
 	})

+	###################
+	# SET MZ TOL UNIT #
+	###################
+
 	MsDb$methods( setMzTolUnit = function(mztolunit) {

 		if ( ! mztolunit %in% MSDB.MZTOLUNIT.VALS)
@@ -134,6 +139,18 @@
 		.mz.tol.unit <<- mztolunit
 	})

+	###############
+	# SET RT UNIT #
+	###############
+
+	MsDb$methods( setRtUnit = function(unit) {
+
+		if ( ! unit %in% MSDB.RTUNIT.VALS)
+			stop(paste0("RT unit must be one of: ", paste(MSDB.RTUNIT.VALS, collapse = ', '), "."))
+
+		.rt.unit <<- unit
+	})
+
 	####################
 	# HANDLE COMPOUNDS #
 	####################
--- a/MsDbInputDataFrameStream.R	Tue Mar 14 12:35:23 2017 -0400
+++ b/MsDbInputDataFrameStream.R	Thu Mar 16 05:05:55 2017 -0400
@@ -7,18 +7,19 @@
 	# CLASS DECLARATION #
 	#####################

-	MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer"))
+	MsDbInputDataFrameStream <- setRefClass("MsDbInputDataFrameStream", contains = 'MsDbInputStream', fields = list( .df = "ANY", .i = "integer", .rtunit = 'character'))

 	###############
 	# CONSTRUCTOR #
 	###############

-	MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), ...) {
+	MsDbInputDataFrameStream$methods( initialize = function(df = data.frame(), input.fields = msdb.get.dft.input.fields(), rtunit = MSDB.RTUNIT.SEC, ...) {
+
+		callSuper(input.fields = input.fields, ...)

 		.df <<- df
 		.i <<- 0L
-
-		callSuper(input.fields = input.fields, ...)
+		.rtunit <<- rtunit
 	})

 	##########
@@ -39,10 +40,15 @@

 	MsDbInputDataFrameStream$methods( getRt = function() {

-		if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]]))
-			return(.self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]])
+		rt <- NULL

-		return(NULL)
+		if (.self$.i > 0 && .self$.i <= nrow(.self$.df) && ! is.null(.self$.input.fields[[MSDB.TAG.RT]])) {
+			rt <- .self$.df[.self$.i, .self$.input.fields[[MSDB.TAG.RT]]]
+			if (.self$.rtunit == MSDB.RTUNIT.MIN)
+				rt <- rt * 60
+		}
+
+		return(rt)
 	})

 	###########
--- a/MsFileDb.R	Tue Mar 14 12:35:23 2017 -0400
+++ b/MsFileDb.R	Thu Mar 16 05:05:55 2017 -0400
@@ -395,8 +395,10 @@
 			db <- db[db[[MSDB.TAG.COL]] %in% col,]

 		# Filter on retention time
-		if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high))
-			db <- db[db[[MSDB.TAG.COLRT]] >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ]
+		if ( ! is.null(rt.low) && ! is.na(rt.low) && ! is.null(rt.high) && ! is.na(rt.high)) {
+			scale <- if (.self$.rt.unit == MSDB.RTUNIT.MIN) 60 else 1
+			db <- db[db[[MSDB.TAG.COLRT]] * scale >= rt.low & db[[MSDB.TAG.COLRT]] <= rt.high, ]
+		}

 		# Remove retention times and column information
 		if (is.null(col) || is.na(col) || is.null(rt.low) || is.na(rt.low) || is.null(rt.high) || is.na(rt.high)) {
@@ -487,6 +489,9 @@
 			rt[col] <- list(colrts)
 		}

+		if (.self$.rt.unit == MSDB.RTUNIT.MIN)
+			rt <- 60 * rt
+
 		return(rt)
 	})
--- a/MsPeakForestDb.R	Tue Mar 14 12:35:23 2017 -0400
+++ b/MsPeakForestDb.R	Thu Mar 16 05:05:55 2017 -0400
@@ -16,6 +16,8 @@

 	MsPeakForestDb$methods( initialize = function(url = NA_character_, useragent = NA_character_, token = NA_character_, ...) {

+		callSuper(...)
+
 		# Check URL
 		if (is.null(url) || is.na(url))
 		    stop("No URL defined for new MsPeakForestDb instance.")
@@ -26,8 +28,7 @@
 		.url.scheduler <<- UrlRequestScheduler$new(n = 3, useragent = useragent)
 		.self$.url.scheduler$setVerbose(1L)
 		.token <<- token
-
-		callSuper(...)
+		.rt.unit <<- MSDB.RTUNIT.MIN
 	})

 	###########
@@ -53,11 +54,9 @@

 		if (ret.type == 'json') {

-			library(RJSONIO)
+			res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE)

-			res <- fromJSON(content, nullValue = NA)
-
-			if (class(res) == 'list' && 'success' %in% names(res) && res$success == FALSE) {
+			if (is.null(res)) {
 				param.str <- if (is.null(params)) '' else paste('?', vapply(names(params), function(p) paste(p, params[[p]], sep = '='), FUN.VALUE = ''), collapse = '&', sep = '')
 				stop(paste0("Failed to run web service. URL was \"", url, param.str, "\"."))
 			}
@@ -66,8 +65,7 @@
 				if (grepl('^[0-9]+$', content, perl = TRUE))
 					res <- as.integer(content)
 				else {
-					library(RJSONIO)
-					res <- fromJSON(content, nullValue = NA)
+					res <- jsonlite::fromJSON(content, simplifyDataFrame = FALSE)
 				}
 			}
 		}
@@ -141,6 +139,7 @@
 			for (s in spectra)
 				if (is.na(col) || s$liquidChromatography$columnCode %in% col) {
 					ret.time <- (s$RTmin + s$RTmax) / 2
+					ret.time <- ret.time * 60 # Retention time are in minutes in Peakforest, but we want them in seconds
 					c <- s$liquidChromatography$columnCode
 					if (c %in% names(rt)) {
 						if ( ! ret.time %in% rt[[c]])
@@ -262,21 +261,21 @@
 		results <- data.frame(MSDB.TAG.MOLID = character(), MSDB.TAG.MOLNAMES = character(), MSDB.TAG.MOLMASS = numeric(), MSDB.TAG.MZTHEO = numeric(), MSDB.TAG.COMP = character(), MSDB.TAG.ATTR = character(), MSDB.TAG.INCHI = character(), MSDB.TAG.INCHIKEY = character(), MSDB.TAG.CHEBI = character(), MSDB.TAG.HMDB = character(), MSDB.TAG.KEGG = character(), MSDB.TAG.PUBCHEM = character())
 		for (x in spectra) {
 			if ('source' %in% names(x) && is.list(x$source))
-				mztheo <- if ('theoricalMass' %in% names(x)) as.numeric(x$theoricalMass) else NA_real_
-				comp <- if ('composition' %in% names(x)) x$composition else NA_character_
-				attr <- if ('attribution' %in% names(x)) x$attribution else NA_character_
+				mztheo <- if ('mz' %in% names(x) && ! is.null(x$mz)) as.numeric(x$mz) else NA_real_
+				comp <- if ('composition' %in% names(x) && ! is.null(x$composition)) x$composition else NA_character_
+				attr <- if ('attribution' %in% names(x) && ! is.null(x$attribution)) x$attribution else NA_character_
 				if ('listOfCompounds' %in% names(x$source)) {
-					molids <- vapply(x$source$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = '')
-					molnames <- vapply(x$source$listOfCompounds, function(c) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP), FUN.VALUE = '')
-					mass <- vapply(x$source$listOfCompounds, function(c) as.character(c$averageMass), FUN.VALUE = '')
-					inchi <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChI), FUN.VALUE = '')
-					inchikey <- vapply(x$source$listOfCompounds, function(c) as.character(c$inChIKey), FUN.VALUE = '')
-					chebi <- vapply(x$source$listOfCompounds, function(c) as.character(c$ChEBI), FUN.VALUE = '')
+					molids <- vapply(x$source$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '')
+					molnames <- vapply(x$source$listOfCompounds, function(c) if ('names' %in% names(c) && ! is.null(c$names)) paste(c$names, collapse = MSDB.MULTIVAL.FIELD.SEP) else NA_character_, FUN.VALUE = '')
+					mass <- vapply(x$source$listOfCompounds, function(c) if ( ! 'averageMass' %in% names(c) || is.null(c$averageMass)) NA_real_ else as.double(c$averageMass), FUN.VALUE = 0.0)
+					inchi <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChI' %in% names(c) || is.null(c$inChI)) NA_character_ else as.character(c$inChI), FUN.VALUE = '')
+					inchikey <- vapply(x$source$listOfCompounds, function(c) if ( ! 'inChIKey' %in% names(c) || is.null(c$inChIKey)) NA_character_ else as.character(c$inChIKey), FUN.VALUE = '')
+					chebi <- vapply(x$source$listOfCompounds, function(c) if ('ChEBI'  %in% names(c) && ! is.null(c$ChEBI)) as.character(c$ChEBI) else NA_character_, FUN.VALUE = '')
 					chebi[chebi == 'CHEBI:null'] <- NA_character_
-					hmdb <- vapply(x$source$listOfCompounds, function(c) as.character(c$HMDB), FUN.VALUE = '')
+					hmdb <- vapply(x$source$listOfCompounds, function(c) if ('HMDB' %in% names(c) && ! is.null(c$HMDB)) as.character(c$HMDB) else NA_character_, FUN.VALUE = '')
 					hmdb[hmdb == 'HMDBnull'] <- NA_character_
-					kegg <- vapply(x$source$listOfCompounds, function(c) as.character(c$KEGG), FUN.VALUE = '')
-					pubchem <- vapply(x$source$listOfCompounds, function(c) as.character(c$PubChemCID), FUN.VALUE = '')
+					kegg <- vapply(x$source$listOfCompounds, function(c) if ( ! 'KEGG' %in% names(c) || is.null(c$KEGG)) NA_character_ else as.character(c$KEGG), FUN.VALUE = '')
+					pubchem <- vapply(x$source$listOfCompounds, function(c) if ( ! 'PubChemCID' %in% names(c) || is.null(c$PubChemCID)) NA_character_ else as.character(c$PubChemCID), FUN.VALUE = '')
 					if (length(molids) > 0 && length(molids) == length(molnames))
 						results <- rbind(results, data.frame(MSDB.TAG.MOLID = molids, MSDB.TAG.MOLNAMES = molnames, MSDB.TAG.MOLMASS = mass, MSDB.TAG.MZTHEO = mztheo, MSDB.TAG.COMP = comp, MSDB.TAG.ATTR = attr, MSDB.TAG.INCHI = inchi, MSDB.TAG.INCHIKEY = inchikey, MSDB.TAG.CHEBI = chebi, MSDB.TAG.HMDB = hmdb, MSDB.TAG.KEGG = kegg, MSDB.TAG.PUBCHEM = pubchem, stringsAsFactors = FALSE))
 				}
@@ -292,17 +291,27 @@
 				url <- paste0('spectra/lcms/range-rt-min/', rt.low, '/', rt.high)
 				params <- NULL
 				if ( ! is.null(col))
-					params <- c(columns = paste(col, collapse = ','))
+					params <- c(columns = paste(col[['id']], collapse = ','))

 				# Run query
 				rtspectra <- .self$.get.url(url = url, params = params)

+
 				# Get compound/molecule IDs
-				for (x in spectra)
-					rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = vapply(x$listOfCompounds, function(c) as.character(c$id), FUN.VALUE = ''),
-				                                   	   MSDB.TAG.COL = as.character(x$liquidChromatography$columnCode),
-				                                   	   MSDB.TAG.COLRT = (as.numeric(x$RTmin) + as.numeric(x$RTmax)) / 2,
-					                                   	   stringsAsFactors = FALSE))
+				for (x in rtspectra)
+					if (all(c('listOfCompounds', 'liquidChromatography') %in% names(x))) {
+						molids <- vapply(x$listOfCompounds, function(c) if ('id' %in% names(c) && ! is.null(c$id)) as.character(c$id) else NA_character_, FUN.VALUE = '')
+						if (length(molids) > 0) {
+							col <- if ('columnCode' %in% names(x$liquidChromatography) && ! is.null(x$liquidChromatography$columnCode)) as.character(x$liquidChromatography$columnCode) else NA_character_
+							rtmin <- if ('RTmin' %in% names(x) && ! is.null(x$RTmin)) as.double(x$RTmin) else NA_real_
+							rtmax <- if ('RTmax' %in% names(x) && ! is.null(x$RTmax)) as.double(x$RTmax) else NA_real_
+							colrt <- (rtmin + rtmax) / 2
+							rt.res <- rbind(rt.res, data.frame(MSDB.TAG.MOLID = molids,
+				                                   	   	   	   MSDB.TAG.COL = col,
+				                                   	   	   	   MSDB.TAG.COLRT = colrt,
+					                                   	   	   stringsAsFactors = FALSE))
+						}
+					}
 			}

 			# Add retention times and column info
--- a/lcmsmatching.xml	Tue Mar 14 12:35:23 2017 -0400
+++ b/lcmsmatching.xml	Thu Mar 16 05:05:55 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="lcmsmatching" name="LC/MS matching" version="3.2.0" profile="16.01">
+<tool id="lcmsmatching" name="LC/MS matching" version="3.2.1" profile="16.01">

 	<description>Annotation of MS peaks using matching on a spectra database.</description>

@@ -9,7 +9,7 @@
 		<requirement type="package" version="3.98">r-xml</requirement>
 		<requirement type="package" version="1.0_6">r-bitops</requirement>
 		<requirement type="package" version="1.95">r-rcurl</requirement>
-		<requirement type="package" version="1.3">r-rjsonio</requirement>
+		<requirement type="package" version="1.3">r-jsonlite</requirement>
 	</requirements>

 	<code file="list-chrom-cols.py"/>
--- a/msdb-common.R	Tue Mar 14 12:35:23 2017 -0400
+++ b/msdb-common.R	Thu Mar 16 05:05:55 2017 -0400
@@ -46,6 +46,11 @@
 	MSDB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio
 	MSDB.MZTOLUNIT.VALS <- c(MSDB.MZTOLUNIT.PPM, MSDB.MZTOLUNIT.PLAIN)

+	# Authorized rt units
+	MSDB.RTUNIT.SEC <- 's'
+	MSDB.RTUNIT.MIN <- 'min'
+	MSDB.RTUNIT.VALS <- c(MSDB.RTUNIT.SEC ,MSDB.RTUNIT.MIN)
+
 	# Default values
 	MSDB.DFT.PREC <- list()
 	MSDB.DFT.PREC[[MSDB.TAG.POS]] <- c("[(M+H)]+", "[M+H]+", "[(M+Na)]+", "[M+Na]+", "[(M+K)]+", "[M+K]+")
--- a/search-mz	Tue Mar 14 12:35:23 2017 -0400
+++ b/search-mz	Thu Mar 16 05:05:55 2017 -0400
@@ -51,6 +51,8 @@
 MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES)
 MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',')
 MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',')
+MSDB.DFT[['db-rt-unit']] <- MSDB.RTUNIT.SEC
+MSDB.DFT[['rtunit']] <- MSDB.RTUNIT.SEC
 DEFAULT.ARG.VALUES <- MSDB.DFT
 DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
 DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
@@ -181,6 +183,7 @@
 		'rttolx',           'x',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
 		'rttoly',           'y',            1,  'numeric',      paste0('Tolerance on retention times. Unset by default.'),
 		'rtcol',            'c',            1,  'character',    paste0('Chromatographic column to use. Unset by default. If set, use the corresponding column to filter on retention times, if retention times are provided.'),
+		'rtunit',           NA_character_,  1,  'character',    paste0('Retention time unit for the input file. Default is ', MSDB.DFT$rtunit, '. Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'),
 		'all-cols',         NA_character_,  0,  'logical',      'Use all available chromatographic columns to match retention times.',
 		'check-cols',       NA_character_,  0,  'logical',      'Check that the chromatographic column names specified with option -c really exist.',
 		'list-cols',        NA_character_,  0,  'logical',      'List all chromatographic columns present in the database. Write list inside the file specified by -o option.',
@@ -206,8 +209,9 @@
 		'db-name',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
 		'db-user',          NA_character_,  1,  'character',    'User of the database. Used by the "4tabsql" database.',
 		'db-password',      NA_character_,  1,  'character',    'Password of the database user. Used by the "4tabsql" database.',
-		'db-fields',        NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'),
-		'db-ms-modes',      NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'),
+		'db-fields',        NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database. Default is "', MSDB.DFT[['db-fields']], '".'),
+		'db-ms-modes',      NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database. Default is "', MSDB.DFT[['db-ms-modes']], '".'),
+		'db-rt-unit',       NA_character_,  1,  'character',    paste0('Retention time unit for the database, used in the single file database. Default is "', MSDB.DFT[['db-rt-unit']], '". Allowed values are:', paste(MSDB.RTUNIT.VALS, collapse = ", "), '.'),
 		'db-token',         NA_character_,  1,  'character',    'Database token. Used by Peakforest database.',
 		'debug',            NA_character_,  0,  'logical',      'Set debug mode.'
 		)
@@ -542,14 +546,16 @@
 	stop(paste0("You are running an MZ/RT match run on your input data, but no retention time column named '", opt[['input-col-names']][['rt']],"' can be found inside your input file."))

 # Set streams
-input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']])
+input.stream <- MsDbInputDataFrameStream$new(df = input, input.fields = opt[['input-col-names']], rtunit = opt[['rtunit']])
 main.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], one.line = ! is.null(opt[['same-rows']]), match.sep = opt[['molids-sep']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]))
 peaks.output <- MsDbOutputDataFrameStream$new(keep.unused = ! is.null(opt[['same-cols']]), output.fields = opt[['output-col-names']], first.val = ! is.null(opt[['first-val']]), ascii = ! is.null(opt[['excel2011comp']]), nogreek = ! is.null(opt[['excel2011comp']]), noapostrophe = ! is.null(opt[['excel2011comp']]), noplusminus = ! is.null(opt[['excel2011comp']]))
 invisible(db$setInputStream(input.stream))
 db$addOutputStreams(c(main.output, peaks.output))

-# Set M/Z tolerance unit
+# Set database units
 db$setMzTolUnit(opt$mztolunit)
+if ( ! is.null(opt[['db-rt-unit']]) && opt$database == 'file')
+	db$setRtUnit(opt[['db-rt-unit']])

 # Search database
 mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG