lcmsmatching: search-mz comparison

comparison search-mz @ 1:45e985cd8e9e draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8-dirty

author	prog
date	Tue, 31 Jan 2017 05:27:24 -0500
parents	3afe41d3e9e7
children	1ba222315fd5

comparison

equal deleted inserted replaced

-:3afe41d3e9e7
+:45e985cd8e9e
 source(file.path(dirname(script.path), 'strhlp.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'fshlp.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE)
+# Missing paste0() function in R 2.14.1
+if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15)
+	paste0 <- function(...) paste(..., sep = '')
 #############
 # CONSTANTS #
 #############
 PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE)
+USERAGENT <- 'search-mz ; pierrick.roger@gmail.com'
 # Authorized database types
 MSDB.XLS <- 'xls'
 MSDB.4TABSQL <- '4tabsql'
 MSDB.FILE <- 'file'
 MSDB.DFT[['mztolunit']] <- MSDB.DFT.MZTOLUNIT
 MSDB.DFT[['precursor-rt-tol']] <- 5
 MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP
 MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields())
 MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES)
-MSDB.DFT[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
-MSDB.DFT[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
 MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',')
 MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',')
+DEFAULT.ARG.VALUES <- MSDB.DFT
+DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
+DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
 ##############
 # PRINT HELP #
 ##############
 	# Parse retention time columns
 	if ( ! is.null(opt$rtcol))
 		opt$rtcol <- strsplit(opt$rtcol, ',')[[1]]
 	# Parse input column names
-	if ( ! is.null(opt[['input-col-names']])) {
+	if (is.null(opt[['input-col-names']])) {
+		opt[['input-col-names']] <- msdb.get.dft.input.fields()
+	}
+	else {
 		custcols <- split.kv.list(opt[['input-col-names']])
-		dftcols <- split.kv.list(MSDB.DFT[['input-col-names']])
+		dftcols <- msdb.get.dft.input.fields()
 		opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
 	}
 	# Parse output column names
-	if ( ! is.null(opt[['output-col-names']])) {
+	if (is.null(opt[['output-col-names']])) {
+		# By default keep input col names for output
+		opt[['output-col-names']] <- msdb.get.dft.output.fields()
+		input.cols <- names(opt[['input-col-names']])
+		output.cols <- names(opt[['output-col-names']])
+		opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols])
+	}
+	else {
 		custcols <- split.kv.list(opt[['output-col-names']])
-		dftcols <- split.kv.list(MSDB.DFT[['output-col-names']])
+		dftcols <- msdb.get.dft.output.fields()
 		opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
 	}
 	# Parse lists of precursors
 	if ( ! is.null(opt[['pos-prec']]))
 # PRINT DEFAULT ARGUMENT VALUES #
 #################################
 print.dft.arg.val <- function(opt) {
-	print.flags <- MSDB.DFT
+	print.flags <- DEFAULT.ARG.VALUES
 	names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '')
 	for (f in names(print.flags))
 		if ( ! is.null(opt[[f]])) {
 			cat(print.flags[[f]])
 			q(status = 0)
 make.getopt.spec.print.dft <- function() {
 	spec <- character()
-	for (f in names(MSDB.DFT))
+	for (f in names(DEFAULT.ARG.VALUES))
 		spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f))
 	return(spec)
 }
 		'no-main-table-in-html-output',  NA_character_,  0,  'logical',      'Do not display main table in HTML output.',
 		'precursor-match',  NA_character_,  0,  'logical',      'Remove peaks whose molecule precursor peak has not been matched. Unset by default.',
 		'precursor-rt-tol', NA_character_,  1,  'numeric',      paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'),
 		'pos-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'),
 		'neg-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'),
-		'input-col-names',  NA_character_,  1,  'character',    paste0('Set the input column names. Default is "', MSDB.DFT[['input-col-names']], '".'),
+		'input-col-names',  NA_character_,  1,  'character',    paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'),
-		'output-col-names', NA_character_,  1,  'character',    paste0('Set the output column names. Default is "', MSDB.DFT[['output-col-names']], '".'),
+		'output-col-names', NA_character_,  1,  'character',    paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'),
 		'molids-sep',       NA_character_,  1,  'character',    paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'),
 		'first-val',        NA_character_,  0,  'logical',      'Keep only the first value in multi-value fields. Unset by default.',
 		'excel2011comp',            NA_character_,  0,  'logical',      'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.',
 		'database',         'd',            1,  'character',    paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'),
 		'url',              NA_character_,  1,  'character',    'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.',
 		'cache-dir',        NA_character_,  1,  'character',    'Path to directory where to store cache files. Only used when database flag is set to "xls".',
-		'useragent',        NA_character_,  1,  'character',    'User agent. Used by the "Peakforest" database.',
 		'db-name',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
-		'db-user',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
+		'db-user',          NA_character_,  1,  'character',    'User of the database. Used by the "4tabsql" database.',
-		'db-password',      NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
+		'db-password',      NA_character_,  1,  'character',    'Password of the database user. Used by the "4tabsql" database.',
 		'db-fields',        NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'),
 		'db-ms-modes',      NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'),
+		'db-token',         NA_character_,  1,  'character',    'Database token. Used by Peakforest database.',
 		'debug',            NA_character_,  0,  'logical',      'Set debug mode.'
 		)
 	spec <- c(spec, make.getopt.spec.print.dft())
 	opt <- set.dft.arg.val(opt) # Set default values
 	opt <- parse.arg.val(opt) # Parse list values
 	# Check values
 	error <- .check.db.conn.opts(opt)
-	if (is.null(opt[['output-file']])) {
+	if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) {
 		warning("You must set a path for the output file.")
 		error <- TRUE
 	}
 	if (is.null(opt[['list-cols']])) {
 		if (is.null(opt[['input-file']])) {
 			}
 		}
 		if (opt$database == MSDB.PEAKFOREST) {
 			if (is.null(opt$url)) {
 				warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.")
-				error <- TRUE
-			}
-			if (is.null(opt$useragent)) {
-				warning("When using PeakForest database, you must specify a user agent with option --useragent.")
 				error <- TRUE
 			}
 		}
 		return(error)
 			precursors[[MSDB.TAG.POS]] <- opt[['pos-prec']]
 			precursors[[MSDB.TAG.NEG]] <- opt[['neg-prec']]
 		}
 		db <- switch(opt$database,
-		             peakforest = MsPeakForestDb$new(url = opt$url, useragent = opt$useragent),
+		             peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]),
-		             xls = MsXlsDb(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
+		             xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
-		             '4tabsql' = Ms4TabSqlDb(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
+		             '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
-		             file = MsFileDb(file = opt$url),
+		             file = MsFileDb$new(file = opt$url),
 		             NULL)
 		db$setPrecursors(precursors)
 		if (db$areDbFieldsSettable())
 			db$setDbFields(opt[['db-fields']])
 		if (db$areDbMsModesSettable())
 ###############
 output.html <- function(db, main, peaks, file, opt, output.fields) {
 	# Replace public database IDs by URLs
-	if ( ! is.null(peaks))
+	if ( ! is.null(peaks) || ! is.null(main)) {
+		# Conversion from extdb id field to extdb name
+		extdb2classdb = list()
+		extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG
+		extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB
+		extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI
+		extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP
+		# Loop on all dbs
 		for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) {
 			field <- output.fields[[extdb]]
-			if (field %in% colnames(peaks))
+			if ( ! is.null(peaks) && field %in% colnames(peaks))
-				peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('<a href="', get.entry.url(class = extdb, accession = id, content.type = RBIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
+				peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
-		}
+			if ( ! is.null(main) && field %in% colnames(main))
+				main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '')
+		}
+	}
 	# Write HTML
 	html <- HtmlWriter(file = file)
 	html$writeBegTag('html')
 	html$writeBegTag('header')
+	html$writeTag('meta', attr = c(charset = "UTF-8"))
 	html$writeTag('title', text = "LC/MS matching results")
 	html$writeBegTag('style')
 	html$write('table, th, td { border-collapse: collapse; }')
 	html$write('table, th { border: 1px solid black; }')
 	html$write('td { border-left: 1px solid black; border-right: 1px solid black; }')
 	html$writeTag('h1', text = "LC/MS matching")
 	# Write parameters
 	html$writeTag('h2', text = "Parameters")
 	html$writeBegTag('ul')
-	html$writeTag('li', paste0("Mode = ", opt$mode, "."))
+	html$writeTag('li', text = paste0("Mode = ", opt$mode, "."))
-	html$writeTag('li', paste0("M/Z precision = ", opt$mzprec, "."))
+	html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, "."))
-	html$writeTag('li', paste0("M/Z shift = ", opt$mzshift, "."))
+	html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, "."))
-	html$writeTag('li', paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
+	html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
 	if ( ! is.null(opt[['precursor-match']])) {
-		html$writeTag('li', paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
+		html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
-		html$writeTag('li', paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
+		html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
 	}
 	if ( ! is.null(opt$rtcol)) {
-		html$writeTag('li', paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
+		html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
-		html$writeTag('li', paste0("RTX = ", opt$rttolx, "."))
+		html$writeTag('li', text = paste0("RTX = ", opt$rttolx, "."))
-		html$writeTag('li', paste0("RTY = ", opt$rttoly, "."))
+		html$writeTag('li', text = paste0("RTY = ", opt$rttoly, "."))
 		if ( ! is.null(opt[['precursor-match']]))
-			html$writeTag('li', paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
+			html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
 	}
 	html$writeEndTag('ul')
 	# Write results
 	html$writeTag('h2', text = "Results")
 db <- .load.db(opt)
 # Print columns
 if ( ! is.null(opt[['list-cols']])) {
 	cols <- db$getChromCol()
-	df.write.tsv(cols, file = opt[['output-file']])
+	df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']])
 	q(status = 0)
 }
 # Read input
 if ( ! is.null(opt[['input-file']]) && ! file.exists(opt[['input-file']]))
 	stop(paste0("Input file \"", opt[['input-file']], "\" does not exist."))
 if (file.info(opt[['input-file']])$size > 0) {
 	# Load file into data frame
-	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t")
+	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE)
 	# Convert each column that is identified by a number into a name
 	for (field in names(opt[['input-col-names']])) {
 		if ( ! opt[['input-col-names']][[field]] %in% colnames(input) && length(grep('^[0-9]+$', opt[['input-col-names']][[field]])) > 0) {
 			col.index <- as.integer(opt[['input-col-names']][[field]])
 if ( ! is.null(opt[['all-cols']]))
 	opt$rtcol <- db$getChromCol()
 # Check chrom columns
 if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) {
-	dbcols <- db$getChromCol()
+	dbcols <- db$getChromCol()[['id']]
 	unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols]
 	if (length(unknown.cols) > 0) {
 		stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n")))
 	}
 }
 # Search database
 mode <- if (opt$mode == POS_MODE) MSDB.TAG.POS else MSDB.TAG.NEG
 db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']])
 # Write output
+main.output$moveColumnsToBeginning(colnames(input))
+peaks.output$moveColumnsToBeginning(colnames(input))
 # TODO Create a class MsDbOutputCsvFileStream
 df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE)
 if ( ! is.null(opt[['peak-output-file']]))
 	# TODO Create a class MsDbOutputCsvFileStream
 	df.write.tsv(peaks.output$getDataFrame(), file = opt[['peak-output-file']], row.names = FALSE)

Mercurial > repos > prog > lcmsmatching

comparison search-mz @ 1:45e985cd8e9e draft