diff search-mz @ 1:45e985cd8e9e draft

planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8-dirty
author prog
date Tue, 31 Jan 2017 05:27:24 -0500
parents 3afe41d3e9e7
children 1ba222315fd5
--- a/search-mz	Mon Jul 11 09:12:03 2016 -0400
+++ b/search-mz	Tue Jan 31 05:27:24 2017 -0500
@@ -17,11 +17,16 @@
 source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE)
 source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE)
 
+# Missing paste0() function in R 2.14.1
+if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15)
+	paste0 <- function(...) paste(..., sep = '')
+
 #############
 # CONSTANTS #
 #############
 
 PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE)
+USERAGENT <- 'search-mz ; pierrick.roger@gmail.com'
 
 # Authorized database types
 MSDB.XLS <- 'xls'
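
The paste0() shim above only matters on R < 2.15; on any version the fallback yields the same result as the built-in. A quick illustration (not part of the commit):

	# With the fallback defined, both forms are equivalent:
	paste0('MSDB.', 'TAG')           # "MSDB.TAG"
	paste('MSDB.', 'TAG', sep = '')  # same result on any R version
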
@@ -44,10 +49,11 @@
 MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP
 MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields())
 MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES)
-MSDB.DFT[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
-MSDB.DFT[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
 MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',')
 MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',')
+DEFAULT.ARG.VALUES <- MSDB.DFT
+DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields())
+DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields())
 
 ##############
 # PRINT HELP #
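
The new DEFAULT.ARG.VALUES table differs from MSDB.DFT only by the two column-name entries, stored in the comma-separated key=value form produced by concat.kv.list(). That helper and its counterpart split.kv.list() live in the sourced msdb scripts, not in this diff; a hypothetical sketch of the format they presumably round-trip:

	# Hypothetical sketch of the key/value string format assumed here
	# (the real concat.kv.list()/split.kv.list() are defined elsewhere).
	concat.kv.list.sketch <- function(x)
		paste(names(x), unlist(x), sep = '=', collapse = ',')
	split.kv.list.sketch <- function(s) {
		pairs <- strsplit(strsplit(s, ',')[[1]], '=')
		setNames(vapply(pairs, `[`, FUN.VALUE = '', 2),
		         vapply(pairs, `[`, FUN.VALUE = '', 1))
	}
	concat.kv.list.sketch(list(mz = 'mzmed', rt = 'rtmed'))  # "mz=mzmed,rt=rtmed"
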
@@ -103,16 +109,26 @@
 		opt$rtcol <- strsplit(opt$rtcol, ',')[[1]]
 
 	# Parse input column names
-	if ( ! is.null(opt[['input-col-names']])) {
+	if (is.null(opt[['input-col-names']])) {
+		opt[['input-col-names']] <- msdb.get.dft.input.fields()
+	}
+	else {
 		custcols <- split.kv.list(opt[['input-col-names']])
-		dftcols <- split.kv.list(MSDB.DFT[['input-col-names']])
+		dftcols <- msdb.get.dft.input.fields()
 		opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) 
 	}
 
 	# Parse output column names
-	if ( ! is.null(opt[['output-col-names']])) {
+	if (is.null(opt[['output-col-names']])) {
+		# By default keep input col names for output
+		opt[['output-col-names']] <- msdb.get.dft.output.fields()
+		input.cols <- names(opt[['input-col-names']])
+		output.cols <- names(opt[['output-col-names']])
+		opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols])
+	}
+	else {
 		custcols <- split.kv.list(opt[['output-col-names']])
-		dftcols <- split.kv.list(MSDB.DFT[['output-col-names']])
+		dftcols <- msdb.get.dft.output.fields()
 		opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) 
 	}
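
Both branches merge the user-supplied key/value pairs with the defaults so that custom entries win and defaults only fill in the remaining keys. A toy illustration of that merge (invented column names):

	# Toy merge: custom mappings override, defaults fill the gaps.
	custcols <- list(mz = 'mass_to_charge')
	dftcols  <- list(mz = 'mz', rt = 'rt')
	c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)])
	# => list(mz = 'mass_to_charge', rt = 'rt')
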
 
@@ -131,7 +147,7 @@
 
 print.dft.arg.val <- function(opt) {
 
-	print.flags <- MSDB.DFT
+	print.flags <- DEFAULT.ARG.VALUES
 	names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '')
 	for (f in names(print.flags))
 		if ( ! is.null(opt[[f]])) {
@@ -144,7 +160,7 @@
 
 	spec <- character()
 
-	for (f in names(MSDB.DFT))
+	for (f in names(DEFAULT.ARG.VALUES))
 		spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f))
 
 	return(spec)
@@ -179,20 +195,20 @@
 		'precursor-rt-tol', NA_character_,  1,  'numeric',      paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'),
 		'pos-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'),
 		'neg-prec',         NA_character_,  1,  'character',    paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'),
-		'input-col-names',  NA_character_,  1,  'character',    paste0('Set the input column names. Default is "', MSDB.DFT[['input-col-names']], '".'),
-		'output-col-names', NA_character_,  1,  'character',    paste0('Set the output column names. Default is "', MSDB.DFT[['output-col-names']], '".'),
+		'input-col-names',  NA_character_,  1,  'character',    paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'),
+		'output-col-names', NA_character_,  1,  'character',    paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'),
 		'molids-sep',       NA_character_,  1,  'character',    paste0('Set character separator used when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'),
 		'first-val',        NA_character_,  0,  'logical',      'Keep only the first value in multi-value fields. Unset by default.',
 		'excel2011comp',            NA_character_,  0,  'logical',      'Excel 2011 compatibility mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are replaced with underscore.',
 		'database',         'd',            1,  'character',    paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'),
 		'url',              NA_character_,  1,  'character',    'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.',
 		'cache-dir',        NA_character_,  1,  'character',    'Path to directory where to store cache files. Only used when database flag is set to "xls".',
-		'useragent',        NA_character_,  1,  'character',    'User agent. Used by the "Peakforest" database.',
 		'db-name',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
-		'db-user',          NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
-		'db-password',      NA_character_,  1,  'character',    'Name of the database. Used by the "4tabsql" database.',
+		'db-user',          NA_character_,  1,  'character',    'User of the database. Used by the "4tabsql" database.',
+		'db-password',      NA_character_,  1,  'character',    'Password of the database user. Used by the "4tabsql" database.',
 		'db-fields',        NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'),
 		'db-ms-modes',      NA_character_,  1,  'character',    paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'),
+		'db-token',         NA_character_,  1,  'character',    'Database token. Used by Peakforest database.',
 		'debug',            NA_character_,  0,  'logical',      'Set debug mode.'
 		)
 
@@ -224,7 +240,7 @@
 
 	# Check values
 	error <- .check.db.conn.opts(opt)
-	if (is.null(opt[['output-file']])) {
+	if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) {
 		warning("You must set a path for the output file.")
 		error <- TRUE
 	}
@@ -327,10 +343,6 @@
 				warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.")
 				error <- TRUE
 			}
-			if (is.null(opt$useragent)) {
-				warning("When using PeakForest database, you must specify a user agent with option --useragent.")
-				error <- TRUE
-			}
 		}
 	
 		return(error)
@@ -363,10 +375,10 @@
 		}
 
 		db <- switch(opt$database,
-		             peakforest = MsPeakForestDb$new(url = opt$url, useragent = opt$useragent),
-		             xls = MsXlsDb(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
-		             '4tabsql' = Ms4TabSqlDb(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
-		             file = MsFileDb(file = opt$url),
+		             peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]),
+		             xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]),
+		             '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]),
+		             file = MsFileDb$new(file = opt$url),
 		             NULL)
 		db$setPrecursors(precursors)
 		if (db$areDbFieldsSettable())
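
The constructors now go through $new(), which is the Reference Class (R5) convention: the name bound by setRefClass() is a generator object, and instances come from its $new() method. A toy sketch of that pattern (class, field and method invented for illustration; the Ms*Db classes are defined elsewhere in the repository):

	# Toy Reference Class showing the generator$new() calling convention.
	ToyDb <- setRefClass('ToyDb',
		fields = list(url = 'character'),
		methods = list(
			getChromCol = function() data.frame(id = c('col12', 'col18'))
		)
	)
	db <- ToyDb$new(url = 'file:///tmp/db.tsv')  # note $new(), not ToyDb(...)
	db$getChromCol()
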
@@ -385,17 +397,29 @@
 output.html <- function(db, main, peaks, file, opt, output.fields) {
 
 	# Replace public database IDs by URLs
-	if ( ! is.null(peaks))
+	if ( ! is.null(peaks) || ! is.null(main)) {
+		# Conversion from extdb id field to extdb name
+		extdb2classdb = list()
+		extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG
+		extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB
+		extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI
+		extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP
+
+		# Loop on all dbs
 		for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) {
 			field <- output.fields[[extdb]]
-			if (field %in% colnames(peaks))
-				peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('<a href="', get.entry.url(class = extdb, accession = id, content.type = RBIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
+			if ( ! is.null(peaks) && field %in% colnames(peaks))
+				peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '')
+			if ( ! is.null(main) && field %in% colnames(main))
+				main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '')
 		}
+	}
 
 	# Write HTML
 	html <- HtmlWriter(file = file)
 	html$writeBegTag('html')
 	html$writeBegTag('header')
+	html$writeTag('meta', attr = c(charset = "UTF-8"))
 	html$writeTag('title', text = "LC/MS matching results")
 	html$writeBegTag('style')
 	html$write('table, th, td { border-collapse: collapse; }')
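
The vapply call that links multi-ID cells is fairly dense; an equivalent expanded sketch of the same transformation, with a toy URL template standing in for get.entry.url() and '|' as the ID separator:

	# Expanded sketch of the multi-ID linking above (toy URL template).
	link.ids <- function(ids, sep = '|') {
		if (is.na(ids) || nchar(ids) == 0) return('')
		one.link <- function(id)
			paste0('<a href="https://example.org/entry/', id, '">', id, '</a>')
		paste(vapply(strsplit(ids, sep, fixed = TRUE)[[1]], one.link, FUN.VALUE = ''),
		      collapse = sep)
	}
	link.ids('C00031|C00221')  # two <a> tags joined by '|'
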
@@ -413,20 +437,20 @@
 	# Write parameters
 	html$writeTag('h2', text = "Parameters")
 	html$writeBegTag('ul')
-	html$writeTag('li', paste0("Mode = ", opt$mode, "."))
-	html$writeTag('li', paste0("M/Z precision = ", opt$mzprec, "."))
-	html$writeTag('li', paste0("M/Z shift = ", opt$mzshift, "."))
-	html$writeTag('li', paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
+	html$writeTag('li', text = paste0("Mode = ", opt$mode, "."))
+	html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, "."))
+	html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, "."))
+	html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), "."))
 	if ( ! is.null(opt[['precursor-match']])) {
-		html$writeTag('li', paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
-		html$writeTag('li', paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
+		html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), "."))
+		html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), "."))
 	}
 	if ( ! is.null(opt$rtcol)) {
-		html$writeTag('li', paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
-		html$writeTag('li', paste0("RTX = ", opt$rttolx, "."))
-		html$writeTag('li', paste0("RTY = ", opt$rttoly, "."))
+		html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), "."))
+		html$writeTag('li', text = paste0("RTX = ", opt$rttolx, "."))
+		html$writeTag('li', text = paste0("RTY = ", opt$rttoly, "."))
 		if ( ! is.null(opt[['precursor-match']]))
-			html$writeTag('li', paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
+			html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], "."))
 	}
 	html$writeEndTag('ul')
 
@@ -469,7 +493,7 @@
 # Print columns
 if ( ! is.null(opt[['list-cols']])) {
 	cols <- db$getChromCol()
-	df.write.tsv(cols, file = opt[['output-file']])
+	df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']])
 	q(status = 0)
 }
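
df.write.tsv() is a repository helper, but the stdout() fallback added here just relies on base R accepting a connection wherever a file name is expected. A minimal base-R equivalent of the behaviour:

	# Base-R sketch: write to the given path, or to standard output if none.
	out <- NULL  # e.g. opt[['output-file']]
	cols <- data.frame(id = c('col12', 'col18'))
	write.table(cols, file = if (is.null(out)) stdout() else out,
	            sep = "\t", row.names = FALSE, quote = FALSE)
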
 
@@ -479,7 +503,7 @@
 if (file.info(opt[['input-file']])$size > 0) {
 
 	# Load file into data frame
-	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t")
+	input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactors = FALSE)
 
 	# Convert each column that is identified by a number into a name
 	for (field in names(opt[['input-col-names']])) {
@@ -506,7 +530,7 @@
 
 # Check chrom columns
 if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) {
-	dbcols <- db$getChromCol()
+	dbcols <- db$getChromCol()[['id']]
 	unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols]
 	if (length(unknown.cols) > 0) {
 		stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n")))
@@ -532,6 +556,8 @@
 db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']])
 
 # Write output
+main.output$moveColumnsToBeginning(colnames(input))
+peaks.output$moveColumnsToBeginning(colnames(input))
 # TODO Create a class MsDbOutputCsvFileStream
 df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE)
 if ( ! is.null(opt[['peak-output-file']]))