Mercurial > repos > prog > lcmsmatching
diff search-mz @ 1:45e985cd8e9e draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8-dirty
| author | prog |
|---|---|
| date | Tue, 31 Jan 2017 05:27:24 -0500 |
| parents | 3afe41d3e9e7 |
| children | 1ba222315fd5 |
line wrap: on
line diff
--- a/search-mz Mon Jul 11 09:12:03 2016 -0400 +++ b/search-mz Tue Jan 31 05:27:24 2017 -0500 @@ -17,11 +17,16 @@ source(file.path(dirname(script.path), 'biodb-common.R'), chdir = TRUE) source(file.path(dirname(script.path), 'nethlp.R'), chdir = TRUE) +# Missing paste0() function in R 2.14.1 +if (as.integer(R.Version()$major) == 2 && as.numeric(R.Version()$minor) < 15) + paste0 <- function(...) paste(..., sep = '') + ############# # CONSTANTS # ############# PROG <- sub('^.*/([^/]+)$', '\\1', commandArgs()[4], perl = TRUE) +USERAGENT <- 'search-mz ; pierrick.roger@gmail.com' # Authorized database types MSDB.XLS <- 'xls' @@ -44,10 +49,11 @@ MSDB.DFT[['molids-sep']] <- MSDB.DFT.MATCH.SEP MSDB.DFT[['db-fields']] <- concat.kv.list(msdb.get.dft.db.fields()) MSDB.DFT[['db-ms-modes']] <- concat.kv.list(MSDB.DFT.MODES) -MSDB.DFT[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) -MSDB.DFT[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields()) MSDB.DFT[['pos-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.POS]], collapse = ',') MSDB.DFT[['neg-prec']] <- paste(MSDB.DFT.PREC[[MSDB.TAG.NEG]], collapse = ',') +DEFAULT.ARG.VALUES <- MSDB.DFT +DEFAULT.ARG.VALUES[['input-col-names']] <- concat.kv.list(msdb.get.dft.input.fields()) +DEFAULT.ARG.VALUES[['output-col-names']] <- concat.kv.list(msdb.get.dft.output.fields()) ############## # PRINT HELP # @@ -103,16 +109,26 @@ opt$rtcol <- strsplit(opt$rtcol, ',')[[1]] # Parse input column names - if ( ! is.null(opt[['input-col-names']])) { + if (is.null(opt[['input-col-names']])) { + opt[['input-col-names']] <- msdb.get.dft.input.fields() + } + else { custcols <- split.kv.list(opt[['input-col-names']]) - dftcols <- split.kv.list(MSDB.DFT[['input-col-names']]) + dftcols <- msdb.get.dft.input.fields() opt[['input-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) } # Parse output column names - if ( ! is.null(opt[['output-col-names']])) { + if (is.null(opt[['output-col-names']])) { + # By default keep input col names for output + opt[['output-col-names']] <- msdb.get.dft.output.fields() + input.cols <- names(opt[['input-col-names']]) + output.cols <- names(opt[['output-col-names']]) + opt[['output-col-names']] <- c(opt[['input-col-names']][input.cols %in% output.cols], opt[['output-col-names']][ ! output.cols %in% input.cols]) + } + else { custcols <- split.kv.list(opt[['output-col-names']]) - dftcols <- split.kv.list(MSDB.DFT[['output-col-names']]) + dftcols <- msdb.get.dft.output.fields() opt[['output-col-names']] <- c(custcols, dftcols[ ! names(dftcols) %in% names(custcols)]) } @@ -131,7 +147,7 @@ print.dft.arg.val <- function(opt) { - print.flags <- MSDB.DFT + print.flags <- DEFAULT.ARG.VALUES names(print.flags) <- vapply(names(print.flags), function(x) paste0('print-', x), FUN.VALUE = '') for (f in names(print.flags)) if ( ! is.null(opt[[f]])) { @@ -144,7 +160,7 @@ spec <- character() - for (f in names(MSDB.DFT)) + for (f in names(DEFAULT.ARG.VALUES)) spec <- c(spec, paste0('print-', f), NA_character_, 0, 'logical', paste0('Print default value of --', f)) return(spec) @@ -179,20 +195,20 @@ 'precursor-rt-tol', NA_character_, 1, 'numeric', paste0('Precursor retention time tolerance. Only used when precursor-match is enabled. Default is ', MSDB.DFT[['precursor-rt-tol']], '.'), 'pos-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in positive mode. Default is "', MSDB.DFT[['pos-prec']], '".'), 'neg-prec', NA_character_, 1, 'character', paste0('Set the list of precursors to use in negative mode. Default is "', MSDB.DFT[['neg-prec']], '".'), - 'input-col-names', NA_character_, 1, 'character', paste0('Set the input column names. Default is "', MSDB.DFT[['input-col-names']], '".'), - 'output-col-names', NA_character_, 1, 'character', paste0('Set the output column names. Default is "', MSDB.DFT[['output-col-names']], '".'), + 'input-col-names', NA_character_, 1, 'character', paste0('Set the input column names. Default is "', DEFAULT.ARG.VALUES[['input-col-names']], '".'), + 'output-col-names', NA_character_, 1, 'character', paste0('Set the output column names. Default is "', DEFAULT.ARG.VALUES[['output-col-names']], '".'), 'molids-sep', NA_character_, 1, 'character', paste0('Set character separator used to when concatenating molecule IDs in output. Default is "', MSDB.DFT[['molids-sep']] , '".'), 'first-val', NA_character_, 0, 'logical', 'Keep only the first value in multi-value fields. Unset by default.', 'excel2011comp', NA_character_, 0, 'logical', 'Excel 2011 compatiblity mode. Output ASCII text files instead of UTF-8 files, where greek letters are replaced with their latin names, plusminus sign is replaced with +- and apostrophe is replaced with \"prime\". All other non-ASCII characters are repladed with underscore.', 'database', 'd', 1, 'character', paste0('Set database to use: "xls" for an Excel database, "file" for a single file database, "4tabsql" for a 4Tab SQL database, and "peakforest" for a connection to PeakForest database.'), 'url', NA_character_, 1, 'character', 'URL of database. For "peakforest" database it is the HTTP URL, for the "xls" database it is the path to the directory containing the Excel files, for the "file" database it is the path to the file database and for the "4tabsql" database it is the IP address of the server.', 'cache-dir', NA_character_, 1, 'character', 'Path to directory where to store cache files. Only used when database flag is set to "xls".', - 'useragent', NA_character_, 1, 'character', 'User agent. Used by the "Peakforest" database.', 'db-name', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-user', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', - 'db-password', NA_character_, 1, 'character', 'Name of the database. Used by the "4tabsql" database.', + 'db-user', NA_character_, 1, 'character', 'User of the database. Used by the "4tabsql" database.', + 'db-password', NA_character_, 1, 'character', 'Password of the database user. Used by the "4tabsql" database.', 'db-fields', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the field names to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-fields']], '".'), 'db-ms-modes', NA_character_, 1, 'character', paste0('Comma separated key/value list giving the MS modes to be used in the single file database (option --db-file). Default is "', MSDB.DFT[['db-ms-modes']], '".'), + 'db-token', NA_character_, 1, 'character', 'Database token. Used by Peakforest database.', 'debug', NA_character_, 0, 'logical', 'Set debug mode.' ) @@ -224,7 +240,7 @@ # Check values error <- .check.db.conn.opts(opt) - if (is.null(opt[['output-file']])) { + if (is.null(opt[['output-file']]) && is.null(opt[['list-cols']])) { warning("You must set a path for the output file.") error <- TRUE } @@ -327,10 +343,6 @@ warning("When using PeakForest database, you must specify the URL of the PeakForest server with option --url.") error <- TRUE } - if (is.null(opt$useragent)) { - warning("When using PeakForest database, you must specify a user agent with option --useragent.") - error <- TRUE - } } return(error) @@ -363,10 +375,10 @@ } db <- switch(opt$database, - peakforest = MsPeakForestDb$new(url = opt$url, useragent = opt$useragent), - xls = MsXlsDb(db_dir = opt$url, cache_dir = opt[['cache-dir']]), - '4tabsql' = Ms4TabSqlDb(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), - file = MsFileDb(file = opt$url), + peakforest = MsPeakForestDb$new(url = opt$url, useragent = USERAGENT, token = opt[['db-token']]), + xls = MsXlsDb$new(db_dir = opt$url, cache_dir = opt[['cache-dir']]), + '4tabsql' = Ms4TabSqlDb$new(host = extract.address(opt$url), port = extract.port(opt$url), dbname = opt[['db-name']], user = opt[['db-user']], password = opt[['db-password']]), + file = MsFileDb$new(file = opt$url), NULL) db$setPrecursors(precursors) if (db$areDbFieldsSettable()) @@ -385,17 +397,29 @@ output.html <- function(db, main, peaks, file, opt, output.fields) { # Replace public database IDs by URLs - if ( ! is.null(peaks)) + if ( ! is.null(peaks) || ! is.null(main)) { + # Conversion from extdb id field to extdb name + extdb2classdb = list() + extdb2classdb[MSDB.TAG.KEGG] = BIODB.KEGG + extdb2classdb[MSDB.TAG.HMDB] = BIODB.HMDB + extdb2classdb[MSDB.TAG.CHEBI] = BIODB.CHEBI + extdb2classdb[MSDB.TAG.PUBCHEM] = BIODB.PUBCHEMCOMP + + # Loop on all dbs for (extdb in c(MSDB.TAG.KEGG, MSDB.TAG.HMDB, MSDB.TAG.CHEBI, MSDB.TAG.PUBCHEM)) { field <- output.fields[[extdb]] - if (field %in% colnames(peaks)) - peaks[[field]] <- vapply(peaks[[field]], function(id) paste0('<a href="', get.entry.url(class = extdb, accession = id, content.type = RBIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') + if ( ! is.null(peaks) && field %in% colnames(peaks)) + peaks[[field]] <- vapply(peaks[[field]], function(id) if (is.na(id)) '' else paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = '') + if ( ! is.null(main) && field %in% colnames(main)) + main[[field]] <- vapply(main[[field]], function(ids) if (is.na(ids) || nchar(ids) == 0) '' else paste(vapply(strsplit(ids, opt[['molids-sep']])[[1]], function(id) paste0('<a href="', get.entry.url(class = extdb2classdb[[extdb]], accession = id, content.type = BIODB.HTML), '">', id, '</a>'), FUN.VALUE = ''), collapse = opt[['molids-sep']]), FUN.VALUE = '') } + } # Write HTML html <- HtmlWriter(file = file) html$writeBegTag('html') html$writeBegTag('header') + html$writeTag('meta', attr = c(charset = "UTF-8")) html$writeTag('title', text = "LC/MS matching results") html$writeBegTag('style') html$write('table, th, td { border-collapse: collapse; }') @@ -413,20 +437,20 @@ # Write parameters html$writeTag('h2', text = "Parameters") html$writeBegTag('ul') - html$writeTag('li', paste0("Mode = ", opt$mode, ".")) - html$writeTag('li', paste0("M/Z precision = ", opt$mzprec, ".")) - html$writeTag('li', paste0("M/Z shift = ", opt$mzshift, ".")) - html$writeTag('li', paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), ".")) + html$writeTag('li', text = paste0("Mode = ", opt$mode, ".")) + html$writeTag('li', text = paste0("M/Z precision = ", opt$mzprec, ".")) + html$writeTag('li', text = paste0("M/Z shift = ", opt$mzshift, ".")) + html$writeTag('li', text = paste0("Precursor match = ", (if (is.null(opt[['precursor-match']])) "no" else "yes"), ".")) if ( ! is.null(opt[['precursor-match']])) { - html$writeTag('li', paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), ".")) - html$writeTag('li', paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), ".")) + html$writeTag('li', text = paste0("Positive precursors = ", paste0(opt[['pos-prec']], collapse = ', '), ".")) + html$writeTag('li', text = paste0("Negative precursors = ", paste0(opt[['neg-prec']], collapse = ', '), ".")) } if ( ! is.null(opt$rtcol)) { - html$writeTag('li', paste0("Columns = ", paste(opt$rtcol, collapse = ", "), ".")) - html$writeTag('li', paste0("RTX = ", opt$rttolx, ".")) - html$writeTag('li', paste0("RTY = ", opt$rttoly, ".")) + html$writeTag('li', text = paste0("Columns = ", paste(opt$rtcol, collapse = ", "), ".")) + html$writeTag('li', text = paste0("RTX = ", opt$rttolx, ".")) + html$writeTag('li', text = paste0("RTY = ", opt$rttoly, ".")) if ( ! is.null(opt[['precursor-match']])) - html$writeTag('li', paste0("RTZ = ", opt[['precursor-rt-tol']], ".")) + html$writeTag('li', text = paste0("RTZ = ", opt[['precursor-rt-tol']], ".")) } html$writeEndTag('ul') @@ -469,7 +493,7 @@ # Print columns if ( ! is.null(opt[['list-cols']])) { cols <- db$getChromCol() - df.write.tsv(cols, file = opt[['output-file']]) + df.write.tsv(cols, file = if (is.null(opt[['output-file']])) stdout() else opt[['output-file']]) q(status = 0) } @@ -479,7 +503,7 @@ if (file.info(opt[['input-file']])$size > 0) { # Load file into data frame - input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t") + input <- read.table(file = opt[['input-file']], header = TRUE, sep = "\t", stringsAsFactor = FALSE) # Convert each column that is identified by a number into a name for (field in names(opt[['input-col-names']])) { @@ -506,7 +530,7 @@ # Check chrom columns if ( ! is.null(opt[['check-cols']]) && ! is.null(opt$rtcol)) { - dbcols <- db$getChromCol() + dbcols <- db$getChromCol()[['id']] unknown.cols <- opt$rtcol[ ! opt$rtcol %in% dbcols] if (length(unknown.cols) > 0) { stop(paste0("Unknown chromatographic column", (if (length(unknown.cols) > 1) 's' else ''), ': ', paste(unknown.cols, collapse = ', '), ".\nAllowed chromatographic column names are:\n", paste(dbcols, collapse = "\n"))) @@ -532,6 +556,8 @@ db$searchForMzRtList(mode = mode, shift = opt$mzshift, prec = opt$mzprec, rt.tol = opt$rttol, rt.tol.x = opt$rttolx, rt.tol.y = opt$rttoly, col = opt$rtcol, precursor.match = ! is.null(opt[['precursor-match']]), precursor.rt.tol = opt[['precursor-rt-tol']]) # Write output +main.output$moveColumnsToBeginning(colnames(input)) +peaks.output$moveColumnsToBeginning(colnames(input)) # TODO Create a class MsDbOutputCsvFileStream df.write.tsv(main.output$getDataFrame(), file = opt[['output-file']], row.names = FALSE) if ( ! is.null(opt[['peak-output-file']]))
