Mercurial > repos > prog > lcmsmatching
diff biodb-common.R @ 1:45e985cd8e9e draft
planemo upload for repository https://github.com/workflow4metabolomics/lcmsmatching.git commit d4048accde6bdfd5b3e14f5394902d38991854f8-dirty
| author | prog |
|---|---|
| date | Tue, 31 Jan 2017 05:27:24 -0500 |
| parents | 3afe41d3e9e7 |
| children |
line wrap: on
line diff
--- a/biodb-common.R Mon Jul 11 09:12:03 2016 -0400 +++ b/biodb-common.R Tue Jan 31 05:27:24 2017 -0500 @@ -1,181 +1,350 @@ -if ( ! exists('RBIODB.COMPOUND')) { # Do not load again if already loaded +if ( ! exists('BIODB.XML')) { + + ############### + # CACHE MODES # + ############### + + BIODB.CACHE.READ.ONLY <- 'read-only' + BIODB.CACHE.READ.WRITE <- 'read-write' + BIODB.CACHE.WRITE.ONLY <- 'write-only' + + ####################### + # ENTRY CONTENT TYPES # + ####################### + + BIODB.HTML <- 'html' + BIODB.TXT <- 'txt' + BIODB.XML <- 'xml' + BIODB.CSV <- 'csv' + BIODB.DATAFRAME <- 'dataframe' + BIODB.JSON <- 'json' ############# - # CONSTANTS # + # DATABASES # ############# - - # Entry types - RBIODB.COMPOUND <- 'compound' - RBIODB.SPECTRUM <- 'spectrum' - - # Entry content types - RBIODB.HTML <- 'html' - RBIODB.TXT <- 'txt' - RBIODB.XML <- 'xml' - RBIODB.CSV <- 'csv' - RBIODB.ANY <- 'any' - # Class names - RBIODB.CHEBI <- 'chebi' - RBIODB.KEGG <- 'kegg' - RBIODB.PUBCHEM <- 'pubchem' - RBIODB.HMDB <- 'hmdb' - RBIODB.CHEMSPIDER <- 'chemspider' - RBIODB.ENZYME <- 'enzyme' - RBIODB.LIPIDMAPS <- 'lipidmaps' - RBIODB.MIRBASE <- 'mirbase' - RBIODB.NCBIGENE <- 'ncbigene' - RBIODB.NCBICCDS <- 'ncbiccds' - RBIODB.UNIPROT <- 'uniprot' - RBIODB.MASSBANK <- 'massbank' + BIODB.CHEBI <- 'chebi' + BIODB.KEGG <- 'kegg' + BIODB.PUBCHEMCOMP <- 'pubchemcomp' # Compound database + BIODB.PUBCHEMSUB <- 'pubchemsub' # Substance database + BIODB.HMDB <- 'hmdb' + BIODB.CHEMSPIDER <- 'chemspider' + BIODB.ENZYME <- 'enzyme' + BIODB.LIPIDMAPS <- 'lipidmaps' + BIODB.MIRBASE <- 'mirbase' + BIODB.NCBIGENE <- 'ncbigene' + BIODB.NCBICCDS <- 'ncbiccds' + BIODB.UNIPROT <- 'uniprot' + BIODB.MASSBANK <- 'massbank' + BIODB.MASSFILEDB <- 'massfiledb' + BIODB.PEAKFOREST <- 'peakforest' + + BIODB.DATABASES <- c(BIODB.CHEBI, BIODB.KEGG, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.HMDB, BIODB.CHEMSPIDER, BIODB.ENZYME, BIODB.LIPIDMAPS, BIODB.MIRBASE, BIODB.NCBIGENE, BIODB.NCBICCDS, BIODB.UNIPROT, BIODB.MASSBANK, BIODB.MASSFILEDB, BIODB.PEAKFOREST) + + ########## + # FIELDS # + ########## - # Fields - RBIODB.COMPOUND <- 'compound' - RBIODB.ACCESSION <- 'accession' - RBIODB.DESCRIPTION <- 'description' - RBIODB.PROTEIN.DESCRIPTION <- 'protdesc' - RBIODB.NAME <- 'name' - RBIODB.FULLNAMES <- 'fullnames' - RBIODB.SYNONYMS <- 'synonyms' - RBIODB.SYMBOL <- 'symbol' - RBIODB.GENE.SYMBOLS <- 'genesymbols' - RBIODB.CHEBI.ID <- 'chebiid' - RBIODB.LIPIDMAPS.ID <- 'lipidmapsid' - RBIODB.KEGG.ID <- 'keggid' - RBIODB.HMDB.ID <- 'hmdbid' - RBIODB.ENZYME.ID <- 'enzymeid' - RBIODB.NCBI.CCDS.ID <- 'ncbiccdsid' - RBIODB.NCBI.GENE.ID <- 'ncbigeneid' - RBIODB.PUBCHEM.ID <- 'pubchemid' - RBIODB.UNIPROT.ID <- 'uniprotid' - RBIODB.INCHI <- 'inchi' - RBIODB.INCHIKEY <- 'inchikey' - RBIODB.MSDEV <- 'msdev' - RBIODB.MSDEVTYPE <- 'msdevtype' - RBIODB.MSTYPE <- 'mstype' - RBIODB.MSMODE <- 'msmode' - RBIODB.MSPRECMZ <- 'msprecmz' # numeric - RBIODB.MSPRECANNOT <- 'msprecannot' - RBIODB.FORMULA <- 'formula' - RBIODB.SUPER.CLASS <- 'superclass' - RBIODB.MASS <- 'mass' - RBIODB.AVERAGE.MASS <- 'averagemass' - RBIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' - RBIODB.SEQUENCE <- 'sequence' - RBIODB.LOCATION <- 'location' - RBIODB.LENGTH <- 'length' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.NB.PEAKS <- 'nbpeaks' - RBIODB.PEAKS <- 'peaks' + BIODB.ACCESSION <- 'accession' + BIODB.DESCRIPTION <- 'description' + BIODB.PROTEIN.DESCRIPTION <- 'protdesc' + BIODB.NAME <- 'name' + BIODB.COMP.IUPAC.NAME.ALLOWED <- 'comp.iupac.name.allowed' + BIODB.COMP.IUPAC.NAME.TRAD <- 'comp.iupac.name.trad' + BIODB.COMP.IUPAC.NAME.SYST <- 'comp.iupac.name.syst' + BIODB.COMP.IUPAC.NAME.PREF <- 'comp.iupac.name.pref' + BIODB.COMP.IUPAC.NAME.CAS <- 'comp.iupac.name.cas' + BIODB.FULLNAMES <- 'fullnames' + BIODB.SYNONYMS <- 'synonyms' + BIODB.SYMBOL <- 'symbol' + BIODB.GENE.SYMBOLS <- 'genesymbols' + BIODB.CHEBI.ID <- 'chebiid' + BIODB.LIPIDMAPS.ID <- 'lipidmapsid' + BIODB.KEGG.ID <- 'keggid' + BIODB.HMDB.ID <- 'hmdbid' + BIODB.ENZYME.ID <- 'enzymeid' + BIODB.NCBI.CCDS.ID <- 'ncbiccdsid' + BIODB.NCBI.GENE.ID <- 'ncbigeneid' + BIODB.PUBCHEMCOMP.ID <- 'pubchemcompid' + BIODB.PUBCHEMSUB.ID <- 'pubchemsubid' + BIODB.CHEMSPIDER.ID <- 'chemspiderid' + BIODB.UNIPROT.ID <- 'uniprotid' + BIODB.CAS.ID <- 'casid' + BIODB.PEAKFOREST.ID <- 'peakforestid' + BIODB.SMILES <- 'smiles' + BIODB.INCHI <- 'inchi' + BIODB.INCHIKEY <- 'inchikey' + BIODB.MSDEV <- 'msdev' + BIODB.MSDEVTYPE <- 'msdevtype' + BIODB.MSTYPE <- 'mstype' + BIODB.MSMODE <- 'msmode' + BIODB.MSPRECMZ <- 'msprecmz' # numeric + BIODB.MSPRECANNOT <- 'msprecannot' + BIODB.FORMULA <- 'formula' + BIODB.SUPER.CLASS <- 'superclass' + BIODB.MASS <- 'mass' + BIODB.AVERAGE.MASS <- 'averagemass' + BIODB.MONOISOTOPIC.MASS <- 'monoisotopicmass' + BIODB.SEQUENCE <- 'sequence' + BIODB.LOCATION <- 'location' + BIODB.LENGTH <- 'length' + BIODB.NB.PEAKS <- 'nbpeaks' + BIODB.PEAKS <- 'peaks' + BIODB.COMPOUNDS <- 'compounds' + BIODB.NB.COMPOUNDS <- 'nbcompounds' + BIODB.COMPOUND.ID <- 'compoundid' + BIODB.COMPOUND.MASS <- 'compoundmass' + BIODB.COMPOUND.COMP <- 'compoundcomp' + BIODB.CHROM.COL <- 'chromcol' # Chromatographic column + BIODB.CHROM.COL.RT <- 'chromcolrt' # Retention time measured on chromatographic column + BIODB.ID <- 'id' + BIODB.TITLE <- 'title' + BIODB.PEAK.MZ <- 'mz' + BIODB.PEAK.RT <- 'rt' + BIODB.PEAK.MZEXP <- 'mzexp' + BIODB.PEAK.MZTHEO <- 'mztheo' + BIODB.PEAK.FORMULA <- 'formula' + BIODB.PEAK.FORMULA.COUNT <- 'formula.count' + BIODB.PEAK.COMP <- 'peakcomp' # Peak composition + BIODB.PEAK.ATTR <- 'peakattr' # Peak attribution + BIODB.PEAK.MASS <- 'mass' +# BIODB.PEAK.ATTR <- 'attr' + BIODB.PEAK.ERROR.PPM <- 'error.ppm' + BIODB.PEAK.INTENSITY <- 'intensity' + BIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' # Mode values - RBIODB.MSMODE.NEG <- 'neg' - RBIODB.MSMODE.POS <- 'pos' + BIODB.MSMODE.NEG <- 'neg' + BIODB.MSMODE.POS <- 'pos' - # Cardinalities - RBIODB.CARD.ONE <- '1' - RBIODB.CARD.MANY <- '*' + # Tolerance values + BIODB.TOL <- 'mztol' + BIODB.MZTOLUNIT.PPM <- 'ppm' + BIODB.MZTOLUNIT.PLAIN <- 'plain' # same as mz: mass-to-charge ratio + BIODB.MZTOLUNIT.VALS <- c(BIODB.MZTOLUNIT.PPM, BIODB.MZTOLUNIT.PLAIN) + + ######################## + # MS-MS MEASURE VALUES # + ######################## + + BIODB.MSMS.DIST.COS <- "cosine" + BIODB.MSMS.DIST.WCOSINE <- "wcosine" + BIODB.MSMS.DIST.PKERNEL <- "pkernel" + BIODB.MSMS.DIST <- c(BIODB.MSMS.DIST.COS, BIODB.MSMS.DIST.WCOSINE, BIODB.MSMS.DIST.PKERNEL) + + + ################# + # CARDINALITIES # + ################# + + BIODB.CARD.ONE <- '1' + BIODB.CARD.MANY <- '*' + + ##################### + #INTENSITy NOTATIONS# + ##################### + + BIODB.GROUP.INTENSITY<-c(BIODB.PEAK.INTENSITY,BIODB.PEAK.RELATIVE.INTENSITY) - # Field attributes - RBIODB.FIELDS <- data.frame(matrix(c( - # FIELD NAME CLASS CARDINALITY - RBIODB.COMPOUND, 'BiodEntry', RBIODB.CARD.ONE, - RBIODB.ACCESSION, 'character', RBIODB.CARD.ONE, - RBIODB.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.NAME, 'character', RBIODB.CARD.ONE, - RBIODB.FULLNAMES, 'character', RBIODB.CARD.MANY, - RBIODB.SYNONYMS, 'character', RBIODB.CARD.MANY, - RBIODB.PROTEIN.DESCRIPTION, 'character', RBIODB.CARD.ONE, - RBIODB.SYMBOL, 'character', RBIODB.CARD.ONE, - RBIODB.GENE.SYMBOLS, 'character', RBIODB.CARD.MANY, - RBIODB.CHEBI.ID, 'character', RBIODB.CARD.ONE, - RBIODB.LIPIDMAPS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.KEGG.ID, 'character', RBIODB.CARD.ONE, - RBIODB.HMDB.ID, 'character', RBIODB.CARD.ONE, - RBIODB.ENZYME.ID, 'character', RBIODB.CARD.ONE, - RBIODB.PUBCHEM.ID, 'character', RBIODB.CARD.ONE, - RBIODB.UNIPROT.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.CCDS.ID, 'character', RBIODB.CARD.ONE, - RBIODB.NCBI.GENE.ID, 'character', RBIODB.CARD.ONE, - RBIODB.INCHI, 'character', RBIODB.CARD.ONE, - RBIODB.INCHIKEY, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEV, 'character', RBIODB.CARD.ONE, - RBIODB.MSDEVTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSTYPE, 'character', RBIODB.CARD.ONE, - RBIODB.MSMODE, 'character', RBIODB.CARD.ONE, - RBIODB.MSPRECMZ, 'double', RBIODB.CARD.ONE, - RBIODB.MSPRECANNOT, 'character', RBIODB.CARD.ONE, - RBIODB.FORMULA, 'character', RBIODB.CARD.ONE, - RBIODB.SUPER.CLASS, 'character', RBIODB.CARD.ONE, - RBIODB.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.AVERAGE.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.MONOISOTOPIC.MASS, 'double', RBIODB.CARD.ONE, - RBIODB.SEQUENCE, 'character', RBIODB.CARD.ONE, - RBIODB.LENGTH, 'integer', RBIODB.CARD.ONE, - RBIODB.LOCATION, 'character', RBIODB.CARD.ONE, - RBIODB.NB.PEAKS, 'integer', RBIODB.CARD.ONE, - RBIODB.PEAKS, 'data.frame', RBIODB.CARD.ONE - ), byrow = TRUE, ncol = 3), stringsAsFactors = FALSE) - colnames(RBIODB.FIELDS) <- c('name', 'class', 'cardinality') + ########################## + # ENTRY FIELD ATTRIBUTES # + ########################## + # FIELD NAME CLASS CARDINALITY TYPE + BIODB.FIELDS <- data.frame(matrix(c( + BIODB.ACCESSION, 'character', BIODB.CARD.ONE, 'none', + BIODB.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none', + BIODB.NAME, 'character', BIODB.CARD.ONE, 'name', + BIODB.COMP.IUPAC.NAME.ALLOWED, 'character', BIODB.CARD.ONE, 'name', + BIODB.COMP.IUPAC.NAME.TRAD, 'character', BIODB.CARD.ONE, 'name', + BIODB.COMP.IUPAC.NAME.SYST, 'character', BIODB.CARD.ONE, 'name', + BIODB.COMP.IUPAC.NAME.PREF, 'character', BIODB.CARD.ONE, 'name', + BIODB.COMP.IUPAC.NAME.CAS, 'character', BIODB.CARD.ONE, 'name', + BIODB.FULLNAMES, 'character', BIODB.CARD.MANY, 'name', + BIODB.SYNONYMS, 'character', BIODB.CARD.MANY, 'name', + BIODB.PROTEIN.DESCRIPTION, 'character', BIODB.CARD.ONE, 'none', + BIODB.SYMBOL, 'character', BIODB.CARD.ONE, 'none', + BIODB.GENE.SYMBOLS, 'character', BIODB.CARD.MANY, 'none', + BIODB.NB.COMPOUNDS, 'integer', BIODB.CARD.ONE, 'none', + BIODB.COMPOUNDS, 'object', BIODB.CARD.MANY, 'none', + BIODB.CHEBI.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.LIPIDMAPS.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.KEGG.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.HMDB.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.ENZYME.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.PUBCHEMCOMP.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.PUBCHEMSUB.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.PEAKFOREST.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.UNIPROT.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.NCBI.CCDS.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.NCBI.GENE.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.INCHI, 'character', BIODB.CARD.ONE, 'none', + BIODB.INCHIKEY, 'character', BIODB.CARD.ONE, 'none', + BIODB.MSDEV, 'character', BIODB.CARD.ONE, 'none', + BIODB.MSDEVTYPE, 'character', BIODB.CARD.ONE, 'none', + BIODB.MSTYPE, 'character', BIODB.CARD.ONE, 'none', + BIODB.MSMODE, 'character', BIODB.CARD.ONE, 'none', + BIODB.MSPRECMZ, 'double', BIODB.CARD.ONE, 'none', + BIODB.PEAK.MZTHEO, 'double', BIODB.CARD.ONE, 'none', + BIODB.MSPRECANNOT, 'character', BIODB.CARD.ONE, 'none', + BIODB.FORMULA, 'character', BIODB.CARD.ONE, 'none', + BIODB.SUPER.CLASS, 'character', BIODB.CARD.ONE, 'none', + BIODB.MASS, 'double', BIODB.CARD.ONE, 'none', + BIODB.AVERAGE.MASS, 'double', BIODB.CARD.ONE, 'none', + BIODB.MONOISOTOPIC.MASS, 'double', BIODB.CARD.ONE, 'none', + BIODB.SEQUENCE, 'character', BIODB.CARD.ONE, 'none', + BIODB.LENGTH, 'integer', BIODB.CARD.ONE, 'none', + BIODB.LOCATION, 'character', BIODB.CARD.ONE, 'none', + BIODB.NB.PEAKS, 'integer', BIODB.CARD.ONE, 'none', + BIODB.PEAKS, 'data.frame', BIODB.CARD.ONE, 'none', + BIODB.SMILES, 'character', BIODB.CARD.ONE, 'none', + BIODB.CHEMSPIDER.ID, 'character', BIODB.CARD.ONE, 'none', + BIODB.CAS.ID, 'character', BIODB.CARD.ONE, 'none' + ), byrow = TRUE, ncol = 4), stringsAsFactors = FALSE) + colnames(BIODB.FIELDS) <- c('name', 'class', 'cardinality', 'type') - # How to compute a missing field ? - RBIODB.FIELD.COMPUTING <- list() - RBIODB.FIELD.COMPUTING[[RBIODB.INCHI]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.INCHIKEY]] <- c(RBIODB.CHEBI) - RBIODB.FIELD.COMPUTING[[RBIODB.SEQUENCE]] <- c(RBIODB.NCBICCDS) + ######################### + # GET DATABASE ID FIELD # + ######################### + + biodb.get.database.id.field <- function(database) { + + id.field <- NA_character_ + + if (database %in% BIODB.DATABASES) { + id.field <- paste0(database, 'id') + if ( ! id.field %in% BIODB.FIELDS[['name']]) + stop(paste0('No ID field defined for database ', database, '.')) + } + + return(id.field) + } - # Peaks data frame columns - RBIODB.PEAK.MZ <- 'mz' - RBIODB.PEAK.FORMULA <- 'formula' - RBIODB.PEAK.FORMULA.COUNT <- 'formula.count' - RBIODB.PEAK.MASS <- 'mass' - RBIODB.PEAK.ERROR.PPM <- 'error.ppm' - RBIODB.PEAK.INTENSITY <- 'intensity' - RBIODB.PEAK.RELATIVE.INTENSITY <- 'relative.intensity' - RBIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) - colnames(RBIODB.PEAK.DF.EXAMPLE) <- c(RBIODB.PEAK.MZ, RBIODB.PEAK.INTENSITY, RBIODB.PEAK.RELATIVE.INTENSITY, RBIODB.PEAK.FORMULA, RBIODB.PEAK.FORMULA.COUNT, RBIODB.PEAK.MASS, RBIODB.PEAK.ERROR.PPM) + ##################### + # COMPUTABLE FIELDS # + ##################### + + BIODB.FIELD.COMPUTING <- list() + BIODB.FIELD.COMPUTING[[BIODB.INCHI]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.INCHIKEY]] <- c(BIODB.CHEBI) + BIODB.FIELD.COMPUTING[[BIODB.SEQUENCE]] <- c(BIODB.NCBICCDS) + + #################### + # PEAKS DATA FRAME # + #################### + + # Example + BIODB.PEAK.DF.EXAMPLE <- data.frame(mz = double(), int = double(), rel.int = integer(), formula = character(), formula.count <- integer(), mass = double(), error = double(), stringsAsFactors = FALSE) + colnames(BIODB.PEAK.DF.EXAMPLE) <- c(BIODB.PEAK.MZ, BIODB.PEAK.INTENSITY, BIODB.PEAK.RELATIVE.INTENSITY, BIODB.PEAK.FORMULA, BIODB.PEAK.FORMULA.COUNT, BIODB.PEAK.MASS, BIODB.PEAK.ERROR.PPM) ################# # GET ENTRY URL # ################# # TODO Let the choice to use either jp or eu - RBIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/getRecordInfo" - RBIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/getRecordInfo" + BIODB.MASSBANK.JP.WS.URL <- "http://www.massbank.jp/api/services/MassBankAPI/" + BIODB.MASSBANK.EU.WS.URL <- "http://massbank.eu/api/services/MassBankAPI/" - get.entry.url <- function(class, accession, content.type = RBIODB.ANY) { + .do.get.entry.url <- function(class, accession, content.type = BIODB.HTML, base.url = NA_character_, token = NA_character_) { + + # Only certain databases can handle multiple accession ids + if ( ! class %in% c(BIODB.MASSBANK, BIODB.CHEMSPIDER, BIODB.PUBCHEMCOMP, BIODB.PUBCHEMSUB, BIODB.PEAKFOREST) && length(accession) > 1) + stop(paste0("Cannot build a URL for getting multiple entries for class ", class, ".")) + # Get URL url <- switch(class, - chebi = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, - chemspider = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html') else NULL, - enzyme = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, + chebi = if (content.type == BIODB.HTML) paste0('https://www.ebi.ac.uk/chebi/searchId.do?chebiId=', accession) else NULL, + chemspider = { + token.param <- if (is.na(token)) '' else paste('&token', token, sep = '=') + switch(content.type, + html = paste0('http://www.chemspider.com/Chemical-Structure.', accession, '.html'), + xml = paste0('http://www.chemspider.com/MassSpecAPI.asmx/GetExtendedCompoundInfoArray?', paste(paste0('CSIDs=', accession), collapse = '&'), token.param), + NULL) + }, + enzyme = if (content.type == BIODB.TXT) paste0('http://enzyme.expasy.org/EC/', accession, '.txt') else NULL, hmdb = switch(content.type, xml = paste0('http://www.hmdb.ca/metabolites/', accession, '.xml'), html = paste0('http://www.hmdb.ca/metabolites/', accession), - any = paste0('http://www.hmdb.ca/metabolites/', accession), NULL), kegg = switch(content.type, txt = paste0('http://rest.kegg.jp/get/', accession), html = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), - any = paste0('http://www.genome.jp/dbget-bin/www_bget?cpd:', accession), + NULL), + lipidmaps = if (content.type == BIODB.CSV) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, + massbank = if (content.type == BIODB.TXT) paste0((if (is.na(base.url)) BIODB.MASSBANK.EU.WS.URL else base.url), 'getRecordInfo?ids=', paste(accession, collapse = ',')) else NULL, + mirbase = if (content.type == BIODB.HTML) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, + pubchemcomp = switch(content.type, + xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/', paste(accession, collapse = ','), '/XML'), + html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), NULL), - lipidmaps = if (content.type %in% c(RBIODB.ANY, RBIODB.CSV)) paste0('http://www.lipidmaps.org/data/LMSDRecord.php?Mode=File&LMID=', accession, '&OutputType=CSV&OutputQuote=No') else NULL, - massbank = if (content.type %in% c(RBIODB.ANY, RBIODB.TXT)) paste0(RBIODB.MASSBANK.EU.WS.URL, '?ids=', paste(accession, collapse = ',')) else NULL, - mirbase = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('http://www.mirbase.org/cgi-bin/mature.pl?mature_acc=', accession) else NULL, - pubchem = { - accession <- gsub(' ', '', accession, perl = TRUE) - accession <- gsub('^CID', '', accession, perl = TRUE) - switch(content.type, - xml = paste0('http://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/', accession, '/XML/?response_type=save&response_basename=CID_', accession), - html = paste0('http://pubchem.ncbi.nlm.nih.gov/compound/', accession), - NULL) - }, - ncbigene = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, - ncbiccds = if (content.type %in% c(RBIODB.ANY, RBIODB.HTML)) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), - uniprot = if (content.type %in% c(RBIODB.ANY, RBIODB.XML)) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), + pubchemsub = switch(content.type, + xml = paste0('https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/', paste(accession, collapse = ','), '/XML'), + html = paste0('http://pubchem.ncbi.nlm.nih.gov/substance/', accession), + NULL), + ncbigene = if (content.type == BIODB.XML) paste0('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&id=', accession, '&rettype=xml&retmode=text') else NULL, + ncbiccds = if (content.type == BIODB.HTML) paste0('https://www.ncbi.nlm.nih.gov/CCDS/CcdsBrowse.cgi?REQUEST=CCDS&GO=MainBrowse&DATA=', accession), + uniprot = if (content.type == BIODB.XML) paste0('http://www.uniprot.org/uniprot/', accession, '.xml'), + peakforest = switch(content.type, + html= paste0('https://peakforest.org/home?PFs=',accession), + json= paste0('https://peakforest-alpha.inra.fr/rest/spectra/lcms/ids/',paste(accession,sep=','),'?token=',token), + NULL ) - + ) return(url) } + + get.entry.url <- function(class, accession, content.type = BIODB.HTML, max.length = 0, base.url = NA_character_, token = NA_character_) { + + if (length(accession) == 0) + return(NULL) + + full.url <- .do.get.entry.url(class, accession, content.type = content.type, base.url = base.url, token = token) + if (max.length == 0 || nchar(full.url) <= max.length) + return(if (max.length == 0) full.url else list(url = full.url, n = length(accession))) + + # Find max size URL + a <- 1 + b <- length(accession) + while (a < b) { + m <- as.integer((a + b) / 2) + url <- .do.get.entry.url(class, accession[1:m], content.type = content.type, base.url = base.url, token = token) + if (nchar(url) <= max.length && m != a) + a <- m + else + b <- m + } + url <- .do.get.entry.url(class, accession[1:a], content.type = content.type, base.url = base.url, token = token) + + return(list( url = url, n = a)) + } + + ################# + # PRINT MESSAGE # + ################# + + BIODB.DEBUG <- 1 + BIODB.LEVEL.NAMES <- c('DEBUG') + + .print.msg <- function(msg, level = BIODB.DEBUG, class = NA_character_) { + cat(paste0(BIODB.LEVEL.NAMES[[level]], if (is.na(class)) '' else paste0(", ", class), ": ", msg, "\n"), file = stderr()) + } + + ##################### + # BIODB GET ENV VAR # + ##################### + + .biodb.get.env.var <- function(v) { + + # Get all env vars + env <- Sys.getenv() + + # Make env var name + env.var <- paste(c('BIODB', toupper(v)), collapse = '_') + + # Look if this env var exists + if (env.var %in% names(env)) + return(env[[env.var]]) + + return(NA_character_) + } }
