Mercurial > repos > artbio > ngsplot
view lib/genedb.r @ 4:9652d08d8441 draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/ngsplot commit 73a8ef46dddb0a1b224720bcdfe46050ee9a1b8b
author | artbio |
---|---|
date | Tue, 26 Dec 2017 08:07:56 -0500 |
parents | 3ca58369469c |
children |
line wrap: on
line source
chromFormat <- function(crn, ...){ # Format chromosome names to our standard. # Args: # crn: character vector of chromosome names. # Returns: character vector of formatted chromosome names. crn <- sub('.fa$', '', crn) nochr.i <- grep('^chr', crn, invert=T) crn[nochr.i] <- paste('chr', crn[nochr.i], sep='') crn } FIScoreIntersect <- function(db.info, v.finfo){ # Further info score based on intersection with database FI columns. # Used to select database files. # Args: # db.info: one record of annotation database table. # v.finfo: further info that is split into vector. # Returns: a score for the intersection between database and further info. length(intersect(as.vector(db.info[c("FI.1", "FI.2", "FI.3")]), v.finfo)) } SetupPlotCoord <- function(args.tbl, ctg.tbl, default.tbl, dbfile.tbl, progpath, genome, reg2plot, lgint, flanksize, samprate, galaxy) { # Load genomic coordinates for plot based on the input arguments. # Args: # args.tbl: input argument table # ctg.tbl: coverage-genelist-title table # default.tbl: default settings of databases and plot setting # dbfile.tbl: details of database files(.RData). # progpath: program path derived from NGSPLOT # genome: genome name, such as mm9, hg19. # reg2plot: tss, tes, genebody, etc. # lgint: boolean tag for large interval. # flanksize: flanking region size. # samprate: sampling rate # galaxy: boolean tag for galaxy installation. if(reg2plot!='bed'){ # Subset using genome-region combination. key <- default.tbl$Genome == genome & default.tbl$Region == reg2plot if(sum(key) == 0) { stop("The combination of genome and region does not exist. You may need to install the genome or the region does not exist yet.\n") } anno.parameters <- default.tbl[key, ] db.match.mask <- dbfile.tbl$Genome == genome & dbfile.tbl$Region == reg2plot anno.db.candidates <- dbfile.tbl[db.match.mask, ] # Database flavor. if('-D' %in% names(args.tbl)){ database <- as.character(args.tbl['-D']) database.allowed <- unique(anno.db.candidates$DB) stopifnot(database %in% database.allowed) }else{ database <- as.character(anno.parameters$DefaultDB) } anno.db.candidates <- anno.db.candidates[anno.db.candidates$DB == database, ] if (file.exists(file.path(progpath, 'database', genome, reg2plot))){ prefix <- file.path(progpath, 'database', genome, reg2plot) }else{ prefix <- file.path(progpath, 'database', genome) } # Further info to subset genomic regions. if('-F' %in% names(args.tbl)){ finfo <- as.character(args.tbl['-F']) } else { finfo <- NULL } Labs <- unlist(strsplit(as.character(anno.parameters$PointLab[1]), ",")) if(length(Labs)==1){ pint <- TRUE }else{ pint <- FALSE } if(!is.null(finfo)){ # use finfo to subset DB tables. v.finfo <- unlist(strsplit(finfo, ',')) # Get the intersect of the v.finfo and FI columns of databases, # and choose the best fit. fi.score <- apply(anno.db.candidates, 1, FIScoreIntersect, v.finfo=v.finfo) anno.db.candidates <- anno.db.candidates[fi.score == max(fi.score), ] # RNA-seq tag. if(reg2plot == 'genebody' && 'rnaseq' %in% v.finfo) { rnaseq.gb <- TRUE }else{ rnaseq.gb <- FALSE } } else { rnaseq.gb <- FALSE } anno.db.candidates <- anno.db.candidates[order(anno.db.candidates$dbScore), ] f.load <- file.path(prefix, anno.db.candidates$"db.file"[1]) if (!file.exists(f.load)){ stop("The requested database file does not exist. You may have a corrupted database. Consider reinstalling the genome.\n") # cat("\nDownloading database:\n") # download.file(anno.db.candidates$"URL"[1], destfile=f.load, # method="curl") # stopifnot(file.exists(f.load)) } # Load genomic coordinates. cat("\nUsing database:\n") cat(paste(f.load, "\n", sep="")) load(f.load) # load coordinates into variable: genome.coord } else if(reg2plot == 'bed') { rnaseq.gb <- FALSE } # Identify unique regions. reg.list <- as.factor(ctg.tbl$glist) uniq.reg <- levels(reg.list) # Determine plot coordinates for each unique region. coord.list <- vector('list', length(uniq.reg)) names(coord.list) <- uniq.reg ReadBedCoord <- function(bed.file) { # Read a bed into memory as a dataframe. # Args: # bed.file: path of the bed file to be read. # Returns: genome coordinates as a dataframe. if(!galaxy && length(grep("\\.bed([0-9]+)?$", bed.file, ignore.case=T)) == 0) { warning(sprintf("File name: '%s' does not seem to a correct name for bed file.\n", bed.file)) } bed.coord <- read.table(bed.file, sep="\t", quote="\"") if(ncol(bed.coord) <3){ stop("A bed file must contain at least 3 columns! The format is: chrom, start, end, gname, tid, strand. Columns 4-6 are optional\n") } genome.coord <- data.frame(chrom=chromFormat(bed.coord[, 1]), start=bed.coord[, 2] + 1, end=bed.coord[, 3], gid=NA, gname='N', tid='N', strand='+', byname.uniq=T, bygid.uniq=NA) # Perform sanity check for bed file. if(!all(genome.coord$start <= genome.coord$end)) { stop(sprintf("Sanity check for bed file: %s failed. Bed files are 0-based and right-side open.\n", bed.file)) } # Deal with columns 4-6 (Optional). if(ncol(bed.coord) >=4){ genome.coord$gname <- bed.coord[, 4] } if(ncol(bed.coord) >=5){ genome.coord$tid <- bed.coord[, 5] } if(ncol(bed.coord) >=6){ genome.coord$strand <- bed.coord[, 6] } genome.coord } # Sample a vector but preserve its original sequential order. sampleInSequence <- function(x, samprate) { x[ifelse(runif(length(x)) <= samprate, T, F)] } ValidateGeneSubset <- function(gid.match, tid.match, gname.match) { # To validate that there is no ambiguous hits in gene match. # Args: # XXX.match: positional hits with respect to the gene database. 0 means # no hit. subset.matrix <- rbind(gid.match, tid.match, gname.match) g.valid <- sapply(subset.matrix, function(col.hits) { length(unique(col.hits[col.hits > 0])) <= 1 }) all(g.valid) } for(i in 1:length(uniq.reg)) { ur <- uniq.reg[i] if(reg2plot == "bed") { coord.list[[i]] <- ReadBedCoord(ur) } else if(ur == '-1') { # use whole genome. coord.list[[i]] <- genome.coord[genome.coord$byname.uniq, ] } else { # read gene list from text file. gene.list <- read.table(ur, as.is=T, comment.char='#')$V1 gid.match <- match(gene.list, genome.coord$gid, nomatch=0) gname.match <- match(gene.list, genome.coord$gname, nomatch=0) tid.match <- match(gene.list, genome.coord$tid, nomatch=0) if(!ValidateGeneSubset(gid.match, tid.match, gname.match)) { stop("\nAmbiguous hits in gene database. This shall never happen! Contact ngs.plot maintainers.\n") } subset.idx <- pmax(gid.match, tid.match, gname.match) # subset.idx <- gid.match + tid.match + gname.match subset.idx <- subset.idx[subset.idx != 0] if(length(subset.idx) == 0) { stop("\nGene subset size becomes zero. Are you using the correct database?\n") } coord.list[[i]] <- genome.coord[subset.idx, ] } # Do sampling. if(samprate < 1) { samp.idx <- sampleInSequence(1:nrow(coord.list[[i]]), samprate) coord.list[[i]] <- coord.list[[i]][samp.idx, ] } } # If bed file, set boolean tag for point interval, i.e. interval=1bp. if(reg2plot == "bed") { pint <- all(sapply(coord.list, function(cd) all(cd$start == cd$end))) if(pint) { Labs <- c("Center") } else { Labs <- c("5'End", "3'End") } } # Set boolean tag for large interval display. if(!pint && is.na(lgint)) { if(median(sapply(coord.list, function(cd) { median(cd$end - cd$start + 1) })) > flanksize) { lgint <- 1 } else { lgint <- 0 } } res <- list(coord.list=coord.list, rnaseq.gb=rnaseq.gb, lgint=lgint, reg.list=reg.list, uniq.reg=uniq.reg, pint=pint, Labs=Labs) # Add exon models to the result if RNA-seq. if(rnaseq.gb) { em.load <- file.path(prefix, paste(genome, database, 'exonmodel.RData', sep='.')) load(em.load) # load exon models into variable: exonmodel res$exonmodel <- exonmodel } res }