Mercurial > repos > zzhou > spp_phantompeak
changeset 14:918fecc1e7bb draft
Deleted selected files
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:15:02 -0500 |
parents | 87a0c5397b55 |
children | e689b83b0257 |
files | spp/DESCRIPTION spp/NAMESPACE spp/R/zroutines.R spp/configure spp/configure.ac spp/man/add.broad.peak.regions.Rd spp/man/find.binding.positions.Rd spp/man/get.binding.characteristics.Rd spp/man/get.broad.enrichment.clusters.Rd spp/man/get.conservative.fold.enrichment.profile.Rd spp/man/get.mser.Rd spp/man/get.mser.interpolation.Rd spp/man/get.smoothed.enrichment.mle.Rd spp/man/get.smoothed.tag.density.Rd spp/man/output.binding.results.Rd spp/man/read.bam.tags.Rd spp/man/read.bin.maqmap.tags.Rd spp/man/read.bowtie.tags.Rd spp/man/read.eland.tags.Rd spp/man/read.maqmap.tags.Rd spp/man/read.meland.tags.Rd spp/man/remove.local.tag.anomalies.Rd spp/man/select.informative.tags.Rd spp/man/spp-package.Rd spp/man/write.broadpeak.info.Rd spp/man/write.narrowpeak.binding.Rd spp/man/writewig.Rd spp/src/BGZF.cpp spp/src/BGZF.h spp/src/BamAlignment.cpp spp/src/BamAlignment.h spp/src/BamAux.h spp/src/BamIndex.cpp spp/src/BamIndex.h spp/src/BamMultiReader.cpp spp/src/BamMultiReader.h spp/src/BamReader.cpp spp/src/BamReader.h spp/src/BamReader_p.cpp spp/src/BamReader_p.h spp/src/BamStandardIndex_p.cpp spp/src/BamStandardIndex_p.h spp/src/BamToolsIndex_p.cpp spp/src/BamToolsIndex_p.h spp/src/BamWriter.cpp spp/src/BamWriter.h spp/src/BamWriter_p.cpp spp/src/BamWriter_p.h spp/src/Makevars.in spp/src/api_global.h spp/src/bamread.cpp spp/src/bamtools_global.h spp/src/bed2vector.cpp spp/src/cdensum.c spp/src/const.h spp/src/maqmap.c spp/src/maqmap.h spp/src/maqread.cpp spp/src/pc.h spp/src/peaks.cpp spp/src/wdl.cpp |
diffstat | 61 files changed, 0 insertions(+), 18612 deletions(-) [+] |
line wrap: on
line diff
--- a/spp/DESCRIPTION Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,12 +0,0 @@ -Package: spp -Type: Package -Title: some description -Version: 1.0 -Date: 2008-11-10 -Author: Peter K -Depends: caTools -Maintainer: peterK<peterk@compbio.med.harvard.edu> -Description: Describe the package -License: GPL-2 -LazyLoad: yes -Packaged: Wed Nov 12 10:42:54 2008; vidhuch
--- a/spp/NAMESPACE Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -useDynLib(spp) - -exportPattern("^[^\\.]")
--- a/spp/R/zroutines.R Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2501 +0,0 @@ -#library(caTools) -#dyn.load("src/bed2vector.so"); -#dyn.load("src/wdl.so"); -#dyn.load("src/peaks.so"); -#dyn.load("src/cdensum.so"); - - -# -------- ROUTINES FOR READING IN THE DATA FILES ------------ -# fix.chromosome.names : remove ".fa" suffix from match sequence names -read.eland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T,max.eland.tag.length=-1,extended=F,multi=F) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - storage.mode(max.eland.tag.length) <- "integer"; - callfunction <- "read_eland"; - if(extended) { callfunction <- "read_eland_extended"; }; - if(multi) { callfunction <- "read_eland_multi"; }; - tl <- lapply(.Call(callfunction,filename,rtn,max.eland.tag.length),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); - } -} - -read.tagalign.tags <- function(filename,fix.chromosome.names=T,fix.quality=T) { - tl <- lapply(.Call("read_tagalign",filename),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - #if(fix.quality) { - # d$n <- 4-cut(d$n,breaks=c(0,250,500,750,1000),labels=F) - #} - if(fix.quality) { # Anshul: changed the way the quality field is processed - if (min(d$n)<0.5){ - d$n = ceiling(1000/4^d$n); - } - break.vals <- unique(sort(c(0,unique(d$n)))); - d$n <- length(break.vals)-1-cut(d$n,breaks=break.vals,labels=F); - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- 
gsub("\\.fa","",names(tl)) - } - # separate tags and quality - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); -} - - -read.short.arachne.tags <- function(filename,fix.chromosome.names=F) { - tl <- lapply(.Call("read_arachne",filename),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); -} - - -read.arachne.tags <- function(filename,fix.chromosome.names=F) { - tl <- lapply(.Call("read_arachne_long",filename),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - d$l <- d$l[xo]; - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l))); -} - -read.bowtie.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_bowtie",filename,rtn),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); - } -} - -read.bam.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_bam",filename,rtn),function(d) { - xo <- 
order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); - } -} - - -read.helicos.tags <- function(filename,read.tag.names=F,fix.chromosome.names=F,include.length.info=T) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_helicostabf",filename,rtn),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - d$l <- d$l[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),length=lapply(tl,function(d) d$l))); - } -} - -read.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_maqmap",filename,rtn),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) 
d$t),quality=lapply(tl,function(d) d$n))); - } -} - - -read.bin.maqmap.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_binmaqmap",filename,rtn),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n),names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=lapply(tl,function(d) d$n))); - } -} - - -# read in tags from an extended eland format with match length information -read.meland.tags <- function(filename,read.tag.names=F,fix.chromosome.names=T) { - if(read.tag.names) { rtn <- as.integer(1); } else { rtn <- as.integer(0); }; - tl <- lapply(.Call("read_meland",filename,rtn),function(d) { - xo <- order(abs(d$t)); - d$t <- d$t[xo]; - d$n <- d$n[xo]; - d$l <- d$l[xo]; - if(read.tag.names) { - d$s <- d$s[xo]; - } - return(d); - }); - - if(fix.chromosome.names) { - # remove ".fa" - names(tl) <- gsub("\\.fa","",names(tl)) - } - # separate tags and quality - chrl <- names(tl); names(chrl) <- chrl; - # reformulate quality scores into monotonic integers - ml <- max(unlist(lapply(tl,function(d) max(d$l)))); - qual <- lapply(chrl,function(chr) (ml-tl[[chr]]$l)+tl[[chr]]$n/10); - if(read.tag.names) { - return(list(tags=lapply(tl,function(d) d$t),quality=qual,names=lapply(tl,function(d) d$s))); - } else { - return(list(tags=lapply(tl,function(d) d$t),quality=qual)); - } -} - -# -------- ROUTINES FOR ASSESSING BINDING PATTERN AND SELECTING INFORMATIVE TAGS ------------ - -# removes tag positions that have anomalously high counts on both strands -# z - z-score used to determine anomalous bins -# zo - z 
used to filter out one-strand matches -# trim.fraction - fraction of top bins to discard when calculating overall background density -remove.tag.anomalies <- function(data, bin=1,trim.fraction=1e-3,z=5,zo=3*z) { - - t.remove.tag.anomalies <- function(tv,bin=1,trim.fraction=1e-3,z=5,zo=3*z,return.indecies=F) { - tt <- table(floor(tv/bin)); - - # trim value - stt <- sort(as.numeric(tt)); - stt <- stt[1:(length(stt)*(1-trim.fraction))]; - mtc <- mean(stt); tcd <- sqrt(var(stt)); - - thr <- max(1,ceiling(mtc+z*tcd)); - thr.o <- max(1,ceiling(mtc+zo*tcd)); - # filter tt - tt <- tt[tt>=thr] - # get + and - tags - tp <- as.numeric(names(tt)); - pti <- tp>0; - it <- intersect(tp[pti],(-1)*tp[!pti]); - # add one-strand matches - it <- unique(c(it,tp[tt>=thr.o])); - sit <- c(it,(-1)*it); - - if(bin>1) { - sit <- sit*bin; - sit <- c(sit,unlist(lapply(1:bin,function(i) sit+i))) - } - if(return.indecies) { - return(!tv %in% sit); - } else { - return(tv[!tv %in% sit]); - } - } - - vil <- lapply(data$tags,t.remove.tag.anomalies,return.indecies=T,bin=bin,trim.fraction=trim.fraction,z=z,zo=zo); - chrl <- names(data$tags); names(chrl) <- chrl; - data$tags <- lapply(chrl,function(chr) data$tags[[chr]][vil[[chr]]]); - # count tags to remove empty chromosomes - nt <- unlist(lapply(data$tags,length)); - if(any(nt==0)) { - data$tags <- data$tags[nt!=0] - } - - if(!is.null(data$quality)) { - data$quality <- lapply(chrl,function(chr) data$quality[[chr]][vil[[chr]]]); - data$quality <- data$quality[nt!=0]; - } - if(!is.null(data$names)) { - data$names <- lapply(chrl,function(chr) data$names[[chr]][vil[[chr]]]); - data$names <- data$names[nt!=0]; - } - - return(data); -} - -# caps or removes tag positions that are significantly higher than local background -remove.local.tag.anomalies <- function(tags,window.size=200,eliminate.fold=10,cap.fold=4,z.threshold=3) { - lapply(tags,filter.singular.positions.by.local.density,window.size=2e2,eliminate.fold=10,cap.fold=4,z.threshold=3); -} - - - -# 
assess strand cross-correlation, determine peak position, determine appropriate window size -# for binding detection. -get.binding.characteristics <- function(data,srange=c(50,500),bin=5,cluster=NULL,debug=F,min.tag.count=1e3,acceptance.z.score=3,remove.tag.anomalies=T,anomalies.z=5,accept.all.tags=F) { - if(remove.tag.anomalies) { - data <- remove.tag.anomalies(data,z=anomalies.z); - } - - # take highest quality tag bin - if(!is.null(data$quality) & !accept.all.tags) { - min.bin <- min(unlist(lapply(data$quality,min))) - chrl <- names(data$tags); names(chrl) <- chrl; - otl <- lapply(chrl,function(chr) data$tags[[chr]][data$quality[[chr]]==min.bin]); - } else { - otl <- data$tags; - } - # remove empty chromosomes - otl <- otl[unlist(lapply(otl,length))!=0]; - - - # calculate strand scc - if(!is.null(cluster)) { - cc <- clusterApplyLB(cluster,otl,tag.scc,srange=srange,bin=bin); - names(cc) <- names(otl); - } else { - cc <- lapply(otl,tag.scc,srange=srange,bin=bin); - } - ccl<-list(sample=cc); - ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,return.ac=T,ttl=list(sample=otl),plot=F)[[1]] - ccl.av <- data.frame(x=as.numeric(names(ccl.av)),y=as.numeric(ccl.av)); - - # find peak - pi <- which.max(ccl.av$y); - - # determine width at third-height - th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/3+ccl.av$y[length(ccl.av$y)] - whs <- max(ccl.av$x[ccl.av$y>=th]); - - if (! 
is.integer(whs)) { # Anshul: added this to avoid situations where whs ends up being -Inf - whs <- ccl.av$x[ min(c(2*pi,length(ccl.av$y))) ] - } - - # determine acceptance of different quality bins - - # calculates tag scc for the best tags, and combinations of best tag category with every other category - # for subsequent selection of acceptable categories - scc.acceptance.calc <- function() { - - qr <- range(unlist(lapply(data$quality,range))) - - # start with best tags - - # determine half-width for scc calculations - pi <- which.max(ccl.av$y); - - # determine width at half-height - th <- (ccl.av$y[pi]-ccl.av$y[length(ccl.av$y)])/2+ccl.av$y[length(ccl.av$y)] - lwhs <- max(ccl.av$x[ccl.av$y>=th])-ccl.av$x[pi]; - lwhs <- max(c(20,bin*10,lwhs)); - srange <- ccl.av$x[pi]+c(-lwhs,lwhs) - - # calculate chromosome-average scc - t.scc <- function(tags) { - if(is.null(cluster)) { - cc <- lapply(tags,tag.scc,srange=srange,bin=bin); - } else { - cc <- clusterApplyLB(cluster,tags,tag.scc,srange=srange,bin=bin); names(cc) <- names(tags); - } - return(t.plotavcc(1,type='l',ccl=list(cc),ttl=list(tags),plot=F,return.ac=T)) - } - - - # returns info list for a given tag length (lv), mismatch count (nv) - t.cat <- function(qual) { - # construct tag set - if(qual==qr[1]) { - ts <- otl; - } else { - nts <- names(otl); names(nts) <- nts; - # select tags - at <- lapply(nts,function(chr) data$tags[[chr]][data$quality[[chr]]==qual]); - ntags <- sum(unlist(lapply(at,length))); - if(ntags<min.tag.count) { return(NULL); } - - # append to otl - ts <- lapply(nts,function(nam) c(otl[[nam]],at[[nam]])); - } - - return(t.scc(ts)); - } - - - # calculate cross-correlation values for each quality bin - ql <- sort(unique(unlist(lapply(data$quality,unique)))); names(ql) <- ql; - - qccl <- lapply(ql,t.cat); - - # acceptance tests - ac <- c(T,unlist(lapply(qccl[-1],function(d) if(is.null(d)) { return(F) } else { 
t.test(d-qccl[[as.character(min.bin)]],alternative="greater")$p.value<pnorm(acceptance.z.score,lower.tail=F) }))); names(ac) <- names(qccl); - return(list(informative.bins=ac,quality.cc=qccl)) - } - - if(accept.all.tags | is.null(data$quality)) { - return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs)) - } else { - acc <- scc.acceptance.calc(); - return(list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[pi],y=ccl.av$y[pi]),whs=whs,quality.bin.acceptance=acc)); - } - -} - - -# select a set of informative tags based on the pre-calculated binding characteristics -select.informative.tags <- function(data,binding.characteristics=NULL) { - if(is.null(binding.characteristics)) { - return(data$tags); - } - if(is.null(binding.characteristics$quality.bin.acceptance)) { - cat("binding characteristics doesn't contain quality selection info, accepting all tags\n"); - return(data$tags); - } - - ib <- binding.characteristics$quality.bin.acceptance$informative.bins; - abn <- names(ib)[ib] - - chrl <- names(data$tags); names(chrl) <- chrl; - lapply(chrl,function(chr) { - data$tags[[chr]][as.character(data$quality[[chr]]) %in% abn] - }) -} - -# -------- ROUTINES FOR CALLING BINDING POSITIONS ------------ - -# determine binding positions -# signal.data - IP tag lists -# control.data - input tag lists -# e.value - desired E-value threshold (either E-value or FDR threshold must be provided) -# fdr - desired FDR threshold -# min.dist - minimal distance between detected positions -# tag.count.whs - size of the window to be used to estimate confidence interval of the peak fold enrichment ratios -# enrichmnent.z - Z-score defining the desired confidence level for enrichment interval estimates -# enrichment.background.scales - define how many tiems larger should be the window for estimating background -# tag density when evaluating peak enrichment confidence intervals. 
-# If multiple values are given, multiple independent interval estimates will be -# calculated. -# tec.filter - whether to mask out the regions that exhibit significant background enrichment -# tec.window.size, tec.z - window size and Z-score for maksing out significant background enrichment regions -# -# If the control.data is not provided, the method will assess significance of the determined binding positions -# based on the randomizations of the original data. The following paramters control such randomizations: -# n.randomizations - number of randomizations to be performed -# shuffle.window - size of the bin that defines the tags that are kept together during randomization. -# value of 0 means that all tags are shuffled independently -# -# Binding detection methods: -# tag.wtd - default method. -# must specify parameter "whs", which is the half-size of the window used to calculate binding scores -# tag.lwcc - LWCC method; -# must specify whs - a size of the window used to calculate binding scores -# can specify isize (default=15bp) - size of the internal window that is masked out -find.binding.positions <- function(signal.data,f=1,e.value=NULL,fdr=NULL, masked.data=NULL,control.data=NULL,whs=200,min.dist=200,window.size=4e7,cluster=NULL,debug=T,n.randomizations=3,shuffle.window=1,min.thr=2,topN=NULL, tag.count.whs=100, enrichment.z=2, method=tag.wtd, tec.filter=T,tec.window.size=1e4,tec.z=5,tec.masking.window.size=tec.window.size, tec.poisson.z=5,tec.poisson.ratio=5, tec=NULL, n.control.samples=1, enrichment.scale.down.control=F, enrichment.background.scales=c(1,5,10), use.randomized.controls=F, background.density.scaling=T, mle.filter=F, min.mle.threshold=1, ...) { - - if(f<1) { - if(debug) { cat("subsampling signal ... 
"); } - signal.data <- lapply(signal.data,function(x) sample(x,length(x)*f)) - if(debug) { cat("done\n"); } - } - - - if(!is.null(control.data) & !use.randomized.controls) { - # limit both control and signal data to a common set of chromosomes - chrl <- intersect(names(signal.data),names(control.data)); - signal.data <- signal.data[chrl]; - control.data <- control.data[chrl]; - control <- list(control.data); - } else { - control <- NULL; - } - - prd <- lwcc.prediction(signal.data,min.dist=min.dist,whs=whs,window.size=window.size,e.value=e.value,fdr=fdr,debug=debug,n.randomizations=n.randomizations,shuffle.window=shuffle.window,min.thr=min.thr,cluster=cluster,method=method,bg.tl=control.data,mask.tl=masked.data, topN=topN, control=control,tec.filter=tec.filter,tec.z=tec.z,tec.window.size=tec.window.size, tec.masking.window.size=tec.masking.window.size, tec.poisson.z=tec.poisson.z,tec.poisson.ratio=tec.poisson.ratio, background.density.scaling=background.density.scaling, ...); - - # add tag counts - chrl <- names(prd$npl); names(chrl) <- chrl; - prd$npl <- lapply(chrl,function(chr) { - pd <- prd$npl[[chr]]; - pd$nt <- points.within(abs(signal.data[[chr]]),pd$x-tag.count.whs,pd$x+tag.count.whs,return.point.counts=T); - return(pd); - }); - prd$f <- f; - prd$n <- sum(unlist(lapply(signal.data,length))); - if(!is.null(control.data)) { - prd$n.bg <- sum(unlist(lapply(control.data,length))); - } - - # calculate enrichment ratios - prd <- calculate.enrichment.estimates(prd,signal.data,control.data=control.data,fraction=1,tag.count.whs=tag.count.whs,z=enrichment.z,scale.down.control=enrichment.scale.down.control,background.scales=enrichment.background.scales); - - if(mle.filter) { - if(!is.null(prd$npl)) { - if(length(prd$npl)>1) { - mle.columns <- grep("enr.mle",colnames(prd$npl[[1]])); - if(length(mle.columns)>1) { - prd$npl <- lapply(prd$npl,function(d) d[apply(d[,mle.columns],1,function(x) all(x>min.mle.threshold)),]) - } - } - } - } - - prd$whs <- whs; - - return(prd); 
-} - - - -# -------- ROUTINES FOR WRITING OUT TAG DENSITY AND ENRICHMENT PROFILES ------------ -# calculate smoothed tag density, optionally subtracting the background -get.smoothed.tag.density <- function(signal.tags,control.tags=NULL,bandwidth=150,bg.weight=NULL,tag.shift=146/2,step=round(bandwidth/3),background.density.scaling=T,rngl=NULL,scale.by.dataset.size=F) { - chrl <- names(signal.tags); names(chrl) <- chrl; - - if(!is.null(control.tags)) { - bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling); - } - - if(scale.by.dataset.size) { - den.scaling <- dataset.density.size(signal.tags,background.density.scaling=background.density.scaling)/1e6; - } else { - den.scaling <- 1; - } - - lapply(chrl,function(chr) { - ad <- abs(signal.tags[[chr]]+tag.shift); - rng <- NULL; - if(!is.null(rngl)) { - rng <- rngl[[chr]]; - } - if(is.null(rng)) { - rng <- range(ad); - } - - ds <- densum(ad,bw=bandwidth,from=rng[1],to=rng[2],return.x=T,step=step); - if(!is.null(control.tags)) { - if(!is.null(control.tags[[chr]])) { - bsd <- densum(abs(control.tags[[chr]]+tag.shift),bw=bandwidth,from=rng[1],to=rng[2],return.x=F,step=step); - ds$y <- ds$y-bsd*bg.weight; - } - } - return(data.frame(x=seq(ds$x[1],ds$x[2],by=step),y=den.scaling*ds$y)) - }) -} - -# get smoothed maximum likelihood estimate of the log2 signal to control enrichment ratio -get.smoothed.enrichment.mle <- function(signal.tags, control.tags, tag.shift=146/2, background.density.scaling=F, pseudocount=1,bg.weight=NULL, ... 
) { - # determine common range - chrl <- intersect(names(signal.tags),names(control.tags)); names(chrl) <- chrl; - rngl <- lapply(chrl,function(chr) range(c(range(abs(signal.tags[[chr]]+tag.shift)),range(abs(control.tags[[chr]]+tag.shift))))) - ssd <- get.smoothed.tag.density(signal.tags, rngl=rngl, ..., scale.by.dataset.size=F) - csd <- get.smoothed.tag.density(control.tags, rngl=rngl, ..., scale.by.dataset.size=F) - if(is.null(bg.weight)) { - bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling); - } - cmle <- lapply(chrl,function(chr) { d <- ssd[[chr]]; d$y <- log2(d$y+pseudocount) - log2(csd[[chr]]$y+pseudocount) - log2(bg.weight); return(d); }) -} - - -# returns a conservative upper/lower bound profile (log2) given signal tag list, background tag list and window scales -get.conservative.fold.enrichment.profile <- function(ftl,btl,fws,bwsl=c(1,5,25,50)*fws,step=50,tag.shift=146/2,alpha=0.05,use.most.informative.scale=F,quick.calculation=T,background.density.scaling=T,bg.weight=NULL,posl=NULL,return.mle=F) { - # include only chromosomes with more than 2 reads - ftl <- ftl[unlist(lapply(ftl,length))>2] - chrl <- names(ftl); names(chrl) <- chrl; - if(!is.null(posl)) { - chrl <- chrl[chrl %in% names(posl)]; - } - # calculate background tag ratio - if(is.null(bg.weight)) { - bg.weight <- dataset.density.ratio(ftl,btl,background.density.scaling=background.density.scaling); - } - lapply(chrl,function(chr) { - if(is.null(btl[[chr]])) { bt <- c(); } else { bt <- abs(btl[[chr]]+tag.shift); } - if(is.null(posl)) { - x <- mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha); - } else { - x <- 
mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha,pos=posl[[chr]]); - } - # compose profile showing lower bound for enriched, upper bound for depleted regions - ps <- rep(1,length(x$mle)); - vi <- which(!is.na(x$lb) & x$lb>1); - ps[vi] <- x$lb[vi]; - vi <- which(!is.na(x$ub) & x$ub<1); - ps[vi] <- x$ub[vi]; - ps <- log2(ps); - if(is.null(posl)) { - if(return.mle) { - return(data.frame(x=seq(x$x$s,x$x$e,by=x$x$step),y=ps,mle=log2(x$mle),lb=log2(x$lb),ub=log2(x$ub))); - } else { - return(data.frame(x=seq(x$x$s,x$x$e,by=x$x$step),y=ps)); - } - } else { - if(return.mle) { - return(data.frame(x=posl[[chr]],y=ps,mle=log2(x$mle),lb=log2(x$lb),ub=log2(x$ub))); - } else { - return(data.frame(x=posl[[chr]],y=ps)); - } - } - }) -} - - -# write a per-chromosome $x/$y data structure into a wig file -writewig <- function(dat,fname,feature,threshold=5,zip=F) { - chrl <- names(dat); names(chrl) <- chrl; - invisible(lapply(chrl,function(chr) { - bdiff <- dat[[chr]]; - ind <- seq(1,length(bdiff$x)); - ind <- ind[!is.na(bdiff$y[ind])]; - header <- chr==chrl[1]; - write.probe.wig(chr,bdiff$x[ind],bdiff$y[ind],fname,append=!header,feature=feature,header=header); - })) - if(zip) { - zf <- paste(fname,"zip",sep="."); - system(paste("zip \"",zf,"\" \"",fname,"\"",sep="")); - system(paste("rm \"",fname,"\"",sep="")); - return(zf); - } else { - return(fname); - } -} - - - -# -------- ROUTINES FOR ANALYZING SATURATION PROPERTIES ------------ - -# PUBLIC -# calculate minimal saturation enrichment ratios (MSER) -get.mser <- function(signal.data,control.data,n.chains=5,step.size=1e5, chains=NULL, cluster=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), n.steps=1, ...) 
{ - if(is.null(chains)) { - ci <- c(1:n.chains); names(ci) <- ci; - if(is.null(cluster)) { - chains <- lapply(ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...); - } else { - chains <- clusterApplyLB(cluster,ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...); - names(chains) <- ci; - } - } - cvl <- mser.chain.interpolation(chains=chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=F); - if(n.steps>1) { - msers <- cvl; - } else { - msers <- unlist(lapply(cvl,function(d) d$me)) - } - if(return.chains) { - return(list(mser=msers,chains=chains)); - } else { - return(msers); - } -} - -# PUBLIC -# interpolate MSER dependency on tag counts -get.mser.interpolation <- function(signal.data,control.data,target.fold.enrichment=5,n.chains=10,n.steps=6,step.size=1e5, chains=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), excluded.steps=c(seq(2,n.steps-2)), ...) { - msers <- get.mser(signal.data,control.data,n.chains=n.chains,n.steps=n.steps,step.size=step.size,chains=chains,test.agrement=test.agreement,return.chains=T,enrichment.background.scales=enrichment.background.scales,excluded.steps=excluded.steps, ...); - - # adjust sizes in case a subset of chromosomes was used - mser <- mser.chain.interpolation(chains=msers$chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=T); - sr <- sum(unlist(lapply(signal.data,length)))/mser[[1]][[1]]$n[1]; - - # Subsampling each chain requires removing a fraction of each chromosome's - # tag list. To get the exact step.size, this often leaves chromosomes with - # a non-integer number of tags. 
The non-integer values are floored, so each - # chr can contribute at most 0.999.. <= 1 error to the step.size. - floor.error <- length(msers$chains[[1]][[1]]$npl) - intpn <- lapply(mser,function(ms) { - lmvo <- do.call(rbind,ms) - lmvo$n <- lmvo$n*sr; - # Don't select rows corresponding to excluded.steps - # Keep in mind that nd values are negative. - lmvo <- lmvo[lmvo$nd <= (lmvo$nd[1] + floor.error) & lmvo$nd >= (lmvo$nd[1] - floor.error),]; - lmvo <- na.omit(lmvo); - if(any(lmvo$me==1)) { - return(list(prediction=NA)); - } - lmvo$n <- log10(lmvo$n); lmvo$me <- log10(lmvo$me-1) - # remove non-standard steps - emvf <- lm(me ~ n,data=lmvo); - tfe <- (log10(target.fold.enrichment-1)-coef(emvf)[[1]])/coef(emvf)[[2]]; - tfen <- 10^tfe; - return(list(prediction=tfen,log10.fit=emvf)); - }) - - if(return.chains) { - return(list(interpolation=intpn,chains=msers$chains)) - } else { - return(intpn); - } - - return(msers); - -} - - -# output binding detection results to a text file -# the file will contain a table with each row corresponding -# to a detected position, with the following columns: -# chr - chromosome or target sequence -# pos - position of detected binding site on the chromosome/sequence -# score - a score reflecting magnitude of the binding -# Evalue - E-value corresponding to the peak magnitude -# FDR - FDR corresponding to the peak magnitude -# enrichment.lb - lower bound of the fold-enrichment ratio -# enrichment.mle - maximum likelihood estimate of the fold-enrichment ratio -output.binding.results <- function(results,filename) { - write(file=filename,"chr\tpos\tscore\tEvalue\tFDR\tenrichment.lb\tenrichment.mle",append=F); - chrl <- names(results$npl); names(chrl) <- chrl; - x <- lapply(chrl,function(chr) { - d <- results$npl[[chr]]; - if(dim(d)[1]>0) { - if(results$thr$type=="topN") { - od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,enr,enr.mle))) - } else { - od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,evalue,fdr,enr,enr.mle))) - } - 
write.table(od,file=filename,col.names=F,row.names=F,sep="\t",append=T,quote=F) - } - }) -} - - -# -------- LOW-LEVEL ROUTINES ------------ - -# calculates tag strand cross-correlation for a range of shifts (on positive strand) -tag.scc <- function(tags,srange=c(50,250),bin=1,tt=NULL,llim=10) { - if(is.null(tt)) { - tt <- table(sign(tags)*as.integer(floor(abs(tags)/bin+0.5))); - } - if(!is.null(llim)) { l <- mean(tt); tt <- tt[tt<llim*l] } - tc <- as.integer(names(tt)); - tt <- as.numeric(tt); - - pv <- tt; pv[tc<0]<-0; - nv <- tt; nv[tc>0]<-0; - - pti <- which(tc>0) - nti <- which(tc<0); - - ptc <- tc[pti]; - ntc <- (-1)*tc[nti]; - - ptv <- tt[pti]; - ntv <- tt[nti]; - - trng <- range(c(range(ptc),range(ntc))) - l <- diff(trng)+1; - rm(tc,tt); - - mp <- sum(ptv)*bin/l; mn <- sum(ntv)*bin/l; - ptv <- ptv-mp; ntv <- ntv-mn; - ss <- sqrt((sum(ptv*ptv)+(l-length(ptv))*mp^2) * (sum(ntv*ntv)+(l-length(ntv))*mn^2)); - - t.cor <- function(s) { - smi <- match(ptc+s,ntc); - return((sum(ptv[!is.na(smi)]*ntv[na.omit(smi)]) - - mn*sum(ptv[is.na(smi)]) - - mp*sum(ntv[-na.omit(smi)]) + - mp*mn*(l-length(ptv)-length(ntv)+length(which(!is.na(smi)))))/ss); - } - shifts <- floor(seq(srange[1],srange[2],by=bin)/bin+0.5); - scc <- unlist(lapply(shifts,t.cor)); names(scc) <- shifts*bin; - return(scc); -} - - -# plot tag cross-correlation -t.plotcc <- function(ac, lab=c(10,5,7), ylab="correlation", xlab="lag", pch=19, grid.i=c(-5:5), grid.s=10, type='b', plot.grid=F, cols=c(1,2,4,"orange",8,"pink"), min.peak.x=NULL, xlim=NULL, plot.147=F, plot.max=T, rmw=1, rescale=F, legendx="right", ltys=rep(1,length(ac)), ...) 
{ - if(is.list(ac)) { - cols <- cols[1:length(ac)]; - - if(!is.null(xlim)) { - vx <- as.numeric(names(ac[[1]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]); - ac[[1]] <- (ac[[1]])[vx]; - } else { - xlim <- range(as.numeric(names(ac[[1]]))); - } - - - plot(as.numeric(names(ac[[1]])),runmean(ac[[1]],rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, col=cols[1], xlim=xlim, lty=ltys[1], ...); - if(length(ac)>1) { - for(i in seq(2,length(ac))) { - irng <- range(ac[[i]]); - vx <- as.numeric(names(ac[[i]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]); - if(rescale) { - lines(as.numeric(names(ac[[i]])[vx]),runmean((ac[[i]][vx]-irng[1])/diff(irng)*diff(range(ac[[1]]))+min(ac[[1]]),rmw),col=cols[i],lty=ltys[i]); - } else { - lines(as.numeric(names(ac[[i]]))[vx],runmean(ac[[i]][vx],rmw),col=cols[i],lty=ltys[i]); - } - } - } - if(is.null(min.peak.x)) { - m <- as.numeric(names(ac[[1]])[which.max(ac[[1]])]); - } else { - sac <- (ac[[1]])[which(as.numeric(names(ac[[1]]))>min.peak.x)] - m <- as.numeric(names(sac)[which.max(sac)]); - } - legend(x="topright",bty="n",legend=c(names(ac)),col=cols,lty=ltys) - } else { - if(!is.null(xlim)) { - vx <- as.numeric(names(ac)); - vx <- which(vx>=xlim[1] & vx<=xlim[2]); - ac <- ac[vx]; - } else { - xlim <- range(as.numeric(names(ac))); - } - - plot(names(ac),runmean(ac,rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, xlim=xlim, ...); - if(is.null(min.peak.x)) { - m <- as.numeric(names(ac)[which.max(ac)]); - } else { - sac <- ac[which(names(ac)>min.peak.x)] - m <- as.numeric(names(sac)[which.max(sac)]); - } - } - if(plot.147) { - abline(v=147,lty=2,col=8); - } - if(plot.grid) { - abline(v=m+grid.i*grid.s,lty=3,col="pink"); - } - if(plot.max) { - abline(v=m,lty=2,col=2); - legend(x=legendx,bty="n",legend=c(paste("max at ",m,"bp",sep=""))); - return(m); - } - } - - # plot chromosome-acerage cross-correlation - t.plotavcc <- function(ci, main=paste(ci,"chromosome average"), ccl=tl.cc, return.ac=F, ttl=tl, plot=T, ... 
) {
    cc <- ccl[[ci]];
    if(length(cc)==1) { return(cc[[1]]) };
    if(length(cc)==0) { return(c()) };
    ac <- do.call(rbind,cc);
    # omit NA chromosomes
    ina <- apply(ac,1,function(d) any(is.na(d)));

    tags <- ttl[[ci]];
    avw <- unlist(lapply(tags,length)); avw <- avw/sum(avw);
    ac <- ac[!ina,]; avw <- avw[!ina];
    ac <- apply(ac,2,function(x) sum(x*avw));
    if(plot) {
      m <- t.plotcc(ac, main=main, ...);
      if(!return.ac) { return(m) }
    }
    if(return.ac) { return(ac) }
  }

# Plot the cross-correlation profile of every chromosome of one dataset on a
# grid of panels.
# ci   - name/index of the dataset within ccl
# ncol - number of panel columns
# ccl  - list (per dataset) of per-chromosome cross-correlation vectors
# ...  - passed on to t.plotcc()
# Returns the list of t.plotcc() results, one per chromosome.
# Note: changes par(mfrow, mar, mgp, cex) and does not restore them.
t.plotchrcc <- function(ci,ncol=4, ccl=tl.cc, ... ) {
  cc <- ccl[[ci]];
  # round the row count up: par() truncates a fractional value, which yields
  # too few panels (or an invalid 0 rows when there are fewer than ncol plots)
  nrow <- max(1,ceiling(length(cc)/ncol));
  par(mfrow = c(nrow,ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
  lapply(names(cc),function(ch) { t.plotcc(cc[[ch]],main=paste(ci,": chr",ch,sep=""), ...) })
}

# Plot tag-count-weighted chromosome-average cross-correlation profiles for a
# dataset that carries several profile sets (one curve per set).
# ci   - name/index of the dataset within ccl and rtl
# ccl  - per dataset: list of profile sets, each a per-chromosome list of
#        cross-correlation vectors
# main - plot title
# rtl  - per dataset: per-chromosome tag vectors, used for the weights
# ...  - passed on to t.plotcc()
t.plotavccl <- function(ci, ccl=tl.ccl, main=paste(ci,"chromosome average"), rtl=tl, ... ) {
  cc <- ccl[[ci]];
  chrs <- names(cc[[1]]); names(chrs) <- chrs;
  # chromosome weights proportional to the number of tags
  avw <- unlist(lapply(rtl[[ci]][chrs],length));
  avw <- avw/sum(avw);
  acl <- lapply(cc,function(x) {
    ac <- do.call(rbind,x);
    apply(ac,2,function(v) sum(v*avw))
  });
  t.plotcc(acl, main=main, ...);
}

# Per-chromosome version of t.plotavccl(): one panel per chromosome, each
# showing the curves of all profile sets of dataset ci.
# ci   - name/index of the dataset within ccl
# ccl  - per dataset: list of profile sets (per-chromosome vectors)
# ncol - number of panel columns
# ...  - passed on to t.plotcc()
# Note: changes par(mfrow, mar, mgp, cex) and does not restore them.
t.plotchrccl <- function(ci,ccl=tl.ccl,ncol=4, ... ) {
  # the profile sets of the requested dataset; this lookup was missing, so
  # the function depended on a variable `cc` that it never defined
  cc <- ccl[[ci]];
  chrs <- names(cc[[1]]);
  nrow <- max(1,ceiling(length(chrs)/ncol));
  par(mfrow = c(nrow,ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
  lapply(chrs,function(ch) { t.plotcc(lapply(cc,function(x) x[[ch]]),main=paste(ci,": chr",ch,sep=""), ...) })
}



# Calculate per-chromosome strand cross-correlation for a tag list and plot
# the chromosome average.
# tl      - per-chromosome list of signed tag vectors
# srange  - range of shifts handed to tag.scc() and used as the x-axis limits
# cluster - optional cluster object for clusterApplyLB()
# Returns (invisibly) the value of t.plotavcc() for the dataset.
show.scc <- function(tl,srange,cluster=NULL) {
  if(is.null(cluster)) {
    cc <- lapply(tl,tag.scc,srange=srange);
  } else {
    cc <- clusterApplyLB(cluster,tl,tag.scc,srange=srange);
    names(cc) <- names(tl);
  }
  par(mfrow = c(1,1), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
  ccl <- list(sample=cc);
  ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,xlim=srange,return.ac=F,ttl=list(sample=tl),main="")[[1]];
  invisible(ccl.av)
}

# find regions of significant tag enrichment
# Runs tag.enrichment.clusters() on every chromosome of signal.data against
# the matching chromosome of control.data and widens each reported cluster by
# masking.window.size/2 on both sides.
# bg.weight - background scaling; derived with dataset.density.ratio() when NULL
# Returns (invisibly) a per-chromosome list of cluster tables with $s/$e.
find.significantly.enriched.regions <- function(signal.data,control.data,window.size=500,multiplier=1,z.thr=3,mcs=0,debug=F,background.density.scaling=T,masking.window.size=window.size,poisson.z=0,poisson.ratio=4,either=F,tag.shift=146/2,bg.weight=NULL) {
  if(is.null(bg.weight)) {
    bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling);
  }

  if(debug) {
    cat("bg.weight=",bg.weight,"\n");
  }
  chrl <- names(signal.data); names(chrl) <- chrl;
  pad <- masking.window.size/2;
  tec <- lapply(chrl,function(chr) {
    d <- tag.enrichment.clusters(signal.data[[chr]],control.data[[chr]],bg.weight=bg.weight*multiplier,thr=z.thr,wsize=window.size,mcs=mcs,min.tag.count.z=poisson.z,min.tag.count.ratio=poisson.ratio,either=either,tag.shift=tag.shift);
    d$s <- d$s-pad; d$e <- d$e+pad;
    return(d);
  });
  invisible(tec)
}


# given tag position vectors, find contigs of significant enrichment of signal over background
# thr - z score threshold
# mcs - minimal cluster size
# bg.weight - fraction by which background counts should be multipled
# min.tag.count.z will impose a poisson constraint based on randomized signal in parallel of background constaint (0 - no constraint)
tag.enrichment.clusters <- function(signal,background,wsize=200,thr=3,mcs=1,bg.weight=1,min.tag.count.z=0,tag.av.den=NULL,min.tag.count.thr=0,min.tag.count.ratio=4,either=F,tag.shift=146/2) {
  if(is.null(tag.av.den)) {
    tag.av.den <- 
length(signal)/diff(range(abs(signal))); - } - if(min.tag.count.z>0) { - min.tag.count.thr <- qpois(pnorm(min.tag.count.z,lower.tail=F),min.tag.count.ratio*tag.av.den*wsize,lower.tail=F) - } else { - min.tag.count.thr <- 0; - } - - #if(bg.weight!=1) { - # background <- sample(background,length(background)*(bg.weight),replace=T); - #} - # make up combined position, flag vectors - pv <- abs(c(signal,background)+tag.shift); - fv <- c(rep(1,length(signal)),rep(0,length(background))); - po <- order(pv); - pv <- pv[po]; - fv <- fv[po]; - - #thr <- pnorm(thr,lower.tail=F); - - storage.mode(wsize) <- storage.mode(mcs) <- storage.mode(fv) <- "integer"; - storage.mode(thr) <- storage.mode(pv) <- "double"; - storage.mode(bg.weight) <- "double"; - storage.mode(min.tag.count.thr) <- "double"; - either <- as.integer(either); - storage.mode(either) <- "integer"; - - z <- .Call("find_poisson_enrichment_clusters",pv,fv,wsize,thr,mcs,bg.weight,min.tag.count.thr,either) - return(z); -} - - - - - -# estimates threshold, calculates predictions on complete data and randomized data -# input: tvl -# control - a list of control tag datasets -# no randomization is done if control is supplied -# return.rtp - return randomized tag peaks - do not fit thresholds or do actual predictions -# topN - use min threshold to do a run, return topN peaks from entire genome -# threshold - specify a user-defined threshold -lwcc.prediction <- function(tvl,e.value=NULL, fdr=0.01, chrl=names(tvl), min.thr=0, n.randomizations=1, shuffle.window=1, debug=T, predict.on.random=F, shuffle.both.strands=T,strand.shuffle.only=F, return.rtp=F, control=NULL, print.level=0, threshold=NULL, topN=NULL, bg.tl=NULL, tec.filter=T, tec.window.size=1e3,tec.z=3, tec.masking.window.size=tec.window.size, tec.poisson.z=3,tec.poisson.ratio=4, bg.reverse=T, return.control.predictions=F, return.core.data=F, background.density.scaling=T, ... 
) { - - control.predictions <- NULL; - core.data <- list(); - - if(!is.null(bg.tl) & tec.filter) { - if(debug) { cat("finding background exclusion regions ... "); } - tec <- find.significantly.enriched.regions(bg.tl,tvl,window.size=tec.window.size,z.thr=tec.z,masking.window.size=tec.masking.window.size,poisson.z=tec.poisson.z,poisson.ratio=tec.poisson.ratio,background.density.scaling=background.density.scaling,either=T); - if(return.core.data) { - core.data <- c(core.data,list(tec=tec)); - } - if(debug) { cat("done\n"); } - } - - - if(is.null(threshold) & is.null(topN)) { # threshold determination is needed - # generate control predictions - if(!is.null(control)) { - if(debug) { cat("determining peaks on provided",length(control),"control datasets:\n"); } - if(!is.null(bg.tl)) { - if(bg.reverse) { - if(debug) { cat("using reversed signal for FDR calculations\n"); } - rbg.tl <- tvl; - } else { - if(debug) { cat("generating randomized (within chromosome) background ... "); } - rbg.tl <- lapply(bg.tl,function(d) { - if(length(d)<2) { return(d); } - rng <- range(abs(d)); - rd <- round(runif(length(d),rng[1],rng[2])); - nrd <- sample(1:length(rd),length(which(d<0))); - rd[nrd] <- rd[nrd]*(-1); - return(rd); - }) - if(debug) { cat("done\n"); } - } - } else { - rbg.tl <- NULL; - } - n.randomizations <- length(control); - #signal.size <- sum(unlist(lapply(tvl,length))); - rtp <- lapply(control,function(d) { - # calculate tag.weight - #tag.weight <- sum(unlist(lapply(tvl,length)))/sum(unlist(lapply(d,length))); - tag.weight <- dataset.density.ratio(tvl,d,background.density.scaling=background.density.scaling); - #cat("tag.weight=",tag.weight," "); - return(window.call.mirror.binding(d,min.thr=min.thr, tag.weight=tag.weight,bg.tl=rbg.tl, debug=debug, round.up=T,background.density.scaling=background.density.scaling, ...)); - #return(window.call.mirror.binding(d,min.thr=min.thr, 
method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster)) - }); - if(return.core.data) { - core.data <- c(core.data,list(rtp.unfiltered=rtp)); - } - if(tec.filter) { - if(debug) { cat("excluding systematic background anomalies ... "); } - rtp <- lapply(rtp,filter.binding.sites,tec,exclude=T); - if(debug) { cat("done\n"); } - } - } else { - if(debug) { cat("determining peaks on ",n.randomizations,"randomized datasets:\n"); } - rtp <- lapply(1:n.randomizations,function(i) { - rd <- generate.randomized.data(tvl,shuffle.window=shuffle.window,shuffle.both.strands=shuffle.both.strands,strand.shuffle.only=strand.shuffle.only); - return(window.call.mirror.binding(rd,min.thr=min.thr,bg.tl=bg.tl, debug=debug, ...)); - #return(window.call.mirror.binding(rd,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist)) - }); - } - if(return.control.predictions) { - control.predictions <- rtp; - } - rtp <- do.call(rbind,lapply(rtp,function(d) do.call(rbind,d))); # merge tables - - # generate real data predictions - if(debug) { cat("determining peaks on real data:\n"); } - npl <- window.call.mirror.binding(tvl,min.thr=min.thr,bg.tl=bg.tl, debug=debug, background.density.scaling=background.density.scaling, ...); - #npl <- window.call.mirror.binding(tvl,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster); - if(return.core.data) { - core.data <- c(core.data,list(npl.unfiltered=npl)); - } - - if(!is.null(bg.tl) & tec.filter) { - if(debug) { cat("excluding systematic background anomalies ... 
"); } - npl <- filter.binding.sites(npl,tec,exclude=T); - if(debug) { cat("done\n"); } - } - - # calculate E-value and FDRs for all of the peaks - if(debug) { cat("calculating statistical thresholds\n"); } - chrl <- names(npl); names(chrl) <- chrl; - npld <- do.call(rbind,lapply(names(npl),function(chr) { k <- npl[[chr]]; if(!is.null(k) & dim(k)[1]>0) { k$chr <- rep(chr,dim(k)[1]) }; return(k) })) - npld <- cbind(npld,get.eval.fdr.vectors(npld$y,rtp$y)); - # correct for n.randomizations - npld$fdr <- npld$fdr/n.randomizations; - npld$evalue <- npld$evalue/n.randomizations; - - if(return.core.data) { - core.data <- c(core.data,list(npld=npld)); - } - - # determine actual thresholds - if(is.null(e.value)) { - if(is.null(fdr)) { fdr <- 0.01; } - thr <- list(root=min(npld$y[npld$fdr<=fdr]),type="FDR",fdr=fdr) - if(debug) { cat("FDR",fdr,"threshold=",thr$root,"\n"); } - } else { - # determine threshold based on e-value - thr <- list(root=min(npld$y[npld$evalue<=e.value]),type="Evalue",e.value=e.value) - if(debug) { cat("E-value",e.value,"threshold=",thr$root,"\n"); } - } - - - npld <- npld[npld$y>=thr$root,]; - if(dim(npld)[1]>0) { - npl <- tapply(c(1:dim(npld)[1]),as.factor(npld$chr),function(ii) {df <- npld[ii,]; df$chr <- NULL; return(df) }); - } else { - npl <- list(); - } - } else { - if(is.null(threshold)) { - thr <- list(root=min.thr,type="minimal"); - } else { - thr <- list(root=threshold,type="user specified"); - } - - cat("calling binding positions using",thr$type,"threshold (",thr$root,") :\n"); - npl <- window.call.mirror.binding(tvl=tvl,min.thr=thr$root,bg.tl=bg.tl, debug=debug, ...); - if(!is.null(bg.tl) & tec.filter) { - if(debug) { cat("excluding systematic background anomalies ... 
"); } - npl <- filter.binding.sites(npl,tec,exclude=T); - if(debug) { cat("done\n"); } - } - - if(!is.null(topN)) { - # determine threshold based on topN peaks - ay <- unlist(lapply(npl,function(d) d$y)); - if(length(ay)>topN) { - thr <- list(root=sort(ay,decreasing=T)[topN],type="topN",topN=topN); - cat(paste("determined topN threshold :",thr$root,"\n")); - npl <- lapply(npl,function(d) d[d$y>thr$root,]); - } - } - } - - if(return.core.data) { - return(c(list(npl=npl,thr=thr),core.data)); - } - if(return.control.predictions & !is.null(control.predictions)) { - return(list(npl=npl,thr=thr,control.predictions=control.predictions)); - } - return(list(npl=npl,thr=thr)); -} - -# window tag difference method -wtd <- function(x,y,s,e,whs=200,return.peaks=T,min.thr=5,min.dist=200,step=1,direct.count=F,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=1,mask.x=NULL,mask.y=NULL,ignore.masking=F, bg.whs=whs, round.up=F, ...) { - ignore.masking <- ignore.masking | (is.null(mask.x) & is.null(mask.y)); - if(step>1) { - x <- floor(x/step+0.5); y <- floor(y/step+0.5) - - if(!is.null(bg.x)) { - bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) - } - - if(!is.null(mask.x)) { - mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) - } - - - whs <- floor(whs/step+0.5); - bg.whs <- floor(bg.whs/step+0.5); - min.dist <- floor(min.dist/step +0.5); - s <- floor(s/step+0.5) - e <- floor(e/step+0.5) - } - - # scale bg.weight, since within calculation they are considered independent - bg.weight <- bg.weight*tag.weight; - - rx <- c(s-whs,e+whs); - - # compile tag vectors - xt <- table(x); - xh <- integer(diff(rx)+1); - xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); - - yt <- table(y); - yh <- integer(diff(rx)+1); - yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); - - # compile background vectors - if(!is.null(bg.x) & length(bg.x)>0) { - bg.subtract <- 1; - - bg.xt <- table(bg.x); - bg.xh <- integer(diff(rx)+1); - bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- 
as.integer(bg.xt); - rm(bg.xt); - - bg.yt <- table(bg.y); - bg.yh <- integer(diff(rx)+1); - bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); - rm(bg.yt); - - # adjust bg.weight according to bg.whs - if(bg.whs!=whs) { - bg.weight <- bg.weight*whs/bg.whs; - } - } else { - bg.subtract <- 0; - bg.xh <- bg.yh <- c(); - } - - # record masked positions - if(!ignore.masking) { - if(!is.null(mask.x) & length(mask.x)>0) { - mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); - mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; - xh[mvx-rx[1]+1] <- -1; - } - - if(!is.null(mask.y) & length(mask.y)>0) { - mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); - mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; - yh[mvy-rx[1]+1] <- -1; - } - } - - rm(xt,yt); - - if(round.up) { round.up <- 1; } else { round.up <- 0; } - - storage.mode(xh) <- storage.mode(yh) <- "integer"; - storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; - nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(bg.whs) <- "integer"; - rp <- as.integer(return.peaks); - dcon <- as.integer(direct.count); - storage.mode(rp) <- storage.mode(min.dist) <- "integer"; - storage.mode(min.thr) <- "double"; - storage.mode(dcon) <- "integer"; - storage.mode(tag.weight) <- "double"; - storage.mode(bg.weight) <- "double"; - storage.mode(bg.subtract) <- "integer"; - storage.mode(round.up) <- "integer"; - im <- as.integer(ignore.masking); - storage.mode(im) <- "integer"; - z <- .Call("wtd",xh,yh,whs,rp,min.dist,min.thr,dcon,tag.weight,im,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up); - if(return.peaks) { - return(data.frame(x=(z$x+rx[1])*step,y=z$v)); - } else { - return(list(x=rx*step,y=z)); - } -} - - -tag.wtd <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) 
{ - x <- ctv[ctv>=s & ctv<=e]; - y <- (-1)*ctv[ctv<=-s & ctv>=-e]; - - if(!is.null(bg.ctv)) { - bg.x <- bg.ctv[bg.ctv>=s & bg.ctv<=e]; - bg.y <- (-1)*bg.ctv[bg.ctv<=-s & bg.ctv>=-e]; - } else { - bg.x <- bg.y <- NULL; - } - - if(!is.null(mask.ctv)) { - mask.x <- mask.ctv[mask.ctv>=s & mask.ctv<=e]; - mask.y <- (-1)*mask.ctv[mask.ctv<=-s & mask.ctv>=-e]; - } else { - mask.x <- mask.y <- NULL; - } - - if(length(x)==0 | length(y) ==0) { - if(return.peaks) { - return(data.frame(x=c(),y=c())); - } else { - rx <- range(c(x,y)); - return(list(x=rx,y=numeric(diff(rx)+1))); - } - } else { - return(wtd(x,y,s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...)) - } -} - -# shuffles tags in chromosome blocks of a specified size -# note: all coordinates should be positive -tag.block.shuffle <- function(tags,window.size=100) { - if(length(tags)<3) { - warning("too few tags for shuffling"); - return(tags); - } - rng <- range(tags); - #if(rng[1]<0) { stop("negative tag coordinates found") } - if(diff(rng)<=window.size) { - warning(paste("tag range (",diff(rng),") is smaller than shuffle window size")); - return(tags); - } - - if(window.size==0) { - return(as.integer(runif(length(tags),min=rng[1],max=rng[2]))) - } else if(window.size==1) { - tt <- table(tags); - return(rep(runif(length(tt),min=rng[1],max=rng[2]),as.integer(tt))) - } else { - # block positions - bp <- tags %/% window.size; - # block-relative tag positions - rp <- tags %% window.size; - - # shuffle block positions - bpu <- unique(bp); - rbp <- range(bpu); - bps <- as.integer(runif(length(bpu),min=rbp[1],max=rbp[2])); - bpi <- match(bp,bpu); - sbp <- bps[bpi]; - #sbp <- rbp[1]+match(bp,sample(rbp[1]:rbp[2])) - return(sbp*window.size+rp); - } -} - - -# calculate window cross-correlation -lwcc <- function(x,y,s,e,whs=100,isize=20,return.peaks=T,min.thr=1,min.dist=100,step=1,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=NULL,mask.x=NULL,mask.y=NULL,bg.whs=whs,round.up=F) { - if(step>1) { - x 
<- floor(x/step+0.5); y <- floor(y/step+0.5) - - if(!is.null(bg.x)) { - bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) - } - - if(!is.null(mask.x)) { - mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) - } - - whs <- floor(whs/step+0.5); - bg.whs <- floor(bg.whs/step+0.5); - isize <- floor(isize/step+0.5); - min.dist <- floor(min.dist/step +0.5); - s <- floor(s/step+0.5) - e <- floor(e/step+0.5) - } - - # scale bg.weight, since within calculation they are considered independent - bg.weight <- bg.weight*tag.weight; - - - rx <- c(s-whs,e+whs); - xt <- table(x); - xh <- integer(diff(rx)+1); - xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); - - yt <- table(y); - - yh <- integer(diff(rx)+1); - yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); - - # compile background vectors - if(!is.null(bg.x) & length(bg.x)>0) { - bg.subtract <- 1; - - bg.xt <- table(bg.x); - bg.xh <- integer(diff(rx)+1); - bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt); - rm(bg.xt); - - bg.yt <- table(bg.y); - bg.yh <- integer(diff(rx)+1); - bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); - rm(bg.yt); - - # adjust bg.weight according to bg.whs - bg.weight <- bg.weight*(whs-isize)/bg.whs; - } else { - bg.subtract <- 0; - bg.xh <- bg.yh <- c(); - } - - # record masked positions - if(!is.null(mask.x) & length(mask.x)>0) { - mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); - mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; - - xh[mvx-rx[1]+1] <- -1; - } - - if(!is.null(mask.y) & length(mask.y)>0) { - mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); - mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; - yh[mvy-rx[1]+1] <- -1; - } - - rm(xt,yt); - if(round.up) { round.up <- 1; } else { round.up <- 0; } - - storage.mode(xh) <- storage.mode(yh) <- "integer"; - storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; - nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(isize) <- storage.mode(bg.whs) <- "integer"; - 
rp <- as.integer(return.peaks);
  storage.mode(rp) <- storage.mode(min.dist) <- "integer";
  storage.mode(min.thr) <- "double";
  storage.mode(tag.weight) <- "double";
  storage.mode(bg.weight) <- "double";
  storage.mode(bg.subtract) <- "integer";
  storage.mode(round.up) <- "integer";

  # allocate return arrays
  #cc <- numeric(nx); storage.mode(cc) <- "double";
  z <- .Call("lwcc",xh,yh,whs,isize,rp,min.dist,min.thr,tag.weight,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up);
  if(return.peaks) {
    return(data.frame(x=(z$x+rx[1])*step,y=z$v));
  } else {
    return(list(x=rx*step,y=z));
  }
}


# Run lwcc() on the tags of a signed tag vector that fall into the window
# [s,e].
#
# ctv          - signed tag positions (sign = strand, abs() = coordinate)
# s, e         - window start and end
# return.peaks - passed to lwcc(); selects peak table vs. profile output
# bg.ctv       - optional signed background tag vector
# mask.ctv     - optional signed vector of masked positions
# ...          - passed on to lwcc()
#
# If either strand has no tags inside the window lwcc() is not called: an
# empty data.frame is returned when return.peaks is set, otherwise an
# all-zero profile spanning the range of the tags that are present (or the
# window [s,e] itself when there are none at all).
tag.lwcc <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) {
  # split a signed vector into window-restricted positive-strand (x) and
  # negative-strand (y, sign flipped) coordinates; NULL stays NULL
  split.strands <- function(v) {
    if(is.null(v)) { return(list(x=NULL,y=NULL)) }
    list(x=v[v>=s & v<=e],y=(-1)*v[v<=-s & v>=-e])
  }
  sig <- split.strands(ctv);
  bg <- split.strands(bg.ctv);
  mask <- split.strands(mask.ctv);
  x <- sig$x; y <- sig$y;

  if(length(x)==0 | length(y)==0) {
    if(return.peaks) {
      return(data.frame(x=c(),y=c()));
    }
    # range() of an empty vector is c(Inf,-Inf), which made numeric() fail
    # when neither strand had tags in the window; use the window instead
    rx <- if(length(x)+length(y)>0) range(c(x,y)) else c(s,e);
    return(list(x=rx,y=numeric(diff(rx)+1)));
  }
  return(lwcc(x,y, s,e,return.peaks=return.peaks, bg.x=bg$x,bg.y=bg$y, mask.x=mask$x,mask.y=mask$y, ...))
}

# determine mirror-based binding positions using sliding window along each chromosome
# extra parameters are passed on to call.nucleosomes()
window.call.mirror.binding <- function(tvl,window.size=4e7, debug=T, cluster=NULL, bg.tl=NULL, mask.tl=NULL, background.density.scaling=T, ...) 
{ - chrl <- names(tvl); - # determine bg.weight - if(!is.null(bg.tl)) { - bg.weight <- dataset.density.ratio(tvl,bg.tl,background.density.scaling=background.density.scaling); - } else { - bg.weight <- NULL; - } - if(debug) { - cat("bg.weight=",bg.weight," "); - } - - names(chrl) <- chrl; - - if(is.null(cluster)) { - return(lapply(chrl,function(chr) { - bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; - mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; - - window.chr.call.mirror.binding(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv),window.size=window.size,chr=chr,debug=debug, bg.weight=bg.weight, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ...); - })); - } else { - # add bg.ctv and mask.ctv to parallel call - tvll <- lapply(chrl,function(chr) { - bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; - mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; - return(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv)) - }); - bl <- clusterApplyLB(cluster,tvll,window.chr.call.mirror.binding,window.size=window.size,debug=debug, bg.weight=bg.weight, ...); - names(bl) <- chrl; - return(bl); - } -} - -window.chr.call.mirror.binding <- function(ctvl,window.size,debug=T, chr="NA", cluster=NULL, method=tag.wtd, bg.ctv=NULL, mask.ctv=NULL, ...) { - ctv <- ctvl$ctv; bg.ctv <- ctvl$bg.ctv; mask.ctv <- ctvl$mask.ctv; - if(is.null(ctv)) { return(data.frame(x=c(),y=c())) } - if(length(ctv)<2) { return(data.frame(x=c(),y=c())) } - - dr <- range(unlist(lapply(ctv,function(x) range(abs(x))))) - n.windows <- ceiling(diff(dr)/window.size); - - - pinfo <- c(); - if(debug) { - cat(paste("processing ",chr," in ",n.windows," steps [",sep="")); - } - for(i in 1:n.windows) { - s <- dr[1]+(i-1)*window.size; - npn <- method(s=s, e=s+window.size,ctv=ctv, return.peaks=T, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ... 
);
    if(length(npn) > 0) { pinfo <- rbind(pinfo,npn) }
    if(debug) {
      cat(".");
    }
  }
  if(debug) {
    cat(paste("] done (",dim(pinfo)[1],"positions)\n"));
  } else {
    cat(".");
  }
  return(data.frame(x=pinfo[,1],y=pinfo[,2]));
}

# Generate a randomized copy of a per-chromosome list of signed tag vectors.
#
# data                 - per-chromosome list of signed tag positions
# shuffle.window       - window.size handed to tag.block.shuffle()
# shuffle.both.strands - shuffle negative-strand tags as well; when FALSE
#                        only positive-strand positions are shuffled
# strand.shuffle.only  - keep every position and only draw a random strand
#                        (sign) for each tag
# chrl                 - names of the chromosomes to process
#
# Returns (invisibly, as before) a list with one randomized tag vector per
# selected chromosome; positive-strand tags come first in each vector.
generate.randomized.data <- function(data,shuffle.window=1,shuffle.both.strands=T,strand.shuffle.only=F,chrl=names(data)) {
  selected <- data[unlist(chrl)];
  if(strand.shuffle.only) {
    # shuffle just strand assignment, not tag positions
    rt <- lapply(selected,function(tv) tv*sample(c(-1,1),length(tv),replace=T));
  } else {
    rt <- lapply(selected,function(tv) {
      # Logical mask instead of tv[-which(tv>0)]: with no positive-strand
      # tags the negative index is empty and tv[-integer(0)] selects nothing,
      # which silently dropped every negative-strand tag.
      is.pos <- !is.na(tv) & tv>0;
      ptags <- tag.block.shuffle(tv[is.pos],window.size=shuffle.window);
      ntags <- tv[!is.pos];
      if(shuffle.both.strands) {
        ntags <- tag.block.shuffle(ntags,window.size=shuffle.window);
      }
      c(ptags,ntags)
    });
  }
  invisible(rt)
}

# determine threshold based on E value
# for efficiency chrl should include just one or two small chromosomes
# optional parameters are passed to call.nucleosomes()
determine.lwcc.threshold <- function(tvl,chrl=names(tvl),e.value=100, n.randomizations=1, min.thr=1, debug=F, tol=1e-2, shuffle.window=1, shuffle.both.strands=T, return.rtp=F, control=NULL, strand.shuffle=F, ...) 
{ - names(chrl) <- unlist(chrl); - - # determine fraction of total tags contained in the specified nucleosomes - ntags <- sum(unlist(lapply(tvl,function(cv) length(cv)))); - nctags <- sum(unlist(lapply(chrl, function(cn) length(tvl[[cn]])))); - # calculate actual target E value - if(!is.null(control)) { - n.randomizations <- length(control); - } - eval <- e.value*n.randomizations*nctags/ntags - if(eval<1) { - warning("specified e.value and set of chromosomes results in target e.value of less than 1"); - eval <- 1; - } - - if(debug) { - cat(paste("randomizations =",n.randomizations," chromosomes =",length(chrl),"\n")) - cat(paste("adjusted target eval =",eval,"\ngenerating randomized tag peaks ...")); - } - - # get peaks on randomized tags - if(is.null(control)) { - rtp <- data.frame(do.call(rbind,lapply(1:n.randomizations,function(i) { - if(strand.shuffle) { - # shuffle just strand assignment, not tag positions - rt <- lapply(tvl[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T)); - } else { - if(shuffle.both.strands) { - rt <- lapply(tvl[unlist(chrl)],function(tv) { - pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tag.block.shuffle(tv[-pti],window.size=shuffle.window))) - }); - } else { - rt <- lapply(tvl[unlist(chrl)],function(tv) { pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tv[-pti]))}); - } - } - if(debug) { - cat("."); - } - rl <- window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...); - - return(do.call(rbind,rl)) - #return(do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, whs=100,isize=10,window.size=3e7,min.dist=200))) - }))); - - } else { - if(debug) { - cat(" using provided controls "); - } - rtp <- data.frame(do.call(rbind,lapply(control,function(rt) do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...))))) - } - - if(return.rtp) { - return(rtp) - } - - if(debug) { - cat(" done\nfinding threshold ."); - } - - # 
determine range and starting value - rng <- c(min.thr,max(na.omit(rtp$y))) - - # find E value threshold - count.nucs.f <- function(nthr) { - return(eval-length(which(rtp$y>=nthr))); - } - - # estimate position of the root by downward bisection iterations - mv <- c(eval); mvp <- c(rng[2]); ni <- 1; - max.it <- 2*as.integer(log2(rng[2]/rng[1])+0.5); - while((ni<=max.it) & (mv[1]>=0)) { - np <- mvp[1]/2; - npv <- count.nucs.f(np); - mv <- c(npv,mv); - mvp <- c(np,mvp); - ni <- ni+1; - } - - - if(ni>max.it) { - # determine lowest value - if(debug) { - cat(paste("exceeded max.it (",max.it,"), returning lowest point",signif(mvp[1],4))); - } - return(list(root=mvp[1])) - } else { - rng <- mvp[1:2]; - if(mv[2]==0) rng[2] <- mvp[3]; - if(debug) { - cat(paste("bound to (",signif(rng[1],4),signif(rng[2],4),") ")); - } - } - - # find root on the right side - x <- uniroot(count.nucs.f,rng,tol=tol); - #x$max <- o$par; - #x$f.max <- (-1)*o$value; - if(debug) { - cat(paste(" done (thr=",signif(x$root,4),")\n")); - } - return(x); - -} - - -# determine membership of points in fragments -points.within <- function(x,fs,fe,return.list=F,return.unique=F,sorted=F,return.point.counts=F) { - if(is.null(x) | length(x) < 1) { return(c()) }; - if(!sorted) { - ox <- rank(x,ties="first"); - x <- sort(x); - } - - se <- c(fs,fe); - fi <- seq(1:length(fs)); - fi <- c(fi,-1*fi); - - fi <- fi[order(se)]; - se <- sort(se); - - storage.mode(x) <- storage.mode(fi) <- storage.mode(se) <- "integer"; - if(return.unique) { iu <- 1; } else { iu <- 0; } - if(return.list) { il <- 1; } else { il <- 0; } - if(return.point.counts) { rpc <- 1; } else { rpc <- 0; } - storage.mode(iu) <- storage.mode(il) <- storage.mode(rpc) <- "integer"; - result <- .Call("points_within",x,se,fi,il,iu,rpc); - if(!sorted & !return.point.counts) { - result <- result[ox]; - } - return(result); -} - - -# determine cooridnates of points x relative to signed -# positions pos within size range -get.relative.coordinates <- 
function(x,pos,size,sorted=F) {
  if(!sorted) {
    op <- order(abs(pos));
    x <- sort(x); pos <- pos[op];
  }
  #dyn.load("~/zhao/sc/peaks.so");
  storage.mode(x) <- storage.mode(pos) <- storage.mode(size) <- "integer";
  rf <- .Call("get_relative_coordinates",x,pos,size);
  if(!sorted) {
    rf$i <- op[rf$i];
  } else {
    return(rf$i);
  }
  return(rf);
}

# Given peak magnitudes for signal (x) and control (y), compute for every
# signal peak an E-value and an FDR estimate.
# Returns a data.frame with columns $evalue and $fdr, one row per element of
# x (a data.frame without columns when x is empty). With an empty control
# the E-values are all 0 and the FDRs all 1.
get.eval.fdr.vectors <- function(x,y) {
  n.signal <- length(x);
  n.control <- length(y);
  if(n.signal==0) { return(data.frame(evalue=c(),fdr=c())) }
  if(n.control==0) { return(data.frame(evalue=rep(0,n.signal),fdr=rep(1,n.signal))) }

  signal.cdf <- ecdf(x);
  control.cdf <- ecdf(y);

  # number of control / signal peaks with a magnitude above each x
  control.above <- (1-control.cdf(x))*n.control;
  signal.above <- (1-signal.cdf(x))*n.signal;

  # ratio with pseudo-counts
  fdr <- (control.above+0.5)/(signal.above+0.5);
  # correct for undercounts at the top-scoring peak(s)
  fdr[signal.above==0] <- min(fdr);

  # every peak at or above the smallest magnitude that attains the minimal
  # FDR gets that minimal FDR
  best.fdr <- min(fdr);
  best.x <- min(x[fdr==best.fdr]);
  fdr[x>=best.x] <- best.fdr;

  data.frame(evalue=(control.above+1),fdr=fdr)
}


# Filter predictions by their membership in tag enrichment clusters.
# bd      - per-chromosome list of prediction tables (column $x = position)
# tec     - per-chromosome list of cluster tables with $s / $e columns
# exclude - TRUE: keep calls outside of every cluster;
#           FALSE: keep only calls inside a cluster
# Returns a per-chromosome list of filtered tables (NULL for chromosomes
# without a prediction table).
filter.binding.sites <- function(bd,tec,exclude=F) {
  chrl <- names(bd); names(chrl) <- chrl;
  lapply(chrl,function(chr) {
    calls <- bd[[chr]];
    if(is.null(calls) || length(calls)==0) { return(NULL) };
    if(!(dim(calls)[1]>0)) { return(calls) }

    clusters <- tec[[chr]];
    if(!(length(clusters$s)>0)) {
      # no clusters on this chromosome
      if(exclude) {
        return(calls);
      }
      return(data.frame(x=c(),y=c()));
    }

    # points.within() reports -1 for points outside of all fragments
    membership <- points.within(calls$x,clusters$s,clusters$e);
    if(exclude) {
      keep <- which(membership== -1);
    } else {
      keep <- which(membership> -1);
    }
    return(calls[keep,]);
  });
}


# PUBLIC
# generate predictions on sequential (chained) subsamples of data
# if step.size <1, it is intepreted as a fraciton and a each subsequent subsample
# is of a size (1-fraction.step)*N (N - size of the signal data);
# otherwise the 
step.size is interpreted as a number of tags, and each subsample is of the size N-step.size -get.subsample.chain.calls <- function(signal.data,control.data,n.steps=NULL,step.size=1e6,subsample.control=F,debug=F,min.ntags=1e3, excluded.steps=c(), test.chromosomes=NULL, ... ) { - - if(!is.null(test.chromosomes)) { - # adjust step size - sz <- sum(unlist(lapply(signal.data,length))) - signal.data <- signal.data[test.chromosomes]; - control.data <- control.data[test.chromosomes]; - - if(step.size>1) { - step.size <- step.size*sum(unlist(lapply(signal.data,length)))/sz; - # cat("adjusted step.size=",step.size,"\n"); - } - } - - if(is.null(n.steps)) { - if(step.size<1) { - # down to 10% - n.steps <- log(0.1)/log(step.size); - } else { - n.steps <- floor(sum(unlist(lapply(signal.data,length)))/step.size) - } - } - if(subsample.control & !is.null(control.data)) { - # normalize control to the signal size - if(debug) { cat("pre-subsampling control.\n"); } - bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) - control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight,replace=(bg.weight>1))) - } - calls <- list(); - callnames <- c(); - for(i in 0:n.steps) { - if(debug) { cat("chained subsample step",i,":\n"); } - if(!i %in% excluded.steps) { - ans <- list(find.binding.positions(signal.data=signal.data,control.data=control.data,debug=debug, skip.control.normalization=T, ...)); - names(ans) <- as.character(c(i)); - calls <- c(calls,ans); - callnames <- c(callnames,i); - } - # subsample - if(step.size<1) { - # fraction steps - f <- 1-step.size; - } else { - # bin steps - sz <- sum(unlist(lapply(signal.data,length))); - f <- (sz-step.size)/sz; - if(f<=0) break; - } - if(debug) { cat("chained subsampling using fraction",f,".\n"); } - signal.data <- lapply(signal.data,function(d) sample(d,length(d)*f)); - if(subsample.control & !is.null(control.data)) { - control.data <- lapply(control.data,function(d) 
sample(d,length(d)*f)); - } - sz <- sum(unlist(lapply(signal.data,length))); - if(sz<min.ntags) break; - } - names(calls) <- callnames; - return(calls); -} - - -# chain-subsample dataset and calculate MSER interpolation -mser.chain.interpolation <- function(signal.data=NULL,control.data=NULL,chains=NULL,n.chains=5,debug=F, enrichment.background.scales=c(1,5), test.agreement=0.99, agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr", return.lists=F, ...) { - if(is.null(chains)) { - cn <- c(1:n.chains); names(cn) <- cn; - tf <- function(i, ...) get.subsample.chain.calls(signal.data,control.data,debug=debug, enrichment.background.scales=enrichment.background.scales, ...); - chains <- lapply(cn,tf,...); - } - names(enrichment.background.scales) <- enrichment.background.scales; - lapply(enrichment.background.scales,function(scale) { - actual.enr.field <- enr.field; - if(scale>1) { - actual.enr.field <- paste(actual.enr.field,scale,sep="."); - } - - cvl <- lapply(chains,function(chain) { - nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T); - nd <- diff(nn); - nn <- nn[-length(nn)]; - me <- lapply(c(2:length(chain)),function(i) { - sla <- t.precalculate.ref.peak.agreement(chain[[i-1]],chain[i],agreement.distance=agreement.distance,enr.field=actual.enr.field) - me <- t.find.min.saturated.enr(sla,thr=1-test.agreement) - menr <- max(min(na.omit(unlist(lapply(chain[[i-1]]$npl,function(d) d[actual.enr.field])))),min(na.omit(unlist(lapply(chain[[i]]$npl,function(d) d[actual.enr.field])))),1) - if(me<=menr) { me <- 1; }; - return(me); - }) - data.frame(n=nn,me=unlist(me),nd=nd); - }); - if(return.lists) { return(cvl) } - cvl <- na.omit(do.call(rbind,cvl)); - if(return.median) { - tv <- tapply(cvl$me,as.factor(cvl$n),median) - } else { - tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim); - } - df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv)); - return(df[order(df$n,decreasing=T),]) - }) -} - - - -# returns agreement as a function 
of dataset size, possibly filtering peaks by min.enr threshold, and by max.fdr -chain.to.reference.comparison <- function(chains,min.enr=NULL,debug=F,agreement.distance=50, return.median=F, mean.trim=0.1, enr.field="enr",max.fdr=NULL) { - cvl <- lapply(chains,function(chain) { - # filter chain by fdr - if(!is.null(max.fdr)) { - chain <- lapply(chain,function(d) { d$npl <- lapply(d$npl,function(cd) cd[cd$fdr<=max.fdr,]); return(d); }); - } - nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=T); - nn <- nn[-length(nn)]; - me <- lapply(c(2:length(chain)),function(i) { - sla <- t.precalculate.ref.peak.agreement(chain[[1]],chain[i],agreement.distance=agreement.distance,enr.field=enr.field) - # calculate overlap - x <- lapply(sla,function(mpd) { - if(!is.null(min.enr)) { - - me <- mpd$re >= min.enr; - me[is.na(me)] <- F; - mpd <- mpd[me,]; - ome <- mpd$oe < min.enr; - ome[is.na(ome)] <- T; - mpd$ov[ome] <- 0; - } - return(mean(mpd$ov)); - }) - }) - - data.frame(n=nn,me=unlist(me)); - }); - - cvl <- na.omit(do.call(rbind,cvl)); - if(return.median) { - tv <- tapply(cvl$me,as.factor(cvl$n),median) - } else { - tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim); - } - df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv)); - return(df[order(df$n,decreasing=T),]) -} - - -# estimates enrichment confidence interval based on 2*tag.count.whs window around each position, and a z-score (alpha/2) -# if(multiple.background.scales=T) the enrichment is also estimated using 5- and 10-fold increased background tag window -# adds $enr (lower bound), $enr.ub (upper bound) and $enr.mle fields -calculate.enrichment.estimates <- function(binding.positions,signal.data=NULL,control.data=NULL,fraction=1,tag.count.whs=100,z=2,effective.genome.size=3e9,scale.down.control=F,background.scales=c(1),bg.weight=NULL) { - f <- fraction; - qv <- pnorm(z,lower.tail=F); - cn <- names(binding.positions$npl); names(cn) <- cn; - - if(is.null(control.data)) { - # estimate from gamma 
distribution - fg.lambda <- f*sum(unlist(lapply(signal.data,length)))*2*tag.count.whs/effective.genome.size; - binding.positions$npl <- lapply(binding.positions$npl,function(d) { - d$enr <- qgamma(qv,d$nt,scale=1)/fg.lambda; - d$enr.ub <- qgamma(1-qv,d$nt,scale=1)/fg.lambda; - d$enr.mle <- d$nt/fg.lambda; - return(d); - }); - } else { - # estimate using beta distribution - if(is.null(bg.weight)) { - bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) - } - - if(scale.down.control) { - # sample down control to be the same size as true signal.data (bg.weight*f) - control.data <- lapply(control.data,function(d) sample(d,length(d)*bg.weight*f,replace=(f*bg.weight>1))) - #bg.weight <- sum(unlist(lapply(signal.data,length)))/sum(unlist(lapply(control.data,length))) - bg.weight <- 1/f; - - } - - binding.positions$enrichment.bg.weight <- bg.weight; - binding.positions$enrichment.whs <- tag.count.whs; - binding.positions$enrichment.z <- z; - - binding.positions$npl <- lapply(cn,function(chr) { - d <- binding.positions$npl[[chr]]; - - edf <- lapply(background.scales,function(background.width.multiplier) { - sig.mult <- bg.weight*f/background.width.multiplier; - nbg <- points.within(abs(control.data[[chr]]),d$x-tag.count.whs*background.width.multiplier,d$x+tag.count.whs*background.width.multiplier,return.point.counts=T,return.unique=F); - - nfg <- d$nt; - - - # Poisson ratio Bayesian LB with non-informative prior (Clopper & Pearson 1934) - nf <- ((nfg+0.5)/(nbg+0.5))*qf(1-qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F) - nf <- nf/sig.mult; - - ub <- ((nfg+0.5)/(nbg+0.5))*qf(qv,2*(nfg+0.5),2*(nbg+0.5),lower.tail=F) - ub <- ub/sig.mult; - - mle <- (nfg+0.5)/(nbg+0.5); - mle <- mle/sig.mult; - if(is.null(nbg)) { nbg <- numeric(0) } - if(is.null(nf)) { nf <- numeric(0) } - if(is.null(ub)) { ub <- numeric(0) } - if(is.null(mle)) { mle <- numeric(0) } - return(data.frame(nbg=nbg,lb=nf,ub=ub,mle=mle)) - }) - - adf <- 
do.call(cbind,lapply(c(1:length(background.scales)),function(i) { - df <- edf[[i]]; - cn <- c("nbgt","enr","enr.ub","enr.mle"); - if(background.scales[i]>1) { - cn <- paste(cn,as.character(background.scales[i]),sep="."); - } - names(df) <- cn; - return(df); - })) - - return(cbind(d,adf)); - }); - } - - return(binding.positions); -} - - -# precalculate peak agreement of a sampling list given a reference -t.precalculate.ref.peak.agreement <- function(ref,sf,agreement.distance=50,enr.field="enr") { - ref <- ref$npl; - cn <- names(ref); names(cn) <- cn; - - # for each sampling round - lapply(sf,function(sd) { - # calculate overlap - - ov <- data.frame(do.call(rbind,lapply(cn,function(chr) { - if(dim(ref[[chr]])[1]<1) { return(cbind(ov=c(),re=c(),oe=c())) }; - pwi <- points.within(ref[[chr]]$x,sd$npl[[chr]]$x-agreement.distance,sd$npl[[chr]]$x+agreement.distance); - pwi[pwi==-1] <- NA; - renr <- ref[[chr]][,enr.field] - oenr <- sd$npl[[chr]][,enr.field][pwi]; - if(length(oenr)==0) { oenr <- rep(NA,length(renr)); } - return(cbind(ov=as.integer(!is.na(pwi)),re=renr,oe=oenr)); - }))) - }) -} - - -# find minimal saturated enrichment given a list of replicate agreement matrices (for one fraction) -t.find.min.saturated.enr <- function(pal,thr=0.01,plot=F,return.number.of.peaks=F,plot.individual=T,return.median=F,return.vector=F) { - nr <- length(pal); - # merge replicate data frames - mpd <- data.frame(do.call(rbind,pal)); - - mpd$re[is.na(mpd$re)] <- Inf; - mpd$oe[is.na(mpd$oe)] <- Inf; - - - - # round up values to avoid miscounting - mpd$re <- round(mpd$re,digits=2); - mpd$oe <- round(mpd$oe,digits=2); - - me <- pmin(mpd$re,mpd$oe); - ome <- order(me,decreasing=T); - df <- data.frame(me=me[ome],ov=mpd$ov[ome]); - recdf <- ecdf(-mpd$re); ren <- length(mpd$re); - - # collapse equal peak heights - xk <- tapply(df$ov,as.factor(df$me),sum); xk <- data.frame(ov=as.numeric(xk),me=as.numeric(names(xk))); xk <- xk[order(xk$me,decreasing=T),]; - - - cso <- 
cumsum(xk$ov)/(recdf(-xk$me)*ren); - cso[is.na(cso)] <- 0; - cso[!is.finite(cso)] <- 0; - mv <- max(which(cso >= 1-thr)) - menr <- xk$me[mv]; - - ir <- lapply(pal,function(d) { - d$re[is.na(d$re)] <- Inf; - d$oe[is.na(d$oe)] <- Inf; - - me <- pmin(d$re,d$oe); - ome <- order(me,decreasing=T); - df <- data.frame(me=me[ome],ov=d$ov[ome]); - cso <- cumsum(df$ov)/c(1:length(df$ov)); - mv <- max(which(cso >= 1-thr)) - menr <- df$me[mv]; - return(list(df=df,menr=menr)); - }); - - if(plot) { - par(mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8); - plot(df$me,cumsum(df$ov)/c(1:length(df$ov)),type='l',ylab="fraction of positions overlapping with reference",xlab="minimal enrichment of binding positions",xlim=c(min(df$me),2*menr)); - abline(h=1-thr,lty=2,col=4) - if(plot.individual) { - lapply(ir,function(d) { - df <- d$df; - lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=8); - abline(v=menr,col="pink",lty=3) - }); - lines(df$me,cumsum(df$ov)/c(1:length(df$ov)),col=1); - } - abline(v=menr,col=2,lty=2) - legend(x="bottomright",lty=c(1,2,1,3,2),col=c(1,2,8,"pink",4),legend=c("combined samples","combined sample MSER","individual samples","individual MSERs","consistency threshold")); - } - - if(return.number.of.peaks) { - mpd <- data.frame(do.call(rbind,pal)); - return(length(which(!is.na(mpd$re) & mpd$re >=menr))/nr); - } else { - if(return.vector) { - return(unlist(lapply(ir,function(d) d$menr))); - } - if(return.median) { - return(median(unlist(lapply(ir,function(d) d$menr)))); - } else { - return(menr); - } - } -} - - - -# determine d1/d2 dataset size ratio. If background.density.scaling=F, the ratio of tag counts is returned. -# if background.density.scaling=T, regions of significant tag enrichment are masked prior to ratio calculation. 
-dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { - if(!background.density.scaling) { - return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length)))) - } - - chrl <- intersect(names(d1),names(d2)); - ntc <- do.call(rbind,lapply(chrl,function(chr) { - x1 <- tag.enrichment.clusters(abs(d1[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) - x2 <- tag.enrichment.clusters(abs(d2[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) - return(c(length(which(points.within(abs(d1[[chr]]),c(x1$s,x2$s)-wsize/2,c(x1$e,x2$e)+wsize/2)==-1)),length(which(points.within(abs(d2[[chr]]),c(x1$s,x2$s)-wsize/2,c(x1$e,x2$e)+wsize/2)==-1)))) - })) - ntcs <- apply(ntc,2,sum); - #print(ntcs/c(sum(unlist(lapply(d1,length))),sum(unlist(lapply(d2,length))))); - return(ntcs[1]/ntcs[2]) -} - -# returns effective size of the dataset based on the same logic as dataset.density.ratio -dataset.density.size <- function(d1,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { - if(!background.density.scaling) { - return(sum(unlist(lapply(d1,length)))) - } - - chrl <- names(d1); - ntc <- lapply(chrl,function(chr) { - x1 <- tag.enrichment.clusters(abs(d1[[chr]]),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) - return(length(which(points.within(abs(d1[[chr]]),x1$s-wsize/2,x1$e+wsize/2)==-1))) - }) - return(sum(unlist(ntc))) -} - -old.dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=T) { - if(!background.density.scaling) { - return(sum(unlist(lapply(d1,length)))/sum(unlist(lapply(d2,length)))) - } - - t.chromosome.counts <- function(tl) { - lapply(tl,function(d) { - x <- tag.enrichment.clusters(abs(d),c(),wsize=wsize,bg.weight=0,min.tag.count.z=min.tag.count.z,mcs=mcs,either=F) - x$s <- x$s-wsize/2; x$e <- x$e+wsize/2; - x <- regionset.intersection.c(list(x),do.union=T) - 
return(c(n=length(which(points.within(abs(d),x$s,x$e)==-1)),s=diff(range(abs(d))),m=sum(x$e-x$s))); - }) - } - - l1 <- t.chromosome.counts(d1); - l2 <- t.chromosome.counts(d2); - - l2 <- data.frame(do.call(rbind,l2[names(l1)])); - l1 <- data.frame(do.call(rbind,l1)); - - # genome size - gs <- sum(pmax(l1$s,l2$s)) - - den1 <- sum(l1$n)/(gs-sum(l1$m)) - den2 <- sum(l2$n)/(gs-sum(l2$m)) - return(den1/den2); -} - - - - -# calculate cumulative density based on sum of scaled gaussian curves -# (by Michael Tolstorukov) -# -# vin - input vector; bw -- standard deviation, dw-gaussina cutoff in stdev; dout - output "density") -# output - if return.x=F vector of cumulative density values corresponding to integer positions described by range(vin) -# output - if return.x=T a data structure with $x and $y corresponding to the cumulative density -# optional match.wt.f is a function that will return weights for a tag vector -densum <- function(vin,bw=5,dw=3,match.wt.f=NULL,return.x=T,from=min(vin),to=max(vin),step=1) { - # construct vector of unique tags and their counts - tc <- table(vin[vin>=from & vin<=to]); - pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"; - tc <- as.numeric(tc); storage.mode(tc) <- "double"; - n <- length(pos) - # weight counts - if(!is.null(match.wt.f)) { - tc <- tc*match.wt.f(pos); - } - - rng <- c(from,to); - if(rng[1]<0) { stop("range extends into negative values") } - if(range(pos)[1]<0) { stop("position vector contains negative values") } - - storage.mode(n) <- storage.mode(rng) <- storage.mode(bw) <- storage.mode(dw) <- storage.mode(step) <- "integer"; - - spos <- rng[1]; storage.mode(spos) <- "double"; - - dlength <- floor((rng[2] - rng[1])/step) + 1; # length of output array - if(dlength<1) { stop("zero data range") } - dout <- numeric(dlength); storage.mode(dout) <- "double"; - storage.mode(dlength) <- "integer"; - .C("cdensum",n,pos,tc,spos,bw,dw,dlength,step,dout,DUP=F); - - if(return.x) { - 
return(list(x=c(rng[1],rng[1]+step*(dlength-1)),y=dout,step=step)) - } else { - return(dout) - } -} - -# count tags within sliding window of a specified size -# vin - tag vector (postive values, pre-shifted) -# window.size/window.step - window characteristics -# tv - optional, pre-sorted, pre-trimmed tag vector -window.tag.count <- function(vin,window.size,window.step=1,return.x=T,from=min(vin)+floor(window.size/2),to=max(vin)-floor(window.size/2),tv=NULL) { - whs <- floor(window.size/2); - # select tags with margins - if(is.null(tv)) { - tv <- sort(vin[vin>=from-whs-1 & vin<=to+whs+1]) - } - storage.mode(tv) <- "double"; - n <- length(tv) - nsteps <- ceiling((to-from)/window.step); - - storage.mode(n) <- storage.mode(nsteps) <- storage.mode(window.size) <- storage.mode(window.step) <- "integer"; - - spos <- from; storage.mode(spos) <- "double"; - - if(nsteps<1) { stop("zero data range") } - #dout <- integer(nsteps); storage.mode(dout) <- "integer"; - #.C("window_n_tags",n,tv,spos,window.size,window.step,nsteps,dout,DUP=F); - dout <- .Call("cwindow_n_tags",tv,spos,window.size,window.step,nsteps); - - if(return.x) { - return(list(x=c(from,from+(nsteps-1)*window.step),y=dout,step=window.step)) - } else { - return(dout) - } -} - -# count tags in windows around specified positions (pos) -window.tag.count.around <- function(vin,window.size,pos,return.x=T,tc=NULL,sorted=F) { - if(is.null(tc)) { - tc <- table(vin); - } - if(!sorted) { - op <- rank(pos); - pos <- sort(pos); - } - storage.mode(pos) <- "double"; - tpos <- as.integer(names(tc)); storage.mode(tpos) <- "double"; - tc <- as.integer(tc); storage.mode(tc) <- "integer"; - - whs <- floor(window.size/2); - - storage.mode(whs) <- "integer"; - twc <- .Call("cwindow_n_tags_around",tpos,tc,pos,whs); - if(return.x) { - if(sorted) { - return(data.frame(x=pos,y=twc)); - } else { - return(data.frame(x=pos[op],y=twc[op])); - } - } else { - if(sorted) { - return(twc); - } else { - return(twc[op]); - } - } -} - -# given a tag 
vector (signed), identify and clean up (either remove or cap) singular positions that exceed local tag density -# vin - tag vector -# cap.fold - maximal fold over enrichment over local density allowed for a single tag position, at which the tag count is capped -# eliminate.fold - max fold enrichment that, when exceeded, results in exclusion of all the tags at that position (e.g. counted as anomaly) -# z.threshold - Z-score used to determine max allowed counts -filter.singular.positions.by.local.density <- function(tags,window.size=200,cap.fold=4,eliminate.fold=10,z.threshold=3) { - # tabulate tag positions - if(length(tags)<2) { return(tags); }; - - tc <- table(tags); - pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"; - tc <- as.integer(tc); storage.mode(tc) <- "integer"; - n <- length(pos); - - whs <- floor(window.size/2); - - storage.mode(n) <- storage.mode(whs) <- "integer"; - twc <- .Call("cwindow_n_tags_around",pos,tc,pos,whs); - twc <- (twc-tc+1)/window.size; # local density - - pv <- pnorm(z.threshold,lower.tail=F) - # exclude - max.counts <- qpois(pv,twc*eliminate.fold,lower.tail=F) - tc[tc>max.counts] <- 0; - # cap - max.counts <- qpois(pv,twc*cap.fold,lower.tail=F) - ivi <- which(tc>max.counts); - tc[ivi] <- max.counts[ivi]+1; - - # reconstruct tag vector - tv <- rep(pos,tc); - to <- order(abs(tv)); tv <- tv[to]; - return(tv); -} - - - -# calculates enrichment bounds using multiple background scales -# ft - foreground tags (pre-shifted, positive) -# bt - background tags -# fws - foreground window size -# bwsl - background window size list -# step - window step -# rng - from/to coordinates (to will be adjusted according to step) -# -# returns: a list with $x ($s $e $step), $lb vector and $mle vector ($ub if calculate.upper.bound=T) -mbs.enrichment.bounds <- function(ft,bt,fws,bwsl,step=1,rng=NULL,alpha=0.05,calculate.upper.bound=F,bg.weight=length(ft)/length(bt),use.most.informative.scale=F,quick.calculation=F,pos=NULL) { - # determine range - 
if(is.null(rng)) { - rng <- range(range(ft)); - } - # foreground counts - if(is.null(pos)) { - fwc <- window.tag.count(ft,fws,window.step=step,from=rng[1],to=rng[2],return.x=T); - } else { - fwc <- window.tag.count.around(ft,fws,pos,return.x=T) - } - fwc$y <- fwc$y+0.5; - - zal <- qnorm(alpha/2,lower.tail=F); - - # background counts - bt <- sort(bt); - if(!is.null(pos)) { - tc <- table(bt); - } - bgcm <- lapply(bwsl,function(bgws) { - if(is.null(pos)) { - window.tag.count(bt,bgws,window.step=step,from=rng[1],to=rng[2],return.x=F,tv=bt)+0.5; - } else { - window.tag.count.around(bt,bgws,pos,return.x=F,tc=tc)+0.5 - } - }) - if(!is.null(pos)) { - rm(tc); - } - - # pick most informative scale - if(use.most.informative.scale) { - bgcm <- t(do.call(cbind,bgcm)) - isi <- max.col(t((bgcm)/(bwsl/fws))) # add pseudo-counts to select lowest scale in case of a tie - - bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)] - - if(quick.calculation) { - rte <- fwc$y+bgc-0.25*zal*zal; rte[rte<0] <- 0; - dn <- bgc - 0.25*zal*zal; - lbm=(sqrt(fwc$y*bgc) - 0.5*zal*sqrt(rte))/dn; - ivi <- which(lbm<0); - lbm <- lbm*lbm*bwsl[isi]/fws/bg.weight; - lbm[rte<=0] <- 1; - lbm[dn<=0] <- 1; - lbm[ivi] <- 1; - } else { - lbm <- (fwc$y/bgc)*qf(1-alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight; - } - - mle <- fwc$y/bgc*bwsl[isi]/fws/bg.weight; mle[is.nan(mle)] <- Inf; mle[is.na(mle)] <- Inf; - - rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle); - - if(calculate.upper.bound) { - isi <- max.col(t((-bgcm)/(bwsl/fws))) # add pseudo-counts to select highest scale in case of a tie - bgc <- c(bgcm)[isi+dim(bgcm)[1]*(c(1:length(isi))-1)] - - if(quick.calculation) { - ubm=(sqrt(fwc$y*bgc) + 0.5*zal*sqrt(rte))/dn; - ivi <- which(ubm<0); - ubm <- ubm*ubm*bwsl[isi]/fws/bg.weight; - ubm[rte<=0] <- 1; - ubm[ivi] <- 1; - lbm[dn<=0] <- 1; - } else { - ubm <- (fwc$y/bgc)*qf(alpha/2,2*fwc$y,2*bgc,lower.tail=F)*bwsl[isi]/fws/bg.weight; - } - rl <- c(rl,list(ub=ubm)); - } - 
return(rl); - - } else { - # determine lower bounds - lbm <- lapply(c(1:length(bgcm)),function(i) { - nbg <- bgcm[[i]]; - if(quick.calculation) { - rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0; - dn <- (nbg - 0.25*zal*zal); - lbm=(sqrt(fwc$y*nbg) - 0.5*zal*sqrt(rte))/dn; - ivi <- which(lbm<0); - lbm <- lbm*lbm*bwsl[i]/fws/bg.weight; - lbm[rte<=0] <- 1; - lbm[dn<=0] <- 1; - lbm[ivi] <- 1; - return(lbm); - } else { - return((fwc$y/nbg)*qf(1-alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight); - } - }) - lbm <- do.call(pmin,lbm); - - # calculate mle - #mle <- do.call(pmin,lapply(bgcm,function(bgc) fwc/bgc)) - mle <- do.call(pmin,lapply(c(1:length(bgcm)),function(i) { - bgc <- bgcm[[i]]; - x <- fwc$y/bgc*bwsl[i]/fws/bg.weight; x[is.nan(x)] <- Inf; x[is.na(x)] <- Inf; return(x); - })) - - rl <- list(x=list(s=fwc$x[1],e=fwc$x[2],step=fwc$step),lb=lbm,mle=mle); - - if(calculate.upper.bound) { - # determine upper bound - ubm <- lapply(c(1:length(bgcm)),function(i) { - nbg <- bgcm[[i]]; - if(quick.calculation) { - rte <- fwc$y+nbg-0.25*zal*zal; rte[rte<0] <- 0; - dn <- (nbg - 0.25*zal*zal); - ubm=(sqrt(fwc$y*nbg) + 0.5*zal*sqrt(rte))/dn; - ivi <- which(ubm<0); - ubm <- ubm*ubm*bwsl[i]/fws/bg.weight; - ubm[rte<=0] <- 1; - ubm[dn<=0] <- 1; - ubm[ivi] <- 1; - return(ubm); - } else { - return((fwc$y/nbg)*qf(alpha/2,2*fwc$y,2*nbg,lower.tail=F)*bwsl[i]/fws/bg.weight); - } - }) - ubm <- do.call(pmax,ubm); - rl <- c(rl,list(ub=ubm)); - } - - return(rl); - } -} - -write.probe.wig <- function(chr,pos,val,fname,append=F,feature="M",probe.length=35,header=T) { - min.dist <- min(diff(pos)); - if(probe.length>=min.dist) { - probe.length <- min.dist-1; - cat("warning: adjusted down wig segment length to",probe.length,"\n"); - } - mdat <- data.frame(chr,as.integer(pos),as.integer(pos+probe.length),val) - - if(header) { - write(paste("track type=wiggle_0 name=\"Bed Format\" description=\"",feature,"\" visibility=dense color=200,100,0 altColor=0,100,200 
priority=20",sep=""),file=fname,append=append) - write.table(mdat,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=T); - } else { - write.table(mdat,file=fname,col.names=F,row.names=F,quote=F,sep=" ",append=append); - } - -} - -# returns intersection of multiple region sets -# each regionset needs to contain $s, $e and optional $v column -regionset.intersection.c <- function(rsl,max.val=-1,do.union=F) { - # translate into position/flag form - rfl <- lapply(rsl,function(rs) { - rp <- c(rs$s,rs$e); rf <- c(rep(c(1,-1),each=length(rs$s))); - - ro <- order(rp); - rp <- rp[ro]; rf <- rf[ro]; - if(!is.null(rs$v)) { - rv <- c(rs$v,rs$v)[ro]; - return(data.frame(p=as.numeric(rp),f=as.integer(rf),v=as.numeric(rv))); - } else { - return(data.frame(p=as.numeric(rp),f=as.integer(rf))); - } - }) - rfd <- data.frame(do.call(rbind,lapply(1:length(rfl),function(i) { - d <- rfl[[i]]; d$f <- d$f*i; return(d); - }))) - rfd <- rfd[order(rfd$p),]; - if(is.null(rfd$v)) { max.val <- 0; } - if(do.union) { ur <- 1; } else { ur <- 0; }; - rl <- .Call("region_intersection",as.integer(length(rfl)),as.numeric(rfd$p),as.integer(rfd$f),as.numeric(rfd$v),as.integer(max.val),as.integer(ur)); - return(data.frame(do.call(cbind,rl))); -} - - -# idenfity if binding peak falls within a larger region of significant tag enrichment, and if so record its booundaries -add.broad.peak.regions <- function(chip.tags,input.tags,bp,window.size=500,z.thr=2) { - se <- find.significantly.enriched.regions(chip.tags,input.tags,window.size=window.size,z.thr=z.thr,poisson.z=0,poisson.ratio=0,either=F) - chrl <- names(bp$npl); names(chrl) <- chrl; - bnpl <- lapply(chrl,function(chr) { - npl <- bp$npl[[chr]]; - if(is.null(npl) | dim(npl)[1]<1) { - return(npl); - } - pi <- points.within(npl$x,se[[chr]]$s,se[[chr]]$e,return.list=T); - - pm <- do.call(rbind,lapply(pi,function(rl) { - if(length(rl)>0) { - return(range(c(se[[chr]]$s[rl],se[[chr]]$e[rl]))) - } else { - return(c(NA,NA)); - } - })) - - npl$rs <- pm[,1]; 
- npl$re <- pm[,2]; - return(npl); - }) - bp$npl <- bnpl; - return(bp); -} - -# writing out binding results in a narrowpeak format, incorporating broad region boundaries if they are present -# if broad region info is not present, margin is used to determine region width. The default margin is equal -# to the window half size used to call the binding peaks -write.narrowpeak.binding <- function(bd,fname,margin=bd$whs,npeaks=NA) { # Anshul: added npeaks option - if(is.null(margin)) { margin <- 50; } - chrl <- names(bd$npl); names(chrl) <- chrl; - md <- do.call(rbind,lapply(chrl,function(chr) { - df <- bd$npl[[chr]]; - x <- df$x; - rs <- df$rs; if(is.null(rs)) { rs <- rep(NA,length(x)) } - re <- df$re; if(is.null(re)) { re <- rep(NA,length(x)) } - #ivi <- which(is.na(rs)); if(any(ivi)) {rs[ivi] <- x[ivi]-margin;} - ivi <- which(is.na(rs)); if(any(ivi)) {rs[ivi] <- pmax(0,x[ivi]-margin);} # Anshul: added the pmax (0, ...) to avoid negative peak starts - ivi <- which(is.na(re)); if(any(ivi)) {re[ivi] <- x[ivi]+margin;} - #cbind(chr,rs,re,".","0",".",df$y,-1,format(df$fdr,scientific=T,digits=3),x-rs) - cbind(chr,rs,re,".","0",".",df$y,-1,-log10(df$fdr),x-rs) # Anshul: converted fdr to -log10 - })) - md <- md[order(as.numeric(md[,7]),decreasing=T),] - if (!is.na(npeaks)) { # Anshul: added this option to print a limited number of peaks - npeaks <- min(nrow(md),npeaks) - md <- md[1:npeaks,] - } - write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F); -} - - -get.broad.enrichment.clusters <- function(signal.data,control.data,window.size=1e3,z.thr=3, tag.shift=146/2,background.density.scaling=F, ... ) { - # find significantly enriched clusters - bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling); - se <- find.significantly.enriched.regions(signal.data,control.data,window.size=window.size,z.thr=z.thr,tag.shift=tag.shift, bg.weight=bg.weight, ...) 
- chrl <- names(se); names(chrl) <- chrl; - se <- lapply(chrl,function(chr) { - d <- se[[chr]]; - if(length(d$s>1)) { - d <- regionset.intersection.c(list(d,d),do.union=T); - sc <- points.within(abs(signal.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T); - cc <- points.within(abs(control.data[[chr]]+tag.shift),d$s,d$e,return.point.counts=T); - d$rv <- log2((sc+1)/(cc+1)/bg.weight); - return(d); - } else { - return(d) - } - }) -} - -write.broadpeak.info <- function(bp,fname) { - chrl <- names(bp); names(chrl) <- chrl; - chrl <- chrl[unlist(lapply(bp,function(d) length(d$s)))>0] - md <- do.call(rbind,lapply(chrl,function(chr) { - df <- bp[[chr]]; - cbind(chr,df$s,df$e,".","0",".",df$rv,-1,-1) - })) - md <- md[order(as.numeric(md[,7]),decreasing=T),] - write.table(md,file=fname,col.names=F,row.names=F,quote=F,sep="\t",append=F); -} - - -get.clusters2 <- function(x,CL) { - temp <- which(diff(x) != 0) - begin <- c(1, temp + 1) - end <- c(temp, length(x)) - size <- end - begin + 1 - - begin <- begin[size >= CL] - end <- end[size >= CL] - size <- size[size >= CL] - - size <- size[x[end] != 0] - begin <- begin[x[end] != 0] - end <- end[x[end] != 0] - - return (list(size=size,begin=begin,end=end)) -}
--- a/spp/configure Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3856 +0,0 @@ -#! /bin/sh -# Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.63 for SPP 1.7. -# -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. -# This configure script is free software; the Free Software Foundation -# gives unlimited permission to copy, distribute and modify it. -## --------------------- ## -## M4sh Initialization. ## -## --------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - - - -# PATH needs CR -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - -as_nl=' -' -export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. 
-as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi - -# The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then - PATH_SEPARATOR=: - (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { - (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || - PATH_SEPARATOR=';' - } -fi - -# Support unset when possible. -if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then - as_unset=unset -else - as_unset=false -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -case $0 in - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break -done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - { (exit 1); exit 1; } -fi - -# Work around bugs in pre-3.0 UWIN ksh. -for as_var in ENV MAIL MAILPATH -do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# Required to use basename. -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - - -# Name of the executable. -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# CDPATH. -$as_unset CDPATH - - -if test "x$CONFIG_SHELL" = x; then - if (eval ":") 2>/dev/null; then - as_have_required=yes -else - as_have_required=no -fi - - if test $as_have_required = yes && (eval ": -(as_func_return () { - (exit \$1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. -fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. -fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. 
-fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = \"\$1\" ); then - : -else - exitcode=1 - echo positional parameters were not saved. -fi - -test \$exitcode = 0) || { (exit 1); exit 1; } - -( - as_lineno_1=\$LINENO - as_lineno_2=\$LINENO - test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" && - test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; } -") 2> /dev/null; then - : -else - as_candidate_shells= - as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - case $as_dir in - /*) - for as_base in sh bash ksh sh5; do - as_candidate_shells="$as_candidate_shells $as_dir/$as_base" - done;; - esac -done -IFS=$as_save_IFS - - - for as_shell in $as_candidate_shells $SHELL; do - # Try only shells that exist, to save several forks. - if { test -f "$as_shell" || test -f "$as_shell.exe"; } && - { ("$as_shell") 2> /dev/null <<\_ASEOF -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - -: -_ASEOF -}; then - CONFIG_SHELL=$as_shell - as_have_required=yes - if { "$as_shell" 2> /dev/null <<\_ASEOF -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. 
- alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - -: -(as_func_return () { - (exit $1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. -fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. -fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. -fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = "$1" ); then - : -else - exitcode=1 - echo positional parameters were not saved. -fi - -test $exitcode = 0) || { (exit 1); exit 1; } - -( - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; } - -_ASEOF -}; then - break -fi - -fi - - done - - if test "x$CONFIG_SHELL" != x; then - for as_var in BASH_ENV ENV - do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var - done - export CONFIG_SHELL - exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"} -fi - - - if test $as_have_required = no; then - echo This script requires a shell more modern than all the - echo shells that I found on your system. Please install a - echo modern shell, or manually run the script under such a - echo shell if you do have one. - { (exit 1); exit 1; } -fi - - -fi - -fi - - - -(eval "as_func_return () { - (exit \$1) -} -as_func_success () { - as_func_return 0 -} -as_func_failure () { - as_func_return 1 -} -as_func_ret_success () { - return 0 -} -as_func_ret_failure () { - return 1 -} - -exitcode=0 -if as_func_success; then - : -else - exitcode=1 - echo as_func_success failed. 
-fi - -if as_func_failure; then - exitcode=1 - echo as_func_failure succeeded. -fi - -if as_func_ret_success; then - : -else - exitcode=1 - echo as_func_ret_success failed. -fi - -if as_func_ret_failure; then - exitcode=1 - echo as_func_ret_failure succeeded. -fi - -if ( set x; as_func_ret_success y && test x = \"\$1\" ); then - : -else - exitcode=1 - echo positional parameters were not saved. -fi - -test \$exitcode = 0") || { - echo No shell found that supports shell functions. - echo Please tell bug-autoconf@gnu.org about your system, - echo including any error possibly output before this message. - echo This can help us improve future autoconf versions. - echo Configuration will now proceed without shell functions. -} - - - - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { - - # Create $as_me.lineno as a copy of $as_myself, but with $LINENO - # uniformly replaced by the line number. The first 'sed' inserts a - # line-number line after each line using $LINENO; the second 'sed' - # does the real work. The second script uses 'N' to pair each - # line-number line with the line containing $LINENO, and appends - # trailing '-' during substitution so that $LINENO is not a special - # case at line end. - # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the - # scripts with optimization help from Paolo Bonzini. Blame Lee - # E. McMahon (1931-1989) for sed's syntax. 
:-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 - { (exit 1); exit 1; }; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in --n*) - case `echo 'x\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. - *) ECHO_C='\c';; - esac;; -*) - ECHO_N='-n';; -esac -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir 2>/dev/null -fi -if (echo >conf$$.file) 2>/dev/null; then - if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' - elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln - else - as_ln_s='cp -p' - fi -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - -if mkdir -p . 
2>/dev/null; then - as_mkdir_p=: -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - - -exec 7<&0 </dev/null 6>&1 - -# Name of the host. -# hostname on some systems (SVR3.2, Linux) returns a bogus exit status, -# so uname gets run too. -ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` - -# -# Initializations. -# -ac_default_prefix=/usr/local -ac_clean_files= -ac_config_libobj_dir=. -LIBOBJS= -cross_compiling=no -subdirs= -MFLAGS= -MAKEFLAGS= -SHELL=${CONFIG_SHELL-/bin/sh} - -# Identity of this package. 
-PACKAGE_NAME='SPP' -PACKAGE_TARNAME='spp' -PACKAGE_VERSION='1.7' -PACKAGE_STRING='SPP 1.7' -PACKAGE_BUGREPORT='' - -ac_subst_vars='LTLIBOBJS -LIBOBJS -HAVE_LIBBZ2 -OBJEXT -EXEEXT -ac_ct_CC -CPPFLAGS -LDFLAGS -CFLAGS -CC -target_alias -host_alias -build_alias -LIBS -ECHO_T -ECHO_N -ECHO_C -DEFS -mandir -localedir -libdir -psdir -pdfdir -dvidir -htmldir -infodir -docdir -oldincludedir -includedir -localstatedir -sharedstatedir -sysconfdir -datadir -datarootdir -libexecdir -sbindir -bindir -program_transform_name -prefix -exec_prefix -PACKAGE_BUGREPORT -PACKAGE_STRING -PACKAGE_VERSION -PACKAGE_TARNAME -PACKAGE_NAME -PATH_SEPARATOR -SHELL' -ac_subst_files='' -ac_user_opts=' -enable_option_checking -' - ac_precious_vars='build_alias -host_alias -target_alias -CC -CFLAGS -LDFLAGS -LIBS -CPPFLAGS' - - -# Initialize some variables set by options. -ac_init_help= -ac_init_version=false -ac_unrecognized_opts= -ac_unrecognized_sep= -# The variables have the same names as the options, with -# dashes changed to underlines. -cache_file=/dev/null -exec_prefix=NONE -no_create= -no_recursion= -prefix=NONE -program_prefix=NONE -program_suffix=NONE -program_transform_name=s,x,x, -silent= -site= -srcdir= -verbose= -x_includes=NONE -x_libraries=NONE - -# Installation directory options. -# These are left unexpanded so users can "make install exec_prefix=/foo" -# and all the variables that are supposed to be based on exec_prefix -# by default will actually change. -# Use braces instead of parens because sh, perl, etc. also accept them. -# (The list follows the same order as the GNU Coding Standards.) 
-bindir='${exec_prefix}/bin' -sbindir='${exec_prefix}/sbin' -libexecdir='${exec_prefix}/libexec' -datarootdir='${prefix}/share' -datadir='${datarootdir}' -sysconfdir='${prefix}/etc' -sharedstatedir='${prefix}/com' -localstatedir='${prefix}/var' -includedir='${prefix}/include' -oldincludedir='/usr/include' -docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' -infodir='${datarootdir}/info' -htmldir='${docdir}' -dvidir='${docdir}' -pdfdir='${docdir}' -psdir='${docdir}' -libdir='${exec_prefix}/lib' -localedir='${datarootdir}/locale' -mandir='${datarootdir}/man' - -ac_prev= -ac_dashdash= -for ac_option -do - # If the previous option needs an argument, assign it. - if test -n "$ac_prev"; then - eval $ac_prev=\$ac_option - ac_prev= - continue - fi - - case $ac_option in - *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; - *) ac_optarg=yes ;; - esac - - # Accept the important Cygnus configure options, so we can diagnose typos. - - case $ac_dashdash$ac_option in - --) - ac_dashdash=yes ;; - - -bindir | --bindir | --bindi | --bind | --bin | --bi) - ac_prev=bindir ;; - -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) - bindir=$ac_optarg ;; - - -build | --build | --buil | --bui | --bu) - ac_prev=build_alias ;; - -build=* | --build=* | --buil=* | --bui=* | --bu=*) - build_alias=$ac_optarg ;; - - -cache-file | --cache-file | --cache-fil | --cache-fi \ - | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) - ac_prev=cache_file ;; - -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ - | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) - cache_file=$ac_optarg ;; - - --config-cache | -C) - cache_file=config.cache ;; - - -datadir | --datadir | --datadi | --datad) - ac_prev=datadir ;; - -datadir=* | --datadir=* | --datadi=* | --datad=*) - datadir=$ac_optarg ;; - - -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ - | --dataroo | --dataro | --datar) - ac_prev=datarootdir ;; - -datarootdir=* | 
--datarootdir=* | --datarootdi=* | --datarootd=* \ - | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) - datarootdir=$ac_optarg ;; - - -disable-* | --disable-*) - ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 - { (exit 1); exit 1; }; } - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"enable_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval enable_$ac_useropt=no ;; - - -docdir | --docdir | --docdi | --doc | --do) - ac_prev=docdir ;; - -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) - docdir=$ac_optarg ;; - - -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) - ac_prev=dvidir ;; - -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) - dvidir=$ac_optarg ;; - - -enable-* | --enable-*) - ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` - # Reject names that are not valid shell variable names. 
- expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 - { (exit 1); exit 1; }; } - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"enable_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval enable_$ac_useropt=\$ac_optarg ;; - - -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ - | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ - | --exec | --exe | --ex) - ac_prev=exec_prefix ;; - -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ - | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ - | --exec=* | --exe=* | --ex=*) - exec_prefix=$ac_optarg ;; - - -gas | --gas | --ga | --g) - # Obsolete; use --with-gas. - with_gas=yes ;; - - -help | --help | --hel | --he | -h) - ac_init_help=long ;; - -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) - ac_init_help=recursive ;; - -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) - ac_init_help=short ;; - - -host | --host | --hos | --ho) - ac_prev=host_alias ;; - -host=* | --host=* | --hos=* | --ho=*) - host_alias=$ac_optarg ;; - - -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) - ac_prev=htmldir ;; - -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ - | --ht=*) - htmldir=$ac_optarg ;; - - -includedir | --includedir | --includedi | --included | --include \ - | --includ | --inclu | --incl | --inc) - ac_prev=includedir ;; - -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ - | --includ=* | --inclu=* | --incl=* | --inc=*) - includedir=$ac_optarg ;; - - -infodir | --infodir | --infodi | --infod | --info | --inf) - ac_prev=infodir ;; - -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) - infodir=$ac_optarg ;; - - -libdir | --libdir | 
--libdi | --libd) - ac_prev=libdir ;; - -libdir=* | --libdir=* | --libdi=* | --libd=*) - libdir=$ac_optarg ;; - - -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ - | --libexe | --libex | --libe) - ac_prev=libexecdir ;; - -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ - | --libexe=* | --libex=* | --libe=*) - libexecdir=$ac_optarg ;; - - -localedir | --localedir | --localedi | --localed | --locale) - ac_prev=localedir ;; - -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) - localedir=$ac_optarg ;; - - -localstatedir | --localstatedir | --localstatedi | --localstated \ - | --localstate | --localstat | --localsta | --localst | --locals) - ac_prev=localstatedir ;; - -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ - | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) - localstatedir=$ac_optarg ;; - - -mandir | --mandir | --mandi | --mand | --man | --ma | --m) - ac_prev=mandir ;; - -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) - mandir=$ac_optarg ;; - - -nfp | --nfp | --nf) - # Obsolete; use --without-fp. 
- with_fp=no ;; - - -no-create | --no-create | --no-creat | --no-crea | --no-cre \ - | --no-cr | --no-c | -n) - no_create=yes ;; - - -no-recursion | --no-recursion | --no-recursio | --no-recursi \ - | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) - no_recursion=yes ;; - - -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ - | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ - | --oldin | --oldi | --old | --ol | --o) - ac_prev=oldincludedir ;; - -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ - | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ - | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) - oldincludedir=$ac_optarg ;; - - -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) - ac_prev=prefix ;; - -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) - prefix=$ac_optarg ;; - - -program-prefix | --program-prefix | --program-prefi | --program-pref \ - | --program-pre | --program-pr | --program-p) - ac_prev=program_prefix ;; - -program-prefix=* | --program-prefix=* | --program-prefi=* \ - | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) - program_prefix=$ac_optarg ;; - - -program-suffix | --program-suffix | --program-suffi | --program-suff \ - | --program-suf | --program-su | --program-s) - ac_prev=program_suffix ;; - -program-suffix=* | --program-suffix=* | --program-suffi=* \ - | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) - program_suffix=$ac_optarg ;; - - -program-transform-name | --program-transform-name \ - | --program-transform-nam | --program-transform-na \ - | --program-transform-n | --program-transform- \ - | --program-transform | --program-transfor \ - | --program-transfo | --program-transf \ - | --program-trans | --program-tran \ - | --progr-tra | --program-tr | --program-t) - ac_prev=program_transform_name ;; - -program-transform-name=* | --program-transform-name=* \ - | 
--program-transform-nam=* | --program-transform-na=* \ - | --program-transform-n=* | --program-transform-=* \ - | --program-transform=* | --program-transfor=* \ - | --program-transfo=* | --program-transf=* \ - | --program-trans=* | --program-tran=* \ - | --progr-tra=* | --program-tr=* | --program-t=*) - program_transform_name=$ac_optarg ;; - - -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) - ac_prev=pdfdir ;; - -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) - pdfdir=$ac_optarg ;; - - -psdir | --psdir | --psdi | --psd | --ps) - ac_prev=psdir ;; - -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) - psdir=$ac_optarg ;; - - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - silent=yes ;; - - -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) - ac_prev=sbindir ;; - -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ - | --sbi=* | --sb=*) - sbindir=$ac_optarg ;; - - -sharedstatedir | --sharedstatedir | --sharedstatedi \ - | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ - | --sharedst | --shareds | --shared | --share | --shar \ - | --sha | --sh) - ac_prev=sharedstatedir ;; - -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ - | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ - | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ - | --sha=* | --sh=*) - sharedstatedir=$ac_optarg ;; - - -site | --site | --sit) - ac_prev=site ;; - -site=* | --site=* | --sit=*) - site=$ac_optarg ;; - - -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) - ac_prev=srcdir ;; - -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) - srcdir=$ac_optarg ;; - - -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ - | --syscon | --sysco | --sysc | --sys | --sy) - ac_prev=sysconfdir ;; - -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ - | --syscon=* | --sysco=* | --sysc=* 
| --sys=* | --sy=*) - sysconfdir=$ac_optarg ;; - - -target | --target | --targe | --targ | --tar | --ta | --t) - ac_prev=target_alias ;; - -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) - target_alias=$ac_optarg ;; - - -v | -verbose | --verbose | --verbos | --verbo | --verb) - verbose=yes ;; - - -version | --version | --versio | --versi | --vers | -V) - ac_init_version=: ;; - - -with-* | --with-*) - ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 - { (exit 1); exit 1; }; } - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"with_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval with_$ac_useropt=\$ac_optarg ;; - - -without-* | --without-*) - ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` - # Reject names that are not valid shell variable names. - expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && - { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 - { (exit 1); exit 1; }; } - ac_useropt_orig=$ac_useropt - ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` - case $ac_user_opts in - *" -"with_$ac_useropt" -"*) ;; - *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" - ac_unrecognized_sep=', ';; - esac - eval with_$ac_useropt=no ;; - - --x) - # Obsolete; use --with-x. 
- with_x=yes ;; - - -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ - | --x-incl | --x-inc | --x-in | --x-i) - ac_prev=x_includes ;; - -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ - | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) - x_includes=$ac_optarg ;; - - -x-libraries | --x-libraries | --x-librarie | --x-librari \ - | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) - ac_prev=x_libraries ;; - -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ - | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) - x_libraries=$ac_optarg ;; - - -*) { $as_echo "$as_me: error: unrecognized option: $ac_option -Try \`$0 --help' for more information." >&2 - { (exit 1); exit 1; }; } - ;; - - *=*) - ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` - # Reject names that are not valid shell variable names. - expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && - { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2 - { (exit 1); exit 1; }; } - eval $ac_envvar=\$ac_optarg - export $ac_envvar ;; - - *) - # FIXME: should be removed in autoconf 3.0. - $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 - expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && - $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 - : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} - ;; - - esac -done - -if test -n "$ac_prev"; then - ac_option=--`echo $ac_prev | sed 's/_/-/g'` - { $as_echo "$as_me: error: missing argument to $ac_option" >&2 - { (exit 1); exit 1; }; } -fi - -if test -n "$ac_unrecognized_opts"; then - case $enable_option_checking in - no) ;; - fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2 - { (exit 1); exit 1; }; } ;; - *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; - esac -fi - -# Check all directory arguments for consistency. 
-for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ - datadir sysconfdir sharedstatedir localstatedir includedir \ - oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ - libdir localedir mandir -do - eval ac_val=\$$ac_var - # Remove trailing slashes. - case $ac_val in - */ ) - ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` - eval $ac_var=\$ac_val;; - esac - # Be sure to have absolute directory names. - case $ac_val in - [\\/$]* | ?:[\\/]* ) continue;; - NONE | '' ) case $ac_var in *prefix ) continue;; esac;; - esac - { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 - { (exit 1); exit 1; }; } -done - -# There might be people who depend on the old broken behavior: `$host' -# used to hold the argument of --host etc. -# FIXME: To remove some day. -build=$build_alias -host=$host_alias -target=$target_alias - -# FIXME: To remove some day. -if test "x$host_alias" != x; then - if test "x$build_alias" = x; then - cross_compiling=maybe - $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. - If a cross compiler is detected then cross compile mode will be used." >&2 - elif test "x$build_alias" != "x$host_alias"; then - cross_compiling=yes - fi -fi - -ac_tool_prefix= -test -n "$host_alias" && ac_tool_prefix=$host_alias- - -test "$silent" = yes && exec 6>/dev/null - - -ac_pwd=`pwd` && test -n "$ac_pwd" && -ac_ls_di=`ls -di .` && -ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || - { $as_echo "$as_me: error: working directory cannot be determined" >&2 - { (exit 1); exit 1; }; } -test "X$ac_ls_di" = "X$ac_pwd_ls_di" || - { $as_echo "$as_me: error: pwd does not report name of working directory" >&2 - { (exit 1); exit 1; }; } - - -# Find the source files, if location was not specified. -if test -z "$srcdir"; then - ac_srcdir_defaulted=yes - # Try the directory containing this script, then the parent directory. 
- ac_confdir=`$as_dirname -- "$as_myself" || -$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_myself" : 'X\(//\)[^/]' \| \ - X"$as_myself" : 'X\(//\)$' \| \ - X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$as_myself" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - srcdir=$ac_confdir - if test ! -r "$srcdir/$ac_unique_file"; then - srcdir=.. - fi -else - ac_srcdir_defaulted=no -fi -if test ! -r "$srcdir/$ac_unique_file"; then - test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." - { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 - { (exit 1); exit 1; }; } -fi -ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" -ac_abs_confdir=`( - cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2 - { (exit 1); exit 1; }; } - pwd)` -# When building in place, set srcdir=. -if test "$ac_abs_confdir" = "$ac_pwd"; then - srcdir=. -fi -# Remove unnecessary trailing slashes from srcdir. -# Double slashes in file names in object file debugging info -# mess up M-x gdb in Emacs. -case $srcdir in -*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; -esac -for ac_var in $ac_precious_vars; do - eval ac_env_${ac_var}_set=\${${ac_var}+set} - eval ac_env_${ac_var}_value=\$${ac_var} - eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} - eval ac_cv_env_${ac_var}_value=\$${ac_var} -done - -# -# Report the --help message. -# -if test "$ac_init_help" = "long"; then - # Omit some internal or obsolete options to make the list less imposing. - # This message is too long to be a string in the A/UX 3.1 sh. - cat <<_ACEOF -\`configure' configures SPP 1.7 to adapt to many kinds of systems. - -Usage: $0 [OPTION]... [VAR=VALUE]... - -To assign environment variables (e.g., CC, CFLAGS...), specify them as -VAR=VALUE. 
See below for descriptions of some of the useful variables. - -Defaults for the options are specified in brackets. - -Configuration: - -h, --help display this help and exit - --help=short display options specific to this package - --help=recursive display the short help of all the included packages - -V, --version display version information and exit - -q, --quiet, --silent do not print \`checking...' messages - --cache-file=FILE cache test results in FILE [disabled] - -C, --config-cache alias for \`--cache-file=config.cache' - -n, --no-create do not create output files - --srcdir=DIR find the sources in DIR [configure dir or \`..'] - -Installation directories: - --prefix=PREFIX install architecture-independent files in PREFIX - [$ac_default_prefix] - --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX - [PREFIX] - -By default, \`make install' will install all the files in -\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify -an installation prefix other than \`$ac_default_prefix' using \`--prefix', -for instance \`--prefix=\$HOME'. - -For better control, use the options below. 
- -Fine tuning of the installation directories: - --bindir=DIR user executables [EPREFIX/bin] - --sbindir=DIR system admin executables [EPREFIX/sbin] - --libexecdir=DIR program executables [EPREFIX/libexec] - --sysconfdir=DIR read-only single-machine data [PREFIX/etc] - --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] - --localstatedir=DIR modifiable single-machine data [PREFIX/var] - --libdir=DIR object code libraries [EPREFIX/lib] - --includedir=DIR C header files [PREFIX/include] - --oldincludedir=DIR C header files for non-gcc [/usr/include] - --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] - --datadir=DIR read-only architecture-independent data [DATAROOTDIR] - --infodir=DIR info documentation [DATAROOTDIR/info] - --localedir=DIR locale-dependent data [DATAROOTDIR/locale] - --mandir=DIR man documentation [DATAROOTDIR/man] - --docdir=DIR documentation root [DATAROOTDIR/doc/spp] - --htmldir=DIR html documentation [DOCDIR] - --dvidir=DIR dvi documentation [DOCDIR] - --pdfdir=DIR pdf documentation [DOCDIR] - --psdir=DIR ps documentation [DOCDIR] -_ACEOF - - cat <<\_ACEOF -_ACEOF -fi - -if test -n "$ac_init_help"; then - case $ac_init_help in - short | recursive ) echo "Configuration of SPP 1.7:";; - esac - cat <<\_ACEOF - -Some influential environment variables: - CC C compiler command - CFLAGS C compiler flags - LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a - nonstandard directory <lib dir> - LIBS libraries to pass to the linker, e.g. -l<library> - CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I<include dir> if - you have headers in a nonstandard directory <include dir> - -Use these variables to override the choices made by `configure' or to help -it to find libraries and programs with nonstandard names/locations. - -_ACEOF -ac_status=$? -fi - -if test "$ac_init_help" = "recursive"; then - # If there are subdirs, report their specific --help. 
- for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue - test -d "$ac_dir" || - { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || - continue - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` - # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - cd "$ac_dir" || { ac_status=$?; continue; } - # Check for guested configure. - if test -f "$ac_srcdir/configure.gnu"; then - echo && - $SHELL "$ac_srcdir/configure.gnu" --help=recursive - elif test -f "$ac_srcdir/configure"; then - echo && - $SHELL "$ac_srcdir/configure" --help=recursive - else - $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 - fi || ac_status=$? - cd "$ac_pwd" || { ac_status=$?; break; } - done -fi - -test -n "$ac_init_help" && exit $ac_status -if $ac_init_version; then - cat <<\_ACEOF -SPP configure 1.7 -generated by GNU Autoconf 2.63 - -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. 
-This configure script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it. -_ACEOF - exit -fi -cat >config.log <<_ACEOF -This file contains any messages produced by compilers while -running configure, to aid debugging if configure makes a mistake. - -It was created by SPP $as_me 1.7, which was -generated by GNU Autoconf 2.63. Invocation command line was - - $ $0 $@ - -_ACEOF -exec 5>>config.log -{ -cat <<_ASUNAME -## --------- ## -## Platform. ## -## --------- ## - -hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` -uname -m = `(uname -m) 2>/dev/null || echo unknown` -uname -r = `(uname -r) 2>/dev/null || echo unknown` -uname -s = `(uname -s) 2>/dev/null || echo unknown` -uname -v = `(uname -v) 2>/dev/null || echo unknown` - -/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` -/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` - -/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` -/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` -/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` -/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` -/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` -/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` -/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` - -_ASUNAME - -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - $as_echo "PATH: $as_dir" -done -IFS=$as_save_IFS - -} >&5 - -cat >&5 <<_ACEOF - - -## ----------- ## -## Core tests. ## -## ----------- ## - -_ACEOF - - -# Keep a trace of the command line. -# Strip out --no-create and --no-recursion so they do not pile up. -# Strip out --silent because we don't want to record it for future runs. -# Also quote any args containing shell meta-characters. -# Make two passes to allow for proper duplicate-argument suppression. 
-ac_configure_args= -ac_configure_args0= -ac_configure_args1= -ac_must_keep_next=false -for ac_pass in 1 2 -do - for ac_arg - do - case $ac_arg in - -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil) - continue ;; - *\'*) - ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; - esac - case $ac_pass in - 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; - 2) - ac_configure_args1="$ac_configure_args1 '$ac_arg'" - if test $ac_must_keep_next = true; then - ac_must_keep_next=false # Got value, back to normal. - else - case $ac_arg in - *=* | --config-cache | -C | -disable-* | --disable-* \ - | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ - | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ - | -with-* | --with-* | -without-* | --without-* | --x) - case "$ac_configure_args0 " in - "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; - esac - ;; - -* ) ac_must_keep_next=true ;; - esac - fi - ac_configure_args="$ac_configure_args '$ac_arg'" - ;; - esac - done -done -$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } -$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } - -# When interrupted or exit'd, cleanup temporary files, and complete -# config.log. We remove comments because anyway the quotes in there -# would cause problems or look ugly. -# WARNING: Use '\'' to represent an apostrophe within the trap. -# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. -trap 'exit_status=$? - # Save into config.log some information that might help in debugging. - { - echo - - cat <<\_ASBOX -## ---------------- ## -## Cache variables. 
## -## ---------------- ## -_ASBOX - echo - # The following way of writing the cache mishandles newlines in values, -( - for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) $as_unset $ac_var ;; - esac ;; - esac - done - (set) 2>&1 | - case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - sed -n \ - "s/'\''/'\''\\\\'\'''\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" - ;; #( - *) - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) - echo - - cat <<\_ASBOX -## ----------------- ## -## Output variables. ## -## ----------------- ## -_ASBOX - echo - for ac_var in $ac_subst_vars - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - $as_echo "$ac_var='\''$ac_val'\''" - done | sort - echo - - if test -n "$ac_subst_files"; then - cat <<\_ASBOX -## ------------------- ## -## File substitutions. ## -## ------------------- ## -_ASBOX - echo - for ac_var in $ac_subst_files - do - eval ac_val=\$$ac_var - case $ac_val in - *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; - esac - $as_echo "$ac_var='\''$ac_val'\''" - done | sort - echo - fi - - if test -s confdefs.h; then - cat <<\_ASBOX -## ----------- ## -## confdefs.h. 
## -## ----------- ## -_ASBOX - echo - cat confdefs.h - echo - fi - test "$ac_signal" != 0 && - $as_echo "$as_me: caught signal $ac_signal" - $as_echo "$as_me: exit $exit_status" - } >&5 - rm -f core *.core core.conftest.* && - rm -f -r conftest* confdefs* conf$$* $ac_clean_files && - exit $exit_status -' 0 -for ac_signal in 1 2 13 15; do - trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal -done -ac_signal=0 - -# confdefs.h avoids OS command line length limits that DEFS can exceed. -rm -f -r conftest* confdefs.h - -# Predefined preprocessor variables. - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_NAME "$PACKAGE_NAME" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_TARNAME "$PACKAGE_TARNAME" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_VERSION "$PACKAGE_VERSION" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_STRING "$PACKAGE_STRING" -_ACEOF - - -cat >>confdefs.h <<_ACEOF -#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" -_ACEOF - - -# Let the site file select an alternate cache file if it wants to. -# Prefer an explicitly selected file to automatically selected ones. -ac_site_file1=NONE -ac_site_file2=NONE -if test -n "$CONFIG_SITE"; then - ac_site_file1=$CONFIG_SITE -elif test "x$prefix" != xNONE; then - ac_site_file1=$prefix/share/config.site - ac_site_file2=$prefix/etc/config.site -else - ac_site_file1=$ac_default_prefix/share/config.site - ac_site_file2=$ac_default_prefix/etc/config.site -fi -for ac_site_file in "$ac_site_file1" "$ac_site_file2" -do - test "x$ac_site_file" = xNONE && continue - if test -r "$ac_site_file"; then - { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 -$as_echo "$as_me: loading site script $ac_site_file" >&6;} - sed 's/^/| /' "$ac_site_file" >&5 - . "$ac_site_file" - fi -done - -if test -r "$cache_file"; then - # Some versions of bash will fail to source /dev/null (special - # files actually), so we avoid doing that. 
- if test -f "$cache_file"; then - { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5 -$as_echo "$as_me: loading cache $cache_file" >&6;} - case $cache_file in - [\\/]* | ?:[\\/]* ) . "$cache_file";; - *) . "./$cache_file";; - esac - fi -else - { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5 -$as_echo "$as_me: creating cache $cache_file" >&6;} - >$cache_file -fi - -# Check that the precious variables saved in the cache have kept the same -# value. -ac_cache_corrupted=false -for ac_var in $ac_precious_vars; do - eval ac_old_set=\$ac_cv_env_${ac_var}_set - eval ac_new_set=\$ac_env_${ac_var}_set - eval ac_old_val=\$ac_cv_env_${ac_var}_value - eval ac_new_val=\$ac_env_${ac_var}_value - case $ac_old_set,$ac_new_set in - set,) - { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,set) - { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 -$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} - ac_cache_corrupted=: ;; - ,);; - *) - if test "x$ac_old_val" != "x$ac_new_val"; then - # differences in whitespace do not lead to failure. 
- ac_old_val_w=`echo x $ac_old_val` - ac_new_val_w=`echo x $ac_new_val` - if test "$ac_old_val_w" != "$ac_new_val_w"; then - { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 -$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} - ac_cache_corrupted=: - else - { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 -$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} - eval $ac_var=\$ac_old_val - fi - { $as_echo "$as_me:$LINENO: former value: \`$ac_old_val'" >&5 -$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} - { $as_echo "$as_me:$LINENO: current value: \`$ac_new_val'" >&5 -$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} - fi;; - esac - # Pass precious variables to config.status. - if test "$ac_new_set" = set; then - case $ac_new_val in - *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; - *) ac_arg=$ac_var=$ac_new_val ;; - esac - case " $ac_configure_args " in - *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. 
- *) ac_configure_args="$ac_configure_args '$ac_arg'" ;; - esac - fi -done -if $ac_cache_corrupted; then - { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} - { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 -$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} - { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 -$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} - { (exit 1); exit 1; }; } -fi - - - - - - - - - - - - - - - - - - - - - - - - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu -if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. -set dummy ${ac_tool_prefix}gcc; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="${ac_tool_prefix}gcc" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:$LINENO: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$ac_cv_prog_CC"; then - ac_ct_CC=$CC - # Extract the first word of "gcc", so it can be a program name with args. -set dummy gcc; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_ac_ct_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_CC"; then - ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CC="gcc" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_CC=$ac_cv_prog_ac_ct_CC -if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - if test "x$ac_ct_CC" = x; then - CC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - CC=$ac_ct_CC - fi -else - CC="$ac_cv_prog_CC" -fi - -if test -z "$CC"; then - if test -n "$ac_tool_prefix"; then - # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. -set dummy ${ac_tool_prefix}cc; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. 
- for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="${ac_tool_prefix}cc" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:$LINENO: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - - fi -fi -if test -z "$CC"; then - # Extract the first word of "cc", so it can be a program name with args. -set dummy cc; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else - ac_prog_rejected=no -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then - ac_prog_rejected=yes - continue - fi - ac_cv_prog_CC="cc" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -if test $ac_prog_rejected = yes; then - # We found a bogon in the path, so make sure we never use it. - set dummy $ac_cv_prog_CC - shift - if test $# != 0; then - # We chose a different compiler from the bogus one. - # However, it has the same basename, so the bogon will be chosen - # first if we set CC to just the basename; use the full file name. 
- shift - ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" - fi -fi -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:$LINENO: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - -fi -if test -z "$CC"; then - if test -n "$ac_tool_prefix"; then - for ac_prog in cl.exe - do - # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. -set dummy $ac_tool_prefix$ac_prog; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$CC"; then - ac_cv_prog_CC="$CC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_CC="$ac_tool_prefix$ac_prog" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -CC=$ac_cv_prog_CC -if test -n "$CC"; then - { $as_echo "$as_me:$LINENO: result: $CC" >&5 -$as_echo "$CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$CC" && break - done -fi -if test -z "$CC"; then - ac_ct_CC=$CC - for ac_prog in cl.exe -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if test "${ac_cv_prog_ac_ct_CC+set}" = set; then - $as_echo_n "(cached) " >&6 -else - if test -n "$ac_ct_CC"; then - ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
-else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then - ac_cv_prog_ac_ct_CC="$ac_prog" - $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done -done -IFS=$as_save_IFS - -fi -fi -ac_ct_CC=$ac_cv_prog_ac_ct_CC -if test -n "$ac_ct_CC"; then - { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 -$as_echo "$ac_ct_CC" >&6; } -else - { $as_echo "$as_me:$LINENO: result: no" >&5 -$as_echo "no" >&6; } -fi - - - test -n "$ac_ct_CC" && break -done - - if test "x$ac_ct_CC" = x; then - CC="" - else - case $cross_compiling:$ac_tool_warned in -yes:) -{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 -$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} -ac_tool_warned=yes ;; -esac - CC=$ac_ct_CC - fi -fi - -fi - - -test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH -See \`config.log' for more details." >&5 -$as_echo "$as_me: error: no acceptable C compiler found in \$PATH -See \`config.log' for more details." >&2;} - { (exit 1); exit 1; }; }; } - -# Provide some information about the compiler. -$as_echo "$as_me:$LINENO: checking for C compiler version" >&5 -set X $ac_compile -ac_compiler=$2 -{ (ac_try="$ac_compiler --version >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compiler --version >&5") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -v >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compiler -v >&5") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } -{ (ac_try="$ac_compiler -V >&5" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compiler -V >&5") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } - -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" -# Try to create an executable without -o first, disregard a.out. -# It will help us diagnose broken compilers, and finding out an intuition -# of exeext. -{ $as_echo "$as_me:$LINENO: checking for C compiler default output file name" >&5 -$as_echo_n "checking for C compiler default output file name... 
" >&6; } -ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` - -# The possible output files: -ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" - -ac_rmfiles= -for ac_file in $ac_files -do - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; - * ) ac_rmfiles="$ac_rmfiles $ac_file";; - esac -done -rm -f $ac_rmfiles - -if { (ac_try="$ac_link_default" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_link_default") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. -# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' -# in a Makefile. We should not override ac_cv_exeext if it was cached, -# so that the user can short-circuit this test for compilers unknown to -# Autoconf. -for ac_file in $ac_files '' -do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) - ;; - [ab].out ) - # We found the default executable, but exeext='' is most - # certainly right. - break;; - *.* ) - if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; - then :; else - ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - fi - # We set ac_cv_exeext here because the later test for it is not - # safe: cross compilers may not add the suffix if given an `-o' - # argument, so we may need to know it at that point already. - # Even if this section looks crufty: it has the advantage of - # actually working. 
- break;; - * ) - break;; - esac -done -test "$ac_cv_exeext" = no && ac_cv_exeext= - -else - ac_file='' -fi - -{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5 -$as_echo "$ac_file" >&6; } -if test -z "$ac_file"; then - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -{ { $as_echo "$as_me:$LINENO: error: C compiler cannot create executables -See \`config.log' for more details." >&5 -$as_echo "$as_me: error: C compiler cannot create executables -See \`config.log' for more details." >&2;} - { (exit 77); exit 77; }; }; } -fi - -ac_exeext=$ac_cv_exeext - -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ $as_echo "$as_me:$LINENO: checking whether the C compiler works" >&5 -$as_echo_n "checking whether the C compiler works... " >&6; } -# FIXME: These cross compiler hacks should be removed for Autoconf 3.0 -# If not cross compiling, check that we can run a simple program. -if test "$cross_compiling" != yes; then - if { ac_try='./$ac_file' - { (case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_try") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; }; then - cross_compiling=no - else - if test "$cross_compiling" = maybe; then - cross_compiling=yes - else - { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -{ { $as_echo "$as_me:$LINENO: error: cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details." >&5 -$as_echo "$as_me: error: cannot run C compiled programs. -If you meant to cross compile, use \`--host'. -See \`config.log' for more details." 
>&2;} - { (exit 1); exit 1; }; }; } - fi - fi -fi -{ $as_echo "$as_me:$LINENO: result: yes" >&5 -$as_echo "yes" >&6; } - -rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out -ac_clean_files=$ac_clean_files_save -# Check that the compiler produces executables we can run. If not, either -# the compiler is broken, or we cross compile. -{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5 -$as_echo_n "checking whether we are cross compiling... " >&6; } -{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5 -$as_echo "$cross_compiling" >&6; } - -{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5 -$as_echo_n "checking for suffix of executables... " >&6; } -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_link") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - # If both `conftest.exe' and `conftest' are `present' (well, observable) -# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will -# work properly (i.e., refer to `conftest.exe'), while it won't with -# `rm'. -for ac_file in conftest.exe conftest conftest.*; do - test -f "$ac_file" || continue - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; - *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` - break;; - * ) break;; - esac -done -else - { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link -See \`config.log' for more details." >&5 -$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link -See \`config.log' for more details." 
>&2;} - { (exit 1); exit 1; }; }; } -fi - -rm -f conftest$ac_cv_exeext -{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5 -$as_echo "$ac_cv_exeext" >&6; } - -rm -f conftest.$ac_ext -EXEEXT=$ac_cv_exeext -ac_exeext=$EXEEXT -{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5 -$as_echo_n "checking for suffix of object files... " >&6; } -if test "${ac_cv_objext+set}" = set; then - $as_echo_n "(cached) " >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.o conftest.obj -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>&5 - ac_status=$? - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); }; then - for ac_file in conftest.o conftest.obj conftest.*; do - test -f "$ac_file" || continue; - case $ac_file in - *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; - *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` - break;; - esac -done -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - -{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 -$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} -{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile -See \`config.log' for more details." >&5 -$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile -See \`config.log' for more details." 
>&2;} - { (exit 1); exit 1; }; }; } -fi - -rm -f conftest.$ac_cv_objext conftest.$ac_ext -fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5 -$as_echo "$ac_cv_objext" >&6; } -OBJEXT=$ac_cv_objext -ac_objext=$OBJEXT -{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5 -$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } -if test "${ac_cv_c_compiler_gnu+set}" = set; then - $as_echo_n "(cached) " >&6 -else - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ -#ifndef __GNUC__ - choke me -#endif - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_compiler_gnu=yes -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_compiler_gnu=no -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -ac_cv_c_compiler_gnu=$ac_compiler_gnu - -fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5 -$as_echo "$ac_cv_c_compiler_gnu" >&6; } -if test $ac_compiler_gnu = yes; then - GCC=yes -else - GCC= -fi -ac_test_CFLAGS=${CFLAGS+set} -ac_save_CFLAGS=$CFLAGS -{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5 -$as_echo_n "checking whether $CC accepts -g... 
" >&6; } -if test "${ac_cv_prog_cc_g+set}" = set; then - $as_echo_n "(cached) " >&6 -else - ac_save_c_werror_flag=$ac_c_werror_flag - ac_c_werror_flag=yes - ac_cv_prog_cc_g=no - CFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_g=yes -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - CFLAGS="" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest.$ac_objext; then - : -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_c_werror_flag=$ac_save_c_werror_flag - CFLAGS="-g" - cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -int -main () -{ - - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_g=yes -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi - -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext - ac_c_werror_flag=$ac_save_c_werror_flag -fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5 -$as_echo "$ac_cv_prog_cc_g" >&6; } -if test "$ac_test_CFLAGS" = set; then - CFLAGS=$ac_save_CFLAGS -elif test $ac_cv_prog_cc_g = yes; then - if test "$GCC" = yes; then - CFLAGS="-g -O2" - else - CFLAGS="-g" - fi -else - if test "$GCC" = yes; then - CFLAGS="-O2" - else - CFLAGS= - fi -fi -{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5 -$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } -if test "${ac_cv_prog_cc_c89+set}" = set; then - $as_echo_n "(cached) " >&6 -else - ac_cv_prog_cc_c89=no -ac_save_CC=$CC -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. 
*/ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ -#include <stdarg.h> -#include <stdio.h> -#include <sys/types.h> -#include <sys/stat.h> -/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ -struct buf { int x; }; -FILE * (*rcsopen) (struct buf *, struct stat *, int); -static char *e (p, i) - char **p; - int i; -{ - return p[i]; -} -static char *f (char * (*g) (char **, int), char **p, ...) -{ - char *s; - va_list v; - va_start (v,p); - s = g (p, va_arg (v,int)); - va_end (v); - return s; -} - -/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has - function prototypes and stuff, but not '\xHH' hex character constants. - These don't provoke an error unfortunately, instead are silently treated - as 'x'. The following induces an error, until -std is added to get - proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an - array size at least. It's necessary to write '\x00'==0 to get something - that's true only with -std. */ -int osf4_cc_array ['\x00' == 0 ? 1 : -1]; - -/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters - inside strings and character constants. */ -#define FOO(x) 'x' -int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; - -int test (int i, double x); -struct s1 {int (*f) (int a);}; -struct s2 {int (*f) (double a);}; -int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); -int argc; -char **argv; -int -main () -{ -return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; - ; - return 0; -} -_ACEOF -for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ - -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" -do - CC="$ac_save_CC $ac_arg" - rm -f conftest.$ac_objext -if { (ac_try="$ac_compile" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_compile") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! -s conftest.err - } && test -s conftest.$ac_objext; then - ac_cv_prog_cc_c89=$ac_arg -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - -fi - -rm -f core conftest.err conftest.$ac_objext - test "x$ac_cv_prog_cc_c89" != "xno" && break -done -rm -f conftest.$ac_ext -CC=$ac_save_CC - -fi -# AC_CACHE_VAL -case "x$ac_cv_prog_cc_c89" in - x) - { $as_echo "$as_me:$LINENO: result: none needed" >&5 -$as_echo "none needed" >&6; } ;; - xno) - { $as_echo "$as_me:$LINENO: result: unsupported" >&5 -$as_echo "unsupported" >&6; } ;; - *) - CC="$CC $ac_cv_prog_cc_c89" - { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5 -$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; -esac - - -ac_ext=c -ac_cpp='$CPP $CPPFLAGS' -ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' -ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' -ac_compiler_gnu=$ac_cv_c_compiler_gnu - - - -{ $as_echo "$as_me:$LINENO: checking for BZ2_bzDecompressInit in -lbz2" >&5 -$as_echo_n 
"checking for BZ2_bzDecompressInit in -lbz2... " >&6; } -if test "${ac_cv_lib_bz2_BZ2_bzDecompressInit+set}" = set; then - $as_echo_n "(cached) " >&6 -else - ac_check_lib_save_LIBS=$LIBS -LIBS="-lbz2 $LIBS" -cat >conftest.$ac_ext <<_ACEOF -/* confdefs.h. */ -_ACEOF -cat confdefs.h >>conftest.$ac_ext -cat >>conftest.$ac_ext <<_ACEOF -/* end confdefs.h. */ - -/* Override any GCC internal prototype to avoid an error. - Use char because int might match the return type of a GCC - builtin and then its argument prototype would still apply. */ -#ifdef __cplusplus -extern "C" -#endif -char BZ2_bzDecompressInit (); -int -main () -{ -return BZ2_bzDecompressInit (); - ; - return 0; -} -_ACEOF -rm -f conftest.$ac_objext conftest$ac_exeext -if { (ac_try="$ac_link" -case "(($ac_try" in - *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; - *) ac_try_echo=$ac_try;; -esac -eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" -$as_echo "$ac_try_echo") >&5 - (eval "$ac_link") 2>conftest.er1 - ac_status=$? - grep -v '^ *+' conftest.er1 >conftest.err - rm -f conftest.er1 - cat conftest.err >&5 - $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 - (exit $ac_status); } && { - test -z "$ac_c_werror_flag" || - test ! 
-s conftest.err - } && test -s conftest$ac_exeext && { - test "$cross_compiling" = yes || - $as_test_x conftest$ac_exeext - }; then - ac_cv_lib_bz2_BZ2_bzDecompressInit=yes -else - $as_echo "$as_me: failed program was:" >&5 -sed 's/^/| /' conftest.$ac_ext >&5 - - ac_cv_lib_bz2_BZ2_bzDecompressInit=no -fi - -rm -rf conftest.dSYM -rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ - conftest$ac_exeext conftest.$ac_ext -LIBS=$ac_check_lib_save_LIBS -fi -{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_bz2_BZ2_bzDecompressInit" >&5 -$as_echo "$ac_cv_lib_bz2_BZ2_bzDecompressInit" >&6; } -if test "x$ac_cv_lib_bz2_BZ2_bzDecompressInit" = x""yes; then - cat >>confdefs.h <<_ACEOF -#define HAVE_LIBBZ2 1 -_ACEOF - - LIBS="-lbz2 $LIBS" - -fi - - -ac_config_files="$ac_config_files src/Makevars" - -cp confdefs.h src/config.h -cat >confcache <<\_ACEOF -# This file is a shell script that caches the results of configure -# tests run on this system so they can be shared between configure -# scripts and configure runs, see configure's option --config-cache. -# It is not useful on other systems. If it contains results you don't -# want to keep, you may remove or edit it. -# -# config.status only pays attention to the cache file if you give it -# the --recheck option to rerun configure. -# -# `ac_cv_env_foo' variables (set or unset) will be overridden when -# loading this file, other *unset* `ac_cv_foo' will be assigned the -# following values. - -_ACEOF - -# The following way of writing the cache mishandles newlines in values, -# but we know of no workaround that is simple, portable, and efficient. -# So, we kill variables containing newlines. -# Ultrix sh set writes to stderr and can't be redirected directly, -# and sets the high bit in the cache file unless we assign to the vars. 
-( - for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do - eval ac_val=\$$ac_var - case $ac_val in #( - *${as_nl}*) - case $ac_var in #( - *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 -$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; - esac - case $ac_var in #( - _ | IFS | as_nl) ;; #( - BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( - *) $as_unset $ac_var ;; - esac ;; - esac - done - - (set) 2>&1 | - case $as_nl`(ac_space=' '; set) 2>&1` in #( - *${as_nl}ac_space=\ *) - # `set' does not quote correctly, so add quotes (double-quote - # substitution turns \\\\ into \\, and sed turns \\ into \). - sed -n \ - "s/'/'\\\\''/g; - s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" - ;; #( - *) - # `set' quotes correctly as required by POSIX, so do not add quotes. - sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" - ;; - esac | - sort -) | - sed ' - /^ac_cv_env_/b end - t clear - :clear - s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ - t end - s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ - :end' >>confcache -if diff "$cache_file" confcache >/dev/null 2>&1; then :; else - if test -w "$cache_file"; then - test "x$cache_file" != "x/dev/null" && - { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5 -$as_echo "$as_me: updating cache $cache_file" >&6;} - cat confcache >$cache_file - else - { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5 -$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} - fi -fi -rm -f confcache - -test "x$prefix" = xNONE && prefix=$ac_default_prefix -# Let make expand exec_prefix. -test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' - -# Transform confdefs.h into DEFS. -# Protect against shell expansion while executing Makefile rules. -# Protect against Makefile macro expansion. 
-# -# If the first sed substitution is executed (which looks for macros that -# take arguments), then branch to the quote section. Otherwise, -# look for a macro that doesn't take arguments. -ac_script=' -:mline -/\\$/{ - N - s,\\\n,, - b mline -} -t clear -:clear -s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g -t quote -s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g -t quote -b any -:quote -s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g -s/\[/\\&/g -s/\]/\\&/g -s/\$/$$/g -H -:any -${ - g - s/^\n// - s/\n/ /g - p -} -' -DEFS=`sed -n "$ac_script" confdefs.h` - - -ac_libobjs= -ac_ltlibobjs= -for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue - # 1. Remove the extension, and $U if already installed. - ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' - ac_i=`$as_echo "$ac_i" | sed "$ac_script"` - # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR - # will be set to the directory where LIBOBJS objects are built. - ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext" - ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo' -done -LIBOBJS=$ac_libobjs - -LTLIBOBJS=$ac_ltlibobjs - - - -: ${CONFIG_STATUS=./config.status} -ac_write_fail=0 -ac_clean_files_save=$ac_clean_files -ac_clean_files="$ac_clean_files $CONFIG_STATUS" -{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 -$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} -cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -#! $SHELL -# Generated by $as_me. -# Run this file to recreate the current configuration. -# Compiler output produced by configure, useful for debugging -# configure, is in config.log if it exists. - -debug=false -ac_cs_recheck=false -ac_cs_silent=false -SHELL=\${CONFIG_SHELL-$SHELL} -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -## --------------------- ## -## M4sh Initialization. 
## -## --------------------- ## - -# Be more Bourne compatible -DUALCASE=1; export DUALCASE # for MKS sh -if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then - emulate sh - NULLCMD=: - # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which - # is contrary to our usage. Disable this feature. - alias -g '${1+"$@"}'='"$@"' - setopt NO_GLOB_SUBST -else - case `(set -o) 2>/dev/null` in - *posix*) set -o posix ;; -esac - -fi - - - - -# PATH needs CR -# Avoid depending upon Character Ranges. -as_cr_letters='abcdefghijklmnopqrstuvwxyz' -as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' -as_cr_Letters=$as_cr_letters$as_cr_LETTERS -as_cr_digits='0123456789' -as_cr_alnum=$as_cr_Letters$as_cr_digits - -as_nl=' -' -export as_nl -# Printing a long string crashes Solaris 7 /usr/bin/printf. -as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo -as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo -if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then - as_echo='printf %s\n' - as_echo_n='printf %s' -else - if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then - as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' - as_echo_n='/usr/ucb/echo -n' - else - as_echo_body='eval expr "X$1" : "X\\(.*\\)"' - as_echo_n_body='eval - arg=$1; - case $arg in - *"$as_nl"*) - expr "X$arg" : "X\\(.*\\)$as_nl"; - arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; - esac; - expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" - ' - export as_echo_n_body - as_echo_n='sh -c $as_echo_n_body as_echo' - fi - export as_echo_body - as_echo='sh -c $as_echo_body as_echo' -fi - -# The user is always right. -if test "${PATH_SEPARATOR+set}" != set; then - PATH_SEPARATOR=: - (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { - (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || - PATH_SEPARATOR=';' - } -fi - -# Support unset when possible. 
-if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then - as_unset=unset -else - as_unset=false -fi - - -# IFS -# We need space, tab and new line, in precisely that order. Quoting is -# there to prevent editors from complaining about space-tab. -# (If _AS_PATH_WALK were called with IFS unset, it would disable word -# splitting by setting IFS to empty value.) -IFS=" "" $as_nl" - -# Find who we are. Look in the path if we contain no directory separator. -case $0 in - *[\\/]* ) as_myself=$0 ;; - *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break -done -IFS=$as_save_IFS - - ;; -esac -# We did not find ourselves, most probably we were run as `sh COMMAND' -# in which case we are not to be found in the path. -if test "x$as_myself" = x; then - as_myself=$0 -fi -if test ! -f "$as_myself"; then - $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 - { (exit 1); exit 1; } -fi - -# Work around bugs in pre-3.0 UWIN ksh. -for as_var in ENV MAIL MAILPATH -do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var -done -PS1='$ ' -PS2='> ' -PS4='+ ' - -# NLS nuisances. -LC_ALL=C -export LC_ALL -LANGUAGE=C -export LANGUAGE - -# Required to use basename. -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then - as_basename=basename -else - as_basename=false -fi - - -# Name of the executable. -as_me=`$as_basename -- "$0" || -$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ - X"$0" : 'X\(//\)$' \| \ - X"$0" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X/"$0" | - sed '/^.*\/\([^/][^/]*\)\/*$/{ - s//\1/ - q - } - /^X\/\(\/\/\)$/{ - s//\1/ - q - } - /^X\/\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - -# CDPATH. 
-$as_unset CDPATH - - - - as_lineno_1=$LINENO - as_lineno_2=$LINENO - test "x$as_lineno_1" != "x$as_lineno_2" && - test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { - - # Create $as_me.lineno as a copy of $as_myself, but with $LINENO - # uniformly replaced by the line number. The first 'sed' inserts a - # line-number line after each line using $LINENO; the second 'sed' - # does the real work. The second script uses 'N' to pair each - # line-number line with the line containing $LINENO, and appends - # trailing '-' during substitution so that $LINENO is not a special - # case at line end. - # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the - # scripts with optimization help from Paolo Bonzini. Blame Lee - # E. McMahon (1931-1989) for sed's syntax. :-) - sed -n ' - p - /[$]LINENO/= - ' <$as_myself | - sed ' - s/[$]LINENO.*/&-/ - t lineno - b - :lineno - N - :loop - s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ - t loop - s/-\n.*// - ' >$as_me.lineno && - chmod +x "$as_me.lineno" || - { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 - { (exit 1); exit 1; }; } - - # Don't try to exec as it changes $[0], causing all sort of problems - # (the dirname of $[0] is not the place where we might find the - # original and so on. Autoconf is especially sensitive to this). - . "./$as_me.lineno" - # Exit status is that of the last command. - exit -} - - -if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then - as_dirname=dirname -else - as_dirname=false -fi - -ECHO_C= ECHO_N= ECHO_T= -case `echo -n x` in --n*) - case `echo 'x\c'` in - *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
- *) ECHO_C='\c';; - esac;; -*) - ECHO_N='-n';; -esac -if expr a : '\(a\)' >/dev/null 2>&1 && - test "X`expr 00001 : '.*\(...\)'`" = X001; then - as_expr=expr -else - as_expr=false -fi - -rm -f conf$$ conf$$.exe conf$$.file -if test -d conf$$.dir; then - rm -f conf$$.dir/conf$$.file -else - rm -f conf$$.dir - mkdir conf$$.dir 2>/dev/null -fi -if (echo >conf$$.file) 2>/dev/null; then - if ln -s conf$$.file conf$$ 2>/dev/null; then - as_ln_s='ln -s' - # ... but there are two gotchas: - # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. - # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. - # In both cases, we have to default to `cp -p'. - ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || - as_ln_s='cp -p' - elif ln conf$$.file conf$$ 2>/dev/null; then - as_ln_s=ln - else - as_ln_s='cp -p' - fi -else - as_ln_s='cp -p' -fi -rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file -rmdir conf$$.dir 2>/dev/null - -if mkdir -p . 2>/dev/null; then - as_mkdir_p=: -else - test -d ./-p && rmdir ./-p - as_mkdir_p=false -fi - -if test -x / >/dev/null 2>&1; then - as_test_x='test -x' -else - if ls -dL / >/dev/null 2>&1; then - as_ls_L_option=L - else - as_ls_L_option= - fi - as_test_x=' - eval sh -c '\'' - if test -d "$1"; then - test -d "$1/."; - else - case $1 in - -*)set "./$1";; - esac; - case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in - ???[sx]*):;;*)false;;esac;fi - '\'' sh - ' -fi -as_executable_p=$as_test_x - -# Sed expression to map a string onto a valid CPP name. -as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" - -# Sed expression to map a string onto a valid variable name. -as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" - - -exec 6>&1 - -# Save the log message, to keep $[0] and so on meaningful, and to -# report actual input values of CONFIG_FILES etc. instead of their -# values after options handling. 
-ac_log=" -This file was extended by SPP $as_me 1.7, which was -generated by GNU Autoconf 2.63. Invocation command line was - - CONFIG_FILES = $CONFIG_FILES - CONFIG_HEADERS = $CONFIG_HEADERS - CONFIG_LINKS = $CONFIG_LINKS - CONFIG_COMMANDS = $CONFIG_COMMANDS - $ $0 $@ - -on `(hostname || uname -n) 2>/dev/null | sed 1q` -" - -_ACEOF - -case $ac_config_files in *" -"*) set x $ac_config_files; shift; ac_config_files=$*;; -esac - - - -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -# Files that config.status was made for. -config_files="$ac_config_files" - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -ac_cs_usage="\ -\`$as_me' instantiates files from templates according to the -current configuration. - -Usage: $0 [OPTION]... [FILE]... - - -h, --help print this help, then exit - -V, --version print version number and configuration settings, then exit - -q, --quiet, --silent - do not print progress messages - -d, --debug don't remove temporary files - --recheck update $as_me by reconfiguring in the same conditions - --file=FILE[:TEMPLATE] - instantiate the configuration file FILE - -Configuration files: -$config_files - -Report bugs to <bug-autoconf@gnu.org>." - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_cs_version="\\ -SPP config.status 1.7 -configured by $0, generated by GNU Autoconf 2.63, - with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" - -Copyright (C) 2008 Free Software Foundation, Inc. -This config.status script is free software; the Free Software Foundation -gives unlimited permission to copy, distribute and modify it." - -ac_pwd='$ac_pwd' -srcdir='$srcdir' -test -n "\$AWK" || AWK=awk -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# The default lists apply if the user does not specify any file. 
-ac_need_defaults=: -while test $# != 0 -do - case $1 in - --*=*) - ac_option=`expr "X$1" : 'X\([^=]*\)='` - ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` - ac_shift=: - ;; - *) - ac_option=$1 - ac_optarg=$2 - ac_shift=shift - ;; - esac - - case $ac_option in - # Handling of the options. - -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) - ac_cs_recheck=: ;; - --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) - $as_echo "$ac_cs_version"; exit ;; - --debug | --debu | --deb | --de | --d | -d ) - debug=: ;; - --file | --fil | --fi | --f ) - $ac_shift - case $ac_optarg in - *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; - esac - CONFIG_FILES="$CONFIG_FILES '$ac_optarg'" - ac_need_defaults=false;; - --he | --h | --help | --hel | -h ) - $as_echo "$ac_cs_usage"; exit ;; - -q | -quiet | --quiet | --quie | --qui | --qu | --q \ - | -silent | --silent | --silen | --sile | --sil | --si | --s) - ac_cs_silent=: ;; - - # This is an error. - -*) { $as_echo "$as_me: error: unrecognized option: $1 -Try \`$0 --help' for more information." >&2 - { (exit 1); exit 1; }; } ;; - - *) ac_config_targets="$ac_config_targets $1" - ac_need_defaults=false ;; - - esac - shift -done - -ac_configure_extra_args= - -if $ac_cs_silent; then - exec 6>/dev/null - ac_configure_extra_args="$ac_configure_extra_args --silent" -fi - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -if \$ac_cs_recheck; then - set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion - shift - \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 - CONFIG_SHELL='$SHELL' - export CONFIG_SHELL - exec "\$@" -fi - -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -exec 5>>config.log -{ - echo - sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX -## Running $as_me. 
## -_ASBOX - $as_echo "$ac_log" -} >&5 - -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 - -# Handling of arguments. -for ac_config_target in $ac_config_targets -do - case $ac_config_target in - "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;; - - *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 -$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;} - { (exit 1); exit 1; }; };; - esac -done - - -# If the user did not use the arguments to specify the items to instantiate, -# then the envvar interface is used. Set only those that are not. -# We use the long form for the default assignment because of an extremely -# bizarre bug on SunOS 4.1.3. -if $ac_need_defaults; then - test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files -fi - -# Have a temporary directory for convenience. Make it in the build tree -# simply because there is no reason against having it here, and in addition, -# creating and moving files from /tmp can sometimes cause problems. -# Hook for its removal unless debugging. -# Note that there is a small window in which the directory will not be cleaned: -# after its creation but before its name has been assigned to `$tmp'. -$debug || -{ - tmp= - trap 'exit_status=$? - { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status -' 0 - trap '{ (exit 1); exit 1; }' 1 2 13 15 -} -# Create a (secure) tmp directory for tmp files. - -{ - tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && - test -n "$tmp" && test -d "$tmp" -} || -{ - tmp=./conf$$-$RANDOM - (umask 077 && mkdir "$tmp") -} || -{ - $as_echo "$as_me: cannot create a temporary directory in ." >&2 - { (exit 1); exit 1; } -} - -# Set up the scripts for CONFIG_FILES section. -# No need to generate them if there are no CONFIG_FILES. -# This happens for instance with `./config.status config.h'. 
-if test -n "$CONFIG_FILES"; then - - -ac_cr=' ' -ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null` -if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then - ac_cs_awk_cr='\\r' -else - ac_cs_awk_cr=$ac_cr -fi - -echo 'BEGIN {' >"$tmp/subs1.awk" && -_ACEOF - - -{ - echo "cat >conf$$subs.awk <<_ACEOF" && - echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && - echo "_ACEOF" -} >conf$$subs.sh || - { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 -$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } -ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'` -ac_delim='%!_!# ' -for ac_last_try in false false false false false :; do - . ./conf$$subs.sh || - { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 -$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } - - ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` - if test $ac_delim_n = $ac_delim_num; then - break - elif $ac_last_try; then - { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 -$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } - else - ac_delim="$ac_delim!$ac_delim _$ac_delim!! 
" - fi -done -rm -f conf$$subs.sh - -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -cat >>"\$tmp/subs1.awk" <<\\_ACAWK && -_ACEOF -sed -n ' -h -s/^/S["/; s/!.*/"]=/ -p -g -s/^[^!]*!// -:repl -t repl -s/'"$ac_delim"'$// -t delim -:nl -h -s/\(.\{148\}\).*/\1/ -t more1 -s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ -p -n -b repl -:more1 -s/["\\]/\\&/g; s/^/"/; s/$/"\\/ -p -g -s/.\{148\}// -t nl -:delim -h -s/\(.\{148\}\).*/\1/ -t more2 -s/["\\]/\\&/g; s/^/"/; s/$/"/ -p -b -:more2 -s/["\\]/\\&/g; s/^/"/; s/$/"\\/ -p -g -s/.\{148\}// -t delim -' <conf$$subs.awk | sed ' -/^[^""]/{ - N - s/\n// -} -' >>$CONFIG_STATUS || ac_write_fail=1 -rm -f conf$$subs.awk -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -_ACAWK -cat >>"\$tmp/subs1.awk" <<_ACAWK && - for (key in S) S_is_set[key] = 1 - FS = "" - -} -{ - line = $ 0 - nfields = split(line, field, "@") - substed = 0 - len = length(field[1]) - for (i = 2; i < nfields; i++) { - key = field[i] - keylen = length(key) - if (S_is_set[key]) { - value = S[key] - line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) - len += length(value) + length(field[++i]) - substed = 1 - } else - len += 1 + keylen - } - - print line -} - -_ACAWK -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then - sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" -else - cat -fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \ - || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5 -$as_echo "$as_me: error: could not setup config files machinery" >&2;} - { (exit 1); exit 1; }; } -_ACEOF - -# VPATH may cause trouble with some makes, so we remove $(srcdir), -# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and -# trailing colons and then remove the whole line if VPATH becomes empty -# (actually we leave an empty line to preserve line numbers). 
-if test "x$srcdir" = x.; then - ac_vpsub='/^[ ]*VPATH[ ]*=/{ -s/:*\$(srcdir):*/:/ -s/:*\${srcdir}:*/:/ -s/:*@srcdir@:*/:/ -s/^\([^=]*=[ ]*\):*/\1/ -s/:*$// -s/^[^=]*=[ ]*$// -}' -fi - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -fi # test -n "$CONFIG_FILES" - - -eval set X " :F $CONFIG_FILES " -shift -for ac_tag -do - case $ac_tag in - :[FHLC]) ac_mode=$ac_tag; continue;; - esac - case $ac_mode$ac_tag in - :[FHL]*:*);; - :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5 -$as_echo "$as_me: error: invalid tag $ac_tag" >&2;} - { (exit 1); exit 1; }; };; - :[FH]-) ac_tag=-:-;; - :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; - esac - ac_save_IFS=$IFS - IFS=: - set x $ac_tag - IFS=$ac_save_IFS - shift - ac_file=$1 - shift - - case $ac_mode in - :L) ac_source=$1;; - :[FH]) - ac_file_inputs= - for ac_f - do - case $ac_f in - -) ac_f="$tmp/stdin";; - *) # Look for the file first in the build tree, then in the source tree - # (if the path is not absolute). The absolute path cannot be DOS-style, - # because $ac_f cannot contain `:'. - test -f "$ac_f" || - case $ac_f in - [\\/$]*) false;; - *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; - esac || - { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5 -$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;} - { (exit 1); exit 1; }; };; - esac - case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac - ac_file_inputs="$ac_file_inputs '$ac_f'" - done - - # Let's still pretend it is `configure' which instantiates (i.e., don't - # use $as_me), people would be surprised to read: - # /* config.h. Generated by config.status. */ - configure_input='Generated from '` - $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' - `' by configure.' - if test x"$ac_file" != x-; then - configure_input="$ac_file. 
$configure_input" - { $as_echo "$as_me:$LINENO: creating $ac_file" >&5 -$as_echo "$as_me: creating $ac_file" >&6;} - fi - # Neutralize special characters interpreted by sed in replacement strings. - case $configure_input in #( - *\&* | *\|* | *\\* ) - ac_sed_conf_input=`$as_echo "$configure_input" | - sed 's/[\\\\&|]/\\\\&/g'`;; #( - *) ac_sed_conf_input=$configure_input;; - esac - - case $ac_tag in - *:-:* | *:-) cat >"$tmp/stdin" \ - || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 -$as_echo "$as_me: error: could not create $ac_file" >&2;} - { (exit 1); exit 1; }; } ;; - esac - ;; - esac - - ac_dir=`$as_dirname -- "$ac_file" || -$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$ac_file" : 'X\(//\)[^/]' \| \ - X"$ac_file" : 'X\(//\)$' \| \ - X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || -$as_echo X"$ac_file" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - { as_dir="$ac_dir" - case $as_dir in #( - -*) as_dir=./$as_dir;; - esac - test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || { - as_dirs= - while :; do - case $as_dir in #( - *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( - *) as_qdir=$as_dir;; - esac - as_dirs="'$as_qdir' $as_dirs" - as_dir=`$as_dirname -- "$as_dir" || -$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ - X"$as_dir" : 'X\(//\)[^/]' \| \ - X"$as_dir" : 'X\(//\)$' \| \ - X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || -$as_echo X"$as_dir" | - sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ - s//\1/ - q - } - /^X\(\/\/\)[^/].*/{ - s//\1/ - q - } - /^X\(\/\/\)$/{ - s//\1/ - q - } - /^X\(\/\).*/{ - s//\1/ - q - } - s/.*/./; q'` - test -d "$as_dir" && break - done - test -z "$as_dirs" || eval "mkdir $as_dirs" - } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5 -$as_echo "$as_me: error: cannot create directory $as_dir" >&2;} - { (exit 1); exit 1; }; }; } - ac_builddir=. - -case "$ac_dir" in -.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; -*) - ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` - # A ".." for each directory in $ac_dir_suffix. - ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` - case $ac_top_builddir_sub in - "") ac_top_builddir_sub=. ac_top_build_prefix= ;; - *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; - esac ;; -esac -ac_abs_top_builddir=$ac_pwd -ac_abs_builddir=$ac_pwd$ac_dir_suffix -# for backward compatibility: -ac_top_builddir=$ac_top_build_prefix - -case $srcdir in - .) # We are building in place. - ac_srcdir=. - ac_top_srcdir=$ac_top_builddir_sub - ac_abs_top_srcdir=$ac_pwd ;; - [\\/]* | ?:[\\/]* ) # Absolute name. - ac_srcdir=$srcdir$ac_dir_suffix; - ac_top_srcdir=$srcdir - ac_abs_top_srcdir=$srcdir ;; - *) # Relative name. - ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix - ac_top_srcdir=$ac_top_build_prefix$srcdir - ac_abs_top_srcdir=$ac_pwd/$srcdir ;; -esac -ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix - - - case $ac_mode in - :F) - # - # CONFIG_FILE - # - -_ACEOF - -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -# If the template does not know about datarootdir, expand it. -# FIXME: This hack should be removed a few years after 2.60. 
-ac_datarootdir_hack=; ac_datarootdir_seen= - -ac_sed_dataroot=' -/datarootdir/ { - p - q -} -/@datadir@/p -/@docdir@/p -/@infodir@/p -/@localedir@/p -/@mandir@/p -' -case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in -*datarootdir*) ac_datarootdir_seen=yes;; -*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) - { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 -$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} -_ACEOF -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 - ac_datarootdir_hack=' - s&@datadir@&$datadir&g - s&@docdir@&$docdir&g - s&@infodir@&$infodir&g - s&@localedir@&$localedir&g - s&@mandir@&$mandir&g - s&\\\${datarootdir}&$datarootdir&g' ;; -esac -_ACEOF - -# Neutralize VPATH when `$srcdir' = `.'. -# Shell code in configure.ac might set extrasub. -# FIXME: do we really want to maintain this feature? -cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 -ac_sed_extra="$ac_vpsub -$extrasub -_ACEOF -cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 -:t -/@[a-zA-Z_][a-zA-Z_0-9]*@/!b -s|@configure_input@|$ac_sed_conf_input|;t t -s&@top_builddir@&$ac_top_builddir_sub&;t t -s&@top_build_prefix@&$ac_top_build_prefix&;t t -s&@srcdir@&$ac_srcdir&;t t -s&@abs_srcdir@&$ac_abs_srcdir&;t t -s&@top_srcdir@&$ac_top_srcdir&;t t -s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t -s&@builddir@&$ac_builddir&;t t -s&@abs_builddir@&$ac_abs_builddir&;t t -s&@abs_top_builddir@&$ac_abs_top_builddir&;t t -$ac_datarootdir_hack -" -eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \ - || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 -$as_echo "$as_me: error: could not create $ac_file" >&2;} - { (exit 1); exit 1; }; } - -test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && - { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } && - { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z 
"$ac_out"; } && - { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined." >&5 -$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' -which seems to be undefined. Please make sure it is defined." >&2;} - - rm -f "$tmp/stdin" - case $ac_file in - -) cat "$tmp/out" && rm -f "$tmp/out";; - *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";; - esac \ - || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 -$as_echo "$as_me: error: could not create $ac_file" >&2;} - { (exit 1); exit 1; }; } - ;; - - - - esac - -done # for ac_tag - - -{ (exit 0); exit 0; } -_ACEOF -chmod +x $CONFIG_STATUS -ac_clean_files=$ac_clean_files_save - -test $ac_write_fail = 0 || - { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5 -$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;} - { (exit 1); exit 1; }; } - - -# configure is writing to config.log, and then calls config.status. -# config.status does its own redirection, appending to config.log. -# Unfortunately, on DOS this fails, as config.log is still kept open -# by configure, so config.status won't be able to write to it; its -# output is simply discarded. So we exec the FD to /dev/null, -# effectively closing config.log, so it can be properly (re)opened and -# appended to by config.status. When coming back to configure, we -# need to make the FD available again. -if test "$no_create" != yes; then - ac_cs_success=: - ac_config_status_args= - test "$silent" = yes && - ac_config_status_args="$ac_config_status_args --quiet" - exec 5>/dev/null - $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false - exec 5>>config.log - # Use ||, not &&, to avoid exiting from the if with $? = 1, which - # would make configure fail if this is the last instruction. 
- $ac_cs_success || { (exit 1); exit 1; } -fi -if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then - { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 -$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} -fi -
--- a/spp/configure.ac Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -AC_INIT([SPP], 1.7) - -AC_CHECK_LIB(bz2, BZ2_bzDecompressInit) -AC_SUBST(HAVE_LIBBZ2) -AC_CONFIG_FILES([src/Makevars]) -cp confdefs.h src/config.h -AC_OUTPUT
--- a/spp/man/add.broad.peak.regions.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -\name{add.broad.peak.regions} -\alias{add.broad.peak.regions} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Calculate chromosome-wide profiles of smoothed tag density } -\description{ - Looks for broader regions of enrichment associated with the determined - peak positions, adds them to the $npl data as $rs, $re columns. -} -\usage{ -add.broad.peak.regions(signal.tags, control.tags, binding.postions,window.size=500,z.thr=2) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output - of \code{\link{select.informative.tags}}) } - \item{control.tags}{ optional control (input) tags } - \item{binding.positions}{ output of find.binding.positions call } - \item{window.size}{ window size to be used in calculating enrichment } - \item{z.thr}{ Z-score corresponding to the Poisson ratio threshold - used to flag significantly enriched windows} -} -\value{ - A structure identical to binding.postions with two additional columns - added (rs and re) corresponding to start and end of the associated - significantly enriched region. If no region was associated with a - particular peak, NA values are reported. -}
--- a/spp/man/find.binding.positions.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -\name{find.binding.positions} -\alias{find.binding.positions} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Determine significant point protein binding positions (peaks) } -\description{ - Given the signal and optional control (input) data, determine location of the - statistically significant point binding positions. If the control data - is not provided, the statistical significance can be assessed based on - tag randomization. The method also provides options for masking - regions exhibiting strong signals within the control data. -} -\usage{ -find.binding.positions(signal.data, e.value = NULL, fdr = NULL, masked.data = NULL, control.data = NULL, min.dist = 200, window.size = 4e+07, cluster = NULL, debug = T, n.randomizations = 3, shuffle.window = 1, min.thr = 0, topN = NULL, tag.count.whs = 100, enrichment.z = 2, method = tag.wtd, tec.filter = T, tec.window.size = 10000, tec.masking.window.size=tec.window.size, tec.z = 5, tec.poisson.z=5,tec.poisson.ratio=5, n.control.samples = 1, enrichment.background.scales = c(1, 5, 10), background.density.scaling = F, use.randomized.controls = F, ...) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - ~~ tag data ~~ - \item{signal.data}{ signal tag vector list } - \item{control.data}{ optional control (input) tag vector list } - - ~~ position stringency criteria ~~ - \item{e.value}{ E-value defining the desired statistical significance - of binding positions. } - \item{fdr}{ FDR defining statistical significance of binding positions } - \item{topN}{ instead of determining statistical significance - thresholds, return the specified number of highest-scoring - positions} - - ~~ other params ~~ - \item{whs}{ window half-sized that should be used for binding - detection (e.g. 
determined from cross-correlation profiles)} - \item{masked.data}{ optional set of coordinates that should be masked - (e.g. known non-unique regions) } - \item{min.dist}{ minimal distance that must separate detected binding - positions. In case multiple binding positions are detected within - such distance, the position with the highest score is returned. } - \item{window.size}{ size of the window used to segment the chromosome - during calculations to reduce memory usage. } - \item{cluster}{ optional \code{snow} cluster to parallelize the - processing on } - \item{min.thr}{ minimal score requirement for a peak } - \item{background.density.scaling}{ If TRUE, regions of significant tag - enrichment will be masked out when calculating size ratio of the - signal to control datasets (to estimate ratio of the background tag - density). If FALSE, the dataset ratio will be equal to the ratio of - the number of tags in each dataset.} - - ~~ randomized controls ~~ - \item{n.randomizations}{ number of tag randomizations that should be - performed (when the control data is not provided) } - \item{use.randomized.controls}{ Use randomized tag control, even if - \code{control.data} is supplied. } - \item{shuffle.window}{ during tag randomizations, tags will be split - into groups of \code{shuffle.window} and will be maintained - together throughout the randomization. } - - ~~ fold-enrichment confidence intervals ~~ - \item{tag.count.whs}{ half-size of a window used to assess fold - enrichment of a binding position} - \item{enrichment.z}{ Z-score used to define the significance level of - the fold-enrichment confidence intervals } - \item{enrichment.background.scales}{ In estimating the peak - fold-enrichment confidence intervals, the background tag density is - estimated based on windows with half-sizes of - \code{2*tag.count.whs*enrichment.background.scales}. 
} - \item{method}{ either \code{tag.wtd} for WTD method, or - \code{tag.lwcc} for MTC method} - \item{mle.filter}{ If turned on, will exclude predicted positions - whose MLE enrichment ratio (for any of the background scales) is - below a specified min.mle.threshold } - \item{min.mle.threshold}{ MLE enrichment ratio threshold that each - predicted position must exceed if mle.filter is turned on. } - - ~~ masking regions of significant control enrichment ~~ - \item{tec.filter}{ Whether to mask out the regions exhibiting - significant enrichment in the control data in doing other - calculations. The regions are identified using Poisson statistics - within sliding windows, either relative to the scaled signal (tec.z), or - relative to randomly-distributed expectation (tec.poisson.z).} - \item{tec.window.size}{ size of the window used to determine - significantly enriched control regions } - \item{tec.masking.window.size}{ size of the window used to mask - the area around significantly enriched control regions } - \item{tec.z}{ Z-score defining statistical stringency by which a given - window is determined to be significantly higher in the input than in - the signal, and masked if that is the case.} - \item{tec.poisson.z}{ Z-score defining statistical stringency by which a given - window is determined to be significantly higher than the - tec.poisson.ratio above the expected uniform input background. } - \item{tec.poisson.ratio}{ Fold ratio by which input must exceed the - level expected from the uniform distribution. } - - - - -} -\value{ - \item{npl}{A per-chromosome list containing data frames describing - determined binding positions. Column description: - \item{x}{ position } - \item{y}{ score } - \item{evalue}{ E-value } - \item{fdr}{ FDR. For peaks higher than the maximum control peak, - the highest dataset FDR is reported } - \item{enr}{ lower bound of the fold-enrichment ratio confidence - interval. This is the estimate determined using scale of - 1. 
Estimates corresponding to higher scales are returned in other enr columns - with scale appearing in the name.} - \item{enr.mle}{ enrichment ratio maximum likelihood estimate } - } - \item{thr}{ info on the chosen statistical threshold of the peak scores} -} - -\examples{ - # find binding positions using WTD method, 200bp half-window size, - # control data, 1% FDR - bp <- -find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.wtd,whs=200); - - # find binding positions using MTC method, using 5 tag randomizations, - # keeping pairs of tag positions together (shuffle.window=2) - bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.lwcc,whs=200,use.randomized.controls=T,n.randomizations=5,shuffle.window=2) - - # print out the number of determined positions - print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks")); - - -} \ No newline at end of file
--- a/spp/man/get.binding.characteristics.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ -\name{get.binding.characteristics} -\alias{get.binding.characteristics} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Calculate characteristics of observed DNA-binding signal from - cross-correlation profiles } -\description{ - The method calculates strand cross-correlation profile to determine binding - peak separation distance and approximate window size that should be used - for binding detection. If quality scores were given for the tags, - it also determines which quality bins improve the cross-correlation pattern. -} -\usage{ -get.binding.characteristics(data, srange = c(50, 500), bin = 5, cluster = NULL, debug = F, min.tag.count = 1000, acceptance.z.score = 3, remove.tag.anomalies = T, anomalies.z = 5,accept.all.tags=F) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{data}{ Tag/quality data: output of \code{read.eland.tags} or similar function } - \item{srange}{ A range within which the binding peak separation is - expected to fall. Should be larger than probe size to avoid artifacts. } - \item{bin}{ Resolution (in basepairs) at which cross-correlation - should be calculated. bin=1 is ideal, but takes longer to calculate. } - \item{cluster}{ optional snow cluster for parallel processing } - \item{debug}{ whether to print debug messages } - \item{min.tag.count}{ minimal number of tags on the chromosome to be - considered in the cross-correlation calculations } - \item{acceptance.z.score}{ A Z-score used to determine if a given tag - quality bin provides significant improvement to the strand cross-correlation } - \item{remove.tag.anomalies}{ Whether to remove singular tag count peaks prior to - calculation. This is recommended, since such positions may distort the - cross-correlation profile and increase the necessary computational time. 
} - \item{anomalies.z}{ Z-score for determining if the number of tags at a - given position is significantly higher above background, and should be - considered an anomaly.} - \item{accept.all.tags}{ Whether tag alignment quality calculations - should be skipped and all available tags should be accepted in the - downstream analysis.} -} -\value{ - \item{cross.correlation }{ Cross-correlation profile as an $x/$y data.frame} - \item{peak }{Position ($x) and height ($y) of automatically detected - cross-correlation peak.} - \item{whs} { Optimized window half-size for binding detection (based - on the width of the cross-correlation peak) } - \item{quality.bin.acceptance} { A list structure, describing the - effect of inclusion of different tag quality bins on - cross-correlation, and a resolution on which bins should be - considered. - \item{informative.bins} { A boolean vector indicating whether the - inclusion of tags from the tag quality bin specified in the name - attribute significantly increases cross-correlation profile near - the peak.} - \item{quality.cc} { A list giving the cross-correlation profile - after the inclusion of the tags from different quality bins } - } -}
--- a/spp/man/get.broad.enrichment.clusters.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -\name{get.broad.enrichment.clusters} -\alias{get.broad.enrichment.clusters} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Determine broad clusters of enrichment } -\description{ - Scan chromosomes with a pre-defined window size, comparing scaled ChIP - and input tag counts to see if their ratio exceeds that expected from - a Poisson process (normalized for dataset size). -} -\usage{ -get.broad.enrichment.clusters(chip.tags, input.tags, window.size=1e3,z.thr=3,tag.shift=146/2) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{chip.tags}{ foreground tag vector list } - \item{input.tags}{ background tag vector list } - \item{window.size}{ window size to be used for tag counting } - \item{z.thr}{ Z-score to be used as a significance threshold } - \item{tag.shift}{ number of base pairs by which positive and negative - tag coordinates should be shifted towards each other (half of binding - peak separation distance)} -} -\value{ - A list of elements corresponding to chromosomes, with each element - being an $s/$e/$rv data.frame giving the starting, ending positions and the log2 - enrichment estimate for that region. -}
--- a/spp/man/get.conservative.fold.enrichment.profile.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,59 +0,0 @@ -\name{get.conservative.fold.enrichment.profile} -\alias{get.conservative.fold.enrichment.profile} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Estimate minimal fold enrichment/depletion along the chromosomes } -\description{ - The method provides a statistical assessment of enrichment/depletion - along the chromosomes. To assess tag density enrichment/depletion, a - sliding window of a specified size (\code{fws}) is used to calculate - the density of the foreground tags (\code{ftl}). Multiple, typically - larger windows are used to estimate background tag (\code{btl}) density around the - same location. The densities are compared as ratios of two Poisson - processes to estimate lower bound of foreground enrichment, or upper - bound of foreground depletion. If multiple window sizes were used to - estimate the background tag density, the most conservative one is - chosen for each point. -} -\usage{ -get.conservative.fold.enrichment.profile(ftl, btl, fws, bwsl = c(1, 5, 25, 50) * fws, step = 50, tag.shift = 146/2, alpha = 0.05, use.most.informative.scale = F, quick.calculation = T) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{ftl}{ foreground tag vector list } - \item{btl}{ background tag vector list } - \item{fws}{ foreground window size } - \item{bwsl}{ background window scales. The size(s) of background windows - will be \code{fws*bwsl}. 
} - \item{step}{ spacing between positions at which the - enrichment/depletion is evaluated } - \item{tag.shift}{ number of basepairs by which positive and negative - tag coordinates should be shifted towards each other (half of binding - peak separation distance)} - \item{alpha}{ desired level of statistical significance } - \item{use.most.informative.scale}{ for each position, instead of - evaluating enrichment ratio bounds for all background window scales, - choose the one with the highest observed density to speed up the calculations} - \item{quick.calculation}{ Use square root transformation method - instead of a Bayesian method. This speeds up the calculation - considerably and is turned on by default. } - \item{background.density.scaling}{ If TRUE, regions of significant tag - enrichment will be masked out when calculating size ratio of the - signal to control datasets (to estimate ratio of the background tag - density). If FALSE, the dataset ratio will be equal to the ratio of - the number of tags in each dataset.} -} -\value{ - A list of elements corresponding to chromosomes, with each element - being an $x/$y data.frame giving the position and the log2 - conservative estimate of enrichment/depletion fold ratios around that - position. - Use \code{\link{writewig}} to output the structure to a WIG - file. -} -\references{ R.M.Price, D.G. Bonett "Estimating the ratio of two Poisson - rates", Comp. Stat & Data Anal. 32(2000) 345} -\seealso{ \code{\link{get.smoothed.tag.density}} } -\examples{ - enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01); - writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale"); -}
--- a/spp/man/get.mser.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -\name{get.mser} -\alias{get.mser} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Calculate minimal saturated enrichment fold ratio } -\description{ - Determine if the dataset has reached absolute saturation, or otherwise - find minimal fold enrichment ratio above which the detection of peaks - has stabilized enough to meet the saturation criteria. -} -\usage{ -get.mser(signal.data, control.data, n.chains = 5, step.size = 1e+05, chains = NULL, cluster = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), n.steps = 1, ...) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{signal.data}{ signal tag vector list } - \item{control.data}{ control tag vector list } - \item{n.chains}{ number of dataset subsamples to use } - \item{step.size}{ subsampling step describing the saturation - criteria. The criteria requires the set of detected binding sites to - be stable (as described by the \code{test.agreement} param) when the - number of tags in the dataset is reduced by \code{step.size}. The - value can either be an integer above one, in which case it specifies a fixed - number of tags, or a real value below one, in which case it - specifies the fraction of tags that should be removed (e.g. 0.1 will - remove 10% of tags). - } - \item{test.agreement}{ Fraction of the detected peaks that should - agree between the full and subsampled datasets. } - \item{chains}{ optional parameter, giving pre-calculated chains } - \item{cluster}{ optional \code{snow} cluster to parallelize processing } - - \item{return.chains}{ whether subsampled dataset results should be returned as - well } - \item{enrichment.background.scales}{ one or multiple window scales at - which the background tag density should be assessed. 
See - \code{enrichment.background.scales} in - \code{\link{find.binding.positions}}. If multiple scales are provided, - multiple MSER estimates will be returned.} - \item{\dots}{ additional parameters should be the same as those passed - to the \code{\link{find.binding.positions}}} -} -\value{ - A single, or multiple (if multiple \code{enrichment.background.scales} were - provided) MSER value. A value of 1 or very close to it implies that - the dataset has reached absolute saturation based on the given criteria. -}
--- a/spp/man/get.mser.interpolation.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -\name{get.mser.interpolation} -\alias{get.mser.interpolation} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Interpolate MSER dependency on the tag count } -\description{ - MSER generally decreases with increasing sequencing depth. This - function interpolates the dependency of MSER on tag counts as a - log-log linear function. The log-log fit is used to estimate the depth - of sequencing required to reach desired \code{target.fold.enrichment}. -} -\usage{ -get.mser.interpolation(signal.data, control.data, target.fold.enrichment = 5, n.chains = 10, n.steps = 6, step.size = 1e+05, chains = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), excluded.steps = c(seq(2, n.steps - 2)), ...) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{signal.data}{ signal chromosome tag vector list } - \item{control.data}{ control chromosome tag vector list } - \item{target.fold.enrichment}{ target MSER for which the depth should - be estimated} - \item{n.steps}{ number of steps in each subset chain. } - \item{step.size}{ Either number of tags or fraction of the dataset - size, see \code{step.size} parameter for \code{\link{get.mser}}. } - \item{test.agreement}{ Fraction of the detected peaks that should - agree between the full and subsampled datasets. See \code{test.agreement} parameter for \code{\link{get.mser}}} - \item{n.chains}{ number of random subset chains } - \item{chains}{ optional structure of pre-calculated chains - (e.g. generated by an earlier call with \code{return.chains=T}.} - - \item{return.chains}{ whether to return peak predictions calculated on - random chains. 
These can be passed back using \code{chains} argument - to skip subsampling/prediction steps, and just recalculate the depth - estimate for a different MSER.} - \item{enrichment.background.scales}{ see \code{enrichment.background.scales} parameter for \code{\link{get.mser}} } - \item{excluded.steps}{ Intermediate subsampling steps that should be excluded from - the chains to speed up the calculation. By default, all intermediate - steps except for first two and last two are skipped. Adding - intermediate steps improves interpolation at the expense of - computational time.} - \item{\dots}{ additional parameters are passed to \code{\link{get.mser}} } -} -\details{ - To simulate sequencing growth, the method calculates peak predictions - on random chains. Each chain is produced by sequential random - subsampling of the original data. The number of steps in the chain - indicates how many times the random subsampling will be performed. -} -\value{ - Normally returns a list, specifying for each background scale: - \item{prediction}{estimated sequencing depth required to reach - specified target MSER} - \item{log10.fit}{linear fit model, a result of \code{lm()} call} - - If \code{return.chains=T}, the above structure is returned under - \code{interpolation} field, along with \code{chains} field containing - results of \code{\link{find.binding.positions}} calls on subsampled chains. -}
--- a/spp/man/get.smoothed.enrichment.mle.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,35 +0,0 @@ -\name{get.smoothed.enrichment.mle} -\alias{get.smoothed.enrichment.mle} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Calculate chromosome-wide profiles of smoothed enrichment estimate } -\description{ - Given signal and control tag positions, the method calculates log2 - signal to control enrichment estimates (maximum likelihood) for each - chromosome, based on the smoothed tag density profile (see \link{get.smoothed.tag.density}). -} -\usage{ -get.smoothed.enrichment.mle(signal.tags, control.tags, bandwidth = 150,tag.shift = 146/2, step = 50) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output - of \code{\link{select.informative.tags}}) } - \item{control.tags}{ control (input) tags } - \item{pseudocount}{ pseudocount value to be added to tag density - - defaults to 1 } - other parameters (such as bandwidth, step and tag.shift) are - passed to \link{get.smoothed.tag.density} - see appropriate reference - for details. -} -\value{ - A list of elements corresponding to chromosomes, with each element - being an $x/$y data.frame giving the position and associated - log2 signal/control enrichment estimate. -} -\seealso{ \code{\link{writewig}} } -\examples{ - # get smoothed enrichment estimate profile using 500bp bandwidth at - # 50bp steps - smoothed.M <- get.smoothed.enrichment.mle(chip.data,bandwidth=500,step=50); - writewig(smoothed.M,"example.smoothedM.wig","Example smoothed log2 intensity ratio estimate"); -} \ No newline at end of file
--- a/spp/man/get.smoothed.tag.density.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -\name{get.smoothed.tag.density} -\alias{get.smoothed.tag.density} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Calculate chromosome-wide profiles of smoothed tag density } -\description{ - Given tag positions, the method calculates for each chromosome a tag - density profile, smoothed by the Gaussian kernel. If the optional - control tags are provided, the difference between ChIP and control tag - density is returned. -} -\usage{ -get.smoothed.tag.density(signal.tags, control.tags = NULL, bandwidth = 150, bg.weight = NULL, tag.shift = 146/2, step = round(bandwidth/3)) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output - of \code{\link{select.informative.tags}}) } - \item{control.tags}{ optional control (input) tags } - \item{bandwidth}{ standard deviation of the Gaussian kernel } - \item{bg.weight}{ optional weight by which the background density - should be multiplied for scaling. If not supplied, the weight is - calculated based on the ratio of the reduced ChIP to input dataset sizes. } - \item{tag.shift}{ Distance by which the positive and negative strand - tags should be shifted towards each other. This - normally corresponds to the half of the cross-correlation peak - position (e.g. \code{get.binding.characteristics()}$peak$x/2) } - \item{step}{ The distance between the regularly spaced points for - which the values should be calculated. } - \item{background.density.scaling}{ If TRUE, regions of significant tag - enrichment will be masked out when calculating size ratio of the - signal to control datasets (to estimate ratio of the background tag - density). 
If FALSE, the dataset ratio will be equal to the ratio of - the number of tags in each dataset.} -} -\value{ - A list of elements corresponding to chromosomes, with each element - being an $x/$y data.frame giving the position and associated tag - density. Use \code{\link{writewig}} to output the structure to a WIG - file. -} -\seealso{ \code{\link{writewig}} } -\examples{ - smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2)); - writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density"); -} \ No newline at end of file
--- a/spp/man/output.binding.results.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -\name{output.binding.results} -\alias{output.binding.results} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Write out determined binding peaks into a text file table } -\description{ - Writes out determined binding positions into a text file. The file - will contain a table with each row corresponding to a detected - position, with the following columns: - \item{chr}{ chromosome or target sequence } - \item{pos}{ position of detected binding site on the chromosome/sequence} - \item{score}{a score reflecting magnitude of the binding} - \item{Evalue}{E-value corresponding to the peak magnitude} - \item{FDR}{FDR corresponding to the peak magnitude} - \item{enrichment.lb}{lower bound of the fold-enrichment ratio} - \item{enrichment.mle}{maximum likelihood estimate of the fold-enrichment ratio} -} -\usage{ -output.binding.results(results, filename) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{results}{ output of the \code{\link{find.binding.positions}} } - \item{filename}{ file name } -}
--- a/spp/man/read.bam.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,24 +0,0 @@ -\name{read.bam.tags} -\alias{read.bam.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read BAM alignment file } -\description{ - Reads in aligned reads from BAM file. Note: no split (non-unique) - alignments should be reported in the BAM file. -} -\usage{ -read.bam.tags(filename, read.tag.names = F, fix.chromosome.names = F) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ BAM file } - \item{read.tag.names}{ Whether the tag names should be read in } - \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of - the sequence names } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Number of mismatches } - \item{names }{ Tag names, if \code{read.tag.names} was set } -} \ No newline at end of file
--- a/spp/man/read.bin.maqmap.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -\name{read.bin.maqmap.tags} -\alias{read.bin.maqmap.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read MAQ binary alignment map file } -\description{ - Reads in MAQ binary map alignment result file -} -\usage{ -read.bin.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ MAQ map output file (binary) } - \item{read.tag.names}{ Whether the tag names should be read in } - \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of - the sequence names } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Number of mismatches } - \item{names }{ Tag names, if \code{read.tag.names} was set } -} \ No newline at end of file
--- a/spp/man/read.bowtie.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -\name{read.bowtie.tags} -\alias{read.bowtie.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read bowtie text alignment output file } -\description{ - Reads in bowtie alignment results in text format -} -\usage{ -read.bowtie.tags(filename, read.tag.names = F, fix.chromosome.names = F) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ bowtie text output file } - \item{read.tag.names}{ Whether the tag names should be read in } - \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of - the sequence names } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Number of mismatches } - \item{names }{ Tag names, if \code{read.tag.names} was set } -} \ No newline at end of file
--- a/spp/man/read.eland.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,30 +0,0 @@ -\name{read.eland.tags} -\alias{read.eland.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read eland output file } -\description{ - Reads in ELAND output file, returning 5'-end tag coordinates and - number of mismatches associated with each mapped tag. -} -\usage{ -read.eland.tags(filename, read.tag.names = F, fix.chromosome.names = T, max.eland.tag.length = -1,extended=F) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ ELAND output file } - \item{read.tag.names}{ Whether the tag names should be read in } - \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of - the sequence names } - \item{max.eland.tag.length}{ Specifies max length of the tag sequence - considered by ELAND. This needs to be specified if the tags are - longer than the sequences considered by ELAND during alignment. } - \item{extended}{ Whether the file is written out in "extended" format - provided in GA pipeline 1.0. } - \item{multi}{ Whether the file is written in "multi" format, showing multiple alignments of the reads } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Number of mismatches } - \item{names }{ Tag names, if \code{read.tag.names} was set } -}
--- a/spp/man/read.maqmap.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,23 +0,0 @@ -\name{read.maqmap.tags} -\alias{read.maqmap.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read MAQ text alignment output file } -\description{ - Reads in MAQ alignment results in text format (that results from "maq mapview" command.) -} -\usage{ -read.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ MAQ text output file } - \item{read.tag.names}{ Whether the tag names should be read in } - \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of - the sequence names } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Number of mismatches } - \item{names }{ Tag names, if \code{read.tag.names} was set } -} \ No newline at end of file
--- a/spp/man/read.meland.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -\name{read.meland.tags} -\alias{read.meland.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Read modified BED tag alignment file that contains variable match - length information } -\description{ - Reads in an extended BED tag alignment file. An example line is given below: - \code{49 . U1 . 1 . . 23 chr2 -234567} - The line above specifies that a 23-bp portion of the tag with id 49 was - aligned with 1 mismatch to the negative strand of chr2 at position 234567. -} -\usage{ -read.meland.tags(filename, read.tag.names = F, fix.chromosome.names = T) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{filename}{ name of the extended BED file } - \item{read.tag.names}{ whether to read in tag names } - \item{fix.chromosome.names}{ whether to remove ".fa" from the sequence - name ends. } -} -\value{ - \item{tags }{ A vector of 5' tag coordinates, with negative values - corresponding to tags mapped to the negative strand. } - \item{quality }{ Quality expressed as a float x.y, where x is - tag.length - aligned.tag.portion.length, and y is the number of - mismatches (must be less than 10). } - \item{names }{ Tag names, if \code{read.tag.names} was set } -} \ No newline at end of file
--- a/spp/man/remove.local.tag.anomalies.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -\name{remove.local.tag.anomalies} -\alias{remove.local.tag.anomalies} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Restrict or remove positions with too many tags relative to - local background. } -\description{ - In Solexa ChIP-seq experiments some anomalous positions contain - extremely high number of tags at the exact coordinates. The function - scans the chromosomes, determining local tag density based on a - provided \code{window.size}, doing two types of corrections: - 1. removing all tags from positions that exceed local density by - \code{eliminate.fold}; 2. reducing the tag count at positions - exceeding \code{cap.fold} to the maximal allowed count. The - statistical significance of counts exceeding either of these two - threshold densities is calculated based on Poisson model, with - confidence interval determined by the \code{z.threshold} Z-score parameter. -} -\usage{ -remove.local.tag.anomalies(tags, window.size = 200, eliminate.fold = 10, cap.fold = 4, z.threshold = 3) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{tags}{ Chromosome-list of tag vectors } - \item{window.size}{ Size of the window used to assess local - density. Increasing the window size considerably beyond the size of - the binding features will result in flattened profiles, with bound - positions exhibiting a difference of just 1 tag beyond the background. 
} - \item{eliminate.fold}{ Threshold defining fold-over background - density above which the position is considered anomalous and removed - completely.} - \item{cap.fold}{ Threshold fold-over background density above which - the position is capped to the maximum statistically likely given - local tag density } - \item{z.threshold}{ Z-score used to assess significance of a given - position exceeding either of the two density thresholds. } -} -\value{ - A modified chromosome-wise tag vector list. -} -\references{ ~put references to the literature/web site here ~ } - -\note{ ~~further notes~~ - Increasing window.size to very large values will result in flat - profiles similar to those described by Zhang et al. "Model-based - Analysis of ChIP-Seq (MACS)." Genome Biol. 2008 Sep 17;9(9):R137. -}
--- a/spp/man/select.informative.tags.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -\name{select.informative.tags} -\alias{select.informative.tags} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Choose informative tags } -\description{ - For datasets with tag alignment quality information (e.g. number of - mismatches for Eland alignments), - \code{\link{get.binding.characteristics}} determines whether inclusion - of tags from each specific quality bin improves the cross-correlation - profile. The present function is then used to actually select these - informative tags, discarding all other information, including quality - scores that are not used in further processing. -} -\usage{ -select.informative.tags(data, binding.characteristics) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{data}{ Full alignment data (a list with $tags and $quality elements) } - \item{binding.characteristics}{ result of a - \code{\link{get.binding.characteristics}} call. If NULL value is - supplied, all tags will be accepted. } -} -\value{ - A chromosome-wise tag list. Each element of the list corresponds to a - chromosome and is a numeric vector of 5' tag coordinates, with sign - designating DNA strand. - This form of tag data is used for most of the other processing. -}
--- a/spp/man/spp-package.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -\name{spp-package} -\alias{spp-package} -\alias{spp} -\docType{package} -\title{ -ChIP-seq (Solexa) Processing Pipeline -} -\description{ -A set of routines for reading short sequence alignments, calculating tag -density, estimates of statistically significant enrichment/depletion -along the chromosome, identifying point binding positions (peaks), and -characterizing saturation properties related to sequencing depth. -} -\details{ -\tabular{ll}{ -Package: \tab spp\cr -Type: \tab Package\cr -Version: \tab 1.8\cr -Date: \tab 2008-11-14\cr -License: \tab What license is it under?\cr -LazyLoad: \tab yes\cr -} -See example below for typical processing sequence. -} -\author{Peter Kharchenko <peter.kharchenko@post.harvard.edu>} -\references{ -Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq -experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508 -} - -\examples{ - - # load the library - library(spp); - - ## The following section shows how to initialize a cluster of 8 nodes for parallel processing - ## To enable parallel processing, uncomment the next three lines, and comment out "cluster<-NULL"; - ## see "snow" package manual for details. 
- #library(snow) - #cluster <- makeCluster(2); - #invisible(clusterCall(cluster,source,"routines.r")); - cluster <- NULL; - - - - # read in tag alignments - chip.data <- read.eland.tags("chip.eland.alignment"); - input.data <- read.eland.tags("input.eland.alignment"); - - # get binding info from cross-correlation profile - # srange gives the possible range for the size of the protected region; - # srange should be higher than tag length; making the upper boundary too high will increase calculation time - # - # bin - bin tags within the specified number of basepairs to speed up calculation; - # increasing bin size decreases the accuracy of the determined parameters - binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5,cluster=cluster); - - - # plot cross-correlation profile - pdf(file="example.crosscorrelation.pdf",width=5,height=5) - par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8); - plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation"); - abline(v=binding.characteristics$peak$x,lty=2,col=2) - dev.off(); - - # select informative tags based on the binding characteristics - chip.data <- select.informative.tags(chip.data,binding.characteristics); - input.data <- select.informative.tags(input.data,binding.characteristics); - - # restrict or remove positions with anomalous number of tags relative - # to the local density - chip.data <- remove.local.tag.anomalies(chip.data); - input.data <- remove.local.tag.anomalies(input.data); - - - # output smoothed tag density (subtracting re-scaled input) into a WIG file - # note that the tags are shifted by half of the peak separation distance - smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2)); - writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density"); - rm(smoothed.density); - - # output 
conservative enrichment estimates - # alpha specifies significance level at which confidence intervals will be estimated - enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01); - writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale"); - rm(enrichment.estimates); - - - # binding detection parameters - # desired FDR. Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter - fdr <- 1e-2; - # the binding.characteristics contains the optimized half-size for binding detection window - detection.window.halfsize <- binding.characteristics$whs; - - # determine binding positions using wtd method - bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize,cluster=cluster) - - # alternatively determine binding positions using lwcc method (note: this takes longer than wtd) - # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster) - - print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks")); - - # output detected binding positions - output.binding.results(bp,"example.binding.positions.txt"); - - - # ------------------------------------------------------------------------------------------- - # the set of commands in the following section illustrates methods for saturation analysis - # these are separated from the previous section, since they are highly CPU intensive - # ------------------------------------------------------------------------------------------- - - # determine MSER - # note: this will take approximately 10-15x the amount of time the initial binding detection did - # The saturation criterion here is 0.99 consistency in the set of binding positions when adding 1e5 
tags. - # To ensure convergence the number of subsampled chains (n.chains) should be higher (80) - mser <- get.mser(chip.data,input.data,step.size=1e5,test.agreement=0.99,n.chains=8,cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) - - print(paste("MSER at a current depth is",mser)); - - # note: an MSER value of 1 or very near one implies that the set of detected binding positions satisfies saturation criteria without - # additional selection by fold-enrichment ratios. In other words, the dataset has reached saturation in a traditional sense (absolute saturation). - - # interpolate MSER dependency on tag count - # note: this requires considerably more calculations than the previous steps (~ 3x more than the first MSER calculation) - # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached - # The interpolation will be based on the difference in MSER at the current depth, and a depth at 5e5 fewer tags (n.steps=6); - # evaluation of the intermediate points is omitted here to speed up the calculation (excluded.steps parameter) - # A total of 7 chains is used here to speed up calculation, whereas a higher number of chains (50) would give good convergence - msers <- get.mser.interpolation(chip.data,input.data,step.size=1e5,test.agreement=0.99, target.fold.enrichment=2, n.chains=7,n.steps=6,excluded.steps=c(2:4),cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) - - print(paste("predicted sequencing depth =",round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags")) - - - # note: the interpolation will return NA prediction if the dataset has reached absolute saturation at the current depth. 
- # note: use return.chains=T to also calculate random chains (returned under msers$chains field) - these can be passed back as - # "get.mser.interpolation( ..., chains=msers$chains)" to calculate predictions for another target.fold.enrichment value - # without having to recalculate the random chain predictions. - - ## stop cluster if it was initialized - #stopCluster(cluster); - - - -}
--- a/spp/man/write.broadpeak.info.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,16 +0,0 @@ -\name{write.broadpeak.info} -\alias{write.broadpeak.info} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Write out determined broad enrichment regions using broadPeak format } -\description{ - Writes out broad regions of enrichment determined by the - get.broad.enrichment.clusters method in a broadPeak format. -} -\usage{ -write.broadpeak.info(broadpeak.results, filename) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{broadpeak.results}{ output of the \code{\link{get.broad.enrichment.clusters}} } - \item{filename}{ file name } -}
--- a/spp/man/write.narrowpeak.binding.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -\name{write.narrowpeak.binding} -\alias{write.narrowpeak.binding} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ Write out determined binding peaks using narrowPeak format } -\description{ - Writes out determined binding positions into a narrowPeak file. - The region will correspond to associated broad enrichment region, if - such were added using add.broad.peak.regions method. Otherwise the - region size will be determined using margin (which defaults to the - window half size that was used to determine binding positions) -} -\usage{ -write.narrowpeak.binding(results, filename,margin=results$whs) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{results}{ output of the \code{\link{find.binding.positions}} } - \item{filename}{ file name } - \item{margin}{ explicit value of the margin to be used if the broad - region information is absent (defaults to peak detection window half-size)} -}
--- a/spp/man/writewig.Rd Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,31 +0,0 @@ -\name{writewig} -\alias{writewig} -%- Also NEED an '\alias' for EACH other topic documented here. -\title{ A function to save a list of chromosome-wise x/y data frames - into a WIG file format. } -\description{ - Takes a list that contains an $x and $y data.frame for a number of - chromosomes and writes it out to a WIG BED style format. -} -\usage{ -writewig(dat, fname, feature, threshold = 5, zip = F) -} -%- maybe also 'usage' for other objects documented here. -\arguments{ - \item{dat}{ Chromosome coordinate-value data. \code{dat} is a list, - each member of a list is a data frame with $x and $y columns - containing chromosome positions and associated values. The names of - the list elements correspond to the chromosomes. } - \item{fname}{ Filename to which the output should be written } - \item{feature}{ Data description to be incorporated into the WIG header } - \item{threshold}{ Optional threshold to be saved in the WIG file} - \item{zip}{ Whether to invoke a zip program to compress the file } -} - -\seealso{ ~~objects to See Also as \code{\link{help}}, ~~~ } -\examples{ - -data <- list("chr1"=data.frame(x=c(100,130,200),y=c(1.2,4.0,2.3))); -writewig(data,"filename"); - -}
--- a/spp/src/BGZF.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,398 +0,0 @@ -// *************************************************************************** -// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#include <BGZF.h> -using namespace BamTools; - -#include <algorithm> -using namespace std; - -BgzfData::BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , IsWriteOnly(false) - , IsWriteUncompressed(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) -{ - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n"); - exit(1); - } -} - -// destructor -BgzfData::~BgzfData(void) { - if( CompressedBlock ) delete[] CompressedBlock; - if( UncompressedBlock ) delete[] UncompressedBlock; -} - -// closes BGZF file -void BgzfData::Close(void) { - - // skip if file not open, otherwise set flag - if ( !IsOpen ) return; - - // if writing to file, flush the current BGZF block, - // then write an empty block (as EOF marker) - if ( IsWriteOnly ) { - FlushBlock(); - int blockLength = DeflateBlock(); - fwrite(CompressedBlock, 1, 
blockLength, Stream); - } - - // flush and close - fflush(Stream); - fclose(Stream); - IsWriteUncompressed = false; - IsOpen = false; -} - -// compresses the current block -int BgzfData::DeflateBlock(void) { - - // initialize the gzip header - char* buffer = CompressedBlock; - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // set compression level - const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); - - // loop to retry for blocks that do not compress enough - int inputLength = BlockOffset; - int compressedLength = 0; - unsigned int bufferSize = CompressedBlockSize; - - while ( true ) { - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { - fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - - deflateEnd(&zs); - - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if( inputLength < 0 ) { - fprintf(stderr, "BGZF ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - compressedLength = zs.total_out; - 
compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if ( compressedLength > MAX_BLOCK_SIZE ) { - fprintf(stderr, "BGZF ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = BlockOffset - inputLength; - if ( remaining > 0 ) { - if ( remaining > inputLength ) { - fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n"); - exit(1); - } - memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); - } - - BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BgzfData::FlushBlock(void) { - - // flush all of the remaining blocks - while ( BlockOffset > 0 ) { - - // compress the data block - int blockLength = DeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); - - if ( numBytesWritten != blockLength ) { - fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - BlockAddress += blockLength; - } -} - -// de-compresses the current block -int BgzfData::InflateBlock(const int& blockLength) { - - // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)UncompressedBlock; - zs.avail_out = UncompressedBlockSize; - - int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - 
fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } - - status = inflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - inflateEnd(&zs); - fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; - } - - status = inflateEnd(&zs); - if ( status != Z_OK ) { - fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; - } - - return zs.total_out; -} - -// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { - - // determine open mode - if ( strcmp(mode, "rb") == 0 ) - IsWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) - IsWriteOnly = true; - else { - fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode); - return false; - } - - // ---------------------------------------------------------------- - // open Stream to read to/write from file, stdin, or stdout - // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - - // read/write BGZF data to/from a file - if ( (filename != "stdin") && (filename != "stdout") ) - Stream = fopen(filename.c_str(), mode); - - // read BGZF data from stdin - else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) - Stream = freopen(NULL, mode, stdin); - - // write BGZF data to stdout - else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) - Stream = freopen(NULL, mode, stdout); - - if ( !Stream ) { - fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() ); - return false; - } - - // set flags, return success - IsOpen = true; - IsWriteUncompressed = isWriteUncompressed; - return true; -} - -// reads BGZF data into a byte buffer -int BgzfData::Read(char* data, const unsigned int dataLength) { - - if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; - - char* output = data; - unsigned int numBytesRead = 0; - 
while ( numBytesRead < dataLength ) { - - int bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; - bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) break; - } - - char* buffer = UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + BlockOffset, copyLength); - - BlockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Stream); - BlockOffset = 0; - BlockLength = 0; - } - - return numBytesRead; -} - -// reads a BGZF block -bool BgzfData::ReadBlock(void) { - - char header[BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftell64(Stream); - - int count = fread(header, 1, sizeof(header), Stream); - if ( count == 0 ) { - BlockLength = 0; - return true; - } - - if ( count != sizeof(header) ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n"); - return false; - } - - if ( !BgzfData::CheckBlockHeader(header) ) { - fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n"); - return false; - } - - int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = CompressedBlock; - memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); - int remaining = blockLength - BLOCK_HEADER_LENGTH; - - count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if ( count != remaining ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n"); - return false; - } - - count = InflateBlock(blockLength); - if ( count < 0 ) { - fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n"); - return false; - } - - if ( BlockLength != 0 ) - BlockOffset = 0; - - BlockAddress = blockAddress; - BlockLength = count; - return true; -} - -// seek to position in BGZF file -bool BgzfData::Seek(int64_t position) { - - if ( 
!IsOpen ) return false; - - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - - if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { - fprintf(stderr, "BGZF ERROR: unable to seek in file\n"); - return false; - } - - BlockLength = 0; - BlockAddress = blockAddress; - BlockOffset = blockOffset; - return true; -} - -// get file position in BGZF file -int64_t BgzfData::Tell(void) { - if ( !IsOpen ) - return false; - else - return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); -} - -// writes the supplied data into the BGZF buffer -unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { - - if ( !IsOpen || !IsWriteOnly ) return false; - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = UncompressedBlockSize; - - // copy the data to the buffer - while ( numBytesWritten < dataLen ) { - - unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); - char* buffer = UncompressedBlock; - memcpy(buffer + BlockOffset, input, copyLength); - - BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if ( BlockOffset == blockLength ) - FlushBlock(); - } - - return numBytesWritten; -}
--- a/spp/src/BGZF.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,322 +0,0 @@ -// *************************************************************************** -// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#ifndef BGZF_H -#define BGZF_H - -#include <api_global.h> -#include <zlib.h> - -#include <cstdio> -#include <cstdlib> -#include <cstring> -#include <string> - -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include <stdint.h> - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// zlib constants -const int GZIP_ID1 = 31; -const int GZIP_ID2 = 139; -const int CM_DEFLATE = 8; -const int FLG_FEXTRA = 4; -const int OS_UNKNOWN = 255; -const int BGZF_XLEN = 6; -const int BGZF_ID1 = 66; -const int BGZF_ID2 = 67; -const int BGZF_LEN = 2; -const 
int GZIP_WINDOW_BITS = -15; -const int Z_DEFAULT_MEM_LEVEL = 8; - -// BZGF constants -const int BLOCK_HEADER_LENGTH = 18; -const int BLOCK_FOOTER_LENGTH = 8; -const int MAX_BLOCK_SIZE = 65536; -const int DEFAULT_BLOCK_SIZE = 65536; - -struct API_EXPORT BgzfData { - - // data members - public: - unsigned int UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - bool IsWriteUncompressed; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // constructor & destructor - public: - BgzfData(void); - ~BgzfData(void); - - // main interface methods - public: - // closes BGZF file - void Close(void); - // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); - // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); - // seek to position in BGZF file - bool Seek(int64_t position); - // get file position in BGZF file - int64_t Tell(void); - // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); - - // internal methods - private: - // compresses the current block - int DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - int InflateBlock(const int& blockLength); - // reads a BGZF block - bool ReadBlock(void); - - // static 'utility' methods - public: - // checks BGZF block header - static inline bool CheckBlockHeader(char* header); - // packs an unsigned integer into the specified buffer - static inline void PackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void PackUnsignedShort(char* buffer, unsigned short value); - // unpacks a buffer into a double - static inline double 
UnpackDouble(char* buffer); - static inline double UnpackDouble(const char* buffer); - // unpacks a buffer into a float - static inline float UnpackFloat(char* buffer); - static inline float UnpackFloat(const char* buffer); - // unpacks a buffer into a signed int - static inline signed int UnpackSignedInt(char* buffer); - static inline signed int UnpackSignedInt(const char* buffer); - // unpacks a buffer into a signed short - static inline signed short UnpackSignedShort(char* buffer); - static inline signed short UnpackSignedShort(const char* buffer); - // unpacks a buffer into an unsigned int - static inline unsigned int UnpackUnsignedInt(char* buffer); - static inline unsigned int UnpackUnsignedInt(const char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short UnpackUnsignedShort(char* buffer); - static inline unsigned short UnpackUnsignedShort(const char* buffer); -}; - -// ------------------------------------------------------------- -// static 'utility' method implementations - -// checks BGZF block header -inline -bool BgzfData::CheckBlockHeader(char* header) { - return (header[0] == GZIP_ID1 && - header[1] == (char)GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); -} - -// 'packs' an unsigned integer into the specified buffer -inline -void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); -} - -// 'packs' an unsigned short into the specified buffer -inline -void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); -} - -// 'unpacks' a buffer into a double (includes both non-const & const char* flavors) -inline 
-double BgzfData::UnpackDouble(char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -inline -double BgzfData::UnpackDouble(const char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -// 'unpacks' a buffer into a float (includes both non-const & const char* flavors) -inline -float BgzfData::UnpackFloat(char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -float BgzfData::UnpackFloat(const char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors) -inline -signed int BgzfData::UnpackSignedInt(char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -signed int BgzfData::UnpackSignedInt(const 
char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors) -inline -signed short BgzfData::UnpackSignedShort(char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -signed short BgzfData::UnpackSignedShort(const char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors) -inline -unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors) -inline -unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline 
-unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -} // namespace BamTools - -#endif // BGZF_H
--- a/spp/src/BamAlignment.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,696 +0,0 @@ -// *************************************************************************** -// BamAlignment.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 13 December 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the BamAlignment data structure -// *************************************************************************** - -#include <BamAlignment.h> -using namespace BamTools; - -#include <cctype> -#include <cstdio> -#include <cstdlib> -#include <cstring> -#include <exception> -#include <map> -#include <utility> -using namespace std; - -// default ctor -BamAlignment::BamAlignment(void) - : RefID(-1) - , Position(-1) - , MateRefID(-1) - , MatePosition(-1) - , InsertSize(0) -{ } - -// copy ctor -BamAlignment::BamAlignment(const BamAlignment& other) - : Name(other.Name) - , Length(other.Length) - , QueryBases(other.QueryBases) - , AlignedBases(other.AlignedBases) - , Qualities(other.Qualities) - , TagData(other.TagData) - , RefID(other.RefID) - , Position(other.Position) - , Bin(other.Bin) - , MapQuality(other.MapQuality) - , AlignmentFlag(other.AlignmentFlag) - , CigarData(other.CigarData) - , MateRefID(other.MateRefID) - , MatePosition(other.MatePosition) - , InsertSize(other.InsertSize) - , SupportData(other.SupportData) -{ } - -// dtor -BamAlignment::~BamAlignment(void) { } - -// Queries against alignment flags -bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } -bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } -bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } -bool BamAlignment::IsMapped(void) const { return ( 
(AlignmentFlag & UNMAPPED) == 0 ); } -bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } -bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } -bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } -bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } -bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } -bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } -bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - -// Manipulate alignment flags -void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } -void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } -void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } -void BamAlignment::SetIsMapped(bool ok) { SetIsUnmapped(!ok); } -void BamAlignment::SetIsMateMapped(bool ok) { SetIsMateUnmapped(!ok); } -void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } -void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } -void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } -void BamAlignment::SetIsPrimaryAlignment(bool ok) { SetIsSecondaryAlignment(!ok); } -void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } -void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } -void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag 
|= SECONDARY; else AlignmentFlag &= ~SECONDARY; } -void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } -void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - -// calculates alignment end position, based on starting position and CIGAR operations -int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); - vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) - alignEnd += (*cigarIter).Length; - else if ( usePadded && cigarType == 'I' ) - alignEnd += (*cigarIter).Length; - } - - // adjust for zeroBased, if necessary - if (zeroBased) - return alignEnd - 1; - else - return alignEnd; -} - -bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, copy tag data to temp buffer - string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + 
tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} - -bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return 
false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = 
numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired 
tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return 
success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -// get "NM" tag data - originally contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} - -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -bool BamAlignment::GetReadGroup(string& readGroup) const { - return GetTag("RG", readGroup); -} - -bool BamAlignment::GetTag(const string& tag, string& destination) const { - - // make 
sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} - -bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case 'f': - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; 
-} - -bool BamAlignment::GetTag(const string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -bool BamAlignment::GetTag(const string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'f': - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case 'Z': - case 'H': - fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -bool BamAlignment::GetTagType(const string& tag, char& type) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // lookup tag - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // retrieve tag type code - type = *(pTagData - 1); - - // validate that type is a proper BAM tag type 
- switch(type) { - case 'A': - case 'c': - case 'C': - case 's': - case 'S': - case 'f': - case 'i': - case 'I': - case 'Z': - case 'H': - return true; - - // unknown tag type - default: - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - } - - // tag not found, return failure - return false; -} - -bool BamAlignment::RemoveTag(const string& tag) { - - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; - } - - // tag 
not found, no removal - return failure - return false; -} - -bool BamAlignment::FindTag(const string& tag, - char* &pTagData, - const unsigned int& tagDataLength, - unsigned int& numBytesParsed) -{ - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) return false; - } - - // checked all tags, none match - return false; -} - -bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'f': - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - default: - // error case - fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; -}
// File: spp/src/BamAlignment.h
// ***************************************************************************
// BamAlignment.h (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 13 December 2010 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************

#ifndef BAMALIGNMENT_H
#define BAMALIGNMENT_H

#include <api_global.h>
#include <BamAux.h>
#include <string>
#include <vector>

namespace BamTools {

// forward declare BamAlignment's friend classes
namespace Internal {
    class BamReaderPrivate;
    class BamWriterPrivate;
} // namespace Internal

// BamAlignment data structure
// explicitly labeled as 'struct' to indicate that (most of) its fields are public
//
// Holds one alignment record: the public data members below, flag
// query/set methods operating on AlignmentFlag, and accessors that read or
// modify the raw TagData string.
struct API_EXPORT BamAlignment {

    // constructors & destructor
    public:
        BamAlignment(void);
        BamAlignment(const BamAlignment& other);
        ~BamAlignment(void);

    // Queries against alignment flags
    public:
        bool IsDuplicate(void) const;         // Returns true if this read is a PCR duplicate
        bool IsFailedQC(void) const;          // Returns true if this read failed quality control
        bool IsFirstMate(void) const;         // Returns true if alignment is first mate on read
        bool IsMapped(void) const;            // Returns true if alignment is mapped
        bool IsMateMapped(void) const;        // Returns true if alignment's mate is mapped
        bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
        bool IsPaired(void) const;            // Returns true if alignment part of paired-end read
        bool IsPrimaryAlignment(void) const;  // Returns true if reported position is primary alignment
        bool IsProperPair(void) const;        // Returns true if alignment is part of read that satisfied paired-end resolution
        bool IsReverseStrand(void) const;     // Returns true if alignment mapped to reverse strand
        bool IsSecondMate(void) const;        // Returns true if alignment is second mate on read

    // Manipulate alignment flags
    public:
        void SetIsDuplicate(bool ok);         // Sets "PCR duplicate" flag
        void SetIsFailedQC(bool ok);          // Sets "failed quality control" flag
        void SetIsFirstMate(bool ok);         // Sets "alignment is first mate" flag
        void SetIsMapped(bool ok);            // Sets "alignment is mapped" flag
        void SetIsMateMapped(bool ok);        // Sets "alignment's mate is mapped" flag
        void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
        void SetIsPaired(bool ok);            // Sets "alignment part of paired-end read" flag
        void SetIsPrimaryAlignment(bool ok);  // Sets "position is primary alignment" flag
        void SetIsProperPair(bool ok);        // Sets "alignment is part of read that satisfied paired-end resolution" flag
        void SetIsReverseStrand(bool ok);     // Sets "alignment mapped to reverse strand" flag
        void SetIsSecondMate(bool ok);        // Sets "alignment is second mate on read" flag

        // legacy methods (deprecated, but available)
        void SetIsMateUnmapped(bool ok);       // Complement of IsMateMapped() flag
        void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag
        void SetIsUnmapped(bool ok);           // Complement of IsMapped() flag

    // Tag data access methods
    public:
        // -------------------------------------------------------------------------------------
        // N.B. - The following tag access methods may not be used on BamAlignments fetched
        // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
        // error message (to keep output clean) but will ALWAYS return false. Only user-created
        // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.

        // add tag data (create new TAG entry with TYPE and VALUE)
        // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
        // returns true if new data added, false if error or TAG already exists
        // N.B. - will NOT modify existing tag. Use EditTag() instead
        // @tag   - two character tag name
        // @type  - single character tag type (see SAM/BAM spec for details)
        // @value - value to associate with tag
        bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
        bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value);    // type must be A or i
        bool AddTag(const std::string& tag, const std::string& type, const int32_t& value);     // type must be A or i
        bool AddTag(const std::string& tag, const std::string& type, const float& value);       // type must be A, i, or f

        // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
        // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
        // returns true if edit was successful, false if error
        // @tag   - two character tag name
        // @type  - single character tag type (see SAM/BAM spec for details)
        // @value - new value for tag
        bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
        bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value);    // type must be A or i
        bool EditTag(const std::string& tag, const std::string& type, const int32_t& value);     // type must be A or i
        bool EditTag(const std::string& tag, const std::string& type, const float& value);       // type must be A, i, or f

        // specific tag data access methods - these only remain for legacy support
        // returns whether specific tag could be retrieved
        bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
        bool GetReadGroup(std::string& readGroup) const;    // get "RG" tag data (equivalent to GetTag("RG", readGroup))

        // generic tag data access methods
        // returns whether tag is found & tag type is compatible with DESTINATION
        // @tag         - two character tag name
        // @destination - if found, tag value is stored here
        bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
        bool GetTag(const std::string& tag, uint32_t& destination) const;    // access unsigned integer data
        bool GetTag(const std::string& tag, int32_t& destination) const;     // access signed integer data
        bool GetTag(const std::string& tag, float& destination) const;       // access floating point data

        // retrieve the tag type code for TAG
        // returns true if tag could be found and type determined
        // @tag  - two character tag name
        // @type - receives the single character type code
        bool GetTagType(const std::string& tag, char& type) const;

        // remove tag data
        // returns true if removal was successful, false if error
        // N.B. - returns false if TAG does not exist (no removal can occur)
        // @tag - two character tag name
        bool RemoveTag(const std::string& tag);

    // Additional data access methods
    public:
        // calculates & returns alignment end position, based on starting position and CIGAR operations
        // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position
        //              matches the last base's position in reference
        // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false
        //              is useful when using BAM data along with other, half-open formats.
        int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;

    // 'internal' utility methods
    private:
        // scans the raw tag buffer for TAG; both pTagData and numBytesParsed are
        // passed by non-const reference and advanced as the buffer is walked
        static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
        // advances pTagData / numBytesParsed past one tag value of the given storage type
        static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);

    // Data members
    public:
        std::string Name;               // Read name
        int32_t     Length;             // Query length
        std::string QueryBases;         // 'Original' sequence (as reported from sequencing machine)
        std::string AlignedBases;       // 'Aligned' sequence (includes any indels, padding, clipping)
        std::string Qualities;          // FASTQ qualities (ASCII characters, not numeric values)
        std::string TagData;            // Tag data (accessor methods will pull the requested information out)
        int32_t     RefID;              // ID number for reference sequence
        int32_t     Position;           // Position (0-based) where alignment starts
        uint16_t    Bin;                // Bin in BAM file where this alignment resides
        uint16_t    MapQuality;         // Mapping quality score
        uint32_t    AlignmentFlag;      // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
        std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
        int32_t     MateRefID;          // ID number for reference sequence where alignment's mate was aligned
        int32_t     MatePosition;       // Position (0-based) where alignment's mate starts
        int32_t     InsertSize;         // Mate-pair insert size

    // Internal data, inaccessible to client code
    // but available BamReaderPrivate & BamWriterPrivate
    private:
        struct BamAlignmentSupportData {

            // data members
            std::string AllCharData;
            uint32_t    BlockLength;
            uint32_t    NumCigarOperations;
            uint32_t    QueryNameLength;
            uint32_t    QuerySequenceLength;
            bool        HasCoreOnly;

            // constructor
            // zero-initializes every counter and clears HasCoreOnly;
            // AllCharData is default-constructed (empty)
            BamAlignmentSupportData(void)
                : BlockLength(0)
                , NumCigarOperations(0)
                , QueryNameLength(0)
                , QuerySequenceLength(0)
                , HasCoreOnly(false)
            { }
        };
        BamAlignmentSupportData SupportData;
        friend class Internal::BamReaderPrivate;
        friend class Internal::BamWriterPrivate;

    // Alignment flag query constants
    // Use the get/set methods above instead
    // (each constant is a distinct single-bit mask, 1 through 1024)
    private:
        enum { PAIRED        = 1
             , PROPER_PAIR   = 2
             , UNMAPPED      = 4
             , MATE_UNMAPPED = 8
             , REVERSE       = 16
             , MATE_REVERSE  = 32
             , READ_1        = 64
             , READ_2        = 128
             , SECONDARY     = 256
             , QC_FAILED     = 512
             , DUPLICATE     = 1024
             };
};

// convenience typedef(s)
typedef std::vector<BamAlignment> BamAlignmentVector;

} // namespace BamTools

#endif // BAMALIGNMENT_H
--- a/spp/src/BamAux.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,227 +0,0 @@ -// *************************************************************************** -// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic constants, data structures, utilities etc. -// used throughout the API for handling BAM files -// *************************************************************************** - -#ifndef BAMAUX_H -#define BAMAUX_H - -#include <api_global.h> - -#include <fstream> -#include <iostream> -#include <string> -#include <vector> - -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include <stdint.h> - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// BAM constants - -const int BAM_CMATCH = 0; -const int BAM_CINS = 1; -const int BAM_CDEL = 2; -const int BAM_CREF_SKIP = 3; -const int BAM_CSOFT_CLIP = 4; -const int BAM_CHARD_CLIP = 5; -const int BAM_CPAD = 6; -const int BAM_CIGAR_SHIFT = 4; -const int 
BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); -const int BAM_CORE_SIZE = 32; -const int BT_SIZEOF_INT = 4; - -// ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// Data structs & typedefs - -// CIGAR operation data structure -struct API_EXPORT CigarOp { - - // data members - char Type; // Operation type (MIDNSHP) - uint32_t Length; // Operation length (number of bases) - - // constructor - CigarOp(const char type = '\0', - const uint32_t length = 0) - : Type(type) - , Length(length) - { } -}; - -// Reference data entry -struct API_EXPORT RefData { - - // data members - std::string RefName; // Name of reference sequence - int32_t RefLength; // Length of reference sequence - bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence - - // constructor - RefData(const int32_t& length = 0, - bool ok = false) - : RefLength(length) - , RefHasAlignments(ok) - { } -}; -typedef std::vector<RefData> RefVector; - -// General (sequential) genome region -struct API_EXPORT BamRegion { - - // data members - int LeftRefID; - int LeftPosition; - int RightRefID; - int RightPosition; - - // constructor - BamRegion(const int& leftID = -1, - const int& leftPos = -1, - const int& rightID = -1, - const int& rightPos = -1) - : LeftRefID(leftID) - , LeftPosition(leftPos) - , RightRefID(rightID) - , RightPosition(rightPos) - { } - - // copy constructor - BamRegion(const BamRegion& other) - : LeftRefID(other.LeftRefID) - , LeftPosition(other.LeftPosition) - , RightRefID(other.RightRefID) - , RightPosition(other.RightPosition) - { } - - // member functions - void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; } - bool isLeftBoundSpecified(void) const { return ( LeftRefID >= 0 && LeftPosition >= 0 ); } - bool isNull(void) const { return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); } - bool isRightBoundSpecified(void) const { 
return ( RightRefID >= 0 && RightPosition >= 0 ); } -}; - -// ---------------------------------------------------------------- -// ---------------------------------------------------------------- -// General utilities - -// returns true if system is big endian -inline bool SystemIsBigEndian(void) { - const uint16_t one = 0x0001; - return ((*(char*) &one) == 0 ); -} - -// swaps endianness of 16-bit value 'in place' -inline void SwapEndian_16(int16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -inline void SwapEndian_16(uint16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -// swaps endianness of 32-bit value 'in-place' -inline void SwapEndian_32(int32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -inline void SwapEndian_32(uint32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -// swaps endianness of 64-bit value 'in-place' -inline void SwapEndian_64(int64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -inline void SwapEndian_64(uint64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -// swaps endianness of 'next 2 bytes' in a char buffer (in-place) -inline void SwapEndian_16p(char* data) { - uint16_t& value = (uint16_t&)*data; - SwapEndian_16(value); -} - -// swaps endianness of 'next 4 bytes' in a char buffer (in-place) -inline void SwapEndian_32p(char* data) { - uint32_t& value = (uint32_t&)*data; - SwapEndian_32(value); -} - -// swaps endianness of 'next 8 bytes' in a char buffer (in-place) 
-inline void SwapEndian_64p(char* data) { - uint64_t& value = (uint64_t&)*data; - SwapEndian_64(value); -} - -// returns whether file exists (can be opened OK) -inline bool FileExists(const std::string& filename) { - std::ifstream f(filename.c_str(), std::ifstream::in); - return !f.fail(); -} - -} // namespace BamTools - -#endif // BAMAUX_H
--- a/spp/src/BamIndex.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,230 +0,0 @@ -// *************************************************************************** -// BamIndex.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). -// *************************************************************************** - -#include <BamIndex.h> -#include <BamReader.h> -#include <BGZF.h> -#include <BamStandardIndex_p.h> -#include <BamToolsIndex_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <cstdio> -#include <cstdlib> -#include <algorithm> -#include <iostream> -#include <map> -using namespace std; - -// -------------------------------------------------- -// BamIndex factory methods - -// returns index based on BAM filename 'stub' -// checks first for preferred type, returns that type if found -// (if not found, attmempts to load other type(s), returns 0 if NONE found) -// -// ** default preferred type is BamToolsIndex ** use this anytime it exists -BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - const BamIndex::PreferredIndexType& type) -{ - // --------------------------------------------------- - // attempt to load preferred type first - - const std::string bamtoolsIndexFilename = bamFilename + ".bti"; - const bool bamtoolsIndexExists = BamTools::FileExists(bamtoolsIndexFilename); - if ( (type == BamIndex::BAMTOOLS) && bamtoolsIndexExists ) - return new BamToolsIndex(bgzf, reader); - - const std::string 
standardIndexFilename = bamFilename + ".bai"; - const bool standardIndexExists = BamTools::FileExists(standardIndexFilename); - if ( (type == BamIndex::STANDARD) && standardIndexExists ) - return new BamStandardIndex(bgzf, reader); - - // ---------------------------------------------------- - // preferred type could not be found, try other (non-preferred) types - // if none found, return 0 - - if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader); - if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader); - return 0; -} - -// returns index based on explicitly named index file (or 0 if not found) -BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader) -{ - // see if specified file exists - const bool indexExists = BamTools::FileExists(indexFilename); - if ( !indexExists ) return 0; - - const std::string bamtoolsIndexExtension(".bti"); - const std::string standardIndexExtension(".bai"); - - // if has bamtoolsIndexExtension - if ( indexFilename.find(bamtoolsIndexExtension) == (indexFilename.length() - bamtoolsIndexExtension.length()) ) - return new BamToolsIndex(bgzf, reader); - - // if has standardIndexExtension - if ( indexFilename.find(standardIndexExtension) == (indexFilename.length() - standardIndexExtension.length()) ) - return new BamStandardIndex(bgzf, reader); - - // otherwise, unsupported file type - return 0; -} - -// ------------------------------- -// BamIndex implementation - -// ctor -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader) - : m_BGZF(bgzf) - , m_reader(reader) - , m_cacheMode(BamIndex::LimitedIndexCaching) - , m_indexStream(0) -{ - if ( m_reader && m_reader->IsOpen() ) - m_references = m_reader->GetReferenceData(); -} - -// dtor -BamIndex::~BamIndex(void) { - if ( IsOpen() ) - fclose(m_indexStream); -} - -// return true if FILE* is open -bool BamIndex::IsOpen(void) const { - return ( m_indexStream != 0 ); -} - -// loads 
existing data from file into memory -bool BamIndex::Load(const string& filename) { - - // open index file, abort on error - if ( !OpenIndexFile(filename, "rb") ) { - fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // check magic number - if ( !LoadHeader() ) { - fclose(m_indexStream); - return false; - } - - // load reference data (but only keep in memory if full caching requested) - bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching ); - if ( !LoadAllReferences(saveInitialLoad) ) { - fclose(m_indexStream); - return false; - } - - // update index cache based on selected mode - UpdateCache(); - - // return success - return true; -} - -// opens index file for reading/writing, return true if opened OK -bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { - m_indexStream = fopen(filename.c_str(), mode.c_str()); - return ( m_indexStream != 0 ); -} - -// rewind index file to beginning of index data, return true if rewound OK -bool BamIndex::Rewind(void) { - return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 ); -} - -// change the index caching behavior -void BamIndex::SetCacheMode(const BamIndexCacheMode mode) { - if ( mode != m_cacheMode ) { - m_cacheMode = mode; - UpdateCache(); - } -} - -// updates in-memory cache of index data, depending on current cache mode -void BamIndex::UpdateCache(void) { - - // skip if file not open - if ( !IsOpen() ) return; - - // reflect requested cache mode behavior - switch ( m_cacheMode ) { - - case (BamIndex::FullIndexCaching) : - Rewind(); - LoadAllReferences(true); - break; - - case (BamIndex::LimitedIndexCaching) : - if ( HasFullDataCache() ) - KeepOnlyFirstReferenceOffsets(); - else { - ClearAllData(); - SkipToFirstReference(); - LoadFirstReference(true); - } - break; - case(BamIndex::NoIndexCaching) : - ClearAllData(); - break; - default : - // unreachable - ; - } -} - -// writes in-memory index data out to 
file -bool BamIndex::Write(const string& bamFilename) { - - // open index file for writing - string indexFilename = bamFilename + Extension(); - if ( !OpenIndexFile(indexFilename, "wb") ) { - fprintf(stderr, "ERROR: Could not open file to save index.\n"); - return false; - } - - // write index header data - if ( !WriteHeader() ) { - fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n"); - fflush(m_indexStream); - fclose(m_indexStream); - exit(1); - } - - // write main index data - if ( !WriteAllReferences() ) { - fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n"); - fflush(m_indexStream); - fclose(m_indexStream); - exit(1); - } - - // flush any remaining output, rewind file, and return success - fflush(m_indexStream); - fclose(m_indexStream); - - // re-open index file for later reading - if ( !OpenIndexFile(indexFilename, "rb") ) { - fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n"); - return false; - } - - // return success/failure of write - return true; -}
--- a/spp/src/BamIndex.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,145 +0,0 @@ -// *************************************************************************** -// BamIndex.h (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides basic BAM index interface -// *************************************************************************** - -#ifndef BAM_INDEX_H -#define BAM_INDEX_H - -#include <api_global.h> -#include <BamAux.h> -#include <iostream> -#include <string> -#include <vector> - -namespace BamTools { - -class BamReader; -class BgzfData; - -namespace Internal { - class BamStandardIndex; - class BamToolsIndex; -} // namespace Internal - -// -------------------------------------------------- -// BamIndex base class -class API_EXPORT BamIndex { - - // specify index-caching behavior - // - // @FullIndexCaching - store entire index file contents in memory - // @LimitedIndexCaching - store only index data for current reference - // being processed - // @NoIndexCaching - do not store any index data. 
Load as needed to - // calculate jump offset - public: enum BamIndexCacheMode { FullIndexCaching = 0 - , LimitedIndexCaching - , NoIndexCaching - }; - - // ctor & dtor - public: - BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - virtual ~BamIndex(void); - - // index interface - public: - // creates index data (in-memory) from current reader data - virtual bool Build(void) =0; - // returns supported file extension - virtual const std::string Extension(void) const =0; - // returns whether reference has alignments or no - virtual bool HasAlignments(const int& referenceID) const =0; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; - // loads existing data from file into memory - virtual bool Load(const std::string& filename); - // change the index caching behavior - virtual void SetCacheMode(const BamIndexCacheMode mode); - // writes in-memory index data out to file - // N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) - virtual bool Write(const std::string& bamFilename); - - // derived-classes MUST provide implementation - protected: - // clear all current index offset data in memory - virtual void ClearAllData(void) =0; - // return file position after header metadata - virtual const off_t DataBeginOffset(void) const =0; - // return true if all index data is cached - virtual bool HasFullDataCache(void) const =0; - // clears index data from all references except the first - virtual void KeepOnlyFirstReferenceOffsets(void) =0; - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - virtual bool LoadAllReferences(bool saveData = true) =0; - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - virtual bool LoadFirstReference(bool saveData = true) =0; - // load header data from index file, return true if loaded OK - virtual bool LoadHeader(void) =0; - // position file pointer to first reference begin, return true if skipped OK - virtual bool SkipToFirstReference(void) =0; - // write index reference data - virtual bool WriteAllReferences(void) =0; - // write index header data - virtual bool WriteHeader(void) =0; - - // internal methods - protected: - // rewind index file to beginning of index data, return true if rewound OK - bool Rewind(void); - - private: - // return true if FILE* is open - bool IsOpen(void) const; - // opens index file according to requested mode, return true if opened OK - bool OpenIndexFile(const std::string& filename, const std::string& mode); - // updates in-memory cache of index data, depending on current cache mode - void UpdateCache(void); - - // factory methods for returning proper BamIndex-derived type based on available index files - public: - - // returns index based on BAM filename 'stub' - // 
checks first for preferred type, returns that type if found - // (if not found, attmempts to load other type(s), returns 0 if NONE found) - // - // ** default preferred type is BamToolsIndex ** use this anytime it exists - enum PreferredIndexType { BAMTOOLS = 0, STANDARD }; - static BamIndex* FromBamFilename(const std::string& bamFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS); - - // returns index based on explicitly named index file (or 0 if not found) - static BamIndex* FromIndexFilename(const std::string& indexFilename, - BamTools::BgzfData* bgzf, - BamTools::BamReader* reader); - - // data members - protected: - BamTools::BgzfData* m_BGZF; - BamTools::BamReader* m_reader; - BamTools::RefVector m_references; - BamIndex::BamIndexCacheMode m_cacheMode; - FILE* m_indexStream; - - - // friends - friend class Internal::BamStandardIndex; - friend class Internal::BamToolsIndex; -}; - -} // namespace BamTools - -#endif // BAM_INDEX_H
--- a/spp/src/BamMultiReader.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,450 +0,0 @@ -// *************************************************************************** -// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines that were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files. -// -// This functionality allows applications to work on very large sets of files -// without requiring intermediate merge, sort, and index steps for each file -// subset. It also improves the performance of our merge system as it -// precludes the need to sort merged files. 
-// *************************************************************************** - -#include <BamMultiReader.h> -#include <BGZF.h> -using namespace BamTools; - -#include <algorithm> -#include <fstream> -#include <iostream> -#include <iterator> -#include <sstream> -#include <string> -#include <vector> -using namespace std; - -// ----------------------------------------------------- -// BamMultiReader implementation -// ----------------------------------------------------- - -// constructor -BamMultiReader::BamMultiReader(void) - : CurrentRefID(0) - , CurrentLeft(0) -{ } - -// destructor -BamMultiReader::~BamMultiReader(void) { - Close(); -} - -// close the BAM files -void BamMultiReader::Close(void) { - - // close all BAM readers and clean up pointers - vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin(); - vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd = readers.end(); - for ( ; readerIter != readerEnd; ++readerIter) { - - BamReader* reader = (*readerIter).first; - BamAlignment* alignment = (*readerIter).second; - - // close the reader - if ( reader) reader->Close(); - - // delete reader pointer - delete reader; - reader = 0; - - // delete alignment pointer - delete alignment; - alignment = 0; - } - - // clear out the container - readers.clear(); -} - -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(bool useStandardIndex) { - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->CreateIndex(useStandardIndex); - } - return result; -} - -// sets the index caching mode on the readers -void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - 
reader->SetIndexCacheMode(mode); - } -} - -// for debugging -void BamMultiReader::DumpAlignmentIndex(void) { - for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { - cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; - } -} - -// makes a virtual, unified header for all the bam files in the multireader -const string BamMultiReader::GetHeaderText(void) const { - - string mergedHeader = ""; - map<string, bool> readGroups; - - // foreach extraction entry (each BAM file) - for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - - BamReader* reader = rs->first; - string headerText = reader->GetHeaderText(); - if ( headerText.empty() ) continue; - - map<string, bool> currentFileReadGroups; - stringstream header(headerText); - vector<string> lines; - string item; - while (getline(header, item)) - lines.push_back(item); - - for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) { - - // get next line from header, skip if empty - string headerLine = *it; - if ( headerLine.empty() ) { continue; } - - // if first file, save HD & SQ entries - if ( rs == readers.begin() ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } - - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - stringstream headerLineSs(headerLine); - string part, readGroupPart, readGroup; - while(std::getline(headerLineSs, part, '\t')) { - stringstream partSs(part); - string subtag; - std::getline(partSs, subtag, ':'); - if (subtag == "ID") { - std::getline(partSs, readGroup, ':'); - break; - } - } - if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - 
currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } - } - - // return merged header text - return mergedHeader; -} - -// get next alignment among all files -bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignment(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// get next alignment among all files without parsing character data from alignments -bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments 
have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignmentCore(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// --------------------------------------------------------------------------------------- -// -// NB: The following GetReferenceX() functions assume that we have identical -// references for all BAM files. We enforce this by invoking the above -// validation function (ValidateReaders) to verify that our reference data -// is the same across all files on Open, so we will not encounter a situation -// in which there is a mismatch and we are still live. 
-// -// --------------------------------------------------------------------------------------- - -// returns the number of reference sequences -const int BamMultiReader::GetReferenceCount(void) const { - return readers.front().first->GetReferenceCount(); -} - -// returns vector of reference objects -const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { - return readers.front().first->GetReferenceData(); -} - -// returns refID from reference name -const int BamMultiReader::GetReferenceID(const string& refName) const { - return readers.front().first->GetReferenceID(refName); -} - -// --------------------------------------------------------------------------------------- - -// checks if any readers still have alignments -bool BamMultiReader::HasOpenReaders() { - return alignments.size() > 0; -} - -// returns whether underlying BAM readers ALL have an index loaded -// this is useful to indicate whether Jump() or SetRegion() are possible -bool BamMultiReader::IsIndexLoaded(void) const { - bool ok = true; - vector<pair<BamReader*, BamAlignment*> >::const_iterator readerIter = readers.begin(); - vector<pair<BamReader*, BamAlignment*> >::const_iterator readerEnd = readers.end(); - for ( ; readerIter != readerEnd; ++readerIter ) { - const BamReader* reader = (*readerIter).first; - if ( reader ) ok &= reader->IsIndexLoaded(); - } - return ok; -} - -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail -bool BamMultiReader::Jump(int refID, int position) { - - //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - CurrentRefID = refID; - CurrentLeft = position; - - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Jump(refID, position); - if (!result) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << 
endl; - exit(1); - } - } - if (result) UpdateAlignments(); - return result; -} - -// opens BAM files -bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) { - - // for filename in filenames - fileNames = filenames; // save filenames in our multireader - for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { - - const string filename = *it; - BamReader* reader = new BamReader; - - bool openedOK = true; - openedOK = reader->Open(filename, "", openIndexes, preferStandardIndex); - - // if file opened ok, check that it can be read - if ( openedOK ) { - - bool fileOK = true; - BamAlignment* alignment = new BamAlignment; - fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) ); - - if (fileOK) { - readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { - cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) return false; - } - } - - // TODO; any further error handling when openedOK is false ?? 
- else - return false; - } - - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); - return true; -} - -void BamMultiReader::PrintFilenames(void) { - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - cout << reader->GetFilename() << endl; - } -} - -// returns BAM file pointers to beginning of alignment data -bool BamMultiReader::Rewind(void) { - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Rewind(); - } - return result; -} - -bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { - BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); - return SetRegion(region); -} - -bool BamMultiReader::SetRegion(const BamRegion& region) { - - Region = region; - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - if (!it->first->SetRegion(region)) { - cerr << "ERROR: could not jump " << it->first->GetFilename() << " to " - << region.LeftRefID << ":" << region.LeftPosition - << ".." 
<< region.RightRefID << ":" << region.RightPosition << endl; - } - } - - UpdateAlignments(); - return true; -} - -void BamMultiReader::UpdateAlignments(void) { - // Update Alignments - alignments.clear(); - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* br = it->first; - BamAlignment* ba = it->second; - if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), - make_pair(br, ba))); - } else { - // assume BamReader end of region / EOF - } - } -} - -// updates the reference id stored in the BamMultiReader -// to reflect the current state of the readers -void BamMultiReader::UpdateReferenceID(void) { - // the alignments are sorted by position, so the first alignment will always have the lowest reference ID - if (alignments.begin()->second.second->RefID != CurrentRefID) { - // get the next reference id - // while there aren't any readers at the next ref id - // increment the ref id - int nextRefID = CurrentRefID; - while (alignments.begin()->second.second->RefID != nextRefID) { - ++nextRefID; - } - //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; - CurrentRefID = nextRefID; - } -} - -// ValidateReaders checks that all the readers point to BAM files representing -// alignments against the same set of reference sequences, and that the -// sequences are identically ordered. If these checks fail the operation of -// the multireader is undefined, so we force program exit. 
-void BamMultiReader::ValidateReaders(void) const { - int firstRefCount = readers.front().first->GetReferenceCount(); - BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); - for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - BamTools::RefVector currentRefData = reader->GetReferenceData(); - BamTools::RefVector::const_iterator f = firstRefData.begin(); - BamTools::RefVector::const_iterator c = currentRefData.begin(); - if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstRefCount - << " reference sequences but only found " << reader->GetReferenceCount() << endl; - exit(1); - } - // this will be ok; we just checked above that we have identically-sized sets of references - // here we simply check if they are all, in fact, equal in content - while (f != firstRefData.end()) { - if (f->RefName != c->RefName || f->RefLength != c->RefLength) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; - for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - cerr << "but found: " << endl; - for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - exit(1); - } - ++f; ++c; - } - } -}
--- a/spp/src/BamMultiReader.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,136 +0,0 @@ -// *************************************************************************** -// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// *************************************************************************** - -#ifndef BAMMULTIREADER_H -#define BAMMULTIREADER_H - -#include <api_global.h> -#include <BamReader.h> -#include <map> -#include <sstream> -#include <string> -#include <utility> - -namespace BamTools { - -// index mapping reference/position pairings to bamreaders and their alignments -typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex; - -class API_EXPORT BamMultiReader { - - // constructor / destructor - public: - BamMultiReader(void); - ~BamMultiReader(void); - - // public interface - public: - - // positioning - int CurrentRefID; - int CurrentLeft; - - // region under analysis, specified using SetRegion - BamRegion Region; - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM files - void Close(void); - - // opens BAM files (and optional BAM index files, if provided) - // @openIndexes - triggers index opening, useful for suppressing - // error messages during merging of files in which we may not have - // indexes. - // @coreMode - setup our first alignments using GetNextAlignmentCore(); - // also useful for merging - // @preferStandardIndex - look for standard BAM index ".bai" first. If false, - // will look for BamTools index ".bti". 
- bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false); - - // returns whether underlying BAM readers ALL have an index loaded - // this is useful to indicate whether Jump() or SetRegion() are possible - bool IsIndexLoaded(void) const; - - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - - // sets the target region - bool SetRegion(const BamRegion& region); - bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above - - // returns file pointers to beginning of alignments - bool Rewind(void); - - // ---------------------- - // access alignment data - // ---------------------- - // updates the reference id marker to match the lower limit of our readers - void UpdateReferenceID(void); - - // retrieves next available alignment (returns success/fail) from all files - bool GetNextAlignment(BamAlignment&); - // retrieves next available alignment (returns success/fail) from all files - // and populates the support data with information about the alignment - // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT - bool GetNextAlignmentCore(BamAlignment&); - // ... should this be private? 
- bool HasOpenReaders(void); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns unified SAM header text for all files - const std::string GetHeaderText(void) const; - // returns number of reference sequences - const int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector GetReferenceData(void) const; - // returns reference id (used for BamMultiReader::Jump()) for the given reference name - const int GetReferenceID(const std::string& refName) const; - // validates that we have a congruent set of BAM files that are aligned against the same reference sequences - void ValidateReaders() const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(bool useStandardIndex = true); - - // sets the index caching mode for the readers - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - - //const int GetReferenceID(const string& refName) const; - - // utility - void PrintFilenames(void); - void DumpAlignmentIndex(void); - void UpdateAlignments(void); // updates our alignment cache - - // private implementation - private: - - // the set of readers and alignments which we operate on, maintained throughout the life of this class - std::vector<std::pair<BamReader*, BamAlignment*> > readers; - - // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment - // when a reader reaches EOF, its entry is removed from this index - AlignmentIndex alignments; - - std::vector<std::string> fileNames; -}; - -} // namespace BamTools - -#endif // BAMMULTIREADER_H
--- a/spp/src/BamReader.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,66 +0,0 @@ -// *************************************************************************** -// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#include <BamReader.h> -#include <BamReader_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <algorithm> -#include <iostream> -#include <iterator> -#include <string> -#include <vector> -using namespace std; - -// constructor -BamReader::BamReader(void) { - d = new BamReaderPrivate(this); -} - -// destructor -BamReader::~BamReader(void) { - delete d; - d = 0; -} - -// file operations -void BamReader::Close(void) { d->Close(); } -bool BamReader::HasIndex(void) const { return d->HasIndex; } -bool BamReader::IsIndexLoaded(void) const { return HasIndex(); } -bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); } -bool BamReader::Open(const std::string& filename, - const std::string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex) -{ - return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); -} -bool BamReader::Rewind(void) { return d->Rewind(); } -bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } -bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { - return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, 
rightBound) ); -} - -// access alignment data -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } -bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } - -// access auxiliary data -const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); } -int BamReader::GetReferenceCount(void) const { return d->References.size(); } -const RefVector& BamReader::GetReferenceData(void) const { return d->References; } -int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } -const std::string BamReader::GetFilename(void) const { return d->Filename; } - -// index operations -bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); } -void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { d->SetIndexCacheMode(mode); }
--- a/spp/src/BamReader.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,130 +0,0 @@ -// *************************************************************************** -// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_H -#define BAMREADER_H - -#include <api_global.h> -#include <BamAlignment.h> -#include <BamIndex.h> -#include <string> - -namespace BamTools { - -namespace Internal { - class BamReaderPrivate; -} // namespace Internal - -class API_EXPORT BamReader { - - // constructor / destructor - public: - BamReader(void); - ~BamReader(void); - - // public interface - public: - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM file - void Close(void); - // returns whether reader is open for reading or not - bool IsOpen(void) const; - // performs random-access jump using (reference, position) as a left-bound - bool Jump(int refID, int position = 0); - // opens BAM file (and optional BAM index file, if provided) - // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file - // default behavior is to skip index file search if no index filename given - // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai) - // default behavior is to prefer the BamToolsIndex (*.bti) if both are available - bool Open(const std::string& filename, - const std::string& indexFilename = "", - const bool lookForIndex = false, - const bool preferStandardIndex = false); - // returns 
file pointer to beginning of alignments - bool Rewind(void); - // sets a region of interest (with left & right bound reference/position) - // returns success/failure of seeking to left bound of region - bool SetRegion(const BamRegion& region); - bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); - - // ---------------------- - // access alignment data - // ---------------------- - - // retrieves next available alignment (returns success/fail) - bool GetNextAlignment(BamAlignment& bAlignment); - // retrieves next available alignment core data (returns success/fail) - // ** DOES NOT parse any character data (read name, bases, qualities, tag data) ** - // useful for operations requiring ONLY aligner-related information - // (refId/position, alignment flags, CIGAR, mapQuality, etc) - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns SAM header text - const std::string GetHeaderText(void) const; - // returns number of reference sequences - int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector& GetReferenceData(void) const; - // returns reference id (used for BamReader::Jump()) for the given reference name - int GetReferenceID(const std::string& refName) const; - // returns the name of the file associated with this BamReader - const std::string GetFilename(void) const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM file, saves to file - // default behavior is to create the BAM standard index (".bai") - // set flag to false to create the BamTools-specific index (".bti") - bool CreateIndex(bool useStandardIndex = true); - // returns whether index data is available for reading - // (e.g. 
if true, BamReader should be able to seek to a region) - bool HasIndex(void) const; - // change the index caching behavior - // default BamReader/Index mode is LimitedIndexCaching - // @mode - can be either FullIndexCaching, LimitedIndexCaching, - // or NoIndexCaching. See BamIndex.h for more details - void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - - // deprecated methods - public: - - // deprecated (but still available): prefer HasIndex() instead - // - // Deprecated purely for API semantic clarity - HasIndex() should be clearer - // than IsIndexLoaded() in light of the new caching modes that may clear the - // index data from memory, but leave the index file open for later random access - // seeks. - // - // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been - // explicitly set to NoIndexCaching? This is confusing at best, misleading about - // current memory behavior at worst. - // - // returns whether index data is available - // (e.g. if true, BamReader should be able to seek to a region) - bool IsIndexLoaded(void) const; - - // private implementation - private: - Internal::BamReaderPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMREADER_H
--- a/spp/src/BamReader_p.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,729 +0,0 @@ -// *************************************************************************** -// BamReader_p.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#include <BamReader.h> -#include <BGZF.h> -#include <BamReader_p.h> -#include <BamStandardIndex_p.h> -#include <BamToolsIndex_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <algorithm> -#include <iostream> -#include <iterator> -#include <vector> -using namespace std; - -// constructor -BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : HeaderText("") - , Index(0) - , HasIndex(false) - , AlignmentsBeginOffset(0) -// , m_header(0) - , IndexCacheMode(BamIndex::LimitedIndexCaching) - , HasAlignmentsInRegion(true) - , Parent(parent) - , DNA_LOOKUP("=ACMGRSVTWYHKDBN") - , CIGAR_LOOKUP("MIDNSHP") -{ - IsBigEndian = SystemIsBigEndian(); -} - -// destructor -BamReaderPrivate::~BamReaderPrivate(void) { - Close(); -} - -// adjusts requested region if necessary (depending on where data actually begins) -void BamReaderPrivate::AdjustRegion(BamRegion& region) { - - // check for valid index first - if ( Index == 0 ) return; - - // see if any references in region have alignments - HasAlignmentsInRegion = false; - int currentId = region.LeftRefID; - - const int rightBoundRefId = ( region.isRightBoundSpecified() ? 
region.RightRefID : References.size() - 1 ); - while ( currentId <= rightBoundRefId ) { - HasAlignmentsInRegion = Index->HasAlignments(currentId); - if ( HasAlignmentsInRegion ) break; - ++currentId; - } - - // if no data found on any reference in region - if ( !HasAlignmentsInRegion ) return; - - // if left bound of desired region had no data, use first reference that had data - // otherwise, leave requested region as-is - if ( currentId != region.LeftRefID ) { - region.LeftRefID = currentId; - region.LeftPosition = 0; - } -} - -// fills out character data for BamAlignment data -bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { - - // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; - - // check offsets to see what char data exists - const bool hasSeqData = ( seqDataOffset < dataLength ); - const bool hasQualData = ( qualDataOffset < dataLength ); - const bool hasTagData = ( tagDataOffset < dataLength ); - - // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); - const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); - char* tagData = ( hasTagData ? 
(((char*)allCharData) + tagDataOffset) : (char*)0 ); - - // store alignment name (relies on null char in name as terminator) - bAlignment.Name.assign((const char*)(allCharData)); - - // save query sequence - bAlignment.QueryBases.clear(); - if ( hasSeqData ) { - bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append(1, singleBase); - } - } - - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character - bAlignment.Qualities.clear(); - if ( hasQualData ) { - bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - bAlignment.Qualities.append(1, singleQuality); - } - } - - // if QueryBases is empty (and this is a allowed case) - if ( bAlignment.QueryBases.empty() ) - bAlignment.AlignedBases = bAlignment.QueryBases; - - // if QueryBases contains data, then build AlignedBases using CIGAR data - else { - - // resize AlignedBases - bAlignment.AlignedBases.clear(); - bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); - - // iterate over CigarOps - int k = 0; - vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); - vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - - const CigarOp& op = (*cigarIter); - switch(op.Type) { - - case ('M') : - case ('I') : - bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases - // fall through - - case ('S') : - k += op.Length; // for 'S' - soft clip, skip over query bases - break; - - case ('D') : - bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character - break; - - case ('P') 
: - bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character - break; - - case ('N') : - bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence - break; - - case ('H') : - break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - - default: - fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - } - - // save tag data - bAlignment.TagData.clear(); - if ( hasTagData ) { - if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i += sizeof(uint64_t); - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - exit(1); - } - } - } - - // store tagData in alignment - bAlignment.TagData.resize(tagDataLength); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - } - - // clear the core-only flag - bAlignment.SupportData.HasCoreOnly = false; - - // return success - return true; -} - -// clear index data structure -void BamReaderPrivate::ClearIndex(void) { - delete Index; - Index = 0; - HasIndex = false; -} - -// closes the BAM file -void BamReaderPrivate::Close(void) { - - // close BGZF file stream - mBGZF.Close(); - - // clear out index data - ClearIndex(); - - // clear out header data - HeaderText.clear(); -// if ( m_header ) { -// delete m_header; -// m_header = 0; -// } - - // clear 
out region flags - Region.clear(); -} - -// creates index for BAM file, saves to file -// default behavior is to create the BAM standard index (".bai") -// set flag to false to create the BamTools-specific index (".bti") -bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { - - // clear out prior index data - ClearIndex(); - - // create index based on type requested - if ( useStandardIndex ) - Index = new BamStandardIndex(&mBGZF, Parent); - else - Index = new BamToolsIndex(&mBGZF, Parent); - - // set index cache mode to full for writing - Index->SetCacheMode(BamIndex::FullIndexCaching); - - // build new index - bool ok = true; - ok &= Index->Build(); - HasIndex = ok; - - // mark empty references - MarkReferences(); - - // attempt to save index data to file - ok &= Index->Write(Filename); - - // set client's desired index cache mode - Index->SetCacheMode(IndexCacheMode); - - // return success/fail of both building & writing index - return ok; -} - -const string BamReaderPrivate::GetHeaderText(void) const { - - return HeaderText; - -// if ( m_header ) -// return m_header->Text(); -// else -// return string(""); -} - -// get next alignment (from specified region, if given) -bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { - - // if valid alignment found, attempt to parse char data, and return success/failure - if ( GetNextAlignmentCore(bAlignment) ) - return BuildCharData(bAlignment); - - // no valid alignment found - else return false; -} - -// retrieves next available alignment core data (returns success/fail) -// ** DOES NOT parse any character data (read name, bases, qualities, tag data) -// these can be accessed, if necessary, from the supportData -// useful for operations requiring ONLY positional or other alignment-related information -bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { - - // if region is set but has no alignments - if ( !Region.isNull() && !HasAlignmentsInRegion ) - return false; - - // if valid 
alignment available - if ( LoadNextAlignment(bAlignment) ) { - - // set core-only flag - bAlignment.SupportData.HasCoreOnly = true; - - // if region not specified with at least a left boundary, return success - if ( !Region.isLeftBoundSpecified() ) return true; - - // determine region state (before, within, after) - BamReaderPrivate::RegionState state = IsOverlap(bAlignment); - - // if alignment lies after region, return false - if ( state == AFTER_REGION ) return false; - - while ( state != WITHIN_REGION ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) return false; - // if alignment lies after region, return false (no available read within region) - state = IsOverlap(bAlignment); - if ( state == AFTER_REGION ) return false; - } - - // return success (alignment found that overlaps region) - return true; - } - - // no valid alignment - else return false; -} - -// returns RefID for given RefName (returns References.size() if not found) -int BamReaderPrivate::GetReferenceID(const string& refName) const { - - // retrieve names from reference data - vector<string> refNames; - RefVector::const_iterator refIter = References.begin(); - RefVector::const_iterator refEnd = References.end(); - for ( ; refIter != refEnd; ++refIter) - refNames.push_back( (*refIter).RefName ); - - // return 'index-of' refName ( if not found, returns refNames.size() ) - return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); -} - -// returns region state - whether alignment ends before, overlaps, or starts after currently specified region -// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - - // if alignment is on any reference sequence before left bound - if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; - - // if alignment starts on left bound reference - else if ( 
bAlignment.RefID == Region.LeftRefID ) { - - // if alignment starts at or after left boundary - if ( bAlignment.Position >= Region.LeftPosition) { - - // if right boundary is specified AND - // left/right boundaries are on same reference AND - // alignment starts past right boundary - if ( Region.isRightBoundSpecified() && - Region.LeftRefID == Region.RightRefID && - bAlignment.Position > Region.RightPosition ) - return AFTER_REGION; - - // otherwise, alignment is within region - return WITHIN_REGION; - } - - // alignment starts before left boundary - else { - // check if alignment overlaps left boundary - if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; - else return BEFORE_REGION; - } - } - - // alignment starts on a reference after the left bound - else { - - // if region has a right boundary - if ( Region.isRightBoundSpecified() ) { - - // alignment is on reference between boundaries - if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; - - // alignment is on reference after right boundary - else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; - - // alignment is on right bound reference - else { - // check if alignment starts before or at right boundary - if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; - else return AFTER_REGION; - } - } - - // otherwise, alignment is after left bound reference, but there is no right boundary - else return WITHIN_REGION; - } -} - -// load BAM header data -void BamReaderPrivate::LoadHeaderData(void) { - -// m_header = new BamHeader(&mBGZF); -// bool headerLoadedOk = m_header->Load(); -// if ( !headerLoadedOk ) -// cerr << "BamReader could not load header" << endl; - - // check to see if proper BAM header - char buffer[4]; - if (mBGZF.Read(buffer, 4) != 4) { - fprintf(stderr, "Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - fprintf(stderr, "wrong header type!\n"); - exit(1); - } - - // get BAM header 
text length - mBGZF.Read(buffer, 4); - unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(headerTextLength); - - // get BAM header text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - mBGZF.Read(headerText, headerTextLength); - HeaderText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); -} - -// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail -bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { - - // clear out any existing index data - ClearIndex(); - - // if no index filename provided, so we need to look for available index files - if ( IndexFilename.empty() ) { - - // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag - const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); - Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); - - // if null, return failure - if ( Index == 0 ) return false; - - // generate proper IndexFilename based on type of index created - IndexFilename = Filename + Index->Extension(); - } - - else { - - // attempt to load BamIndex based on IndexFilename provided by client - Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); - - // if null, return failure - if ( Index == 0 ) return false; - } - - // set cache mode for BamIndex - Index->SetCacheMode(IndexCacheMode); - - // loading the index data from file - HasIndex = Index->Load(IndexFilename); - - // mark empty references - MarkReferences(); - - // return index status - return HasIndex; -} - -// populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[4]; - mBGZF.Read(buffer, 4); - 
bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) return false; - - // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; - - if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) - SwapEndian_32p(&x[i]); - } - - // set BamAlignment 'core' and 'support' data - bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - bAlignment.SupportData.QueryNameLength = tempValue & 0xff; - - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; - - bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - - // set BamAlignment length - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; - - // read in character data - make sure proper data size was read - bool readCharDataOK = false; - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - char* allCharData = (char*)calloc(sizeof(char), dataLength); - - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - - // store 'allCharData' in supportData structure - bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - - // set success flag - readCharDataOK = true; - - // save CIGAR ops - // need to calculate this here so that 
BamAlignment::GetEndPosition() performs correctly, - // even when GetNextAlignmentCore() is called - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { - - // swap if necessary - if ( IsBigEndian ) SwapEndian_32(cigarData[i]); - - // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - } - } - - free(allCharData); - return readCharDataOK; -} - -// loads reference data from BAM file -void BamReaderPrivate::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[4]; - mBGZF.Read(buffer, 4); - unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); - if ( numberRefSeqs == 0 ) return; - References.reserve((int)numberRefSeqs); - - // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - mBGZF.Read(buffer, 4); - unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refNameLength); - char* refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - mBGZF.Read(refName, refNameLength); - mBGZF.Read(buffer, 4); - int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) SwapEndian_32(refLength); - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - References.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } -} - -// mark references with no alignment data -void 
BamReaderPrivate::MarkReferences(void) { - - // ensure index is available - if ( !HasIndex ) return; - - // mark empty references - for ( int i = 0; i < (int)References.size(); ++i ) - References.at(i).RefHasAlignments = Index->HasAlignments(i); -} - -// opens BAM file (and index) -bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { - - // store filenames - Filename = filename; - IndexFilename = indexFilename; - - // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) return false; - - // retrieve header text & reference data - LoadHeaderData(); - LoadReferenceData(); - - // store file offset of first alignment - AlignmentsBeginOffset = mBGZF.Tell(); - - // if no index filename provided - if ( IndexFilename.empty() ) { - - // client did not specify that index SHOULD be found - // useful for cases where sequential access is all that is required - if ( !lookForIndex ) return true; - - // otherwise, look for index file, return success/fail - return LoadIndex(lookForIndex, preferStandardIndex) ; - } - - // client supplied an index filename - // attempt to load index data, return success/fail - return LoadIndex(lookForIndex, preferStandardIndex); -} - -// returns BAM file pointer to beginning of alignment data -bool BamReaderPrivate::Rewind(void) { - - // rewind to first alignment, return false if unable to seek - if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - - // retrieve first alignment data, return false if unable to read - BamAlignment al; - if ( !LoadNextAlignment(al) ) return false; - - // reset default region info using first alignment in file - Region.clear(); - HasAlignmentsInRegion = true; - - // rewind back to beginning of first alignment - // return success/fail of seek - return mBGZF.Seek(AlignmentsBeginOffset); -} - -// change the index caching behavior -void BamReaderPrivate::SetIndexCacheMode(const 
BamIndex::BamIndexCacheMode mode) { - IndexCacheMode = mode; - if ( Index == 0 ) return; - Index->SetCacheMode(mode); -} - -// asks Index to attempt a Jump() to specified region -// returns success/failure -bool BamReaderPrivate::SetRegion(const BamRegion& region) { - - // clear out any prior BamReader region data - // - // N.B. - this is cleared so that BamIndex now has free reign to call - // GetNextAlignmentCore() and do overlap checking without worrying about BamReader - // performing any overlap checking of its own and moving on to the next read... Calls - // to GetNextAlignmentCore() with no Region set, simply return the next alignment. - // This ensures that the Index is able to do just that. (All without exposing - // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) - Region.clear(); - - // check for existing index - if ( !HasIndex ) return false; - - // adjust region if necessary to reflect where data actually begins - BamRegion adjustedRegion(region); - AdjustRegion(adjustedRegion); - - // if no data present, return true - // not an error, but BamReader knows that no data is there for future alignment access - // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions - // that other BAMs have data) - if ( !HasAlignmentsInRegion ) { - Region = adjustedRegion; - return true; - } - - // attempt jump to user-specified region return false if jump could not be performed at all - // (invalid index, unknown reference, etc) - // - // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag - // * This covers case where a region is requested that lies beyond the last alignment on a reference - // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false - // BamMultiReader is then able to successfully pull alignments from a region from multiple files - // even if one or more have no data. 
- if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; - - // save region and return success - Region = adjustedRegion; - return true; -}
--- a/spp/src/BamReader_p.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,137 +0,0 @@ -// *************************************************************************** -// BamReader_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_P_H -#define BAMREADER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to version -// without notice, or even be removed. -// -// We mean it. - -#include <BamAlignment.h> -#include <BamIndex.h> -#include <BGZF.h> -#include <string> - -namespace BamTools { - -class BamReader; - -namespace Internal { - -class BamReaderPrivate { - - // enums - public: enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; - - // ctor & dtor - public: - BamReaderPrivate(BamReader* parent); - ~BamReaderPrivate(void); - - // 'public' interface to BamReader - public: - - // file operations - void Close(void); - bool Open(const std::string& filename, - const std::string& indexFilename, - const bool lookForIndex, - const bool preferStandardIndex); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& bAlignment); - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // access auxiliary data - const std::string GetHeaderText(void) const; - int GetReferenceID(const std::string& refName) const; - - // index operations - bool CreateIndex(bool useStandardIndex); - void 
SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); - - // 'internal' methods - public: - - // --------------------------------------- - // reading alignments and auxiliary data - - // adjusts requested region if necessary (depending on where data actually begins) - void AdjustRegion(BamRegion& region); - // fills out character data for BamAlignment data - bool BuildCharData(BamAlignment& bAlignment); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - // mark references with 'HasAlignments' status - void MarkReferences(void); - - // --------------------------------- - // index file handling - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); - - // data members - public: - - // general file data - BgzfData mBGZF; - std::string HeaderText; - BamIndex* Index; - RefVector References; - bool HasIndex; - int64_t AlignmentsBeginOffset; - std::string Filename; - std::string IndexFilename; - -// Internal::BamHeader* m_header; - - // index caching mode - BamIndex::BamIndexCacheMode IndexCacheMode; - - // system data - bool IsBigEndian; - - // user-specified region values - BamRegion Region; - bool HasAlignmentsInRegion; - - // parent BamReader - BamReader* Parent; - - // BAM character constants - const char* DNA_LOOKUP; - const char* CIGAR_LOOKUP; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMREADER_P_H
--- a/spp/src/BamStandardIndex_p.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,910 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#include <BamAlignment.h> -#include <BamReader.h> -#include <BGZF.h> -#include <BamStandardIndex_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <cstdio> -#include <cstdlib> -#include <algorithm> -#include <iostream> -#include <map> -using namespace std; - -BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -BamStandardIndex::~BamStandardIndex(void) { - ClearAllData(); -} - -// calculate bins that overlap region -int BamStandardIndex::BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]) -{ - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; - - // get rest of 
bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; -} - -// creates index data (in-memory) from current reader data -bool BamStandardIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // get reference count, reserve index space - const int numReferences = (int)m_references.size(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = m_BGZF->Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - fprintf(stderr, "BAM file not properly sorted:\n"); - fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), - lastCoordinate, bAlignment.Position, bAlignment.RefID); - 
exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; - SaveLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out - if ( saveRefID < 0 ) break; - } - - // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { - fprintf(stderr, "Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = m_BGZF->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); - if ( indexIter == m_indexData.end() ) return false; // error - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& binMap = refIndex.Bins; - SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); - } 
- - // simplify index by merging chunks - MergeChunks(); - - // iterate through references in index - // sort offsets in linear offset vector - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter).second; - LinearOffsetVector& offsets = refIndex.Offsets; - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); -} - -// check index file magic number, return true if OK -bool BamStandardIndex::CheckMagicNumber(void) { - - // read in magic number - char magic[4]; - size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); - - // compare to expected value - if ( strncmp(magic, "BAI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - fclose(m_indexStream); - return false; - } - - // return success/failure of load - return (elementsRead == 4); -} - -// clear all current index offset data in memory -void BamStandardIndex::ClearAllData(void) { - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } -} - -// clear all index offset data for desired reference -void BamStandardIndex::ClearReferenceOffsets(const int& refId) { - - // look up refId, skip if not found - BamStandardIndexData::iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return ; - - // clear reference data - ReferenceIndex& refEntry = (*indexIter).second; - refEntry.Bins.clear(); - refEntry.Offsets.clear(); - - // set flag - m_hasFullDataCache = false; -} - -// return file position after header metadata 
-const off_t BamStandardIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} - -// calculates offset(s) for a given region -bool BamStandardIndex::GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - vector<int64_t>& offsets, - bool* hasAlignmentsInRegion) -{ - // return false if leftBound refID is not found in index data - if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) - return false; - - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); - - // get bins for this reference - BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refIndex = (*indexIter).second; - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) - ? 
0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - // iterate over chunks - const ChunkVector& chunks = (*binIter).second; - std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); - std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); - } - } - } - - // clean up memory - free(bins); - - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); - - // set flag & return success - *hasAlignmentsInRegion = (offsets.size() != 0 ); - - // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); - - // return succes - return true; -} - -// returns whether reference has alignments or no -bool BamStandardIndex::HasAlignments(const int& refId) const { - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; // error - const ReferenceIndex& refEntry = (*indexIter).second; - return refEntry.HasAlignments; -} - -// return true if all index data is cached -bool BamStandardIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; -} - -// returns true if index cache has data for desired reference -bool BamStandardIndex::IsDataLoaded(const int& refId) const { - - // look up refId, return false if not found - BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end() ) return false; - - // see if reference has 
alignments - // if not, it's not a problem to have no offset data - const ReferenceIndex& refEntry = (*indexIter).second; - if ( !refEntry.HasAlignments ) return true; - - // return whether bin map contains data - return ( !refEntry.Bins.empty() ); -} - -// attempts to use index to jump to region; returns success/fail -bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) - return false; - - // calculate offsets for this region - // if failed, print message, set flag, and return failure - vector<int64_t> offsets; - if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); - *hasAlignmentsInRegion = false; - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - // set flag to true if data exists - result &= m_BGZF->Seek(*o); - *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to the offset before the 'current offset' (to cover overlaps) - if ( ((bAlignment.RefID == region.LeftRefID) && - ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || - (bAlignment.RefID > region.LeftRefID) ) - { - if ( o != offsets.begin() ) --o; - return m_BGZF->Seek(*o); - } - } - - // if error in jumping, print message & set flag - if ( !result ) { - fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); - 
*hasAlignmentsInRegion = false; - } - - // return success/failure - return result; -} - -// clears index data from all references except the first -void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets((*indexBegin).first); -} - -// clears index data from all references except the one specified -void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamStandardIndexData::iterator mapIter = m_indexData.begin(); - BamStandardIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); - } -} - -bool BamStandardIndex::LoadAllReferences(bool saveData) { - - // skip if data already loaded - if ( m_hasFullDataCache ) return true; - - // get number of reference sequences - uint32_t numReferences; - if ( !LoadReferenceCount((int&)numReferences) ) - return false; - - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < (int)numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; - - // return success/failure of loading references - return loadedOk; -} - -// load header data from index file, return true if loaded OK -bool BamStandardIndex::LoadHeader(void) { - - bool loadedOk = CheckMagicNumber(); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of load - return loadedOk; -} - -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // get bin ID - uint32_t binId; - elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); - if ( m_isBigEndian ) 
SwapEndian_32(binId); - - // load alignment chunks for this bin - ChunkVector chunks; - bool chunksOk = LoadChunks(chunks, saveData); - - // store bin entry - if ( chunksOk && saveData ) - refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks)); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); -} - -bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // get number of bins - int32_t numBins; - elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numBins); - - // set flag - refEntry.HasAlignments = ( numBins != 0 ); - - // iterate over bins - bool binsOk = true; - for ( int i = 0; i < numBins; ++i ) - binsOk &= LoadBin(refEntry, saveData); - - // return success/failure of load - return ( (elementsRead == 1) && binsOk ); -} - -// load a single index bin entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { - - size_t elementsRead = 0; - - // read in chunk data - uint64_t start; - uint64_t stop; - elementsRead += fread(&start, sizeof(start), 1, m_indexStream); - elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save data if requested - if ( saveData ) chunks.push_back( Chunk(start, stop) ); - - // return success/failure of load - return ( elementsRead == 2 ); -} - -bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { - - size_t elementsRead = 0; - - // read in number of chunks - uint32_t numChunks; - elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numChunks); - - // initialize space for chunks if we're storing this data - if ( saveData ) chunks.reserve(numChunks); - - // iterate 
over chunks - bool chunksOk = true; - for ( int i = 0; i < (int)numChunks; ++i ) - chunksOk &= LoadChunk(chunks, saveData); - - // sort chunk vector - sort( chunks.begin(), chunks.end(), ChunkLessThan ); - - // return success/failure of load - return ( (elementsRead == 1) && chunksOk ); -} - -// load a single index linear offset entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { - - size_t elementsRead = 0; - - // read in number of linear offsets - int32_t numLinearOffsets; - elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); - - // set up destination vector (if we're saving the data) - LinearOffsetVector linearOffsets; - if ( saveData ) linearOffsets.reserve(numLinearOffsets); - - // iterate over linear offsets - uint64_t linearOffset; - for ( int i = 0; i < numLinearOffsets; ++i ) { - elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - if ( saveData ) linearOffsets.push_back(linearOffset); - } - - // sort linear offsets - sort ( linearOffsets.begin(), linearOffsets.end() ); - - // save in reference index entry if desired - if ( saveData ) refEntry.Offsets = linearOffsets; - - // return success/failure of load - return ( elementsRead == (size_t)(numLinearOffsets + 1) ); -} - -bool BamStandardIndex::LoadFirstReference(bool saveData) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference((*indexBegin).first, saveData); -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { - - // look up refId - BamStandardIndexData::iterator indexIter = 
m_indexData.find(refId); - - // if reference not previously loaded, create new entry - if ( indexIter == m_indexData.end() ) { - ReferenceIndex newEntry; - newEntry.HasAlignments = false; - m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) ); - } - - // load reference data - indexIter = m_indexData.find(refId); - ReferenceIndex& entry = (*indexIter).second; - bool loadedOk = true; - loadedOk &= LoadBins(entry, saveData); - loadedOk &= LoadLinearOffsets(entry, saveData); - return loadedOk; -} - -// loads number of references, return true if loaded OK -bool BamStandardIndex::LoadReferenceCount(int& numReferences) { - - size_t elementsRead = 0; - - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // return success/failure of load - return ( elementsRead == 1 ); -} - -// merges 'alignment chunks' in BAM bin (used for index building) -void BamStandardIndex::MergeChunks(void) { - - // iterate over reference enties - BamStandardIndexData::iterator indexIter = m_indexData.begin(); - BamStandardIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter).second; - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) continue; - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& 
currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if chunk ends where (iterator) chunk starts, then merge - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) - currentChunk.Stop = iteratorChunk.Stop; - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - -// saves BAM bin entry for index -void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} - -// initializes index data structure to hold @count references -void BamStandardIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; -} - -bool 
BamStandardIndex::SkipToFirstReference(void) { - BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); -} - -// position file pointer to desired reference begin, return true if skipped OK -bool BamStandardIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; - - // read in number of references - uint32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; - } - - // return success - return skippedOk; -} - -// write header to new index file -bool BamStandardIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write magic number - elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return (elementsWritten == 4); -} - -// write index data for all references to new index file -bool BamStandardIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; - - // write number of reference sequences - int32_t numReferenceSeqs = m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); - elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); - - // iterate over reference sequences - bool refsOk = true; - BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); - BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) - refsOk &= WriteReference( (*indexIter).second ); - - // return success/failure of write - return ( (elementsWritten == 1) && refsOk ); -} - 
-// write index data for bin to new index file -bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { - - size_t elementsWritten = 0; - - // write BAM bin ID - uint32_t binKey = binId; - if ( m_isBigEndian ) SwapEndian_32(binKey); - elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); - - // write chunks - bool chunksOk = WriteChunks(chunks); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); -} - -// write index data for bins to new index file -bool BamStandardIndex::WriteBins(const BamBinMap& bins) { - - size_t elementsWritten = 0; - - // write number of bins - int32_t binCount = bins.size(); - if ( m_isBigEndian ) SwapEndian_32(binCount); - elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); - - // iterate over bins - bool binsOk = true; - BamBinMap::const_iterator binIter = bins.begin(); - BamBinMap::const_iterator binEnd = bins.end(); - for ( ; binIter != binEnd; ++binIter ) - binsOk &= WriteBin( (*binIter).first, (*binIter).second ); - - // return success/failure of write - return ( (elementsWritten == 1) && binsOk ); -} - -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunk(const Chunk& chunk) { - - size_t elementsWritten = 0; - - // localize alignment chunk offsets - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // write to index file - elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); - elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); - - // return success/failure of write - return ( elementsWritten == 2 ); -} - -// write index data for chunk entry to new index file -bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { - - size_t elementsWritten = 0; - - // write chunks - int32_t chunkCount = chunks.size(); - if ( m_isBigEndian ) 
SwapEndian_32(chunkCount); - elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); - - // iterate over chunks - bool chunksOk = true; - ChunkVector::const_iterator chunkIter = chunks.begin(); - ChunkVector::const_iterator chunkEnd = chunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) - chunksOk &= WriteChunk( (*chunkIter) ); - - // return success/failure of write - return ( (elementsWritten == 1) && chunksOk ); -} - -// write index data for linear offsets entry to new index file -bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { - - size_t elementsWritten = 0; - - // write number of linear offsets - int32_t offsetCount = offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(offsetCount); - elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset - uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) SwapEndian_64(linearOffset); - elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); - } - - // return success/failure of write - return ( elementsWritten == (size_t)(offsetCount + 1) ); -} - -// write index data for a single reference to new index file -bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { - bool refOk = true; - refOk &= WriteBins(refEntry.Bins); - refOk &= WriteLinearOffsets(refEntry.Offsets); - return refOk; -}
--- a/spp/src/BamStandardIndex_p.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,213 +0,0 @@ -// *************************************************************************** -// BamStandardIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the standardized BAM index format (".bai") -// *************************************************************************** - -#ifndef BAM_STANDARD_INDEX_FORMAT_H -#define BAM_STANDARD_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. 
- -#include <BamAux.h> -#include <BamIndex.h> -#include <map> -#include <string> -#include <vector> - -namespace BamTools { - -class BamAlignment; - -namespace Internal { - -// BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_LIDX_SHIFT = 14; - -// -------------------------------------------------- -// BamStandardIndex data structures & typedefs -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -inline -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; -} - -typedef std::vector<Chunk> ChunkVector; -typedef std::map<uint32_t, ChunkVector> BamBinMap; -typedef std::vector<uint64_t> LinearOffsetVector; - -struct ReferenceIndex { - - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - bool HasAlignments; - - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector(), - const bool hasAlignments = false) - : Bins(binMap) - , Offsets(offsets) - , HasAlignments(hasAlignments) - { } -}; - -typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData; - -class BamStandardIndex : public BamIndex { - - // ctor & dtor - public: - BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamStandardIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bai"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but not whether this region has data - // * thus, the method sets a flag to indicate 
whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods - public: - - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadBin(ReferenceIndex& refEntry, bool saveData = true); - bool LoadBins(ReferenceIndex& refEntry, bool saveData = true); - // load a single index bin entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadChunk(ChunkVector& chunks, bool saveData = true); - bool LoadChunks(ChunkVector& chunks, bool saveData 
= true); - // load a single index linear offset entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write index data for bin to new index file - bool WriteBin(const uint32_t& binId, const ChunkVector& chunks); - // write index data for bins to new index file - bool WriteBins(const BamBinMap& bins); - // write index data for chunk entry to new index file - bool WriteChunk(const Chunk& chunk); - // write index data for chunk entry to new index file - bool WriteChunks(const ChunkVector& chunks); - // write index data for linear offsets entry to new index file - bool WriteLinearOffsets(const LinearOffsetVector& offsets); - // write index data single reference to new index file - bool WriteReference(const ReferenceIndex& refEntry); - - // ----------------------- - // index data operations - - // calculate bins that overlap region - int BinsFromRegion(const BamRegion& region, - const bool isRightBoundSpecified, - uint16_t bins[MAX_BIN]); - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculates offset(s) for a given region - bool GetOffsets(const BamRegion& region, - const bool isRightBoundSpecified, - std::vector<int64_t>& offsets, - bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void 
KeepOnlyReferenceOffsets(const int& refId); - // simplifies index by merging 'chunks' - void MergeChunks(void); - // saves BAM bin entry for index - void SaveBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset); - // saves linear offset entry for index - void SaveLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); - - // data members - private: - - BamStandardIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAM_STANDARD_INDEX_FORMAT_H
--- a/spp/src/BamToolsIndex_p.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,577 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#include <BamAlignment.h> -#include <BamReader.h> -#include <BGZF.h> -#include <BamToolsIndex_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <cstdio> -#include <cstdlib> -#include <algorithm> -#include <iostream> -#include <map> -using namespace std; - -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) - : BamIndex(bgzf, reader) - , m_blockSize(1000) - , m_dataBeginOffset(0) - , m_hasFullDataCache(false) - , m_inputVersion(0) - , m_outputVersion(BTI_1_2) // latest version - used for writing new index files -{ - m_isBigEndian = BamTools::SystemIsBigEndian(); -} - -// dtor -BamToolsIndex::~BamToolsIndex(void) { - ClearAllData(); -} - -// creates index data (in-memory) from current reader data -bool BamToolsIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - if ( !m_reader->Rewind() ) return false; - - // initialize index data structure with space for all references - const int numReferences = (int)m_references.size(); - m_indexData.clear(); - m_hasFullDataCache = false; - SetReferenceCount(numReferences); - - // set up counters and markers - int32_t currentBlockCount = 0; - int64_t 
currentAlignmentOffset = m_BGZF->Tell(); - int32_t blockRefId = 0; - int32_t blockMaxEndPosition = 0; - int64_t blockStartOffset = currentAlignmentOffset; - int32_t blockStartPosition = -1; - - // plow through alignments, storing index entries - BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { - - // if block contains data (not the first time through) AND alignment is on a new reference - if ( currentBlockCount > 0 && al.RefID != blockRefId ) { - - // store previous data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // intialize new block for current alignment's reference - currentBlockCount = 0; - blockMaxEndPosition = al.GetEndPosition(); - blockStartOffset = currentAlignmentOffset; - } - - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockRefId = al.RefID; - blockStartPosition = al.Position; - } - - // increment block counter - ++currentBlockCount; - - // check end position - int32_t alignmentEndPosition = al.GetEndPosition(); - if ( alignmentEndPosition > blockMaxEndPosition ) - blockMaxEndPosition = alignmentEndPosition; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == m_blockSize ) { - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - blockStartOffset = m_BGZF->Tell(); - currentBlockCount = 0; - } - - // not the best name, but for the next iteration, this value will be the offset of the *current* alignment - // necessary because we won't know if this next alignment is on a new reference until we actually read it - currentAlignmentOffset = m_BGZF->Tell(); - } - - // store final block with data - BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); - SaveOffsetEntry(blockRefId, entry); - - // set flag - m_hasFullDataCache = true; - - // return 
success/failure of rewind - return m_reader->Rewind(); -} - -// check index file magic number, return true if OK -bool BamToolsIndex::CheckMagicNumber(void) { - - // see if index is valid BAM index - char magic[4]; - size_t elementsRead = fread(magic, 1, 4, m_indexStream); - if ( elementsRead != 4 ) return false; - if ( strncmp(magic, "BTI\1", 4) != 0 ) { - fprintf(stderr, "Problem with index file - invalid format.\n"); - return false; - } - - // otherwise ok - return true; -} - -// check index file version, return true if OK -bool BamToolsIndex::CheckVersion(void) { - - // read version from file - size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); - - // if version is negative, or zero - if ( m_inputVersion <= 0 ) { - fprintf(stderr, "Problem with index file - invalid version.\n"); - return false; - } - - // if version is newer than can be supported by this version of bamtools - else if ( m_inputVersion > m_outputVersion ) { - fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); - fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); - return false; - } - - // ------------------------------------------------------------------ - // check for deprecated, unsupported versions - // (typically whose format did not accomodate a particular bug fix) - - else if ( (Version)m_inputVersion == BTI_1_0 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - return false; - } - - else if ( (Version)m_inputVersion == BTI_1_1 ) { - fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to 
handling empty references.\n"); - fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); - return false; - } - - // otherwise ok - else return true; -} - -// clear all current index offset data in memory -void BamToolsIndex::ClearAllData(void) { - BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - const int& refId = (*indexIter).first; - ClearReferenceOffsets(refId); - } -} - -// clear all index offset data for desired reference -void BamToolsIndex::ClearReferenceOffsets(const int& refId) { - if ( m_indexData.find(refId) == m_indexData.end() ) return; - vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets; - offsets.clear(); - m_hasFullDataCache = false; -} - -// return file position after header metadata -const off_t BamToolsIndex::DataBeginOffset(void) const { - return m_dataBeginOffset; -} - -// calculate BAM file offset for desired region -// return true if no error (*NOT* equivalent to "has alignments or valid offset") -// check @hasAlignmentsInRegion to determine this status -// @region - target region -// @offset - resulting seek target -// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status -// N.B. 
- ignores isRightBoundSpecified -bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { - - // return false if leftBound refID is not found in index data - BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; - - // load index data for region if not already cached - if ( !IsDataLoaded(region.LeftRefID) ) { - bool loadedOk = true; - loadedOk &= SkipToReference(region.LeftRefID); - loadedOk &= LoadReference(region.LeftRefID); - if ( !loadedOk ) return false; - } - - // localize index data for this reference (& sanity check that data actually exists) - indexIter = m_indexData.find(region.LeftRefID); - if ( indexIter == m_indexData.end()) return false; - const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets; - if ( referenceOffsets.empty() ) return false; - - // ------------------------------------------------------- - // calculate nearest index to jump to - - // save first offset - offset = (*referenceOffsets.begin()).StartOffset; - - // iterate over offsets entries on this reference - vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin(); - vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - const BamToolsIndexEntry& entry = (*offsetIter); - // break if alignment 'entry' overlaps region - if ( entry.MaxEndPosition >= region.LeftPosition ) break; - offset = (*offsetIter).StartOffset; - } - - // set flag based on whether an index entry was found for this region - *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); - - // if cache mode set to none, dump the data we just loaded - if (m_cacheMode == BamIndex::NoIndexCaching ) - ClearReferenceOffsets(region.LeftRefID); - - // return success - return true; -} - -// returns whether reference has alignments or no -bool BamToolsIndex::HasAlignments(const int& 
refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - return refEntry.HasAlignments; -} - -// return true if all index data is cached -bool BamToolsIndex::HasFullDataCache(void) const { - return m_hasFullDataCache; -} - -// returns true if index cache has data for desired reference -bool BamToolsIndex::IsDataLoaded(const int& refId) const { - - BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); - if ( indexIter == m_indexData.end()) return false; - const BamToolsReferenceEntry& refEntry = (*indexIter).second; - - if ( !refEntry.HasAlignments ) return true; // no data period - - // return whether offsets list contains data - return !refEntry.Offsets.empty(); -} - -// attempts to use index to jump to region; returns success/fail -bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { - - // clear flag - *hasAlignmentsInRegion = false; - - // check valid BamReader state - if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { - fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); - return false; - } - - // make sure left-bound position is valid - if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) - return false; - - // calculate nearest offset to jump to - int64_t offset; - if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { - fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); - return false; - } - - // return success/failure of seek - return m_BGZF->Seek(offset); -} - -// clears index data from all references except the first -void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - KeepOnlyReferenceOffsets( (*indexBegin).first ); -} - -// clears index data from all references except the one specified 
-void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { - BamToolsIndexData::iterator mapIter = m_indexData.begin(); - BamToolsIndexData::iterator mapEnd = m_indexData.end(); - for ( ; mapIter != mapEnd; ++mapIter ) { - const int entryRefId = (*mapIter).first; - if ( entryRefId != refId ) - ClearReferenceOffsets(entryRefId); - } -} - -// load index data for all references, return true if loaded OK -bool BamToolsIndex::LoadAllReferences(bool saveData) { - - // skip if data already loaded - if ( m_hasFullDataCache ) return true; - - // read in number of references - int32_t numReferences; - if ( !LoadReferenceCount(numReferences) ) return false; - //SetReferenceCount(numReferences); - - // iterate over reference entries - bool loadedOk = true; - for ( int i = 0; i < numReferences; ++i ) - loadedOk &= LoadReference(i, saveData); - - // set flag - if ( loadedOk && saveData ) - m_hasFullDataCache = true; - - // return success/failure of load - return loadedOk; -} - -// load header data from index file, return true if loaded OK -bool BamToolsIndex::LoadHeader(void) { - - // check magic number - if ( !CheckMagicNumber() ) return false; - - // check BTI version - if ( !CheckVersion() ) return false; - - // read in block size - size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(m_blockSize); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of load - return (elementsRead == 1); -} - -// load a single index entry from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { - - // read in index entry data members - size_t elementsRead = 0; - BamToolsIndexEntry entry; - elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); - elementsRead 
+= fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); - elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); - if ( elementsRead != 3 ) { - cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; - return false; - } - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(entry.MaxEndPosition); - SwapEndian_64(entry.StartOffset); - SwapEndian_32(entry.StartPosition); - } - - // save data - if ( saveData ) - SaveOffsetEntry(refId, entry); - - // return success/failure of load - return true; -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadFirstReference(bool saveData) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return LoadReference( (*indexBegin).first, saveData ); -} - -// load a single reference from file, return true if loaded OK -// @saveData - save data in memory if true, just read & discard if false -bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { - - // read in number of offsets for this reference - uint32_t numOffsets; - size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - - // initialize offsets container for this reference - SetOffsetCount(refId, (int)numOffsets); - - // iterate over offset entries - for ( unsigned int j = 0; j < numOffsets; ++j ) - LoadIndexEntry(refId, saveData); - - // return success/failure of load - return true; -} - -// loads number of references, return true if loaded OK -bool BamToolsIndex::LoadReferenceCount(int& numReferences) { - - size_t elementsRead = 0; - - // read reference count - elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // 
return success/failure of load - return ( elementsRead == 1 ); -} - -// saves an index offset entry in memory -void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.HasAlignments = true; - refEntry.Offsets.push_back(entry); -} - -// pre-allocates size for offset vector -void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { - BamToolsReferenceEntry& refEntry = m_indexData[refId]; - refEntry.Offsets.reserve(offsetCount); - refEntry.HasAlignments = ( offsetCount > 0); -} - -// initializes index data structure to hold @count references -void BamToolsIndex::SetReferenceCount(const int& count) { - for ( int i = 0; i < count; ++i ) - m_indexData[i].HasAlignments = false; -} - -// position file pointer to first reference begin, return true if skipped OK -bool BamToolsIndex::SkipToFirstReference(void) { - BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); - return SkipToReference( (*indexBegin).first ); -} - -// position file pointer to desired reference begin, return true if skipped OK -bool BamToolsIndex::SkipToReference(const int& refId) { - - // attempt rewind - if ( !Rewind() ) return false; - - // read in number of references - int32_t numReferences; - size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); - if ( elementsRead != 1 ) return false; - if ( m_isBigEndian ) SwapEndian_32(numReferences); - - // iterate over reference entries - bool skippedOk = true; - int currentRefId = 0; - while (currentRefId != refId) { - skippedOk &= LoadReference(currentRefId, false); - ++currentRefId; - } - - // return success/failure of skip - return skippedOk; -} - -// write header to new index file -bool BamToolsIndex::WriteHeader(void) { - - size_t elementsWritten = 0; - - // write BTI index format 'magic number' - elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); - - // write BTI index format version 
- int32_t currentVersion = (int32_t)m_outputVersion; - if ( m_isBigEndian ) SwapEndian_32(currentVersion); - elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); - - // write block size - int32_t blockSize = m_blockSize; - if ( m_isBigEndian ) SwapEndian_32(blockSize); - elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); - - // store offset of beginning of data - m_dataBeginOffset = ftell64(m_indexStream); - - // return success/failure of write - return ( elementsWritten == 6 ); -} - -// write index data for all references to new index file -bool BamToolsIndex::WriteAllReferences(void) { - - size_t elementsWritten = 0; - - // write number of references - int32_t numReferences = (int32_t)m_indexData.size(); - if ( m_isBigEndian ) SwapEndian_32(numReferences); - elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); - - // iterate through references in index - bool refOk = true; - BamToolsIndexData::const_iterator refIter = m_indexData.begin(); - BamToolsIndexData::const_iterator refEnd = m_indexData.end(); - for ( ; refIter != refEnd; ++refIter ) - refOk &= WriteReferenceEntry( (*refIter).second ); - - return ( (elementsWritten == 1) && refOk ); -} - -// write current reference index data to new index file -bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { - - size_t elementsWritten = 0; - - // write number of offsets listed for this reference - uint32_t numOffsets = refEntry.Offsets.size(); - if ( m_isBigEndian ) SwapEndian_32(numOffsets); - elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); - - // iterate over offset entries - bool entriesOk = true; - vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin(); - vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) - entriesOk &= WriteIndexEntry( (*offsetIter) ); - - return ( 
(elementsWritten == 1) && entriesOk ); -} - -// write current index offset entry to new index file -bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { - - // copy entry data - int32_t maxEndPosition = entry.MaxEndPosition; - int64_t startOffset = entry.StartOffset; - int32_t startPosition = entry.StartPosition; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_32(maxEndPosition); - SwapEndian_64(startOffset); - SwapEndian_32(startPosition); - } - - // write the reference index entry - size_t elementsWritten = 0; - elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); - elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); - elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); - return ( elementsWritten == 3 ); -}
--- a/spp/src/BamToolsIndex_p.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,192 +0,0 @@ -// *************************************************************************** -// BamToolsIndex.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index operations for the BamTools index format (".bti") -// *************************************************************************** - -#ifndef BAMTOOLS_INDEX_FORMAT_H -#define BAMTOOLS_INDEX_FORMAT_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. 
- -#include <BamAux.h> -#include <BamIndex.h> -#include <map> -#include <string> -#include <vector> - -namespace BamTools { - -namespace Internal { - -// individual index offset entry -struct BamToolsIndexEntry { - - // data members - int32_t MaxEndPosition; - int64_t StartOffset; - int32_t StartPosition; - - // ctor - BamToolsIndexEntry(const int32_t& maxEndPosition = 0, - const int64_t& startOffset = 0, - const int32_t& startPosition = 0) - : MaxEndPosition(maxEndPosition) - , StartOffset(startOffset) - , StartPosition(startPosition) - { } -}; - -// reference index entry -struct BamToolsReferenceEntry { - - // data members - bool HasAlignments; - std::vector<BamToolsIndexEntry> Offsets; - - // ctor - BamToolsReferenceEntry(void) - : HasAlignments(false) - { } -}; - -// the actual index data structure -typedef std::map<int, BamToolsReferenceEntry> BamToolsIndexData; - -class BamToolsIndex : public BamIndex { - - // keep a list of any supported versions here - // (might be useful later to handle any 'legacy' versions if the format changes) - // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on - // - // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: - // - // if ( indexVersion >= BTI_1_2 ) - // do something new - // else - // do the old thing - enum Version { BTI_1_0 = 1 - , BTI_1_1 - , BTI_1_2 - }; - - - // ctor & dtor - public: - BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); - ~BamToolsIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // returns supported file extension - const std::string Extension(void) const { return std::string(".bti"); } - // returns whether reference has alignments or no - bool HasAlignments(const int& referenceID) const; - // attempts to use index to jump to region; returns success/fail - // a "successful" jump indicates no error, but 
not whether this region has data - // * thus, the method sets a flag to indicate whether there are alignments - // available after the jump position - bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); - public: - // clear all current index offset data in memory - void ClearAllData(void); - // return file position after header metadata - const off_t DataBeginOffset(void) const; - // return true if all index data is cached - bool HasFullDataCache(void) const; - // clears index data from all references except the first - void KeepOnlyFirstReferenceOffsets(void); - // load index data for all references, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadAllReferences(bool saveData = true); - // load first reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadFirstReference(bool saveData = true); - // load header data from index file, return true if loaded OK - bool LoadHeader(void); - // position file pointer to first reference begin, return true if skipped OK - bool SkipToFirstReference(void); - // write index reference data - bool WriteAllReferences(void); - // write index header data - bool WriteHeader(void); - - // 'internal' methods - public: - - // ----------------------- - // index file operations - - // check index file magic number, return true if OK - bool CheckMagicNumber(void); - // check index file version, return true if OK - bool CheckVersion(void); - // return true if FILE* is open - bool IsOpen(void) const; - // load a single index entry from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadIndexEntry(const int& refId, bool saveData = true); - // load a single reference from file, return true if loaded OK - // @saveData - save data in memory if true, just read & discard if false - bool LoadReference(const int& refId, bool 
saveData = true); - // loads number of references, return true if loaded OK - bool LoadReferenceCount(int& numReferences); - // position file pointer to desired reference begin, return true if skipped OK - bool SkipToReference(const int& refId); - // write current reference index data to new index file - bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry); - // write current index offset entry to new index file - bool WriteIndexEntry(const BamToolsIndexEntry& entry); - - // ----------------------- - // index data operations - - // clear all index offset data for desired reference - void ClearReferenceOffsets(const int& refId); - // calculate BAM file offset for desired region - // return true if no error (*NOT* equivalent to "has alignments or valid offset") - // check @hasAlignmentsInRegion to determine this status - // @region - target region - // @offset - resulting seek target - // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status - bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); - // returns true if index cache has data for desired reference - bool IsDataLoaded(const int& refId) const; - // clears index data from all references except the one specified - void KeepOnlyReferenceOffsets(const int& refId); - // saves an index offset entry in memory - void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry); - // pre-allocates size for offset vector - void SetOffsetCount(const int& refId, const int& offsetCount); - // initializes index data structure to hold @count references - void SetReferenceCount(const int& count); - - // data members - private: - int32_t m_blockSize; - BamToolsIndexData m_indexData; - off_t m_dataBeginOffset; - bool m_hasFullDataCache; - bool m_isBigEndian; - int32_t m_inputVersion; // Version is serialized as int - Version m_outputVersion; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // 
BAMTOOLS_INDEX_FORMAT_H
--- a/spp/src/BamWriter.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -// *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include <BamWriter.h> -#include <BamWriter_p.h> -using namespace BamTools; -using namespace BamTools::Internal; - -#include <iostream> -using namespace std; - -// constructor -BamWriter::BamWriter(void) { - d = new BamWriterPrivate; -} - -// destructor -BamWriter::~BamWriter(void) { - delete d; - d = 0; -} - -// closes the alignment archive -void BamWriter::Close(void) { - d->Close(); -} - -// opens the alignment archive -bool BamWriter::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) -{ - return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); -} - -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - d->SaveAlignment(al); -}
--- a/spp/src/BamWriter.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -// *************************************************************************** -// BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#ifndef BAMWRITER_H -#define BAMWRITER_H - -#include <api_global.h> -#include <BamAlignment.h> -#include <string> - -namespace BamTools { - -namespace Internal { - class BamWriterPrivate; -} // namespace Internal - -class API_EXPORT BamWriter { - - // constructor/destructor - public: - BamWriter(void); - ~BamWriter(void); - - // public interface - public: - // closes the alignment archive - void Close(void); - // opens the alignment archive - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool writeUncompressed = false); - // saves the alignment to the alignment archive - void SaveAlignment(const BamTools::BamAlignment& al); - - // private implementation - private: - Internal::BamWriterPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMWRITER_H
--- a/spp/src/BamWriter_p.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,379 +0,0 @@ -// *************************************************************************** -// BamWriter_p.cpp (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 22 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include <BamAlignment.h> -#include <BamWriter_p.h> -using namespace BamTools; -using namespace BamTools::Internal; -using namespace std; - -BamWriterPrivate::BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); -} - -BamWriterPrivate::~BamWriterPrivate(void) { - mBGZF.Close(); -} - -// closes the alignment archive -void BamWriterPrivate::Close(void) { - mBGZF.Close(); -} - -// calculates minimum bin for a BAM alignment interval -const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { - --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); - return 0; -} - -// creates a cigar string from the supplied alignment -void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - 
vector<CigarOp>::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - - case '=': - nucleotideCode = 0; - break; - - case 'A': - nucleotideCode = 1; - break; - - case 'C': - nucleotideCode = 2; - break; - - case 'G': - nucleotideCode = 4; - break; - - case 'T': - nucleotideCode = 8; - break; - - case 'N': - nucleotideCode = 15; - break; - - default: - fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. 
Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -bool BamWriterPrivate::Open(const string& filename, - const string& samHeader, - const RefVector& referenceSequences, - bool isWriteUncompressed) -{ - // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) - return false; - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t 
referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); - } - - // return success - return true; -} - -// saves the alignment to the alignment archive -void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the raw char data - mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - } - - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing 
- const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( IsBigEndian ) { - - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - string baseQualities(al.Qualities); - char* 
pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; - } - mBGZF.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( IsBigEndian ) { - - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); - - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } - - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); - } -}
--- a/spp/src/BamWriter_p.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,63 +0,0 @@ -// *************************************************************************** -// BamWriter_p.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#ifndef BAMWRITER_P_H -#define BAMWRITER_P_H - -// ------------- -// W A R N I N G -// ------------- -// -// This file is not part of the BamTools API. It exists purely as an -// implementation detail. This header file may change from version to -// version without notice, or even be removed. -// -// We mean it. - -#include <BamAux.h> -#include <BGZF.h> -#include <string> -#include <vector> - -namespace BamTools { -namespace Internal { - -class BamWriterPrivate { - - // ctor & dtor - public: - BamWriterPrivate(void); - ~BamWriterPrivate(void); - - // "public" interface to BamWriter - public: - void Close(void); - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool isWriteUncompressed); - void SaveAlignment(const BamAlignment& al); - - // internal methods - public: - const unsigned int CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar); - void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); - - // data members - public: - BgzfData mBGZF; - bool IsBigEndian; -}; - -} // namespace Internal -} // namespace BamTools - -#endif // BAMWRITER_P_H
--- a/spp/src/Makevars.in Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -PKG_LIBS=@LIBS@ -lz -PKG_CFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@ -PKG_CXXFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@ -
--- a/spp/src/api_global.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,22 +0,0 @@ -// *************************************************************************** -// api_global.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides macros for exporting & importing BamTools API library symbols -// *************************************************************************** - -#ifndef API_GLOBAL_H -#define API_GLOBAL_H - -#include "bamtools_global.h" - -#ifdef BAMTOOLS_API_LIBRARY -# define API_EXPORT BAMTOOLS_LIBRARY_EXPORT -#else -# define API_EXPORT BAMTOOLS_LIBRARY_IMPORT -#endif - -#endif // API_GLOBAL_H
--- a/spp/src/bamread.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,222 +0,0 @@ -#include "pc.h" -#include "config.h" -#include <vector> -#include <string.h> -#include <iostream> -#include <fstream> -#include <sstream> -#include <strstream> -#include <algorithm> -#include <string> -#include <functional> -#include <utility> -#include <ext/hash_map> -#include <boost/tokenizer.hpp> - -#include "BamAlignment.h" -#include "BamAux.h" /* RefVector/RefData */ -#include "BamReader.h" - - -extern "C" { -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" -#include "Rdefines.h" -} - -using namespace std; -using namespace __gnu_cxx; - - -class lessAbsoluteValue { -public: - bool operator()(int a, int b) const { - return abs(a) < abs(b); - } -}; - - - - - -//#define DEBUG 1 - -extern "C" { - - - // read in bam file - SEXP read_bam(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - boost::char_separator<char> sep2(","); - - BamTools::BamReader bamf; - - if (!bamf.Open(fname)) { - cout << "ERROR: failed to open BAM file '" << fname << "'" << endl; - } else { - - Rprintf("opened %s\n",fname); - BamTools::RefVector refs = bamf.GetReferenceData(); - BamTools::BamAlignment al; - - int fcount=0; - while (bamf.GetNextAlignment(al)) { - if (!al.IsMapped() || !al.IsPrimaryAlignment()) { - continue; - } - - string tagname=al.Name; - string 
chr=refs[al.RefID].RefName; - int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0)); - if(al.IsReverseStrand()) { fpos=-1*fpos; } - - uint32_t nms; - int nm=0; - if (al.GetEditDistance(nms)) { - nm=nms; - } - - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(al.Name); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm); - if(fcount>30) { - break; - } -#endif - - } - bamf.Close(); - - Rprintf("done. 
read %d fragments\n",fcount); - } - - - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - 
Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - -}
--- a/spp/src/bamtools_global.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,32 +0,0 @@ -// *************************************************************************** -// bamtools_global.h (c) 2010 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 19 November 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic definitions for exporting & importing library symbols -// *************************************************************************** - -#ifndef BAMTOOLS_GLOBAL_H -#define BAMTOOLS_GLOBAL_H - -// BAMTOOLS_LIBRARY_EXPORT -#ifndef BAMTOOLS_LIBRARY_EXPORT -# if defined(WIN32) -# define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport) -# else -# define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default"))) -# endif -#endif // BAMTOOLS_LIBRARY_EXPORT - -// BAMTOOLS_LIBRARY_IMPORT -#ifndef BAMTOOLS_LIBRARY_IMPORT -# if defined(WIN32) -# define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport) -# else -# define BAMTOOLS_LIBRARY_IMPORT -# endif -#endif // BAMTOOLS_LIBRARY_IMPORT - -#endif // BAMTOOLS_GLOBAL_H
--- a/spp/src/bed2vector.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,2628 +0,0 @@ -#include "pc.h" -#include "config.h" -#include <vector> -#include <string.h> -#include <iostream> -#include <fstream> -#include <sstream> -#include <strstream> -#include <algorithm> -#include <string> -#include <functional> -#include <utility> -#include <ext/hash_map> -#include <boost/tokenizer.hpp> - -#ifdef HAVE_LIBBZ2 -#include <bzlib.h> -#endif - -extern "C" { -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" -#include "Rdefines.h" -} - -using namespace std; -using namespace __gnu_cxx; - - -class lessAbsoluteValue { -public: - bool operator()(int a, int b) const { - return abs(a) < abs(b); - } -}; - - - -#ifdef HAVE_LIBBZ2 -int get_bzline(BZFILE* b,string& line) { - char c; - int nBuf; - int bzerror=BZ_OK; - - while(bzerror == BZ_OK) { - nBuf=BZ2_bzRead(&bzerror, b, &c, 1); - if(bzerror==BZ_OK) { - if(c=='\n') { - return bzerror; - } else { - line+=c; - } - } - } - return bzerror; -} - -int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) { - line=""; - if(bz2file) { - int bzerror=get_bzline(b,line); - if(bzerror==BZ_OK) { - return(1); - } else { - if(bzerror!=BZ_STREAM_END) { - cerr<<"encountered BZERROR="<<bzerror<<endl; - } - return(0); - } - } else { - char *cline=NULL; - size_t n; - if(getline(&cline,&n,f) != -1) { - if(cline) { - cline[strlen(cline)-1]='\0'; - line+=cline; - free(cline); - } - return(1); - } else { - return(0); - } - } -} -#endif - - -/** - * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign - * corresponding to the strand. 
- */ - -//#define DEBUG 1 - -extern "C" { -SEXP read_bed_ends(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - ifstream bed_file(fname); - -#ifdef DEBUG - Rprintf("opened %s\n",fname); -#endif - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - - int fcount=0; - while(getline(bed_file,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str())); - string str_start=*sit++; - int fstart=atoi(str_start.c_str()); - string str_end=*sit++; - int fend=atoi(str_end.c_str()); - int fpos=fstart; - if(sit!=tok.end()) { - string u0=*sit++; - string nfield=*sit++; - string strand=*sit++; - if(strand=="-") { - fpos=-1*fend; - } - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos); - if(fcount>30) { - break; - } -#endif - - } - } - bed_file.close(); - - -#ifdef DEBUG - Rprintf("done. 
read %d fragments\n",fcount); -#endif - - Rprintf("done. read %d fragments\n",fcount); - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - sort(csi->begin(), csi->end(), lessAbsoluteValue()); - } - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - SEXP nv; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - int* i_nv=INTEGER(nv); - int i=0; - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_nv[i++]=*pi; - } - SET_VECTOR_ELT(ans, csi-pos.begin(), nv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - -SEXP read_meland_old(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<int> > poslen; // length - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - ifstream bed_file(fname); - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - - int fcount=0; - while(getline(bed_file,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - sit++; sit++; - 
string str_nm=*sit++; - int nm=0; - if(str_nm[0]=='U') { - nm=atoi((str_nm.c_str()+1)); - } else { - continue; - } - sit++; sit++; sit++; - string str_len=*sit++; - int len=atoi(str_len.c_str()); - string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - poslen.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - (poslen[cind]).push_back(len); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - bed_file.close(); - - -#ifdef DEBUG - Rprintf("done. read %d fragments\n",fcount); -#endif - - Rprintf("done. 
read %d fragments\n",fcount); - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi,lsi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - lsi=poslen.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - SET_STRING_ELT(dnames_R, 2, mkChar("l")); - - - - SEXP tv,nv,lv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - PROTECT(lv=allocVector(INTSXP,csi->size())); np++; - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - int* i_lv=INTEGER(lv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - vector<int>::const_iterator ili=lsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i_lv[i]=*ili++; - i++; - } - PROTECT(dv = allocVector(VECSXP, 3)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - SET_VECTOR_ELT(dv, 2, lv); - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - int get_a_line(FILE *f,string& line) { - line=""; - char cline[1024]; - if(fgets(cline,1024,f)) { - line+=cline; - return(1); - } else { 
- return(0); - } - } - - - SEXP read_meland(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<int> > poslen; // length - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - - Rprintf("opened %s\n",fname); - - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string tagname=*sit++; - sit++; - string str_nm=*sit++; - int nm=0; - if(str_nm[0]=='U') { - nm=atoi((str_nm.c_str()+1)); - } else { - continue; - } - sit++; sit++; sit++; - string str_len=*sit++; - int len=atoi(str_len.c_str()); - string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - poslen.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); 
-#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - (poslen[cind]).push_back(len); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - -#ifdef DEBUG - Rprintf("done. read %d fragments\n",fcount); -#endif - - Rprintf("done. read %d fragments\n",fcount); - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi,lsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - lsi=poslen.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - SET_STRING_ELT(dnames_R, 2, mkChar("l")); - if(read_names) { - SET_STRING_ELT(dnames_R, 3, mkChar("s")); - } - - - - SEXP tv,nv,lv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - PROTECT(lv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - int* i_lv=INTEGER(lv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - 
vector<int>::const_iterator ili=lsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i_lv[i]=*ili++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - SET_VECTOR_ELT(dv, 2, lv); - if(read_names) { - SET_VECTOR_ELT(dv, 3, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - -// reads regular eland files, recording mismatch positions -SEXP read_eland_mismatches(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > mm1; // position of the first mismatch (or 0 for none) - vector< vector<int> > mm2; // position of the second mismatch - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - sit++; - string seq=*sit++; - string str_nm=*sit++; - int nm=0; - if(str_nm[0]=='U') { - 
nm=atoi((str_nm.c_str()+1)); - } else { - continue; - } - sit++; sit++; sit++; - string chr=*sit++; - // extract chromosome name from this - int chrp=chr.find("chr"); - int pp=chr.find('.'); - chr=chr.substr(chrp+3,pp-chrp-3); - - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - - - string strand=*sit++; - int nstrand=0; - if(strand=="R") { - fpos=-1*(fpos+seq.size()-1); - nstrand=1; - } - - sit++; - - int nm1=0; int nm2=0; - if(sit!=tok.end()) { - string nms=*sit++; - nm1=atoi(nms.substr(0,nms.size()-1).c_str()); - if(nstrand) { nm1=seq.size()-nm1+1; } - } - if(sit!=tok.end()) { - string nms=*sit++; - nm2=atoi(nms.substr(0,nms.size()-1).c_str()); - if(nstrand) { nm2=seq.size()-nm2+1; } - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - mm1.push_back(vector<int>()); - mm2.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (mm1[cind]).push_back(nm1); - (mm2[cind]).push_back(nm2); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - -#ifdef DEBUG - Rprintf("done. read %d fragments\n",fcount); -#endif - - Rprintf("done. 
read %d fragments\n",fcount); - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi,lsi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=mm1.begin()+(csi-pos.begin()); - lsi=mm2.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("f")); - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - - - - SEXP tv,nv,lv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - PROTECT(lv=allocVector(INTSXP,csi->size())); np++; - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - int* i_lv=INTEGER(lv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - vector<int>::const_iterator ili=lsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i_lv[i]=*ili++; - i++; - } - PROTECT(dv = allocVector(VECSXP, 3)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - SET_VECTOR_ELT(dv, 2, lv); - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - // read in regular eland files, adjusting the negative strand coordinate by sequence length - SEXP read_eland(SEXP filename,SEXP 
read_tag_names_R,SEXP eland_tag_length_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); - int eland_tag_length=*(INTEGER(eland_tag_length_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string tagname=*sit++; - string sequence=*sit++; - int len=sequence.size(); - // adjust probe length if eland length limit was specified - if(eland_tag_length>0 && len>eland_tag_length) { - len=eland_tag_length; - } - string str_nm=*sit++; - int nm=0; - if(str_nm[0]=='U') { - nm=atoi((str_nm.c_str()+1)); - } else { - continue; - } - sit++; sit++; sit++; - string chr=*sit++; - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - string str_strand=*sit++; - - if(str_strand[0]=='R') { - fpos=-1*(fpos+len-1); - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - 
tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - Rprintf("done. read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - 
if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - - // read in extended eland files, adjusting the negative strand coordinate by sequence length - SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); - int eland_tag_length=*(INTEGER(eland_tag_length_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string machinename=*sit++; - string runnumber=*sit++; - string lanenumber=*sit++; - *sit++; - - string str_x=*sit++; - string str_y=*sit++; - - string 
tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y; - - - - *sit++; - *sit++; - - - string sequence=*sit++; - *sit++; - - string chr=*sit++; - string contig=*sit++; - chr=chr+contig; - - int len=sequence.size(); - // adjust probe length if eland length limit was specified - if(eland_tag_length>0 && len>eland_tag_length) { - len=eland_tag_length; - } - - - - string str_pos=*sit++; - if(str_pos.size()<1) { continue; } - int fpos=atoi(str_pos.c_str()); - string str_strand=*sit++; - - if(str_strand[0]=='R') { - fpos=-1*(fpos+len-1); - } - - string str_nm=*sit++; - // count non-digit characters - int nm=0; - for(int i=0;i<str_nm.size();i++) { - if(!isdigit(str_nm[i])) { nm++; } - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - Rprintf("done. 
read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - 
Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - // read in eland multi files, adjusting the negative strand coordinate by sequence length -SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { - -#ifdef DEBUG - Rprintf("read_eland_muti() : start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); - int eland_tag_length=*(INTEGER(eland_tag_length_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t",""); - boost::char_separator<char> comsep(",","",boost::keep_empty_tokens); - boost::char_separator<char> colsep(":","",boost::keep_empty_tokens); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int nline=0; - int fcount=0; - while(get_a_line(f,line)) { - nline++; - // chomp - size_t elpos = line.find_last_not_of("\n"); - if(elpos != string::npos) { - line = line.substr(0, elpos+1); - } -#ifdef DEBUG - Rprintf("line %d: %s\n",nline,line.c_str()); -#endif - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string tagname=*sit++; - string sequence=*sit++; - string mspec=*sit++; - // parse out match spec - - if(mspec=="NM" || mspec=="QC") { continue; } -#ifdef DEBUG - Rprintf("parsing out spec \"%s\" : ",mspec.c_str()); -#endif - - tokType stok(mspec, colsep); - tokType::iterator ssit=stok.begin(); - string str_nm0=*ssit++; - - int nm=0; - int nm0=atoi(str_nm0.c_str()); - if(nm0>1) { -#ifdef DEBUG - 
Rprintf("rejected for nm0\n"); -#endif - continue; - } - if(nm0==0) { - string str_nm1=*ssit++; - int nm1=atoi(str_nm1.c_str()); - if(nm1>1) { -#ifdef DEBUG - Rprintf("rejected for nm1\n"); -#endif - continue; - } - if(nm1==0) { - string str_nm2=*ssit++; - int nm2=atoi(str_nm2.c_str()); - if(nm2>1) { -#ifdef DEBUG - Rprintf("rejected for nm2\n"); -#endif - continue; - } - nm=2; - } else { - nm=1; - } - } - -#ifdef DEBUG - Rprintf("accepted (nm=%d)\n",nm); -#endif - int npos=0; - string mpos=*sit++; - vector<string> mposc; - vector<int> mposp; - tokType ptok(mpos, comsep); - string prevchr; - for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) { - string cpos=*psit; - npos++; - int strand=1; - if(cpos.size()<5) { - Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str()); - } - char lc=cpos.at(cpos.size()-1); - - if(atoi(&lc)==nm) { - switch(cpos.at(cpos.size()-2)) { - case 'R': strand=-1; break; - case 'F': strand=1; break; - default: - Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break; - continue; - } - string chr,str_pos; - size_t colpos=cpos.find(":"); - if(colpos==string::npos) { - if(npos>1) { - chr=prevchr; - str_pos=cpos.substr(0,cpos.size()-2); - } else { - Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str()); - continue; - } - } else { - chr=cpos.substr(0,colpos); - str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos); - } -#ifdef DEBUG - Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand); -#endif - int pos=strand*atoi(str_pos.c_str()); - mposc.push_back(chr); - mposp.push_back(pos); - } - } - - string chr; - int fpos; - if(mposc.size()!=1) { - if(mposc.size()==0) { - Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str()); - } else { - Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in 
\"%s\"\n",nline,mposc.size(),nm,mpos.c_str()); - } - continue; - } else { - chr=*mposc.begin(); - fpos=*mposp.begin(); - } - - int len=sequence.size(); - // adjust probe length if eland length limit was specified - if(eland_tag_length>0 && len>eland_tag_length) { - len=eland_tag_length; - } - - if(fpos<0) { - fpos=-1*(-1*fpos+len-1); - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - Rprintf("done. 
read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - 
Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - // read in regular eland files, adjusting the negative strand coordinate by sequence length - SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - boost::char_separator<char> sep2(","); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; - } else { -#ifdef HAVE_LIBBZ2 - BZFILE* b; - int bzerror; - - int bz2file=0; - if(strstr(fname,".bz2")) { - bz2file=1; - b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); - if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } - } -#endif - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; -#ifdef HAVE_LIBBZ2 - while(get_a_line(f,b,bz2file,line)) { -#else - while(get_a_line(f,line)) { -#endif - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string tagname=*sit++; - string str_strand=*sit++; - string chr=*sit++; - - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - - string sequence=*sit++; - sit++; sit++; - string mm=*sit++; - - int len=sequence.size(); - if(str_strand[0]=='-') { - fpos=-1*(fpos+len-1); - } - // determine number of mismatches - int nm=0; - if(mm.size()>0) { - nm++; - string::size_type tp(0); - while(tp!=string::npos) { - tp = 
mm.find(",",tp); - if(tp!=string::npos) { - tp++; - ++nm; - } - } - } - - - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - -#ifdef HAVE_LIBBZ2 - BZ2_bzReadClose( &bzerror, b); -#endif - fclose(f); - - Rprintf("done. 
read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - 
Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - // read in helicos tab-separated alignment output (regular or bz2) - SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<int> > poslen; // length of the match - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - boost::char_separator<char> sep2(","); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; - } else { -#ifdef HAVE_LIBBZ2 - BZFILE* b; - int bzerror; - - int bz2file=0; - if(strstr(fname,".bz2")) { - bz2file=1; - b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); - if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } - } -#endif - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - int nlines=0; -#ifdef HAVE_LIBBZ2 - while(get_a_line(f,b,bz2file,line)) { -#else - while(get_a_line(f,line)) { -#endif - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - nlines++; - // skip comments - if(line[0]=='#') { continue; } - if(line.compare(0,12,"Reference_ID")==0) { -#ifdef DEBUG - Rprintf("matched header on line %d\n",nlines); -#endif - continue; - } - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string chr=*sit++; - string tagname=*sit++; - string str_startpos=*sit++; - string str_endpos=*sit++; - - string str_tstart=*sit++; - string str_tend=*sit++; - int 
len=atoi(str_tend.c_str())-atoi(str_tstart.c_str()); - - sit++; sit++; - string str_ndel=*sit++; - string str_nins=*sit++; - string str_nsub=*sit++; - - string str_strand=*sit++; - int fpos; - if(str_strand[0]=='-') { - fpos=-1*atoi(str_endpos.c_str()); - } else { - fpos=atoi(str_startpos.c_str()); - } - - // determine number of mismatches - int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str()); - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - poslen.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - (poslen[cind]).push_back(len); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm); - if(fcount>30) { - break; - } -#endif - - } - } - -#ifdef HAVE_LIBBZ2 - BZ2_bzReadClose( &bzerror, b); -#endif - fclose(f); - - Rprintf("done. 
read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<int> >::const_iterator lsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - lsi=poslen.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - SET_STRING_ELT(dnames_R, 2, mkChar("l")); - if(read_names) { - SET_STRING_ELT(dnames_R, 3, mkChar("s")); - } - - - - SEXP tv,nv,lv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - PROTECT(lv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - int* i_lv=INTEGER(lv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - vector<int>::const_iterator lni=lsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i_lv[i]=*lni++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; - 
SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - SET_VECTOR_ELT(dv, 2, lv); - if(read_names) { - SET_VECTOR_ELT(dv, 3, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - - // read in text version of maq map - SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string tagname=*sit++; - string chr=*sit++; - string str_pos=*sit++; - int fpos=atoi(str_pos.c_str()); - string str_strand=*sit++; - sit++; sit++; sit++; sit++; sit++; - string str_nm=*sit++; - sit++; sit++; sit++; - string str_len=*sit++; - int nm=atoi(str_nm.c_str()); - int len=atoi(str_len.c_str()); - - if(str_strand[0]=='-') { - fpos=-1*(fpos+len-1); - } - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // 
register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - Rprintf("done. read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* 
i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - - - - // read in tagalign file - SEXP read_tagalign(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(get_a_line(f,line)) { - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string chr=*sit++; - string str_spos=*sit++; - string str_epos=*sit++; - sit++; - string str_qual=*sit++; - string str_strand=*sit; - - int fpos; - if(str_strand[0]=='+') { - fpos=atoi(str_spos.c_str()); - } else { - 
fpos=-1*atoi(str_epos.c_str()); - } - int nm=atoi(str_qual.c_str()); - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); - if(fcount>30) { - break; - } -#endif - - } - } - fclose(f); - - Rprintf("done. read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - - - SEXP tv,nv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> 
::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - PROTECT(dv = allocVector(VECSXP, 2)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - - - // arachne madness - SEXP read_arachne(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - -#ifdef HAVE_LIBBZ2 - BZFILE* b; - int bzerror; - - int bz2file=0; - if(strstr(fname,".bz2")) { - bz2file=1; - b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); - if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } - } -#endif - - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; -#ifdef HAVE_LIBBZ2 - while(get_a_line(f,b,bz2file,line)) { -#else - while(get_a_line(f,line)) { -#endif - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string chr=*sit++; - string str_spos=*sit++; - int nm=0; - if(sit!=tok.end()) { - string str_mm=*sit; - nm=atoi(str_mm.c_str()); - } - - int fpos=atoi(str_spos.c_str());; - - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); 
- int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); - if(fcount>30) { - break; - } -#endif - - } - } -#ifdef HAVE_LIBBZ2 - BZ2_bzReadClose( &bzerror, b); -#endif - - fclose(f); - - Rprintf("done. read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - - - SEXP tv,nv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - PROTECT(dv = allocVector(VECSXP, 2)); 
np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - - // arachne madness - SEXP read_arachne_long(SEXP filename) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<int> > poslen; // length of the match - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - typedef boost::tokenizer<boost::char_separator<char> > tokType; - boost::char_separator<char> sep(" \t"); - - - - - - FILE *f=fopen(fname,"rb"); - if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } - else { - -#ifdef HAVE_LIBBZ2 - BZFILE* b; - int bzerror; - - int bz2file=0; - if(strstr(fname,".bz2")) { - bz2file=1; - b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); - if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } - } -#endif - - - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; -#ifdef HAVE_LIBBZ2 - while(get_a_line(f,b,bz2file,line)) { -#else - while(get_a_line(f,line)) { -#endif - -#ifdef DEBUG - Rprintf("line: %s\n",line.c_str()); -#endif - - - tokType tok(line, sep); - tokType::iterator sit=tok.begin(); - if(sit!=tok.end()) { - string query=*sit++; - if(query!="QUERY") { continue; } - *sit++; *sit++; *sit++; *sit++; - string str_strand=*sit++; - string chr=*sit++; - string str_startpos=*sit++; - string str_endpos=*sit++; - - int fpos; - if(str_strand[0]=='1') { - fpos=-1*atoi(str_endpos.c_str()); - } else { - fpos=atoi(str_startpos.c_str()); - } -#ifdef DEBUG - Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos); 
-#endif - *sit++; - string str_nblocks=*sit++; - int nblocks=atoi(str_nblocks.c_str()); -#ifdef DEBUG - Rprintf("nblocks=%d\n",nblocks); -#endif - // tally up the read length and the number of mismatches for all blocks - int len=0; int nm=0; - for(int i=0;i<nblocks;i++) { - string str_sgs=*sit++; - int sgs=atoi(str_sgs.c_str()); - string str_slen=*sit++; - int slen=atoi(str_slen.c_str()); - string str_snm=*sit++; - int snm=atoi(str_snm.c_str()); -#ifdef DEBUG - Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm); -#endif - len+=slen; - nm+=abs(sgs)+snm; - } - nm+=nblocks-1; - - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - poslen.push_back(vector<int>()); -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - (poslen[cind]).push_back(len); -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - } -#ifdef HAVE_LIBBZ2 - BZ2_bzReadClose( &bzerror, b); -#endif - - fclose(f); - - Rprintf("done. 
read %d fragments\n",fcount); - } - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<int> >::const_iterator lsi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - lsi=poslen.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - SET_STRING_ELT(dnames_R, 2, mkChar("l")); - - - SEXP tv,nv,lv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - PROTECT(lv=allocVector(INTSXP,csi->size())); np++; - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - int* i_lv=INTEGER(lv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - vector<int>::const_iterator lni=lsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i_lv[i]=*lni++; - i++; - } - PROTECT(dv = allocVector(VECSXP, 3)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - SET_VECTOR_ELT(dv, 2, lv); - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - -}
--- a/spp/src/cdensum.c Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,144 +0,0 @@ -#include <math.h> -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" - - -#undef DEBUG 1 - -// dout is npos-length output array. -// n - number of positions in pos (and length of tc count array) -// spos - starting position -void cdensum(int *n, double *pos, double *tc, double *spos, int *bw,int *dw, int *npos, int *step,double *dout) -{ - int i,j; - - double epos= *spos + ((double) *npos); - double dbw=(double) *bw; - for(i = 0; i< *n; i++) { - // size of the window to which the contributions should be added - int in=(int) (pos[i]- *spos); - int ic=tc[i]; - int whs=(*dw)*(*bw)*ic; - int ws=(int) floor((in-whs)/(*step)); - int we=(int) ceil((in+whs)/(*step)); - if(ws<0) { ws=0; } - if(we>= *npos) { we= *npos -1; } - - for(j=ws;j<we;j++) { - double beta=((double)(j*(*step)-in))/dbw; - dout[j]+=((double)ic)*exp(-0.5*beta*beta); - } - } -} - - -// window tag counts -// dout is npos-length output array that will contain window tag counts -// windows are of a specified size, moved at a specified step -// n - number of positions in sorted tag array (positive only) -// spos - starting position -void window_n_tags(int *n, double *pos, double *spos, int *window_size, int *window_step, int *npos, int *dout) -{ - int i; - int cs=0; int ce=0; // current array start/end indecies - int ctc=0; // current tag count - double wpos=*spos-(*window_size)/2; // left-edge position - //Rprintf("n=%d; window_size=%d, window_step=%d, npos=%d, spos=%f\n",*n,*window_size,*window_step,*npos,*spos); - for(i=0;i<*npos;i++) { - // advance end if needed - double ep=wpos+(*window_size); - while(ce<(*n) && pos[ce]<=ep) { - ctc++; ce++; - } - // advance start - while(cs<*n && pos[cs]<wpos) { - ctc--; cs++; - } - dout[i]=ctc; - // advance window position - wpos+=*window_step; - } -} - -// window tag counts -// windows are of a specified size, moved at a specified step -// 
pos - tag positions (positive, pre-shifted)y -// spos - starting position -// returns nsteps-length output array that will contain window tag counts -SEXP cwindow_n_tags(SEXP pos_R, SEXP spos_R, SEXP window_size_R, SEXP window_step_R, SEXP nsteps_R) { - double* pos=REAL(pos_R); - int n=LENGTH(pos_R); - int window_size=*INTEGER(window_size_R); - int window_step=*INTEGER(window_step_R); - int nsteps=*INTEGER(nsteps_R); - double spos=*REAL(spos_R); - - // allocate return array - SEXP tc_R; - PROTECT(tc_R=allocVector(INTSXP,nsteps)); - int* dout=INTEGER(tc_R); - - int i; - int cs=0; int ce=0; // current array start/end indecies - int ctc=0; // current tag count - double wpos=spos-window_size/2; // left-edge position - //Rprintf("n=%d; window_size=%d, window_step=%d, npos=%d, spos=%f\n",n,window_size,window_step,nsteps,spos); - for(i=0;i<nsteps;i++) { - // advance end if needed - double ep=wpos+window_size; - while(ce<n && pos[ce]<=ep) { - ctc++; ce++; - } - // advance start - while(cs<n && pos[cs]<wpos) { - ctc--; cs++; - } - dout[i]=ctc; - // advance window position - wpos+=window_step; - } - UNPROTECT(1); - return(tc_R); -} - -// tag counts in windows around specified positions -// pos - tag positions -// ntags - number of tags in each position -// wpos - window positions -// returns a pos-length vector giving number of tags that fall within window_half_size from the provided positions -SEXP cwindow_n_tags_around(SEXP pos_R, SEXP ntags_R, SEXP wpos_R, SEXP window_half_size_R) { - double* pos=REAL(pos_R); - int* ntags=INTEGER(ntags_R); - int n=LENGTH(pos_R); - double* wpos=REAL(wpos_R); - int nw=LENGTH(wpos_R); // number of windows - double whs=(double) *INTEGER(window_half_size_R); - - // allocate return array - SEXP tc_R; - PROTECT(tc_R=allocVector(INTSXP,nw)); - int* dout=INTEGER(tc_R); - - int i; - int cs=0; int ce=0; // current array start/end indecies - int ctc=0; // current tag count - for(i=0;i<nw;i++) { - //if(i>(nw-2)) { Rprintf("-i=%d; cs=%d, ce=%d; 
ctc=%d\n",i,cs,ce,ctc); } - // advance end if needed - double ep=wpos[i]+whs; - while(ce<n && pos[ce]<=ep) { - ctc+=ntags[ce]; ce++; - } - // advance start - double sp=wpos[i]-whs; - while(cs<n && pos[cs]<sp) { - ctc-=ntags[cs]; cs++; - } - dout[i]=ctc; - // if(i>(nw-2)) { Rprintf("+i=%d; cs=%d, ce=%d; ctc=%d\n",i,cs,ce,ctc); } - } - UNPROTECT(1); - return(tc_R); -} -
--- a/spp/src/const.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,18 +0,0 @@ -#ifndef NST_CONST_H -#define NST_CONST_H - -#define MAX_ULL 0xffffffffffffffffull - -typedef unsigned long long bit64_t; -typedef unsigned bit32_t; -typedef unsigned short bit16_t; -typedef unsigned char bit8_t; - -extern bit8_t nst_nt4_table[]; -extern bit8_t nst_nt16_table[]; -extern char *nst_nt4_rev_table; -extern char *nst_nt16_rev_table; -extern bit8_t nst_nt16_nt4_table[]; -extern int nst_nt16_count_table[]; - -#endif
--- a/spp/src/maqmap.c Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,164 +0,0 @@ -#include <stdio.h> -#include <stdlib.h> -#include <assert.h> -#include <unistd.h> -#include "const.h" -#include "maqmap.h" - -maqmap_t *maq_new_maqmap() -{ - maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); - mm->format = MAQMAP_FORMAT_NEW; - return mm; -} -void maq_delete_maqmap(maqmap_t *mm) -{ - int i; - if (mm == 0) return; - for (i = 0; i < mm->n_ref; ++i) - free(mm->ref_name[i]); - free(mm->ref_name); - free(mm->mapped_reads); - free(mm); -} -void maqmap_write_header(gzFile fp, const maqmap_t *mm) -{ - int i, len; - gzwrite(fp, &mm->format, sizeof(int)); - gzwrite(fp, &mm->n_ref, sizeof(int)); - for (i = 0; i != mm->n_ref; ++i) { - len = strlen(mm->ref_name[i]) + 1; - gzwrite(fp, &len, sizeof(int)); - gzwrite(fp, mm->ref_name[i], len); - } - gzwrite(fp, &mm->n_mapped_reads, sizeof(bit64_t)); -} -maqmap_t *maqmap_read_header(gzFile fp) -{ - maqmap_t *mm; - int k, len; - mm = maq_new_maqmap(); - gzread(fp, &mm->format, sizeof(int)); - if (mm->format != MAQMAP_FORMAT_NEW) { - if (mm->format > 0) { - fprintf(stderr, "** Obsolete map format is detected. 
Please use 'mapass2maq' command to convert the format.\n"); - exit(3); - } - assert(mm->format == MAQMAP_FORMAT_NEW); - } - gzread(fp, &mm->n_ref, sizeof(int)); - mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); - for (k = 0; k != mm->n_ref; ++k) { - gzread(fp, &len, sizeof(int)); - mm->ref_name[k] = (char*)malloc(len * sizeof(char)); - gzread(fp, mm->ref_name[k], len); - } - /* read number of mapped reads */ - gzread(fp, &mm->n_mapped_reads, sizeof(bit64_t)); - return mm; -} - -/* mapvalidate */ - -static void mapvalidate_core(gzFile fpin) -{ - maqmap_t *m = maqmap_read_header(fpin); - maqmap1_t *m1, mm1; - bit64_t n = 0; - int i, l; - bit64_t *cnt; - m1 = &mm1; - cnt = (bit64_t*)calloc(m->n_ref, 8); - printf("[message] number of reference sequences: %d\n", m->n_ref); - while ((l = maqmap_read1(fpin, m1)) != 0) { - if (l != sizeof(maqmap1_t)) { - printf("[fatal error] truncated map file.\n"); - break; - } - ++n; - if ((int)m1->seqid >= m->n_ref) { - printf("[fatal error] maqmap1_t::seqid is invalid (%d >= %d).\n", m1->seqid, m->n_ref); - break; - } - ++cnt[m1->seqid]; - if (m1->size >= MAX_READLEN - 1) { - printf("[faltal error] maqmap1_t::size is invalid (%d >= %d).\n", m1->size, MAX_READLEN - 1); - break; - } - } - if (m->n_mapped_reads != 0) { - if (m->n_mapped_reads != n) { - printf("[warning] maqmap1_t::n_mapped_reads is set, but not equals the real number (%llu != %llu).\n", - m->n_mapped_reads, n); - } - } - for (i = 0; i != m->n_ref; ++i) - printf("[message] %s : %llu\n", m->ref_name[i], cnt[i]); - free(cnt); - maq_delete_maqmap(m); -} - -/* mapview */ - -static void mapview_core(FILE *fpout, gzFile fpin, int is_verbose, int is_mm) -{ - bit32_t j; - maqmap_t *m = maqmap_read_header(fpin); - maqmap1_t *m1, mm1; - m1 = &mm1; - while (maqmap_read1(fpin, m1)) { - fprintf(fpout, "%s\t%s\t%d\t%c\t%d\t%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", - m1->name, m->ref_name[m1->seqid], (m1->pos>>1) + 1, - (m1->pos&1)? 
'-' : '+', m1->dist, m1->flag, m1->map_qual, (signed char)m1->seq[MAX_READLEN-1], - m1->alt_qual, m1->info1&0xf, m1->info2, m1->c[0], m1->c[1], m1->size); - if (is_verbose) { - fputc('\t', fpout); - for (j = 0; j != m1->size; ++j) { - if (m1->seq[j] == 0) fputc('n', fpout); - else if ((m1->seq[j]&0x3f) < 27) fputc("acgt"[m1->seq[j]>>6&3], fpout); - else fputc("ACGT"[m1->seq[j]>>6&3], fpout); - } - fputc('\t', fpout); - for (j = 0; j != m1->size; ++j) - fputc((m1->seq[j]&0x3f) + 33, fpout); - } - if (is_mm) { - bit64_t *p = (bit64_t*)(m1->seq + 55); - fprintf(fpout, "\t%llx", *p); - } - fputc('\n', fpout); - } - maq_delete_maqmap(m); -} - -int ma_mapview(int argc, char *argv[]) -{ - int c, is_verbose = 1, is_mm = 0; - while ((c = getopt(argc, argv, "bN")) >= 0) { - switch (c) { - case 'b': is_verbose = 0; break; - case 'N': is_mm = 1; break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: maq mapview [-bN] <in.map>\n"); - return 1; - } - gzFile fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[optind], "r"); - mapview_core(stdout, fp, is_verbose, is_mm); - gzclose(fp); - return 0; -} - -int ma_mapvalidate(int argc, char *argv[]) -{ - gzFile fp; - if (argc < 2) { - fprintf(stderr, "Usage: maq mapvalidate <in.map>\n"); - return 1; - } - fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[1], "r"); - mapvalidate_core(fp); - gzclose(fp); - return 0; -}
--- a/spp/src/maqmap.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ -#ifndef MAQMAP_H_ -#define MAQMAP_H_ - -#ifdef MAQ_LONGREADS -# define MAX_READLEN 128 -#else -# define MAX_READLEN 64 -#endif - -#define MAX_NAMELEN 36 -#define MAQMAP_FORMAT_OLD 0 -#define MAQMAP_FORMAT_NEW -1 - -#define PAIRFLAG_FF 0x01 -#define PAIRFLAG_FR 0x02 -#define PAIRFLAG_RF 0x04 -#define PAIRFLAG_RR 0x08 -#define PAIRFLAG_PAIRED 0x10 -#define PAIRFLAG_DIFFCHR 0x20 -#define PAIRFLAG_NOMATCH 0x40 -#define PAIRFLAG_SW 0x80 - -#include <string.h> -#include <zlib.h> -#include "const.h" - -/* - name: read name - size: the length of the read - seq: read sequence (see also below) - seq[MAX_READLEN-1]: single end mapping quality (equals to map_qual if not paired) - map_qual: the final mapping quality - alt_qual: the lower quality of the two ends (equals to map_qual if not paired) - flag: status of the pair - dist: offset of the mate (zero if not paired) - info1: mismatches in the 24bp (higher 4 bits) and mismatches (lower 4 bits) - info2: sum of errors of the best hit - c[2]: count of all 0- and 1-mismatch hits on the reference - */ -typedef struct -{ - bit8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */ - bit8_t size, map_qual, info1, info2, c[2], flag, alt_qual; - bit32_t seqid, pos; - int dist; - char name[MAX_NAMELEN]; -} maqmap1_t; - -typedef struct -{ - int format, n_ref; - char **ref_name; - bit64_t n_mapped_reads; - maqmap1_t *mapped_reads; -} maqmap_t; - -#define maqmap_read1(fp, m1) gzread((fp), (m1), sizeof(maqmap1_t)) - -#ifdef __cplusplus -extern "C" { -#endif - maqmap_t *maq_new_maqmap(); - void maq_delete_maqmap(maqmap_t *mm); - void maqmap_write_header(gzFile fp, const maqmap_t *mm); - maqmap_t *maqmap_read_header(gzFile fp); -#ifdef __cplusplus -} -#endif - -#endif
--- a/spp/src/maqread.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,207 +0,0 @@ -#include "pc.h" -#include <vector> -#include <string.h> -#include <iostream> -#include <fstream> -#include <sstream> -#include <strstream> -#include <algorithm> -#include <string> -#include <functional> -#include <utility> -#include <zlib.h> - -extern "C" { -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" -#include "Rdefines.h" -#include "maqmap.h" -} - -using namespace std; -using namespace __gnu_cxx; - - -class lessAbsoluteValue { -public: - bool operator()(int a, int b) const { - return abs(a) < abs(b); - } -}; - - - -//#define DEBUG 1 - -extern "C" { - - // read in text version of maq map - SEXP read_binmaqmap(SEXP filename,SEXP read_tag_names_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - const char* fname=CHAR(asChar(filename)); - int read_names=*(INTEGER(read_tag_names_R)); -#ifdef DEBUG - Rprintf("fname=%s\n",fname); -#endif - - // main data vector - // chr - pos - vector< vector<int> > pos; - vector< vector<int> > posnm; // number of mismatches - vector< vector<string> > tagnames; - - // chromosome map - hash_map<string, int, hash<string>,equal_to<string> > cind_map; - vector<string> cnames; - - - gzFile f=gzopen(fname,"r"); - - maqmap_t *m = maqmap_read_header(f); - maqmap1_t *m1, mm1; - m1 = &mm1; - - if (!f) { - cout<<"can't open input file \""<<fname<<"\"\n"; - } else { - Rprintf("opened %s\n",fname); - - // read in bed line - string line; - int fcount=0; - while(maqmap_read1(f, m1)) { - string tagname=string(m1->name); - string chr=string(m->ref_name[m1->seqid]); - int len=m1->size; - int fpos=(m1->pos>>1) + 1; - if(m1->pos&1) { - fpos=-1*(fpos+len-1); - } - int nm=m1->info1&0xf; - -#ifdef DEBUG - Rprintf("read in map line chr=%s tagname=%s fpos=%d, nm=%d, len=%d\n",chr.c_str(),tagname.c_str(),fpos,nm,len); -#endif - - - // determine the chromosome index - hash_map<string, int, hash<string>,equal_to<string> 
>::const_iterator li=cind_map.find(chr); - int cind=-1; - if(li==cind_map.end()) { - // register new chromosome - cind=cnames.size(); - cnames.push_back(chr); - cind_map[chr]=cind; - // allocate new pos vector - pos.push_back(vector<int>()); - posnm.push_back(vector<int>()); - if(read_names) { - tagnames.push_back(vector<string>()); - } -#ifdef DEBUG - Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); -#endif - } else { - cind=li->second; - } - fcount++; - (pos[cind]).push_back(fpos); - (posnm[cind]).push_back(nm); - if(read_names) { - (tagnames[cind]).push_back(tagname); - } -#ifdef DEBUG - Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); - if(fcount>30) { - break; - } -#endif - - } - gzclose(f); - Rprintf("done. read %d fragments\n",fcount); - } - - - // construct output structures - SEXP chnames; - int np=0; // number of protections - PROTECT(chnames = allocVector(STRSXP, cnames.size())); - for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { - SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); - } - np++; - - // sort - //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { - // sort(csi->begin(), csi->end(), lessAbsoluteValue()); - //} - - SEXP ans; - PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; - vector<vector<int> >::const_iterator nsi; - vector<vector<string> >::const_iterator ssi; - for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { - nsi=posnm.begin()+(csi-pos.begin()); - - SEXP dv,dnames_R; - PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; - SET_STRING_ELT(dnames_R, 0, mkChar("t")); - SET_STRING_ELT(dnames_R, 1, mkChar("n")); - if(read_names) { - SET_STRING_ELT(dnames_R, 2, mkChar("s")); - } - - - - SEXP tv,nv,sv; - PROTECT(tv=allocVector(INTSXP,csi->size())); np++; - PROTECT(nv=allocVector(INTSXP,csi->size())); np++; - if(read_names) { - 
PROTECT(sv=allocVector(STRSXP,csi->size())); np++; - } - int* i_tv=INTEGER(tv); - int* i_nv=INTEGER(nv); - - int i=0; - vector<int>::const_iterator ini=nsi->begin(); - for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { - i_tv[i]=*pi; - i_nv[i]=*ini++; - i++; - } - if(read_names) { - int i=0; - ssi=tagnames.begin()+(csi-pos.begin()); - for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { - SET_STRING_ELT(sv,i,mkChar(si->c_str())); - i++; - } - } - PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; - SET_VECTOR_ELT(dv, 0, tv); - SET_VECTOR_ELT(dv, 1, nv); - if(read_names) { - SET_VECTOR_ELT(dv, 2, sv); - } - setAttrib(dv, R_NamesSymbol, dnames_R); - - SET_VECTOR_ELT(ans, csi-pos.begin(), dv); - } - - setAttrib(ans,R_NamesSymbol,chnames); - -#ifdef DEBUG - Rprintf("unprotecting %d elements\n",np); -#endif - - UNPROTECT(np); - return(ans); -} - - -}
// --- a/spp/src/pc.h Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,20 +0,0 @@
#ifndef PC_H
#define PC_H 1
#include <functional>
#include <string>   // std::string is used below; do not rely on transitive includes
//#include <hash_map.h>
#include <ext/hash_set>
#include <ext/hash_map>


namespace __gnu_cxx
{
  // Lets std::string be used as a key in __gnu_cxx::hash_map / hash_set by
  // hashing the string's characters with the existing const char* hash.
  template<> struct hash< std::string >
  {
    size_t operator()( const std::string& x ) const
    {
      return hash< const char* >()( x.c_str() );
    }
  };
}

#endif
--- a/spp/src/peaks.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,804 +0,0 @@ -#include <vector> -#include <string.h> -#include <iostream> -#include <string> -#include <set> - -extern "C" { -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" -#include "Rdefines.h" -} - -using namespace std; -using namespace __gnu_cxx; - -/** - * Calculate all local peaks - */ - -//#define DEBUG 1 - -extern "C" { - SEXP find_peaks(SEXP x_R,SEXP thr_R,SEXP max_span_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - double* x=REAL(x_R); - int nx=LENGTH(x_R); - int max_span=*INTEGER(max_span_R); - double thr=REAL(thr_R)[0]; -#ifdef DEBUG - Rprintf("n=%d; thr=%f; max_span=%d\n",nx,thr,max_span); -#endif - - vector<int> pos; - - double pv=x[0]; - double ppv=0; // previous peak value - int ppp=-max_span-1; // previous peak position - - for(int i=1;i<(nx-1);i++) { - if(x[i]>pv && x[i]>=thr && x[i]>x[i+1]) { - if(max_span>2) { - //Rprintf("i=%d; ppp=%d\n",i,ppp); - if(i-ppp > max_span) { - if(ppp>=0) { - pos.push_back(ppp); - } - //Rprintf("recorded %d; now %d\n",ppp,i); - ppp=i; ppv=x[i]; - } else { - if(x[i]>ppv) { - //Rprintf("reset from %d to %d\n",ppp,i); - ppp=i; ppv=x[i]; - } - } - } else { - pos.push_back(i); - } - } - if(x[i]!=x[i+1]) { pv=x[i]; } - } - - // add remaining peak - if(max_span>2 && ppp>=0) { - pos.push_back(ppp); - } - - SEXP nv; - PROTECT(nv=allocVector(INTSXP,pos.size())); - int* i_nv=INTEGER(nv); - int i=0; - for(vector<int> ::const_iterator pi=pos.begin();pi!=pos.end();++pi) { - i_nv[i++]=1+(*pi); - } - - UNPROTECT(1); - return(nv); - } - - - - - /************************************************************************/ - // given a data vector d (positive values) and a set of signed center coordinates pos, - // returns coordinates of data points relative to the centers - // size is the size of the region around the centers - // return: vector of relative coordinates (x) and indecies of centers relative the 
coordinate - // was calculated (i). - SEXP get_relative_coordinates(SEXP d_R, - SEXP pos_R, - SEXP size_R) - { - int *d, *pos; - int npos,nd,size; - - d = INTEGER(d_R); pos = INTEGER(pos_R); - npos=LENGTH(pos_R); nd=LENGTH(d_R); - size = INTEGER(size_R)[0]; -#ifdef DEBUG - Rprintf("|d|=%d, |c|=%d, size=%d\n",nd,npos,size); -#endif - - vector<int> x; vector<int> xi; - int k=0; // current pos index - - for(int i=0;i<nd;i++) { - // increment k until pos[k]+size>=d[i] - while((abs(pos[k])+size) < d[i]) { k++; if(k==npos) { break; }; -#ifdef DEBUG - Rprintf("advancing k to %d\n",k); -#endif - } - if(k==npos) { break; }; - // increment i until d[i]>=pos[k]-size - while((abs(pos[k])-size) > d[i]) { i++; if(i==nd) { break; } -#ifdef DEBUG - Rprintf("advancing i to %d\n",i); -#endif - } - if(i==nd) { break; } - - - int l=k; - while((l<npos) && ((abs(pos[l])-size) <= d[i])) { l++; -#ifdef DEBUG - Rprintf("advancing l to %d\n",l); -#endif - } - for(int j=k;j<l;j++) { - int pd=d[i]-abs(pos[j]); - if(abs(pd)<=size) { - // record - if(pos[j]>0) { - x.push_back(pd); - } else { - x.push_back(-1*pd); - } - xi.push_back(j); -#ifdef DEBUG - Rprintf("recorded i=%d, j=%d\n",i,j); -#endif - } else { - break; - } - } - } - - SEXP xv_R,xiv_R; - PROTECT(xv_R=allocVector(INTSXP,x.size())); - PROTECT(xiv_R=allocVector(INTSXP,x.size())); - int* xv=INTEGER(xv_R); - int* xiv=INTEGER(xiv_R); - - int i=0; - for(vector<int> ::const_iterator pi=x.begin();pi!=x.end();++pi) { - xv[i++]=*pi; - } - i=0; - for(vector<int> ::const_iterator pi=xi.begin();pi!=xi.end();++pi) { - xiv[i++]=1+(*pi); - } - - SEXP ans_R, names_R; - PROTECT(names_R = allocVector(STRSXP, 2)); - SET_STRING_ELT(names_R, 0, mkChar("x")); - SET_STRING_ELT(names_R, 1, mkChar("i")); - - PROTECT(ans_R = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ans_R, 0, xv_R); - SET_VECTOR_ELT(ans_R, 1, xiv_R); - setAttrib(ans_R, R_NamesSymbol, names_R); - - UNPROTECT(4); - return(ans_R); - } - - - // determines a set of points within a set of fragments 
- // note: all vectors sorted in ascending order - // note: all vectors are integers - // x_R - vector of point positions - // se_R - vector of start and end positions - // fi_R - vector of signed fragment indecies - // return_list_R - whether a list of fragments should be returned for each point - // return_unique_R - whether points in multiple fragments should be omitted - SEXP points_within(SEXP x_R,SEXP se_R,SEXP fi_R,SEXP return_list_R,SEXP return_unique_R,SEXP return_point_counts_R) { -#ifdef DEBUG - Rprintf("start\n"); -#endif - int* x=INTEGER(x_R); - int nx=LENGTH(x_R); - int* se=INTEGER(se_R); - int* fi=INTEGER(fi_R); - int nf=LENGTH(se_R); - - int return_list=*(INTEGER(return_list_R)); - int return_unique=*(INTEGER(return_unique_R)); - int return_point_counts=*(INTEGER(return_point_counts_R)); - -#ifdef DEBUG - Rprintf("nf=%d; nx=%d, return_list=%d, return_unique=%d, return_point_counts=%d\n",nf/2,nx,return_list,return_unique,return_point_counts); -#endif - set<int> fset; - - - SEXP nv; int *i_nv; - int np=0; - if(return_point_counts) { - PROTECT(nv = allocVector(INTSXP, nf/2)); np++; - i_nv=INTEGER(nv); - for(int i=0;i<nf/2;i++) { i_nv[i]=0; } - } else if(return_list) { - PROTECT(nv = allocVector(VECSXP, nx)); np++; - } else { - PROTECT(nv=allocVector(INTSXP,nx)); np++; - i_nv=INTEGER(nv); - } - - int j=0; - - for(int i=0;i<nx;i++) { - // advance j - while(j<nf && se[j]<x[i]) { - int frag=fi[j]; - if(frag>0) { // insert - fset.insert(frag); -#ifdef DEBUG - Rprintf("inserted frag %d, size=%d\n",frag,fset.size()); -#endif - } else { // remove - fset.erase(-frag); -#ifdef DEBUG - Rprintf("removed frag %d, size=%d\n",-frag,fset.size()); -#endif - } - j++; - } -#ifdef DEBUG - Rprintf("i=%d j=%d\n",i,j); -#endif - if(return_list) { - if(fset.empty() || (return_unique && fset.size()>1)) { - // assign null list? 
- } else { - SEXP fil_R; - PROTECT(fil_R=allocVector(INTSXP,fset.size())); np++; - int* fil=INTEGER(fil_R); - int k=0; - for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) { - fil[k]=*ki; k++; - } - SET_VECTOR_ELT(nv, i, fil_R); - UNPROTECT(1); np--; - } - } else { - if(return_point_counts) { - for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) { - i_nv[*ki-1]++; - } - } else { - if(fset.empty() || (return_unique && fset.size()>1)) { - i_nv[i]=-1; - } else { - i_nv[i]=*fset.begin(); - } - } - } - } - - UNPROTECT(np); - return(nv); - } - - - SEXP expuni_lr(SEXP x_R, // positions and their number (assumed sorted in ascending order) - SEXP mdist_R, // max distance at which points should be considered - SEXP lambda_R, // lambda value - SEXP spos_R, // starting position - SEXP epos_R, // ending position - SEXP step_R, // step size - SEXP return_peaks_R, // whether peak positions should be returned, or entire score vector - SEXP min_peak_lr_R // min peak height (lr) - ) - { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - int* x=INTEGER(x_R); - int nx=LENGTH(x_R); - int mdist=INTEGER(mdist_R)[0]; - double lambda=*(REAL(lambda_R)); - - int return_peaks=*(INTEGER(return_peaks_R)); - double min_peak=*(REAL(min_peak_lr_R)); - - int spos=*(INTEGER(spos_R)); - int epos=*(INTEGER(epos_R)); - int step=*(INTEGER(step_R)); - - int nsteps=(int) (epos-spos)/step; - - -#ifdef DEBUG - Rprintf("n=%d; lambda=%f; mdist=%d; spos=%d; epos=%d; step=%d; nsteps=%d\n",nx,lambda,mdist,spos,epos,step,nsteps); -#endif - - - SEXP nv; - double *d_nv; - if(!return_peaks) { - PROTECT(nv=allocVector(REALSXP,nsteps+1)); - d_nv=REAL(nv); - } - - - int i=0; // current index of the first point being used in the calculations - int j=0; // current index of the last point being used in the calculations - int sx=0; // current sum of all positions - int n=0; - - for(int k=0; k<=nsteps; k++) { - int cpos=spos+k*step; - // increase i until x[i]>=cpos-mdist; remove x from sx; 
decrement n; - while(i<nx && x[i]<(cpos-mdist)) { - n--; sx-=x[i]; i++; - //Rprintf("incremented i: i=%d; n=%d; sx=%d; cpos-mdist=%d; x[i-1]=%d\n",i,n,sx,cpos-mdist,x[i-1]); - } - //Rprintf("stable i: i=%d; n=%d; sx=%d; cpos-mdist=%d; x[i-1]=%d\n",i,n,sx,cpos-mdist,x[i-1]); - - //if(i>j) { j=i; } - - // increase j until x[j]>cpos - while(j<nx && x[j]<=cpos) { - n++; sx+=x[j]; j++; - //Rprintf("incremented j: j=%d; n=%d; sx=%d; cpos=%d; x[j-1]=%d\n",j,n,sx,cpos,x[j-1]); - } - //Rprintf("stable j: j=%d; n=%d; sx=%d; cpos=%d; x[j-1]=%d\n",j,n,sx,cpos,x[j]); - - // calculate lr - d_nv[k]=((double)(1-n))*log(lambda)-lambda*((double)(n*(cpos+1)-sx)); - //Rprintf("recorded lr[%d]=%f\n",k-1,d_nv[k-1]); - } - UNPROTECT(1); - return(nv); - } - - - SEXP allpdist(SEXP x_R,SEXP max_dist_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - double* x=REAL(x_R); - int nx=LENGTH(x_R); - double max_dist=*REAL(max_dist_R); -#ifdef DEBUG - Rprintf("n=%d; max_dist=%d\n",nx,max_dist); -#endif - - vector<double> dist; - - for(int i=0;i<nx;i++) { - for(int j=i+1;j<nx;j++) { - - double d=x[j]-x[i]; -#ifdef DEBUG - Rprintf("i=%d; j=%d; d=%f\n",i,j,d); -#endif - if(d<=max_dist) { - dist.push_back(d); - } else { - break; - } - } - } - - SEXP nv; - PROTECT(nv=allocVector(REALSXP,dist.size())); - double* i_nv=REAL(nv); - int i=0; - for(vector<double> ::const_iterator pi=dist.begin();pi!=dist.end();++pi) { - i_nv[i++]=*pi; - } - - UNPROTECT(1); - return(nv); - } - - // same as above, but for two different sets - SEXP allxpdist(SEXP x_R,SEXP y_R, SEXP max_dist_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - double* x=REAL(x_R); - double* y=REAL(y_R); - int nx=LENGTH(x_R); - int ny=LENGTH(y_R); - double max_dist=*REAL(max_dist_R); -#ifdef DEBUG - Rprintf("nx=%d; ny=%d; max_dist=%d\n",nx,ny,max_dist); -#endif - - vector<double> dist; - int yi=0; // latest y start index - - for(int i=0;i<nx;i++) { - // adjust yi so that yi>=x[i]-max_dist_R - while(y[yi]<(x[i]-max_dist) && yi<ny) { yi++; } - 
if(yi==ny) { break; } - - for(int j=yi;j<ny;j++) { - double d=y[j]-x[i]; -#ifdef DEBUG - Rprintf("i=%d; j=%d; d=%f\n",i,j,d); -#endif - if(d<=max_dist) { - dist.push_back(d); - } else { - break; - } - } - } - - SEXP nv; - PROTECT(nv=allocVector(REALSXP,dist.size())); - double* i_nv=REAL(nv); - int i=0; - for(vector<double> ::const_iterator pi=dist.begin();pi!=dist.end();++pi) { - i_nv[i++]=*pi; - } - - UNPROTECT(1); - return(nv); - } - - // returns a vector giving for each point, - // number of points within a given max_dist - SEXP nwithindist(SEXP x_R,SEXP max_dist_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - double* x=REAL(x_R); - int nx=LENGTH(x_R); - double max_dist=*REAL(max_dist_R); - - SEXP nv; - PROTECT(nv=allocVector(REALSXP,nx)); - double* i_nv=REAL(nv); - for(int i=0;i<nx;i++) { i_nv[i]=0; } - -#ifdef DEBUG - Rprintf("n=%d; max_dist=%d\n",nx,max_dist); -#endif - - for(int i=0;i<nx;i++) { - for(int j=i+1;j<nx;j++) { - - double d=x[j]-x[i]; -#ifdef DEBUG - Rprintf("i=%d; j=%d; d=%f\n",i,j,d); -#endif - if(d<=max_dist) { - i_nv[i]++; - i_nv[j]++; - } else { - break; - } - } - } - - UNPROTECT(1); - return(nv); - } - - - - - // given a list of sorted chromosome signal and background vectors (unscaled), determine - // cluster contigs exceeding thr poisson P value, based on a whs window size, - // and satisfying mcs cluster size - SEXP find_poisson_enrichment_clusters(SEXP pos_R,SEXP flag_R,SEXP wsize_R,SEXP thr_R,SEXP mcs_R,SEXP bgm_R,SEXP mintag_R,SEXP either_R) { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - double* pos=REAL(pos_R); - int* flag=INTEGER(flag_R); - int nt=LENGTH(pos_R); - - int mcs=*INTEGER(mcs_R); - int wsize=*INTEGER(wsize_R); - int either=*INTEGER(either_R); - double thr=REAL(thr_R)[0]; - double bgm=REAL(bgm_R)[0]; - double mintag=REAL(mintag_R)[0]; - -#ifdef DEBUG - Rprintf("nt=%d; wsize=%d; thr=%f; mcs=%d; min.tag=%f; bgm=%f\n",nt,wsize,thr,mcs,mintag,bgm); -#endif - - - vector< pair<double,double> > contigs; - - // running 
indecies (start and end) - int si=0; - int ei=0; - - // current window coordinate - double ws=pos[0]; - - // current window tag counts - int cc[2]={0,0}; - - - if(nt>0) { - cc[flag[si]]++; - // increment window end - while(ei<(nt-1) && (pos[ei+1]-ws) <= wsize) { - ei++; - cc[flag[ei]]++; - } - - - // cluster start,end positions - double cs,ce; - int inclust=0; - - while(si<nt-1) { - - if((pos[si+1]-ws) > (pos[ei+1] - ws - wsize) && ei!=(nt-1)) { - // move end boudnary - ei++; - ws=pos[ei]-wsize; - cc[flag[ei]]++; - while(ei<(nt-1) && pos[ei+1]==ws+wsize) { - ei++; - cc[flag[ei]]++; - } - - // increment window start - while(si<(nt-1) && pos[si] < ws) { - cc[flag[si]]--; - si++; - } - - } else { - // move up start boundary - ws=pos[si+1]; - cc[flag[si]]--; - si++; - while(si<(nt-1) && pos[si+1]==ws) { - cc[flag[si]]--; - si++; - } - - // increment window end - while(ei<(nt-1) && (pos[ei+1] - ws) <= wsize) { - ei++; - cc[flag[ei]]++; - } - - } - - // calculate z score - double dc0=((double)cc[0])+0.5; - double dc1=((double)cc[1])+0.5; - double rte=dc0+dc1-0.25*thr*thr; - double lb; - if(rte<=0) { - lb=0; - } else { - lb=(sqrt(dc1*dc0) - 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr); - if(lb<0) { lb=0; } - lb*=lb; - } - - //Rprintf("%f=f(%f,%f,%f); %f=f(%f,%f,%f)\n",lb,1.0-thr,2.0*dc1,2.0*dc0,ub,thr,2.0*dc1,2.0*dc0); - -#ifdef DEBUG - //double ub=gsl_cdf_fdist_Qinv(thr,2.0*dc1,2.0*dc0)*dc1/dc0; - double ub=(sqrt(dc1*dc0) + 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr); - ub*=ub; - Rprintf("s=%d (%f); e=%d (%f); window: %f-%f; cc=[%d,%d]; lb=%f; ub=%f\n",si,pos[si],ei,pos[ei],ws,ws+wsize,cc[0],cc[1],lb,ub); -#endif - - int bc=lb>=bgm && cc[1]>=mintag; - if(either) { - bc=lb>=bgm || cc[1]>=mintag; - } - if(bc) { - if(inclust) { - double nce=ws+wsize/2.0; - if(nce-ce > wsize/2.0) { - // next point is too far removed, end cluster - if(ce-cs >= mcs) { - contigs.push_back(pair<double,double>(cs,ce)); -#ifdef DEBUG - Rprintf("recorded cluster %f-%f\n",cs,ce); -#endif - } - inclust=0; - } 
else { - ce=nce; - } - } else { - inclust=1; - cs=ws+wsize/2.0; - ce=cs; - } - } else { - if(inclust) { - if(ce-cs >= mcs) { - contigs.push_back(pair<double,double>(cs,ce)); -#ifdef DEBUG - Rprintf("recorded cluster %f-%f\n",cs,ce); -#endif - } - inclust=0; - } - } - - } - - if(inclust) { - if(ce-cs >= mcs) { - contigs.push_back(pair<double,double>(cs,ce)); -#ifdef DEBUG - Rprintf("recorded cluster %f-%f\n",cs,ce); -#endif - } - inclust=0; - } - } - - SEXP cs_R,ce_R; - PROTECT(cs_R=allocVector(REALSXP,contigs.size())); - PROTECT(ce_R=allocVector(REALSXP,contigs.size())); - double* csa=REAL(cs_R); - double* cea=REAL(ce_R); - - int i=0; - for(vector< pair<double,double> >::const_iterator ci=contigs.begin(); ci!=contigs.end();++ci) { - csa[i]=ci->first; - cea[i]=ci->second; - i++; - } - - SEXP ans_R, names_R; - PROTECT(names_R = allocVector(STRSXP, 2)); - SET_STRING_ELT(names_R, 0, mkChar("s")); - SET_STRING_ELT(names_R, 1, mkChar("e")); - - PROTECT(ans_R = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ans_R, 0, cs_R); - SET_VECTOR_ELT(ans_R, 1, ce_R); - setAttrib(ans_R, R_NamesSymbol, names_R); - - UNPROTECT(4); - return(ans_R); - - } - - - // finds intersection between a list of regions - // the flag has +n/-n value, corresponding to the start/end of a segment in n-th regionset - // max_val: 1 - report max overlapping value, -1: report min, 0 - don't look at values - // returns: $s, $e, ($v) lists - SEXP region_intersection(SEXP n_R,SEXP pos_R,SEXP flags_R,SEXP vals_R,SEXP max_val_R,SEXP union_R) { - const int max_val=*INTEGER(max_val_R); - const int unionr=*INTEGER(union_R); - const int n=*INTEGER(n_R); - double* pos=REAL(pos_R); - int* flags=INTEGER(flags_R); - double* val=REAL(vals_R); - -#ifdef DEBUG - Rprintf("n=%d; npos=%d; max_val=%d\n",n,LENGTH(pos_R),max_val); -#endif - - int s[n]; // flag status for each set - double mv[n]; // max/min value of current clusters - - for(int i=0;i<n;i++) { s[i]=0; } - - vector<double> starts; - vector<double> ends; - 
vector<double> values; - - int start=-1; - double mval=0; - for(int i=0;i<LENGTH(pos_R);i++) { - // update flags - int f=flags[i]; - if(f>0) { - s[abs(f)-1]++; - } else { - s[abs(f)-1]--; - } - - if(max_val!=0 && val[i]*max_val > mval*max_val) { mval=val[i]; } - - // joined status - int all; - if(unionr) { - all=0; - for(int j=0;j<n;j++) { if(s[j]>0) { all=1; break;} } - } else { - all=1; - for(int j=0;j<n;j++) { all=all & (s[j]>0); } - } - - - //Rprintf("i=%d; s=[",i); - //for(int j=0;j<n;j++) { Rprintf("%d",s[j]); } - //Rprintf("]; all=%d; start=%d\n",all,start); - - if(start>=0) { - // in fragment - if(!all) { - // end fragment - starts.push_back(pos[start]); - ends.push_back(pos[i]); - start=-1; - if(max_val!=0) { values.push_back(mval); } - -#ifdef DEBUG - Rprintf("recorded new fragment (s=%f,e=%f,v=%f);\n",pos[start],pos[i],mval); -#endif - } - } else { - // should a fragment be started? - if(all) { - start=i; - if(max_val!=0) { mval=val[i]; } -#ifdef DEBUG - Rprintf("starting new fragment (s=%f,i=%d);\n",pos[start],i); -#endif - } - } - } - SEXP cs_R,ce_R,cv_R; - PROTECT(cs_R=allocVector(REALSXP,starts.size())); - PROTECT(ce_R=allocVector(REALSXP,ends.size())); - - double* csa=REAL(cs_R); - int i=0; - for(vector<double>::const_iterator ci=starts.begin(); ci!=starts.end(); ++ci) { - csa[i]=*ci; i++; - } - - csa=REAL(ce_R); - i=0; - for(vector<double>::const_iterator ci=ends.begin(); ci!=ends.end(); ++ci) { - csa[i]=*ci; i++; - } - - if(max_val!=0) { - PROTECT(cv_R=allocVector(REALSXP,values.size())); - csa=REAL(cv_R); - i=0; - for(vector<double>::const_iterator ci=values.begin(); ci!=values.end(); ++ci) { - csa[i]=*ci; i++; - } - } - - SEXP ans_R, names_R; - if(max_val!=0) { - PROTECT(names_R = allocVector(STRSXP, 3)); - SET_STRING_ELT(names_R, 0, mkChar("s")); - SET_STRING_ELT(names_R, 1, mkChar("e")); - SET_STRING_ELT(names_R, 2, mkChar("v")); - - PROTECT(ans_R = allocVector(VECSXP, 3)); - SET_VECTOR_ELT(ans_R, 0, cs_R); - SET_VECTOR_ELT(ans_R, 1, ce_R); - 
SET_VECTOR_ELT(ans_R, 2, cv_R); - } else { - PROTECT(names_R = allocVector(STRSXP, 2)); - SET_STRING_ELT(names_R, 0, mkChar("s")); - SET_STRING_ELT(names_R, 1, mkChar("e")); - - PROTECT(ans_R = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ans_R, 0, cs_R); - SET_VECTOR_ELT(ans_R, 1, ce_R); - } - - setAttrib(ans_R, R_NamesSymbol, names_R); - - if(max_val!=0) { - UNPROTECT(5); - } else { - UNPROTECT(4); - } - return(ans_R); - } - -} -
--- a/spp/src/wdl.cpp Tue Nov 27 16:14:55 2012 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,657 +0,0 @@ -#include <vector> -#include <string.h> -#include <iostream> -#include <string> -#include <set> - -extern "C" { -#include "R.h" -#include "Rmath.h" -#include "Rinternals.h" -#include "Rdefines.h" -} - -using namespace std; -using namespace __gnu_cxx; - -//#define DEBUG 1 - -extern "C" { - - /************************************************************************/ - /* - * lwcc - calculate local window cross-correlation - */ - - SEXP lwcc(SEXP x_R, // positive strand hist - SEXP y_R, // negative strand hist of the same length - SEXP osize_R, // outer boundary distance - SEXP isize_R, // inner boundary distance - SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned - SEXP min_peak_dist_R, // distance between closest peaks - SEXP min_peak_val_R, // min peak threshold - SEXP tag_weight_R, // tag weight - SEXP bg_subtract_R, // a flag whether do background subtractio - SEXP bgp_R, // optional background hist for positive strand - SEXP bgn_R, // optional background hist for negative strand - SEXP bg_wsize_R, // window size for the background counts - SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout) - SEXP round_up_R // whether to round up fractional signal tag counts - ) - { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - - int is=INTEGER(isize_R)[0]; - int os=INTEGER(osize_R)[0]; - double rs=((double)(2*os+1)); - int* x=INTEGER(x_R); - int* y=INTEGER(y_R); - int n_x=LENGTH(x_R); - - // background-related - int* bgp=INTEGER(bgp_R); - int* bgn=INTEGER(bgn_R); - int bg_whs=INTEGER(bg_wsize_R)[0]; - - int return_peaks=*(INTEGER(return_peaks_R)); - double min_peak_val=*(REAL(min_peak_val_R)); - int min_peak_dist=*(INTEGER(min_peak_dist_R)); - double tag_weight=*(REAL(tag_weight_R)); - - const int round_up=*(INTEGER(round_up_R)); - const int 
bg_subtract=*(INTEGER(bg_subtract_R)); - const double bg_weight=*(REAL(bg_weight_R)); - - int i; // point at which the value is being calculated - int start=os; - int end=n_x-os-1; - - // bg tag counts within bg window - int bg_pn1=0; - int bg_nn1=0; - int bg_pn2=0; - int bg_nn2=0; - - - - // illustration for counting: - // - // 012345678901234567890123456789012 - // ==========------|------========== - // - // osize=16; isize=6; - - - SEXP nv; - double *d_nv; - vector<int> ppos; - vector<double> pval; - if(!return_peaks) { - PROTECT(nv=allocVector(REALSXP,n_x)); - d_nv=REAL(nv); - for(int i=0;i<n_x;i++) { - d_nv[i]=0; - } - } - -#ifdef DEBUG - Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight); - Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); -#endif - - int lpp=-1; // last peak position - double lpv=-1e3; // last peak value - - double ppv=-1e3; // last value - double pppv=-11e-3; // value before last - - int pn1,pn2,nn1,nn2; - - - if(bg_subtract) { - // pre-initialize background tag counts, - for(int i=0;i<bg_whs;i++) { - if(i<n_x) { - bg_pn2+=bgp[i]; - bg_nn2+=bgn[i]; - } - } - } - - - for(i=0;i<end;i++) { -#ifdef DEBUG - //Rprintf("i=%d ", i); -#endif - - if(bg_subtract) { - // update background counts - int nl=i-bg_whs-1; - - if(nl>=0) { - bg_pn1-=bgp[nl]; - bg_nn1-=bgn[nl]; - } - bg_pn1+=bgp[i]; - bg_nn1+=bgn[i]; - - if(i>0) { - bg_pn2-=bgp[i-1]; - bg_nn2-=bgn[i-1]; - } - int nr=i+bg_whs; - if(nr<n_x) { - bg_pn2+=bgp[nr]; - bg_nn2+=bgn[nr]; - } - } - - if(i >= start) { - // update counts, taking into account masked out regions - pn1=pn2=nn1=nn2=0; - - for(int k=0;k<=(os-is);k++) { - int xp1=x[i-os+k]; - int xp2=x[i+os-k]; - int xn1=y[i+os-k]; - int xn2=y[i-os+k]; - - if(xp1!=-1 && xn1!=-1) { - pn1+=xp1; - nn1+=xn1; - } - if(xp2!=-1 && xn2!=-1) { - pn2+=xp2; - nn2+=xn2; - } - } - - // calculate the means - double mp=((double)(pn1+pn2))/rs; - double mn=((double)(pn1+pn2))/rs; -#ifdef DEBUG - Rprintf("mp=%f mn=%f\n",mp,mn); 
-#endif - // calculate correlation - double varp=0; - double varn=0; - double num=0; - double val=-1e3; - if(mp>0 & mn>0) { - for(int k=0;k<=(os-is);k++) { - int xp1=x[i-os+k]; - int xp2=x[i+os-k]; - int xn1=y[i+os-k]; - int xn2=y[i-os+k]; - - - if(xp1!=-1 && xn1!=-1) { - double nnp1=((double) xp1)-mp; - double nnn1=((double) xn1)-mn; - num+=nnp1*nnn1; - varp+=nnp1*nnp1; - varn+=nnn1*nnn1; - } - - if(xp2!=-1 && xn2!=-1) { - double nnp2=((double) xp2)-mp; - double nnn2=((double) xn2)-mn; - num+=nnp2*nnn2; - varp+=nnp2*nnp2; - varn+=nnn2*nnn2; - } - - } - double tagw; - double spn1=((double)pn1)*tag_weight; - double snn1=((double)nn1)*tag_weight; - double spn2=((double)pn2)*tag_weight; - double snn2=((double)nn2)*tag_weight; - if(round_up) { - if(pn1>0 && spn1<1) { spn1=1.0; } - //if(pn2>0 && spn2<1) { spn2=1.0; } - if(nn1>0 && snn1<1) { snn1=1.0; } - //if(nn2>0 && snn2<1) { snn2=1.0; } - } - - if(bg_subtract) { - spn1-=((double)bg_pn1)*bg_weight; - snn1-=((double)bg_nn2)*bg_weight; - spn2-=((double)bg_pn2)*bg_weight; - snn2-=((double)bg_nn1)*bg_weight; - - if(spn2<0) spn2=0; - if(snn2<0) snn2=0; - - if(spn1>0 && snn1>0) { - tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0)); - } else { - tagw=-(spn2+snn2+1.0); - } - //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl; - } else { - tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2); - } - - if(tagw<0) { - val=0.0; - } else { - if(num==0.0) { - val=0; - } else { - val=num/(sqrt(varp*varn)); - } - val=val*sqrt(tagw) + tagw; - - } - //cout<<"val="<<val<<endl; - -#ifdef DEBUG - Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw); - Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val); -#endif - } - - - - if(return_peaks) { - // determine if previous position was a peak - if(ppv>min_peak_val && ppv>val && ppv>pppv) { - if(lpp>0 && (i-lpp+1)>min_peak_dist) { - // record previous peak position - 
ppos.push_back(lpp); - pval.push_back(lpv); -#ifdef DEBUG - Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); -#endif - lpp=i-1; lpv=ppv; -#ifdef DEBUG - Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); -#endif - } else { - if(ppv>lpv) { - // update last peak positions -#ifdef DEBUG - Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); -#endif - lpp=i-1; lpv=ppv; - } - } - } - - // update previous values - if(val!=ppv) { - pppv=ppv; ppv=val; - } - } else { - d_nv[i]=val; - } - } - } - - if(return_peaks) { - // record last position - if(lpp>0) { -#ifdef DEBUG - Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); -#endif - ppos.push_back(lpp); - pval.push_back(lpv); - } - - SEXP rpp_R,rpv_R; - PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); - PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); - int* rpp=INTEGER(rpp_R); - double* rpv=REAL(rpv_R); - - for(int i=0;i<ppos.size();i++) { - rpp[i]=ppos[i]; - rpv[i]=pval[i]; - } - - SEXP ans_R, names_R; - PROTECT(names_R = allocVector(STRSXP, 2)); - SET_STRING_ELT(names_R, 0, mkChar("x")); - SET_STRING_ELT(names_R, 1, mkChar("v")); - - PROTECT(ans_R = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ans_R, 0, rpp_R); - SET_VECTOR_ELT(ans_R, 1, rpv_R); - setAttrib(ans_R, R_NamesSymbol, names_R); - - UNPROTECT(4); - return(ans_R); - } else { - UNPROTECT(1); - return(nv); - } - - } - - - - /************************************************************************/ - /* - * wtd - window tag difference implementation - */ - - SEXP wtd(SEXP x_R, // positive strand hist - SEXP y_R, // negative strand hist of the same length - SEXP wsize_R, // outer boundary distance - SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned - SEXP min_peak_dist_R, // distance between closest peaks - SEXP min_peak_val_R, // min peak threshold - SEXP direct_count_R, // whether tag weighting should not be done - SEXP tag_weight_R, // tag weight - SEXP ignore_masking_R, // whether to 
ignore masked regions - SEXP bg_subtract_R, // a flag whether do background subtractio - SEXP bgp_R, // optional background hist for positive strand - SEXP bgn_R, // optional background hist for negative strand - SEXP bg_wsize_R, // window size for the background counts - SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference - SEXP round_up_R // whether to round up fractional signal tag counts - ) - { - -#ifdef DEBUG - Rprintf("start\n"); -#endif - - int whs=INTEGER(wsize_R)[0]; - int* x=INTEGER(x_R); - int* y=INTEGER(y_R); - int n_x=LENGTH(x_R); - - // background-related - int* bgp=INTEGER(bgp_R); - int* bgn=INTEGER(bgn_R); - int bg_whs=INTEGER(bg_wsize_R)[0]; - - - const int return_peaks=*(INTEGER(return_peaks_R)); - const int direct_count=*(INTEGER(direct_count_R)); - const int ignore_masking=*(INTEGER(ignore_masking_R)); - const double min_peak_val=*(REAL(min_peak_val_R)); - const int min_peak_dist=*(INTEGER(min_peak_dist_R)); - const double tag_weight=*(REAL(tag_weight_R)); - - const int round_up=*(INTEGER(round_up_R)); - const int bg_subtract=*(INTEGER(bg_subtract_R)); - const double bg_weight=*(REAL(bg_weight_R)); - - int i; // point at which the value is being calculated - int start=whs+1; - int end=n_x-whs-1; - - // tag counts to calculate the means - int pn1=0; - int pn2=0; - int nn1=0; - int nn2=0; - - // bg tag counts within bg window - int bg_pn1=0; - int bg_pn2=0; - int bg_nn1=0; - int bg_nn2=0; - - SEXP nv; - double *d_nv; - vector<int> ppos; - vector<double> pval; - if(!return_peaks) { - PROTECT(nv=allocVector(REALSXP,n_x)); - d_nv=REAL(nv); - for(int i=0;i<n_x;i++) { - d_nv[i]=0; - } - } - -#ifdef DEBUG - Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking); - Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); -#endif - - int lpp=-1; // last peak position - double lpv=-1000; // last peak value - - double ppv=-1000; // last 
value - int ppl=-1; // position of the last value - double pppv=-1000; // value before last - - - if(ignore_masking==1) { - for(int i=0;i<whs;i++) { - pn1+=x[i]; - pn2+=x[i+whs+1]; - nn1+=y[i]; - nn2+=y[i+whs+1]; - - } - } - - if(bg_subtract) { - // pre-initialize background tag counts, - for(int i=0;i<bg_whs;i++) { - if(i<n_x) { - bg_pn2+=bgp[i]; - bg_nn2+=bgn[i]; - } - } - // increment center of background count window to the start position - for(int i=0;i<start;i++) { - // update background counts - int nl=i-bg_whs-1; - - if(nl>=0) { - bg_pn1-=bgp[nl]; - bg_nn1-=bgn[nl]; - } - bg_pn1+=bgp[i]; - bg_nn1+=bgn[i]; - - if(i>0) { - bg_pn2-=bgp[i-1]; - bg_nn2-=bgn[i-1]; - } - int nr=i+bg_whs; - if(nr<n_x) { - bg_pn2+=bgp[nr]; - bg_nn2+=bgn[nr]; - } - } - - } - - -#ifdef DEBUG - Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2); -#endif - - for(i=start;i<end;i++) { - if(bg_subtract) { - // update background counts - int nl=i-bg_whs-1; - - if(nl>=0) { - bg_pn1-=bgp[nl]; - bg_nn1-=bgn[nl]; - } - bg_pn1+=bgp[i]; - bg_nn1+=bgn[i]; - - if(i>0) { - bg_pn2-=bgp[i-1]; - bg_nn2-=bgn[i-1]; - } - int nr=i+bg_whs; - if(nr<n_x) { - bg_pn2+=bgp[nr]; - bg_nn2+=bgn[nr]; - } - } - - // update counts - if(ignore_masking==1) { - pn1+=x[i-1]-x[i-whs-1]; - pn2+=x[i+whs]-x[i-1]; - nn1+=y[i-1]-y[i-whs-1]; - nn2+=y[i+whs]-y[i-1]; - - } else { - - pn1=pn2=nn1=nn2=0; - - for(int k=0;k<whs;k++) { - int xp1=x[i-k-1]; - int xp2=x[i+k]; - int xn1=y[i-k-1]; - int xn2=y[i+k]; - - // omit masked positions - if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) { - pn1+=xp1; - nn1+=xn1; - pn2+=xp2; - nn2+=xn2; - } - } - } - - double val; - double spn1=((double)pn1)*tag_weight; - double snn1=((double)nn1)*tag_weight; - double spn2=((double)pn2)*tag_weight; - double snn2=((double)nn2)*tag_weight; - if(round_up) { - if(pn1>0 && spn1<1) { spn1=1.0; } - //if(pn2>0 && spn2<1) { spn2=1.0; } - //if(nn1>0 && snn1<1) { snn1=1.0; } - if(nn2>0 && snn2<1) { snn2=1.0; } - } - - if(direct_count) 
{ - val=spn1+snn2; - if(round_up && val<1) { - val=1.0; - } - if(bg_subtract) { - val-=((double) (bg_pn1+bg_nn2))*bg_weight; - } - } else { - if(bg_subtract) { - spn1-=((double)bg_pn1)*bg_weight; - snn1-=((double)bg_nn1)*bg_weight; - spn2-=((double)bg_pn2)*bg_weight; - snn2-=((double)bg_nn2)*bg_weight; - - if(spn2<0) spn2=0; - if(snn1<0) snn1=0; - - if(spn1>0 && snn2>0) { - val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0)); - } else { - val=-(spn2+snn1+1.0); - } - } else { - val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight); - } - } - //double val=sqrt(pn1*nn2); - //if(pn2>nn1) { val-=pn2; } else { val-=pn1; } -#ifdef DEBUG - Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val); -#endif - - if(return_peaks) { - // determine if previous position was a peak - if(ppv>min_peak_val && ppv>val && ppv>pppv) { - if(lpp>0 && (i-lpp+1)>min_peak_dist) { - // record previous peak position - ppos.push_back(lpp); - pval.push_back(lpv); -#ifdef DEBUG - Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); -#endif - if(ppl!=-1 && ppl!=i-1) { - lpp=(int) round((ppl+i-1)/2); - } else { - lpp=i-1; - } - lpv=ppv; -#ifdef DEBUG - Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); -#endif - } else { - if(ppv>lpv) { - // update last peak positions -#ifdef DEBUG - Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); -#endif - if(ppl!=-1 && ppl!=i-1) { - lpp=(int) round((ppl+i-1)/2); - } else { - lpp=i-1; - } - lpv=ppv; - } - } - } - - // update previous values - if(val!=ppv) { - pppv=ppv; ppv=val; ppl=i; - } - } else { - d_nv[i]=val; - } - } - - if(return_peaks) { - // record last position - if(lpp>0) { -#ifdef DEBUG - Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); -#endif - ppos.push_back(lpp); - pval.push_back(lpv); - } - - SEXP rpp_R,rpv_R; - PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); - PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); - int* rpp=INTEGER(rpp_R); - double* rpv=REAL(rpv_R); - - for(int 
i=0;i<ppos.size();i++) { - rpp[i]=ppos[i]; - rpv[i]=pval[i]; - } - - SEXP ans_R, names_R; - PROTECT(names_R = allocVector(STRSXP, 2)); - SET_STRING_ELT(names_R, 0, mkChar("x")); - SET_STRING_ELT(names_R, 1, mkChar("v")); - - PROTECT(ans_R = allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ans_R, 0, rpp_R); - SET_VECTOR_ELT(ans_R, 1, rpv_R); - setAttrib(ans_R, R_NamesSymbol, names_R); - - UNPROTECT(4); - return(ans_R); - } else { - UNPROTECT(1); - return(nv); - } - - } - - -} - -