Mercurial > repos > zzhou > spp_phantompeak
changeset 8:eeea5224f074 draft
Uploaded
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/DESCRIPTION Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,12 @@ +Package: spp +Type: Package +Title: some description +Version: 1.0 +Date: 2008-11-10 +Author: Peter K +Depends: caTools +Maintainer: peterK<peterk@compbio.med.harvard.edu> +Description: Describe the package +License: GPL-2 +LazyLoad: yes +Packaged: Wed Nov 12 10:42:54 2008; vidhuch
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/NAMESPACE Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,3 @@ +useDynLib(spp) + +exportPattern("^[^\\.]")
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/R/zroutines.R Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,2501 @@
#library(caTools)
#dyn.load("src/bed2vector.so");
#dyn.load("src/wdl.so");
#dyn.load("src/peaks.so");
#dyn.load("src/cdensum.so");


# -------- ROUTINES FOR READING IN THE DATA FILES ------------
#
# All readers return a list of per-chromosome lists:
#   tags    - signed tag positions (sorted by absolute position)
#   quality - per-tag quality value, parallel to tags
#   names   - (only with read.tag.names=TRUE) per-tag names
#   length  - (only for readers that report it) per-tag match length
#
# fix.chromosome.names : remove ".fa" suffix from match sequence names

# internal helper (dot-prefixed, so not exported by the NAMESPACE pattern):
# sorts every per-chromosome record by absolute tag position and applies the
# same permutation to each of the listed parallel fields
.spp.order.tag.records <- function(tl,fields) {
  lapply(tl,function(d) {
    xo <- order(abs(d$t));
    for(f in fields) {
      # fields absent from the record stay absent (NULL[xo] is NULL)
      d[[f]] <- d[[f]][xo];
    }
    return(d);
  });
}

# internal helper: drop a trailing ".fa" from the chromosome names.
# The pattern is anchored so that only the suffix is removed (the previous
# unanchored pattern also mangled names such as "chr1.fasta").
.spp.fix.chromosome.names <- function(tl) {
  names(tl) <- sub("\\.fa$","",names(tl));
  return(tl);
}

# internal helper: turn a list of per-chromosome records into a list of
# per-field lists; 'fields' is a named character vector mapping
# output name -> record field (e.g. c(tags="t",quality="n"))
.spp.split.tag.records <- function(tl,fields) {
  lapply(fields,function(f) lapply(tl,function(d) d[[f]]));
}

# read tags from an eland output file
# extended / multi select the alternative native parsers; when both are set
# 'multi' wins
read.eland.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE,max.eland.tag.length=-1,extended=FALSE,multi=FALSE) {
  rtn <- if(read.tag.names) 1L else 0L;
  storage.mode(max.eland.tag.length) <- "integer";
  callfunction <- "read_eland";
  if(extended) { callfunction <- "read_eland_extended"; }
  if(multi) { callfunction <- "read_eland_multi"; }
  tl <- .spp.order.tag.records(.Call(callfunction,filename,rtn,max.eland.tag.length),
                               c("t","n",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n",if(read.tag.names) c(names="s")));
}

# read tags from a tagAlign file
# fix.quality : recode the score column into integer quality bins, 0 being the
#               best (highest) score observed on the chromosome
read.tagalign.tags <- function(filename,fix.chromosome.names=TRUE,fix.quality=TRUE) {
  tl <- .spp.order.tag.records(.Call("read_tagalign",filename),c("t","n"));
  if(fix.quality) { # Anshul: changed the way the quality field is processed
    # (the earlier scheme was 4-cut(d$n,breaks=c(0,250,500,750,1000),labels=F))
    tl <- lapply(tl,function(d) {
      if(min(d$n)<0.5) {
        d$n <- ceiling(1000/4^d$n);
      }
      break.vals <- unique(sort(c(0,unique(d$n))));
      d$n <- length(break.vals)-1-cut(d$n,breaks=break.vals,labels=FALSE);
      return(d);
    });
  }
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n"));
}


# read tags from the short arachne format
read.short.arachne.tags <- function(filename,fix.chromosome.names=FALSE) {
  tl <- .spp.order.tag.records(.Call("read_arachne",filename),c("t","n"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n"));
}


# read tags from the long arachne format (includes match length)
read.arachne.tags <- function(filename,fix.chromosome.names=FALSE) {
  tl <- .spp.order.tag.records(.Call("read_arachne_long",filename),c("t","n","l"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags, quality and length
  .spp.split.tag.records(tl,c(tags="t",quality="n",length="l"));
}

# read tags from bowtie output
read.bowtie.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_bowtie",filename,rtn),
                               c("t","n",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n",if(read.tag.names) c(names="s")));
}

# read tags from a BAM file
read.bam.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_bam",filename,rtn),
                               c("t","n",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n",if(read.tag.names) c(names="s")));
}


# read tags from a helicos tab-delimited file
# include.length.info is accepted for interface compatibility; the length
# information is always returned
read.helicos.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=FALSE,include.length.info=TRUE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_helicostabf",filename,rtn),
                               c("t","n","l",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags, quality and length
  .spp.split.tag.records(tl,c(tags="t",quality="n",length="l",if(read.tag.names) c(names="s")));
}

# read tags from a text maq map file
read.maqmap.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_maqmap",filename,rtn),
                               c("t","n",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n",if(read.tag.names) c(names="s")));
}


# read tags from a binary maq map file
read.bin.maqmap.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_binmaqmap",filename,rtn),
                               c("t","n",if(read.tag.names) "s"));
  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  .spp.split.tag.records(tl,c(tags="t",quality="n",if(read.tag.names) c(names="s")));
}


# read in tags from an extended eland format with match length information
read.meland.tags <- function(filename,read.tag.names=FALSE,fix.chromosome.names=TRUE) {
  rtn <- if(read.tag.names) 1L else 0L;
  tl <- .spp.order.tag.records(.Call("read_meland",filename,rtn),
                               c("t","n","l",if(read.tag.names) "s"));

  if(fix.chromosome.names) { tl <- .spp.fix.chromosome.names(tl); }
  # separate tags and quality
  chrl <- names(tl); names(chrl) <- chrl;
  # reformulate quality scores into a single monotonic value:
  # (longest match length - match length) + mismatch count/10, lower is better
  ml <- max(unlist(lapply(tl,function(d) max(d$l))));
  qual <- lapply(chrl,function(chr) (ml-tl[[chr]]$l)+tl[[chr]]$n/10);
  res <- list(tags=lapply(tl,function(d) d$t),quality=qual);
  if(read.tag.names) {
    res$names <- lapply(tl,function(d) d$s);
  }
  return(res);
}

# -------- ROUTINES FOR ASSESSING BINDING PATTERN AND SELECTING INFORMATIVE TAGS ------------

# removes tag positions that have anomalously high counts on both strands
# z - z-score used to determine anomalous bins
# zo - z
#      used to filter out one-strand matches
# trim.fraction - fraction of top bins to discard when calculating overall background density
#
# data is a list as returned by the read.*.tags routines ($tags plus optional
# per-tag $quality, $names, $length). Returns the same structure with the
# anomalous positions removed and chromosomes left without tags dropped.
remove.tag.anomalies <- function(data, bin=1,trim.fraction=1e-3,z=5,zo=3*z) {

  # returns, for one chromosome, either the filtered tag vector or (with
  # return.indecies=TRUE) a logical vector marking the tags that are kept
  t.remove.tag.anomalies <- function(tv,bin=1,trim.fraction=1e-3,z=5,zo=3*z,return.indecies=FALSE) {
    # tag counts per (signed) bin
    tt <- table(floor(tv/bin));

    # background mean/sd of the counts, ignoring the top trim.fraction of bins
    stt <- sort(as.numeric(tt));
    stt <- stt[1:(length(stt)*(1-trim.fraction))];
    mtc <- mean(stt); tcd <- sqrt(var(stt));

    thr <- max(1,ceiling(mtc+z*tcd));     # threshold for two-strand matches
    thr.o <- max(1,ceiling(mtc+zo*tcd));  # stricter threshold for one-strand matches
    # keep only the bins exceeding the threshold
    tt <- tt[tt>=thr]
    # bins that are anomalous on both + and - strands
    tp <- as.numeric(names(tt));
    pti <- tp>0;
    it <- intersect(tp[pti],(-1)*tp[!pti]);
    # add one-strand matches
    it <- unique(c(it,tp[tt>=thr.o]));
    sit <- c(it,(-1)*it);

    if(bin>1) {
      # expand bin indices back into the positions they cover
      sit <- sit*bin;
      sit <- c(sit,unlist(lapply(1:bin,function(i) sit+i)))
    }
    keep <- !tv %in% sit;
    if(return.indecies) {
      return(keep);
    } else {
      return(tv[keep]);
    }
  }

  vil <- lapply(data$tags,t.remove.tag.anomalies,return.indecies=TRUE,bin=bin,trim.fraction=trim.fraction,z=z,zo=zo);
  chrl <- names(data$tags); names(chrl) <- chrl;
  data$tags <- lapply(chrl,function(chr) data$tags[[chr]][vil[[chr]]]);
  # count tags to remove empty chromosomes
  nt <- unlist(lapply(data$tags,length));
  data$tags <- data$tags[nt!=0];

  # apply the same filter to every per-tag vector that parallels $tags;
  # $length (returned by the arachne/helicos readers) is included so that it
  # does not fall out of step with the filtered tags
  for(fld in c("quality","names","length")) {
    if(!is.null(data[[fld]])) {
      data[[fld]] <- lapply(chrl,function(chr) data[[fld]][[chr]][vil[[chr]]]);
      data[[fld]] <- data[[fld]][nt!=0];
    }
  }

  return(data);
}

# caps or removes tag positions that are significantly higher than local background
# tags - per-chromosome list of tag vectors; the remaining arguments are
# forwarded to filter.singular.positions.by.local.density (previously they
# were ignored in favour of hard-coded values equal to the defaults)
remove.local.tag.anomalies <- function(tags,window.size=200,eliminate.fold=10,cap.fold=4,z.threshold=3) {
  lapply(tags,filter.singular.positions.by.local.density,window.size=window.size,eliminate.fold=eliminate.fold,cap.fold=cap.fold,z.threshold=z.threshold);
}



#
# assess strand cross-correlation, determine peak position, determine appropriate window size
# for binding detection.
#
# data      - list with $tags and (optionally) $quality, as returned by the readers
# srange    - range of strand shifts over which cross-correlation is evaluated
# bin       - bin size used for the cross-correlation
# cluster   - optional cluster object passed to clusterApplyLB
# min.tag.count, acceptance.z.score - control which additional quality bins are accepted
# remove.tag.anomalies, anomalies.z - whether (and how strictly) to pre-filter anomalous positions
# accept.all.tags - skip the quality-bin analysis and use all tags
#
# Returns list(cross.correlation=data.frame(x,y), peak=list(x,y), whs=...),
# plus $quality.bin.acceptance when quality bins were analysed.
get.binding.characteristics <- function(data,srange=c(50,500),bin=5,cluster=NULL,debug=FALSE,min.tag.count=1e3,acceptance.z.score=3,remove.tag.anomalies=TRUE,anomalies.z=5,accept.all.tags=FALSE) {
  if(remove.tag.anomalies) {
    # the logical argument shadows the function name; R still resolves the
    # call to the function because it skips non-function bindings
    data <- remove.tag.anomalies(data,z=anomalies.z);
  }

  use.quality <- !is.null(data$quality) && !accept.all.tags;

  # take highest quality tag bin
  if(use.quality) {
    min.bin <- min(unlist(lapply(data$quality,min)))
    chrl <- names(data$tags); names(chrl) <- chrl;
    otl <- lapply(chrl,function(chr) data$tags[[chr]][data$quality[[chr]]==min.bin]);
  } else {
    otl <- data$tags;
  }
  # remove empty chromosomes
  otl <- otl[unlist(lapply(otl,length))!=0];


  # calculate strand scc
  if(!is.null(cluster)) {
    cc <- clusterApplyLB(cluster,otl,tag.scc,srange=srange,bin=bin);
    names(cc) <- names(otl);
  } else {
    cc <- lapply(otl,tag.scc,srange=srange,bin=bin);
  }
  ccl <- list(sample=cc);
  ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,return.ac=TRUE,ttl=list(sample=otl),plot=FALSE)[[1]]
  ccl.av <- data.frame(x=as.numeric(names(ccl.av)),y=as.numeric(ccl.av));

  # find peak
  n.cc <- length(ccl.av$y);
  peak.i <- which.max(ccl.av$y);

  # determine width at third-height
  th <- (ccl.av$y[peak.i]-ccl.av$y[n.cc])/3+ccl.av$y[n.cc]
  whs <- max(ccl.av$x[ccl.av$y>=th]);

  # Anshul: added this to avoid situations where whs ends up being -Inf.
  # NOTE(review): the guard used to be !is.integer(whs), which is TRUE for
  # every double and therefore always replaced the third-height width with the
  # fallback below; it now fires only for non-finite values.
  if(!is.finite(whs)) {
    whs <- ccl.av$x[ min(c(2*peak.i,n.cc)) ]
  }

  # determine acceptance of different quality bins

  # calculates tag scc for the best tags, and combinations of best tag category with every other category
  # for subsequent selection of acceptable categories
  scc.acceptance.calc <- function() {

    qr <- range(unlist(lapply(data$quality,range)))

    # start with best tags

    # determine half-width for scc calculations: width at half-height around
    # the peak, but at least 20bp and 10 bins
    hth <- (ccl.av$y[peak.i]-ccl.av$y[n.cc])/2+ccl.av$y[n.cc]
    lwhs <- max(ccl.av$x[ccl.av$y>=hth])-ccl.av$x[peak.i];
    lwhs <- max(c(20,bin*10,lwhs));
    srange <- ccl.av$x[peak.i]+c(-lwhs,lwhs)

    # calculate chromosome-average scc
    t.scc <- function(tags) {
      if(is.null(cluster)) {
        cc <- lapply(tags,tag.scc,srange=srange,bin=bin);
      } else {
        cc <- clusterApplyLB(cluster,tags,tag.scc,srange=srange,bin=bin); names(cc) <- names(tags);
      }
      return(t.plotavcc(1,type='l',ccl=list(cc),ttl=list(tags),plot=FALSE,return.ac=TRUE))
    }


    # returns the chromosome-average scc for the best tags combined with the
    # tags of quality bin 'qual' (NULL if that bin has too few tags)
    t.cat <- function(qual) {
      # construct tag set
      if(qual==qr[1]) {
        ts <- otl;
      } else {
        nts <- names(otl); names(nts) <- nts;
        # select tags
        at <- lapply(nts,function(chr) data$tags[[chr]][data$quality[[chr]]==qual]);
        ntags <- sum(unlist(lapply(at,length)));
        if(ntags<min.tag.count) { return(NULL); }

        # append to otl
        ts <- lapply(nts,function(nam) c(otl[[nam]],at[[nam]]));
      }

      return(t.scc(ts));
    }


    # calculate cross-correlation values for each quality bin
    ql <- sort(unique(unlist(lapply(data$quality,unique)))); names(ql) <- ql;

    qccl <- lapply(ql,t.cat);

    # acceptance tests: a bin is informative if adding it significantly
    # increases the cross-correlation relative to the best bin alone
    p.thr <- pnorm(acceptance.z.score,lower.tail=FALSE);
    best.cc <- qccl[[as.character(min.bin)]];
    ac <- c(TRUE,unlist(lapply(qccl[-1],function(d) {
      if(is.null(d)) { return(FALSE) }
      t.test(d-best.cc,alternative="greater")$p.value<p.thr
    })));
    names(ac) <- names(qccl);
    return(list(informative.bins=ac,quality.cc=qccl))
  }

  res <- list(cross.correlation=ccl.av,peak=list(x=ccl.av$x[peak.i],y=ccl.av$y[peak.i]),whs=whs);
  if(use.quality) {
    res$quality.bin.acceptance <- scc.acceptance.calc();
  }
  return(res);

}


# select a set of informative tags based on the pre-calculated binding characteristics
# data - list with $tags and $quality
# binding.characteristics - result of get.binding.characteristics; when it is
#   NULL or lacks quality.bin.acceptance, all tags are returned
select.informative.tags <- function(data,binding.characteristics=NULL) {
  if(is.null(binding.characteristics)) {
    return(data$tags);
  }
  acceptance <- binding.characteristics$quality.bin.acceptance;
  if(is.null(acceptance)) {
    cat("binding characteristics doesn't contain quality selection info, accepting all tags\n");
    return(data$tags);
  }

  # names of the accepted quality bins
  ib <- acceptance$informative.bins;
  abn <- names(ib)[ib]

  chrl <- names(data$tags); names(chrl) <- chrl;
  lapply(chrl,function(chr) {
    data$tags[[chr]][as.character(data$quality[[chr]]) %in% abn]
  })
}

# -------- ROUTINES FOR CALLING BINDING POSITIONS ------------

# determine binding positions
# signal.data - IP tag lists
# control.data - input tag lists
# e.value - desired E-value threshold (either E-value or FDR threshold must be provided)
# fdr - desired FDR threshold
# min.dist - minimal distance between detected positions
# tag.count.whs - size of the window to be used to estimate confidence interval of the peak fold enrichment ratios
# enrichment.z - Z-score defining the desired confidence level for enrichment interval estimates
# enrichment.background.scales - define how many times larger should be the window for estimating background
#                                tag density when evaluating peak enrichment confidence intervals.
#                                If multiple values are given, multiple independent interval estimates will be
#                                calculated.
# tec.filter - whether to mask out the regions that exhibit significant background enrichment
# tec.window.size, tec.z - window size and Z-score for masking out significant background enrichment regions
#
# If the control.data is not provided, the method will assess significance of the determined binding positions
# based on the randomizations of the original data. The following parameters control such randomizations:
# n.randomizations - number of randomizations to be performed
# shuffle.window - size of the bin that defines the tags that are kept together during randomization.
#                  value of 0 means that all tags are shuffled independently
#
# Binding detection methods:
# tag.wtd - default method.
#           must specify parameter "whs", which is the half-size of the window used to calculate binding scores
# tag.lwcc - LWCC method;
#           must specify whs - a size of the window used to calculate binding scores
#           can specify isize (default=15bp) - size of the internal window that is masked out
#
# f<1 subsamples that fraction of the signal tags before detection.
# mle.filter - keep only positions whose enrichment MLE exceeds
#              min.mle.threshold at every background scale
find.binding.positions <- function(signal.data,f=1,e.value=NULL,fdr=NULL, masked.data=NULL,control.data=NULL,whs=200,min.dist=200,window.size=4e7,cluster=NULL,debug=T,n.randomizations=3,shuffle.window=1,min.thr=2,topN=NULL, tag.count.whs=100, enrichment.z=2, method=tag.wtd, tec.filter=T,tec.window.size=1e4,tec.z=5,tec.masking.window.size=tec.window.size, tec.poisson.z=5,tec.poisson.ratio=5, tec=NULL, n.control.samples=1, enrichment.scale.down.control=F, enrichment.background.scales=c(1,5,10), use.randomized.controls=F, background.density.scaling=T, mle.filter=F, min.mle.threshold=1, ...) {

  if(f<1) {
    if(debug) { cat("subsampling signal ... "); }
    signal.data <- lapply(signal.data,function(x) sample(x,length(x)*f))
    if(debug) { cat("done\n"); }
  }


  if(!is.null(control.data) && !use.randomized.controls) {
    # limit both control and signal data to a common set of chromosomes
    chrl <- intersect(names(signal.data),names(control.data));
    signal.data <- signal.data[chrl];
    control.data <- control.data[chrl];
    control <- list(control.data);
  } else {
    control <- NULL;
  }

  prd <- lwcc.prediction(signal.data,min.dist=min.dist,whs=whs,window.size=window.size,e.value=e.value,fdr=fdr,debug=debug,n.randomizations=n.randomizations,shuffle.window=shuffle.window,min.thr=min.thr,cluster=cluster,method=method,bg.tl=control.data,mask.tl=masked.data, topN=topN, control=control,tec.filter=tec.filter,tec.z=tec.z,tec.window.size=tec.window.size, tec.masking.window.size=tec.masking.window.size, tec.poisson.z=tec.poisson.z,tec.poisson.ratio=tec.poisson.ratio, background.density.scaling=background.density.scaling, ...);

  # add tag counts
  chrl <- names(prd$npl); names(chrl) <- chrl;
  prd$npl <- lapply(chrl,function(chr) {
    pd <- prd$npl[[chr]];
    pd$nt <- points.within(abs(signal.data[[chr]]),pd$x-tag.count.whs,pd$x+tag.count.whs,return.point.counts=T);
    return(pd);
  });
  prd$f <- f;
  prd$n <- sum(unlist(lapply(signal.data,length)));
  if(!is.null(control.data)) {
    prd$n.bg <- sum(unlist(lapply(control.data,length)));
  }

  # calculate enrichment ratios
  prd <- calculate.enrichment.estimates(prd,signal.data,control.data=control.data,fraction=1,tag.count.whs=tag.count.whs,z=enrichment.z,scale.down.control=enrichment.scale.down.control,background.scales=enrichment.background.scales);

  if(mle.filter && length(prd$npl)>0) {
    mle.columns <- grep("enr.mle",colnames(prd$npl[[1]]));
    if(length(mle.columns)>0) {
      # drop=FALSE keeps a single MLE column as a one-column table, so the
      # filter also works for one background scale (it used to be skipped
      # for a single chromosome or a single MLE column)
      prd$npl <- lapply(prd$npl,function(d) d[apply(d[,mle.columns,drop=FALSE],1,function(x) all(x>min.mle.threshold)),])
    }
  }

  prd$whs <- whs;

  return(prd);
}



# -------- ROUTINES FOR WRITING OUT TAG DENSITY AND ENRICHMENT PROFILES ------------
# calculate smoothed tag density, optionally subtracting the background
# bg.weight - weight applied to the control density before subtraction; when
#             NULL and control.tags are given it is estimated with
#             dataset.density.ratio (a supplied value is no longer overwritten)
# rngl      - optional per-chromosome c(from,to) ranges
# returns a per-chromosome list of data.frame(x,y)
get.smoothed.tag.density <- function(signal.tags,control.tags=NULL,bandwidth=150,bg.weight=NULL,tag.shift=146/2,step=round(bandwidth/3),background.density.scaling=T,rngl=NULL,scale.by.dataset.size=F) {
  chrl <- names(signal.tags); names(chrl) <- chrl;

  if(!is.null(control.tags) && is.null(bg.weight)) {
    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling);
  }

  if(scale.by.dataset.size) {
    den.scaling <- dataset.density.size(signal.tags,background.density.scaling=background.density.scaling)/1e6;
  } else {
    den.scaling <- 1;
  }

  lapply(chrl,function(chr) {
    ad <- abs(signal.tags[[chr]]+tag.shift);
    rng <- NULL;
    if(!is.null(rngl)) {
      rng <- rngl[[chr]];
    }
    if(is.null(rng)) {
      rng <- range(ad);
    }

    ds <- densum(ad,bw=bandwidth,from=rng[1],to=rng[2],return.x=T,step=step);
    if(!is.null(control.tags) && !is.null(control.tags[[chr]])) {
      bsd <- densum(abs(control.tags[[chr]]+tag.shift),bw=bandwidth,from=rng[1],to=rng[2],return.x=F,step=step);
      ds$y <- ds$y-bsd*bg.weight;
    }
    return(data.frame(x=seq(ds$x[1],ds$x[2],by=step),y=den.scaling*ds$y))
  })
}

# get smoothed maximum likelihood estimate of the log2 signal to control enrichment ratio
# ... is forwarded to get.smoothed.tag.density (e.g. bandwidth, step);
# tag.shift is now forwarded as well, so that the densities use the same
# shift as the common range computed here
get.smoothed.enrichment.mle <- function(signal.tags, control.tags, tag.shift=146/2, background.density.scaling=F, pseudocount=1,bg.weight=NULL, ... ) {
  # determine common range
  chrl <- intersect(names(signal.tags),names(control.tags)); names(chrl) <- chrl;
  rngl <- lapply(chrl,function(chr) range(c(range(abs(signal.tags[[chr]]+tag.shift)),range(abs(control.tags[[chr]]+tag.shift)))))
  ssd <- get.smoothed.tag.density(signal.tags, rngl=rngl, tag.shift=tag.shift, ..., scale.by.dataset.size=F)
  csd <- get.smoothed.tag.density(control.tags, rngl=rngl, tag.shift=tag.shift, ..., scale.by.dataset.size=F)
  if(is.null(bg.weight)) {
    bg.weight <- dataset.density.ratio(signal.tags,control.tags,background.density.scaling=background.density.scaling);
  }
  lapply(chrl,function(chr) {
    d <- ssd[[chr]];
    d$y <- log2(d$y+pseudocount) - log2(csd[[chr]]$y+pseudocount) - log2(bg.weight);
    return(d);
  })
}


# returns a conservative upper/lower bound profile (log2) given signal tag list, background tag list and window scales
# ftl/btl - signal/background tag lists; fws - signal window size;
# bwsl - background window sizes; posl - optional per-chromosome positions at
# which to evaluate (otherwise a regular grid with the given step is used)
# return.mle - also return the mle, lb and ub columns (log2)
get.conservative.fold.enrichment.profile <- function(ftl,btl,fws,bwsl=c(1,5,25,50)*fws,step=50,tag.shift=146/2,alpha=0.05,use.most.informative.scale=F,quick.calculation=T,background.density.scaling=T,bg.weight=NULL,posl=NULL,return.mle=F) {
  # include only chromosomes with more than 2 reads
  ftl <- ftl[unlist(lapply(ftl,length))>2]
  chrl <- names(ftl); names(chrl) <- chrl;
  if(!is.null(posl)) {
    chrl <- chrl[chrl %in% names(posl)];
  }
  # calculate background tag ratio
  if(is.null(bg.weight)) {
    bg.weight <- dataset.density.ratio(ftl,btl,background.density.scaling=background.density.scaling);
  }
  lapply(chrl,function(chr) {
    if(is.null(btl[[chr]])) { bt <- c(); } else { bt <- abs(btl[[chr]]+tag.shift); }
    if(is.null(posl)) {
      x <- mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha);
      xpos <- seq(x$x$s,x$x$e,by=x$x$step);
    } else {
      x <- mbs.enrichment.bounds(abs(ftl[[chr]]+tag.shift),bt,fws=fws,bwsl=bwsl,step=step,calculate.upper.bound=T,bg.weight=bg.weight,use.most.informative.scale=use.most.informative.scale,quick.calculation=quick.calculation,alpha=alpha,pos=posl[[chr]]);
      xpos <- posl[[chr]];
    }
    # compose profile showing lower bound for enriched, upper bound for depleted regions
    ps <- rep(1,length(x$mle));
    vi <- which(!is.na(x$lb) & x$lb>1);
    ps[vi] <- x$lb[vi];
    vi <- which(!is.na(x$ub) & x$ub<1);
    ps[vi] <- x$ub[vi];
    ps <- log2(ps);
    if(return.mle) {
      return(data.frame(x=xpos,y=ps,mle=log2(x$mle),lb=log2(x$lb),ub=log2(x$ub)));
    } else {
      return(data.frame(x=xpos,y=ps));
    }
  })
}


# write a per-chromosome $x/$y data structure into a wig file
# zip=TRUE compresses the result with the external "zip" program and removes
# the uncompressed file; returns the name of the file that was produced
# (threshold is accepted but not used here)
writewig <- function(dat,fname,feature,threshold=5,zip=F) {
  chrl <- names(dat); names(chrl) <- chrl;
  invisible(lapply(chrl,function(chr) {
    bdiff <- dat[[chr]];
    ind <- seq_along(bdiff$x);
    ind <- ind[!is.na(bdiff$y[ind])];
    # only the first chromosome starts the file and writes the header
    header <- chr==chrl[1];
    write.probe.wig(chr,bdiff$x[ind],bdiff$y[ind],fname,append=!header,feature=feature,header=header);
  }))
  if(zip) {
    zf <- paste(fname,"zip",sep=".");
    # shQuote protects file names containing spaces, quotes or shell metacharacters
    status <- system(paste("zip",shQuote(zf),shQuote(fname)));
    if(status!=0) {
      warning("zip failed for ",fname,"; keeping the uncompressed file");
      return(fname);
    }
    file.remove(fname);
    return(zf);
  } else {
    return(fname);
  }
}



# -------- ROUTINES FOR ANALYZING SATURATION PROPERTIES ------------

# PUBLIC
# calculate minimal saturation enrichment ratios (MSER)
# chains - previously computed subsampling chains (computed here when NULL)
# return.chains - return list(mser=..., chains=...) instead of just the MSERs
get.mser <- function(signal.data,control.data,n.chains=5,step.size=1e5, chains=NULL, cluster=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), n.steps=1, ...) {
  if(is.null(chains)) {
    ci <- c(1:n.chains); names(ci) <- ci;
    if(is.null(cluster)) {
      chains <- lapply(ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...);
    } else {
      chains <- clusterApplyLB(cluster,ci,get.subsample.chain.calls,signal.data=signal.data,control.data=control.data,n.steps=n.steps,step.size=step.size,subsample.control=F, enrichment.background.scales=enrichment.background.scales, ...);
      names(chains) <- ci;
    }
  }
  cvl <- mser.chain.interpolation(chains=chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=F);
  if(n.steps>1) {
    msers <- cvl;
  } else {
    msers <- unlist(lapply(cvl,function(d) d$me))
  }
  if(return.chains) {
    return(list(mser=msers,chains=chains));
  } else {
    return(msers);
  }
}

# PUBLIC
# interpolate MSER dependency on tag counts
# returns, per background scale, list(prediction=<tag count predicted to reach
# target.fold.enrichment>, log10.fit=<lm fit>), or list(prediction=NA) when
# the fit cannot be made; with return.chains=TRUE the chains are returned too
get.mser.interpolation <- function(signal.data,control.data,target.fold.enrichment=5,n.chains=10,n.steps=6,step.size=1e5, chains=NULL, test.agreement=0.99, return.chains=F, enrichment.background.scales=c(1), excluded.steps=c(seq(2,n.steps-2)), ...) {
  # (test.agreement used to be passed under the misspelled name
  # "test.agrement", which leaked into ... instead of reaching get.mser)
  msers <- get.mser(signal.data,control.data,n.chains=n.chains,n.steps=n.steps,step.size=step.size,chains=chains,test.agreement=test.agreement,return.chains=T,enrichment.background.scales=enrichment.background.scales,excluded.steps=excluded.steps, ...);

  # adjust sizes in case a subset of chromosomes was used
  mser <- mser.chain.interpolation(chains=msers$chains,enrichment.background.scales=enrichment.background.scales,test.agreement=test.agreement,return.lists=T);
  sr <- sum(unlist(lapply(signal.data,length)))/mser[[1]][[1]]$n[1];

  # Subsampling each chain requires removing a fraction of each chromosome's
  # tag list. To get the exact step.size, this often leaves chromosomes with
  # a non-integer number of tags. The non-integer values are floored, so each
  # chr can contribute at most 0.999.. <= 1 error to the step.size.
  floor.error <- length(msers$chains[[1]][[1]]$npl)
  intpn <- lapply(mser,function(ms) {
    lmvo <- do.call(rbind,ms)
    lmvo$n <- lmvo$n*sr;
    # Don't select rows corresponding to excluded.steps
    # Keep in mind that nd values are negative.
    lmvo <- lmvo[lmvo$nd <= (lmvo$nd[1] + floor.error) & lmvo$nd >= (lmvo$nd[1] - floor.error),];
    lmvo <- na.omit(lmvo);
    if(any(lmvo$me==1)) {
      # log10(me-1) would be -Inf
      return(list(prediction=NA));
    }
    lmvo$n <- log10(lmvo$n); lmvo$me <- log10(lmvo$me-1)
    # linear fit in log10-log10 space, solved for the target enrichment
    emvf <- lm(me ~ n,data=lmvo);
    tfe <- (log10(target.fold.enrichment-1)-coef(emvf)[[1]])/coef(emvf)[[2]];
    tfen <- 10^tfe;
    return(list(prediction=tfen,log10.fit=emvf));
  })

  if(return.chains) {
    return(list(interpolation=intpn,chains=msers$chains))
  } else {
    return(intpn);
  }
}


# output binding detection results to a text file
# the file will contain a table with each row corresponding
# to a detected position, with the following columns:
# chr - chromosome or target sequence
# pos - position of detected binding site on the chromosome/sequence
# score - a score reflecting magnitude of the binding
# Evalue - E-value corresponding to the peak magnitude
# FDR - FDR corresponding to the peak magnitude
# enrichment.lb - lower bound of the fold-enrichment ratio
# enrichment.mle - maximum likelihood estimate of the fold-enrichment ratio
#
# NOTE(review): for results thresholded by topN the Evalue and FDR columns are
# not written, although the header line still lists them.
output.binding.results <- function(results,filename) {
  write(file=filename,"chr\tpos\tscore\tEvalue\tFDR\tenrichment.lb\tenrichment.mle",append=F);
  chrl <- names(results$npl); names(chrl) <- chrl;
  invisible(lapply(chrl,function(chr) {
    d <- results$npl[[chr]];
    if(dim(d)[1]>0) {
      if(results$thr$type=="topN") {
        od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,enr,enr.mle)))
      } else {
        od <- cbind(rep(chr,dim(d)[1]),subset(d,select=c(x,y,evalue,fdr,enr,enr.mle)))
      }
      write.table(od,file=filename,col.names=F,row.names=F,sep="\t",append=T,quote=F)
    }
  }))
}


# -------- LOW-LEVEL ROUTINES ------------

# calculates tag strand cross-correlation for a range of shifts (on positive strand)
# tags   - signed tag positions of one chromosome
# srange - range of shifts; bin - bin size
# tt     - optional precomputed table of tag counts per signed bin
# llim   - bins with counts >= llim times the mean count are discarded
# returns a vector of correlations named by the shift (in bp)
tag.scc <- function(tags,srange=c(50,250),bin=1,tt=NULL,llim=10) {
  if(is.null(tt)) {
    tt <- table(sign(tags)*as.integer(floor(abs(tags)/bin+0.5)));
  }
  if(!is.null(llim)) { l <- mean(tt); tt <- tt[tt<llim*l] }
  tc <- as.integer(names(tt));
  tt <- as.numeric(tt);

  pti <- which(tc>0)
  nti <- which(tc<0);

  # occupied bins and their counts on each strand
  ptc <- tc[pti];
  ntc <- (-1)*tc[nti];

  ptv <- tt[pti];
  ntv <- tt[nti];

  trng <- range(c(range(ptc),range(ntc)))
  l <- diff(trng)+1;
  rm(tc,tt);

  # mean-centre the counts; empty bins contribute (0-mean)
  mp <- sum(ptv)*bin/l;   mn <- sum(ntv)*bin/l;
  ptv <- ptv-mp; ntv <- ntv-mn;
  ss <- sqrt((sum(ptv*ptv)+(l-length(ptv))*mp^2) * (sum(ntv*ntv)+(l-length(ntv))*mn^2));

  t.cor <- function(s) {
    smi <- match(ptc+s,ntc);
    matched <- !is.na(smi);
    mi <- smi[matched];
    # logical mask of negative-strand bins without a partner; a negative
    # index (ntv[-mi]) would select nothing at all when mi is empty
    n.unmatched <- rep(TRUE,length(ntv));
    n.unmatched[mi] <- FALSE;
    return((sum(ptv[matched]*ntv[mi]) -
            mn*sum(ptv[!matched]) -
            mp*sum(ntv[n.unmatched]) +
            mp*mn*(l-length(ptv)-length(ntv)+length(mi)))/ss);
  }
  shifts <- floor(seq(srange[1],srange[2],by=bin)/bin+0.5);
  scc <- unlist(lapply(shifts,t.cor)); names(scc) <- shifts*bin;
  return(scc);
}


# plot tag cross-correlation
# ac - a cross-correlation vector named by shift, or a list of such vectors
#      (the first one defines the axes and the reported maximum)
# min.peak.x - only consider shifts above this value when locating the maximum
# rmw - running-mean window (caTools runmean)
# returns the position of the maximum when plot.max=TRUE
t.plotcc <- function(ac, lab=c(10,5,7), ylab="correlation", xlab="lag", pch=19, grid.i=c(-5:5), grid.s=10, type='b', plot.grid=F, cols=c(1,2,4,"orange",8,"pink"), min.peak.x=NULL, xlim=NULL, plot.147=F, plot.max=T, rmw=1, rescale=F, legendx="right", ltys=rep(1,length(ac)), ...) {
    if(is.list(ac)) {
      cols <- cols[1:length(ac)];

      if(!is.null(xlim)) {
        vx <- as.numeric(names(ac[[1]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
        ac[[1]] <- (ac[[1]])[vx];
      } else {
        xlim <- range(as.numeric(names(ac[[1]])));
      }


      plot(as.numeric(names(ac[[1]])),runmean(ac[[1]],rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, col=cols[1], xlim=xlim, lty=ltys[1], ...);
      if(length(ac)>1) {
        for(i in seq(2,length(ac))) {
          irng <- range(ac[[i]]);
          vx <- as.numeric(names(ac[[i]])); vx <- which(vx>=xlim[1] & vx<=xlim[2]);
          if(rescale) {
            # map the curve onto the value range of the first curve
            lines(as.numeric(names(ac[[i]])[vx]),runmean((ac[[i]][vx]-irng[1])/diff(irng)*diff(range(ac[[1]]))+min(ac[[1]]),rmw),col=cols[i],lty=ltys[i]);
          } else {
            lines(as.numeric(names(ac[[i]]))[vx],runmean(ac[[i]][vx],rmw),col=cols[i],lty=ltys[i]);
          }
        }
      }
      if(is.null(min.peak.x)) {
        m <- as.numeric(names(ac[[1]])[which.max(ac[[1]])]);
      } else {
        sac <- (ac[[1]])[which(as.numeric(names(ac[[1]]))>min.peak.x)]
        m <- as.numeric(names(sac)[which.max(sac)]);
      }
      legend(x="topright",bty="n",legend=c(names(ac)),col=cols,lty=ltys)
    } else {
      if(!is.null(xlim)) {
        vx <- as.numeric(names(ac));
        vx <- which(vx>=xlim[1] & vx<=xlim[2]);
        ac <- ac[vx];
      } else {
        xlim <- range(as.numeric(names(ac)));
      }

      plot(as.numeric(names(ac)),runmean(ac,rmw),type=type,pch=pch,xlab=xlab,ylab=ylab,lab=lab, xlim=xlim, ...);
      if(is.null(min.peak.x)) {
        m <- as.numeric(names(ac)[which.max(ac)]);
      } else {
        # compare shifts numerically (names are character, and a string
        # comparison orders "100" before "50")
        sac <- ac[which(as.numeric(names(ac))>min.peak.x)]
        m <- as.numeric(names(sac)[which.max(sac)]);
      }
    }
    if(plot.147) {
      abline(v=147,lty=2,col=8);
    }
    if(plot.grid) {
      abline(v=m+grid.i*grid.s,lty=3,col="pink");
    }
    if(plot.max) {
      abline(v=m,lty=2,col=2);
      legend(x=legendx,bty="n",legend=c(paste("max at ",m,"bp",sep="")));
      return(m);
    }
  }

  # plot chromosome-average cross-correlation
  # ci  - name/index of the sample within ccl (per-chromosome cross-correlations)
  #       and ttl (per-chromosome tag lists, used as averaging weights)
  # return.ac - return the averaged profile instead of the peak position
  # With a single chromosome its profile is returned directly (no plot is made).
  t.plotavcc <- function(ci, main=paste(ci,"chromosome average"), ccl=tl.cc, return.ac=F, ttl=tl, plot=T, ... ) {
    cc <- ccl[[ci]];
    if(length(cc)==1)  { return(cc[[1]]) };
    if(length(cc)==0) { return(c()) };
    ac <- do.call(rbind,cc);
    # omit NA chromosomes
    ina <- apply(ac,1,function(d) any(is.na(d)));

    # weight chromosomes by their tag counts
    tags <- ttl[[ci]];
    avw <- unlist(lapply(tags,length));    avw <- avw/sum(avw);
    # drop=FALSE: keep a matrix even if only one chromosome remains
    ac <- ac[!ina,,drop=FALSE]; avw <- avw[!ina];
    ac <- apply(ac,2,function(x) sum(x*avw));
    if(plot) {
      m <- t.plotcc(ac, main=main, ...);
      if(!return.ac) { return(m) }
    }
    if(return.ac) { return(ac) }
  }

  # plot the cross-correlation of every chromosome of sample ci in a grid
  # with ncol columns
  t.plotchrcc <- function(ci,ncol=4, ccl=tl.cc, ... ) {
    cc <- ccl[[ci]];
    # round the number of rows up so that every chromosome gets a panel
    par(mfrow = c(ceiling(length(cc)/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
    lapply(names(cc),function(ch) { t.plotcc(cc[[ch]],main=paste(ci,": chr",ch,sep=""), ...) })
  }

  # plot chromosome-average cross-correlations for several profile sets of
  # sample ci (ccl[[ci]] is a list of per-chromosome profile lists)
  t.plotavccl <- function(ci, ccl=tl.ccl, main=paste(ci,"chromosome average"), rtl=tl, ... ) {
    #cc <- lapply(ccl[[ci]],function(x) { if(!is.null(x$M)) { x$M <- NULL;}; return(x); });
    cc <- ccl[[ci]];
    chrs <- names(cc[[1]]); names(chrs) <- chrs;
    acl <- lapply(cc,function(x) do.call(rbind,x));
    tags <- rtl[[ci]][chrs];
    avw <- unlist(lapply(tags,length));    avw <- avw/sum(avw);
    acl <- lapply(acl,function(ac) apply(ac,2,function(x) sum(x*avw)))
    t.plotcc(acl, main=main, ...);
  }

  # per-chromosome version of t.plotavccl
  t.plotchrccl <- function(ci,ccl=tl.ccl,ncol=4, ... ) {
    # cc was previously never assigned, so the function failed on first use
    cc <- ccl[[ci]];
    par(mfrow = c(ceiling(length(cc[[1]])/ncol),ncol), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
    lapply(names(cc[[1]]),function(ch) { t.plotcc(lapply(cc,function(x) x[[ch]]),main=paste(ci,": chr",ch,sep=""), ...) })
  }



# calculate and plot the chromosome-average strand cross-correlation of a tag list
show.scc <- function(tl,srange,cluster=NULL) {
  if(!is.null(cluster)) {
    cc <- clusterApplyLB(cluster,tl,tag.scc,srange=srange);
    names(cc) <- names(tl);
  } else {
    cc <- lapply(tl,tag.scc,srange=srange);
  }
  par(mfrow = c(1,1), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8);
  ccl<-list(sample=cc);
  ccl.av <- lapply(names(ccl),t.plotavcc,type='l',ccl=ccl,xlim=srange,return.ac=F,ttl=list(sample=tl),main="")[[1]]
}

# find regions of significant tag enrichment
# returns (invisibly) a per-chromosome list of clusters with $s/$e extended by
# half of masking.window.size on each side
find.significantly.enriched.regions <- function(signal.data,control.data,window.size=500,multiplier=1,z.thr=3,mcs=0,debug=F,background.density.scaling=T,masking.window.size=window.size,poisson.z=0,poisson.ratio=4,either=F,tag.shift=146/2,bg.weight=NULL) {
  if(is.null(bg.weight)) {
    bg.weight <- dataset.density.ratio(signal.data,control.data,background.density.scaling=background.density.scaling);
  }

  if(debug) {
    cat("bg.weight=",bg.weight,"\n");
  }
  chrl <- names(signal.data); names(chrl) <- chrl;
  tec <- lapply(chrl,function(chr) {
    d <- tag.enrichment.clusters(signal.data[[chr]],control.data[[chr]],bg.weight=bg.weight*multiplier,thr=z.thr,wsize=window.size,mcs=mcs,min.tag.count.z=poisson.z,min.tag.count.ratio=poisson.ratio,either=either,tag.shift=tag.shift);
    d$s <- d$s-masking.window.size/2; d$e <- d$e+masking.window.size/2;
    return(d);
  })
}


# given tag position vectors, find contigs of significant enrichment of signal over background
# thr - z score threshold
# mcs - minimal cluster size
# bg.weight - fraction by which background counts should be multiplied
# min.tag.count.z will impose a poisson constraint based on randomized signal in parallel of background constraint (0 - no constraint)
tag.enrichment.clusters <- function(signal,background,wsize=200,thr=3,mcs=1,bg.weight=1,min.tag.count.z=0,tag.av.den=NULL,min.tag.count.thr=0,min.tag.count.ratio=4,either=F,tag.shift=146/2) {
  if(is.null(tag.av.den)) {
    tag.av.den <-
length(signal)/diff(range(abs(signal))); + } + if(min.tag.count.z>0) { + min.tag.count.thr <- qpois(pnorm(min.tag.count.z,lower.tail=F),min.tag.count.ratio*tag.av.den*wsize,lower.tail=F) + } else { + min.tag.count.thr <- 0; + } + + #if(bg.weight!=1) { + # background <- sample(background,length(background)*(bg.weight),replace=T); + #} + # make up combined position, flag vectors + pv <- abs(c(signal,background)+tag.shift); + fv <- c(rep(1,length(signal)),rep(0,length(background))); + po <- order(pv); + pv <- pv[po]; + fv <- fv[po]; + + #thr <- pnorm(thr,lower.tail=F); + + storage.mode(wsize) <- storage.mode(mcs) <- storage.mode(fv) <- "integer"; + storage.mode(thr) <- storage.mode(pv) <- "double"; + storage.mode(bg.weight) <- "double"; + storage.mode(min.tag.count.thr) <- "double"; + either <- as.integer(either); + storage.mode(either) <- "integer"; + + z <- .Call("find_poisson_enrichment_clusters",pv,fv,wsize,thr,mcs,bg.weight,min.tag.count.thr,either) + return(z); +} + + + + + +# estimates threshold, calculates predictions on complete data and randomized data +# input: tvl +# control - a list of control tag datasets +# no randomization is done if control is supplied +# return.rtp - return randomized tag peaks - do not fit thresholds or do actual predictions +# topN - use min threshold to do a run, return topN peaks from entire genome +# threshold - specify a user-defined threshold +lwcc.prediction <- function(tvl,e.value=NULL, fdr=0.01, chrl=names(tvl), min.thr=0, n.randomizations=1, shuffle.window=1, debug=T, predict.on.random=F, shuffle.both.strands=T,strand.shuffle.only=F, return.rtp=F, control=NULL, print.level=0, threshold=NULL, topN=NULL, bg.tl=NULL, tec.filter=T, tec.window.size=1e3,tec.z=3, tec.masking.window.size=tec.window.size, tec.poisson.z=3,tec.poisson.ratio=4, bg.reverse=T, return.control.predictions=F, return.core.data=F, background.density.scaling=T, ... 
) { + + control.predictions <- NULL; + core.data <- list(); + + if(!is.null(bg.tl) & tec.filter) { + if(debug) { cat("finding background exclusion regions ... "); } + tec <- find.significantly.enriched.regions(bg.tl,tvl,window.size=tec.window.size,z.thr=tec.z,masking.window.size=tec.masking.window.size,poisson.z=tec.poisson.z,poisson.ratio=tec.poisson.ratio,background.density.scaling=background.density.scaling,either=T); + if(return.core.data) { + core.data <- c(core.data,list(tec=tec)); + } + if(debug) { cat("done\n"); } + } + + + if(is.null(threshold) & is.null(topN)) { # threshold determination is needed + # generate control predictions + if(!is.null(control)) { + if(debug) { cat("determining peaks on provided",length(control),"control datasets:\n"); } + if(!is.null(bg.tl)) { + if(bg.reverse) { + if(debug) { cat("using reversed signal for FDR calculations\n"); } + rbg.tl <- tvl; + } else { + if(debug) { cat("generating randomized (within chromosome) background ... "); } + rbg.tl <- lapply(bg.tl,function(d) { + if(length(d)<2) { return(d); } + rng <- range(abs(d)); + rd <- round(runif(length(d),rng[1],rng[2])); + nrd <- sample(1:length(rd),length(which(d<0))); + rd[nrd] <- rd[nrd]*(-1); + return(rd); + }) + if(debug) { cat("done\n"); } + } + } else { + rbg.tl <- NULL; + } + n.randomizations <- length(control); + #signal.size <- sum(unlist(lapply(tvl,length))); + rtp <- lapply(control,function(d) { + # calculate tag.weight + #tag.weight <- sum(unlist(lapply(tvl,length)))/sum(unlist(lapply(d,length))); + tag.weight <- dataset.density.ratio(tvl,d,background.density.scaling=background.density.scaling); + #cat("tag.weight=",tag.weight," "); + return(window.call.mirror.binding(d,min.thr=min.thr, tag.weight=tag.weight,bg.tl=rbg.tl, debug=debug, round.up=T,background.density.scaling=background.density.scaling, ...)); + #return(window.call.mirror.binding(d,min.thr=min.thr, 
method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster)) + }); + if(return.core.data) { + core.data <- c(core.data,list(rtp.unfiltered=rtp)); + } + if(tec.filter) { + if(debug) { cat("excluding systematic background anomalies ... "); } + rtp <- lapply(rtp,filter.binding.sites,tec,exclude=T); + if(debug) { cat("done\n"); } + } + } else { + if(debug) { cat("determining peaks on ",n.randomizations,"randomized datasets:\n"); } + rtp <- lapply(1:n.randomizations,function(i) { + rd <- generate.randomized.data(tvl,shuffle.window=shuffle.window,shuffle.both.strands=shuffle.both.strands,strand.shuffle.only=strand.shuffle.only); + return(window.call.mirror.binding(rd,min.thr=min.thr,bg.tl=bg.tl, debug=debug, ...)); + #return(window.call.mirror.binding(rd,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist)) + }); + } + if(return.control.predictions) { + control.predictions <- rtp; + } + rtp <- do.call(rbind,lapply(rtp,function(d) do.call(rbind,d))); # merge tables + + # generate real data predictions + if(debug) { cat("determining peaks on real data:\n"); } + npl <- window.call.mirror.binding(tvl,min.thr=min.thr,bg.tl=bg.tl, debug=debug, background.density.scaling=background.density.scaling, ...); + #npl <- window.call.mirror.binding(tvl,min.thr=min.thr, method=tag.wtd,wsize=200,bg.tl=control.data,window.size=window.size,debug=T,min.dist=min.dist,cluster=cluster); + if(return.core.data) { + core.data <- c(core.data,list(npl.unfiltered=npl)); + } + + if(!is.null(bg.tl) & tec.filter) { + if(debug) { cat("excluding systematic background anomalies ... 
"); } + npl <- filter.binding.sites(npl,tec,exclude=T); + if(debug) { cat("done\n"); } + } + + # calculate E-value and FDRs for all of the peaks + if(debug) { cat("calculating statistical thresholds\n"); } + chrl <- names(npl); names(chrl) <- chrl; + npld <- do.call(rbind,lapply(names(npl),function(chr) { k <- npl[[chr]]; if(!is.null(k) & dim(k)[1]>0) { k$chr <- rep(chr,dim(k)[1]) }; return(k) })) + npld <- cbind(npld,get.eval.fdr.vectors(npld$y,rtp$y)); + # correct for n.randomizations + npld$fdr <- npld$fdr/n.randomizations; + npld$evalue <- npld$evalue/n.randomizations; + + if(return.core.data) { + core.data <- c(core.data,list(npld=npld)); + } + + # determine actual thresholds + if(is.null(e.value)) { + if(is.null(fdr)) { fdr <- 0.01; } + thr <- list(root=min(npld$y[npld$fdr<=fdr]),type="FDR",fdr=fdr) + if(debug) { cat("FDR",fdr,"threshold=",thr$root,"\n"); } + } else { + # determine threshold based on e-value + thr <- list(root=min(npld$y[npld$evalue<=e.value]),type="Evalue",e.value=e.value) + if(debug) { cat("E-value",e.value,"threshold=",thr$root,"\n"); } + } + + + npld <- npld[npld$y>=thr$root,]; + if(dim(npld)[1]>0) { + npl <- tapply(c(1:dim(npld)[1]),as.factor(npld$chr),function(ii) {df <- npld[ii,]; df$chr <- NULL; return(df) }); + } else { + npl <- list(); + } + } else { + if(is.null(threshold)) { + thr <- list(root=min.thr,type="minimal"); + } else { + thr <- list(root=threshold,type="user specified"); + } + + cat("calling binding positions using",thr$type,"threshold (",thr$root,") :\n"); + npl <- window.call.mirror.binding(tvl=tvl,min.thr=thr$root,bg.tl=bg.tl, debug=debug, ...); + if(!is.null(bg.tl) & tec.filter) { + if(debug) { cat("excluding systematic background anomalies ... 
"); } + npl <- filter.binding.sites(npl,tec,exclude=T); + if(debug) { cat("done\n"); } + } + + if(!is.null(topN)) { + # determine threshold based on topN peaks + ay <- unlist(lapply(npl,function(d) d$y)); + if(length(ay)>topN) { + thr <- list(root=sort(ay,decreasing=T)[topN],type="topN",topN=topN); + cat(paste("determined topN threshold :",thr$root,"\n")); + npl <- lapply(npl,function(d) d[d$y>thr$root,]); + } + } + } + + if(return.core.data) { + return(c(list(npl=npl,thr=thr),core.data)); + } + if(return.control.predictions & !is.null(control.predictions)) { + return(list(npl=npl,thr=thr,control.predictions=control.predictions)); + } + return(list(npl=npl,thr=thr)); +} + +# window tag difference method +wtd <- function(x,y,s,e,whs=200,return.peaks=T,min.thr=5,min.dist=200,step=1,direct.count=F,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=1,mask.x=NULL,mask.y=NULL,ignore.masking=F, bg.whs=whs, round.up=F, ...) { + ignore.masking <- ignore.masking | (is.null(mask.x) & is.null(mask.y)); + if(step>1) { + x <- floor(x/step+0.5); y <- floor(y/step+0.5) + + if(!is.null(bg.x)) { + bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) + } + + if(!is.null(mask.x)) { + mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) + } + + + whs <- floor(whs/step+0.5); + bg.whs <- floor(bg.whs/step+0.5); + min.dist <- floor(min.dist/step +0.5); + s <- floor(s/step+0.5) + e <- floor(e/step+0.5) + } + + # scale bg.weight, since within calculation they are considered independent + bg.weight <- bg.weight*tag.weight; + + rx <- c(s-whs,e+whs); + + # compile tag vectors + xt <- table(x); + xh <- integer(diff(rx)+1); + xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); + + yt <- table(y); + yh <- integer(diff(rx)+1); + yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); + + # compile background vectors + if(!is.null(bg.x) & length(bg.x)>0) { + bg.subtract <- 1; + + bg.xt <- table(bg.x); + bg.xh <- integer(diff(rx)+1); + bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- 
as.integer(bg.xt); + rm(bg.xt); + + bg.yt <- table(bg.y); + bg.yh <- integer(diff(rx)+1); + bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); + rm(bg.yt); + + # adjust bg.weight according to bg.whs + if(bg.whs!=whs) { + bg.weight <- bg.weight*whs/bg.whs; + } + } else { + bg.subtract <- 0; + bg.xh <- bg.yh <- c(); + } + + # record masked positions + if(!ignore.masking) { + if(!is.null(mask.x) & length(mask.x)>0) { + mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); + mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; + xh[mvx-rx[1]+1] <- -1; + } + + if(!is.null(mask.y) & length(mask.y)>0) { + mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); + mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; + yh[mvy-rx[1]+1] <- -1; + } + } + + rm(xt,yt); + + if(round.up) { round.up <- 1; } else { round.up <- 0; } + + storage.mode(xh) <- storage.mode(yh) <- "integer"; + storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; + nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(bg.whs) <- "integer"; + rp <- as.integer(return.peaks); + dcon <- as.integer(direct.count); + storage.mode(rp) <- storage.mode(min.dist) <- "integer"; + storage.mode(min.thr) <- "double"; + storage.mode(dcon) <- "integer"; + storage.mode(tag.weight) <- "double"; + storage.mode(bg.weight) <- "double"; + storage.mode(bg.subtract) <- "integer"; + storage.mode(round.up) <- "integer"; + im <- as.integer(ignore.masking); + storage.mode(im) <- "integer"; + z <- .Call("wtd",xh,yh,whs,rp,min.dist,min.thr,dcon,tag.weight,im,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up); + if(return.peaks) { + return(data.frame(x=(z$x+rx[1])*step,y=z$v)); + } else { + return(list(x=rx*step,y=z)); + } +} + + +tag.wtd <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) 
{ + x <- ctv[ctv>=s & ctv<=e]; + y <- (-1)*ctv[ctv<=-s & ctv>=-e]; + + if(!is.null(bg.ctv)) { + bg.x <- bg.ctv[bg.ctv>=s & bg.ctv<=e]; + bg.y <- (-1)*bg.ctv[bg.ctv<=-s & bg.ctv>=-e]; + } else { + bg.x <- bg.y <- NULL; + } + + if(!is.null(mask.ctv)) { + mask.x <- mask.ctv[mask.ctv>=s & mask.ctv<=e]; + mask.y <- (-1)*mask.ctv[mask.ctv<=-s & mask.ctv>=-e]; + } else { + mask.x <- mask.y <- NULL; + } + + if(length(x)==0 | length(y) ==0) { + if(return.peaks) { + return(data.frame(x=c(),y=c())); + } else { + rx <- range(c(x,y)); + return(list(x=rx,y=numeric(diff(rx)+1))); + } + } else { + return(wtd(x,y,s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...)) + } +} + +# shuffles tags in chromosome blocks of a specified size +# note: all coordinates should be positive +tag.block.shuffle <- function(tags,window.size=100) { + if(length(tags)<3) { + warning("too few tags for shuffling"); + return(tags); + } + rng <- range(tags); + #if(rng[1]<0) { stop("negative tag coordinates found") } + if(diff(rng)<=window.size) { + warning(paste("tag range (",diff(rng),") is smaller than shuffle window size")); + return(tags); + } + + if(window.size==0) { + return(as.integer(runif(length(tags),min=rng[1],max=rng[2]))) + } else if(window.size==1) { + tt <- table(tags); + return(rep(runif(length(tt),min=rng[1],max=rng[2]),as.integer(tt))) + } else { + # block positions + bp <- tags %/% window.size; + # block-relative tag positions + rp <- tags %% window.size; + + # shuffle block positions + bpu <- unique(bp); + rbp <- range(bpu); + bps <- as.integer(runif(length(bpu),min=rbp[1],max=rbp[2])); + bpi <- match(bp,bpu); + sbp <- bps[bpi]; + #sbp <- rbp[1]+match(bp,sample(rbp[1]:rbp[2])) + return(sbp*window.size+rp); + } +} + + +# calculate window cross-correlation +lwcc <- function(x,y,s,e,whs=100,isize=20,return.peaks=T,min.thr=1,min.dist=100,step=1,tag.weight=1,bg.x=NULL,bg.y=NULL,bg.weight=NULL,mask.x=NULL,mask.y=NULL,bg.whs=whs,round.up=F) { + if(step>1) { + x 
<- floor(x/step+0.5); y <- floor(y/step+0.5) + + if(!is.null(bg.x)) { + bg.x <- floor(bg.x/step+0.5); bg.y <- floor(bg.y/step+0.5) + } + + if(!is.null(mask.x)) { + mask.x <- floor(mask.x/step+0.5); mask.y <- floor(mask.y/step+0.5) + } + + whs <- floor(whs/step+0.5); + bg.whs <- floor(bg.whs/step+0.5); + isize <- floor(isize/step+0.5); + min.dist <- floor(min.dist/step +0.5); + s <- floor(s/step+0.5) + e <- floor(e/step+0.5) + } + + # scale bg.weight, since within calculation they are considered independent + bg.weight <- bg.weight*tag.weight; + + + rx <- c(s-whs,e+whs); + xt <- table(x); + xh <- integer(diff(rx)+1); + xh[as.integer(names(xt))-rx[1]+1] <- as.integer(xt); + + yt <- table(y); + + yh <- integer(diff(rx)+1); + yh[as.integer(names(yt))-rx[1]+1] <- as.integer(yt); + + # compile background vectors + if(!is.null(bg.x) & length(bg.x)>0) { + bg.subtract <- 1; + + bg.xt <- table(bg.x); + bg.xh <- integer(diff(rx)+1); + bg.xh[as.integer(names(bg.xt))-rx[1]+1] <- as.integer(bg.xt); + rm(bg.xt); + + bg.yt <- table(bg.y); + bg.yh <- integer(diff(rx)+1); + bg.yh[as.integer(names(bg.yt))-rx[1]+1] <- as.integer(bg.yt); + rm(bg.yt); + + # adjust bg.weight according to bg.whs + bg.weight <- bg.weight*(whs-isize)/bg.whs; + } else { + bg.subtract <- 0; + bg.xh <- bg.yh <- c(); + } + + # record masked positions + if(!is.null(mask.x) & length(mask.x)>0) { + mvx <- unique(mask.x); mvx <- setdiff(mvx,as.numeric(names(xt))); + mvx <- mvx[mvx>=rx[1] & mvx<=rx[2]]; + + xh[mvx-rx[1]+1] <- -1; + } + + if(!is.null(mask.y) & length(mask.y)>0) { + mvy <- unique(mask.y); mvy <- setdiff(mvy,as.numeric(names(yt))); + mvy <- mvy[mvy>=rx[1] & mvy<=rx[2]]; + yh[mvy-rx[1]+1] <- -1; + } + + rm(xt,yt); + if(round.up) { round.up <- 1; } else { round.up <- 0; } + + storage.mode(xh) <- storage.mode(yh) <- "integer"; + storage.mode(bg.xh) <- storage.mode(bg.yh) <- "integer"; + nx <- length(xh); storage.mode(nx) <- storage.mode(whs) <- storage.mode(isize) <- storage.mode(bg.whs) <- "integer"; + 
rp <- as.integer(return.peaks); + storage.mode(rp) <- storage.mode(min.dist) <- "integer"; + storage.mode(min.thr) <- "double"; + storage.mode(tag.weight) <- "double"; + storage.mode(bg.weight) <- "double"; + storage.mode(bg.subtract) <- "integer"; + storage.mode(round.up) <- "integer"; + + # allocate return arrays + #cc <- numeric(nx); storage.mode(cc) <- "double"; + z <- .Call("lwcc",xh,yh,whs,isize,rp,min.dist,min.thr,tag.weight,bg.subtract,bg.xh,bg.yh,bg.whs,bg.weight,round.up); + if(return.peaks) { + return(data.frame(x=(z$x+rx[1])*step,y=z$v)); + } else { + return(list(x=rx*step,y=z)); + } +} + + +tag.lwcc <- function(ctv,s,e,return.peaks=T, bg.ctv=NULL, mask.ctv=NULL, ...) { + x <- ctv[ctv>=s & ctv<=e]; + y <- (-1)*ctv[ctv<=-s & ctv>=-e]; + + if(!is.null(bg.ctv)) { + bg.x <- bg.ctv[bg.ctv>=s & bg.ctv<=e]; + bg.y <- (-1)*bg.ctv[bg.ctv<=-s & bg.ctv>=-e]; + } else { + bg.x <- bg.y <- NULL; + } + + if(!is.null(mask.ctv)) { + mask.x <- mask.ctv[mask.ctv>=s & mask.ctv<=e]; + mask.y <- (-1)*mask.ctv[mask.ctv<=-s & mask.ctv>=-e]; + } else { + mask.x <- mask.y <- NULL; + } + + if(length(x)==0 | length(y) ==0) { + if(return.peaks) { + return(data.frame(x=c(),y=c())); + } else { + rx <- range(c(x,y)); + return(list(x=rx,y=numeric(diff(rx)+1))); + } + } else { + return(lwcc(x,y, s,e,return.peaks=return.peaks, bg.x=bg.x,bg.y=bg.y, mask.x=mask.x,mask.y=mask.y, ...)) + } +} + +# determine mirror-based binding positions using sliding window along each chromosome +# extra parameters are passed on to call.nucleosomes() +window.call.mirror.binding <- function(tvl,window.size=4e7, debug=T, cluster=NULL, bg.tl=NULL, mask.tl=NULL, background.density.scaling=T, ...) 
{ + chrl <- names(tvl); + # determine bg.weight + if(!is.null(bg.tl)) { + bg.weight <- dataset.density.ratio(tvl,bg.tl,background.density.scaling=background.density.scaling); + } else { + bg.weight <- NULL; + } + if(debug) { + cat("bg.weight=",bg.weight," "); + } + + names(chrl) <- chrl; + + if(is.null(cluster)) { + return(lapply(chrl,function(chr) { + bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; + mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; + + window.chr.call.mirror.binding(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv),window.size=window.size,chr=chr,debug=debug, bg.weight=bg.weight, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ...); + })); + } else { + # add bg.ctv and mask.ctv to parallel call + tvll <- lapply(chrl,function(chr) { + bg.ctv <- NULL; if(!is.null(bg.tl)) { bg.ctv <- bg.tl[[chr]]; }; + mask.ctv <- NULL; if(!is.null(mask.tl)) { mask.ctv <- mask.tl[[chr]]; }; + return(list(ctv=tvl[[chr]],bg.ctv=bg.ctv,mask.ctv=mask.ctv)) + }); + bl <- clusterApplyLB(cluster,tvll,window.chr.call.mirror.binding,window.size=window.size,debug=debug, bg.weight=bg.weight, ...); + names(bl) <- chrl; + return(bl); + } +} + +window.chr.call.mirror.binding <- function(ctvl,window.size,debug=T, chr="NA", cluster=NULL, method=tag.wtd, bg.ctv=NULL, mask.ctv=NULL, ...) { + ctv <- ctvl$ctv; bg.ctv <- ctvl$bg.ctv; mask.ctv <- ctvl$mask.ctv; + if(is.null(ctv)) { return(data.frame(x=c(),y=c())) } + if(length(ctv)<2) { return(data.frame(x=c(),y=c())) } + + dr <- range(unlist(lapply(ctv,function(x) range(abs(x))))) + n.windows <- ceiling(diff(dr)/window.size); + + + pinfo <- c(); + if(debug) { + cat(paste("processing ",chr," in ",n.windows," steps [",sep="")); + } + for(i in 1:n.windows) { + s <- dr[1]+(i-1)*window.size; + npn <- method(s=s, e=s+window.size,ctv=ctv, return.peaks=T, bg.ctv=bg.ctv, mask.ctv=mask.ctv, ... 
); + if(length(npn) > 0) { pinfo <- rbind(pinfo,npn) } + if(debug) { + cat("."); + } + } + if(debug) { + cat(paste("] done (",dim(pinfo)[1],"positions)\n")); + } else { + cat("."); + } + return(data.frame(x=pinfo[,1],y=pinfo[,2])); +} + +generate.randomized.data <- function(data,shuffle.window=1,shuffle.both.strands=T,strand.shuffle.only=F,chrl=names(data)) { + names(chrl) <- unlist(chrl); + if(strand.shuffle.only) { + # shuffle just strand assignment, not tag positions + rt <- lapply(data[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T)); + } else { + if(shuffle.both.strands) { + rt <- lapply(data[unlist(chrl)],function(tv) { + pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tag.block.shuffle(tv[-pti],window.size=shuffle.window))) + }); + } else { + rt <- lapply(data[unlist(chrl)],function(tv) { pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tv[-pti]))}); + } + } +} + +# determine threshold based on E value +# for efficiency chrl should include just one or two small chromosomes +# optional parameters are passed to call.nucleosomes() +determine.lwcc.threshold <- function(tvl,chrl=names(tvl),e.value=100, n.randomizations=1, min.thr=1, debug=F, tol=1e-2, shuffle.window=1, shuffle.both.strands=T, return.rtp=F, control=NULL, strand.shuffle=F, ...) 
{ + names(chrl) <- unlist(chrl); + + # determine fraction of total tags contained in the specified nucleosomes + ntags <- sum(unlist(lapply(tvl,function(cv) length(cv)))); + nctags <- sum(unlist(lapply(chrl, function(cn) length(tvl[[cn]])))); + # calculate actual target E value + if(!is.null(control)) { + n.randomizations <- length(control); + } + eval <- e.value*n.randomizations*nctags/ntags + if(eval<1) { + warning("specified e.value and set of chromosomes results in target e.value of less than 1"); + eval <- 1; + } + + if(debug) { + cat(paste("randomizations =",n.randomizations," chromosomes =",length(chrl),"\n")) + cat(paste("adjusted target eval =",eval,"\ngenerating randomized tag peaks ...")); + } + + # get peaks on randomized tags + if(is.null(control)) { + rtp <- data.frame(do.call(rbind,lapply(1:n.randomizations,function(i) { + if(strand.shuffle) { + # shuffle just strand assignment, not tag positions + rt <- lapply(tvl[unlist(chrl)],function(tv) tv*sample(c(-1,1),length(tv),replace=T)); + } else { + if(shuffle.both.strands) { + rt <- lapply(tvl[unlist(chrl)],function(tv) { + pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tag.block.shuffle(tv[-pti],window.size=shuffle.window))) + }); + } else { + rt <- lapply(tvl[unlist(chrl)],function(tv) { pti <- which(tv>0); return(c(tag.block.shuffle(tv[pti],window.size=shuffle.window),tv[-pti]))}); + } + } + if(debug) { + cat("."); + } + rl <- window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...); + + return(do.call(rbind,rl)) + #return(do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, whs=100,isize=10,window.size=3e7,min.dist=200))) + }))); + + } else { + if(debug) { + cat(" using provided controls "); + } + rtp <- data.frame(do.call(rbind,lapply(control,function(rt) do.call(rbind,window.call.mirror.binding(rt,min.thr=min.thr, debug=F, ...))))) + } + + if(return.rtp) { + return(rtp) + } + + if(debug) { + cat(" done\nfinding threshold ."); + } + + # 
determine range and starting value + rng <- c(min.thr,max(na.omit(rtp$y))) + + # find E value threshold + count.nucs.f <- function(nthr) { + return(eval-length(which(rtp$y>=nthr))); + } + + # estimate position of the root by downward bisection iterations + mv <- c(eval); mvp <- c(rng[2]); ni <- 1; + max.it <- 2*as.integer(log2(rng[2]/rng[1])+0.5); + while((ni<=max.it) & (mv[1]>=0)) { + np <- mvp[1]/2; + npv <- count.nucs.f(np); + mv <- c(npv,mv); + mvp <- c(np,mvp); + ni <- ni+1; + } + + + if(ni>max.it) { + # determine lowest value + if(debug) { + cat(paste("exceeded max.it (",max.it,"), returning lowest point",signif(mvp[1],4))); + } + return(list(root=mvp[1])) + } else { + rng <- mvp[1:2]; + if(mv[2]==0) rng[2] <- mvp[3]; + if(debug) { + cat(paste("bound to (",signif(rng[1],4),signif(rng[2],4),") ")); + } + } + + # find root on the right side + x <- uniroot(count.nucs.f,rng,tol=tol); + #x$max <- o$par; + #x$f.max <- (-1)*o$value; + if(debug) { + cat(paste(" done (thr=",signif(x$root,4),")\n")); + } + return(x); + +} + + +# determine membership of points in fragments +points.within <- function(x,fs,fe,return.list=F,return.unique=F,sorted=F,return.point.counts=F) { + if(is.null(x) | length(x) < 1) { return(c()) }; + if(!sorted) { + ox <- rank(x,ties="first"); + x <- sort(x); + } + + se <- c(fs,fe); + fi <- seq(1:length(fs)); + fi <- c(fi,-1*fi); + + fi <- fi[order(se)]; + se <- sort(se); + + storage.mode(x) <- storage.mode(fi) <- storage.mode(se) <- "integer"; + if(return.unique) { iu <- 1; } else { iu <- 0; } + if(return.list) { il <- 1; } else { il <- 0; } + if(return.point.counts) { rpc <- 1; } else { rpc <- 0; } + storage.mode(iu) <- storage.mode(il) <- storage.mode(rpc) <- "integer"; + result <- .Call("points_within",x,se,fi,il,iu,rpc); + if(!sorted & !return.point.counts) { + result <- result[ox]; + } + return(result); +} + + +# determine cooridnates of points x relative to signed +# positions pos within size range +get.relative.coordinates <- 
function(x,pos,size,sorted=F) { + if(!sorted) { + op <- order(abs(pos)); + x <- sort(x); pos <- pos[op]; + } + #dyn.load("~/zhao/sc/peaks.so"); + storage.mode(x) <- storage.mode(pos) <- storage.mode(size) <- "integer"; + rf <- .Call("get_relative_coordinates",x,pos,size); + if(!sorted) { + rf$i <- op[rf$i]; + } else { + return(rf$i); + } + return(rf); +} + +# given list of magnitude values for signal(x) and control (y), +# return a dataframe with $e.val and $fdr +get.eval.fdr.vectors <- function(x,y) { + nx <- length(x); ny <- length(y); + if(nx==0) { return(data.frame(evalue=c(),fdr=c())) } + if(ny==0) { return(data.frame(evalue=rep(0,nx),fdr=rep(1,nx))) } + ex <- ecdf(x); ey <- ecdf(y); + + evals <- (1-ey(x))*ny; + yvals <- (1-ex(x))*nx; + fdr <- (evals+0.5)/(yvals+0.5); # with pseudo-counts + fdr[yvals==0] <- min(fdr); # correct for undercounts + # find a min x corresponding to a minimal FDR + mfdr <- min(fdr); + mfdrmx <- min(x[fdr==mfdr]); + # correct + fdr[x>=mfdrmx] <- mfdr; + return(data.frame(evalue=(evals+1),fdr=fdr)); +} + + +# filter predictions to remove calls failling into the tag enrichment clusters ( chr list of $s/$e dfs) +filter.binding.sites <- function(bd,tec,exclude=F) { + chrl <- names(bd); names(chrl) <- chrl; + lapply(chrl,function(chr) { + cbd <- bd[[chr]]; + if(is.null(cbd)) { return(NULL) }; + if(length(cbd)==0) { return(NULL) }; + if(dim(cbd)[1]>0) { + ctec <- tec[[chr]]; + if(length(ctec$s)>0) { + if(exclude) { + pwi <- which(points.within(cbd$x,ctec$s,ctec$e)== -1); + } else { + pwi <- which(points.within(cbd$x,ctec$s,ctec$e)> -1); + } + return(cbd[pwi,]); + } else { + if(exclude) { + return(cbd); + } else { + return(data.frame(x=c(),y=c())); + } + } + } else { + return(cbd); + } + }); +} + + +# PUBLIC +# generate predictions on sequential (chained) subsamples of data +# if step.size <1, it is intepreted as a fraciton and a each subsequent subsample +# is of a size (1-fraction.step)*N (N - size of the signal data); +# otherwise the 
# step.size is interpreted as a number of tags, and each subsample is of the size N-step.size
#
# Arguments:
#   signal.data, control.data - lists of tag vectors, indexed by chromosome name
#                               (control.data may be NULL)
#   n.steps          - number of subsampling steps; when NULL it is derived from step.size
#   step.size        - fraction (<1) of tags removed at each step, or absolute number of tags
#   subsample.control- if TRUE, control is first resampled to the signal size and then
#                      subsampled together with the signal at every step
#   min.ntags        - the chain stops once the signal falls below this many tags
#   excluded.steps   - step indices (0-based) for which no calls are generated
#   test.chromosomes - optional subset of chromosomes to restrict the analysis to
#   ...              - passed on to find.binding.positions()
# Returns a list of find.binding.positions() results named by step index ("0" is the full data).
get.subsample.chain.calls <- function(signal.data,control.data,n.steps=NULL,step.size=1e6,subsample.control=FALSE,debug=FALSE,min.ntags=1e3, excluded.steps=c(), test.chromosomes=NULL, ... ) {

  # total number of tags over all chromosomes of a dataset
  count.tags <- function(tl) sum(unlist(lapply(tl, length)));

  # Index-based sampling. sample(d, size) treats a single positive number d as 1:d,
  # which would replace the only tag of a chromosome by random coordinates; going through
  # sample.int() avoids that while drawing the same random numbers in all other cases.
  draw.tags <- function(d, size, replace=FALSE) {
    d[sample.int(length(d), size, replace=replace)];
  }

  if(!is.null(test.chromosomes)) {
    # restrict to the requested chromosomes and scale an absolute step size
    # by the fraction of tags that was retained
    sz <- count.tags(signal.data);
    signal.data <- signal.data[test.chromosomes];
    control.data <- control.data[test.chromosomes];

    if(step.size>1) {
      step.size <- step.size*count.tags(signal.data)/sz;
      # cat("adjusted step.size=",step.size,"\n");
    }
  }

  if(is.null(n.steps)) {
    if(step.size<1) {
      # down to 10%: every step keeps a fraction (1-step.size) of the tags,
      # so k steps leave (1-step.size)^k of the data
      keep <- 1-step.size;
      if(keep<1) {
        n.steps <- log(0.1)/log(keep);
      } else {
        # nothing is removed per step; only the full dataset is processed
        n.steps <- 0;
      }
    } else {
      n.steps <- floor(count.tags(signal.data)/step.size);
    }
  }

  if(subsample.control && !is.null(control.data)) {
    # normalize control to the signal size
    if(debug) { cat("pre-subsampling control.\n"); }
    bg.weight <- count.tags(signal.data)/count.tags(control.data);
    control.data <- lapply(control.data,function(d) draw.tags(d,length(d)*bg.weight,replace=(bg.weight>1)));
  }

  calls <- list();
  callnames <- c();
  for(i in 0:n.steps) {
    if(debug) { cat("chained subsample step",i,":\n"); }
    if(!(i %in% excluded.steps)) {
      ans <- list(find.binding.positions(signal.data=signal.data,control.data=control.data,debug=debug, skip.control.normalization=TRUE, ...));
      names(ans) <- as.character(c(i));
      calls <- c(calls,ans);
      callnames <- c(callnames,i);
    }

    # determine the fraction of tags to keep for the next step
    if(step.size<1) {
      # fraction steps
      f <- 1-step.size;
    } else {
      # bin steps
      sz <- count.tags(signal.data);
      f <- (sz-step.size)/sz;
      # stop when nothing would be left (or there were no tags to begin with)
      if(is.na(f) || f<=0) break;
    }
    if(debug) { cat("chained subsampling using fraction",f,".\n"); }
    signal.data <- lapply(signal.data,function(d) draw.tags(d,length(d)*f));
    if(subsample.control && !is.null(control.data)) {
      control.data <- lapply(control.data,function(d) draw.tags(d,length(d)*f));
    }
    sz <- count.tags(signal.data);
    if(sz<min.ntags) break;
  }
  names(calls) <- callnames;
  return(calls);
}


# chain-subsample dataset and calculate MSER interpolation
#
# Arguments:
#   chains   - list of subsample chains as returned by get.subsample.chain.calls();
#              when NULL, n.chains chains are generated from signal.data/control.data
#              (extra arguments in ... are forwarded to get.subsample.chain.calls())
#   enrichment.background.scales - one result is produced per scale; for scale>1 the
#              enrichment field used is paste(enr.field,scale,sep=".")
#   test.agreement, agreement.distance - passed to t.find.min.saturated.enr() (as
#              thr=1-test.agreement) and t.precalculate.ref.peak.agreement() respectively
#   return.median, mean.trim - summarize per-size values by median, or by trimmed mean
#   return.lists - if TRUE, return the per-chain data frames (n, me, nd) without summarizing
# Returns a list (named by background scale) of data frames with columns n and me,
# ordered by decreasing n.
# NOTE(review): each chain element is expected to carry $n and $npl - presumably the
# output of find.binding.positions(); confirm against that function.
mser.chain.interpolation <- function(signal.data=NULL,control.data=NULL,chains=NULL,n.chains=5,debug=FALSE, enrichment.background.scales=c(1,5), test.agreement=0.99, agreement.distance=50, return.median=FALSE, mean.trim=0.1, enr.field="enr", return.lists=FALSE, ...) {
  if(is.null(chains)) {
    cn <- seq_len(n.chains); names(cn) <- cn;
    tf <- function(i, ...) get.subsample.chain.calls(signal.data,control.data,debug=debug, enrichment.background.scales=enrichment.background.scales, ...);
    chains <- lapply(cn,tf,...);
  }
  names(enrichment.background.scales) <- enrichment.background.scales;
  lapply(enrichment.background.scales,function(scale) {
    actual.enr.field <- enr.field;
    if(scale>1) {
      actual.enr.field <- paste(actual.enr.field,scale,sep=".");
    }

    # smallest non-NA value of the enrichment field over all chromosomes of one call set
    min.enr.of <- function(step) {
      min(na.omit(unlist(lapply(step$npl,function(d) d[actual.enr.field]))));
    }

    cvl <- lapply(chains,function(chain) {
      nn <- sort(unlist(lapply(chain,function(d) d$n)),decreasing=TRUE);
      nd <- diff(nn);
      nn <- nn[-length(nn)];
      # consecutive pairs (i-1,i); empty for chains with fewer than two steps
      # (2:length(chain) would count backwards and index out of bounds)
      pair.idx <- seq_len(max(length(chain)-1,0))+1;
      me <- lapply(pair.idx,function(i) {
        sla <- t.precalculate.ref.peak.agreement(chain[[i-1]],chain[i],agreement.distance=agreement.distance,enr.field=actual.enr.field)
        me <- t.find.min.saturated.enr(sla,thr=1-test.agreement)
        menr <- max(min.enr.of(chain[[i-1]]),min.enr.of(chain[[i]]),1)
        if(me<=menr) { me <- 1; };
        return(me);
      })
      me <- unlist(me);
      if(is.null(me)) { me <- numeric(0); }
      data.frame(n=nn,me=me,nd=nd);
    });
    if(return.lists) { return(cvl) }
    cvl <- na.omit(do.call(rbind,cvl));
    if(return.median) {
      tv <- tapply(cvl$me,as.factor(cvl$n),median)
    } else {
      tv <- tapply(cvl$me,as.factor(cvl$n),mean,trim=mean.trim);
    }
    df <- data.frame(n=as.numeric(names(tv)),me=as.numeric(tv));
    return(df[order(df$n,decreasing=TRUE),])
  })
}



# returns agreement as a function
# of dataset size, possibly filtering peaks by min.enr threshold, and by max.fdr
#
# chains             - list of subsampling chains; each chain is a list of call sets carrying
#                      $n and $npl (per-chromosome peak data frames). The first element of each
#                      chain is used as the reference.
# min.enr            - optional enrichment threshold applied to reference ($re) and sample ($oe) peaks
# max.fdr            - optional FDR cutoff applied to the $fdr column of every peak table
# return.median      - aggregate over chains with median instead of trimmed mean (mean.trim)
# debug              - accepted for interface compatibility; not used by this function
#
# returns: data frame with columns $n and $me (agreement), sorted by decreasing $n
chain.to.reference.comparison <- function(chains,min.enr=NULL,debug=FALSE,agreement.distance=50, return.median=FALSE, mean.trim=0.1, enr.field="enr",max.fdr=NULL) {
  cvl <- lapply(chains, function(chain) {
    # filter chain by fdr; which() drops peaks with missing fdr instead of
    # turning them into all-NA rows
    if (!is.null(max.fdr)) {
      chain <- lapply(chain, function(d) {
        d$npl <- lapply(d$npl, function(cd) cd[which(cd$fdr <= max.fdr), ])
        d
      })
    }
    nn <- sort(unlist(lapply(chain, function(d) d$n)), decreasing = TRUE)
    nn <- nn[-length(nn)]
    # seq_along()[-1] is empty for a one-element chain (2:length(chain) would count down)
    me <- lapply(seq_along(chain)[-1], function(i) {
      sla <- t.precalculate.ref.peak.agreement(chain[[1]], chain[i],
                                               agreement.distance = agreement.distance,
                                               enr.field = enr.field)
      # calculate overlap
      lapply(sla, function(mpd) {
        if (!is.null(min.enr)) {
          # keep only reference peaks passing the threshold
          keep <- mpd$re >= min.enr
          keep[is.na(keep)] <- FALSE
          mpd <- mpd[keep, ]
          # a match to a sample peak below the threshold does not count as overlap
          weak <- mpd$oe < min.enr
          weak[is.na(weak)] <- TRUE
          mpd$ov[weak] <- 0
        }
        mean(mpd$ov)
      })
    })
    data.frame(n = nn, me = as.numeric(unlist(me)))
  })

  cvl <- na.omit(do.call(rbind, cvl))
  if (return.median) {
    tv <- tapply(cvl$me, as.factor(cvl$n), median)
  } else {
    tv <- tapply(cvl$me, as.factor(cvl$n), mean, trim = mean.trim)
  }
  df <- data.frame(n = as.numeric(names(tv)), me = as.numeric(tv))
  return(df[order(df$n, decreasing = TRUE), ])
}


# estimates enrichment confidence interval based on 2*tag.count.whs window around each position, and a z-score (alpha/2)
# the enrichment is additionally estimated for every entry of background.scales (fold increase of the
# background tag window); fields for scales > 1 get a ".<scale>" suffix
# adds $enr (lower bound), $enr.ub (upper bound) and $enr.mle fields
# (with control data, also $nbgt - the background tag count)
#
# binding.positions  - structure with $npl: per-chromosome data frames with $x (position) and $nt (tag count)
# fraction           - fraction of signal.data that the binding positions were determined from
# scale.down.control - randomly subsample the control to bg.weight*fraction of its size
# bg.weight          - signal/control size ratio; defaults to the ratio of total tag counts
calculate.enrichment.estimates <- function(binding.positions,signal.data=NULL,control.data=NULL,fraction=1,tag.count.whs=100,z=2,effective.genome.size=3e9,scale.down.control=FALSE,background.scales=c(1),bg.weight=NULL) {
  f <- fraction
  qv <- pnorm(z, lower.tail = FALSE)
  cn <- names(binding.positions$npl); names(cn) <- cn

  if (is.null(control.data)) {
    # estimate from gamma distribution
    fg.lambda <- f*sum(unlist(lapply(signal.data, length)))*2*tag.count.whs/effective.genome.size
    binding.positions$npl <- lapply(binding.positions$npl, function(d) {
      d$enr <- qgamma(qv, d$nt, scale = 1)/fg.lambda
      d$enr.ub <- qgamma(1 - qv, d$nt, scale = 1)/fg.lambda
      d$enr.mle <- d$nt/fg.lambda
      return(d)
    })
  } else {
    # estimate using beta distribution
    if (is.null(bg.weight)) {
      bg.weight <- sum(unlist(lapply(signal.data, length)))/sum(unlist(lapply(control.data, length)))
    }

    if (scale.down.control) {
      # sample down control to be the same size as true signal.data (bg.weight*f)
      # index-based sampling: sample(d, ...) would draw from 1:d when a chromosome
      # holds a single positive tag
      control.data <- lapply(control.data, function(d) {
        d[sample.int(length(d), length(d)*bg.weight*f, replace = (f*bg.weight > 1))]
      })
      bg.weight <- 1/f
    }

    binding.positions$enrichment.bg.weight <- bg.weight
    binding.positions$enrichment.whs <- tag.count.whs
    binding.positions$enrichment.z <- z

    binding.positions$npl <- lapply(cn, function(chr) {
      d <- binding.positions$npl[[chr]]

      edf <- lapply(background.scales, function(background.width.multiplier) {
        sig.mult <- bg.weight*f/background.width.multiplier
        bg.whs <- tag.count.whs*background.width.multiplier
        nbg <- points.within(abs(control.data[[chr]]), d$x - bg.whs, d$x + bg.whs,
                             return.point.counts = TRUE, return.unique = FALSE)
        nfg <- d$nt

        # Poisson ratio Bayesian LB with non-informative prior (Clopper & Pearson 1934)
        ratio <- (nfg + 0.5)/(nbg + 0.5)
        nf <- ratio*qf(1 - qv, 2*(nfg + 0.5), 2*(nbg + 0.5), lower.tail = FALSE)
        nf <- nf/sig.mult

        ub <- ratio*qf(qv, 2*(nfg + 0.5), 2*(nbg + 0.5), lower.tail = FALSE)
        ub <- ub/sig.mult

        mle <- ratio/sig.mult

        # normalise NULLs so that data.frame() below always gets vectors
        if (is.null(nbg)) { nbg <- numeric(0) }
        if (is.null(nf)) { nf <- numeric(0) }
        if (is.null(ub)) { ub <- numeric(0) }
        if (is.null(mle)) { mle <- numeric(0) }
        return(data.frame(nbg = nbg, lb = nf, ub = ub, mle = mle))
      })

      # name the columns of every scale; scales > 1 get a ".<scale>" suffix
      adf <- do.call(cbind, lapply(seq_along(background.scales), function(i) {
        df <- edf[[i]]
        field.names <- c("nbgt", "enr", "enr.ub", "enr.mle")
        if (background.scales[i] > 1) {
          field.names <- paste(field.names, as.character(background.scales[i]), sep = ".")
        }
        names(df) <- field.names
        return(df)
      }))

      return(cbind(d, adf))
    })
  }

  return(binding.positions)
}


# precalculate peak agreement of a sampling list given a reference
#
# ref - reference call set ($npl: per-chromosome peak data frames with $x and the enr.field column)
# sf  - list of sample call sets with the same structure
#
# returns: for each element of sf, a data frame with one row per reference peak:
#          $ov (1 if a sample peak lies within agreement.distance, 0 otherwise),
#          $re (reference enrichment), $oe (enrichment of the matching sample peak, or NA)
t.precalculate.ref.peak.agreement <- function(ref,sf,agreement.distance=50,enr.field="enr") {
  ref <- ref$npl
  cn <- names(ref); names(cn) <- cn

  # for each sampling round
  lapply(sf, function(smp) {
    # calculate overlap
    data.frame(do.call(rbind, lapply(cn, function(chr) {
      ref.peaks <- ref[[chr]]
      if (is.null(ref.peaks) || nrow(ref.peaks) < 1) { return(NULL) }
      renr <- ref.peaks[, enr.field]

      smp.peaks <- smp$npl[[chr]]
      if (is.null(smp.peaks) || nrow(smp.peaks) < 1) {
        # chromosome has no peaks in this sample: nothing overlaps
        return(cbind(ov = 0L, re = renr, oe = NA))
      }

      pwi <- points.within(ref.peaks$x, smp.peaks$x - agreement.distance, smp.peaks$x + agreement.distance)
      pwi[pwi == -1] <- NA   # -1 is treated as "no sample peak within range"
      oenr <- smp.peaks[, enr.field][pwi]
      if (length(oenr) == 0) { oenr <- rep(NA, length(renr)) }
      return(cbind(ov = as.integer(!is.na(pwi)), re = renr, oe = oenr))
    })))
  })
}


# find minimal saturated enrichment given a list of replicate agreement matrices (for one fraction)
#
# pal - list of data frames with $ov, $re, $oe (as returned by t.precalculate.ref.peak.agreement)
# thr - tolerated fraction of non-overlapping positions (required agreement is 1-thr)
#
# returns: the combined-sample MSER by default; per-replicate MSERs if return.vector=T, their
#          median if return.median=T, or the per-replicate average number of reference peaks
#          at or above the MSER if return.number.of.peaks=T
#
# NOTE(review): when no enrichment level reaches the 1-thr agreement, max(which(...)) is taken
# over an empty set (warning, -Inf index); behaviour in that case is left unchanged here.
t.find.min.saturated.enr <- function(pal,thr=0.01,plot=FALSE,return.number.of.peaks=FALSE,plot.individual=TRUE,return.median=FALSE,return.vector=FALSE) {
  nr <- length(pal)
  # merge replicate data frames
  mpd <- data.frame(do.call(rbind, pal))

  mpd$re[is.na(mpd$re)] <- Inf
  mpd$oe[is.na(mpd$oe)] <- Inf

  # round up values to avoid miscounting
  mpd$re <- round(mpd$re, digits = 2)
  mpd$oe <- round(mpd$oe, digits = 2)

  me <- pmin(mpd$re, mpd$oe)
  ome <- order(me, decreasing = TRUE)
  df <- data.frame(me = me[ome], ov = mpd$ov[ome])
  recdf <- ecdf(-mpd$re); ren <- length(mpd$re)

  # collapse equal peak heights
  xk <- tapply(df$ov, as.factor(df$me), sum)
  xk <- data.frame(ov = as.numeric(xk), me = as.numeric(names(xk)))
  xk <- xk[order(xk$me, decreasing = TRUE), ]

  # overlapping positions relative to the number of reference peaks at or above each level
  cso <- cumsum(xk$ov)/(recdf(-xk$me)*ren)
  cso[is.na(cso)] <- 0
  cso[!is.finite(cso)] <- 0
  mv <- max(which(cso >= 1 - thr))
  menr <- xk$me[mv]

  # the same calculation for each replicate on its own (no rounding / collapsing)
  ir <- lapply(pal, function(d) {
    d$re[is.na(d$re)] <- Inf
    d$oe[is.na(d$oe)] <- Inf

    me <- pmin(d$re, d$oe)
    ome <- order(me, decreasing = TRUE)
    df <- data.frame(me = me[ome], ov = d$ov[ome])
    cso <- cumsum(df$ov)/seq_along(df$ov)
    mv <- max(which(cso >= 1 - thr))
    menr <- df$me[mv]
    return(list(df = df, menr = menr))
  })

  if (plot) {
    par(mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 0.8)
    plot(df$me, cumsum(df$ov)/seq_along(df$ov), type = 'l', ylab = "fraction of positions overlapping with reference", xlab = "minimal enrichment of binding positions", xlim = c(min(df$me), 2*menr))
    abline(h = 1 - thr, lty = 2, col = 4)
    if (plot.individual) {
      for (d in ir) {
        lines(d$df$me, cumsum(d$df$ov)/seq_along(d$df$ov), col = 8)
        # mark this replicate's own MSER (the combined value is drawn below)
        abline(v = d$menr, col = "pink", lty = 3)
      }
      lines(df$me, cumsum(df$ov)/seq_along(df$ov), col = 1)
    }
    abline(v = menr, col = 2, lty = 2)
    legend(x = "bottomright", lty = c(1,2,1,3,2), col = c(1,2,8,"pink",4), legend = c("combined samples","combined sample MSER","individual samples","individual MSERs","consistency threshold"))
  }

  if (return.number.of.peaks) {
    mpd <- data.frame(do.call(rbind, pal))
    return(length(which(!is.na(mpd$re) & mpd$re >= menr))/nr)
  } else {
    if (return.vector) {
      return(unlist(lapply(ir, function(d) d$menr)))
    }
    if (return.median) {
      return(median(unlist(lapply(ir, function(d) d$menr))))
    } else {
      return(menr)
    }
  }
}



# determine d1/d2 dataset size ratio. If background.density.scaling=F, the ratio of tag counts is returned.
# if background.density.scaling=T, regions of significant tag enrichment are masked prior to ratio calculation.
# d1, d2 - per-chromosome lists of (signed) tag positions
# min.tag.count.z, wsize, mcs - passed on to tag.enrichment.clusters() to find enriched regions
# returns: a single number - ratio of tag counts (d1/d2), counted outside of the enriched
#          regions (extended by wsize/2) of either dataset when background.density.scaling=T
dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=TRUE) {
  if (!background.density.scaling) {
    return(sum(unlist(lapply(d1, length)))/sum(unlist(lapply(d2, length))))
  }

  chrl <- intersect(names(d1), names(d2))
  if (length(chrl) < 1) { stop("the two datasets have no chromosomes in common") }
  ntc <- do.call(rbind, lapply(chrl, function(chr) {
    t1 <- abs(d1[[chr]])
    t2 <- abs(d2[[chr]])
    x1 <- tag.enrichment.clusters(t1, c(), wsize = wsize, bg.weight = 0, min.tag.count.z = min.tag.count.z, mcs = mcs, either = FALSE)
    x2 <- tag.enrichment.clusters(t2, c(), wsize = wsize, bg.weight = 0, min.tag.count.z = min.tag.count.z, mcs = mcs, either = FALSE)
    # mask regions enriched in either dataset
    ms <- c(x1$s, x2$s) - wsize/2
    me <- c(x1$e, x2$e) + wsize/2
    # count tags for which points.within() reports -1 (used here as "outside all regions")
    return(c(length(which(points.within(t1, ms, me) == -1)),
             length(which(points.within(t2, ms, me) == -1))))
  }))
  ntcs <- colSums(ntc)
  return(ntcs[1]/ntcs[2])
}

# returns effective size of the dataset based on the same logic as dataset.density.ratio
dataset.density.size <- function(d1,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=TRUE) {
  if (!background.density.scaling) {
    return(sum(unlist(lapply(d1, length))))
  }

  chrl <- names(d1)
  ntc <- lapply(chrl, function(chr) {
    tv <- abs(d1[[chr]])
    x1 <- tag.enrichment.clusters(tv, c(), wsize = wsize, bg.weight = 0, min.tag.count.z = min.tag.count.z, mcs = mcs, either = FALSE)
    return(length(which(points.within(tv, x1$s - wsize/2, x1$e + wsize/2) == -1)))
  })
  return(sum(unlist(ntc)))
}

# earlier version of dataset.density.ratio: compares background tag densities, i.e. tag counts
# outside of each dataset's own enriched regions divided by the unmasked span
old.dataset.density.ratio <- function(d1,d2,min.tag.count.z=4.3,wsize=1e3,mcs=0,background.density.scaling=TRUE) {
  if (!background.density.scaling) {
    return(sum(unlist(lapply(d1, length)))/sum(unlist(lapply(d2, length))))
  }

  # per chromosome: n - tags outside enriched regions, s - span of tag positions, m - masked length
  t.chromosome.counts <- function(tl) {
    lapply(tl, function(d) {
      x <- tag.enrichment.clusters(abs(d), c(), wsize = wsize, bg.weight = 0, min.tag.count.z = min.tag.count.z, mcs = mcs, either = FALSE)
      x$s <- x$s - wsize/2; x$e <- x$e + wsize/2
      x <- regionset.intersection.c(list(x), do.union = TRUE)
      return(c(n = length(which(points.within(abs(d), x$s, x$e) == -1)), s = diff(range(abs(d))), m = sum(x$e - x$s)))
    })
  }

  l1 <- t.chromosome.counts(d1)
  l2 <- t.chromosome.counts(d2)

  # align d2 chromosomes to the order of d1
  l2 <- data.frame(do.call(rbind, l2[names(l1)]))
  l1 <- data.frame(do.call(rbind, l1))

  # genome size
  gs <- sum(pmax(l1$s, l2$s))

  den1 <- sum(l1$n)/(gs - sum(l1$m))
  den2 <- sum(l2$n)/(gs - sum(l2$m))
  return(den1/den2)
}




# calculate cumulative density based on sum of scaled gaussian curves
# (by Michael Tolstorukov)
#
# vin - input vector; bw -- standard deviation, dw-gaussian cutoff in stdev; dout - output "density")
# output - if return.x=F vector of cumulative density values corresponding to integer positions described by range(vin)
# output - if return.x=T a data structure with $x and $y corresponding to the cumulative density
# optional match.wt.f is a function that will return weights for a tag vector
densum <- function(vin,bw=5,dw=3,match.wt.f=NULL,return.x=TRUE,from=min(vin),to=max(vin),step=1) {
  # construct vector of unique tags and their counts
  tc <- table(vin[vin >= from & vin <= to])
  pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"
  tc <- as.numeric(tc); storage.mode(tc) <- "double"
  n <- length(pos)
  # weight counts
  if (!is.null(match.wt.f)) {
    tc <- tc*match.wt.f(pos)
  }

  rng <- c(from, to)
  if (rng[1] < 0) { stop("range extends into negative values") }
  if (length(pos) > 0 && min(pos) < 0) { stop("position vector contains negative values") }

  # the compiled routine is called with integer n, bw, dw, dlength, step and double pos, tc, spos, dout
  storage.mode(n) <- storage.mode(rng) <- storage.mode(bw) <- storage.mode(dw) <- storage.mode(step) <- "integer"

  spos <- rng[1]; storage.mode(spos) <- "double"

  dlength <- floor((rng[2] - rng[1])/step) + 1   # length of output array
  if (dlength < 1) { stop("zero data range") }
  dout <- numeric(dlength); storage.mode(dout) <- "double"
  storage.mode(dlength) <- "integer"
  # take the output from the value returned by .C(): in-place modification via DUP=FALSE
  # is deprecated, so the local dout may never be updated
  dout <- .C("cdensum", n, pos, tc, spos, bw, dw, dlength, step, dout = dout)$dout

  if (return.x) {
    return(list(x = c(rng[1], rng[1] + step*(dlength - 1)), y = dout, step = step))
  } else {
    return(dout)
  }
}

# count tags within sliding window of a specified size
# vin - tag vector (positive values, pre-shifted)
# window.size/window.step - window characteristics
# tv - optional, pre-sorted, pre-trimmed tag vector
# returns: vector of counts, or (return.x=T) list with $x = c(first, last) window position, $y = counts, $step
window.tag.count <- function(vin,window.size,window.step=1,return.x=TRUE,from=min(vin)+floor(window.size/2),to=max(vin)-floor(window.size/2),tv=NULL) {
  whs <- floor(window.size/2)
  # select tags with margins
  if (is.null(tv)) {
    tv <- sort(vin[vin >= from - whs - 1 & vin <= to + whs + 1])
  }
  storage.mode(tv) <- "double"
  nsteps <- ceiling((to - from)/window.step)

  storage.mode(nsteps) <- storage.mode(window.size) <- storage.mode(window.step) <- "integer"

  spos <- from; storage.mode(spos) <- "double"

  if (nsteps < 1) { stop("zero data range") }
  dout <- .Call("cwindow_n_tags", tv, spos, window.size, window.step, nsteps)

  if (return.x) {
    return(list(x = c(from, from + (nsteps - 1)*window.step), y = dout, step = window.step))
  } else {
    return(dout)
  }
}

# count tags in windows around specified positions (pos)
# vin    - tag vector; ignored when a pre-computed position table tc is supplied
# sorted - set to T if pos is already sorted in increasing order
# returns: counts in the order of pos, or (return.x=T) data frame with $x = pos, $y = counts
window.tag.count.around <- function(vin,window.size,pos,return.x=TRUE,tc=NULL,sorted=FALSE) {
  if (is.null(tc)) {
    tc <- table(vin)
  }
  if (!sorted) {
    # ties.method="first" gives a proper integer permutation; the default (average)
    # yields fractional ranks for repeated positions, which are unsuitable as subscripts
    op <- rank(pos, ties.method = "first")
    pos <- sort(pos)
  }
  storage.mode(pos) <- "double"
  # as.numeric (not as.integer) so that non-integer positions are not truncated
  tpos <- as.numeric(names(tc)); storage.mode(tpos) <- "double"
  tc <- as.integer(tc); storage.mode(tc) <- "integer"

  whs <- floor(window.size/2)

  storage.mode(whs) <- "integer"
  twc <- .Call("cwindow_n_tags_around", tpos, tc, pos, whs)
  if (!sorted) {
    # restore the caller's ordering
    pos <- pos[op]
    twc <- twc[op]
  }
  if (return.x) {
    return(data.frame(x = pos, y = twc))
  } else {
    return(twc)
  }
}

# given a tag
# vector (signed), identify and clean up (either remove or cap) singular positions that exceed local tag density
# vin - tag vector
# cap.fold - maximal fold over enrichment over local density allowed for a single tag position, at which the tag count is capped
# eliminate.fold - max fold enrichment that, when exceeded, results in exclusion of all the tags at that position (e.g. counted as anomaly)
# z.threshold - Z-score used to determine max allowed counts
# returns: the cleaned-up tag vector, ordered by absolute position
filter.singular.positions.by.local.density <- function(tags,window.size=200,cap.fold=4,eliminate.fold=10,z.threshold=3) {
  # tabulate tag positions
  if (length(tags) < 2) { return(tags) }

  tc <- table(tags)
  pos <- as.numeric(names(tc)); storage.mode(pos) <- "double"
  tc <- as.integer(tc); storage.mode(tc) <- "integer"

  whs <- floor(window.size/2)
  storage.mode(whs) <- "integer"

  twc <- .Call("cwindow_n_tags_around", pos, tc, pos, whs)
  twc <- (twc - tc + 1)/window.size   # local density

  pv <- pnorm(z.threshold, lower.tail = FALSE)
  # exclude
  max.counts <- qpois(pv, twc*eliminate.fold, lower.tail = FALSE)
  tc[tc > max.counts] <- 0
  # cap
  max.counts <- qpois(pv, twc*cap.fold, lower.tail = FALSE)
  ivi <- which(tc > max.counts)
  tc[ivi] <- max.counts[ivi] + 1

  # reconstruct tag vector
  tv <- rep(pos, tc)
  return(tv[order(abs(tv))])
}



# calculates enrichment bounds using multiple background scales
# ft - foreground tags (pre-shifted, positive)
# bt - background tags
# fws - foreground window size
# bwsl - background window size list
# step - window step
# rng - from/to coordinates (to will be adjusted according to step)
# pos - optional positions; if given, windows are centered on pos instead of sliding over rng
# use.most.informative.scale - use, per position, the background scale with the highest
#        (for the upper bound: lowest) background density instead of combining all scales
# quick.calculation - use the closed-form approximation instead of F-distribution quantiles
#
# returns: a list with $x ($s $e $step), $lb vector and $mle vector ($ub if calculate.upper.bound=T)
mbs.enrichment.bounds <- function(ft,bt,fws,bwsl,step=1,rng=NULL,alpha=0.05,calculate.upper.bound=FALSE,bg.weight=length(ft)/length(bt),use.most.informative.scale=FALSE,quick.calculation=FALSE,pos=NULL) {
  # determine range
  if (is.null(rng)) {
    rng <- range(ft)
  }
  # foreground counts
  if (is.null(pos)) {
    fwc <- window.tag.count(ft, fws, window.step = step, from = rng[1], to = rng[2], return.x = TRUE)
  } else {
    fwc <- window.tag.count.around(ft, fws, pos, return.x = TRUE)
  }
  fwc$y <- fwc$y + 0.5

  zal <- qnorm(alpha/2, lower.tail = FALSE)

  # background counts (one vector per background window size)
  bt <- sort(bt)
  tc <- if (!is.null(pos)) table(bt) else NULL
  bgcm <- lapply(bwsl, function(bgws) {
    if (is.null(pos)) {
      window.tag.count(bt, bgws, window.step = step, from = rng[1], to = rng[2], return.x = FALSE, tv = bt) + 0.5
    } else {
      window.tag.count.around(bt, bgws, pos, return.x = FALSE, tc = tc) + 0.5
    }
  })
  rm(tc)

  # lower (upper=FALSE) or upper (upper=TRUE) enrichment bound for background counts nbg
  # taken with window size(s) bws
  t.bound <- function(nbg, bws, upper) {
    if (quick.calculation) {
      rte <- fwc$y + nbg - 0.25*zal*zal; rte[rte < 0] <- 0
      dn <- nbg - 0.25*zal*zal
      if (upper) {
        b <- (sqrt(fwc$y*nbg) + 0.5*zal*sqrt(rte))/dn
      } else {
        b <- (sqrt(fwc$y*nbg) - 0.5*zal*sqrt(rte))/dn
      }
      ivi <- which(b < 0)
      b <- b*b*bws/fws/bg.weight
      # degenerate cases are reported as no enrichment
      b[rte <= 0] <- 1
      b[dn <= 0] <- 1
      b[ivi] <- 1
      return(b)
    }
    p <- if (upper) alpha/2 else 1 - alpha/2
    return((fwc$y/nbg)*qf(p, 2*fwc$y, 2*nbg, lower.tail = FALSE)*bws/fws/bg.weight)
  }

  # maximum likelihood enrichment estimate; undefined ratios are reported as Inf
  t.mle <- function(nbg, bws) {
    x <- fwc$y/nbg*bws/fws/bg.weight
    x[is.nan(x)] <- Inf; x[is.na(x)] <- Inf
    return(x)
  }

  xinfo <- list(s = fwc$x[1], e = fwc$x[2], step = fwc$step)

  # pick most informative scale
  if (use.most.informative.scale) {
    bgm <- t(do.call(cbind, bgcm))   # scales x positions
    # background count of the selected scale at each position
    t.pick <- function(isi) c(bgm)[isi + dim(bgm)[1]*(seq_along(isi) - 1)]

    # highest background density; ties resolved deterministically to the first scale in bwsl
    isi <- max.col(t(bgm/(bwsl/fws)), ties.method = "first")
    bgc <- t.pick(isi)
    rl <- list(x = xinfo, lb = t.bound(bgc, bwsl[isi], FALSE), mle = t.mle(bgc, bwsl[isi]))

    if (calculate.upper.bound) {
      # lowest background density; ties resolved to the last scale in bwsl
      isi <- max.col(t((-bgm)/(bwsl/fws)), ties.method = "last")
      bgc <- t.pick(isi)
      # bound is computed from the counts of the scale selected here
      rl <- c(rl, list(ub = t.bound(bgc, bwsl[isi], TRUE)))
    }
    return(rl)
  }

  # otherwise combine all scales: most conservative lower bound / mle, widest upper bound
  si <- seq_along(bgcm)
  lbm <- do.call(pmin, lapply(si, function(i) t.bound(bgcm[[i]], bwsl[i], FALSE)))
  mle <- do.call(pmin, lapply(si, function(i) t.mle(bgcm[[i]], bwsl[i])))
  rl <- list(x = xinfo, lb = lbm, mle = mle)

  if (calculate.upper.bound) {
    ubm <- do.call(pmax, lapply(si, function(i) t.bound(bgcm[[i]], bwsl[i], TRUE)))
    rl <- c(rl, list(ub = ubm))
  }
  return(rl)
}

# write probe values as wiggle-track segments: one "chr start end value" row per position
# probe.length is reduced when it is not smaller than the minimal distance between consecutive positions
write.probe.wig <- function(chr,pos,val,fname,append=FALSE,feature="M",probe.length=35,header=TRUE) {
  if (length(pos) > 1) {
    min.dist <- min(diff(pos))
    if (probe.length >= min.dist) {
      probe.length <- min.dist - 1
      cat("warning: adjusted down wig segment length to", probe.length, "\n")
    }
  }
  mdat <- data.frame(chr, as.integer(pos), as.integer(pos + probe.length), val)

  if (header) {
    write(paste("track type=wiggle_0 name=\"Bed Format\" description=\"",feature,"\" visibility=dense color=200,100,0 altColor=0,100,200 priority=20",sep=""), file = fname, append = append)
    write.table(mdat, file = fname, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = " ", append = TRUE)
  } else {
    write.table(mdat, file = fname, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = " ", append = append)
  }
}

# returns intersection of multiple region sets
# each regionset needs to contain $s, $e and optional $v column
# do.union=T requests the union instead; the work is done by the compiled "region_intersection" routine
regionset.intersection.c <- function(rsl,max.val=-1,do.union=FALSE) {
  # translate into position/flag form: +1 marks a region start, -1 a region end
  rfl <- lapply(rsl, function(rs) {
    rp <- c(rs$s, rs$e)
    rf <- rep(c(1, -1), each = length(rs$s))

    ro <- order(rp)
    rp <- rp[ro]; rf <- rf[ro]
    if (!is.null(rs$v)) {
      rv <- c(rs$v, rs$v)[ro]
      return(data.frame(p = as.numeric(rp), f = as.integer(rf), v = as.numeric(rv)))
    } else {
      return(data.frame(p = as.numeric(rp), f = as.integer(rf)))
    }
  })
  # encode the index of the region set in the magnitude of the flag
  rfd <- data.frame(do.call(rbind, lapply(seq_along(rfl), function(i) {
    d <- rfl[[i]]; d$f <- d$f*i; return(d)
  })))
  rfd <- rfd[order(rfd$p), ]
  if (is.null(rfd$v)) { max.val <- 0 }
  ur <- if (do.union) 1 else 0
  rl <- .Call("region_intersection", as.integer(length(rfl)), as.numeric(rfd$p), as.integer(rfd$f), as.numeric(rfd$v), as.integer(max.val), as.integer(ur))
  return(data.frame(do.call(cbind, rl)))
}


# identify if binding peak falls within a larger region of significant tag enrichment, and if so record its boundaries
# adds $rs / $re (region start / end, NA if the peak is not within an enriched region) to each peak table in bp$npl
add.broad.peak.regions <- function(chip.tags,input.tags,bp,window.size=500,z.thr=2) {
  se <- find.significantly.enriched.regions(chip.tags, input.tags, window.size = window.size, z.thr = z.thr, poisson.z = 0, poisson.ratio = 0, either = FALSE)
  chrl <- names(bp$npl); names(chrl) <- chrl
  bnpl <- lapply(chrl, function(chr) {
    npl <- bp$npl[[chr]]
    # || short-circuits, so a NULL table is never asked for its dimensions
    if (is.null(npl) || dim(npl)[1] < 1) {
      return(npl)
    }
    pi <- points.within(npl$x, se[[chr]]$s, se[[chr]]$e, return.list = TRUE)

    # outer boundaries of all enriched regions containing the peak
    pm <- do.call(rbind, lapply(pi, function(rl) {
      if (length(rl) > 0) {
        return(range(c(se[[chr]]$s[rl], se[[chr]]$e[rl])))
      } else {
        return(c(NA, NA))
      }
    }))

    npl$rs <- pm[, 1]
    npl$re <- pm[, 2]
    return(npl)
  })
  bp$npl <- bnpl
  return(bp)
}

# writing out binding results in a narrowpeak format, incorporating broad region boundaries if they are present
# if broad region info is not present, margin is used to determine region width. The default margin is equal
# to the window half size used to call the binding peaks
# npeaks - if not NA, only that many top-scoring (by $y) peaks are written
# if there are no peaks at all, an empty file is created
write.narrowpeak.binding <- function(bd,fname,margin=bd$whs,npeaks=NA) { # Anshul: added npeaks option
  if (is.null(margin)) { margin <- 50 }
  chrl <- names(bd$npl); names(chrl) <- chrl
  # skip chromosomes without peaks (they would produce malformed rows)
  chrl <- chrl[unlist(lapply(bd$npl, function(d) length(d$x))) > 0]
  md <- do.call(rbind, lapply(chrl, function(chr) {
    df <- bd$npl[[chr]]
    x <- df$x
    rs <- df$rs; if (is.null(rs)) { rs <- rep(NA, length(x)) }
    re <- df$re; if (is.null(re)) { re <- rep(NA, length(x)) }
    ivi <- which(is.na(rs)); if (length(ivi) > 0) { rs[ivi] <- pmax(0, x[ivi] - margin) } # Anshul: added the pmax (0, ...) to avoid negative peak starts
    ivi <- which(is.na(re)); if (length(ivi) > 0) { re[ivi] <- x[ivi] + margin }
    cbind(chr, rs, re, ".", "0", ".", df$y, -1, -log10(df$fdr), x - rs) # Anshul: converted fdr to -log10
  }))
  if (is.null(md)) {
    file.create(fname)
    return(invisible(NULL))
  }
  # drop=FALSE keeps a single peak as a one-row matrix
  md <- md[order(as.numeric(md[, 7]), decreasing = TRUE), , drop = FALSE]
  if (!is.na(npeaks)) { # Anshul: added this option to print a limited number of peaks
    npeaks <- min(nrow(md), npeaks)
    md <- md[seq_len(npeaks), , drop = FALSE]
  }
  write.table(md, file = fname, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t", append = FALSE)
}


# find broad regions of significant tag enrichment
# returns: per-chromosome region tables; where regions were found they are merged and carry
#          $rv = log2 of the (signal+1)/(control+1) tag count ratio, scaled by bg.weight
get.broad.enrichment.clusters <- function(signal.data,control.data,window.size=1e3,z.thr=3, tag.shift=146/2,background.density.scaling=FALSE, ... ) {
  # find significantly enriched clusters
  bg.weight <- dataset.density.ratio(signal.data, control.data, background.density.scaling = background.density.scaling)
  se <- find.significantly.enriched.regions(signal.data, control.data, window.size = window.size, z.thr = z.thr, tag.shift = tag.shift, bg.weight = bg.weight, ...)
  chrl <- names(se); names(chrl) <- chrl
  lapply(chrl, function(chr) {
    d <- se[[chr]]
    # any chromosome with at least one region is processed, so that $rv is always present
    if (length(d$s) > 0) {
      d <- regionset.intersection.c(list(d, d), do.union = TRUE)
      sc <- points.within(abs(signal.data[[chr]] + tag.shift), d$s, d$e, return.point.counts = TRUE)
      cc <- points.within(abs(control.data[[chr]] + tag.shift), d$s, d$e, return.point.counts = TRUE)
      d$rv <- log2((sc + 1)/(cc + 1)/bg.weight)
    }
    return(d)
  })
}

# write broad enrichment regions (as returned by get.broad.enrichment.clusters) as a tab-separated
# table: chr, start, end, ".", "0", ".", $rv, -1, -1, sorted by decreasing $rv
# if there are no regions at all, an empty file is created
write.broadpeak.info <- function(bp,fname) {
  chrl <- names(bp); names(chrl) <- chrl
  chrl <- chrl[unlist(lapply(bp, function(d) length(d$s))) > 0]
  md <- do.call(rbind, lapply(chrl, function(chr) {
    df <- bp[[chr]]
    cbind(chr, df$s, df$e, ".", "0", ".", df$rv, -1, -1)
  }))
  if (is.null(md)) {
    file.create(fname)
    return(invisible(NULL))
  }
  # drop=FALSE keeps a single region as a one-row matrix
  md <- md[order(as.numeric(md[, 7]), decreasing = TRUE), , drop = FALSE]
  write.table(md, file = fname, col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t", append = FALSE)
}


# find runs of identical, non-zero values in x that are at least CL elements long
# returns: list with $size, $begin and $end (indices into x) of the runs
get.clusters2 <- function(x,CL) {
  brk <- which(diff(x) != 0)
  begin <- c(1, brk + 1)
  end <- c(brk, length(x))
  size <- end - begin + 1

  # long enough, and not a run of zeros
  keep <- which(size >= CL)
  keep <- keep[x[end[keep]] != 0]

  return (list(size = size[keep], begin = begin[keep], end = end[keep]))
}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/configure Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,3856 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.63 for SPP 1.7. +# +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, +# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +## --------------------- ## +## M4sh Initialization. ## +## --------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; +esac + +fi + + + + +# PATH needs CR +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. 
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + +# Support unset when possible. +if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then + as_unset=unset +else + as_unset=false +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +case $0 in + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break +done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + { (exit 1); exit 1; } +fi + +# Work around bugs in pre-3.0 UWIN ksh. +for as_var in ENV MAIL MAILPATH +do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# Required to use basename. +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + + +# Name of the executable. +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# CDPATH. +$as_unset CDPATH + + +if test "x$CONFIG_SHELL" = x; then + if (eval ":") 2>/dev/null; then + as_have_required=yes +else + as_have_required=no +fi + + if test $as_have_required = yes && (eval ": +(as_func_return () { + (exit \$1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} + +exitcode=0 +if as_func_success; then + : +else + exitcode=1 + echo as_func_success failed. +fi + +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi + +if as_func_ret_success; then + : +else + exitcode=1 + echo as_func_ret_success failed. 
+fi + +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. +fi + +if ( set x; as_func_ret_success y && test x = \"\$1\" ); then + : +else + exitcode=1 + echo positional parameters were not saved. +fi + +test \$exitcode = 0) || { (exit 1); exit 1; } + +( + as_lineno_1=\$LINENO + as_lineno_2=\$LINENO + test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" && + test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; } +") 2> /dev/null; then + : +else + as_candidate_shells= + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + case $as_dir in + /*) + for as_base in sh bash ksh sh5; do + as_candidate_shells="$as_candidate_shells $as_dir/$as_base" + done;; + esac +done +IFS=$as_save_IFS + + + for as_shell in $as_candidate_shells $SHELL; do + # Try only shells that exist, to save several forks. + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { ("$as_shell") 2> /dev/null <<\_ASEOF +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; +esac + +fi + + +: +_ASEOF +}; then + CONFIG_SHELL=$as_shell + as_have_required=yes + if { "$as_shell" 2> /dev/null <<\_ASEOF +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. 
+ alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; +esac + +fi + + +: +(as_func_return () { + (exit $1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} + +exitcode=0 +if as_func_success; then + : +else + exitcode=1 + echo as_func_success failed. +fi + +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi + +if as_func_ret_success; then + : +else + exitcode=1 + echo as_func_ret_success failed. +fi + +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. +fi + +if ( set x; as_func_ret_success y && test x = "$1" ); then + : +else + exitcode=1 + echo positional parameters were not saved. +fi + +test $exitcode = 0) || { (exit 1); exit 1; } + +( + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; } + +_ASEOF +}; then + break +fi + +fi + + done + + if test "x$CONFIG_SHELL" != x; then + for as_var in BASH_ENV ENV + do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var + done + export CONFIG_SHELL + exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"} +fi + + + if test $as_have_required = no; then + echo This script requires a shell more modern than all the + echo shells that I found on your system. Please install a + echo modern shell, or manually run the script under such a + echo shell if you do have one. + { (exit 1); exit 1; } +fi + + +fi + +fi + + + +(eval "as_func_return () { + (exit \$1) +} +as_func_success () { + as_func_return 0 +} +as_func_failure () { + as_func_return 1 +} +as_func_ret_success () { + return 0 +} +as_func_ret_failure () { + return 1 +} + +exitcode=0 +if as_func_success; then + : +else + exitcode=1 + echo as_func_success failed. 
+fi + +if as_func_failure; then + exitcode=1 + echo as_func_failure succeeded. +fi + +if as_func_ret_success; then + : +else + exitcode=1 + echo as_func_ret_success failed. +fi + +if as_func_ret_failure; then + exitcode=1 + echo as_func_ret_failure succeeded. +fi + +if ( set x; as_func_ret_success y && test x = \"\$1\" ); then + : +else + exitcode=1 + echo positional parameters were not saved. +fi + +test \$exitcode = 0") || { + echo No shell found that supports shell functions. + echo Please tell bug-autoconf@gnu.org about your system, + echo including any error possibly output before this message. + echo This can help us improve future autoconf versions. + echo Configuration will now proceed without shell functions. +} + + + + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { + + # Create $as_me.lineno as a copy of $as_myself, but with $LINENO + # uniformly replaced by the line number. The first 'sed' inserts a + # line-number line after each line using $LINENO; the second 'sed' + # does the real work. The second script uses 'N' to pair each + # line-number line with the line containing $LINENO, and appends + # trailing '-' during substitution so that $LINENO is not a special + # case at line end. + # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the + # scripts with optimization help from Paolo Bonzini. Blame Lee + # E. McMahon (1931-1989) for sed's syntax. 
:-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 + { (exit 1); exit 1; }; } + + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in +-n*) + case `echo 'x\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + *) ECHO_C='\c';; + esac;; +*) + ECHO_N='-n';; +esac +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -p'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -p' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -p' + fi +else + as_ln_s='cp -p' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 
2>/dev/null; then + as_mkdir_p=: +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + + +exec 7<&0 </dev/null 6>&1 + +# Name of the host. +# hostname on some systems (SVR3.2, Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIBOBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= +SHELL=${CONFIG_SHELL-/bin/sh} + +# Identity of this package. 
+PACKAGE_NAME='SPP' +PACKAGE_TARNAME='spp' +PACKAGE_VERSION='1.7' +PACKAGE_STRING='SPP 1.7' +PACKAGE_BUGREPORT='' + +ac_subst_vars='LTLIBOBJS +LIBOBJS +HAVE_LIBBZ2 +OBJEXT +EXEEXT +ac_ct_CC +CPPFLAGS +LDFLAGS +CFLAGS +CC +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +' + ac_precious_vars='build_alias +host_alias +target_alias +CC +CFLAGS +LDFLAGS +LIBS +CPPFLAGS' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) 
+bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. + if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | 
--datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. 
+ expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. + with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | 
--libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + -mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. 
+ with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | --program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | 
--program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* 
| --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2 + { (exit 1); exit 1; }; } + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) { $as_echo "$as_me: error: unrecognized option: $ac_option +Try \`$0 --help' for more information." >&2 + { (exit 1); exit 1; }; } + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null && + { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2 + { (exit 1); exit 1; }; } + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : ${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option} + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + { $as_echo "$as_me: error: missing argument to $ac_option" >&2 + { (exit 1); exit 1; }; } +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2 + { (exit 1); exit 1; }; } ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. 
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2 + { (exit 1); exit 1; }; } +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host. + If a cross compiler is detected then cross compile mode will be used." >&2 + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + { $as_echo "$as_me: error: working directory cannot be determined" >&2 + { (exit 1); exit 1; }; } +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + { $as_echo "$as_me: error: pwd does not report name of working directory" >&2 + { (exit 1); exit 1; }; } + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. 
+ ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2 + { (exit 1); exit 1; }; } +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2 + { (exit 1); exit 1; }; } + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures SPP 1.7 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. 
See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. + +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+ +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root [DATAROOTDIR/doc/spp] + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of SPP 1.7:";; + esac + cat <<\_ACEOF + +Some influential environment variables: + CC C compiler command + CFLAGS C compiler flags + LDFLAGS linker flags, e.g. -L<lib dir> if you have libraries in a + nonstandard directory <lib dir> + LIBS libraries to pass to the linker, e.g. -l<library> + CPPFLAGS C/C++/Objective C preprocessor flags, e.g. -I<include dir> if + you have headers in a nonstandard directory <include dir> + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. + +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. 
+ for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +SPP configure 1.7 +generated by GNU Autoconf 2.63 + +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, +2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. 
+This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. +_ACEOF + exit +fi +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by SPP $as_me 1.7, which was +generated by GNU Autoconf 2.63. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. ## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" +done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. 
+ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;; + 2) + ac_configure_args1="$ac_configure_args1 '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. + else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + ac_configure_args="$ac_configure_args '$ac_arg'" + ;; + esac + done +done +$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; } +$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; } + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + cat <<\_ASBOX +## ---------------- ## +## Cache variables. 
## +## ---------------- ## +_ASBOX + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) $as_unset $ac_var ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + cat <<\_ASBOX +## ----------------- ## +## Output variables. ## +## ----------------- ## +_ASBOX + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + cat <<\_ASBOX +## ------------------- ## +## File substitutions. ## +## ------------------- ## +_ASBOX + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + cat <<\_ASBOX +## ----------- ## +## confdefs.h. 
## +## ----------- ## +_ASBOX + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +# Predefined preprocessor variables. + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + ac_site_file1=$CONFIG_SITE +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test -r "$ac_site_file"; then + { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special + # files actually), so we avoid doing that. 
+ if test -f "$cache_file"; then + { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. +ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. 
+ ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:$LINENO: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:$LINENO: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. Use of quotes ensures accuracy. 
+ *) ac_configure_args="$ac_configure_args '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5 +$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;} + { (exit 1); exit 1; }; } +fi + + + + + + + + + + + + + + + + + + + + + + + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_ac_ct_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:$LINENO: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if test "${ac_cv_prog_ac_ct_CC+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done +done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:$LINENO: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: no acceptable C compiler found in \$PATH +See \`config.log' for more details." >&2;} + { (exit 1); exit 1; }; }; } + +# Provide some information about the compiler. +$as_echo "$as_me:$LINENO: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +{ (ac_try="$ac_compiler --version >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler --version >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? 
= $ac_status" >&5 + (exit $ac_status); } +{ (ac_try="$ac_compiler -v >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler -v >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } +{ (ac_try="$ac_compiler -V >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compiler -V >&5") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } + +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:$LINENO: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... 
" >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { (ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. +for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. 
+ break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi + +{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +if test -z "$ac_file"; then + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: C compiler cannot create executables +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: C compiler cannot create executables +See \`config.log' for more details." >&2;} + { (exit 77); exit 77; }; }; } +fi + +ac_exeext=$ac_cv_exeext + +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:$LINENO: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +# FIXME: These cross compiler hacks should be removed for Autoconf 3.0 +# If not cross compiling, check that we can run a simple program. +if test "$cross_compiling" != yes; then + if { ac_try='./$ac_file' + { (case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } + fi + fi +fi +{ $as_echo "$as_me:$LINENO: result: yes" >&5 +$as_echo "yes" >&6; } + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... " >&6; } +{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... " >&6; } +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } +fi + +rm -f conftest$ac_cv_exeext +{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if test "${ac_cv_objext+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; then + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile +See \`config.log' for more details." >&5 +$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile +See \`config.log' for more details." 
>&2;} + { (exit 1); exit 1; }; }; } +fi + +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if test "${ac_cv_c_compiler_gnu+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_compiler_gnu=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_compiler_gnu=no +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if test "${ac_cv_prog_cc_g+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_g=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + CFLAGS="" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then + : +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_g=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi + +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if test "${ac_cv_prog_cc_c89+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. 
*/ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ +#include <stdarg.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/stat.h> +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. */ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + rm -f conftest.$ac_objext +if { (ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_prog_cc_c89=$ac_arg +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + +fi + +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:$LINENO: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:$LINENO: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + + +{ $as_echo "$as_me:$LINENO: checking for BZ2_bzDecompressInit in -lbz2" >&5 +$as_echo_n 
"checking for BZ2_bzDecompressInit in -lbz2... " >&6; } +if test "${ac_cv_lib_bz2_BZ2_bzDecompressInit+set}" = set; then + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lbz2 $LIBS" +cat >conftest.$ac_ext <<_ACEOF +/* confdefs.h. */ +_ACEOF +cat confdefs.h >>conftest.$ac_ext +cat >>conftest.$ac_ext <<_ACEOF +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char BZ2_bzDecompressInit (); +int +main () +{ +return BZ2_bzDecompressInit (); + ; + return 0; +} +_ACEOF +rm -f conftest.$ac_objext conftest$ac_exeext +if { (ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\"" +$as_echo "$ac_try_echo") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + $as_test_x conftest$ac_exeext + }; then + ac_cv_lib_bz2_BZ2_bzDecompressInit=yes +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_cv_lib_bz2_BZ2_bzDecompressInit=no +fi + +rm -rf conftest.dSYM +rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:$LINENO: result: $ac_cv_lib_bz2_BZ2_bzDecompressInit" >&5 +$as_echo "$ac_cv_lib_bz2_BZ2_bzDecompressInit" >&6; } +if test "x$ac_cv_lib_bz2_BZ2_bzDecompressInit" = x""yes; then + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBBZ2 1 +_ACEOF + + LIBS="-lbz2 $LIBS" + +fi + + +ac_config_files="$ac_config_files src/Makevars" + +cp confdefs.h src/config.h +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. 
+( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) $as_unset $ac_var ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes (double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \). + sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + test "x$cache_file" != "x/dev/null" && + { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + cat confcache >$cache_file + else + { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. +test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +# Transform confdefs.h into DEFS. +# Protect against shell expansion while executing Makefile rules. +# Protect against Makefile macro expansion. 
+# +# If the first sed substitution is executed (which looks for macros that +# take arguments), then branch to the quote section. Otherwise, +# look for a macro that doesn't take arguments. +ac_script=' +:mline +/\\$/{ + N + s,\\\n,, + b mline +} +t clear +:clear +s/^[ ]*#[ ]*define[ ][ ]*\([^ (][^ (]*([^)]*)\)[ ]*\(.*\)/-D\1=\2/g +t quote +s/^[ ]*#[ ]*define[ ][ ]*\([^ ][^ ]*\)[ ]*\(.*\)/-D\1=\2/g +t quote +b any +:quote +s/[ `~#$^&*(){}\\|;'\''"<>?]/\\&/g +s/\[/\\&/g +s/\]/\\&/g +s/\$/$$/g +H +:any +${ + g + s/^\n// + s/\n/ /g + p +} +' +DEFS=`sed -n "$ac_script" confdefs.h` + + +ac_libobjs= +ac_ltlibobjs= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. + ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext" + ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: ${CONFIG_STATUS=./config.status} +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false +SHELL=\${CONFIG_SHELL-$SHELL} +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +## --------------------- ## +## M4sh Initialization. 
## +## --------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in + *posix*) set -o posix ;; +esac + +fi + + + + +# PATH needs CR +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + +# Support unset when possible. 
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then + as_unset=unset +else + as_unset=false +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +case $0 in + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break +done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + { (exit 1); exit 1; } +fi + +# Work around bugs in pre-3.0 UWIN ksh. +for as_var in ENV MAIL MAILPATH +do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# Required to use basename. +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + + +# Name of the executable. +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# CDPATH. 
+$as_unset CDPATH + + + + as_lineno_1=$LINENO + as_lineno_2=$LINENO + test "x$as_lineno_1" != "x$as_lineno_2" && + test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || { + + # Create $as_me.lineno as a copy of $as_myself, but with $LINENO + # uniformly replaced by the line number. The first 'sed' inserts a + # line-number line after each line using $LINENO; the second 'sed' + # does the real work. The second script uses 'N' to pair each + # line-number line with the line containing $LINENO, and appends + # trailing '-' during substitution so that $LINENO is not a special + # case at line end. + # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the + # scripts with optimization help from Paolo Bonzini. Blame Lee + # E. McMahon (1931-1989) for sed's syntax. :-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2 + { (exit 1); exit 1; }; } + + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in +-n*) + case `echo 'x\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
+ *) ECHO_C='\c';; + esac;; +*) + ECHO_N='-n';; +esac +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -p'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -p' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -p' + fi +else + as_ln_s='cp -p' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 2>/dev/null; then + as_mkdir_p=: +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +if test -x / >/dev/null 2>&1; then + as_test_x='test -x' +else + if ls -dL / >/dev/null 2>&1; then + as_ls_L_option=L + else + as_ls_L_option= + fi + as_test_x=' + eval sh -c '\'' + if test -d "$1"; then + test -d "$1/."; + else + case $1 in + -*)set "./$1";; + esac; + case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in + ???[sx]*):;;*)false;;esac;fi + '\'' sh + ' +fi +as_executable_p=$as_test_x + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 + +# Save the log message, to keep $[0] and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. 
+ac_log=" +This file was extended by SPP $as_me 1.7, which was +generated by GNU Autoconf 2.63. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files from templates according to the +current configuration. + +Usage: $0 [OPTION]... [FILE]... + + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + +Configuration files: +$config_files + +Report bugs to <bug-autoconf@gnu.org>." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_version="\\ +SPP config.status 1.7 +configured by $0, generated by GNU Autoconf 2.63, + with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" + +Copyright (C) 2008 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='$ac_pwd' +srcdir='$srcdir' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. 
+ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + CONFIG_FILES="$CONFIG_FILES '$ac_optarg'" + ac_need_defaults=false;; + --he | --h | --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) { $as_echo "$as_me: error: unrecognized option: $1 +Try \`$0 --help' for more information." >&2 + { (exit 1); exit 1; }; } ;; + + *) ac_config_targets="$ac_config_targets $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X '$SHELL' '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. 
## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;; + + *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5 +$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;} + { (exit 1); exit 1; }; };; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= + trap 'exit_status=$? + { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status +' 0 + trap '{ (exit 1); exit 1; }' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -n "$tmp" && test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || +{ + $as_echo "$as_me: cannot create a temporary directory in ." >&2 + { (exit 1); exit 1; } +} + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. 
+if test -n "$CONFIG_FILES"; then + + +ac_cr=' ' +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } +ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! 
" + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\).*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\).*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' <conf$$subs.awk | sed ' +/^[^""]/{ + N + s/\n// +} +' >>$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \ + || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5 +$as_echo "$as_me: error: could not setup config files machinery" >&2;} + { (exit 1); exit 1; }; } +_ACEOF + +# VPATH may cause trouble with some makes, so we remove $(srcdir), +# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). 
+if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=/{ +s/:*\$(srcdir):*/:/ +s/:*\${srcdir}:*/:/ +s/:*@srcdir@:*/:/ +s/^\([^=]*=[ ]*\):*/\1/ +s/:*$// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + + +eval set X " :F $CONFIG_FILES " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5 +$as_echo "$as_me: error: invalid tag $ac_tag" >&2;} + { (exit 1); exit 1; }; };; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5 +$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;} + { (exit 1); exit 1; }; };; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + ac_file_inputs="$ac_file_inputs '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. 
$configure_input" + { $as_echo "$as_me:$LINENO: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$tmp/stdin" \ + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + { as_dir="$ac_dir" + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5 +$as_echo "$as_me: error: cannot create directory $as_dir" >&2;} + { (exit 1); exit 1; }; }; } + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. 
+ac_datarootdir_hack=; ac_datarootdir_seen= + +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p +' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. +# FIXME: do we really want to maintain this feature? +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \ + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' "$tmp/out"`; test -z 
"$ac_out"; } && + { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined." >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined." >&2;} + + rm -f "$tmp/stdin" + case $ac_file in + -) cat "$tmp/out" && rm -f "$tmp/out";; + *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";; + esac \ + || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5 +$as_echo "$as_me: error: could not create $ac_file" >&2;} + { (exit 1); exit 1; }; } + ;; + + + + esac + +done # for ac_tag + + +{ (exit 0); exit 0; } +_ACEOF +chmod +x $CONFIG_STATUS +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;} + { (exit 1); exit 1; }; } + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. +# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. 
+ $ac_cs_success || { (exit 1); exit 1; } +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/configure.ac Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,7 @@ +AC_INIT([SPP], 1.7) + +AC_CHECK_LIB(bz2, BZ2_bzDecompressInit) +AC_SUBST(HAVE_LIBBZ2) +AC_CONFIG_FILES([src/Makevars]) +cp confdefs.h src/config.h +AC_OUTPUT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/add.broad.peak.regions.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,27 @@ +\name{add.broad.peak.regions} +\alias{add.broad.peak.regions} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Find broad regions of enrichment associated with determined peak positions } +\description{ + Looks for broader regions of enrichment associated with the determined + peak positions, adds them to the $npl data as $rs, $re columns. +} +\usage{ +add.broad.peak.regions(signal.tags, control.tags, binding.postions,window.size=500,z.thr=2) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output + of \code{\link{select.informative.tags}}) } + \item{control.tags}{ optional control (input) tags } + \item{binding.positions}{ output of find.binding.positions call } + \item{window.size}{ window size to be used in calculating enrichment } + \item{z.thr}{ Z-score corresponding to the Poisson ratio threshold + used to flag significantly enriched windows} +} +\value{ + A structure identical to binding.positions with two additional columns + added (rs and re) corresponding to start and end of the associated + significantly enriched region. If no region was associated with a + particular peak, NA values are reported. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/find.binding.positions.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,128 @@ +\name{find.binding.positions} +\alias{find.binding.positions} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Determine significant point protein binding positions (peaks) } +\description{ + Given the signal and optional control (input) data, determine location of the + statistically significant point binding positions. If the control data + is not provided, the statistical significance can be assessed based on + tag randomization. The method also provides options for masking + regions exhibiting strong signals within the control data. +} +\usage{ +find.binding.positions(signal.data, e.value = NULL, fdr = NULL, masked.data = NULL, control.data = NULL, min.dist = 200, window.size = 4e+07, cluster = NULL, debug = T, n.randomizations = 3, shuffle.window = 1, min.thr = 0, topN = NULL, tag.count.whs = 100, enrichment.z = 2, method = tag.wtd, tec.filter = T, tec.window.size = 10000, tec.masking.window.size=tec.window.size, tec.z = 5, tec.poisson.z=5,tec.poisson.ratio=5, n.control.samples = 1, enrichment.background.scales = c(1, 5, 10), background.density.scaling = F, use.randomized.controls = F, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + ~~ tag data ~~ + \item{signal.data}{ signal tag vector list } + \item{control.data}{ optional control (input) tag vector list } + + ~~ position stringency criteria ~~ + \item{e.value}{ E-value defining the desired statistical significance + of binding positions. } + \item{fdr}{ FDR defining statistical significance of binding positions } + \item{topN}{ instead of determining statistical significance + thresholds, return the specified number of highest-scoring + positions} + + ~~ other params ~~ + \item{whs}{ window half-size that should be used for binding + detection (e.g.
determined from cross-correlation profiles)} + \item{masked.data}{ optional set of coordinates that should be masked + (e.g. known non-unique regions) } + \item{min.dist}{ minimal distance that must separate detected binding + positions. In case multiple binding positions are detected within + such distance, the position with the highest score is returned. } + \item{window.size}{ size of the window used to segment the chromosome + during calculations to reduce memory usage. } + \item{cluster}{ optional \code{snow} cluster to parallelize the + processing on } + \item{min.thr}{ minimal score requirement for a peak } + \item{background.density.scaling}{ If TRUE, regions of significant tag + enrichment will be masked out when calculating size ratio of the + signal to control datasets (to estimate ratio of the background tag + density). If FALSE, the dataset ratio will be equal to the ratio of + the number of tags in each dataset.} + + ~~ randomized controls ~~ + \item{n.randomizations}{ number of tag randomizations that should be + performed (when the control data is not provided) } + \item{use.randomized.controls}{ Use randomized tag control, even if + \code{control.data} is supplied. } + \item{shuffle.window}{ during tag randomizations, tags will be split + into groups of \code{shuffle.window} and will be maintained + together throughout the randomization. } + + ~~ fold-enrichment confidence intervals ~~ + \item{tag.count.whs}{ half-size of a window used to assess fold + enrichment of a binding position} + \item{enrichment.z}{ Z-score used to define the significance level of + the fold-enrichment confidence intervals } + \item{enrichment.background.scales}{ In estimating the peak + fold-enrichment confidence intervals, the background tag density is + estimated based on windows with half-sizes of + \code{2*tag.count.whs*enrichment.background.scales}.
} + \item{method}{ either \code{tag.wtd} for WTD method, or + \code{tag.lwcc} for MTC method} + \item{mle.filter}{ If turned on, will exclude predicted positions + whose MLE enrichment ratio (for any of the background scales) is + below a specified min.mle.threshold } + \item{min.mle.threshold}{ MLE enrichment ratio threshold that each + predicted position must exceed if mle.filter is turned on. } + + ~~ masking regions of significant control enrichment ~~ + \item{tec.filter}{ Whether to mask out the regions exhibiting + significant enrichment in the control data in doing other + calculations. The regions are identified using Poisson statistics + within sliding windows, either relative to the scaled signal (tec.z), or + relative to randomly-distributed expectation (tec.poisson.z).} + \item{tec.window.size}{ size of the window used to determine + significantly enriched control regions } + \item{tec.masking.window.size}{ size of the window used to mask + the area around significantly enriched control regions } + \item{tec.z}{ Z-score defining statistical stringency by which a given + window is determined to be significantly higher in the input than in + the signal, and masked if that is the case.} + \item{tec.poisson.z}{ Z-score defining statistical stringency by which a given + window is determined to be significantly higher than the + tec.poisson.ratio above the expected uniform input background. } + \item{tec.poisson.ratio}{ Fold ratio by which input must exceed the + level expected from the uniform distribution. } + + + + +} +\value{ + \item{npl}{A per-chromosome list containing data frames describing + determined binding positions. Column description: + \item{x}{ position } + \item{y}{ score } + \item{evalue}{ E-value } + \item{fdr}{ FDR. For peaks higher than the maximum control peak, + the highest dataset FDR is reported } + \item{enr}{ lower bound of the fold-enrichment ratio confidence + interval. This is the estimate determined using scale of + 1.
Estimates corresponding to higher scales are returned in other enr columns + with scale appearing in the name.} + \item{enr.mle}{ enrichment ratio maximum likelihood estimate } + } + \item{thr}{ info on the chosen statistical threshold of the peak scores} +} + +\examples{ + # find binding positions using WTD method, 200bp half-window size, + # control data, 1\% FDR + bp <- +find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.wtd,whs=200); + + # find binding positions using MTC method, using 5 tag randomizations, + # keeping pairs of tag positions together (shuffle.window=2) + bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=0.01,method=tag.lwcc,whs=200,use.randomized.controls=T,n.randomizations=5,shuffle.window=2) + + # print out the number of determined positions + print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks")); + + +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.binding.characteristics.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,55 @@ +\name{get.binding.characteristics} +\alias{get.binding.characteristics} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Calculate characteristics of observed DNA-binding signal from + cross-correlation profiles } +\description{ + The method calculates strand cross-correlation profile to determine binding + peak separation distance and approximate window size that should be used + for binding detection. If quality scores were given for the tags, + it also determines which quality bins improve the cross-correlation pattern. +} +\usage{ +get.binding.characteristics(data, srange = c(50, 500), bin = 5, cluster = NULL, debug = F, min.tag.count = 1000, acceptance.z.score = 3, remove.tag.anomalies = T, anomalies.z = 5,accept.all.tags=F) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{data}{ Tag/quality data: output of \code{read.eland.tags} or similar function } + \item{srange}{ A range within which the binding peak separation is + expected to fall. Should be larger than probe size to avoid artifacts. } + \item{bin}{ Resolution (in basepairs) at which cross-correlation + should be calculated. bin=1 is ideal, but takes longer to calculate. } + \item{cluster}{ optional snow cluster for parallel processing } + \item{debug}{ whether to print debug messages } + \item{min.tag.count}{ minimal number of tags on the chromosome to be + considered in the cross-correlation calculations } + \item{acceptance.z.score}{ A Z-score used to determine if a given tag + quality bin provides significant improvement to the strand cross-correlation } + \item{remove.tag.anomalies}{ Whether to remove singular tag count peaks prior to + calculation. This is recommended, since such positions may distort the + cross-correlation profile and increase the necessary computational time.
} + \item{anomalies.z}{ Z-score for determining if the number of tags at a + given position is significantly higher above background, and should be + considered an anomaly.} + \item{accept.all.tags}{ Whether tag alignment quality calculations + should be skipped and all available tags should be accepted in the + downstream analysis.} +} +\value{ + \item{cross.correlation }{ Cross-correlation profile as an $x/$y data.frame} + \item{peak }{Position ($x) and height ($y) of automatically detected + cross-correlation peak.} + \item{whs} { Optimized window half-size for binding detection (based + on the width of the cross-correlation peak) } + \item{quality.bin.acceptance} { A list structure, describing the + effect of inclusion of different tag quality bins on + cross-correlation, and a resolution on which bins should be + considered. + \item{informative.bins} { A boolean vector indicating whether the + inclusion of tags from the tag quality bin specified in the name + attribute significantly increases cross-correlation profile near + the peak.} + \item{quality.cc} { A list giving the cross-correlation profile + after the inclusion of the tags from different quality bins } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.broad.enrichment.clusters.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,27 @@ +\name{get.broad.enrichment.clusters} +\alias{get.broad.enrichment.clusters} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Determine broad clusters of enrichment } +\description{ + Scan chromosomes with a pre-defined window size, comparing scaled ChIP + and input tag counts to see if their ratio exceeds that expected from + a Poisson process (normalized for dataset size). +} +\usage{ +get.broad.enrichment.clusters(chip.tags, input.tags, window.size=1e3,z.thr=3,tag.shift=146/2) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{chip.tags}{ foreground tag vector list } + \item{input.tags}{ background tag vector list } + \item{window.size}{ window size to be used for tag counting } + \item{z.thr}{ Z-score to be used as a significance threshold } + \item{tag.shift}{ number of base pairs by which positive and negative + tag coordinates should be shifted towards each other (half of binding + peak separation distance)} +} +\value{ + A list of elements corresponding to chromosomes, with each element + being an $s/$e/$rv data.frame giving the starting, ending positions and the log2 + enrichment estimate for that region. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.conservative.fold.enrichment.profile.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,59 @@ +\name{get.conservative.fold.enrichment.profile} +\alias{get.conservative.fold.enrichment.profile} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Estimate minimal fold enrichment/depletion along the chromosomes } +\description{ + The method provides a statistical assessment of enrichment/depletion + along the chromosomes. To assess tag density enrichment/depletion, a + sliding window of a specified size (\code{fws}) is used to calculate + the density of the foreground tags (\code{ftl}). Multiple, typically + larger windows are used to estimate background tag (\code{btl}) density around the + same location. The densities are compared as ratios of two Poisson + processes to estimate lower bound of foreground enrichment, or upper + bound of foreground depletion. If multiple window sizes were used to + estimate the background tag density, the most conservative one is + chosen for each point. +} +\usage{ +get.conservative.fold.enrichment.profile(ftl, btl, fws, bwsl = c(1, 5, 25, 50) * fws, step = 50, tag.shift = 146/2, alpha = 0.05, use.most.informative.scale = F, quick.calculation = T) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{ftl}{ foreground tag vector list } + \item{btl}{ background tag vector list } + \item{fws}{ foreground window size } + \item{bwsl}{ background window scales. The size(s) of background windows + will be \code{fws*bwsl}. 
} + \item{step}{ spacing between positions at which the + enrichment/depletion is evaluated } + \item{tag.shift}{ number of basepairs by which positive and negative + tag coordinates should be shifted towards each other (half of binding + peak separation distance)} + \item{alpha}{ desired level of statistical significance } + \item{use.most.informative.scale}{ for each position, instead of + evaluating enrichment ratio bounds for all background window scales, + choose the one with the highest observed density to speed up the calculations} + \item{quick.calculation}{ Use square root transformation method + instead of a Bayesian method. This speeds up the calculation + considerably and is turned on by default. } + \item{background.density.scaling}{ If TRUE, regions of significant tag + enrichment will be masked out when calculating size ratio of the + signal to control datasets (to estimate ratio of the background tag + density). If FALSE, the dataset ratio will be equal to the ratio of + the number of tags in each dataset.} +} +\value{ + A list of elements corresponding to chromosomes, with each element + being an $x/$y data.frame giving the position and the log2 + conservative estimate of enrichment/depletion fold ratios around that + position. + Use \code{\link{writewig}} to output the structure to a WIG + file. +} +\references{ R.M.Price, D.G. Bonett "Estimating the ratio of two Poisson + rates", Comp. Stat & Data Anal. 32(2000) 345} +\seealso{ \code{\link{get.smoothed.tag.density}} } +\examples{ + enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01); + writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale"); +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.mser.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,46 @@ +\name{get.mser} +\alias{get.mser} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Calculate minimal saturated enrichment fold ratio } +\description{ + Determine if the dataset has reached absolute saturation, or otherwise + find minimal fold enrichment ratio above which the detection of peaks + has stabilized enough to meet the saturation criteria. +} +\usage{ +get.mser(signal.data, control.data, n.chains = 5, step.size = 1e+05, chains = NULL, cluster = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), n.steps = 1, ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{signal.data}{ signal tag vector list } + \item{control.data}{ control tag vector list } + \item{n.chains}{ number of dataset subsamples to use } + \item{step.size}{ subsampling step describing the saturation + criteria. The criteria requires the set of detected binding sites to + be stable (as described by the \code{test.agreement} param) when the + number of tags in the dataset is reduced by \code{step.size}. The + value can either be an integer above one, in which case it specifies a fixed + number of tags, or a real value below one, in which case it + specifies the fraction of tags that should be removed (e.g. 0.1 will + remove 10\% of tags). + } + \item{test.agreement}{ Fraction of the detected peaks that should + agree between the full and subsampled datasets. } + \item{chains}{ optional parameter, giving pre-calculated chains } + \item{cluster}{ optional \code{snow} cluster to parallelize processing } + + \item{return.chains}{ whether subsampled dataset results should be returned as + well } + \item{enrichment.background.scales}{ one or multiple window scales at + which the background tag density should be assessed.
See + \code{enrichment.background.scales} in + \code{\link{find.binding.positions}}. If multiple scales are provided, + multiple MSER estimates will be returned.} + \item{\dots}{ additional parameters should be the same as those passed + to the \code{\link{find.binding.positions}}} +} +\value{ + A single, or multiple (if multiple \code{enrichment.background.scales} were + provided) MSER value. A value of 1 or very close to it implies that + the dataset has reached absolute saturation based on the given criteria. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.mser.interpolation.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,56 @@ +\name{get.mser.interpolation} +\alias{get.mser.interpolation} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Interpolate MSER dependency on the tag count } +\description{ + MSER generally decreases with increasing sequencing depth. This + function interpolates the dependency of MSER on tag counts as a + log-log linear function. The log-log fit is used to estimate the depth + of sequencing required to reach desired \code{target.fold.enrichment}. +} +\usage{ +get.mser.interpolation(signal.data, control.data, target.fold.enrichment = 5, n.chains = 10, n.steps = 6, step.size = 1e+05, chains = NULL, test.agreement = 0.99, return.chains = F, enrichment.background.scales = c(1), excluded.steps = c(seq(2, n.steps - 2)), ...) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{signal.data}{ signal chromosome tag vector list } + \item{control.data}{ control chromosome tag vector list } + \item{target.fold.enrichment}{ target MSER for which the depth should + be estimated} + \item{n.steps}{ number of steps in each subset chain. } + \item{step.size}{ Either number of tags or fraction of the dataset + size, see \code{step.size} parameter for \code{\link{get.mser}}. } + \item{test.agreement}{ Fraction of the detected peaks that should + agree between the full and subsampled datasets. See \code{test.agreement} parameter for \code{\link{get.mser}}} + \item{n.chains}{ number of random subset chains } + \item{chains}{ optional structure of pre-calculated chains + (e.g. generated by an earlier call with \code{return.chains=T}.} + + \item{return.chains}{ whether to return peak predictions calculated on + random chains. 
These can be passed back using \code{chains} argument + to skip subsampling/prediction steps, and just recalculate the depth + estimate for a different MSER.} + \item{enrichment.background.scales}{ see \code{enrichment.background.scales} parameter for \code{\link{get.mser}} } + \item{excluded.steps}{ Intermediate subsampling steps that should be excluded from + the chains to speed up the calculation. By default, all intermediate + steps except for first two and last two are skipped. Adding + intermediate steps improves interpolation at the expense of + computational time.} + \item{\dots}{ additional parameters are passed to \code{\link{get.mser}} } +} +\details{ + To simulate sequencing growth, the method calculates peak predictions + on random chains. Each chain is produced by sequential random + subsampling of the original data. The number of steps in the chain + indicates how many times the random subsampling will be performed. +} +\value{ + Normally returns a list, specifying for each background scale: + \item{prediction}{estimated sequencing depth required to reach + specified target MSER} + \item{log10.fit}{linear fit model, a result of \code{lm()} call} + + If \code{return.chains=T}, the above structure is returned under + \code{interpolation} field, along with \code{chains} field containing + results of \code{\link{find.binding.positions}} calls on subsampled chains. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.smoothed.enrichment.mle.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,35 @@ +\name{get.smoothed.enrichment.mle} +\alias{get.smoothed.enrichment.mle} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Calculate chromosome-wide profiles of smoothed enrichment estimate } +\description{ + Given signal and control tag positions, the method calculates log2 + signal to control enrichment estimates (maximum likelihood) for each + chromosome, based on the smoothed tag density profile (see \link{get.smoothed.tag.density}). +} +\usage{ +get.smoothed.enrichment.mle(signal.tags, control.tags, bandwidth = 150,tag.shift = 146/2, step = 50) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output + of \code{\link{select.informative.tags}}) } + \item{control.tags}{ control (input) tags } + \item{pseudocount}{ pseudocount value to be added to tag density - + defaults to 1 } + other parameters (such as bandwidth, step and tag.shift) are + passed to \link{get.smoothed.tag.density} - see appropriate reference + for details. +} +\value{ + A list of elements corresponding to chromosomes, with each element + being an $x/$y data.frame giving the position and associated + log2 signal/control enrichment estimate. +} +\seealso{ \code{\link{writewig}} } +\examples{ + # get smoothed enrichment estimate profile using 500bp bandwidth at + # 50bp steps + smoothed.M <- get.smoothed.enrichment.mle(chip.data,bandwidth=500,step=50); + writewig(smoothed.M,"example.smoothedM.wig","Example smoothed log2 intensity ratio estimate"); +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/get.smoothed.tag.density.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,45 @@ +\name{get.smoothed.tag.density} +\alias{get.smoothed.tag.density} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Calculate chromosome-wide profiles of smoothed tag density } +\description{ + Given tag positions, the method calculates for each chromosome a tag + density profile, smoothed by the Gaussian kernel. If the optional + control tags are provided, the difference between ChIP and control tag + density is returned. +} +\usage{ +get.smoothed.tag.density(signal.tags, control.tags = NULL, bandwidth = 150, bg.weight = NULL, tag.shift = 146/2, step = round(bandwidth/3)) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{signal.tags}{ signal chromosome tag coordinate vectors (e.g. output + of \code{\link{select.informative.tags}}) } + \item{control.tags}{ optional control (input) tags } + \item{bandwidth}{ standard deviation of the Gaussian kernel } + \item{bg.weight}{ optional weight by which the background density + should be multiplied for scaling. If not supplied, the weight is + calculated based on the ratio of the reduced ChIP to input dataset sizes. } + \item{tag.shift}{ Distance by which the positive and negative strand + tags should be shifted towards each other. This + normally corresponds to the half of the cross-correlation peak + position (e.g. \code{get.binding.characteristics()}$peak$x/2) } + \item{step}{ The distance between the regularly spaced points for + which the values should be calculated. } + \item{background.density.scaling}{ If TRUE, regions of significant tag + enrichment will be masked out when calculating size ratio of the + signal to control datasets (to estimate ratio of the background tag + density).
If FALSE, the dataset ratio will be equal to the ratio of + the number of tags in each dataset.} +} +\value{ + A list of elements corresponding to chromosomes, with each element + being an $x/$y data.frame giving the position and associated tag + density. Use \code{\link{writewig}} to output the structure to a WIG + file. +} +\seealso{ \code{\link{writewig}} } +\examples{ + smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2)); + writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density"); +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/output.binding.results.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,24 @@ +\name{output.binding.results} +\alias{output.binding.results} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Write out determined binding peaks into a text file table } +\description{ + Writes out determined binding positions into a text file. The file + will contain a table with each row corresponding to a detected + position, with the following columns: + \item{chr}{ chromosome or target sequence } + \item{pos}{ position of detected binding site on the chromosome/sequence} + \item{score}{a score reflecting magnitude of the binding} + \item{Evalue}{E-value corresponding to the peak magnitude} + \item{FDR}{FDR corresponding to the peak magnitude} + \item{enrichment.lb}{lower bound of the fold-enrichment ratio} + \item{enrichment.mle}{maximum likelihood estimate of the fold-enrichment ratio} +} +\usage{ +output.binding.results(results, filename) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{results}{ output of the \code{\link{find.binding.positions}} } + \item{filename}{ file name } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.bam.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,24 @@ +\name{read.bam.tags} +\alias{read.bam.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read BAM alignment file } +\description{ + Reads in aligned reads from BAM file. Note: no split (non-unique) + alignments should be reported in the BAM file. +} +\usage{ +read.bam.tags(filename, read.tag.names = F, fix.chromosome.names = F) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ BAM file } + \item{read.tag.names}{ Whether the tag names should be read in } + \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of + the sequence names } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Number of mismatches } + \item{names }{ Tag names, if \code{read.tag.names} was set } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.bin.maqmap.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,23 @@ +\name{read.bin.maqmap.tags} +\alias{read.bin.maqmap.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read MAQ binary alignment map file } +\description{ + Reads in MAQ binary map alignment result file +} +\usage{ +read.bin.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ MAQ map output file (binary) } + \item{read.tag.names}{ Whether the tag names should be read in } + \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of + the sequence names } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Number of mismatches } + \item{names }{ Tag names, if \code{read.tag.names} was set } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.bowtie.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,23 @@ +\name{read.bowtie.tags} +\alias{read.bowtie.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read bowtie text alignment output file } +\description{ + Reads in bowtie alignment results in text format +} +\usage{ +read.bowtie.tags(filename, read.tag.names = F, fix.chromosome.names = F) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ bowtie text output file } + \item{read.tag.names}{ Whether the tag names should be read in } + \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of + the sequence names } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Number of mismatches } + \item{names }{ Tag names, if \code{read.tag.names} was set } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.eland.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,30 @@ +\name{read.eland.tags} +\alias{read.eland.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read eland output file } +\description{ + Reads in ELAND output file, returning 5'-end tag coordinates and + number of mismatches associated with each mapped tag. +} +\usage{ +read.eland.tags(filename, read.tag.names = F, fix.chromosome.names = T, max.eland.tag.length = -1,extended=F,multi=F) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ ELAND output file } + \item{read.tag.names}{ Whether the tag names should be read in } + \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of + the sequence names } + \item{max.eland.tag.length}{ Specifies max length of the tag sequence + considered by ELAND. This needs to be specified if the tags are + longer than the sequences considered by ELAND during alignment. } + \item{extended}{ Whether the file is written out in "extended" format + provided in GA pipeline 1.0. } + \item{multi}{ Whether the file is written in "multi" format, showing multiple alignments of the reads } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Number of mismatches } + \item{names }{ Tag names, if \code{read.tag.names} was set } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.maqmap.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,23 @@ +\name{read.maqmap.tags} +\alias{read.maqmap.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read MAQ text alignment output file } +\description{ + Reads in MAQ alignment results in text format (that results from "maq mapview" command.) +} +\usage{ +read.maqmap.tags(filename, read.tag.names = F, fix.chromosome.names = T) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ MAQ text output file } + \item{read.tag.names}{ Whether the tag names should be read in } + \item{fix.chromosome.names}{ Whether to remove ".fa" from the end of + the sequence names } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Number of mismatches } + \item{names }{ Tag names, if \code{read.tag.names} was set } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/read.meland.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,29 @@ +\name{read.meland.tags} +\alias{read.meland.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Read modified BED tag alignment file that contains variable match + length information } +\description{ + Reads in an extended BED tag alignment file. An example line given below: + \code{49 . U1 . 1 . . 23 chr2 -234567} + The line above specifies a 23-bp portion of the tag with id 49 was + aligned with 1 mismatch to the negative strand of chr2 at position 234567. +} +\usage{ +read.meland.tags(filename, read.tag.names = F, fix.chromosome.names = T) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{filename}{ name of the extended BED file } + \item{read.tag.names}{ whether to read in tag names } + \item{fix.chromosome.names}{ whether to remove ".fa" from the sequence + name ends. } +} +\value{ + \item{tags }{ A vector of 5' tag coordinates, with negative values + corresponding to tags mapped to the negative strand. } + \item{quality }{ Quality expressed as a float x.y, where x is + tag.length - aligned.tag.portion.length, and y is the number of + mismatches (must be less than 10). } + \item{names }{ Tag names, if \code{read.tag.names} was set } +} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/remove.local.tag.anomalies.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,46 @@ +\name{remove.local.tag.anomalies} +\alias{remove.local.tag.anomalies} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Restrict or remove positions with too many tags relative to + local background. } +\description{ + In Solexa ChIP-seq experiments some anomalous positions contain + extremely high number of tags at the exact coordinates. The function + scans the chromosomes, determining local tag density based on a + provided \code{window.size}, doing two types of corrections: + 1. removing all tags from positions that exceed local density by + \code{eliminate.fold}; 2. reducing the tag count at positions + exceeding \code{cap.fold} to the maximal allowed count. The + statistical significance of counts exceeding either of these two + threshold densities is calculated based on Poisson model, with + confidence interval determined by the \code{z.threshold} Z-score parameter. +} +\usage{ +remove.local.tag.anomalies(tags, window.size = 200, eliminate.fold = 10, cap.fold = 4, z.threshold = 3) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{tags}{ Chromosome-list of tag vectors } + \item{window.size}{ Size of the window used to assess local + density. Increasing the window size considerably beyond the size of + the binding features will result in flattened profiles, with bound + positions exhibiting a difference of just 1 tag beyond the background. 
} + \item{eliminate.fold}{ Threshold defining fold-over background + density above which the position is considered anomalous and removed + completely.} + \item{cap.fold}{ Threshold fold-over background density above which + the position is capped to the maximum statistically likely given + local tag density } + \item{z.threshold}{ Z-score used to assess significance of a given + position exceeding either of the two density thresholds. } +} +\value{ + A modified chromosome-wise tag vector list. +} +\references{ ~put references to the literature/web site here ~ } + +\note{ ~~further notes~~ + Increasing window.size to very large values will result in flat + profiles similar to those described by Zhang et al. "Model-based + Analysis of ChIP-Seq (MACS)." Genome Biol. 2008 Sep 17;9(9):R137. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/select.informative.tags.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,29 @@ +\name{select.informative.tags} +\alias{select.informative.tags} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Choose informative tags } +\description{ + For datasets with tag alignment quality information (e.g. number of + mismatches for Eland alignments), + \code{\link{get.binding.characteristics}} determines whether inclusion + of tags from each specific quality bin improves the cross-correlation + profile. The present function is then used to actually select these + informative tags, discarding all other information, including quality + scores that are not used in further processing. +} +\usage{ +select.informative.tags(data, binding.characteristics) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{data}{ Full alignment data (a list with $tags and $quality elements) } + \item{binding.characteristics}{ result of a + \code{\link{get.binding.characteristics}} call. If NULL value is + supplied, all tags will be accepted. } +} +\value{ + A chromosome-wise tag list. Each element of the list corresponds to a + chromosome and is a numeric vector of 5' tag coordinates, with sign + designating DNA strand. + This form of tag data is used for most of the other processing. +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/spp-package.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,144 @@ +\name{spp-package} +\alias{spp-package} +\alias{spp} +\docType{package} +\title{ +ChIP-seq (Solexa) Processing Pipeline +} +\description{ +A set of routines for reading short sequence alignments, calculating tag +density, estimates of statistically significant enrichment/depletion +along the chromosome, identifying point binding positions (peaks), and +characterizing saturation properties related to sequencing depth. +} +\details{ +\tabular{ll}{ +Package: \tab spp\cr +Type: \tab Package\cr +Version: \tab 1.8\cr +Date: \tab 2008-11-14\cr +License: \tab GPL-2\cr +LazyLoad: \tab yes\cr +} +See example below for typical processing sequence. +} +\author{Peter Kharchenko <peter.kharchenko@post.harvard.edu>} +\references{ +Kharchenko P., Tolstorukov M., Park P. "Design and analysis of ChIP-seq +experiments for DNA-binding proteins." Nature Biotech. doi:10.1038/nbt.1508 +} + +\examples{ + + # load the library + library(spp); + + ## The following section shows how to initialize a cluster of 8 nodes for parallel processing + ## To enable parallel processing, uncomment the next three lines, and comment out "cluster<-NULL"; + ## see "snow" package manual for details. 
+ #library(snow) + #cluster <- makeCluster(2); + #invisible(clusterCall(cluster,source,"routines.r")); + cluster <- NULL; + + + + # read in tag alignments + chip.data <- read.eland.tags("chip.eland.alignment"); + input.data <- read.eland.tags("input.eland.alignment"); + + # get binding info from cross-correlation profile + # srange gives the possible range for the size of the protected region; + # srange should be higher than tag length; making the upper boundary too high will increase calculation time + # + # bin - bin tags within the specified number of basepairs to speed up calculation; + # increasing bin size decreases the accuracy of the determined parameters + binding.characteristics <- get.binding.characteristics(chip.data,srange=c(50,500),bin=5,cluster=cluster); + + + # plot cross-correlation profile + pdf(file="example.crosscorrelation.pdf",width=5,height=5) + par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8); + plot(binding.characteristics$cross.correlation,type='l',xlab="strand shift",ylab="cross-correlation"); + abline(v=binding.characteristics$peak$x,lty=2,col=2) + dev.off(); + + # select informative tags based on the binding characteristics + chip.data <- select.informative.tags(chip.data,binding.characteristics); + input.data <- select.informative.tags(input.data,binding.characteristics); + + # restrict or remove positions with anomalous number of tags relative + # to the local density + chip.data <- remove.local.tag.anomalies(chip.data); + input.data <- remove.local.tag.anomalies(input.data); + + + # output smoothed tag density (subtracting re-scaled input) into a WIG file + # note that the tags are shifted by half of the peak separation distance + smoothed.density <- get.smoothed.tag.density(chip.data,control.tags=input.data,bandwidth=200,step=100,tag.shift=round(binding.characteristics$peak$x/2)); + writewig(smoothed.density,"example.density.wig","Example smoothed, background-subtracted tag density"); + rm(smoothed.density); + + # output 
conservative enrichment estimates + # alpha specifies significance level at which confidence intervals will be estimated + enrichment.estimates <- get.conservative.fold.enrichment.profile(chip.data,input.data,fws=2*binding.characteristics$whs,step=100,alpha=0.01); + writewig(enrichment.estimates,"example.enrichment.estimates.wig","Example conservative fold-enrichment/depletion estimates shown on log2 scale"); + rm(enrichment.estimates); + + + # binding detection parameters + # desired FDR. Alternatively, an E-value can be supplied to the method calls below instead of the fdr parameter + fdr <- 1e-2; + # the binding.characteristics contains the optimized half-size for binding detection window + detection.window.halfsize <- binding.characteristics$whs; + + # determine binding positions using wtd method + bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize,cluster=cluster) + + # alternatively determined binding positions using lwcc method (note: this takes longer than wtd) + # bp <- find.binding.positions(signal.data=chip.data,control.data=input.data,fdr=fdr,method=tag.lwcc,whs=detection.window.halfsize,cluster=cluster) + + print(paste("detected",sum(unlist(lapply(bp$npl,function(d) length(d$x)))),"peaks")); + + # output detected binding positions + output.binding.results(bp,"example.binding.positions.txt"); + + + # ------------------------------------------------------------------------------------------- + # the set of commands in the following section illustrates methods for saturation analysis + # these are separated from the previous section, since they are highly CPU intensive + # ------------------------------------------------------------------------------------------- + + # determine MSER + # note: this will take approximately 10-15x the amount of time the initial binding detection did + # The saturation criteria here is 0.99 consistency in the set of binding positions when adding 1e5 
tags. + # To ensure convergence the number of subsampled chains (n.chains) should be higher (80) + mser <- get.mser(chip.data,input.data,step.size=1e5,test.agreement=0.99,n.chains=8,cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) + + print(paste("MSER at a current depth is",mser)); + + # note: an MSER value of 1 or very near one implies that the set of detected binding positions satisfies saturation criteria without + # additional selection by fold-enrichment ratios. In other words, the dataset has reached saturation in a traditional sense (absolute saturation). + + # interpolate MSER dependency on tag count + # note: this requires considerably more calculations than the previous steps (~ 3x more than the first MSER calculation) + # Here we interpolate MSER dependency to determine a point at which MSER of 2 is reached + # The interpolation will be based on the difference in MSER at the current depth, and a depth at 5e5 fewer tags (n.steps=6); + # evaluation of the intermediate points is omitted here to speed up the calculation (excluded.steps parameter) + # A total of 7 chains is used here to speed up calculation, whereas a higher number of chains (50) would give good convergence + msers <- get.mser.interpolation(chip.data,input.data,step.size=1e5,test.agreement=0.99, target.fold.enrichment=2, n.chains=7,n.steps=6,excluded.steps=c(2:4),cluster=cluster,fdr=fdr,method=tag.wtd,whs=detection.window.halfsize) + + print(paste("predicted sequencing depth =",round(unlist(lapply(msers,function(x) x$prediction))/1e6,5)," million tags")) + + + # note: the interpolation will return NA prediction if the dataset has reached absolute saturation at the current depth. 
+ # note: use return.chains=T to also return the calculated random chains (returned under msers$chains field) - these can be passed back as + # "get.mser.interpolation( ..., chains=msers$chains)" to calculate predictions for another target.fold.enrichment value + # without having to recalculate the random chain predictions. + + ## stop cluster if it was initialized + #stopCluster(cluster); + + + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/write.broadpeak.info.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,16 @@ +\name{write.broadpeak.info} +\alias{write.broadpeak.info} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Write out determined broad enrichment regions using broadPeak format } +\description{ + Writes out broad regions of enrichment determined by the + get.broad.enrichment.clusters method in a broadPeak format. +} +\usage{ +write.broadpeak.info(broadpeak.results, filename) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{broadpeak.results}{ output of the \code{\link{get.broad.enrichment.clusters}} } + \item{filename}{ file name } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/write.narrowpeak.binding.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,21 @@ +\name{write.narrowpeak.binding} +\alias{write.narrowpeak.binding} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ Write out determined binding peaks using narrowPeak format } +\description{ + Writes out determined binding positions into a narrowPeak file. + The region will correspond to associated broad enrichment region, if + such were added using add.broad.peak.regions method. Otherwise the + region size will be determined using margin (which defaults to the + window half size that was used to determine binding positions) +} +\usage{ +write.narrowpeak.binding(results, filename,margin=results$whs) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{results}{ output of the \code{\link{find.binding.positions}} } + \item{filename}{ file name } + \item{margin}{ explicit value of the margin to be used if the broad + region information is absent (defaults to peak detection window half-size) } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/man/writewig.Rd Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,31 @@ +\name{writewig} +\alias{writewig} +%- Also NEED an '\alias' for EACH other topic documented here. +\title{ A function to save a list of chromosome-wise x/y data frames + into a WIG file format. } +\description{ + Takes a list that contains an $x and $y data.frame for a number of + chromosomes and writes it out to a WIG BED style format. +} +\usage{ +writewig(dat, fname, feature, threshold = 5, zip = F) +} +%- maybe also 'usage' for other objects documented here. +\arguments{ + \item{dat}{ Chromosome coordinate-value data. \code{dat} is a list, + each member of a list is a data frame with $x and $y columns + containing chromosome positions and associated values. The names of + the list elements correspond to the chromosomes. } + \item{fname}{ Filename to which the output should be written } + \item{feature}{ Data description to be incorporated into the WIG header } + \item{threshold}{ Optional threshold to be saved in the WIG file} + \item{zip}{ Whether to invoke a zip program to compress the file } +} + +\seealso{ ~~objects to See Also as \code{\link{help}}, ~~~ } +\examples{ + +data <- list("chr1"=data.frame(x=c(100,130,200),y=c(1.2,4.0,2.3))); +writewig(data,"filename"); + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BGZF.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,398 @@ +// *************************************************************************** +// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading & writing BGZF files +// *************************************************************************** + +#include <BGZF.h> +using namespace BamTools; + +#include <algorithm> +using namespace std; + +BgzfData::BgzfData(void) + : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , IsWriteOnly(false) + , IsWriteUncompressed(false) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) +{ + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch( std::bad_alloc& ba ) { + fprintf(stderr, "BGZF ERROR: unable to allocate memory for our BGZF object.\n"); + exit(1); + } +} + +// destructor +BgzfData::~BgzfData(void) { + if( CompressedBlock ) delete[] CompressedBlock; + if( UncompressedBlock ) delete[] UncompressedBlock; +} + +// closes BGZF file +void BgzfData::Close(void) { + + // skip if file not open, otherwise set flag + if ( !IsOpen ) return; + + // if writing to file, flush the current BGZF block, + // then write an empty block (as EOF marker) + if ( IsWriteOnly ) { + FlushBlock(); + int blockLength = DeflateBlock(); + fwrite(CompressedBlock, 1, 
blockLength, Stream); + } + + // flush and close + fflush(Stream); + fclose(Stream); + IsWriteUncompressed = false; + IsOpen = false; +} + +// compresses the current block +int BgzfData::DeflateBlock(void) { + + // initialize the gzip header + char* buffer = CompressedBlock; + memset(buffer, 0, 18); + buffer[0] = GZIP_ID1; + buffer[1] = (char)GZIP_ID2; + buffer[2] = CM_DEFLATE; + buffer[3] = FLG_FEXTRA; + buffer[9] = (char)OS_UNKNOWN; + buffer[10] = BGZF_XLEN; + buffer[12] = BGZF_ID1; + buffer[13] = BGZF_ID2; + buffer[14] = BGZF_LEN; + + // set compression level + const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); + + // loop to retry for blocks that do not compress enough + int inputLength = BlockOffset; + int compressedLength = 0; + unsigned int bufferSize = CompressedBlockSize; + + while ( true ) { + + // initialize zstream values + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { + fprintf(stderr, "BGZF ERROR: zlib deflate initialization failed.\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if ( status != Z_STREAM_END ) { + + deflateEnd(&zs); + + // reduce the input length and try again + if ( status == Z_OK ) { + inputLength -= 1024; + if( inputLength < 0 ) { + fprintf(stderr, "BGZF ERROR: input reduction failed.\n"); + exit(1); + } + continue; + } + + fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); + exit(1); + } + + // finalize the compression routine + if ( deflateEnd(&zs) != Z_OK ) { + fprintf(stderr, "BGZF ERROR: zlib::deflateEnd() failed.\n"); + exit(1); + } + + compressedLength = zs.total_out; + 
compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; + if ( compressedLength > MAX_BLOCK_SIZE ) { + fprintf(stderr, "BGZF ERROR: deflate overflow.\n"); + exit(1); + } + + break; + } + + // store the compressed length + BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); + BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); + BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = BlockOffset - inputLength; + if ( remaining > 0 ) { + if ( remaining > inputLength ) { + fprintf(stderr, "BGZF ERROR: after deflate, remainder too large.\n"); + exit(1); + } + memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); + } + + BlockOffset = remaining; + return compressedLength; +} + +// flushes the data in the BGZF block +void BgzfData::FlushBlock(void) { + + // flush all of the remaining blocks + while ( BlockOffset > 0 ) { + + // compress the data block + int blockLength = DeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); + + if ( numBytesWritten != blockLength ) { + fprintf(stderr, "BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); + exit(1); + } + + BlockAddress += blockLength; + } +} + +// de-compresses the current block +int BgzfData::InflateBlock(const int& blockLength) { + + // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)CompressedBlock + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)UncompressedBlock; + zs.avail_out = UncompressedBlockSize; + + int status = inflateInit2(&zs, GZIP_WINDOW_BITS); + if ( status != Z_OK ) { + 
fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); + return -1; + } + + status = inflate(&zs, Z_FINISH); + if ( status != Z_STREAM_END ) { + inflateEnd(&zs); + fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); + return -1; + } + + status = inflateEnd(&zs); + if ( status != Z_OK ) { + fprintf(stderr, "BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); + return -1; + } + + return zs.total_out; +} + +// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) +bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { + + // determine open mode + if ( strcmp(mode, "rb") == 0 ) + IsWriteOnly = false; + else if ( strcmp(mode, "wb") == 0) + IsWriteOnly = true; + else { + fprintf(stderr, "BGZF ERROR: unknown file mode: %s\n", mode); + return false; + } + + // ---------------------------------------------------------------- + // open Stream to read to/write from file, stdin, or stdout + // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) + + // read/write BGZF data to/from a file + if ( (filename != "stdin") && (filename != "stdout") ) + Stream = fopen(filename.c_str(), mode); + + // read BGZF data from stdin + else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) + Stream = freopen(NULL, mode, stdin); + + // write BGZF data to stdout + else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) + Stream = freopen(NULL, mode, stdout); + + if ( !Stream ) { + fprintf(stderr, "BGZF ERROR: unable to open file %s\n", filename.c_str() ); + return false; + } + + // set flags, return success + IsOpen = true; + IsWriteUncompressed = isWriteUncompressed; + return true; +} + +// reads BGZF data into a byte buffer +int BgzfData::Read(char* data, const unsigned int dataLength) { + + if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; + + char* output = data; + unsigned int numBytesRead = 0; + 
while ( numBytesRead < dataLength ) { + + int bytesAvailable = BlockLength - BlockOffset; + if ( bytesAvailable <= 0 ) { + if ( !ReadBlock() ) return -1; + bytesAvailable = BlockLength - BlockOffset; + if ( bytesAvailable <= 0 ) break; + } + + char* buffer = UncompressedBlock; + int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); + memcpy(output, buffer + BlockOffset, copyLength); + + BlockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } + + if ( BlockOffset == BlockLength ) { + BlockAddress = ftell64(Stream); + BlockOffset = 0; + BlockLength = 0; + } + + return numBytesRead; +} + +// reads a BGZF block +bool BgzfData::ReadBlock(void) { + + char header[BLOCK_HEADER_LENGTH]; + int64_t blockAddress = ftell64(Stream); + + int count = fread(header, 1, sizeof(header), Stream); + if ( count == 0 ) { + BlockLength = 0; + return true; + } + + if ( count != sizeof(header) ) { + fprintf(stderr, "BGZF ERROR: read block failed - could not read block header\n"); + return false; + } + + if ( !BgzfData::CheckBlockHeader(header) ) { + fprintf(stderr, "BGZF ERROR: read block failed - invalid block header\n"); + return false; + } + + int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; + char* compressedBlock = CompressedBlock; + memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); + int remaining = blockLength - BLOCK_HEADER_LENGTH; + + count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); + if ( count != remaining ) { + fprintf(stderr, "BGZF ERROR: read block failed - could not read data from block\n"); + return false; + } + + count = InflateBlock(blockLength); + if ( count < 0 ) { + fprintf(stderr, "BGZF ERROR: read block failed - could not decompress block data\n"); + return false; + } + + if ( BlockLength != 0 ) + BlockOffset = 0; + + BlockAddress = blockAddress; + BlockLength = count; + return true; +} + +// seek to position in BGZF file +bool BgzfData::Seek(int64_t position) { + + if ( 
!IsOpen ) return false; + + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + + if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { + fprintf(stderr, "BGZF ERROR: unable to seek in file\n"); + return false; + } + + BlockLength = 0; + BlockAddress = blockAddress; + BlockOffset = blockOffset; + return true; +} + +// get file position in BGZF file +int64_t BgzfData::Tell(void) { + if ( !IsOpen ) + return false; + else + return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); +} + +// writes the supplied data into the BGZF buffer +unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { + + if ( !IsOpen || !IsWriteOnly ) return false; + + // initialize + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = UncompressedBlockSize; + + // copy the data to the buffer + while ( numBytesWritten < dataLen ) { + + unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); + char* buffer = UncompressedBlock; + memcpy(buffer + BlockOffset, input, copyLength); + + BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + if ( BlockOffset == blockLength ) + FlushBlock(); + } + + return numBytesWritten; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BGZF.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,322 @@ +// *************************************************************************** +// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading & writing BGZF files +// *************************************************************************** + +#ifndef BGZF_H +#define BGZF_H + +#include <api_global.h> +#include <zlib.h> + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <string> + +// Platform-specific large-file support +#ifndef BAMTOOLS_LFS +#define BAMTOOLS_LFS + #ifdef WIN32 + #define ftell64(a) _ftelli64(a) + #define fseek64(a,b,c) _fseeki64(a,b,c) + #else + #define ftell64(a) ftello(a) + #define fseek64(a,b,c) fseeko(a,b,c) + #endif +#endif // BAMTOOLS_LFS + +// Platform-specific type definitions +#ifndef BAMTOOLS_TYPES +#define BAMTOOLS_TYPES + #ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; + #else + #include <stdint.h> + #endif +#endif // BAMTOOLS_TYPES + +namespace BamTools { + +// zlib constants +const int GZIP_ID1 = 31; +const int GZIP_ID2 = 139; +const int CM_DEFLATE = 8; +const int FLG_FEXTRA = 4; +const int OS_UNKNOWN = 255; +const int BGZF_XLEN = 6; +const int BGZF_ID1 = 66; +const int BGZF_ID2 = 67; +const int BGZF_LEN = 2; +const 
int GZIP_WINDOW_BITS = -15; +const int Z_DEFAULT_MEM_LEVEL = 8; + +// BZGF constants +const int BLOCK_HEADER_LENGTH = 18; +const int BLOCK_FOOTER_LENGTH = 8; +const int MAX_BLOCK_SIZE = 65536; +const int DEFAULT_BLOCK_SIZE = 65536; + +struct API_EXPORT BgzfData { + + // data members + public: + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + bool IsWriteOnly; + bool IsWriteUncompressed; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; + + // constructor & destructor + public: + BgzfData(void); + ~BgzfData(void); + + // main interface methods + public: + // closes BGZF file + void Close(void); + // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) + bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); + // reads BGZF data into a byte buffer + int Read(char* data, const unsigned int dataLength); + // seek to position in BGZF file + bool Seek(int64_t position); + // get file position in BGZF file + int64_t Tell(void); + // writes the supplied data into the BGZF buffer + unsigned int Write(const char* data, const unsigned int dataLen); + + // internal methods + private: + // compresses the current block + int DeflateBlock(void); + // flushes the data in the BGZF block + void FlushBlock(void); + // de-compresses the current block + int InflateBlock(const int& blockLength); + // reads a BGZF block + bool ReadBlock(void); + + // static 'utility' methods + public: + // checks BGZF block header + static inline bool CheckBlockHeader(char* header); + // packs an unsigned integer into the specified buffer + static inline void PackUnsignedInt(char* buffer, unsigned int value); + // packs an unsigned short into the specified buffer + static inline void PackUnsignedShort(char* buffer, unsigned short value); + // unpacks a buffer into a double + static inline double 
UnpackDouble(char* buffer); + static inline double UnpackDouble(const char* buffer); + // unpacks a buffer into a float + static inline float UnpackFloat(char* buffer); + static inline float UnpackFloat(const char* buffer); + // unpacks a buffer into a signed int + static inline signed int UnpackSignedInt(char* buffer); + static inline signed int UnpackSignedInt(const char* buffer); + // unpacks a buffer into a signed short + static inline signed short UnpackSignedShort(char* buffer); + static inline signed short UnpackSignedShort(const char* buffer); + // unpacks a buffer into an unsigned int + static inline unsigned int UnpackUnsignedInt(char* buffer); + static inline unsigned int UnpackUnsignedInt(const char* buffer); + // unpacks a buffer into an unsigned short + static inline unsigned short UnpackUnsignedShort(char* buffer); + static inline unsigned short UnpackUnsignedShort(const char* buffer); +}; + +// ------------------------------------------------------------- +// static 'utility' method implementations + +// checks BGZF block header +inline +bool BgzfData::CheckBlockHeader(char* header) { + return (header[0] == GZIP_ID1 && + header[1] == (char)GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & FLG_FEXTRA) != 0 && + BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && + header[12] == BGZF_ID1 && + header[13] == BGZF_ID2 && + BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); +} + +// 'packs' an unsigned integer into the specified buffer +inline +void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +// 'packs' an unsigned short into the specified buffer +inline +void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} + +// 'unpacks' a buffer into a double (includes both non-const & const char* flavors) +inline 
+double BgzfData::UnpackDouble(char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +inline +double BgzfData::UnpackDouble(const char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +// 'unpacks' a buffer into a float (includes both non-const & const char* flavors) +inline +float BgzfData::UnpackFloat(char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +inline +float BgzfData::UnpackFloat(const char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors) +inline +signed int BgzfData::UnpackSignedInt(char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +inline +signed int BgzfData::UnpackSignedInt(const 
char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// 'unpacks' a buffer into a signed short (includes both non-const & const char* flavors) +inline +signed short BgzfData::UnpackSignedShort(char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +inline +signed short BgzfData::UnpackSignedShort(const char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors) +inline +unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +inline +unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors) +inline +unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +inline 
+unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +} // namespace BamTools + +#endif // BGZF_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamAlignment.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,696 @@ +// *************************************************************************** +// BamAlignment.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 13 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#include <BamAlignment.h> +using namespace BamTools; + +#include <cctype> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <exception> +#include <map> +#include <utility> +using namespace std; + +// default ctor +BamAlignment::BamAlignment(void) + : RefID(-1) + , Position(-1) + , MateRefID(-1) + , MatePosition(-1) + , InsertSize(0) +{ } + +// copy ctor +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) + , SupportData(other.SupportData) +{ } + +// dtor +BamAlignment::~BamAlignment(void) { } + +// Queries against alignment flags +bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } +bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } +bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & READ_1) != 0 ); } +bool BamAlignment::IsMapped(void) const { return ( 
(AlignmentFlag & UNMAPPED) == 0 ); } +bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } +bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } +bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } +bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } +bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } +bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } +bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } + +// Manipulate alignment flags +void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } +void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } +void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } +void BamAlignment::SetIsMapped(bool ok) { SetIsUnmapped(!ok); } +void BamAlignment::SetIsMateMapped(bool ok) { SetIsMateUnmapped(!ok); } +void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } +void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } +void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } +void BamAlignment::SetIsPrimaryAlignment(bool ok) { SetIsSecondaryAlignment(!ok); } +void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } +void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } +void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag 
|= SECONDARY; else AlignmentFlag &= ~SECONDARY; } +void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } +void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } + +// calculates alignment end position, based on starting position and CIGAR operations +int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) + alignEnd += (*cigarIter).Length; + else if ( usePadded && cigarType == 'I' ) + alignEnd += (*cigarIter).Length; + } + + // adjust for zeroBased, if necessary + if (zeroBased) + return alignEnd - 1; + else + return alignEnd; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, copy tag data to temp buffer + string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + 
tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const int32_t& value) { + return AddTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::AddTag(const string& tag, const string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return 
false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; + + // otherwise, convert value to string + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const string& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type != "Z" && type != "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + value.size()]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = 
numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + const unsigned int dataLength = strlen(value.c_str()); + memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const uint32_t& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "f" || type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired 
tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const int32_t& value) { + return EditTag(tag, type, (const uint32_t&)value); +} + +bool BamAlignment::EditTag(const string& tag, const string& type, const float& value) { + + if ( SupportData.HasCoreOnly ) return false; + if ( tag.size() != 2 || type.size() != 1 ) return false; + if ( type == "Z" || type == "H" ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return 
success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new VALUE in place of current tag data + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +// get "NM" tag data - originally contributed by Aaron Quinlan +// stores data in 'editDistance', returns success/fail +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { + return GetTag("NM", (uint32_t&)editDistance); +} + +// get "RG" tag data +// stores data in 'readGroup', returns success/fail +bool BamAlignment::GetReadGroup(string& readGroup) const { + return GetTag("RG", readGroup); +} + +bool BamAlignment::GetTag(const string& tag, string& destination) const { + + // make 
sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy( (char*)destination.data(), pTagData, dataLength ); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTag(const string& tag, uint32_t& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type for integer destination (float or var-length strings) + case 'f': + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; 
+} + +bool BamAlignment::GetTag(const string& tag, int32_t& destination) const { + return GetTag(tag, (uint32_t&)destination); +} + +bool BamAlignment::GetTag(const string& tag, float& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found, determine data byte-length, store data in readGroup, return success + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch(type) { + + // 1 byte data + case 'A': + case 'c': + case 'C': + destinationLength = 1; + break; + + // 2 byte data + case 's': + case 'S': + destinationLength = 2; + break; + + // 4 byte data + case 'f': + case 'i': + case 'I': + destinationLength = 4; + break; + + // unsupported type (var-length strings) + case 'Z': + case 'H': + fprintf(stderr, "ERROR: Cannot store tag of type %c in integer destination\n", type); + return false; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + + // store in destination + destination = 0.0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::GetTagType(const string& tag, char& type) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // lookup tag + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // retrieve tag type code + type = *(pTagData - 1); + + // validate that type is a proper BAM tag type 
+ switch(type) { + case 'A': + case 'c': + case 'C': + case 's': + case 'S': + case 'f': + case 'i': + case 'I': + case 'Z': + case 'H': + return true; + + // unknown tag type + default: + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", type); + return false; + } + } + + // tag not found, return failure + return false; +} + +bool BamAlignment::RemoveTag(const string& tag) { + + // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed + // also, return false if no data present to remove + if ( SupportData.HasCoreOnly || TagData.empty() ) return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found, store data in readGroup, return success + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + char newTagData[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + + // save new tag data + TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); + return true; + } + + // tag 
not found, no removal - return failure + return false; +} + +bool BamAlignment::FindTag(const string& tag, + char* &pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) +{ + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; + } + + // checked all tags, none match + return false; +} + +bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { + + switch(storageType) { + + case 'A': + case 'c': + case 'C': + ++numBytesParsed; + ++pTagData; + break; + + case 's': + case 'S': + numBytesParsed += 2; + pTagData += 2; + break; + + case 'f': + case 'i': + case 'I': + numBytesParsed += 4; + pTagData += 4; + break; + + case 'Z': + case 'H': + while(*pTagData) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + default: + // error case + fprintf(stderr, "ERROR: Unknown tag storage class encountered: [%c]\n", storageType); + return false; + } + + // return success + return true; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamAlignment.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,203 @@
// ***************************************************************************
// BamAlignment.h (c) 2009 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 13 December 2010 (DB)
// ---------------------------------------------------------------------------
// Provides the BamAlignment data structure
// ***************************************************************************

#ifndef BAMALIGNMENT_H
#define BAMALIGNMENT_H

#include <api_global.h>
#include <BamAux.h>
#include <string>
#include <vector>

namespace BamTools {

// forward declare BamAlignment's friend classes
namespace Internal {
    class BamReaderPrivate;
    class BamWriterPrivate;
} // namespace Internal

// BamAlignment data structure
// explicitly labeled as 'struct' to indicate that (most of) its fields are public
struct API_EXPORT BamAlignment {

    // constructors & destructor
    public:
        BamAlignment(void);
        BamAlignment(const BamAlignment& other);
        ~BamAlignment(void);

    // Queries against alignment flags
    public:
        bool IsDuplicate(void) const;         // Returns true if this read is a PCR duplicate
        bool IsFailedQC(void) const;          // Returns true if this read failed quality control
        bool IsFirstMate(void) const;         // Returns true if alignment is first mate on read
        bool IsMapped(void) const;            // Returns true if alignment is mapped
        bool IsMateMapped(void) const;        // Returns true if alignment's mate is mapped
        bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand
        bool IsPaired(void) const;            // Returns true if alignment part of paired-end read
        bool IsPrimaryAlignment(void) const;  // Returns true if reported position is primary alignment
        bool IsProperPair(void) const;        // Returns true if alignment is part of read that satisfied paired-end resolution
        bool IsReverseStrand(void) const;     // Returns true if alignment mapped to reverse strand
        bool IsSecondMate(void) const;        // Returns true if alignment is second mate on read

    // Manipulate alignment flags
    public:
        void SetIsDuplicate(bool ok);         // Sets "PCR duplicate" flag
        void SetIsFailedQC(bool ok);          // Sets "failed quality control" flag
        void SetIsFirstMate(bool ok);         // Sets "alignment is first mate" flag
        void SetIsMapped(bool ok);            // Sets "alignment is mapped" flag
        void SetIsMateMapped(bool ok);        // Sets "alignment's mate is mapped" flag
        void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag
        void SetIsPaired(bool ok);            // Sets "alignment part of paired-end read" flag
        void SetIsPrimaryAlignment(bool ok);  // Sets "position is primary alignment" flag
        void SetIsProperPair(bool ok);        // Sets "alignment is part of read that satisfied paired-end resolution" flag
        void SetIsReverseStrand(bool ok);     // Sets "alignment mapped to reverse strand" flag
        void SetIsSecondMate(bool ok);        // Sets "alignment is second mate on read" flag

        // legacy methods (deprecated, but available)
        void SetIsMateUnmapped(bool ok);       // Complement of IsMateMapped() flag
        void SetIsSecondaryAlignment(bool ok); // Complement of IsPrimaryAlignment() flag
        void SetIsUnmapped(bool ok);           // Complement of IsMapped() flag

    // Tag data access methods
    public:
        // -------------------------------------------------------------------------------------
        // N.B. - The following tag access methods may not be used on BamAlignments fetched
        // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in
        // an error message (to keep output clean) but will ALWAYS return false. Only user-created
        // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here.

        // add tag data (create new TAG entry with TYPE and VALUE)
        // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
        // returns true if new data added, false if error or TAG already exists
        // N.B. - will NOT modify existing tag. Use EditTag() instead
        // @tag   - two character tag name
        // @type  - single character tag type (see SAM/BAM spec for details)
        // @value - value to associate with tag
        bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
        bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value);    // type must be A or i
        bool AddTag(const std::string& tag, const std::string& type, const int32_t& value);     // type must be A or i
        bool AddTag(const std::string& tag, const std::string& type, const float& value);       // type must be A, i, or f

        // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present)
        // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details
        // returns true if edit was successful, false if error
        // @tag   - two character tag name
        // @type  - single character tag type (see SAM/BAM spec for details)
        // @value - new value for tag
        bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H
        bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value);    // type must be A or i
        bool EditTag(const std::string& tag, const std::string& type, const int32_t& value);     // type must be A or i
        bool EditTag(const std::string& tag, const std::string& type, const float& value);       // type must be A, i, or f

        // specific tag data access methods - these only remain for legacy support
        // returns whether specific tag could be retrieved
        bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (equivalent to GetTag("NM", editDistance))
        bool GetReadGroup(std::string& readGroup) const;    // get "RG" tag data (equivalent to GetTag("RG", readGroup))

        // generic tag data access methods
        // returns whether tag is found & tag type is compatible with DESTINATION
        // @tag         - two character tag name
        // @destination - if found, tag value is stored here
        bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings
        bool GetTag(const std::string& tag, uint32_t& destination) const;    // access unsigned integer data
        bool GetTag(const std::string& tag, int32_t& destination) const;     // access signed integer data
        bool GetTag(const std::string& tag, float& destination) const;       // access floating point data

        // retrieve the tag type code for TAG
        // returns true if tag could be found and type determined
        bool GetTagType(const std::string& tag, char& type) const;

        // remove tag data
        // returns true if removal was successful, false if error
        // N.B. - returns false if TAG does not exist (no removal can occur)
        // @tag - two character tag name
        bool RemoveTag(const std::string& tag);

    // Additional data access methods
    public:
        // calculates & returns alignment end position, based on starting position and CIGAR operations
        // @usePadded - if true, counts inserted bases. Default is false, so that alignment end position matches the last base's position in reference
        // @zeroBased - if true, returns 0-based coordinate; else returns 1-based. Setting this to false is useful when using BAM data along with other, half-open formats.
        int GetEndPosition(bool usePadded = false, bool zeroBased = true) const;

    // 'internal' utility methods
    private:
        // locate TAG inside a raw tag block; on success pTagData is left at the tag's value
        static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed);
        // advance pTagData/numBytesParsed past one tag value of the given storage type
        static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed);

    // Data members
    public:
        std::string Name;               // Read name
        int32_t     Length;             // Query length
        std::string QueryBases;         // 'Original' sequence (as reported from sequencing machine)
        std::string AlignedBases;       // 'Aligned' sequence (includes any indels, padding, clipping)
        std::string Qualities;          // FASTQ qualities (ASCII characters, not numeric values)
        std::string TagData;            // Tag data (accessor methods will pull the requested information out)
        int32_t     RefID;              // ID number for reference sequence
        int32_t     Position;           // Position (0-based) where alignment starts
        uint16_t    Bin;                // Bin in BAM file where this alignment resides
        uint16_t    MapQuality;         // Mapping quality score
        uint32_t    AlignmentFlag;      // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate
        std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
        int32_t     MateRefID;          // ID number for reference sequence where alignment's mate was aligned
        int32_t     MatePosition;       // Position (0-based) where alignment's mate starts
        int32_t     InsertSize;         // Mate-pair insert size

    // Internal data, inaccessible to client code
    // but available to BamReaderPrivate & BamWriterPrivate
    private:
        struct BamAlignmentSupportData {

            // data members
            std::string AllCharData;
            uint32_t    BlockLength;
            uint32_t    NumCigarOperations;
            uint32_t    QueryNameLength;
            uint32_t    QuerySequenceLength;
            bool        HasCoreOnly;

            // constructor - all counters start at zero, HasCoreOnly starts false
            BamAlignmentSupportData(void)
                : BlockLength(0)
                , NumCigarOperations(0)
                , QueryNameLength(0)
                , QuerySequenceLength(0)
                , HasCoreOnly(false)
            { }
        };
        BamAlignmentSupportData SupportData;
        friend class Internal::BamReaderPrivate;
        friend class Internal::BamWriterPrivate;

    // Alignment flag query constants (bit masks applied to AlignmentFlag)
    // Use the get/set methods above instead
    private:
        enum { PAIRED        = 1
             , PROPER_PAIR   = 2
             , UNMAPPED      = 4
             , MATE_UNMAPPED = 8
             , REVERSE       = 16
             , MATE_REVERSE  = 32
             , READ_1        = 64
             , READ_2        = 128
             , SECONDARY     = 256
             , QC_FAILED     = 512
             , DUPLICATE     = 1024
             };
};

// convenience typedef(s)
typedef std::vector<BamAlignment> BamAlignmentVector;

} // namespace BamTools

#endif // BAMALIGNMENT_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamAux.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,227 @@ +// *************************************************************************** +// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic constants, data structures, utilities etc. +// used throughout the API for handling BAM files +// *************************************************************************** + +#ifndef BAMAUX_H +#define BAMAUX_H + +#include <api_global.h> + +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +// Platform-specific large-file support +#ifndef BAMTOOLS_LFS +#define BAMTOOLS_LFS + #ifdef WIN32 + #define ftell64(a) _ftelli64(a) + #define fseek64(a,b,c) _fseeki64(a,b,c) + #else + #define ftell64(a) ftello(a) + #define fseek64(a,b,c) fseeko(a,b,c) + #endif +#endif // BAMTOOLS_LFS + +// Platform-specific type definitions +#ifndef BAMTOOLS_TYPES +#define BAMTOOLS_TYPES + #ifdef _MSC_VER + typedef char int8_t; + typedef unsigned char uint8_t; + typedef short int16_t; + typedef unsigned short uint16_t; + typedef int int32_t; + typedef unsigned int uint32_t; + typedef long long int64_t; + typedef unsigned long long uint64_t; + #else + #include <stdint.h> + #endif +#endif // BAMTOOLS_TYPES + +namespace BamTools { + +// ---------------------------------------------------------------- +// ---------------------------------------------------------------- +// BAM constants + +const int BAM_CMATCH = 0; +const int BAM_CINS = 1; +const int BAM_CDEL = 2; +const int BAM_CREF_SKIP = 3; +const int BAM_CSOFT_CLIP = 4; +const int BAM_CHARD_CLIP = 5; +const int BAM_CPAD = 6; +const int BAM_CIGAR_SHIFT = 4; +const int 
BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); +const int BAM_CORE_SIZE = 32; +const int BT_SIZEOF_INT = 4; + +// ---------------------------------------------------------------- +// ---------------------------------------------------------------- +// Data structs & typedefs + +// CIGAR operation data structure +struct API_EXPORT CigarOp { + + // data members + char Type; // Operation type (MIDNSHP) + uint32_t Length; // Operation length (number of bases) + + // constructor + CigarOp(const char type = '\0', + const uint32_t length = 0) + : Type(type) + , Length(length) + { } +}; + +// Reference data entry +struct API_EXPORT RefData { + + // data members + std::string RefName; // Name of reference sequence + int32_t RefLength; // Length of reference sequence + bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence + + // constructor + RefData(const int32_t& length = 0, + bool ok = false) + : RefLength(length) + , RefHasAlignments(ok) + { } +}; +typedef std::vector<RefData> RefVector; + +// General (sequential) genome region +struct API_EXPORT BamRegion { + + // data members + int LeftRefID; + int LeftPosition; + int RightRefID; + int RightPosition; + + // constructor + BamRegion(const int& leftID = -1, + const int& leftPos = -1, + const int& rightID = -1, + const int& rightPos = -1) + : LeftRefID(leftID) + , LeftPosition(leftPos) + , RightRefID(rightID) + , RightPosition(rightPos) + { } + + // copy constructor + BamRegion(const BamRegion& other) + : LeftRefID(other.LeftRefID) + , LeftPosition(other.LeftPosition) + , RightRefID(other.RightRefID) + , RightPosition(other.RightPosition) + { } + + // member functions + void clear(void) { LeftRefID = -1; LeftPosition = -1; RightRefID = -1; RightPosition = -1; } + bool isLeftBoundSpecified(void) const { return ( LeftRefID >= 0 && LeftPosition >= 0 ); } + bool isNull(void) const { return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); } + bool isRightBoundSpecified(void) const { 
return ( RightRefID >= 0 && RightPosition >= 0 ); } +}; + +// ---------------------------------------------------------------- +// ---------------------------------------------------------------- +// General utilities + +// returns true if system is big endian +inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +// swaps endianness of 16-bit value 'in place' +inline void SwapEndian_16(int16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +inline void SwapEndian_16(uint16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +// swaps endianness of 32-bit value 'in-place' +inline void SwapEndian_32(int32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +inline void SwapEndian_32(uint32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +// swaps endianness of 64-bit value 'in-place' +inline void SwapEndian_64(int64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +inline void SwapEndian_64(uint64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +// swaps endianness of 'next 2 bytes' in a char buffer (in-place) +inline void SwapEndian_16p(char* data) { + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +// swaps endianness of 'next 4 bytes' in a char buffer (in-place) +inline void SwapEndian_32p(char* data) { + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +// swaps endianness of 'next 8 bytes' in a char buffer (in-place) 
+inline void SwapEndian_64p(char* data) { + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + +// returns whether file exists (can be opened OK) +inline bool FileExists(const std::string& filename) { + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); +} + +} // namespace BamTools + +#endif // BAMAUX_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamIndex.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,230 @@ +// *************************************************************************** +// BamIndex.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index functionality - both for the default (standardized) BAM +// index format (.bai) as well as a BamTools-specific (nonstandard) index +// format (.bti). +// *************************************************************************** + +#include <BamIndex.h> +#include <BamReader.h> +#include <BGZF.h> +#include <BamStandardIndex_p.h> +#include <BamToolsIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <algorithm> +#include <iostream> +#include <map> +using namespace std; + +// -------------------------------------------------- +// BamIndex factory methods + +// returns index based on BAM filename 'stub' +// checks first for preferred type, returns that type if found +// (if not found, attmempts to load other type(s), returns 0 if NONE found) +// +// ** default preferred type is BamToolsIndex ** use this anytime it exists +BamIndex* BamIndex::FromBamFilename(const std::string& bamFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + const BamIndex::PreferredIndexType& type) +{ + // --------------------------------------------------- + // attempt to load preferred type first + + const std::string bamtoolsIndexFilename = bamFilename + ".bti"; + const bool bamtoolsIndexExists = BamTools::FileExists(bamtoolsIndexFilename); + if ( (type == BamIndex::BAMTOOLS) && bamtoolsIndexExists ) + return new BamToolsIndex(bgzf, reader); + + const std::string 
standardIndexFilename = bamFilename + ".bai"; + const bool standardIndexExists = BamTools::FileExists(standardIndexFilename); + if ( (type == BamIndex::STANDARD) && standardIndexExists ) + return new BamStandardIndex(bgzf, reader); + + // ---------------------------------------------------- + // preferred type could not be found, try other (non-preferred) types + // if none found, return 0 + + if ( bamtoolsIndexExists ) return new BamToolsIndex(bgzf, reader); + if ( standardIndexExists ) return new BamStandardIndex(bgzf, reader); + return 0; +} + +// returns index based on explicitly named index file (or 0 if not found) +BamIndex* BamIndex::FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader) +{ + // see if specified file exists + const bool indexExists = BamTools::FileExists(indexFilename); + if ( !indexExists ) return 0; + + const std::string bamtoolsIndexExtension(".bti"); + const std::string standardIndexExtension(".bai"); + + // if has bamtoolsIndexExtension + if ( indexFilename.find(bamtoolsIndexExtension) == (indexFilename.length() - bamtoolsIndexExtension.length()) ) + return new BamToolsIndex(bgzf, reader); + + // if has standardIndexExtension + if ( indexFilename.find(standardIndexExtension) == (indexFilename.length() - standardIndexExtension.length()) ) + return new BamStandardIndex(bgzf, reader); + + // otherwise, unsupported file type + return 0; +} + +// ------------------------------- +// BamIndex implementation + +// ctor +BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader) + : m_BGZF(bgzf) + , m_reader(reader) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_indexStream(0) +{ + if ( m_reader && m_reader->IsOpen() ) + m_references = m_reader->GetReferenceData(); +} + +// dtor +BamIndex::~BamIndex(void) { + if ( IsOpen() ) + fclose(m_indexStream); +} + +// return true if FILE* is open +bool BamIndex::IsOpen(void) const { + return ( m_indexStream != 0 ); +} + +// loads 
existing data from file into memory +bool BamIndex::Load(const string& filename) { + + // open index file, abort on error + if ( !OpenIndexFile(filename, "rb") ) { + fprintf(stderr, "ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); + return false; + } + + // check magic number + if ( !LoadHeader() ) { + fclose(m_indexStream); + return false; + } + + // load reference data (but only keep in memory if full caching requested) + bool saveInitialLoad = ( m_cacheMode == BamIndex::FullIndexCaching ); + if ( !LoadAllReferences(saveInitialLoad) ) { + fclose(m_indexStream); + return false; + } + + // update index cache based on selected mode + UpdateCache(); + + // return success + return true; +} + +// opens index file for reading/writing, return true if opened OK +bool BamIndex::OpenIndexFile(const string& filename, const string& mode) { + m_indexStream = fopen(filename.c_str(), mode.c_str()); + return ( m_indexStream != 0 ); +} + +// rewind index file to beginning of index data, return true if rewound OK +bool BamIndex::Rewind(void) { + return ( fseek64(m_indexStream, DataBeginOffset(), SEEK_SET) == 0 ); +} + +// change the index caching behavior +void BamIndex::SetCacheMode(const BamIndexCacheMode mode) { + if ( mode != m_cacheMode ) { + m_cacheMode = mode; + UpdateCache(); + } +} + +// updates in-memory cache of index data, depending on current cache mode +void BamIndex::UpdateCache(void) { + + // skip if file not open + if ( !IsOpen() ) return; + + // reflect requested cache mode behavior + switch ( m_cacheMode ) { + + case (BamIndex::FullIndexCaching) : + Rewind(); + LoadAllReferences(true); + break; + + case (BamIndex::LimitedIndexCaching) : + if ( HasFullDataCache() ) + KeepOnlyFirstReferenceOffsets(); + else { + ClearAllData(); + SkipToFirstReference(); + LoadFirstReference(true); + } + break; + case(BamIndex::NoIndexCaching) : + ClearAllData(); + break; + default : + // unreachable + ; + } +} + +// writes in-memory index data out to 
file +bool BamIndex::Write(const string& bamFilename) { + + // open index file for writing + string indexFilename = bamFilename + Extension(); + if ( !OpenIndexFile(indexFilename, "wb") ) { + fprintf(stderr, "ERROR: Could not open file to save index.\n"); + return false; + } + + // write index header data + if ( !WriteHeader() ) { + fprintf(stderr, "ERROR: There was a problem writing index metadata to new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); + } + + // write main index data + if ( !WriteAllReferences() ) { + fprintf(stderr, "ERROR: There was a problem writing index data to new index file.\n"); + fflush(m_indexStream); + fclose(m_indexStream); + exit(1); + } + + // flush any remaining output, rewind file, and return success + fflush(m_indexStream); + fclose(m_indexStream); + + // re-open index file for later reading + if ( !OpenIndexFile(indexFilename, "rb") ) { + fprintf(stderr, "ERROR: Could not open newly created index file for reading.\n"); + return false; + } + + // return success/failure of write + return true; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamIndex.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,145 @@ +// *************************************************************************** +// BamIndex.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides basic BAM index interface +// *************************************************************************** + +#ifndef BAM_INDEX_H +#define BAM_INDEX_H + +#include <api_global.h> +#include <BamAux.h> +#include <iostream> +#include <string> +#include <vector> + +namespace BamTools { + +class BamReader; +class BgzfData; + +namespace Internal { + class BamStandardIndex; + class BamToolsIndex; +} // namespace Internal + +// -------------------------------------------------- +// BamIndex base class +class API_EXPORT BamIndex { + + // specify index-caching behavior + // + // @FullIndexCaching - store entire index file contents in memory + // @LimitedIndexCaching - store only index data for current reference + // being processed + // @NoIndexCaching - do not store any index data. 
Load as needed to + // calculate jump offset + public: enum BamIndexCacheMode { FullIndexCaching = 0 + , LimitedIndexCaching + , NoIndexCaching + }; + + // ctor & dtor + public: + BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader); + virtual ~BamIndex(void); + + // index interface + public: + // creates index data (in-memory) from current reader data + virtual bool Build(void) =0; + // returns supported file extension + virtual const std::string Extension(void) const =0; + // returns whether reference has alignments or no + virtual bool HasAlignments(const int& referenceID) const =0; + // attempts to use index to jump to region; returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; + // loads existing data from file into memory + virtual bool Load(const std::string& filename); + // change the index caching behavior + virtual void SetCacheMode(const BamIndexCacheMode mode); + // writes in-memory index data out to file + // N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) + virtual bool Write(const std::string& bamFilename); + + // derived-classes MUST provide implementation + protected: + // clear all current index offset data in memory + virtual void ClearAllData(void) =0; + // return file position after header metadata + virtual const off_t DataBeginOffset(void) const =0; + // return true if all index data is cached + virtual bool HasFullDataCache(void) const =0; + // clears index data from all references except the first + virtual void KeepOnlyFirstReferenceOffsets(void) =0; + // load index data for all references, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + virtual bool LoadAllReferences(bool saveData = true) =0; + // load first reference from file, return true if loaded OK + // @saveData - save data in memory if true, just read & discard if false + virtual bool LoadFirstReference(bool saveData = true) =0; + // load header data from index file, return true if loaded OK + virtual bool LoadHeader(void) =0; + // position file pointer to first reference begin, return true if skipped OK + virtual bool SkipToFirstReference(void) =0; + // write index reference data + virtual bool WriteAllReferences(void) =0; + // write index header data + virtual bool WriteHeader(void) =0; + + // internal methods + protected: + // rewind index file to beginning of index data, return true if rewound OK + bool Rewind(void); + + private: + // return true if FILE* is open + bool IsOpen(void) const; + // opens index file according to requested mode, return true if opened OK + bool OpenIndexFile(const std::string& filename, const std::string& mode); + // updates in-memory cache of index data, depending on current cache mode + void UpdateCache(void); + + // factory methods for returning proper BamIndex-derived type based on available index files + public: + + // returns index based on BAM filename 'stub' + // 
checks first for preferred type, returns that type if found + // (if not found, attmempts to load other type(s), returns 0 if NONE found) + // + // ** default preferred type is BamToolsIndex ** use this anytime it exists + enum PreferredIndexType { BAMTOOLS = 0, STANDARD }; + static BamIndex* FromBamFilename(const std::string& bamFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader, + const BamIndex::PreferredIndexType& type = BamIndex::BAMTOOLS); + + // returns index based on explicitly named index file (or 0 if not found) + static BamIndex* FromIndexFilename(const std::string& indexFilename, + BamTools::BgzfData* bgzf, + BamTools::BamReader* reader); + + // data members + protected: + BamTools::BgzfData* m_BGZF; + BamTools::BamReader* m_reader; + BamTools::RefVector m_references; + BamIndex::BamIndexCacheMode m_cacheMode; + FILE* m_indexStream; + + + // friends + friend class Internal::BamStandardIndex; + friend class Internal::BamToolsIndex; +}; + +} // namespace BamTools + +#endif // BAM_INDEX_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamMultiReader.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,450 @@ +// *************************************************************************** +// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad +// Institute. +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files. +// +// This functionality allows applications to work on very large sets of files +// without requiring intermediate merge, sort, and index steps for each file +// subset. It also improves the performance of our merge system as it +// precludes the need to sort merged files. 
+// *************************************************************************** + +#include <BamMultiReader.h> +#include <BGZF.h> +using namespace BamTools; + +#include <algorithm> +#include <fstream> +#include <iostream> +#include <iterator> +#include <sstream> +#include <string> +#include <vector> +using namespace std; + +// ----------------------------------------------------- +// BamMultiReader implementation +// ----------------------------------------------------- + +// constructor +BamMultiReader::BamMultiReader(void) + : CurrentRefID(0) + , CurrentLeft(0) +{ } + +// destructor +BamMultiReader::~BamMultiReader(void) { + Close(); +} + +// close the BAM files +void BamMultiReader::Close(void) { + + // close all BAM readers and clean up pointers + vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin(); + vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter) { + + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + + // close the reader + if ( reader) reader->Close(); + + // delete reader pointer + delete reader; + reader = 0; + + // delete alignment pointer + delete alignment; + alignment = 0; + } + + // clear out the container + readers.clear(); +} + +// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail +bool BamMultiReader::CreateIndexes(bool useStandardIndex) { + bool result = true; + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->CreateIndex(useStandardIndex); + } + return result; +} + +// sets the index caching mode on the readers +void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + 
reader->SetIndexCacheMode(mode); + } +} + +// for debugging +void BamMultiReader::DumpAlignmentIndex(void) { + for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { + cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; + } +} + +// makes a virtual, unified header for all the bam files in the multireader +const string BamMultiReader::GetHeaderText(void) const { + + string mergedHeader = ""; + map<string, bool> readGroups; + + // foreach extraction entry (each BAM file) + for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { + + BamReader* reader = rs->first; + string headerText = reader->GetHeaderText(); + if ( headerText.empty() ) continue; + + map<string, bool> currentFileReadGroups; + stringstream header(headerText); + vector<string> lines; + string item; + while (getline(header, item)) + lines.push_back(item); + + for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) { + + // get next line from header, skip if empty + string headerLine = *it; + if ( headerLine.empty() ) { continue; } + + // if first file, save HD & SQ entries + if ( rs == readers.begin() ) { + if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { + mergedHeader.append(headerLine.c_str()); + mergedHeader.append(1, '\n'); + } + } + + // (for all files) append RG entries if they are unique + if ( headerLine.find("@RG") == 0 ) { + stringstream headerLineSs(headerLine); + string part, readGroupPart, readGroup; + while(std::getline(headerLineSs, part, '\t')) { + stringstream partSs(part); + string subtag; + std::getline(partSs, subtag, ':'); + if (subtag == "ID") { + std::getline(partSs, readGroup, ':'); + break; + } + } + if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries + mergedHeader.append(headerLine.c_str() ); + mergedHeader.append(1, '\n'); + readGroups[readGroup] = true; + 
currentFileReadGroups[readGroup] = true; + } else { + // warn iff we are reading one file and discover duplicated @RG tags in the header + // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags + if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { + cerr << "WARNING: duplicate @RG tag " << readGroup + << " entry in header of " << reader->GetFilename() << endl; + } + } + } + } + } + + // return merged header text + return mergedHeader; +} + +// get next alignment among all files +bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // our lowest alignment and reader will be at the front of our alignment index + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*alignment); + + // remove this alignment index entry from our alignment index + alignments.erase(alignments.begin()); + + // and add another entry if we can get another alignment from the reader + if (reader->GetNextAlignment(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { // do nothing + //cerr << "reached end of file " << lowestReader->GetFilename() << endl; + } + + return true; + +} + +// get next alignment among all files without parsing character data from alignments +bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { + + // bail out if we are at EOF in all files, means no more alignments to process + if (!HasOpenReaders()) + return false; + + // when all alignments 
have stepped into a new target sequence, update our + // current reference sequence id + UpdateReferenceID(); + + // our lowest alignment and reader will be at the front of our alignment index + BamAlignment* alignment = alignments.begin()->second.second; + BamReader* reader = alignments.begin()->second.first; + + // now that we have the lowest alignment in the set, save it by copy to our argument + nextAlignment = BamAlignment(*alignment); + //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); + + // remove this alignment index entry from our alignment index + alignments.erase(alignments.begin()); + + // and add another entry if we can get another alignment from the reader + if (reader->GetNextAlignmentCore(*alignment)) { + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { // do nothing + //cerr << "reached end of file " << lowestReader->GetFilename() << endl; + } + + return true; + +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. We enforce this by invoking the above +// validation function (ValidateReaders) to verify that our reference data +// is the same across all files on Open, so we will not encounter a situation +// in which there is a mismatch and we are still live. 
+// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +const int BamMultiReader::GetReferenceCount(void) const { + return readers.front().first->GetReferenceCount(); +} + +// returns vector of reference objects +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return readers.front().first->GetReferenceData(); +} + +// returns refID from reference name +const int BamMultiReader::GetReferenceID(const string& refName) const { + return readers.front().first->GetReferenceID(refName); +} + +// --------------------------------------------------------------------------------------- + +// checks if any readers still have alignments +bool BamMultiReader::HasOpenReaders() { + return alignments.size() > 0; +} + +// returns whether underlying BAM readers ALL have an index loaded +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReader::IsIndexLoaded(void) const { + bool ok = true; + vector<pair<BamReader*, BamAlignment*> >::const_iterator readerIter = readers.begin(); + vector<pair<BamReader*, BamAlignment*> >::const_iterator readerEnd = readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader ) ok &= reader->IsIndexLoaded(); + } + return ok; +} + +// jumps to specified region(refID, leftBound) in BAM files, returns success/fail +bool BamMultiReader::Jump(int refID, int position) { + + //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { + CurrentRefID = refID; + CurrentLeft = position; + + bool result = true; + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->Jump(refID, position); + if (!result) { + cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << 
endl; + exit(1); + } + } + if (result) UpdateAlignments(); + return result; +} + +// opens BAM files +bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) { + + // for filename in filenames + fileNames = filenames; // save filenames in our multireader + for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { + + const string filename = *it; + BamReader* reader = new BamReader; + + bool openedOK = true; + openedOK = reader->Open(filename, "", openIndexes, preferStandardIndex); + + // if file opened ok, check that it can be read + if ( openedOK ) { + + bool fileOK = true; + BamAlignment* alignment = new BamAlignment; + fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) ); + + if (fileOK) { + readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup + alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), + make_pair(reader, alignment))); + } else { + cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; + // if only file available & could not be read, return failure + if ( filenames.size() == 1 ) return false; + } + } + + // TODO; any further error handling when openedOK is false ?? 
+ else + return false; + } + + // files opened ok, at least one alignment could be read, + // now need to check that all files use same reference data + ValidateReaders(); + return true; +} + +void BamMultiReader::PrintFilenames(void) { + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + cout << reader->GetFilename() << endl; + } +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReader::Rewind(void) { + bool result = true; + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + result &= reader->Rewind(); + } + return result; +} + +bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { + BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); + return SetRegion(region); +} + +bool BamMultiReader::SetRegion(const BamRegion& region) { + + Region = region; + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + if (!it->first->SetRegion(region)) { + cerr << "ERROR: could not jump " << it->first->GetFilename() << " to " + << region.LeftRefID << ":" << region.LeftPosition + << ".." 
<< region.RightRefID << ":" << region.RightPosition << endl; + } + } + + UpdateAlignments(); + return true; +} + +void BamMultiReader::UpdateAlignments(void) { + // Update Alignments + alignments.clear(); + for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* br = it->first; + BamAlignment* ba = it->second; + if (br->GetNextAlignment(*ba)) { + alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), + make_pair(br, ba))); + } else { + // assume BamReader end of region / EOF + } + } +} + +// updates the reference id stored in the BamMultiReader +// to reflect the current state of the readers +void BamMultiReader::UpdateReferenceID(void) { + // the alignments are sorted by position, so the first alignment will always have the lowest reference ID + if (alignments.begin()->second.second->RefID != CurrentRefID) { + // get the next reference id + // while there aren't any readers at the next ref id + // increment the ref id + int nextRefID = CurrentRefID; + while (alignments.begin()->second.second->RefID != nextRefID) { + ++nextRefID; + } + //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; + CurrentRefID = nextRefID; + } +} + +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. 
+void BamMultiReader::ValidateReaders(void) const { + int firstRefCount = readers.front().first->GetReferenceCount(); + BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); + for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) { + BamReader* reader = it->first; + BamTools::RefVector currentRefData = reader->GetReferenceData(); + BamTools::RefVector::const_iterator f = firstRefData.begin(); + BamTools::RefVector::const_iterator c = currentRefData.begin(); + if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { + cerr << "ERROR: mismatched number of references in " << reader->GetFilename() + << " expected " << firstRefCount + << " reference sequences but only found " << reader->GetReferenceCount() << endl; + exit(1); + } + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while (f != firstRefData.end()) { + if (f->RefName != c->RefName || f->RefLength != c->RefLength) { + cerr << "ERROR: mismatched references found in " << reader->GetFilename() + << " expected: " << endl; + for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + cerr << "but found: " << endl; + for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) + cerr << a->RefName << " " << a->RefLength << endl; + exit(1); + } + ++f; ++c; + } + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamMultiReader.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,136 @@ +// *************************************************************************** +// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// *************************************************************************** + +#ifndef BAMMULTIREADER_H +#define BAMMULTIREADER_H + +#include <api_global.h> +#include <BamReader.h> +#include <map> +#include <sstream> +#include <string> +#include <utility> + +namespace BamTools { + +// index mapping reference/position pairings to bamreaders and their alignments +typedef std::multimap<std::pair<int, int>, std::pair<BamReader*, BamAlignment*> > AlignmentIndex; + +class API_EXPORT BamMultiReader { + + // constructor / destructor + public: + BamMultiReader(void); + ~BamMultiReader(void); + + // public interface + public: + + // positioning + int CurrentRefID; + int CurrentLeft; + + // region under analysis, specified using SetRegion + BamRegion Region; + + // ---------------------- + // BAM file operations + // ---------------------- + + // close BAM files + void Close(void); + + // opens BAM files (and optional BAM index files, if provided) + // @openIndexes - triggers index opening, useful for suppressing + // error messages during merging of files in which we may not have + // indexes. + // @coreMode - setup our first alignments using GetNextAlignmentCore(); + // also useful for merging + // @preferStandardIndex - look for standard BAM index ".bai" first. If false, + // will look for BamTools index ".bti". 
+ bool Open(const std::vector<std::string>& filenames, bool openIndexes = true, bool coreMode = false, bool preferStandardIndex = false); + + // returns whether underlying BAM readers ALL have an index loaded + // this is useful to indicate whether Jump() or SetRegion() are possible + bool IsIndexLoaded(void) const; + + // performs random-access jump to reference, position + bool Jump(int refID, int position = 0); + + // sets the target region + bool SetRegion(const BamRegion& region); + bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above + + // returns file pointers to beginning of alignments + bool Rewind(void); + + // ---------------------- + // access alignment data + // ---------------------- + // updates the reference id marker to match the lower limit of our readers + void UpdateReferenceID(void); + + // retrieves next available alignment (returns success/fail) from all files + bool GetNextAlignment(BamAlignment&); + // retrieves next available alignment (returns success/fail) from all files + // and populates the support data with information about the alignment + // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT + bool GetNextAlignmentCore(BamAlignment&); + // ... should this be private? 
+ bool HasOpenReaders(void); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns unified SAM header text for all files + const std::string GetHeaderText(void) const; + // returns number of reference sequences + const int GetReferenceCount(void) const; + // returns vector of reference objects + const BamTools::RefVector GetReferenceData(void) const; + // returns reference id (used for BamMultiReader::Jump()) for the given reference name + const int GetReferenceID(const std::string& refName) const; + // validates that we have a congruent set of BAM files that are aligned against the same reference sequences + void ValidateReaders() const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") + bool CreateIndexes(bool useStandardIndex = true); + + // sets the index caching mode for the readers + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + //const int GetReferenceID(const string& refName) const; + + // utility + void PrintFilenames(void); + void DumpAlignmentIndex(void); + void UpdateAlignments(void); // updates our alignment cache + + // private implementation + private: + + // the set of readers and alignments which we operate on, maintained throughout the life of this class + std::vector<std::pair<BamReader*, BamAlignment*> > readers; + + // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment + // when a reader reaches EOF, its entry is removed from this index + AlignmentIndex alignments; + + std::vector<std::string> fileNames; +}; + +} // namespace BamTools + +#endif // BAMMULTIREADER_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamReader.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,66 @@ +// *************************************************************************** +// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#include <BamReader.h> +#include <BamReader_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <string> +#include <vector> +using namespace std; + +// constructor +BamReader::BamReader(void) { + d = new BamReaderPrivate(this); +} + +// destructor +BamReader::~BamReader(void) { + delete d; + d = 0; +} + +// file operations +void BamReader::Close(void) { d->Close(); } +bool BamReader::HasIndex(void) const { return d->HasIndex; } +bool BamReader::IsIndexLoaded(void) const { return HasIndex(); } +bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } +bool BamReader::Jump(int refID, int position) { return d->SetRegion( BamRegion(refID, position) ); } +bool BamReader::Open(const std::string& filename, + const std::string& indexFilename, + const bool lookForIndex, + const bool preferStandardIndex) +{ + return d->Open(filename, indexFilename, lookForIndex, preferStandardIndex); +} +bool BamReader::Rewind(void) { return d->Rewind(); } +bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } +bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { + return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, 
rightBound) ); +} + +// access alignment data +bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } +bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } + +// access auxiliary data +const string BamReader::GetHeaderText(void) const { return d->GetHeaderText(); } +int BamReader::GetReferenceCount(void) const { return d->References.size(); } +const RefVector& BamReader::GetReferenceData(void) const { return d->References; } +int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } +const std::string BamReader::GetFilename(void) const { return d->Filename; } + +// index operations +bool BamReader::CreateIndex(bool useStandardIndex) { return d->CreateIndex(useStandardIndex); } +void BamReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) { d->SetIndexCacheMode(mode); }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamReader.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,130 @@ +// *************************************************************************** +// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#ifndef BAMREADER_H +#define BAMREADER_H + +#include <api_global.h> +#include <BamAlignment.h> +#include <BamIndex.h> +#include <string> + +namespace BamTools { + +namespace Internal { + class BamReaderPrivate; +} // namespace Internal + +class API_EXPORT BamReader { + + // constructor / destructor + public: + BamReader(void); + ~BamReader(void); + + // public interface + public: + + // ---------------------- + // BAM file operations + // ---------------------- + + // close BAM file + void Close(void); + // returns whether reader is open for reading or not + bool IsOpen(void) const; + // performs random-access jump using (reference, position) as a left-bound + bool Jump(int refID, int position = 0); + // opens BAM file (and optional BAM index file, if provided) + // @lookForIndex - if no indexFilename provided, look in BAM file's directory for an existing index file + // default behavior is to skip index file search if no index filename given + // @preferStandardIndex - if true, give priority in index file searching to standard BAM index (*.bai) + // default behavior is to prefer the BamToolsIndex (*.bti) if both are available + bool Open(const std::string& filename, + const std::string& indexFilename = "", + const bool lookForIndex = false, + const bool preferStandardIndex = false); + // returns 
file pointer to beginning of alignments + bool Rewind(void); + // sets a region of interest (with left & right bound reference/position) + // returns success/failure of seeking to left bound of region + bool SetRegion(const BamRegion& region); + bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment (returns success/fail) + bool GetNextAlignment(BamAlignment& bAlignment); + // retrieves next available alignment core data (returns success/fail) + // ** DOES NOT parse any character data (read name, bases, qualities, tag data) ** + // useful for operations requiring ONLY aligner-related information + // (refId/position, alignment flags, CIGAR, mapQuality, etc) + bool GetNextAlignmentCore(BamAlignment& bAlignment); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns SAM header text + const std::string GetHeaderText(void) const; + // returns number of reference sequences + int GetReferenceCount(void) const; + // returns vector of reference objects + const BamTools::RefVector& GetReferenceData(void) const; + // returns reference id (used for BamReader::Jump()) for the given reference name + int GetReferenceID(const std::string& refName) const; + // returns the name of the file associated with this BamReader + const std::string GetFilename(void) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index for BAM file, saves to file + // default behavior is to create the BAM standard index (".bai") + // set flag to false to create the BamTools-specific index (".bti") + bool CreateIndex(bool useStandardIndex = true); + // returns whether index data is available for reading + // (e.g. 
if true, BamReader should be able to seek to a region) + bool HasIndex(void) const; + // change the index caching behavior + // default BamReader/Index mode is LimitedIndexCaching + // @mode - can be either FullIndexCaching, LimitedIndexCaching, + // or NoIndexCaching. See BamIndex.h for more details + void SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + // deprecated methods + public: + + // deprecated (but still available): prefer HasIndex() instead + // + // Deprecated purely for API semantic clarity - HasIndex() should be clearer + // than IsIndexLoaded() in light of the new caching modes that may clear the + // index data from memory, but leave the index file open for later random access + // seeks. + // + // For example, what would (IsIndexLoaded() == true) mean when cacheMode has been + // explicitly set to NoIndexCaching? This is confusing at best, misleading about + // current memory behavior at worst. + // + // returns whether index data is available + // (e.g. if true, BamReader should be able to seek to a region) + bool IsIndexLoaded(void) const; + + // private implementation + private: + Internal::BamReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMREADER_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamReader_p.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,729 @@ +// *************************************************************************** +// BamReader_p.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#include <BamReader.h> +#include <BGZF.h> +#include <BamReader_p.h> +#include <BamStandardIndex_p.h> +#include <BamToolsIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <vector> +using namespace std; + +// constructor +BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : HeaderText("") + , Index(0) + , HasIndex(false) + , AlignmentsBeginOffset(0) +// , m_header(0) + , IndexCacheMode(BamIndex::LimitedIndexCaching) + , HasAlignmentsInRegion(true) + , Parent(parent) + , DNA_LOOKUP("=ACMGRSVTWYHKDBN") + , CIGAR_LOOKUP("MIDNSHP") +{ + IsBigEndian = SystemIsBigEndian(); +} + +// destructor +BamReaderPrivate::~BamReaderPrivate(void) { + Close(); +} + +// adjusts requested region if necessary (depending on where data actually begins) +void BamReaderPrivate::AdjustRegion(BamRegion& region) { + + // check for valid index first + if ( Index == 0 ) return; + + // see if any references in region have alignments + HasAlignmentsInRegion = false; + int currentId = region.LeftRefID; + + const int rightBoundRefId = ( region.isRightBoundSpecified() ? 
region.RightRefID : References.size() - 1 ); + while ( currentId <= rightBoundRefId ) { + HasAlignmentsInRegion = Index->HasAlignments(currentId); + if ( HasAlignmentsInRegion ) break; + ++currentId; + } + + // if no data found on any reference in region + if ( !HasAlignmentsInRegion ) return; + + // if left bound of desired region had no data, use first reference that had data + // otherwise, leave requested region as-is + if ( currentId != region.LeftRefID ) { + region.LeftRefID = currentId; + region.LeftPosition = 0; + } +} + +// fills out character data for BamAlignment data +bool BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { + + // calculate character lengths/offsets + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); + const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; + const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; + const unsigned int tagDataLength = dataLength - tagDataOffset; + + // check offsets to see what char data exists + const bool hasSeqData = ( seqDataOffset < dataLength ); + const bool hasQualData = ( qualDataOffset < dataLength ); + const bool hasTagData = ( tagDataOffset < dataLength ); + + // set up char buffers + const char* allCharData = bAlignment.SupportData.AllCharData.data(); + const char* seqData = ( hasSeqData ? (((const char*)allCharData) + seqDataOffset) : (const char*)0 ); + const char* qualData = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 ); + char* tagData = ( hasTagData ? 
(((char*)allCharData) + tagDataOffset) : (char*)0 ); + + // store alignment name (relies on null char in name as terminator) + bAlignment.Name.assign((const char*)(allCharData)); + + // save query sequence + bAlignment.QueryBases.clear(); + if ( hasSeqData ) { + bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); + for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { + char singleBase = DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; + bAlignment.QueryBases.append(1, singleBase); + } + } + + // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character + bAlignment.Qualities.clear(); + if ( hasQualData ) { + bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); + for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { + char singleQuality = (char)(qualData[i]+33); + bAlignment.Qualities.append(1, singleQuality); + } + } + + // if QueryBases is empty (and this is a allowed case) + if ( bAlignment.QueryBases.empty() ) + bAlignment.AlignedBases = bAlignment.QueryBases; + + // if QueryBases contains data, then build AlignedBases using CIGAR data + else { + + // resize AlignedBases + bAlignment.AlignedBases.clear(); + bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); + + // iterate over CigarOps + int k = 0; + vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter ) { + + const CigarOp& op = (*cigarIter); + switch(op.Type) { + + case ('M') : + case ('I') : + bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases + // fall through + + case ('S') : + k += op.Length; // for 'S' - soft clip, skip over query bases + break; + + case ('D') : + bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character + break; + + case ('P') 
: + bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character + break; + + case ('N') : + bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence + break; + + case ('H') : + break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op + + default: + fprintf(stderr, "ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + } + + // save tag data + bAlignment.TagData.clear(); + if ( hasTagData ) { + if ( IsBigEndian ) { + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i += sizeof(uint64_t); + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + exit(1); + } + } + } + + // store tagData in alignment + bAlignment.TagData.resize(tagDataLength); + memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); + } + + // clear the core-only flag + bAlignment.SupportData.HasCoreOnly = false; + + // return success + return true; +} + +// clear index data structure +void BamReaderPrivate::ClearIndex(void) { + delete Index; + Index = 0; + HasIndex = false; +} + +// closes the BAM file +void BamReaderPrivate::Close(void) { + + // close BGZF file stream + mBGZF.Close(); + + // clear out index data + ClearIndex(); + + // clear out header data + HeaderText.clear(); +// if ( m_header ) { +// delete m_header; +// m_header = 0; +// } + + // clear 
out region flags + Region.clear(); +} + +// creates index for BAM file, saves to file +// default behavior is to create the BAM standard index (".bai") +// set flag to false to create the BamTools-specific index (".bti") +bool BamReaderPrivate::CreateIndex(bool useStandardIndex) { + + // clear out prior index data + ClearIndex(); + + // create index based on type requested + if ( useStandardIndex ) + Index = new BamStandardIndex(&mBGZF, Parent); + else + Index = new BamToolsIndex(&mBGZF, Parent); + + // set index cache mode to full for writing + Index->SetCacheMode(BamIndex::FullIndexCaching); + + // build new index + bool ok = true; + ok &= Index->Build(); + HasIndex = ok; + + // mark empty references + MarkReferences(); + + // attempt to save index data to file + ok &= Index->Write(Filename); + + // set client's desired index cache mode + Index->SetCacheMode(IndexCacheMode); + + // return success/fail of both building & writing index + return ok; +} + +const string BamReaderPrivate::GetHeaderText(void) const { + + return HeaderText; + +// if ( m_header ) +// return m_header->Text(); +// else +// return string(""); +} + +// get next alignment (from specified region, if given) +bool BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { + + // if valid alignment found, attempt to parse char data, and return success/failure + if ( GetNextAlignmentCore(bAlignment) ) + return BuildCharData(bAlignment); + + // no valid alignment found + else return false; +} + +// retrieves next available alignment core data (returns success/fail) +// ** DOES NOT parse any character data (read name, bases, qualities, tag data) +// these can be accessed, if necessary, from the supportData +// useful for operations requiring ONLY positional or other alignment-related information +bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { + + // if region is set but has no alignments + if ( !Region.isNull() && !HasAlignmentsInRegion ) + return false; + + // if valid 
alignment available + if ( LoadNextAlignment(bAlignment) ) { + + // set core-only flag + bAlignment.SupportData.HasCoreOnly = true; + + // if region not specified with at least a left boundary, return success + if ( !Region.isLeftBoundSpecified() ) return true; + + // determine region state (before, within, after) + BamReaderPrivate::RegionState state = IsOverlap(bAlignment); + + // if alignment lies after region, return false + if ( state == AFTER_REGION ) return false; + + while ( state != WITHIN_REGION ) { + // if no valid alignment available (likely EOF) return failure + if ( !LoadNextAlignment(bAlignment) ) return false; + // if alignment lies after region, return false (no available read within region) + state = IsOverlap(bAlignment); + if ( state == AFTER_REGION ) return false; + } + + // return success (alignment found that overlaps region) + return true; + } + + // no valid alignment + else return false; +} + +// returns RefID for given RefName (returns References.size() if not found) +int BamReaderPrivate::GetReferenceID(const string& refName) const { + + // retrieve names from reference data + vector<string> refNames; + RefVector::const_iterator refIter = References.begin(); + RefVector::const_iterator refEnd = References.end(); + for ( ; refIter != refEnd; ++refIter) + refNames.push_back( (*refIter).RefName ); + + // return 'index-of' refName ( if not found, returns refNames.size() ) + return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); +} + +// returns region state - whether alignment ends before, overlaps, or starts after currently specified region +// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true +BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { + + // if alignment is on any reference sequence before left bound + if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; + + // if alignment starts on left bound reference + else if ( 
bAlignment.RefID == Region.LeftRefID ) { + + // if alignment starts at or after left boundary + if ( bAlignment.Position >= Region.LeftPosition) { + + // if right boundary is specified AND + // left/right boundaries are on same reference AND + // alignment starts past right boundary + if ( Region.isRightBoundSpecified() && + Region.LeftRefID == Region.RightRefID && + bAlignment.Position > Region.RightPosition ) + return AFTER_REGION; + + // otherwise, alignment is within region + return WITHIN_REGION; + } + + // alignment starts before left boundary + else { + // check if alignment overlaps left boundary + if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; + else return BEFORE_REGION; + } + } + + // alignment starts on a reference after the left bound + else { + + // if region has a right boundary + if ( Region.isRightBoundSpecified() ) { + + // alignment is on reference between boundaries + if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; + + // alignment is on reference after right boundary + else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; + + // alignment is on right bound reference + else { + // check if alignment starts before or at right boundary + if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; + else return AFTER_REGION; + } + } + + // otherwise, alignment is after left bound reference, but there is no right boundary + else return WITHIN_REGION; + } +} + +// load BAM header data +void BamReaderPrivate::LoadHeaderData(void) { + +// m_header = new BamHeader(&mBGZF); +// bool headerLoadedOk = m_header->Load(); +// if ( !headerLoadedOk ) +// cerr << "BamReader could not load header" << endl; + + // check to see if proper BAM header + char buffer[4]; + if (mBGZF.Read(buffer, 4) != 4) { + fprintf(stderr, "Could not read header type\n"); + exit(1); + } + + if (strncmp(buffer, "BAM\001", 4)) { + fprintf(stderr, "wrong header type!\n"); + exit(1); + } + + // get BAM header 
text length + mBGZF.Read(buffer, 4); + unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(headerTextLength); + + // get BAM header text + char* headerText = (char*)calloc(headerTextLength + 1, 1); + mBGZF.Read(headerText, headerTextLength); + HeaderText = (string)((const char*)headerText); + + // clean up calloc-ed temp variable + free(headerText); +} + +// load existing index data from BAM index file (".bti" OR ".bai"), return success/fail +bool BamReaderPrivate::LoadIndex(const bool lookForIndex, const bool preferStandardIndex) { + + // clear out any existing index data + ClearIndex(); + + // if no index filename provided, so we need to look for available index files + if ( IndexFilename.empty() ) { + + // attempt to load BamIndex based on current Filename provided & preferStandardIndex flag + const BamIndex::PreferredIndexType type = (preferStandardIndex ? BamIndex::STANDARD : BamIndex::BAMTOOLS); + Index = BamIndex::FromBamFilename(Filename, &mBGZF, Parent, type); + + // if null, return failure + if ( Index == 0 ) return false; + + // generate proper IndexFilename based on type of index created + IndexFilename = Filename + Index->Extension(); + } + + else { + + // attempt to load BamIndex based on IndexFilename provided by client + Index = BamIndex::FromIndexFilename(IndexFilename, &mBGZF, Parent); + + // if null, return failure + if ( Index == 0 ) return false; + } + + // set cache mode for BamIndex + Index->SetCacheMode(IndexCacheMode); + + // loading the index data from file + HasIndex = Index->Load(IndexFilename); + + // mark empty references + MarkReferences(); + + // return index status + return HasIndex; +} + +// populates BamAlignment with alignment data under file pointer, returns success/fail +bool BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { + + // read in the 'block length' value, make sure it's not zero + char buffer[4]; + mBGZF.Read(buffer, 4); + 
bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } + if ( bAlignment.SupportData.BlockLength == 0 ) return false; + + // read in core alignment data, make sure the right size of data was read + char x[BAM_CORE_SIZE]; + if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) return false; + + if ( IsBigEndian ) { + for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + SwapEndian_32p(&x[i]); + } + + // set BamAlignment 'core' and 'support' data + bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); + bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); + + unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); + bAlignment.Bin = tempValue >> 16; + bAlignment.MapQuality = tempValue >> 8 & 0xff; + bAlignment.SupportData.QueryNameLength = tempValue & 0xff; + + tempValue = BgzfData::UnpackUnsignedInt(&x[12]); + bAlignment.AlignmentFlag = tempValue >> 16; + bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; + + bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); + bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); + bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); + bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); + + // set BamAlignment length + bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; + + // read in character data - make sure proper data size was read + bool readCharDataOK = false; + const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; + char* allCharData = (char*)calloc(sizeof(char), dataLength); + + if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { + + // store 'allCharData' in supportData structure + bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); + + // set success flag + readCharDataOK = true; + + // save CIGAR ops + // need to calculate this here so that 
BamAlignment::GetEndPosition() performs correctly, + // even when GetNextAlignmentCore() is called + const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + CigarOp op; + bAlignment.CigarData.clear(); + bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); + for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { + + // swap if necessary + if ( IsBigEndian ) SwapEndian_32(cigarData[i]); + + // build CigarOp structure + op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); + op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; + + // save CigarOp + bAlignment.CigarData.push_back(op); + } + } + + free(allCharData); + return readCharDataOK; +} + +// loads reference data from BAM file +void BamReaderPrivate::LoadReferenceData(void) { + + // get number of reference sequences + char buffer[4]; + mBGZF.Read(buffer, 4); + unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(numberRefSeqs); + if ( numberRefSeqs == 0 ) return; + References.reserve((int)numberRefSeqs); + + // iterate over all references in header + for (unsigned int i = 0; i != numberRefSeqs; ++i) { + + // get length of reference name + mBGZF.Read(buffer, 4); + unsigned int refNameLength = BgzfData::UnpackUnsignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(refNameLength); + char* refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + mBGZF.Read(refName, refNameLength); + mBGZF.Read(buffer, 4); + int refLength = BgzfData::UnpackSignedInt(buffer); + if ( IsBigEndian ) SwapEndian_32(refLength); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + References.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } +} + +// mark references with no alignment data +void 
BamReaderPrivate::MarkReferences(void) { + + // ensure index is available + if ( !HasIndex ) return; + + // mark empty references + for ( int i = 0; i < (int)References.size(); ++i ) + References.at(i).RefHasAlignments = Index->HasAlignments(i); +} + +// opens BAM file (and index) +bool BamReaderPrivate::Open(const string& filename, const string& indexFilename, const bool lookForIndex, const bool preferStandardIndex) { + + // store filenames + Filename = filename; + IndexFilename = indexFilename; + + // open the BGZF file for reading, return false on failure + if ( !mBGZF.Open(filename, "rb") ) return false; + + // retrieve header text & reference data + LoadHeaderData(); + LoadReferenceData(); + + // store file offset of first alignment + AlignmentsBeginOffset = mBGZF.Tell(); + + // if no index filename provided + if ( IndexFilename.empty() ) { + + // client did not specify that index SHOULD be found + // useful for cases where sequential access is all that is required + if ( !lookForIndex ) return true; + + // otherwise, look for index file, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex) ; + } + + // client supplied an index filename + // attempt to load index data, return success/fail + return LoadIndex(lookForIndex, preferStandardIndex); +} + +// returns BAM file pointer to beginning of alignment data +bool BamReaderPrivate::Rewind(void) { + + // rewind to first alignment, return false if unable to seek + if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; + + // retrieve first alignment data, return false if unable to read + BamAlignment al; + if ( !LoadNextAlignment(al) ) return false; + + // reset default region info using first alignment in file + Region.clear(); + HasAlignmentsInRegion = true; + + // rewind back to beginning of first alignment + // return success/fail of seek + return mBGZF.Seek(AlignmentsBeginOffset); +} + +// change the index caching behavior +void BamReaderPrivate::SetIndexCacheMode(const 
BamIndex::BamIndexCacheMode mode) { + IndexCacheMode = mode; + if ( Index == 0 ) return; + Index->SetCacheMode(mode); +} + +// asks Index to attempt a Jump() to specified region +// returns success/failure +bool BamReaderPrivate::SetRegion(const BamRegion& region) { + + // clear out any prior BamReader region data + // + // N.B. - this is cleared so that BamIndex now has free reign to call + // GetNextAlignmentCore() and do overlap checking without worrying about BamReader + // performing any overlap checking of its own and moving on to the next read... Calls + // to GetNextAlignmentCore() with no Region set, simply return the next alignment. + // This ensures that the Index is able to do just that. (All without exposing + // LoadNextAlignment() to the public API, and potentially confusing clients with the nomenclature) + Region.clear(); + + // check for existing index + if ( !HasIndex ) return false; + + // adjust region if necessary to reflect where data actually begins + BamRegion adjustedRegion(region); + AdjustRegion(adjustedRegion); + + // if no data present, return true + // not an error, but BamReader knows that no data is there for future alignment access + // (this is useful in a MultiBamReader setting where some BAM files may lack data in regions + // that other BAMs have data) + if ( !HasAlignmentsInRegion ) { + Region = adjustedRegion; + return true; + } + + // attempt jump to user-specified region return false if jump could not be performed at all + // (invalid index, unknown reference, etc) + // + // Index::Jump() is allowed to modify the HasAlignmentsInRegion flag + // * This covers case where a region is requested that lies beyond the last alignment on a reference + // If this occurs, any subsequent calls to GetNexAlignment[Core] simply return false + // BamMultiReader is then able to successfully pull alignments from a region from multiple files + // even if one or more have no data. 
+ if ( !Index->Jump(adjustedRegion, &HasAlignmentsInRegion) ) return false; + + // save region and return success + Region = adjustedRegion; + return true; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamReader_p.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,137 @@ +// *************************************************************************** +// BamReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#ifndef BAMREADER_P_H +#define BAMREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <BamAlignment.h> +#include <BamIndex.h> +#include <BGZF.h> +#include <string> + +namespace BamTools { + +class BamReader; + +namespace Internal { + +class BamReaderPrivate { + + // enums + public: enum RegionState { BEFORE_REGION = 0 + , WITHIN_REGION + , AFTER_REGION + }; + + // ctor & dtor + public: + BamReaderPrivate(BamReader* parent); + ~BamReaderPrivate(void); + + // 'public' interface to BamReader + public: + + // file operations + void Close(void); + bool Open(const std::string& filename, + const std::string& indexFilename, + const bool lookForIndex, + const bool preferStandardIndex); + bool Rewind(void); + bool SetRegion(const BamRegion& region); + + // access alignment data + bool GetNextAlignment(BamAlignment& bAlignment); + bool GetNextAlignmentCore(BamAlignment& bAlignment); + + // access auxiliary data + const std::string GetHeaderText(void) const; + int GetReferenceID(const std::string& refName) const; + + // index operations + bool CreateIndex(bool useStandardIndex); + void 
SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode); + + // 'internal' methods + public: + + // --------------------------------------- + // reading alignments and auxiliary data + + // adjusts requested region if necessary (depending on where data actually begins) + void AdjustRegion(BamRegion& region); + // fills out character data for BamAlignment data + bool BuildCharData(BamAlignment& bAlignment); + // checks to see if alignment overlaps current region + RegionState IsOverlap(BamAlignment& bAlignment); + // retrieves header text from BAM file + void LoadHeaderData(void); + // retrieves BAM alignment under file pointer + bool LoadNextAlignment(BamAlignment& bAlignment); + // builds reference data structure from BAM file + void LoadReferenceData(void); + // mark references with 'HasAlignments' status + void MarkReferences(void); + + // --------------------------------- + // index file handling + + // clear out inernal index data structure + void ClearIndex(void); + // loads index from BAM index file + bool LoadIndex(const bool lookForIndex, const bool preferStandardIndex); + + // data members + public: + + // general file data + BgzfData mBGZF; + std::string HeaderText; + BamIndex* Index; + RefVector References; + bool HasIndex; + int64_t AlignmentsBeginOffset; + std::string Filename; + std::string IndexFilename; + +// Internal::BamHeader* m_header; + + // index caching mode + BamIndex::BamIndexCacheMode IndexCacheMode; + + // system data + bool IsBigEndian; + + // user-specified region values + BamRegion Region; + bool HasAlignmentsInRegion; + + // parent BamReader + BamReader* Parent; + + // BAM character constants + const char* DNA_LOOKUP; + const char* CIGAR_LOOKUP; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMREADER_P_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamStandardIndex_p.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,910 @@ +// *************************************************************************** +// BamStandardIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include <BamAlignment.h> +#include <BamReader.h> +#include <BGZF.h> +#include <BamStandardIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <algorithm> +#include <iostream> +#include <map> +using namespace std; + +BamStandardIndex::BamStandardIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +BamStandardIndex::~BamStandardIndex(void) { + ClearAllData(); +} + +// calculate bins that overlap region +int BamStandardIndex::BinsFromRegion(const BamRegion& region, + const bool isRightBoundSpecified, + uint16_t bins[MAX_BIN]) +{ + // get region boundaries + uint32_t begin = (unsigned int)region.LeftPosition; + uint32_t end; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound position + if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // otherwise, use end of left bound reference as cutoff + else + end = (unsigned int)m_references.at(region.LeftRefID).RefLength - 1; + + // initialize list, bin '0' always a valid bin + int i = 0; + bins[i++] = 0; + + // get rest of 
bins that contain this region + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } + + // return number of bins stored + return i; +} + +// creates index data (in-memory) from current reader data +bool BamStandardIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + m_reader->Rewind(); + + // get reference count, reserve index space + const int numReferences = (int)m_references.size(); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + + // sets default constant for bin, ID, offset, coordinate variables + const uint32_t defaultValue = 0xffffffffu; + + // bin data + uint32_t saveBin(defaultValue); + uint32_t lastBin(defaultValue); + + // reference ID data + int32_t saveRefID(defaultValue); + int32_t lastRefID(defaultValue); + + // offset data + uint64_t saveOffset = m_BGZF->Tell(); + uint64_t lastOffset = saveOffset; + + // coordinate data + int32_t lastCoordinate = defaultValue; + + BamAlignment bAlignment; + while ( m_reader->GetNextAlignmentCore(bAlignment) ) { + + // change of chromosome, save ID, reset bin + if ( lastRefID != bAlignment.RefID ) { + lastRefID = bAlignment.RefID; + lastBin = defaultValue; + } + + // if lastCoordinate greater than BAM position - file not sorted properly + else if ( lastCoordinate > bAlignment.Position ) { + fprintf(stderr, "BAM file not properly sorted:\n"); + fprintf(stderr, "Alignment %s : %d > %d on reference (id = %d)", bAlignment.Name.c_str(), + lastCoordinate, bAlignment.Position, bAlignment.RefID); + 
exit(1); + } + + // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) + if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { + + // save linear offset entry (matched to BAM entry refID) + BamStandardIndexData::iterator indexIter = m_indexData.find(bAlignment.RefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + SaveLinearOffset(offsets, bAlignment, lastOffset); + } + + // if current BamAlignment bin != lastBin, "then possibly write the binning index" + if ( bAlignment.Bin != lastBin ) { + + // if not first time through + if ( saveBin != defaultValue ) { + + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } + + // update saveOffset + saveOffset = lastOffset; + + // update bin values + saveBin = bAlignment.Bin; + lastBin = bAlignment.Bin; + + // update saveRefID + saveRefID = bAlignment.RefID; + + // if invalid RefID, break out + if ( saveRefID < 0 ) break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { + fprintf(stderr, "Error in BGZF offsets.\n"); + exit(1); + } + + // update lastOffset + lastOffset = m_BGZF->Tell(); + + // update lastCoordinate + lastCoordinate = bAlignment.Position; + } + + // save any leftover BAM data (as long as refID is valid) + if ( saveRefID >= 0 ) { + // save Bam bin entry + BamStandardIndexData::iterator indexIter = m_indexData.find(saveRefID); + if ( indexIter == m_indexData.end() ) return false; // error + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& binMap = refIndex.Bins; + SaveBinEntry(binMap, saveBin, saveOffset, lastOffset); + } 
+ + // simplify index by merging chunks + MergeChunks(); + + // iterate through references in index + // sort offsets in linear offset vector + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { + + // get reference index data + ReferenceIndex& refIndex = (*indexIter).second; + LinearOffsetVector& offsets = refIndex.Offsets; + + // sort linear offsets + sort(offsets.begin(), offsets.end()); + } + + // rewind file pointer to beginning of alignments, return success/fail + return m_reader->Rewind(); +} + +// check index file magic number, return true if OK +bool BamStandardIndex::CheckMagicNumber(void) { + + // read in magic number + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + + // compare to expected value + if ( strncmp(magic, "BAI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + fclose(m_indexStream); + return false; + } + + // return success/failure of load + return (elementsRead == 4); +} + +// clear all current index offset data in memory +void BamStandardIndex::ClearAllData(void) { + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamStandardIndex::ClearReferenceOffsets(const int& refId) { + + // look up refId, skip if not found + BamStandardIndexData::iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return ; + + // clear reference data + ReferenceIndex& refEntry = (*indexIter).second; + refEntry.Bins.clear(); + refEntry.Offsets.clear(); + + // set flag + m_hasFullDataCache = false; +} + +// return file position after header metadata 
+const off_t BamStandardIndex::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculates offset(s) for a given region +bool BamStandardIndex::GetOffsets(const BamRegion& region, + const bool isRightBoundSpecified, + vector<int64_t>& offsets, + bool* hasAlignmentsInRegion) +{ + // return false if leftBound refID is not found in index data + if ( m_indexData.find(region.LeftRefID) == m_indexData.end() ) + return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // calculate which bins overlap this region + uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); + int numBins = BinsFromRegion(region, isRightBoundSpecified, bins); + + // get bins for this reference + BamStandardIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refIndex = (*indexIter).second; + const BamBinMap& binMap = refIndex.Bins; + + // get minimum offset to consider + const LinearOffsetVector& linearOffsets = refIndex.Offsets; + const uint64_t minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) + ? 
0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); + + // store all alignment 'chunk' starts (file offsets) for bins in this region + for ( int i = 0; i < numBins; ++i ) { + + const uint16_t binKey = bins[i]; + map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); + if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { + + // iterate over chunks + const ChunkVector& chunks = (*binIter).second; + std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); + std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); + for ( ; chunksIter != chunksEnd; ++chunksIter) { + + // if valid chunk found, store its file offset + const Chunk& chunk = (*chunksIter); + if ( chunk.Stop > minOffset ) + offsets.push_back( chunk.Start ); + } + } + } + + // clean up memory + free(bins); + + // sort the offsets before returning + sort(offsets.begin(), offsets.end()); + + // set flag & return success + *hasAlignmentsInRegion = (offsets.size() != 0 ); + + // if cache mode set to none, dump the data we just loaded + if (m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); + + // return succes + return true; +} + +// returns whether reference has alignments or no +bool BamStandardIndex::HasAlignments(const int& refId) const { + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; // error + const ReferenceIndex& refEntry = (*indexIter).second; + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamStandardIndex::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamStandardIndex::IsDataLoaded(const int& refId) const { + + // look up refId, return false if not found + BamStandardIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end() ) return false; + + // see if reference has 
alignments + // if not, it's not a problem to have no offset data + const ReferenceIndex& refEntry = (*indexIter).second; + if ( !refEntry.HasAlignments ) return true; + + // return whether bin map contains data + return ( !refEntry.Bins.empty() ); +} + +// attempts to use index to jump to region; returns success/fail +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + return false; + + // calculate offsets for this region + // if failed, print message, set flag, and return failure + vector<int64_t> offsets; + if ( !GetOffsets(region, region.isRightBoundSpecified(), offsets, hasAlignmentsInRegion) ) { + fprintf(stderr, "ERROR: Could not jump: unable to calculate offset(s) for specified region.\n"); + *hasAlignmentsInRegion = false; + return false; + } + + // iterate through offsets + BamAlignment bAlignment; + bool result = true; + for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { + + // attempt seek & load first available alignment + // set flag to true if data exists + result &= m_BGZF->Seek(*o); + *hasAlignmentsInRegion = m_reader->GetNextAlignmentCore(bAlignment); + + // if this alignment corresponds to desired position + // return success of seeking back to the offset before the 'current offset' (to cover overlaps) + if ( ((bAlignment.RefID == region.LeftRefID) && + ((bAlignment.Position + bAlignment.Length) > region.LeftPosition)) || + (bAlignment.RefID > region.LeftRefID) ) + { + if ( o != offsets.begin() ) --o; + return m_BGZF->Seek(*o); + } + } + + // if error in jumping, print message & set flag + if ( !result ) { + fprintf(stderr, "ERROR: Could not jump: unable to determine correct offset for specified region.\n"); + 
*hasAlignmentsInRegion = false; + } + + // return success/failure + return result; +} + +// clears index data from all references except the first +void BamStandardIndex::KeepOnlyFirstReferenceOffsets(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets((*indexBegin).first); +} + +// clears index data from all references except the one specified +void BamStandardIndex::KeepOnlyReferenceOffsets(const int& refId) { + BamStandardIndexData::iterator mapIter = m_indexData.begin(); + BamStandardIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); + } +} + +bool BamStandardIndex::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // get number of reference sequences + uint32_t numReferences; + if ( !LoadReferenceCount((int&)numReferences) ) + return false; + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < (int)numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of loading references + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamStandardIndex::LoadHeader(void) { + + bool loadedOk = CheckMagicNumber(); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of load + return loadedOk; +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadBin(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get bin ID + uint32_t binId; + elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); + if ( m_isBigEndian ) 
SwapEndian_32(binId); + + // load alignment chunks for this bin + ChunkVector chunks; + bool chunksOk = LoadChunks(chunks, saveData); + + // store bin entry + if ( chunksOk && saveData ) + refEntry.Bins.insert(pair<uint32_t, ChunkVector>(binId, chunks)); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +bool BamStandardIndex::LoadBins(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // get number of bins + int32_t numBins; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + + // set flag + refEntry.HasAlignments = ( numBins != 0 ); + + // iterate over bins + bool binsOk = true; + for ( int i = 0; i < numBins; ++i ) + binsOk &= LoadBin(refEntry, saveData); + + // return success/failure of load + return ( (elementsRead == 1) && binsOk ); +} + +// load a single index bin entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadChunk(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in chunk data + uint64_t start; + uint64_t stop; + elementsRead += fread(&start, sizeof(start), 1, m_indexStream); + elementsRead += fread(&stop, sizeof(stop), 1, m_indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // save data if requested + if ( saveData ) chunks.push_back( Chunk(start, stop) ); + + // return success/failure of load + return ( elementsRead == 2 ); +} + +bool BamStandardIndex::LoadChunks(ChunkVector& chunks, bool saveData) { + + size_t elementsRead = 0; + + // read in number of chunks + uint32_t numChunks; + elementsRead += fread(&numChunks, sizeof(numChunks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numChunks); + + // initialize space for chunks if we're storing this data + if ( saveData ) chunks.reserve(numChunks); + + // iterate 
over chunks + bool chunksOk = true; + for ( int i = 0; i < (int)numChunks; ++i ) + chunksOk &= LoadChunk(chunks, saveData); + + // sort chunk vector + sort( chunks.begin(), chunks.end(), ChunkLessThan ); + + // return success/failure of load + return ( (elementsRead == 1) && chunksOk ); +} + +// load a single index linear offset entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData) { + + size_t elementsRead = 0; + + // read in number of linear offsets + int32_t numLinearOffsets; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + + // set up destination vector (if we're saving the data) + LinearOffsetVector linearOffsets; + if ( saveData ) linearOffsets.reserve(numLinearOffsets); + + // iterate over linear offsets + uint64_t linearOffset; + for ( int i = 0; i < numLinearOffsets; ++i ) { + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + if ( saveData ) linearOffsets.push_back(linearOffset); + } + + // sort linear offsets + sort ( linearOffsets.begin(), linearOffsets.end() ); + + // save in reference index entry if desired + if ( saveData ) refEntry.Offsets = linearOffsets; + + // return success/failure of load + return ( elementsRead == (size_t)(numLinearOffsets + 1) ); +} + +bool BamStandardIndex::LoadFirstReference(bool saveData) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference((*indexBegin).first, saveData); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamStandardIndex::LoadReference(const int& refId, bool saveData) { + + // look up refId + BamStandardIndexData::iterator indexIter = 
m_indexData.find(refId); + + // if reference not previously loaded, create new entry + if ( indexIter == m_indexData.end() ) { + ReferenceIndex newEntry; + newEntry.HasAlignments = false; + m_indexData.insert( pair<int32_t, ReferenceIndex>(refId, newEntry) ); + } + + // load reference data + indexIter = m_indexData.find(refId); + ReferenceIndex& entry = (*indexIter).second; + bool loadedOk = true; + loadedOk &= LoadBins(entry, saveData); + loadedOk &= LoadLinearOffsets(entry, saveData); + return loadedOk; +} + +// loads number of references, return true if loaded OK +bool BamStandardIndex::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // return success/failure of load + return ( elementsRead == 1 ); +} + +// merges 'alignment chunks' in BAM bin (used for index building) +void BamStandardIndex::MergeChunks(void) { + + // iterate over reference enties + BamStandardIndexData::iterator indexIter = m_indexData.begin(); + BamStandardIndexData::iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + + // get BAM bin map for this reference + ReferenceIndex& refIndex = (*indexIter).second; + BamBinMap& bamBinMap = refIndex.Bins; + + // iterate over BAM bins + BamBinMap::iterator binIter = bamBinMap.begin(); + BamBinMap::iterator binEnd = bamBinMap.end(); + for ( ; binIter != binEnd; ++binIter ) { + + // get chunk vector for this bin + ChunkVector& binChunks = (*binIter).second; + if ( binChunks.size() == 0 ) continue; + + ChunkVector mergedChunks; + mergedChunks.push_back( binChunks[0] ); + + // iterate over chunks + int i = 0; + ChunkVector::iterator chunkIter = binChunks.begin(); + ChunkVector::iterator chunkEnd = binChunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentChunk' based on numeric index + Chunk& 
currentChunk = mergedChunks[i]; + + // get iteratorChunk based on vector iterator + Chunk& iteratorChunk = (*chunkIter); + + // if chunk ends where (iterator) chunk starts, then merge + if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) + currentChunk.Stop = iteratorChunk.Stop; + + // otherwise + else { + // set currentChunk + 1 to iteratorChunk + mergedChunks.push_back(iteratorChunk); + ++i; + } + } + + // saved merged chunk vector + (*binIter).second = mergedChunks; + } + } +} + +// saves BAM bin entry for index +void BamStandardIndex::SaveBinEntry(BamBinMap& binMap, + const uint32_t& saveBin, + const uint64_t& saveOffset, + const uint64_t& lastOffset) +{ + // look up saveBin + BamBinMap::iterator binIter = binMap.find(saveBin); + + // create new chunk + Chunk newChunk(saveOffset, lastOffset); + + // if entry doesn't exist + if ( binIter == binMap.end() ) { + ChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks)); + } + + // otherwise + else { + ChunkVector& binChunks = (*binIter).second; + binChunks.push_back( newChunk ); + } +} + +// saves linear offset entry for index +void BamStandardIndex::SaveLinearOffset(LinearOffsetVector& offsets, + const BamAlignment& bAlignment, + const uint64_t& lastOffset) +{ + // get converted offsets + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +// initializes index data structure to hold @count references +void BamStandardIndex::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +bool 
BamStandardIndex::SkipToFirstReference(void) { + BamStandardIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamStandardIndex::SkipToReference(const int& refId) { + + // attempt rewind + if ( !Rewind() ) return false; + + // read in number of references + uint32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; + } + + // return success + return skippedOk; +} + +// write header to new index file +bool BamStandardIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite("BAI\1", sizeof(char), 4, m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of write + return (elementsWritten == 4); +} + +// write index data for all references to new index file +bool BamStandardIndex::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of reference sequences + int32_t numReferenceSeqs = m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferenceSeqs); + elementsWritten += fwrite(&numReferenceSeqs, sizeof(numReferenceSeqs), 1, m_indexStream); + + // iterate over reference sequences + bool refsOk = true; + BamStandardIndexData::const_iterator indexIter = m_indexData.begin(); + BamStandardIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++ indexIter ) + refsOk &= WriteReference( (*indexIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && refsOk ); +} + 
+// write index data for bin to new index file +bool BamStandardIndex::WriteBin(const uint32_t& binId, const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + + // write chunks + bool chunksOk = WriteChunks(chunks); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for bins to new index file +bool BamStandardIndex::WriteBins(const BamBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + + // iterate over bins + bool binsOk = true; + BamBinMap::const_iterator binIter = bins.begin(); + BamBinMap::const_iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunk(const Chunk& chunk) { + + size_t elementsWritten = 0; + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); + + // return success/failure of write + return ( elementsWritten == 2 ); +} + +// write index data for chunk entry to new index file +bool BamStandardIndex::WriteChunks(const ChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) 
SwapEndian_32(chunkCount); + elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); + + // iterate over chunks + bool chunksOk = true; + ChunkVector::const_iterator chunkIter = chunks.begin(); + ChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + chunksOk &= WriteChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +// write index data for linear offsets entry to new index file +bool BamStandardIndex::WriteLinearOffsets(const LinearOffsetVector& offsets) { + + size_t elementsWritten = 0; + + // write number of linear offsets + int32_t offsetCount = offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(offsetCount); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + + // iterate over linear offsets + LinearOffsetVector::const_iterator offsetIter = offsets.begin(); + LinearOffsetVector::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + } + + // return success/failure of write + return ( elementsWritten == (size_t)(offsetCount + 1) ); +} + +// write index data for a single reference to new index file +bool BamStandardIndex::WriteReference(const ReferenceIndex& refEntry) { + bool refOk = true; + refOk &= WriteBins(refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.Offsets); + return refOk; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamStandardIndex_p.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,213 @@ +// *************************************************************************** +// BamStandardIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#ifndef BAM_STANDARD_INDEX_FORMAT_H +#define BAM_STANDARD_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
#include <BamAux.h>
#include <BamIndex.h>
#include <map>
#include <string>
#include <vector>

namespace BamTools {

class BamAlignment;

namespace Internal {

// BAM index constants
// MAX_BIN is the capacity of the bin array handed to BinsFromRegion()
const int MAX_BIN        = 37450;   // =(8^6-1)/7+1
// alignment positions are shifted right by this many bits to obtain
// their slot in a reference's linear offset vector
const int BAM_LIDX_SHIFT = 14;

// --------------------------------------------------
// BamStandardIndex data structures & typedefs

// a contiguous run of alignment data, described by a pair of file offsets
struct Chunk {

    // data members
    uint64_t Start;   // file offset where the chunk begins
    uint64_t Stop;    // file offset where the chunk ends

    // constructor
    Chunk(const uint64_t& start = 0,
          const uint64_t& stop = 0)
        : Start(start)
        , Stop(stop)
    { }
};

// ordering predicate for sorting chunks: compares Start offsets only
inline
bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) {
    return lhs.Start < rhs.Start;
}

typedef std::vector<Chunk> ChunkVector;               // chunks belonging to one bin
typedef std::map<uint32_t, ChunkVector> BamBinMap;    // bin ID -> chunks
typedef std::vector<uint64_t> LinearOffsetVector;     // linear offsets for one reference

// index data for a single reference sequence
struct ReferenceIndex {

    // data members
    BamBinMap Bins;               // bin ID -> chunks for this reference
    LinearOffsetVector Offsets;   // linear offsets for this reference
    bool HasAlignments;           // set by the loader to (bin count != 0)

    // constructor
    ReferenceIndex(const BamBinMap& binMap = BamBinMap(),
                   const LinearOffsetVector& offsets = LinearOffsetVector(),
                   const bool hasAlignments = false)
        : Bins(binMap)
        , Offsets(offsets)
        , HasAlignments(hasAlignments)
    { }
};

// reference ID -> index data for that reference
typedef std::map<int32_t, ReferenceIndex> BamStandardIndexData;

// reader/writer for the standardized BAM index format (".bai")
class BamStandardIndex : public BamIndex {

    // ctor & dtor
    public:
        BamStandardIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
        ~BamStandardIndex(void);

    // interface (implements BamIndex virtual methods)
    public:
        // creates index data (in-memory) from current reader data
        bool Build(void);
        // returns supported file extension
        const std::string Extension(void) const { return std::string(".bai"); }
        // returns whether reference has alignments or not
        // (also false when @referenceID is not present in the index data)
        bool HasAlignments(const int& referenceID) const;
        // attempts to use index to jump to region; returns success/fail
        // a "successful" jump indicates no error, but not whether this region has data
        //   * thus, the method sets a flag to indicate whether there are alignments
        //     available after the jump position
        bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
    public:
        // clear all current index offset data in memory
        void ClearAllData(void);
        // return file position after header metadata
        const off_t DataBeginOffset(void) const;
        // return true if all index data is cached
        bool HasFullDataCache(void) const;
        // clears index data from all references except the first
        void KeepOnlyFirstReferenceOffsets(void);
        // load index data for all references, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadAllReferences(bool saveData = true);
        // load first reference from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadFirstReference(bool saveData = true);
        // load header data from index file, return true if loaded OK
        bool LoadHeader(void);
        // position file pointer to first reference begin, return true if skipped OK
        bool SkipToFirstReference(void);
        // write index reference data (reference count, then every reference entry)
        bool WriteAllReferences(void);
        // write index header data (the "BAI\1" magic number)
        bool WriteHeader(void);

    // 'internal' methods
    public:

        // -----------------------
        // index file operations

        // check index file magic number, return true if OK
        bool CheckMagicNumber(void);
        // check index file version, return true if OK
        bool CheckVersion(void);
        // load a single index bin entry (bin ID + its chunks) from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadBin(ReferenceIndex& refEntry, bool saveData = true);
        // load the bin count and then every bin entry for a reference, return true if loaded OK
        // also sets refEntry.HasAlignments to (bin count != 0)
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadBins(ReferenceIndex& refEntry, bool saveData = true);
        // load a single chunk (start/stop offset pair) from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadChunk(ChunkVector& chunks, bool saveData = true);
        // load the chunk count and then every chunk of a bin, sorted by start offset,
        // return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadChunks(ChunkVector& chunks, bool saveData = true);
        // load the linear offset count and all linear offsets for a reference,
        // return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadLinearOffsets(ReferenceIndex& refEntry, bool saveData = true);
        // load a single reference (bins, then linear offsets) from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadReference(const int& refId, bool saveData = true);
        // loads number of references, return true if loaded OK
        bool LoadReferenceCount(int& numReferences);
        // position file pointer to desired reference begin, return true if skipped OK
        // (rewinds, then reads & discards the entries of all preceding references)
        bool SkipToReference(const int& refId);
        // write index data for bin to new index file
        bool WriteBin(const uint32_t& binId, const ChunkVector& chunks);
        // write index data for bins to new index file
        bool WriteBins(const BamBinMap& bins);
        // write index data for chunk entry to new index file
        bool WriteChunk(const Chunk& chunk);
        // write index data for all chunk entries of a bin to new index file
        bool WriteChunks(const ChunkVector& chunks);
        // write index data for linear offsets entry to new index file
        bool WriteLinearOffsets(const LinearOffsetVector& offsets);
        // write index data for a single reference to new index file
        bool WriteReference(const ReferenceIndex& refEntry);

        // -----------------------
        // index data operations

        // calculate bins that overlap region
        int BinsFromRegion(const BamRegion& region,
                           const bool isRightBoundSpecified,
                           uint16_t bins[MAX_BIN]);
        // clear all index offset data for desired reference
        void ClearReferenceOffsets(const int& refId);
        // calculates offset(s) for a given region
        bool GetOffsets(const BamRegion& region,
                        const bool isRightBoundSpecified,
                        std::vector<int64_t>& offsets,
                        bool* hasAlignmentsInRegion);
        // returns true if index cache has data for desired reference
        // (a reference flagged as having no alignments counts as loaded)
        bool IsDataLoaded(const int& refId) const;
        // clears index data from all references except the one specified
        void KeepOnlyReferenceOffsets(const int& refId);
        // simplifies index by merging 'chunks'
        void MergeChunks(void);
        // saves BAM bin entry for index
        // (appends a chunk [saveOffset, lastOffset] to @saveBin, creating the bin if needed)
        void SaveBinEntry(BamBinMap& binMap,
                          const uint32_t& saveBin,
                          const uint64_t& saveOffset,
                          const uint64_t& lastOffset);
        // saves linear offset entry for index
        void SaveLinearOffset(LinearOffsetVector& offsets,
                              const BamAlignment& bAlignment,
                              const uint64_t& lastOffset);
        // initializes index data structure to hold @count references
        void SetReferenceCount(const int& count);

    // data members
    private:

        BamStandardIndexData m_indexData;   // reference ID -> bins & linear offsets
        off_t m_dataBeginOffset;            // index file position recorded right after the header
        bool  m_hasFullDataCache;           // true once data for all references has been loaded & kept
        bool  m_isBigEndian;                // when true, multi-byte values are byte-swapped on read/write
};

} // namespace Internal
} // namespace BamTools

#endif // BAM_STANDARD_INDEX_FORMAT_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamToolsIndex_p.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,577 @@ +// *************************************************************************** +// BamToolsIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#include <BamAlignment.h> +#include <BamReader.h> +#include <BGZF.h> +#include <BamToolsIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <algorithm> +#include <iostream> +#include <map> +using namespace std; + +BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader) + : BamIndex(bgzf, reader) + , m_blockSize(1000) + , m_dataBeginOffset(0) + , m_hasFullDataCache(false) + , m_inputVersion(0) + , m_outputVersion(BTI_1_2) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamToolsIndex::~BamToolsIndex(void) { + ClearAllData(); +} + +// creates index data (in-memory) from current reader data +bool BamToolsIndex::Build(void) { + + // be sure reader & BGZF file are valid & open for reading + if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) + return false; + + // move file pointer to beginning of alignments + if ( !m_reader->Rewind() ) return false; + + // initialize index data structure with space for all references + const int numReferences = (int)m_references.size(); + m_indexData.clear(); + m_hasFullDataCache = false; + SetReferenceCount(numReferences); + + // set up counters and markers + int32_t currentBlockCount = 0; + int64_t 
currentAlignmentOffset = m_BGZF->Tell(); + int32_t blockRefId = 0; + int32_t blockMaxEndPosition = 0; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + while ( m_reader->GetNextAlignmentCore(al) ) { + + // if block contains data (not the first time through) AND alignment is on a new reference + if ( currentBlockCount > 0 && al.RefID != blockRefId ) { + + // store previous data + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + + // intialize new block for current alignment's reference + currentBlockCount = 0; + blockMaxEndPosition = al.GetEndPosition(); + blockStartOffset = currentAlignmentOffset; + } + + // if beginning of block, save first alignment's refID & position + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartPosition = al.Position; + } + + // increment block counter + ++currentBlockCount; + + // check end position + int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + blockStartOffset = m_BGZF->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the *current* alignment + // necessary because we won't know if this next alignment is on a new reference until we actually read it + currentAlignmentOffset = m_BGZF->Tell(); + } + + // store final block with data + BamToolsIndexEntry entry(blockMaxEndPosition, blockStartOffset, blockStartPosition); + SaveOffsetEntry(blockRefId, entry); + + // set flag + m_hasFullDataCache = true; + + // return 
success/failure of rewind + return m_reader->Rewind(); +} + +// check index file magic number, return true if OK +bool BamToolsIndex::CheckMagicNumber(void) { + + // see if index is valid BAM index + char magic[4]; + size_t elementsRead = fread(magic, 1, 4, m_indexStream); + if ( elementsRead != 4 ) return false; + if ( strncmp(magic, "BTI\1", 4) != 0 ) { + fprintf(stderr, "Problem with index file - invalid format.\n"); + return false; + } + + // otherwise ok + return true; +} + +// check index file version, return true if OK +bool BamToolsIndex::CheckVersion(void) { + + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) { + fprintf(stderr, "Problem with index file - invalid version.\n"); + return false; + } + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + fprintf(stderr, "Problem with index file - attempting to use an outdated version of BamTools with a newer index file.\n"); + fprintf(stderr, "Please update BamTools to a more recent version to support this index file.\n"); + return false; + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (typically whose format did not accomodate a particular bug fix) + + else if ( (Version)m_inputVersion == BTI_1_0 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to accessing data near reference ends.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + else if ( (Version)m_inputVersion == BTI_1_1 ) { + fprintf(stderr, "\nProblem with index file - this version of the index contains a bug related to 
handling empty references.\n"); + fprintf(stderr, "\nPlease run \'bamtools index -bti -in yourData.bam\' to generate an up-to-date BamToolsIndex.\n\n"); + return false; + } + + // otherwise ok + else return true; +} + +// clear all current index offset data in memory +void BamToolsIndex::ClearAllData(void) { + BamToolsIndexData::const_iterator indexIter = m_indexData.begin(); + BamToolsIndexData::const_iterator indexEnd = m_indexData.end(); + for ( ; indexIter != indexEnd; ++indexIter ) { + const int& refId = (*indexIter).first; + ClearReferenceOffsets(refId); + } +} + +// clear all index offset data for desired reference +void BamToolsIndex::ClearReferenceOffsets(const int& refId) { + if ( m_indexData.find(refId) == m_indexData.end() ) return; + vector<BamToolsIndexEntry>& offsets = m_indexData[refId].Offsets; + offsets.clear(); + m_hasFullDataCache = false; +} + +// return file position after header metadata +const off_t BamToolsIndex::DataBeginOffset(void) const { + return m_dataBeginOffset; +} + +// calculate BAM file offset for desired region +// return true if no error (*NOT* equivalent to "has alignments or valid offset") +// check @hasAlignmentsInRegion to determine this status +// @region - target region +// @offset - resulting seek target +// @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status +// N.B. 
- ignores isRightBoundSpecified +bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // return false if leftBound refID is not found in index data + BamToolsIndexData::const_iterator indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end()) return false; + + // load index data for region if not already cached + if ( !IsDataLoaded(region.LeftRefID) ) { + bool loadedOk = true; + loadedOk &= SkipToReference(region.LeftRefID); + loadedOk &= LoadReference(region.LeftRefID); + if ( !loadedOk ) return false; + } + + // localize index data for this reference (& sanity check that data actually exists) + indexIter = m_indexData.find(region.LeftRefID); + if ( indexIter == m_indexData.end()) return false; + const vector<BamToolsIndexEntry>& referenceOffsets = (*indexIter).second.Offsets; + if ( referenceOffsets.empty() ) return false; + + // ------------------------------------------------------- + // calculate nearest index to jump to + + // save first offset + offset = (*referenceOffsets.begin()).StartOffset; + + // iterate over offsets entries on this reference + vector<BamToolsIndexEntry>::const_iterator offsetIter = referenceOffsets.begin(); + vector<BamToolsIndexEntry>::const_iterator offsetEnd = referenceOffsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + const BamToolsIndexEntry& entry = (*offsetIter); + // break if alignment 'entry' overlaps region + if ( entry.MaxEndPosition >= region.LeftPosition ) break; + offset = (*offsetIter).StartOffset; + } + + // set flag based on whether an index entry was found for this region + *hasAlignmentsInRegion = ( offsetIter != offsetEnd ); + + // if cache mode set to none, dump the data we just loaded + if (m_cacheMode == BamIndex::NoIndexCaching ) + ClearReferenceOffsets(region.LeftRefID); + + // return success + return true; +} + +// returns whether reference has alignments or no +bool BamToolsIndex::HasAlignments(const int& 
refId) const { + + BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end()) return false; + const BamToolsReferenceEntry& refEntry = (*indexIter).second; + return refEntry.HasAlignments; +} + +// return true if all index data is cached +bool BamToolsIndex::HasFullDataCache(void) const { + return m_hasFullDataCache; +} + +// returns true if index cache has data for desired reference +bool BamToolsIndex::IsDataLoaded(const int& refId) const { + + BamToolsIndexData::const_iterator indexIter = m_indexData.find(refId); + if ( indexIter == m_indexData.end()) return false; + const BamToolsReferenceEntry& refEntry = (*indexIter).second; + + if ( !refEntry.HasAlignments ) return true; // no data period + + // return whether offsets list contains data + return !refEntry.Offsets.empty(); +} + +// attempts to use index to jump to region; returns success/fail +bool BamToolsIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear flag + *hasAlignmentsInRegion = false; + + // check valid BamReader state + if ( m_reader == 0 || m_BGZF == 0 || !m_reader->IsOpen() ) { + fprintf(stderr, "ERROR: Could not jump: invalid BamReader state.\n"); + return false; + } + + // make sure left-bound position is valid + if ( region.LeftPosition > m_references.at(region.LeftRefID).RefLength ) + return false; + + // calculate nearest offset to jump to + int64_t offset; + if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { + fprintf(stderr, "ERROR: Could not jump - unable to calculate offset for specified region.\n"); + return false; + } + + // return success/failure of seek + return m_BGZF->Seek(offset); +} + +// clears index data from all references except the first +void BamToolsIndex::KeepOnlyFirstReferenceOffsets(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + KeepOnlyReferenceOffsets( (*indexBegin).first ); +} + +// clears index data from all references except the one specified 
+void BamToolsIndex::KeepOnlyReferenceOffsets(const int& refId) { + BamToolsIndexData::iterator mapIter = m_indexData.begin(); + BamToolsIndexData::iterator mapEnd = m_indexData.end(); + for ( ; mapIter != mapEnd; ++mapIter ) { + const int entryRefId = (*mapIter).first; + if ( entryRefId != refId ) + ClearReferenceOffsets(entryRefId); + } +} + +// load index data for all references, return true if loaded OK +bool BamToolsIndex::LoadAllReferences(bool saveData) { + + // skip if data already loaded + if ( m_hasFullDataCache ) return true; + + // read in number of references + int32_t numReferences; + if ( !LoadReferenceCount(numReferences) ) return false; + //SetReferenceCount(numReferences); + + // iterate over reference entries + bool loadedOk = true; + for ( int i = 0; i < numReferences; ++i ) + loadedOk &= LoadReference(i, saveData); + + // set flag + if ( loadedOk && saveData ) + m_hasFullDataCache = true; + + // return success/failure of load + return loadedOk; +} + +// load header data from index file, return true if loaded OK +bool BamToolsIndex::LoadHeader(void) { + + // check magic number + if ( !CheckMagicNumber() ) return false; + + // check BTI version + if ( !CheckVersion() ) return false; + + // read in block size + size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of load + return (elementsRead == 1); +} + +// load a single index entry from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadIndexEntry(const int& refId, bool saveData) { + + // read in index entry data members + size_t elementsRead = 0; + BamToolsIndexEntry entry; + elementsRead += fread(&entry.MaxEndPosition, sizeof(entry.MaxEndPosition), 1, m_indexStream); + elementsRead 
+= fread(&entry.StartOffset, sizeof(entry.StartOffset), 1, m_indexStream); + elementsRead += fread(&entry.StartPosition, sizeof(entry.StartPosition), 1, m_indexStream); + if ( elementsRead != 3 ) { + cerr << "Error reading index entry. Expected 3 elements, read in: " << elementsRead << endl; + return false; + } + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(entry.MaxEndPosition); + SwapEndian_64(entry.StartOffset); + SwapEndian_32(entry.StartPosition); + } + + // save data + if ( saveData ) + SaveOffsetEntry(refId, entry); + + // return success/failure of load + return true; +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadFirstReference(bool saveData) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return LoadReference( (*indexBegin).first, saveData ); +} + +// load a single reference from file, return true if loaded OK +// @saveData - save data in memory if true, just read & discard if false +bool BamToolsIndex::LoadReference(const int& refId, bool saveData) { + + // read in number of offsets for this reference + uint32_t numOffsets; + size_t elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + + // initialize offsets container for this reference + SetOffsetCount(refId, (int)numOffsets); + + // iterate over offset entries + for ( unsigned int j = 0; j < numOffsets; ++j ) + LoadIndexEntry(refId, saveData); + + // return success/failure of load + return true; +} + +// loads number of references, return true if loaded OK +bool BamToolsIndex::LoadReferenceCount(int& numReferences) { + + size_t elementsRead = 0; + + // read reference count + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // 
return success/failure of load + return ( elementsRead == 1 ); +} + +// saves an index offset entry in memory +void BamToolsIndex::SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.HasAlignments = true; + refEntry.Offsets.push_back(entry); +} + +// pre-allocates size for offset vector +void BamToolsIndex::SetOffsetCount(const int& refId, const int& offsetCount) { + BamToolsReferenceEntry& refEntry = m_indexData[refId]; + refEntry.Offsets.reserve(offsetCount); + refEntry.HasAlignments = ( offsetCount > 0); +} + +// initializes index data structure to hold @count references +void BamToolsIndex::SetReferenceCount(const int& count) { + for ( int i = 0; i < count; ++i ) + m_indexData[i].HasAlignments = false; +} + +// position file pointer to first reference begin, return true if skipped OK +bool BamToolsIndex::SkipToFirstReference(void) { + BamToolsIndexData::const_iterator indexBegin = m_indexData.begin(); + return SkipToReference( (*indexBegin).first ); +} + +// position file pointer to desired reference begin, return true if skipped OK +bool BamToolsIndex::SkipToReference(const int& refId) { + + // attempt rewind + if ( !Rewind() ) return false; + + // read in number of references + int32_t numReferences; + size_t elementsRead = fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(numReferences); + + // iterate over reference entries + bool skippedOk = true; + int currentRefId = 0; + while (currentRefId != refId) { + skippedOk &= LoadReference(currentRefId, false); + ++currentRefId; + } + + // return success/failure of skip + return skippedOk; +} + +// write header to new index file +bool BamToolsIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write BTI index format 'magic number' + elementsWritten += fwrite("BTI\1", 1, 4, m_indexStream); + + // write BTI index format version 
+ int32_t currentVersion = (int32_t)m_outputVersion; + if ( m_isBigEndian ) SwapEndian_32(currentVersion); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); + + // write block size + int32_t blockSize = m_blockSize; + if ( m_isBigEndian ) SwapEndian_32(blockSize); + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); + + // store offset of beginning of data + m_dataBeginOffset = ftell64(m_indexStream); + + // return success/failure of write + return ( elementsWritten == 6 ); +} + +// write index data for all references to new index file +bool BamToolsIndex::WriteAllReferences(void) { + + size_t elementsWritten = 0; + + // write number of references + int32_t numReferences = (int32_t)m_indexData.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + + // iterate through references in index + bool refOk = true; + BamToolsIndexData::const_iterator refIter = m_indexData.begin(); + BamToolsIndexData::const_iterator refEnd = m_indexData.end(); + for ( ; refIter != refEnd; ++refIter ) + refOk &= WriteReferenceEntry( (*refIter).second ); + + return ( (elementsWritten == 1) && refOk ); +} + +// write current reference index data to new index file +bool BamToolsIndex::WriteReferenceEntry(const BamToolsReferenceEntry& refEntry) { + + size_t elementsWritten = 0; + + // write number of offsets listed for this reference + uint32_t numOffsets = refEntry.Offsets.size(); + if ( m_isBigEndian ) SwapEndian_32(numOffsets); + elementsWritten += fwrite(&numOffsets, sizeof(numOffsets), 1, m_indexStream); + + // iterate over offset entries + bool entriesOk = true; + vector<BamToolsIndexEntry>::const_iterator offsetIter = refEntry.Offsets.begin(); + vector<BamToolsIndexEntry>::const_iterator offsetEnd = refEntry.Offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) + entriesOk &= WriteIndexEntry( (*offsetIter) ); + + return ( 
(elementsWritten == 1) && entriesOk ); +} + +// write current index offset entry to new index file +bool BamToolsIndex::WriteIndexEntry(const BamToolsIndexEntry& entry) { + + // copy entry data + int32_t maxEndPosition = entry.MaxEndPosition; + int64_t startOffset = entry.StartOffset; + int32_t startPosition = entry.StartPosition; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); + return ( elementsWritten == 3 ); +}
// hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamToolsIndex_p.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,192 @@
// ***************************************************************************
// BamToolsIndex.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides index operations for the BamTools index format (".bti")
// ***************************************************************************

#ifndef BAMTOOLS_INDEX_FORMAT_H
#define BAMTOOLS_INDEX_FORMAT_H

// -------------
// W A R N I N G
// -------------
//
// This file is not part of the BamTools API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.

#include <BamAux.h>
#include <BamIndex.h>
#include <map>
#include <string>
#include <vector>

namespace BamTools {

namespace Internal {

// individual index offset entry
// (the three members are read and written to the index file one after the
//  other, in declaration order, by LoadIndexEntry() / WriteIndexEntry())
struct BamToolsIndexEntry {

    // data members
    int32_t MaxEndPosition;   // compared against region.LeftPosition in GetOffset()
    int64_t StartOffset;      // seek target handed to the BGZF stream by Jump()
    int32_t StartPosition;

    // ctor
    BamToolsIndexEntry(const int32_t& maxEndPosition = 0,
                       const int64_t& startOffset = 0,
                       const int32_t& startPosition = 0)
        : MaxEndPosition(maxEndPosition)
        , StartOffset(startOffset)
        , StartPosition(startPosition)
    { }
};

// reference index entry
struct BamToolsReferenceEntry {

    // data members
    bool HasAlignments;                        // false until offsets are counted/saved for the reference
    std::vector<BamToolsIndexEntry> Offsets;   // cached index entries (may be cleared again to save memory)

    // ctor
    BamToolsReferenceEntry(void)
        : HasAlignments(false)
    { }
};

// the actual index data structure
// (keyed by reference ID)
typedef std::map<int, BamToolsReferenceEntry> BamToolsIndexData;

class BamToolsIndex : public BamIndex {

    // keep a list of any supported versions here
    // (might be useful later to handle any 'legacy' versions if the format changes)
    // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on
    //
    // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by:
    //
    //   if ( indexVersion >= BTI_1_2 )
    //      do something new
    //   else
    //      do the old thing
    enum Version { BTI_1_0 = 1
                 , BTI_1_1
                 , BTI_1_2
                 };


    // ctor & dtor
    public:
        BamToolsIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader);
        ~BamToolsIndex(void);

    // interface (implements BamIndex virtual methods)
    public:
        // creates index data (in-memory) from current reader data
        bool Build(void);
        // returns supported file extension
        const std::string Extension(void) const { return std::string(".bti"); }
        // returns whether reference has alignments or no
        bool HasAlignments(const int& referenceID) const;
        // attempts to use index to jump to region; returns success/fail
        // a "successful" jump indicates no error, but not whether this region has data
        //   * thus, the method sets a flag to indicate whether there are alignments
        //     available after the jump position
        bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion);
    public:
        // clear all current index offset data in memory
        void ClearAllData(void);
        // return file position after header metadata
        const off_t DataBeginOffset(void) const;
        // return true if all index data is cached
        bool HasFullDataCache(void) const;
        // clears index data from all references except the first
        void KeepOnlyFirstReferenceOffsets(void);
        // load index data for all references, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadAllReferences(bool saveData = true);
        // load first reference from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadFirstReference(bool saveData = true);
        // load header data from index file, return true if loaded OK
        bool LoadHeader(void);
        // position file pointer to first reference begin, return true if skipped OK
        bool SkipToFirstReference(void);
        // write index reference data
        bool WriteAllReferences(void);
        // write index header data
        bool WriteHeader(void);

    // 'internal' methods
    public:

        // -----------------------
        // index file operations

        // check index file magic number, return true if OK
        bool CheckMagicNumber(void);
        // check index file version, return true if OK
        bool CheckVersion(void);
        // return true if FILE* is open
        bool IsOpen(void) const;
        // load a single index entry from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadIndexEntry(const int& refId, bool saveData = true);
        // load a single reference from file, return true if loaded OK
        // @saveData - save data in memory if true, just read & discard if false
        bool LoadReference(const int& refId, bool saveData = true);
        // loads number of references, return true if loaded OK
        bool LoadReferenceCount(int& numReferences);
        // position file pointer to desired reference begin, return true if skipped OK
        bool SkipToReference(const int& refId);
        // write current reference index data to new index file
        bool WriteReferenceEntry(const BamToolsReferenceEntry& refEntry);
        // write current index offset entry to new index file
        bool WriteIndexEntry(const BamToolsIndexEntry& entry);

        // -----------------------
        // index data operations

        // clear all index offset data for desired reference
        void ClearReferenceOffsets(const int& refId);
        // calculate BAM file offset for desired region
        // return true if no error (*NOT* equivalent to "has alignments or valid offset")
        //   check @hasAlignmentsInRegion to determine this status
        // @region - target region
        // @offset - resulting seek target
        // @hasAlignmentsInRegion - sometimes a file just lacks data in region, this flag indicates that status
        bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion);
        // returns true if index cache has data for desired reference
        bool IsDataLoaded(const int& refId) const;
        // clears index data from all references except the one specified
        void KeepOnlyReferenceOffsets(const int& refId);
        // saves an index offset entry in memory
        void SaveOffsetEntry(const int& refId, const BamToolsIndexEntry& entry);
        // pre-allocates size for offset vector
        void SetOffsetCount(const int& refId, const int& offsetCount);
        // initializes index data structure to hold @count references
        void SetReferenceCount(const int& count);

    // data members
    private:
        int32_t           m_blockSize;          // read by LoadHeader(), written by WriteHeader()
        BamToolsIndexData m_indexData;          // per-reference offset cache
        off_t             m_dataBeginOffset;    // stream position recorded right after the header
        bool              m_hasFullDataCache;   // set by LoadAllReferences(), reset by ClearReferenceOffsets()
        bool              m_isBigEndian;        // when true, values are byte-swapped on read and write
        int32_t           m_inputVersion; // Version is serialized as int
        Version           m_outputVersion;      // version number emitted by WriteHeader()
};

} // namespace Internal
} // namespace BamTools

#endif // BAMTOOLS_INDEX_FORMAT_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamWriter.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,47 @@ +// *************************************************************************** +// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include <BamWriter.h> +#include <BamWriter_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +using namespace std; + +// constructor +BamWriter::BamWriter(void) { + d = new BamWriterPrivate; +} + +// destructor +BamWriter::~BamWriter(void) { + delete d; + d = 0; +} + +// closes the alignment archive +void BamWriter::Close(void) { + d->Close(); +} + +// opens the alignment archive +bool BamWriter::Open(const string& filename, + const string& samHeader, + const RefVector& referenceSequences, + bool isWriteUncompressed) +{ + return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); +} + +// saves the alignment to the alignment archive +void BamWriter::SaveAlignment(const BamAlignment& al) { + d->SaveAlignment(al); +}
// hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamWriter.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,50 @@
// ***************************************************************************
// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************

#ifndef BAMWRITER_H
#define BAMWRITER_H

#include <api_global.h>
#include <BamAlignment.h>
#include <string>

namespace BamTools {

// forward declaration keeps the implementation class out of the public header
namespace Internal {
    class BamWriterPrivate;
} // namespace Internal

// Public BAM writer; all work is delegated to an Internal::BamWriterPrivate
// instance owned through the raw pointer 'd'.
//
// NOTE(review): no copy constructor or assignment operator is declared, while
// the destructor in BamWriter.cpp deletes 'd'. Copying a BamWriter would
// therefore leave two objects deleting the same pointer - treat the class as
// non-copyable.
class API_EXPORT BamWriter {

    // constructor/destructor
    public:
        BamWriter(void);
        ~BamWriter(void);

    // public interface
    public:
        // closes the alignment archive
        void Close(void);
        // opens the alignment archive
        // returns whatever the private implementation's Open() reports
        bool Open(const std::string& filename,
                  const std::string& samHeader,
                  const BamTools::RefVector& referenceSequences,
                  bool writeUncompressed = false);
        // saves the alignment to the alignment archive
        void SaveAlignment(const BamTools::BamAlignment& al);

    // private implementation
    private:
        Internal::BamWriterPrivate* d;   // created in the ctor, deleted in the dtor
};

} // namespace BamTools

#endif // BAMWRITER_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamWriter_p.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,379 @@ +// *************************************************************************** +// BamWriter_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include <BamAlignment.h> +#include <BamWriter_p.h> +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +BamWriterPrivate::BamWriterPrivate(void) { + IsBigEndian = SystemIsBigEndian(); +} + +BamWriterPrivate::~BamWriterPrivate(void) { + mBGZF.Close(); +} + +// closes the alignment archive +void BamWriterPrivate::Close(void) { + mBGZF.Close(); +} + +// calculates minimum bin for a BAM alignment interval +const unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + +// creates a cigar string from the supplied alignment +void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + unsigned int cigarOp; + 
vector<CigarOp>::const_iterator coIter; + for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { + + switch(coIter->Type) { + case 'M': + cigarOp = BAM_CMATCH; + break; + case 'I': + cigarOp = BAM_CINS; + break; + case 'D': + cigarOp = BAM_CDEL; + break; + case 'N': + cigarOp = BAM_CREF_SKIP; + break; + case 'S': + cigarOp = BAM_CSOFT_CLIP; + break; + case 'H': + cigarOp = BAM_CHARD_CLIP; + break; + case 'P': + cigarOp = BAM_CPAD; + break; + default: + fprintf(stderr, "ERROR: Unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while(*pQuery) { + + switch(*pQuery) { + + case '=': + nucleotideCode = 0; + break; + + case 'A': + nucleotideCode = 1; + break; + + case 'C': + nucleotideCode = 2; + break; + + case 'G': + nucleotideCode = 4; + break; + + case 'T': + nucleotideCode = 8; + break; + + case 'N': + nucleotideCode = 15; + break; + + default: + fprintf(stderr, "ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. 
Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if(useHighWord) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + pEncodedQuery++; + useHighWord = true; + } + + // increment the query position + pQuery++; + } +} + +// opens the alignment archive +bool BamWriterPrivate::Open(const string& filename, + const string& samHeader, + const RefVector& referenceSequences, + bool isWriteUncompressed) +{ + // open the BGZF file for writing, return failure if error + if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) + return false; + + // ================ + // write the header + // ================ + + // write the BAM signature + const unsigned char SIGNATURE_LENGTH = 4; + const char* BAM_SIGNATURE = "BAM\1"; + mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); + + // write the SAM header text length + uint32_t samHeaderLen = samHeader.size(); + if (IsBigEndian) SwapEndian_32(samHeaderLen); + mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); + + // write the SAM header text + if(samHeaderLen > 0) + mBGZF.Write(samHeader.data(), samHeaderLen); + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if (IsBigEndian) SwapEndian_32(numReferenceSequences); + mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); + + // ============================= + // write the sequence dictionary + // ============================= + + RefVector::const_iterator rsIter; + for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); + mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); + + // write the reference sequence name + mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + int32_t 
referenceLength = rsIter->RefLength; + if (IsBigEndian) SwapEndian_32(referenceLength); + mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); + } + + // return success + return true; +} + +// saves the alignment to the alignment archive +void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the raw char data + mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); + } + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) + else { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing 
+ const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; + unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; + if (IsBigEndian) SwapEndian_32(blockSize); + mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[8]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( IsBigEndian ) { + for ( int i = 0; i < 8; ++i ) + SwapEndian_32(buffer[i]); + } + + // write the BAM core + mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); + + // write the query name + mBGZF.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( IsBigEndian ) { + + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + + for (unsigned int i = 0; i < packedCigarLength; ++i) { + if ( IsBigEndian ) + SwapEndian_32p(&cigarData[i]); + } + + mBGZF.Write(cigarData, packedCigarLength); + free(cigarData); + } + else + mBGZF.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + mBGZF.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + string baseQualities(al.Qualities); + char* 
pBaseQualities = (char*)al.Qualities.data(); + for(unsigned int i = 0; i < queryLength; i++) { + pBaseQualities[i] -= 33; + } + mBGZF.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( IsBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += 2; // skip tag type (e.g. "RG", "NM", etc) + uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning + ++i; // skip value type + + switch (type) { + + case('A') : + case('C') : + ++i; + break; + + case('S') : + SwapEndian_16p(&tagData[i]); + i+=2; // sizeof(uint16_t) + break; + + case('F') : + case('I') : + SwapEndian_32p(&tagData[i]); + i+=4; // sizeof(uint32_t) + break; + + case('D') : + SwapEndian_64p(&tagData[i]); + i+=8; // sizeof(uint64_t) + break; + + case('H') : + case('Z') : + while (tagData[i]) { ++i; } + ++i; // increment one more for null terminator + break; + + default : + fprintf(stderr, "ERROR: Invalid tag value type\n"); // shouldn't get here + free(tagData); + exit(1); + } + } + + mBGZF.Write(tagData, tagDataLength); + free(tagData); + } + else + mBGZF.Write(al.TagData.data(), tagDataLength); + } +}
// hg diff header: --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/BamWriter_p.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,63 @@
// ***************************************************************************
// BamWriter_p.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides the basic functionality for producing BAM files
// ***************************************************************************

#ifndef BAMWRITER_P_H
#define BAMWRITER_P_H

// -------------
// W A R N I N G
// -------------
//
// This file is not part of the BamTools API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.

#include <BamAux.h>
#include <BGZF.h>
#include <string>
#include <vector>

namespace BamTools {
namespace Internal {

// Implementation object behind BamWriter: owns the BGZF output stream and
// performs the actual BAM serialization.
class BamWriterPrivate {

    // ctor & dtor
    public:
        BamWriterPrivate(void);
        ~BamWriterPrivate(void);

    // "public" interface to BamWriter
    public:
        // closes the BGZF stream
        void Close(void);
        // opens @filename for writing and emits signature, SAM header text and
        // reference dictionary; false if the file could not be opened
        bool Open(const std::string& filename,
                  const std::string& samHeader,
                  const BamTools::RefVector& referenceSequences,
                  bool isWriteUncompressed);
        // serializes one alignment to the stream
        void SaveAlignment(const BamAlignment& al);

    // internal methods
    public:
        // calculates minimum bin for a BAM alignment interval
        const unsigned int CalculateMinimumBin(const int begin, int end) const;
        // packs CIGAR operations into 32-bit words stored in @packedCigar
        void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar);
        // encodes @query into 4-bit-per-base notation stored in @encodedQuery
        void EncodeQuerySequence(const std::string& query, std::string& encodedQuery);

    // data members
    public:
        BgzfData mBGZF;        // output stream
        bool IsBigEndian;      // host byte order, set in the ctor from SystemIsBigEndian()
};

} // namespace Internal
} // namespace BamTools

#endif // BAMWRITER_P_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/Makevars.in Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,4 @@ +PKG_LIBS=@LIBS@ -lz +PKG_CFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@ +PKG_CXXFLAGS=-I./ -D_FASTMAP -DMAQ_LONGREADS @HAVE_LIBBZ2@ +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/api_global.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,22 @@ +// *************************************************************************** +// api_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 November 2010 (DB) +// --------------------------------------------------------------------------- +// Provides macros for exporting & importing BamTools API library symbols +// *************************************************************************** + +#ifndef API_GLOBAL_H +#define API_GLOBAL_H + +#include "bamtools_global.h" + +#ifdef BAMTOOLS_API_LIBRARY +# define API_EXPORT BAMTOOLS_LIBRARY_EXPORT +#else +# define API_EXPORT BAMTOOLS_LIBRARY_IMPORT +#endif + +#endif // API_GLOBAL_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/bamread.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,222 @@ +#include "pc.h" +#include "config.h" +#include <vector> +#include <string.h> +#include <iostream> +#include <fstream> +#include <sstream> +#include <strstream> +#include <algorithm> +#include <string> +#include <functional> +#include <utility> +#include <ext/hash_map> +#include <boost/tokenizer.hpp> + +#include "BamAlignment.h" +#include "BamAux.h" /* RefVector/RefData */ +#include "BamReader.h" + + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +} + +using namespace std; +using namespace __gnu_cxx; + + +class lessAbsoluteValue { +public: + bool operator()(int a, int b) const { + return abs(a) < abs(b); + } +}; + + + + + +//#define DEBUG 1 + +extern "C" { + + + // read in bam file + SEXP read_bam(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + boost::char_separator<char> sep2(","); + + BamTools::BamReader bamf; + + if (!bamf.Open(fname)) { + cout << "ERROR: failed to open BAM file '" << fname << "'" << endl; + } else { + + Rprintf("opened %s\n",fname); + BamTools::RefVector refs = bamf.GetReferenceData(); + BamTools::BamAlignment al; + + int fcount=0; + while (bamf.GetNextAlignment(al)) { + if (!al.IsMapped() || !al.IsPrimaryAlignment()) { + continue; + } + + string tagname=al.Name; + string 
chr=refs[al.RefID].RefName; + int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0)); + if(al.IsReverseStrand()) { fpos=-1*fpos; } + + uint32_t nms; + int nm=0; + if (al.GetEditDistance(nms)) { + nm=nms; + } + + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(al.Name); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm); + if(fcount>30) { + break; + } +#endif + + } + bamf.Close(); + + Rprintf("done. 
read %d fragments\n",fcount); + } + + + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + 
Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + +}
// ***************************************************************************
// bamtools_global.h (c) 2010 Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 19 November 2010 (DB)
// ---------------------------------------------------------------------------
// Provides the basic definitions for exporting & importing library symbols
// ***************************************************************************

#ifndef BAMTOOLS_GLOBAL_H
#define BAMTOOLS_GLOBAL_H

// Windows detection. `_WIN32` is the macro Windows compilers predefine;
// `WIN32` is only set by some toolchains or build systems. Accept either so
// the __declspec branch is chosen on every Windows build, while builds that
// already define WIN32 behave exactly as before.
#if defined(_WIN32) || defined(WIN32)
#  define BAMTOOLS_GLOBAL_ON_WINDOWS 1
#endif

// BAMTOOLS_LIBRARY_EXPORT: decoration for symbols the library exports.
// May be pre-defined by the build to override the default below.
#ifndef BAMTOOLS_LIBRARY_EXPORT
#  if defined(BAMTOOLS_GLOBAL_ON_WINDOWS)
#    define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport)
#  else
#    define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default")))
#  endif
#endif // BAMTOOLS_LIBRARY_EXPORT

// BAMTOOLS_LIBRARY_IMPORT: decoration for symbols imported from the library.
// Expands to nothing outside Windows.
#ifndef BAMTOOLS_LIBRARY_IMPORT
#  if defined(BAMTOOLS_GLOBAL_ON_WINDOWS)
#    define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport)
#  else
#    define BAMTOOLS_LIBRARY_IMPORT
#  endif
#endif // BAMTOOLS_LIBRARY_IMPORT

#endif // BAMTOOLS_GLOBAL_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/bed2vector.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,2628 @@ +#include "pc.h" +#include "config.h" +#include <vector> +#include <string.h> +#include <iostream> +#include <fstream> +#include <sstream> +#include <strstream> +#include <algorithm> +#include <string> +#include <functional> +#include <utility> +#include <ext/hash_map> +#include <boost/tokenizer.hpp> + +#ifdef HAVE_LIBBZ2 +#include <bzlib.h> +#endif + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +} + +using namespace std; +using namespace __gnu_cxx; + + +class lessAbsoluteValue { +public: + bool operator()(int a, int b) const { + return abs(a) < abs(b); + } +}; + + + +#ifdef HAVE_LIBBZ2 +int get_bzline(BZFILE* b,string& line) { + char c; + int nBuf; + int bzerror=BZ_OK; + + while(bzerror == BZ_OK) { + nBuf=BZ2_bzRead(&bzerror, b, &c, 1); + if(bzerror==BZ_OK) { + if(c=='\n') { + return bzerror; + } else { + line+=c; + } + } + } + return bzerror; +} + +int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) { + line=""; + if(bz2file) { + int bzerror=get_bzline(b,line); + if(bzerror==BZ_OK) { + return(1); + } else { + if(bzerror!=BZ_STREAM_END) { + cerr<<"encountered BZERROR="<<bzerror<<endl; + } + return(0); + } + } else { + char *cline=NULL; + size_t n; + if(getline(&cline,&n,f) != -1) { + if(cline) { + cline[strlen(cline)-1]='\0'; + line+=cline; + free(cline); + } + return(1); + } else { + return(0); + } + } +} +#endif + + +/** + * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign + * corresponding to the strand. 
+ */ + +//#define DEBUG 1 + +extern "C" { +SEXP read_bed_ends(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + ifstream bed_file(fname); + +#ifdef DEBUG + Rprintf("opened %s\n",fname); +#endif + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + + int fcount=0; + while(getline(bed_file,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str())); + string str_start=*sit++; + int fstart=atoi(str_start.c_str()); + string str_end=*sit++; + int fend=atoi(str_end.c_str()); + int fpos=fstart; + if(sit!=tok.end()) { + string u0=*sit++; + string nfield=*sit++; + string strand=*sit++; + if(strand=="-") { + fpos=-1*fend; + } + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos); + if(fcount>30) { + break; + } +#endif + + } + } + bed_file.close(); + + +#ifdef DEBUG + Rprintf("done. 
read %d fragments\n",fcount); +#endif + + Rprintf("done. read %d fragments\n",fcount); + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + sort(csi->begin(), csi->end(), lessAbsoluteValue()); + } + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + SEXP nv; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + int* i_nv=INTEGER(nv); + int i=0; + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_nv[i++]=*pi; + } + SET_VECTOR_ELT(ans, csi-pos.begin(), nv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + +SEXP read_meland_old(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<int> > poslen; // length + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + ifstream bed_file(fname); + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + + int fcount=0; + while(getline(bed_file,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + sit++; sit++; + 
string str_nm=*sit++; + int nm=0; + if(str_nm[0]=='U') { + nm=atoi((str_nm.c_str()+1)); + } else { + continue; + } + sit++; sit++; sit++; + string str_len=*sit++; + int len=atoi(str_len.c_str()); + string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + poslen.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + (poslen[cind]).push_back(len); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + bed_file.close(); + + +#ifdef DEBUG + Rprintf("done. read %d fragments\n",fcount); +#endif + + Rprintf("done. 
read %d fragments\n",fcount); + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi,lsi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + lsi=poslen.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + SET_STRING_ELT(dnames_R, 2, mkChar("l")); + + + + SEXP tv,nv,lv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + PROTECT(lv=allocVector(INTSXP,csi->size())); np++; + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + int* i_lv=INTEGER(lv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + vector<int>::const_iterator ili=lsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i_lv[i]=*ili++; + i++; + } + PROTECT(dv = allocVector(VECSXP, 3)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + SET_VECTOR_ELT(dv, 2, lv); + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + int get_a_line(FILE *f,string& line) { + line=""; + char cline[1024]; + if(fgets(cline,1024,f)) { + line+=cline; + return(1); + } else { 
+ return(0); + } + } + + + SEXP read_meland(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<int> > poslen; // length + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + + Rprintf("opened %s\n",fname); + + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string tagname=*sit++; + sit++; + string str_nm=*sit++; + int nm=0; + if(str_nm[0]=='U') { + nm=atoi((str_nm.c_str()+1)); + } else { + continue; + } + sit++; sit++; sit++; + string str_len=*sit++; + int len=atoi(str_len.c_str()); + string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + poslen.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); 
+#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + (poslen[cind]).push_back(len); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + +#ifdef DEBUG + Rprintf("done. read %d fragments\n",fcount); +#endif + + Rprintf("done. read %d fragments\n",fcount); + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi,lsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + lsi=poslen.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + SET_STRING_ELT(dnames_R, 2, mkChar("l")); + if(read_names) { + SET_STRING_ELT(dnames_R, 3, mkChar("s")); + } + + + + SEXP tv,nv,lv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + PROTECT(lv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + int* i_lv=INTEGER(lv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + 
vector<int>::const_iterator ili=lsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i_lv[i]=*ili++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + SET_VECTOR_ELT(dv, 2, lv); + if(read_names) { + SET_VECTOR_ELT(dv, 3, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + +// reads regular eland files, recording mismatch positions +SEXP read_eland_mismatches(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > mm1; // position of the first mismatch (or 0 for none) + vector< vector<int> > mm2; // position of the second mismatch + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + sit++; + string seq=*sit++; + string str_nm=*sit++; + int nm=0; + if(str_nm[0]=='U') { + 
nm=atoi((str_nm.c_str()+1)); + } else { + continue; + } + sit++; sit++; sit++; + string chr=*sit++; + // extract chromosome name from this + int chrp=chr.find("chr"); + int pp=chr.find('.'); + chr=chr.substr(chrp+3,pp-chrp-3); + + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + + + string strand=*sit++; + int nstrand=0; + if(strand=="R") { + fpos=-1*(fpos+seq.size()-1); + nstrand=1; + } + + sit++; + + int nm1=0; int nm2=0; + if(sit!=tok.end()) { + string nms=*sit++; + nm1=atoi(nms.substr(0,nms.size()-1).c_str()); + if(nstrand) { nm1=seq.size()-nm1+1; } + } + if(sit!=tok.end()) { + string nms=*sit++; + nm2=atoi(nms.substr(0,nms.size()-1).c_str()); + if(nstrand) { nm2=seq.size()-nm2+1; } + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + mm1.push_back(vector<int>()); + mm2.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (mm1[cind]).push_back(nm1); + (mm2[cind]).push_back(nm2); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + +#ifdef DEBUG + Rprintf("done. read %d fragments\n",fcount); +#endif + + Rprintf("done. 
read %d fragments\n",fcount); + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi,lsi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=mm1.begin()+(csi-pos.begin()); + lsi=mm2.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("f")); + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + + + + SEXP tv,nv,lv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + PROTECT(lv=allocVector(INTSXP,csi->size())); np++; + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + int* i_lv=INTEGER(lv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + vector<int>::const_iterator ili=lsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i_lv[i]=*ili++; + i++; + } + PROTECT(dv = allocVector(VECSXP, 3)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + SET_VECTOR_ELT(dv, 2, lv); + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + // read in regular eland files, adjusting the negative strand coordinate by sequence length + SEXP read_eland(SEXP filename,SEXP 
read_tag_names_R,SEXP eland_tag_length_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); + int eland_tag_length=*(INTEGER(eland_tag_length_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string tagname=*sit++; + string sequence=*sit++; + int len=sequence.size(); + // adjust probe length if eland length limit was specified + if(eland_tag_length>0 && len>eland_tag_length) { + len=eland_tag_length; + } + string str_nm=*sit++; + int nm=0; + if(str_nm[0]=='U') { + nm=atoi((str_nm.c_str()+1)); + } else { + continue; + } + sit++; sit++; sit++; + string chr=*sit++; + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + string str_strand=*sit++; + + if(str_strand[0]=='R') { + fpos=-1*(fpos+len-1); + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + 
tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + Rprintf("done. read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + 
if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + + // read in extended eland files, adjusting the negative strand coordinate by sequence length + SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); + int eland_tag_length=*(INTEGER(eland_tag_length_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string machinename=*sit++; + string runnumber=*sit++; + string lanenumber=*sit++; + *sit++; + + string str_x=*sit++; + string str_y=*sit++; + + string 
tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y; + + + + *sit++; + *sit++; + + + string sequence=*sit++; + *sit++; + + string chr=*sit++; + string contig=*sit++; + chr=chr+contig; + + int len=sequence.size(); + // adjust probe length if eland length limit was specified + if(eland_tag_length>0 && len>eland_tag_length) { + len=eland_tag_length; + } + + + + string str_pos=*sit++; + if(str_pos.size()<1) { continue; } + int fpos=atoi(str_pos.c_str()); + string str_strand=*sit++; + + if(str_strand[0]=='R') { + fpos=-1*(fpos+len-1); + } + + string str_nm=*sit++; + // count non-digit characters + int nm=0; + for(int i=0;i<str_nm.size();i++) { + if(!isdigit(str_nm[i])) { nm++; } + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + Rprintf("done. 
read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + 
Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + // read in eland multi files, adjusting the negative strand coordinate by sequence length +SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { + +#ifdef DEBUG + Rprintf("read_eland_muti() : start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); + int eland_tag_length=*(INTEGER(eland_tag_length_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t",""); + boost::char_separator<char> comsep(",","",boost::keep_empty_tokens); + boost::char_separator<char> colsep(":","",boost::keep_empty_tokens); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int nline=0; + int fcount=0; + while(get_a_line(f,line)) { + nline++; + // chomp + size_t elpos = line.find_last_not_of("\n"); + if(elpos != string::npos) { + line = line.substr(0, elpos+1); + } +#ifdef DEBUG + Rprintf("line %d: %s\n",nline,line.c_str()); +#endif + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string tagname=*sit++; + string sequence=*sit++; + string mspec=*sit++; + // parse out match spec + + if(mspec=="NM" || mspec=="QC") { continue; } +#ifdef DEBUG + Rprintf("parsing out spec \"%s\" : ",mspec.c_str()); +#endif + + tokType stok(mspec, colsep); + tokType::iterator ssit=stok.begin(); + string str_nm0=*ssit++; + + int nm=0; + int nm0=atoi(str_nm0.c_str()); + if(nm0>1) { +#ifdef DEBUG + 
Rprintf("rejected for nm0\n"); +#endif + continue; + } + if(nm0==0) { + string str_nm1=*ssit++; + int nm1=atoi(str_nm1.c_str()); + if(nm1>1) { +#ifdef DEBUG + Rprintf("rejected for nm1\n"); +#endif + continue; + } + if(nm1==0) { + string str_nm2=*ssit++; + int nm2=atoi(str_nm2.c_str()); + if(nm2>1) { +#ifdef DEBUG + Rprintf("rejected for nm2\n"); +#endif + continue; + } + nm=2; + } else { + nm=1; + } + } + +#ifdef DEBUG + Rprintf("accepted (nm=%d)\n",nm); +#endif + int npos=0; + string mpos=*sit++; + vector<string> mposc; + vector<int> mposp; + tokType ptok(mpos, comsep); + string prevchr; + for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) { + string cpos=*psit; + npos++; + int strand=1; + if(cpos.size()<5) { + Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str()); + } + char lc=cpos.at(cpos.size()-1); + + if(atoi(&lc)==nm) { + switch(cpos.at(cpos.size()-2)) { + case 'R': strand=-1; break; + case 'F': strand=1; break; + default: + Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break; + continue; + } + string chr,str_pos; + size_t colpos=cpos.find(":"); + if(colpos==string::npos) { + if(npos>1) { + chr=prevchr; + str_pos=cpos.substr(0,cpos.size()-2); + } else { + Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str()); + continue; + } + } else { + chr=cpos.substr(0,colpos); + str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos); + } +#ifdef DEBUG + Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand); +#endif + int pos=strand*atoi(str_pos.c_str()); + mposc.push_back(chr); + mposp.push_back(pos); + } + } + + string chr; + int fpos; + if(mposc.size()!=1) { + if(mposc.size()==0) { + Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str()); + } else { + Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in 
\"%s\"\n",nline,mposc.size(),nm,mpos.c_str()); + } + continue; + } else { + chr=*mposc.begin(); + fpos=*mposp.begin(); + } + + int len=sequence.size(); + // adjust probe length if eland length limit was specified + if(eland_tag_length>0 && len>eland_tag_length) { + len=eland_tag_length; + } + + if(fpos<0) { + fpos=-1*(-1*fpos+len-1); + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + Rprintf("done. 
read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + 
Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + // read in regular eland files, adjusting the negative strand coordinate by sequence length + SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + boost::char_separator<char> sep2(","); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; + } else { +#ifdef HAVE_LIBBZ2 + BZFILE* b; + int bzerror; + + int bz2file=0; + if(strstr(fname,".bz2")) { + bz2file=1; + b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); + if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } + } +#endif + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; +#ifdef HAVE_LIBBZ2 + while(get_a_line(f,b,bz2file,line)) { +#else + while(get_a_line(f,line)) { +#endif + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string tagname=*sit++; + string str_strand=*sit++; + string chr=*sit++; + + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + + string sequence=*sit++; + sit++; sit++; + string mm=*sit++; + + int len=sequence.size(); + if(str_strand[0]=='-') { + fpos=-1*(fpos+len-1); + } + // determine number of mismatches + int nm=0; + if(mm.size()>0) { + nm++; + string::size_type tp(0); + while(tp!=string::npos) { + tp = 
mm.find(",",tp); + if(tp!=string::npos) { + tp++; + ++nm; + } + } + } + + + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + +#ifdef HAVE_LIBBZ2 + BZ2_bzReadClose( &bzerror, b); +#endif + fclose(f); + + Rprintf("done. 
read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + 
Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + // read in helicos tab-separated alignment output (regular or bz2) + SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<int> > poslen; // length of the match + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + boost::char_separator<char> sep2(","); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; + } else { +#ifdef HAVE_LIBBZ2 + BZFILE* b; + int bzerror; + + int bz2file=0; + if(strstr(fname,".bz2")) { + bz2file=1; + b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); + if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } + } +#endif + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + int nlines=0; +#ifdef HAVE_LIBBZ2 + while(get_a_line(f,b,bz2file,line)) { +#else + while(get_a_line(f,line)) { +#endif + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + nlines++; + // skip comments + if(line[0]=='#') { continue; } + if(line.compare(0,12,"Reference_ID")==0) { +#ifdef DEBUG + Rprintf("matched header on line %d\n",nlines); +#endif + continue; + } + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string chr=*sit++; + string tagname=*sit++; + string str_startpos=*sit++; + string str_endpos=*sit++; + + string str_tstart=*sit++; + string str_tend=*sit++; + int 
len=atoi(str_tend.c_str())-atoi(str_tstart.c_str()); + + sit++; sit++; + string str_ndel=*sit++; + string str_nins=*sit++; + string str_nsub=*sit++; + + string str_strand=*sit++; + int fpos; + if(str_strand[0]=='-') { + fpos=-1*atoi(str_endpos.c_str()); + } else { + fpos=atoi(str_startpos.c_str()); + } + + // determine number of mismatches + int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str()); + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + poslen.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + (poslen[cind]).push_back(len); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm); + if(fcount>30) { + break; + } +#endif + + } + } + +#ifdef HAVE_LIBBZ2 + BZ2_bzReadClose( &bzerror, b); +#endif + fclose(f); + + Rprintf("done. 
read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<int> >::const_iterator lsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + lsi=poslen.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + SET_STRING_ELT(dnames_R, 2, mkChar("l")); + if(read_names) { + SET_STRING_ELT(dnames_R, 3, mkChar("s")); + } + + + + SEXP tv,nv,lv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + PROTECT(lv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + int* i_lv=INTEGER(lv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + vector<int>::const_iterator lni=lsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i_lv[i]=*lni++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; + 
SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + SET_VECTOR_ELT(dv, 2, lv); + if(read_names) { + SET_VECTOR_ELT(dv, 3, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + + // read in text version of maq map + SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string tagname=*sit++; + string chr=*sit++; + string str_pos=*sit++; + int fpos=atoi(str_pos.c_str()); + string str_strand=*sit++; + sit++; sit++; sit++; sit++; sit++; + string str_nm=*sit++; + sit++; sit++; sit++; + string str_len=*sit++; + int nm=atoi(str_nm.c_str()); + int len=atoi(str_len.c_str()); + + if(str_strand[0]=='-') { + fpos=-1*(fpos+len-1); + } + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // 
register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + Rprintf("done. read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* 
i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + + + + // read in tagalign file + SEXP read_tagalign(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(get_a_line(f,line)) { + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string chr=*sit++; + string str_spos=*sit++; + string str_epos=*sit++; + sit++; + string str_qual=*sit++; + string str_strand=*sit; + + int fpos; + if(str_strand[0]=='+') { + fpos=atoi(str_spos.c_str()); + } else { + 
fpos=-1*atoi(str_epos.c_str()); + } + int nm=atoi(str_qual.c_str()); + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); + if(fcount>30) { + break; + } +#endif + + } + } + fclose(f); + + Rprintf("done. read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + + + SEXP tv,nv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> 
::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + PROTECT(dv = allocVector(VECSXP, 2)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + + + // arachne madness + SEXP read_arachne(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + +#ifdef HAVE_LIBBZ2 + BZFILE* b; + int bzerror; + + int bz2file=0; + if(strstr(fname,".bz2")) { + bz2file=1; + b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); + if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } + } +#endif + + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; +#ifdef HAVE_LIBBZ2 + while(get_a_line(f,b,bz2file,line)) { +#else + while(get_a_line(f,line)) { +#endif + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string chr=*sit++; + string str_spos=*sit++; + int nm=0; + if(sit!=tok.end()) { + string str_mm=*sit; + nm=atoi(str_mm.c_str()); + } + + int fpos=atoi(str_spos.c_str());; + + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); 
+ int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); + if(fcount>30) { + break; + } +#endif + + } + } +#ifdef HAVE_LIBBZ2 + BZ2_bzReadClose( &bzerror, b); +#endif + + fclose(f); + + Rprintf("done. read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + + + SEXP tv,nv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + PROTECT(dv = allocVector(VECSXP, 2)); 
np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + + // arachne madness + SEXP read_arachne_long(SEXP filename) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<int> > poslen; // length of the match + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + typedef boost::tokenizer<boost::char_separator<char> > tokType; + boost::char_separator<char> sep(" \t"); + + + + + + FILE *f=fopen(fname,"rb"); + if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } + else { + +#ifdef HAVE_LIBBZ2 + BZFILE* b; + int bzerror; + + int bz2file=0; + if(strstr(fname,".bz2")) { + bz2file=1; + b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); + if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } + } +#endif + + + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; +#ifdef HAVE_LIBBZ2 + while(get_a_line(f,b,bz2file,line)) { +#else + while(get_a_line(f,line)) { +#endif + +#ifdef DEBUG + Rprintf("line: %s\n",line.c_str()); +#endif + + + tokType tok(line, sep); + tokType::iterator sit=tok.begin(); + if(sit!=tok.end()) { + string query=*sit++; + if(query!="QUERY") { continue; } + *sit++; *sit++; *sit++; *sit++; + string str_strand=*sit++; + string chr=*sit++; + string str_startpos=*sit++; + string str_endpos=*sit++; + + int fpos; + if(str_strand[0]=='1') { + fpos=-1*atoi(str_endpos.c_str()); + } else { + fpos=atoi(str_startpos.c_str()); + } +#ifdef DEBUG + Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos); 
+#endif + *sit++; + string str_nblocks=*sit++; + int nblocks=atoi(str_nblocks.c_str()); +#ifdef DEBUG + Rprintf("nblocks=%d\n",nblocks); +#endif + // tally up the read length and the number of mismatches for all blocks + int len=0; int nm=0; + for(int i=0;i<nblocks;i++) { + string str_sgs=*sit++; + int sgs=atoi(str_sgs.c_str()); + string str_slen=*sit++; + int slen=atoi(str_slen.c_str()); + string str_snm=*sit++; + int snm=atoi(str_snm.c_str()); +#ifdef DEBUG + Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm); +#endif + len+=slen; + nm+=abs(sgs)+snm; + } + nm+=nblocks-1; + + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + poslen.push_back(vector<int>()); +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + (poslen[cind]).push_back(len); +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + } +#ifdef HAVE_LIBBZ2 + BZ2_bzReadClose( &bzerror, b); +#endif + + fclose(f); + + Rprintf("done. 
read %d fragments\n",fcount); + } + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<int> >::const_iterator lsi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + lsi=poslen.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + SET_STRING_ELT(dnames_R, 2, mkChar("l")); + + + SEXP tv,nv,lv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + PROTECT(lv=allocVector(INTSXP,csi->size())); np++; + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + int* i_lv=INTEGER(lv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + vector<int>::const_iterator lni=lsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i_lv[i]=*lni++; + i++; + } + PROTECT(dv = allocVector(VECSXP, 3)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + SET_VECTOR_ELT(dv, 2, lv); + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/cdensum.c Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,144 @@ +#include <math.h> +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" + + +#undef DEBUG 1 + +// dout is npos-length output array. +// n - number of positions in pos (and length of tc count array) +// spos - starting position +void cdensum(int *n, double *pos, double *tc, double *spos, int *bw,int *dw, int *npos, int *step,double *dout) +{ + int i,j; + + double epos= *spos + ((double) *npos); + double dbw=(double) *bw; + for(i = 0; i< *n; i++) { + // size of the window to which the contributions should be added + int in=(int) (pos[i]- *spos); + int ic=tc[i]; + int whs=(*dw)*(*bw)*ic; + int ws=(int) floor((in-whs)/(*step)); + int we=(int) ceil((in+whs)/(*step)); + if(ws<0) { ws=0; } + if(we>= *npos) { we= *npos -1; } + + for(j=ws;j<we;j++) { + double beta=((double)(j*(*step)-in))/dbw; + dout[j]+=((double)ic)*exp(-0.5*beta*beta); + } + } +} + + +// window tag counts +// dout is npos-length output array that will contain window tag counts +// windows are of a specified size, moved at a specified step +// n - number of positions in sorted tag array (positive only) +// spos - starting position +void window_n_tags(int *n, double *pos, double *spos, int *window_size, int *window_step, int *npos, int *dout) +{ + int i; + int cs=0; int ce=0; // current array start/end indecies + int ctc=0; // current tag count + double wpos=*spos-(*window_size)/2; // left-edge position + //Rprintf("n=%d; window_size=%d, window_step=%d, npos=%d, spos=%f\n",*n,*window_size,*window_step,*npos,*spos); + for(i=0;i<*npos;i++) { + // advance end if needed + double ep=wpos+(*window_size); + while(ce<(*n) && pos[ce]<=ep) { + ctc++; ce++; + } + // advance start + while(cs<*n && pos[cs]<wpos) { + ctc--; cs++; + } + dout[i]=ctc; + // advance window position + wpos+=*window_step; + } +} + +// window tag counts +// windows are of a specified size, moved at a specified step +// 
pos - tag positions (positive, pre-shifted)y +// spos - starting position +// returns nsteps-length output array that will contain window tag counts +SEXP cwindow_n_tags(SEXP pos_R, SEXP spos_R, SEXP window_size_R, SEXP window_step_R, SEXP nsteps_R) { + double* pos=REAL(pos_R); + int n=LENGTH(pos_R); + int window_size=*INTEGER(window_size_R); + int window_step=*INTEGER(window_step_R); + int nsteps=*INTEGER(nsteps_R); + double spos=*REAL(spos_R); + + // allocate return array + SEXP tc_R; + PROTECT(tc_R=allocVector(INTSXP,nsteps)); + int* dout=INTEGER(tc_R); + + int i; + int cs=0; int ce=0; // current array start/end indecies + int ctc=0; // current tag count + double wpos=spos-window_size/2; // left-edge position + //Rprintf("n=%d; window_size=%d, window_step=%d, npos=%d, spos=%f\n",n,window_size,window_step,nsteps,spos); + for(i=0;i<nsteps;i++) { + // advance end if needed + double ep=wpos+window_size; + while(ce<n && pos[ce]<=ep) { + ctc++; ce++; + } + // advance start + while(cs<n && pos[cs]<wpos) { + ctc--; cs++; + } + dout[i]=ctc; + // advance window position + wpos+=window_step; + } + UNPROTECT(1); + return(tc_R); +} + +// tag counts in windows around specified positions +// pos - tag positions +// ntags - number of tags in each position +// wpos - window positions +// returns a pos-length vector giving number of tags that fall within window_half_size from the provided positions +SEXP cwindow_n_tags_around(SEXP pos_R, SEXP ntags_R, SEXP wpos_R, SEXP window_half_size_R) { + double* pos=REAL(pos_R); + int* ntags=INTEGER(ntags_R); + int n=LENGTH(pos_R); + double* wpos=REAL(wpos_R); + int nw=LENGTH(wpos_R); // number of windows + double whs=(double) *INTEGER(window_half_size_R); + + // allocate return array + SEXP tc_R; + PROTECT(tc_R=allocVector(INTSXP,nw)); + int* dout=INTEGER(tc_R); + + int i; + int cs=0; int ce=0; // current array start/end indecies + int ctc=0; // current tag count + for(i=0;i<nw;i++) { + //if(i>(nw-2)) { Rprintf("-i=%d; cs=%d, ce=%d; 
ctc=%d\n",i,cs,ce,ctc); } + // advance end if needed + double ep=wpos[i]+whs; + while(ce<n && pos[ce]<=ep) { + ctc+=ntags[ce]; ce++; + } + // advance start + double sp=wpos[i]-whs; + while(cs<n && pos[cs]<sp) { + ctc-=ntags[cs]; cs++; + } + dout[i]=ctc; + // if(i>(nw-2)) { Rprintf("+i=%d; cs=%d, ce=%d; ctc=%d\n",i,cs,ce,ctc); } + } + UNPROTECT(1); + return(tc_R); +} +
/* spp/src/const.h */
#ifndef NST_CONST_H
#define NST_CONST_H

/* All 64 bits set. */
#define MAX_ULL 0xffffffffffffffffull

/* Fixed-width aliases; the names state the width the code expects from each
   underlying type. */
typedef unsigned long long bit64_t;
typedef unsigned           bit32_t;
typedef unsigned short     bit16_t;
typedef unsigned char      bit8_t;

/* Lookup tables; only declared here, defined in another translation unit. */
extern bit8_t nst_nt4_table[];
extern bit8_t nst_nt16_table[];
extern char *nst_nt4_rev_table;
extern char *nst_nt16_rev_table;
extern bit8_t nst_nt16_nt4_table[];
extern int nst_nt16_count_table[];

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/maqmap.c Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,164 @@ +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <unistd.h> +#include "const.h" +#include "maqmap.h" + +maqmap_t *maq_new_maqmap() +{ + maqmap_t *mm = (maqmap_t*)calloc(1, sizeof(maqmap_t)); + mm->format = MAQMAP_FORMAT_NEW; + return mm; +} +void maq_delete_maqmap(maqmap_t *mm) +{ + int i; + if (mm == 0) return; + for (i = 0; i < mm->n_ref; ++i) + free(mm->ref_name[i]); + free(mm->ref_name); + free(mm->mapped_reads); + free(mm); +} +void maqmap_write_header(gzFile fp, const maqmap_t *mm) +{ + int i, len; + gzwrite(fp, &mm->format, sizeof(int)); + gzwrite(fp, &mm->n_ref, sizeof(int)); + for (i = 0; i != mm->n_ref; ++i) { + len = strlen(mm->ref_name[i]) + 1; + gzwrite(fp, &len, sizeof(int)); + gzwrite(fp, mm->ref_name[i], len); + } + gzwrite(fp, &mm->n_mapped_reads, sizeof(bit64_t)); +} +maqmap_t *maqmap_read_header(gzFile fp) +{ + maqmap_t *mm; + int k, len; + mm = maq_new_maqmap(); + gzread(fp, &mm->format, sizeof(int)); + if (mm->format != MAQMAP_FORMAT_NEW) { + if (mm->format > 0) { + fprintf(stderr, "** Obsolete map format is detected. 
Please use 'mapass2maq' command to convert the format.\n"); + exit(3); + } + assert(mm->format == MAQMAP_FORMAT_NEW); + } + gzread(fp, &mm->n_ref, sizeof(int)); + mm->ref_name = (char**)calloc(mm->n_ref, sizeof(char*)); + for (k = 0; k != mm->n_ref; ++k) { + gzread(fp, &len, sizeof(int)); + mm->ref_name[k] = (char*)malloc(len * sizeof(char)); + gzread(fp, mm->ref_name[k], len); + } + /* read number of mapped reads */ + gzread(fp, &mm->n_mapped_reads, sizeof(bit64_t)); + return mm; +} + +/* mapvalidate */ + +static void mapvalidate_core(gzFile fpin) +{ + maqmap_t *m = maqmap_read_header(fpin); + maqmap1_t *m1, mm1; + bit64_t n = 0; + int i, l; + bit64_t *cnt; + m1 = &mm1; + cnt = (bit64_t*)calloc(m->n_ref, 8); + printf("[message] number of reference sequences: %d\n", m->n_ref); + while ((l = maqmap_read1(fpin, m1)) != 0) { + if (l != sizeof(maqmap1_t)) { + printf("[fatal error] truncated map file.\n"); + break; + } + ++n; + if ((int)m1->seqid >= m->n_ref) { + printf("[fatal error] maqmap1_t::seqid is invalid (%d >= %d).\n", m1->seqid, m->n_ref); + break; + } + ++cnt[m1->seqid]; + if (m1->size >= MAX_READLEN - 1) { + printf("[faltal error] maqmap1_t::size is invalid (%d >= %d).\n", m1->size, MAX_READLEN - 1); + break; + } + } + if (m->n_mapped_reads != 0) { + if (m->n_mapped_reads != n) { + printf("[warning] maqmap1_t::n_mapped_reads is set, but not equals the real number (%llu != %llu).\n", + m->n_mapped_reads, n); + } + } + for (i = 0; i != m->n_ref; ++i) + printf("[message] %s : %llu\n", m->ref_name[i], cnt[i]); + free(cnt); + maq_delete_maqmap(m); +} + +/* mapview */ + +static void mapview_core(FILE *fpout, gzFile fpin, int is_verbose, int is_mm) +{ + bit32_t j; + maqmap_t *m = maqmap_read_header(fpin); + maqmap1_t *m1, mm1; + m1 = &mm1; + while (maqmap_read1(fpin, m1)) { + fprintf(fpout, "%s\t%s\t%d\t%c\t%d\t%u\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", + m1->name, m->ref_name[m1->seqid], (m1->pos>>1) + 1, + (m1->pos&1)? 
'-' : '+', m1->dist, m1->flag, m1->map_qual, (signed char)m1->seq[MAX_READLEN-1], + m1->alt_qual, m1->info1&0xf, m1->info2, m1->c[0], m1->c[1], m1->size); + if (is_verbose) { + fputc('\t', fpout); + for (j = 0; j != m1->size; ++j) { + if (m1->seq[j] == 0) fputc('n', fpout); + else if ((m1->seq[j]&0x3f) < 27) fputc("acgt"[m1->seq[j]>>6&3], fpout); + else fputc("ACGT"[m1->seq[j]>>6&3], fpout); + } + fputc('\t', fpout); + for (j = 0; j != m1->size; ++j) + fputc((m1->seq[j]&0x3f) + 33, fpout); + } + if (is_mm) { + bit64_t *p = (bit64_t*)(m1->seq + 55); + fprintf(fpout, "\t%llx", *p); + } + fputc('\n', fpout); + } + maq_delete_maqmap(m); +} + +int ma_mapview(int argc, char *argv[]) +{ + int c, is_verbose = 1, is_mm = 0; + while ((c = getopt(argc, argv, "bN")) >= 0) { + switch (c) { + case 'b': is_verbose = 0; break; + case 'N': is_mm = 1; break; + } + } + if (argc == optind) { + fprintf(stderr, "Usage: maq mapview [-bN] <in.map>\n"); + return 1; + } + gzFile fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[optind], "r"); + mapview_core(stdout, fp, is_verbose, is_mm); + gzclose(fp); + return 0; +} + +int ma_mapvalidate(int argc, char *argv[]) +{ + gzFile fp; + if (argc < 2) { + fprintf(stderr, "Usage: maq mapvalidate <in.map>\n"); + return 1; + } + fp = (strcmp(argv[optind], "-") == 0)? gzdopen(STDIN_FILENO, "r") : gzopen(argv[1], "r"); + mapvalidate_core(fp); + gzclose(fp); + return 0; +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/maqmap.h Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,70 @@ +#ifndef MAQMAP_H_ +#define MAQMAP_H_ + +#ifdef MAQ_LONGREADS +# define MAX_READLEN 128 +#else +# define MAX_READLEN 64 +#endif + +#define MAX_NAMELEN 36 +#define MAQMAP_FORMAT_OLD 0 +#define MAQMAP_FORMAT_NEW -1 + +#define PAIRFLAG_FF 0x01 +#define PAIRFLAG_FR 0x02 +#define PAIRFLAG_RF 0x04 +#define PAIRFLAG_RR 0x08 +#define PAIRFLAG_PAIRED 0x10 +#define PAIRFLAG_DIFFCHR 0x20 +#define PAIRFLAG_NOMATCH 0x40 +#define PAIRFLAG_SW 0x80 + +#include <string.h> +#include <zlib.h> +#include "const.h" + +/* + name: read name + size: the length of the read + seq: read sequence (see also below) + seq[MAX_READLEN-1]: single end mapping quality (equals to map_qual if not paired) + map_qual: the final mapping quality + alt_qual: the lower quality of the two ends (equals to map_qual if not paired) + flag: status of the pair + dist: offset of the mate (zero if not paired) + info1: mismatches in the 24bp (higher 4 bits) and mismatches (lower 4 bits) + info2: sum of errors of the best hit + c[2]: count of all 0- and 1-mismatch hits on the reference + */ +typedef struct +{ + bit8_t seq[MAX_READLEN]; /* the last base is the single-end mapping quality. */ + bit8_t size, map_qual, info1, info2, c[2], flag, alt_qual; + bit32_t seqid, pos; + int dist; + char name[MAX_NAMELEN]; +} maqmap1_t; + +typedef struct +{ + int format, n_ref; + char **ref_name; + bit64_t n_mapped_reads; + maqmap1_t *mapped_reads; +} maqmap_t; + +#define maqmap_read1(fp, m1) gzread((fp), (m1), sizeof(maqmap1_t)) + +#ifdef __cplusplus +extern "C" { +#endif + maqmap_t *maq_new_maqmap(); + void maq_delete_maqmap(maqmap_t *mm); + void maqmap_write_header(gzFile fp, const maqmap_t *mm); + maqmap_t *maqmap_read_header(gzFile fp); +#ifdef __cplusplus +} +#endif + +#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/maqread.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,207 @@ +#include "pc.h" +#include <vector> +#include <string.h> +#include <iostream> +#include <fstream> +#include <sstream> +#include <strstream> +#include <algorithm> +#include <string> +#include <functional> +#include <utility> +#include <zlib.h> + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +#include "maqmap.h" +} + +using namespace std; +using namespace __gnu_cxx; + + +class lessAbsoluteValue { +public: + bool operator()(int a, int b) const { + return abs(a) < abs(b); + } +}; + + + +//#define DEBUG 1 + +extern "C" { + + // read in text version of maq map + SEXP read_binmaqmap(SEXP filename,SEXP read_tag_names_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + const char* fname=CHAR(asChar(filename)); + int read_names=*(INTEGER(read_tag_names_R)); +#ifdef DEBUG + Rprintf("fname=%s\n",fname); +#endif + + // main data vector + // chr - pos + vector< vector<int> > pos; + vector< vector<int> > posnm; // number of mismatches + vector< vector<string> > tagnames; + + // chromosome map + hash_map<string, int, hash<string>,equal_to<string> > cind_map; + vector<string> cnames; + + + gzFile f=gzopen(fname,"r"); + + maqmap_t *m = maqmap_read_header(f); + maqmap1_t *m1, mm1; + m1 = &mm1; + + if (!f) { + cout<<"can't open input file \""<<fname<<"\"\n"; + } else { + Rprintf("opened %s\n",fname); + + // read in bed line + string line; + int fcount=0; + while(maqmap_read1(f, m1)) { + string tagname=string(m1->name); + string chr=string(m->ref_name[m1->seqid]); + int len=m1->size; + int fpos=(m1->pos>>1) + 1; + if(m1->pos&1) { + fpos=-1*(fpos+len-1); + } + int nm=m1->info1&0xf; + +#ifdef DEBUG + Rprintf("read in map line chr=%s tagname=%s fpos=%d, nm=%d, len=%d\n",chr.c_str(),tagname.c_str(),fpos,nm,len); +#endif + + + // determine the chromosome index + hash_map<string, int, hash<string>,equal_to<string> 
>::const_iterator li=cind_map.find(chr); + int cind=-1; + if(li==cind_map.end()) { + // register new chromosome + cind=cnames.size(); + cnames.push_back(chr); + cind_map[chr]=cind; + // allocate new pos vector + pos.push_back(vector<int>()); + posnm.push_back(vector<int>()); + if(read_names) { + tagnames.push_back(vector<string>()); + } +#ifdef DEBUG + Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); +#endif + } else { + cind=li->second; + } + fcount++; + (pos[cind]).push_back(fpos); + (posnm[cind]).push_back(nm); + if(read_names) { + (tagnames[cind]).push_back(tagname); + } +#ifdef DEBUG + Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); + if(fcount>30) { + break; + } +#endif + + } + gzclose(f); + Rprintf("done. read %d fragments\n",fcount); + } + + + // construct output structures + SEXP chnames; + int np=0; // number of protections + PROTECT(chnames = allocVector(STRSXP, cnames.size())); + for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { + SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); + } + np++; + + // sort + //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { + // sort(csi->begin(), csi->end(), lessAbsoluteValue()); + //} + + SEXP ans; + PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; + vector<vector<int> >::const_iterator nsi; + vector<vector<string> >::const_iterator ssi; + for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { + nsi=posnm.begin()+(csi-pos.begin()); + + SEXP dv,dnames_R; + PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; + SET_STRING_ELT(dnames_R, 0, mkChar("t")); + SET_STRING_ELT(dnames_R, 1, mkChar("n")); + if(read_names) { + SET_STRING_ELT(dnames_R, 2, mkChar("s")); + } + + + + SEXP tv,nv,sv; + PROTECT(tv=allocVector(INTSXP,csi->size())); np++; + PROTECT(nv=allocVector(INTSXP,csi->size())); np++; + if(read_names) { + 
PROTECT(sv=allocVector(STRSXP,csi->size())); np++; + } + int* i_tv=INTEGER(tv); + int* i_nv=INTEGER(nv); + + int i=0; + vector<int>::const_iterator ini=nsi->begin(); + for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { + i_tv[i]=*pi; + i_nv[i]=*ini++; + i++; + } + if(read_names) { + int i=0; + ssi=tagnames.begin()+(csi-pos.begin()); + for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { + SET_STRING_ELT(sv,i,mkChar(si->c_str())); + i++; + } + } + PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; + SET_VECTOR_ELT(dv, 0, tv); + SET_VECTOR_ELT(dv, 1, nv); + if(read_names) { + SET_VECTOR_ELT(dv, 2, sv); + } + setAttrib(dv, R_NamesSymbol, dnames_R); + + SET_VECTOR_ELT(ans, csi-pos.begin(), dv); + } + + setAttrib(ans,R_NamesSymbol,chnames); + +#ifdef DEBUG + Rprintf("unprotecting %d elements\n",np); +#endif + + UNPROTECT(np); + return(ans); +} + + +}
/* spp/src/pc.h */
#ifndef PC_H
#define PC_H 1
#include <functional>
#include <string>   // std::string is used below; do not rely on a transitive include
//#include <hash_map.h>
#include <ext/hash_set>
#include <ext/hash_map>


namespace __gnu_cxx
{
  // Lets std::string serve as a key of __gnu_cxx::hash_map / hash_set by
  // hashing its character data with the existing const char* hasher.
  template<> struct hash< std::string >
  {
    size_t operator()( const std::string& x ) const
    {
      return hash< const char* >()( x.c_str() );
    }
  };
}

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/peaks.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,804 @@ +#include <vector> +#include <string.h> +#include <iostream> +#include <string> +#include <set> + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +} + +using namespace std; +using namespace __gnu_cxx; + +/** + * Calculate all local peaks + */ + +//#define DEBUG 1 + +extern "C" { + SEXP find_peaks(SEXP x_R,SEXP thr_R,SEXP max_span_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + double* x=REAL(x_R); + int nx=LENGTH(x_R); + int max_span=*INTEGER(max_span_R); + double thr=REAL(thr_R)[0]; +#ifdef DEBUG + Rprintf("n=%d; thr=%f; max_span=%d\n",nx,thr,max_span); +#endif + + vector<int> pos; + + double pv=x[0]; + double ppv=0; // previous peak value + int ppp=-max_span-1; // previous peak position + + for(int i=1;i<(nx-1);i++) { + if(x[i]>pv && x[i]>=thr && x[i]>x[i+1]) { + if(max_span>2) { + //Rprintf("i=%d; ppp=%d\n",i,ppp); + if(i-ppp > max_span) { + if(ppp>=0) { + pos.push_back(ppp); + } + //Rprintf("recorded %d; now %d\n",ppp,i); + ppp=i; ppv=x[i]; + } else { + if(x[i]>ppv) { + //Rprintf("reset from %d to %d\n",ppp,i); + ppp=i; ppv=x[i]; + } + } + } else { + pos.push_back(i); + } + } + if(x[i]!=x[i+1]) { pv=x[i]; } + } + + // add remaining peak + if(max_span>2 && ppp>=0) { + pos.push_back(ppp); + } + + SEXP nv; + PROTECT(nv=allocVector(INTSXP,pos.size())); + int* i_nv=INTEGER(nv); + int i=0; + for(vector<int> ::const_iterator pi=pos.begin();pi!=pos.end();++pi) { + i_nv[i++]=1+(*pi); + } + + UNPROTECT(1); + return(nv); + } + + + + + /************************************************************************/ + // given a data vector d (positive values) and a set of signed center coordinates pos, + // returns coordinates of data points relative to the centers + // size is the size of the region around the centers + // return: vector of relative coordinates (x) and indecies of centers relative the 
coordinate + // was calculated (i). + SEXP get_relative_coordinates(SEXP d_R, + SEXP pos_R, + SEXP size_R) + { + int *d, *pos; + int npos,nd,size; + + d = INTEGER(d_R); pos = INTEGER(pos_R); + npos=LENGTH(pos_R); nd=LENGTH(d_R); + size = INTEGER(size_R)[0]; +#ifdef DEBUG + Rprintf("|d|=%d, |c|=%d, size=%d\n",nd,npos,size); +#endif + + vector<int> x; vector<int> xi; + int k=0; // current pos index + + for(int i=0;i<nd;i++) { + // increment k until pos[k]+size>=d[i] + while((abs(pos[k])+size) < d[i]) { k++; if(k==npos) { break; }; +#ifdef DEBUG + Rprintf("advancing k to %d\n",k); +#endif + } + if(k==npos) { break; }; + // increment i until d[i]>=pos[k]-size + while((abs(pos[k])-size) > d[i]) { i++; if(i==nd) { break; } +#ifdef DEBUG + Rprintf("advancing i to %d\n",i); +#endif + } + if(i==nd) { break; } + + + int l=k; + while((l<npos) && ((abs(pos[l])-size) <= d[i])) { l++; +#ifdef DEBUG + Rprintf("advancing l to %d\n",l); +#endif + } + for(int j=k;j<l;j++) { + int pd=d[i]-abs(pos[j]); + if(abs(pd)<=size) { + // record + if(pos[j]>0) { + x.push_back(pd); + } else { + x.push_back(-1*pd); + } + xi.push_back(j); +#ifdef DEBUG + Rprintf("recorded i=%d, j=%d\n",i,j); +#endif + } else { + break; + } + } + } + + SEXP xv_R,xiv_R; + PROTECT(xv_R=allocVector(INTSXP,x.size())); + PROTECT(xiv_R=allocVector(INTSXP,x.size())); + int* xv=INTEGER(xv_R); + int* xiv=INTEGER(xiv_R); + + int i=0; + for(vector<int> ::const_iterator pi=x.begin();pi!=x.end();++pi) { + xv[i++]=*pi; + } + i=0; + for(vector<int> ::const_iterator pi=xi.begin();pi!=xi.end();++pi) { + xiv[i++]=1+(*pi); + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("x")); + SET_STRING_ELT(names_R, 1, mkChar("i")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, xv_R); + SET_VECTOR_ELT(ans_R, 1, xiv_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + } + + + // determines a set of points within a set of fragments 
+ // note: all vectors sorted in ascending order + // note: all vectors are integers + // x_R - vector of point positions + // se_R - vector of start and end positions + // fi_R - vector of signed fragment indecies + // return_list_R - whether a list of fragments should be returned for each point + // return_unique_R - whether points in multiple fragments should be omitted + SEXP points_within(SEXP x_R,SEXP se_R,SEXP fi_R,SEXP return_list_R,SEXP return_unique_R,SEXP return_point_counts_R) { +#ifdef DEBUG + Rprintf("start\n"); +#endif + int* x=INTEGER(x_R); + int nx=LENGTH(x_R); + int* se=INTEGER(se_R); + int* fi=INTEGER(fi_R); + int nf=LENGTH(se_R); + + int return_list=*(INTEGER(return_list_R)); + int return_unique=*(INTEGER(return_unique_R)); + int return_point_counts=*(INTEGER(return_point_counts_R)); + +#ifdef DEBUG + Rprintf("nf=%d; nx=%d, return_list=%d, return_unique=%d, return_point_counts=%d\n",nf/2,nx,return_list,return_unique,return_point_counts); +#endif + set<int> fset; + + + SEXP nv; int *i_nv; + int np=0; + if(return_point_counts) { + PROTECT(nv = allocVector(INTSXP, nf/2)); np++; + i_nv=INTEGER(nv); + for(int i=0;i<nf/2;i++) { i_nv[i]=0; } + } else if(return_list) { + PROTECT(nv = allocVector(VECSXP, nx)); np++; + } else { + PROTECT(nv=allocVector(INTSXP,nx)); np++; + i_nv=INTEGER(nv); + } + + int j=0; + + for(int i=0;i<nx;i++) { + // advance j + while(j<nf && se[j]<x[i]) { + int frag=fi[j]; + if(frag>0) { // insert + fset.insert(frag); +#ifdef DEBUG + Rprintf("inserted frag %d, size=%d\n",frag,fset.size()); +#endif + } else { // remove + fset.erase(-frag); +#ifdef DEBUG + Rprintf("removed frag %d, size=%d\n",-frag,fset.size()); +#endif + } + j++; + } +#ifdef DEBUG + Rprintf("i=%d j=%d\n",i,j); +#endif + if(return_list) { + if(fset.empty() || (return_unique && fset.size()>1)) { + // assign null list? 
+ } else { + SEXP fil_R; + PROTECT(fil_R=allocVector(INTSXP,fset.size())); np++; + int* fil=INTEGER(fil_R); + int k=0; + for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) { + fil[k]=*ki; k++; + } + SET_VECTOR_ELT(nv, i, fil_R); + UNPROTECT(1); np--; + } + } else { + if(return_point_counts) { + for(set<int>::const_iterator ki=fset.begin();ki!=fset.end();++ki) { + i_nv[*ki-1]++; + } + } else { + if(fset.empty() || (return_unique && fset.size()>1)) { + i_nv[i]=-1; + } else { + i_nv[i]=*fset.begin(); + } + } + } + } + + UNPROTECT(np); + return(nv); + } + + + SEXP expuni_lr(SEXP x_R, // positions and their number (assumed sorted in ascending order) + SEXP mdist_R, // max distance at which points should be considered + SEXP lambda_R, // lambda value + SEXP spos_R, // starting position + SEXP epos_R, // ending position + SEXP step_R, // step size + SEXP return_peaks_R, // whether peak positions should be returned, or entire score vector + SEXP min_peak_lr_R // min peak height (lr) + ) + { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + int* x=INTEGER(x_R); + int nx=LENGTH(x_R); + int mdist=INTEGER(mdist_R)[0]; + double lambda=*(REAL(lambda_R)); + + int return_peaks=*(INTEGER(return_peaks_R)); + double min_peak=*(REAL(min_peak_lr_R)); + + int spos=*(INTEGER(spos_R)); + int epos=*(INTEGER(epos_R)); + int step=*(INTEGER(step_R)); + + int nsteps=(int) (epos-spos)/step; + + +#ifdef DEBUG + Rprintf("n=%d; lambda=%f; mdist=%d; spos=%d; epos=%d; step=%d; nsteps=%d\n",nx,lambda,mdist,spos,epos,step,nsteps); +#endif + + + SEXP nv; + double *d_nv; + if(!return_peaks) { + PROTECT(nv=allocVector(REALSXP,nsteps+1)); + d_nv=REAL(nv); + } + + + int i=0; // current index of the first point being used in the calculations + int j=0; // current index of the last point being used in the calculations + int sx=0; // current sum of all positions + int n=0; + + for(int k=0; k<=nsteps; k++) { + int cpos=spos+k*step; + // increase i until x[i]>=cpos-mdist; remove x from sx; 
decrement n; + while(i<nx && x[i]<(cpos-mdist)) { + n--; sx-=x[i]; i++; + //Rprintf("incremented i: i=%d; n=%d; sx=%d; cpos-mdist=%d; x[i-1]=%d\n",i,n,sx,cpos-mdist,x[i-1]); + } + //Rprintf("stable i: i=%d; n=%d; sx=%d; cpos-mdist=%d; x[i-1]=%d\n",i,n,sx,cpos-mdist,x[i-1]); + + //if(i>j) { j=i; } + + // increase j until x[j]>cpos + while(j<nx && x[j]<=cpos) { + n++; sx+=x[j]; j++; + //Rprintf("incremented j: j=%d; n=%d; sx=%d; cpos=%d; x[j-1]=%d\n",j,n,sx,cpos,x[j-1]); + } + //Rprintf("stable j: j=%d; n=%d; sx=%d; cpos=%d; x[j-1]=%d\n",j,n,sx,cpos,x[j]); + + // calculate lr + d_nv[k]=((double)(1-n))*log(lambda)-lambda*((double)(n*(cpos+1)-sx)); + //Rprintf("recorded lr[%d]=%f\n",k-1,d_nv[k-1]); + } + UNPROTECT(1); + return(nv); + } + + + SEXP allpdist(SEXP x_R,SEXP max_dist_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + double* x=REAL(x_R); + int nx=LENGTH(x_R); + double max_dist=*REAL(max_dist_R); +#ifdef DEBUG + Rprintf("n=%d; max_dist=%d\n",nx,max_dist); +#endif + + vector<double> dist; + + for(int i=0;i<nx;i++) { + for(int j=i+1;j<nx;j++) { + + double d=x[j]-x[i]; +#ifdef DEBUG + Rprintf("i=%d; j=%d; d=%f\n",i,j,d); +#endif + if(d<=max_dist) { + dist.push_back(d); + } else { + break; + } + } + } + + SEXP nv; + PROTECT(nv=allocVector(REALSXP,dist.size())); + double* i_nv=REAL(nv); + int i=0; + for(vector<double> ::const_iterator pi=dist.begin();pi!=dist.end();++pi) { + i_nv[i++]=*pi; + } + + UNPROTECT(1); + return(nv); + } + + // same as above, but for two different sets + SEXP allxpdist(SEXP x_R,SEXP y_R, SEXP max_dist_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + double* x=REAL(x_R); + double* y=REAL(y_R); + int nx=LENGTH(x_R); + int ny=LENGTH(y_R); + double max_dist=*REAL(max_dist_R); +#ifdef DEBUG + Rprintf("nx=%d; ny=%d; max_dist=%d\n",nx,ny,max_dist); +#endif + + vector<double> dist; + int yi=0; // latest y start index + + for(int i=0;i<nx;i++) { + // adjust yi so that yi>=x[i]-max_dist_R + while(y[yi]<(x[i]-max_dist) && yi<ny) { yi++; } + 
if(yi==ny) { break; } + + for(int j=yi;j<ny;j++) { + double d=y[j]-x[i]; +#ifdef DEBUG + Rprintf("i=%d; j=%d; d=%f\n",i,j,d); +#endif + if(d<=max_dist) { + dist.push_back(d); + } else { + break; + } + } + } + + SEXP nv; + PROTECT(nv=allocVector(REALSXP,dist.size())); + double* i_nv=REAL(nv); + int i=0; + for(vector<double> ::const_iterator pi=dist.begin();pi!=dist.end();++pi) { + i_nv[i++]=*pi; + } + + UNPROTECT(1); + return(nv); + } + + // returns a vector giving for each point, + // number of points within a given max_dist + SEXP nwithindist(SEXP x_R,SEXP max_dist_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + double* x=REAL(x_R); + int nx=LENGTH(x_R); + double max_dist=*REAL(max_dist_R); + + SEXP nv; + PROTECT(nv=allocVector(REALSXP,nx)); + double* i_nv=REAL(nv); + for(int i=0;i<nx;i++) { i_nv[i]=0; } + +#ifdef DEBUG + Rprintf("n=%d; max_dist=%d\n",nx,max_dist); +#endif + + for(int i=0;i<nx;i++) { + for(int j=i+1;j<nx;j++) { + + double d=x[j]-x[i]; +#ifdef DEBUG + Rprintf("i=%d; j=%d; d=%f\n",i,j,d); +#endif + if(d<=max_dist) { + i_nv[i]++; + i_nv[j]++; + } else { + break; + } + } + } + + UNPROTECT(1); + return(nv); + } + + + + + // given a list of sorted chromosome signal and background vectors (unscaled), determine + // cluster contigs exceeding thr poisson P value, based on a whs window size, + // and satisfying mcs cluster size + SEXP find_poisson_enrichment_clusters(SEXP pos_R,SEXP flag_R,SEXP wsize_R,SEXP thr_R,SEXP mcs_R,SEXP bgm_R,SEXP mintag_R,SEXP either_R) { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + double* pos=REAL(pos_R); + int* flag=INTEGER(flag_R); + int nt=LENGTH(pos_R); + + int mcs=*INTEGER(mcs_R); + int wsize=*INTEGER(wsize_R); + int either=*INTEGER(either_R); + double thr=REAL(thr_R)[0]; + double bgm=REAL(bgm_R)[0]; + double mintag=REAL(mintag_R)[0]; + +#ifdef DEBUG + Rprintf("nt=%d; wsize=%d; thr=%f; mcs=%d; min.tag=%f; bgm=%f\n",nt,wsize,thr,mcs,mintag,bgm); +#endif + + + vector< pair<double,double> > contigs; + + // running 
indecies (start and end) + int si=0; + int ei=0; + + // current window coordinate + double ws=pos[0]; + + // current window tag counts + int cc[2]={0,0}; + + + if(nt>0) { + cc[flag[si]]++; + // increment window end + while(ei<(nt-1) && (pos[ei+1]-ws) <= wsize) { + ei++; + cc[flag[ei]]++; + } + + + // cluster start,end positions + double cs,ce; + int inclust=0; + + while(si<nt-1) { + + if((pos[si+1]-ws) > (pos[ei+1] - ws - wsize) && ei!=(nt-1)) { + // move end boudnary + ei++; + ws=pos[ei]-wsize; + cc[flag[ei]]++; + while(ei<(nt-1) && pos[ei+1]==ws+wsize) { + ei++; + cc[flag[ei]]++; + } + + // increment window start + while(si<(nt-1) && pos[si] < ws) { + cc[flag[si]]--; + si++; + } + + } else { + // move up start boundary + ws=pos[si+1]; + cc[flag[si]]--; + si++; + while(si<(nt-1) && pos[si+1]==ws) { + cc[flag[si]]--; + si++; + } + + // increment window end + while(ei<(nt-1) && (pos[ei+1] - ws) <= wsize) { + ei++; + cc[flag[ei]]++; + } + + } + + // calculate z score + double dc0=((double)cc[0])+0.5; + double dc1=((double)cc[1])+0.5; + double rte=dc0+dc1-0.25*thr*thr; + double lb; + if(rte<=0) { + lb=0; + } else { + lb=(sqrt(dc1*dc0) - 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr); + if(lb<0) { lb=0; } + lb*=lb; + } + + //Rprintf("%f=f(%f,%f,%f); %f=f(%f,%f,%f)\n",lb,1.0-thr,2.0*dc1,2.0*dc0,ub,thr,2.0*dc1,2.0*dc0); + +#ifdef DEBUG + //double ub=gsl_cdf_fdist_Qinv(thr,2.0*dc1,2.0*dc0)*dc1/dc0; + double ub=(sqrt(dc1*dc0) + 0.5*thr*sqrt(rte))/(dc0 - 0.25*thr*thr); + ub*=ub; + Rprintf("s=%d (%f); e=%d (%f); window: %f-%f; cc=[%d,%d]; lb=%f; ub=%f\n",si,pos[si],ei,pos[ei],ws,ws+wsize,cc[0],cc[1],lb,ub); +#endif + + int bc=lb>=bgm && cc[1]>=mintag; + if(either) { + bc=lb>=bgm || cc[1]>=mintag; + } + if(bc) { + if(inclust) { + double nce=ws+wsize/2.0; + if(nce-ce > wsize/2.0) { + // next point is too far removed, end cluster + if(ce-cs >= mcs) { + contigs.push_back(pair<double,double>(cs,ce)); +#ifdef DEBUG + Rprintf("recorded cluster %f-%f\n",cs,ce); +#endif + } + inclust=0; + } 
else { + ce=nce; + } + } else { + inclust=1; + cs=ws+wsize/2.0; + ce=cs; + } + } else { + if(inclust) { + if(ce-cs >= mcs) { + contigs.push_back(pair<double,double>(cs,ce)); +#ifdef DEBUG + Rprintf("recorded cluster %f-%f\n",cs,ce); +#endif + } + inclust=0; + } + } + + } + + if(inclust) { + if(ce-cs >= mcs) { + contigs.push_back(pair<double,double>(cs,ce)); +#ifdef DEBUG + Rprintf("recorded cluster %f-%f\n",cs,ce); +#endif + } + inclust=0; + } + } + + SEXP cs_R,ce_R; + PROTECT(cs_R=allocVector(REALSXP,contigs.size())); + PROTECT(ce_R=allocVector(REALSXP,contigs.size())); + double* csa=REAL(cs_R); + double* cea=REAL(ce_R); + + int i=0; + for(vector< pair<double,double> >::const_iterator ci=contigs.begin(); ci!=contigs.end();++ci) { + csa[i]=ci->first; + cea[i]=ci->second; + i++; + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("s")); + SET_STRING_ELT(names_R, 1, mkChar("e")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, cs_R); + SET_VECTOR_ELT(ans_R, 1, ce_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + + } + + + // finds intersection between a list of regions + // the flag has +n/-n value, corresponding to the start/end of a segment in n-th regionset + // max_val: 1 - report max overlapping value, -1: report min, 0 - don't look at values + // returns: $s, $e, ($v) lists + SEXP region_intersection(SEXP n_R,SEXP pos_R,SEXP flags_R,SEXP vals_R,SEXP max_val_R,SEXP union_R) { + const int max_val=*INTEGER(max_val_R); + const int unionr=*INTEGER(union_R); + const int n=*INTEGER(n_R); + double* pos=REAL(pos_R); + int* flags=INTEGER(flags_R); + double* val=REAL(vals_R); + +#ifdef DEBUG + Rprintf("n=%d; npos=%d; max_val=%d\n",n,LENGTH(pos_R),max_val); +#endif + + int s[n]; // flag status for each set + double mv[n]; // max/min value of current clusters + + for(int i=0;i<n;i++) { s[i]=0; } + + vector<double> starts; + vector<double> ends; + 
vector<double> values; + + int start=-1; + double mval=0; + for(int i=0;i<LENGTH(pos_R);i++) { + // update flags + int f=flags[i]; + if(f>0) { + s[abs(f)-1]++; + } else { + s[abs(f)-1]--; + } + + if(max_val!=0 && val[i]*max_val > mval*max_val) { mval=val[i]; } + + // joined status + int all; + if(unionr) { + all=0; + for(int j=0;j<n;j++) { if(s[j]>0) { all=1; break;} } + } else { + all=1; + for(int j=0;j<n;j++) { all=all & (s[j]>0); } + } + + + //Rprintf("i=%d; s=[",i); + //for(int j=0;j<n;j++) { Rprintf("%d",s[j]); } + //Rprintf("]; all=%d; start=%d\n",all,start); + + if(start>=0) { + // in fragment + if(!all) { + // end fragment + starts.push_back(pos[start]); + ends.push_back(pos[i]); + start=-1; + if(max_val!=0) { values.push_back(mval); } + +#ifdef DEBUG + Rprintf("recorded new fragment (s=%f,e=%f,v=%f);\n",pos[start],pos[i],mval); +#endif + } + } else { + // should a fragment be started? + if(all) { + start=i; + if(max_val!=0) { mval=val[i]; } +#ifdef DEBUG + Rprintf("starting new fragment (s=%f,i=%d);\n",pos[start],i); +#endif + } + } + } + SEXP cs_R,ce_R,cv_R; + PROTECT(cs_R=allocVector(REALSXP,starts.size())); + PROTECT(ce_R=allocVector(REALSXP,ends.size())); + + double* csa=REAL(cs_R); + int i=0; + for(vector<double>::const_iterator ci=starts.begin(); ci!=starts.end(); ++ci) { + csa[i]=*ci; i++; + } + + csa=REAL(ce_R); + i=0; + for(vector<double>::const_iterator ci=ends.begin(); ci!=ends.end(); ++ci) { + csa[i]=*ci; i++; + } + + if(max_val!=0) { + PROTECT(cv_R=allocVector(REALSXP,values.size())); + csa=REAL(cv_R); + i=0; + for(vector<double>::const_iterator ci=values.begin(); ci!=values.end(); ++ci) { + csa[i]=*ci; i++; + } + } + + SEXP ans_R, names_R; + if(max_val!=0) { + PROTECT(names_R = allocVector(STRSXP, 3)); + SET_STRING_ELT(names_R, 0, mkChar("s")); + SET_STRING_ELT(names_R, 1, mkChar("e")); + SET_STRING_ELT(names_R, 2, mkChar("v")); + + PROTECT(ans_R = allocVector(VECSXP, 3)); + SET_VECTOR_ELT(ans_R, 0, cs_R); + SET_VECTOR_ELT(ans_R, 1, ce_R); + 
SET_VECTOR_ELT(ans_R, 2, cv_R); + } else { + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("s")); + SET_STRING_ELT(names_R, 1, mkChar("e")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, cs_R); + SET_VECTOR_ELT(ans_R, 1, ce_R); + } + + setAttrib(ans_R, R_NamesSymbol, names_R); + + if(max_val!=0) { + UNPROTECT(5); + } else { + UNPROTECT(4); + } + return(ans_R); + } + +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/wdl.cpp Tue Nov 27 16:13:29 2012 -0500 @@ -0,0 +1,657 @@ +#include <vector> +#include <string.h> +#include <iostream> +#include <string> +#include <set> + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +} + +using namespace std; +using namespace __gnu_cxx; + +//#define DEBUG 1 + +extern "C" { + + /************************************************************************/ + /* + * lwcc - calculate local window cross-correlation + */ + + SEXP lwcc(SEXP x_R, // positive strand hist + SEXP y_R, // negative strand hist of the same length + SEXP osize_R, // outer boundary distance + SEXP isize_R, // inner boundary distance + SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned + SEXP min_peak_dist_R, // distance between closest peaks + SEXP min_peak_val_R, // min peak threshold + SEXP tag_weight_R, // tag weight + SEXP bg_subtract_R, // a flag whether do background subtractio + SEXP bgp_R, // optional background hist for positive strand + SEXP bgn_R, // optional background hist for negative strand + SEXP bg_wsize_R, // window size for the background counts + SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout) + SEXP round_up_R // whether to round up fractional signal tag counts + ) + { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + + int is=INTEGER(isize_R)[0]; + int os=INTEGER(osize_R)[0]; + double rs=((double)(2*os+1)); + int* x=INTEGER(x_R); + int* y=INTEGER(y_R); + int n_x=LENGTH(x_R); + + // background-related + int* bgp=INTEGER(bgp_R); + int* bgn=INTEGER(bgn_R); + int bg_whs=INTEGER(bg_wsize_R)[0]; + + int return_peaks=*(INTEGER(return_peaks_R)); + double min_peak_val=*(REAL(min_peak_val_R)); + int min_peak_dist=*(INTEGER(min_peak_dist_R)); + double tag_weight=*(REAL(tag_weight_R)); + + const int round_up=*(INTEGER(round_up_R)); + const int 
bg_subtract=*(INTEGER(bg_subtract_R)); + const double bg_weight=*(REAL(bg_weight_R)); + + int i; // point at which the value is being calculated + int start=os; + int end=n_x-os-1; + + // bg tag counts within bg window + int bg_pn1=0; + int bg_nn1=0; + int bg_pn2=0; + int bg_nn2=0; + + + + // illustration for counting: + // + // 012345678901234567890123456789012 + // ==========------|------========== + // + // osize=16; isize=6; + + + SEXP nv; + double *d_nv; + vector<int> ppos; + vector<double> pval; + if(!return_peaks) { + PROTECT(nv=allocVector(REALSXP,n_x)); + d_nv=REAL(nv); + for(int i=0;i<n_x;i++) { + d_nv[i]=0; + } + } + +#ifdef DEBUG + Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight); + Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); +#endif + + int lpp=-1; // last peak position + double lpv=-1e3; // last peak value + + double ppv=-1e3; // last value + double pppv=-11e-3; // value before last + + int pn1,pn2,nn1,nn2; + + + if(bg_subtract) { + // pre-initialize background tag counts, + for(int i=0;i<bg_whs;i++) { + if(i<n_x) { + bg_pn2+=bgp[i]; + bg_nn2+=bgn[i]; + } + } + } + + + for(i=0;i<end;i++) { +#ifdef DEBUG + //Rprintf("i=%d ", i); +#endif + + if(bg_subtract) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + if(i >= start) { + // update counts, taking into account masked out regions + pn1=pn2=nn1=nn2=0; + + for(int k=0;k<=(os-is);k++) { + int xp1=x[i-os+k]; + int xp2=x[i+os-k]; + int xn1=y[i+os-k]; + int xn2=y[i-os+k]; + + if(xp1!=-1 && xn1!=-1) { + pn1+=xp1; + nn1+=xn1; + } + if(xp2!=-1 && xn2!=-1) { + pn2+=xp2; + nn2+=xn2; + } + } + + // calculate the means + double mp=((double)(pn1+pn2))/rs; + double mn=((double)(pn1+pn2))/rs; +#ifdef DEBUG + Rprintf("mp=%f mn=%f\n",mp,mn); 
+#endif + // calculate correlation + double varp=0; + double varn=0; + double num=0; + double val=-1e3; + if(mp>0 & mn>0) { + for(int k=0;k<=(os-is);k++) { + int xp1=x[i-os+k]; + int xp2=x[i+os-k]; + int xn1=y[i+os-k]; + int xn2=y[i-os+k]; + + + if(xp1!=-1 && xn1!=-1) { + double nnp1=((double) xp1)-mp; + double nnn1=((double) xn1)-mn; + num+=nnp1*nnn1; + varp+=nnp1*nnp1; + varn+=nnn1*nnn1; + } + + if(xp2!=-1 && xn2!=-1) { + double nnp2=((double) xp2)-mp; + double nnn2=((double) xn2)-mn; + num+=nnp2*nnn2; + varp+=nnp2*nnp2; + varn+=nnn2*nnn2; + } + + } + double tagw; + double spn1=((double)pn1)*tag_weight; + double snn1=((double)nn1)*tag_weight; + double spn2=((double)pn2)*tag_weight; + double snn2=((double)nn2)*tag_weight; + if(round_up) { + if(pn1>0 && spn1<1) { spn1=1.0; } + //if(pn2>0 && spn2<1) { spn2=1.0; } + if(nn1>0 && snn1<1) { snn1=1.0; } + //if(nn2>0 && snn2<1) { snn2=1.0; } + } + + if(bg_subtract) { + spn1-=((double)bg_pn1)*bg_weight; + snn1-=((double)bg_nn2)*bg_weight; + spn2-=((double)bg_pn2)*bg_weight; + snn2-=((double)bg_nn1)*bg_weight; + + if(spn2<0) spn2=0; + if(snn2<0) snn2=0; + + if(spn1>0 && snn1>0) { + tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0)); + } else { + tagw=-(spn2+snn2+1.0); + } + //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl; + } else { + tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2); + } + + if(tagw<0) { + val=0.0; + } else { + if(num==0.0) { + val=0; + } else { + val=num/(sqrt(varp*varn)); + } + val=val*sqrt(tagw) + tagw; + + } + //cout<<"val="<<val<<endl; + +#ifdef DEBUG + Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw); + Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val); +#endif + } + + + + if(return_peaks) { + // determine if previous position was a peak + if(ppv>min_peak_val && ppv>val && ppv>pppv) { + if(lpp>0 && (i-lpp+1)>min_peak_dist) { + // record previous peak position + 
ppos.push_back(lpp); + pval.push_back(lpv); +#ifdef DEBUG + Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); +#endif + lpp=i-1; lpv=ppv; +#ifdef DEBUG + Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); +#endif + } else { + if(ppv>lpv) { + // update last peak positions +#ifdef DEBUG + Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); +#endif + lpp=i-1; lpv=ppv; + } + } + } + + // update previous values + if(val!=ppv) { + pppv=ppv; ppv=val; + } + } else { + d_nv[i]=val; + } + } + } + + if(return_peaks) { + // record last position + if(lpp>0) { +#ifdef DEBUG + Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); +#endif + ppos.push_back(lpp); + pval.push_back(lpv); + } + + SEXP rpp_R,rpv_R; + PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); + PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); + int* rpp=INTEGER(rpp_R); + double* rpv=REAL(rpv_R); + + for(int i=0;i<ppos.size();i++) { + rpp[i]=ppos[i]; + rpv[i]=pval[i]; + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("x")); + SET_STRING_ELT(names_R, 1, mkChar("v")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, rpp_R); + SET_VECTOR_ELT(ans_R, 1, rpv_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + } else { + UNPROTECT(1); + return(nv); + } + + } + + + + /************************************************************************/ + /* + * wtd - window tag difference implementation + */ + + SEXP wtd(SEXP x_R, // positive strand hist + SEXP y_R, // negative strand hist of the same length + SEXP wsize_R, // outer boundary distance + SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned + SEXP min_peak_dist_R, // distance between closest peaks + SEXP min_peak_val_R, // min peak threshold + SEXP direct_count_R, // whether tag weighting should not be done + SEXP tag_weight_R, // tag weight + SEXP ignore_masking_R, // whether to 
ignore masked regions + SEXP bg_subtract_R, // a flag whether do background subtractio + SEXP bgp_R, // optional background hist for positive strand + SEXP bgn_R, // optional background hist for negative strand + SEXP bg_wsize_R, // window size for the background counts + SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference + SEXP round_up_R // whether to round up fractional signal tag counts + ) + { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + + int whs=INTEGER(wsize_R)[0]; + int* x=INTEGER(x_R); + int* y=INTEGER(y_R); + int n_x=LENGTH(x_R); + + // background-related + int* bgp=INTEGER(bgp_R); + int* bgn=INTEGER(bgn_R); + int bg_whs=INTEGER(bg_wsize_R)[0]; + + + const int return_peaks=*(INTEGER(return_peaks_R)); + const int direct_count=*(INTEGER(direct_count_R)); + const int ignore_masking=*(INTEGER(ignore_masking_R)); + const double min_peak_val=*(REAL(min_peak_val_R)); + const int min_peak_dist=*(INTEGER(min_peak_dist_R)); + const double tag_weight=*(REAL(tag_weight_R)); + + const int round_up=*(INTEGER(round_up_R)); + const int bg_subtract=*(INTEGER(bg_subtract_R)); + const double bg_weight=*(REAL(bg_weight_R)); + + int i; // point at which the value is being calculated + int start=whs+1; + int end=n_x-whs-1; + + // tag counts to calculate the means + int pn1=0; + int pn2=0; + int nn1=0; + int nn2=0; + + // bg tag counts within bg window + int bg_pn1=0; + int bg_pn2=0; + int bg_nn1=0; + int bg_nn2=0; + + SEXP nv; + double *d_nv; + vector<int> ppos; + vector<double> pval; + if(!return_peaks) { + PROTECT(nv=allocVector(REALSXP,n_x)); + d_nv=REAL(nv); + for(int i=0;i<n_x;i++) { + d_nv[i]=0; + } + } + +#ifdef DEBUG + Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking); + Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); +#endif + + int lpp=-1; // last peak position + double lpv=-1000; // last peak value + + double ppv=-1000; // last 
value + int ppl=-1; // position of the last value + double pppv=-1000; // value before last + + + if(ignore_masking==1) { + for(int i=0;i<whs;i++) { + pn1+=x[i]; + pn2+=x[i+whs+1]; + nn1+=y[i]; + nn2+=y[i+whs+1]; + + } + } + + if(bg_subtract) { + // pre-initialize background tag counts, + for(int i=0;i<bg_whs;i++) { + if(i<n_x) { + bg_pn2+=bgp[i]; + bg_nn2+=bgn[i]; + } + } + // increment center of background count window to the start position + for(int i=0;i<start;i++) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + } + + +#ifdef DEBUG + Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2); +#endif + + for(i=start;i<end;i++) { + if(bg_subtract) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + // update counts + if(ignore_masking==1) { + pn1+=x[i-1]-x[i-whs-1]; + pn2+=x[i+whs]-x[i-1]; + nn1+=y[i-1]-y[i-whs-1]; + nn2+=y[i+whs]-y[i-1]; + + } else { + + pn1=pn2=nn1=nn2=0; + + for(int k=0;k<whs;k++) { + int xp1=x[i-k-1]; + int xp2=x[i+k]; + int xn1=y[i-k-1]; + int xn2=y[i+k]; + + // omit masked positions + if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) { + pn1+=xp1; + nn1+=xn1; + pn2+=xp2; + nn2+=xn2; + } + } + } + + double val; + double spn1=((double)pn1)*tag_weight; + double snn1=((double)nn1)*tag_weight; + double spn2=((double)pn2)*tag_weight; + double snn2=((double)nn2)*tag_weight; + if(round_up) { + if(pn1>0 && spn1<1) { spn1=1.0; } + //if(pn2>0 && spn2<1) { spn2=1.0; } + //if(nn1>0 && snn1<1) { snn1=1.0; } + if(nn2>0 && snn2<1) { snn2=1.0; } + } + + if(direct_count) 
{ + val=spn1+snn2; + if(round_up && val<1) { + val=1.0; + } + if(bg_subtract) { + val-=((double) (bg_pn1+bg_nn2))*bg_weight; + } + } else { + if(bg_subtract) { + spn1-=((double)bg_pn1)*bg_weight; + snn1-=((double)bg_nn1)*bg_weight; + spn2-=((double)bg_pn2)*bg_weight; + snn2-=((double)bg_nn2)*bg_weight; + + if(spn2<0) spn2=0; + if(snn1<0) snn1=0; + + if(spn1>0 && snn2>0) { + val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0)); + } else { + val=-(spn2+snn1+1.0); + } + } else { + val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight); + } + } + //double val=sqrt(pn1*nn2); + //if(pn2>nn1) { val-=pn2; } else { val-=pn1; } +#ifdef DEBUG + Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val); +#endif + + if(return_peaks) { + // determine if previous position was a peak + if(ppv>min_peak_val && ppv>val && ppv>pppv) { + if(lpp>0 && (i-lpp+1)>min_peak_dist) { + // record previous peak position + ppos.push_back(lpp); + pval.push_back(lpv); +#ifdef DEBUG + Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); +#endif + if(ppl!=-1 && ppl!=i-1) { + lpp=(int) round((ppl+i-1)/2); + } else { + lpp=i-1; + } + lpv=ppv; +#ifdef DEBUG + Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); +#endif + } else { + if(ppv>lpv) { + // update last peak positions +#ifdef DEBUG + Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); +#endif + if(ppl!=-1 && ppl!=i-1) { + lpp=(int) round((ppl+i-1)/2); + } else { + lpp=i-1; + } + lpv=ppv; + } + } + } + + // update previous values + if(val!=ppv) { + pppv=ppv; ppv=val; ppl=i; + } + } else { + d_nv[i]=val; + } + } + + if(return_peaks) { + // record last position + if(lpp>0) { +#ifdef DEBUG + Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); +#endif + ppos.push_back(lpp); + pval.push_back(lpv); + } + + SEXP rpp_R,rpv_R; + PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); + PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); + int* rpp=INTEGER(rpp_R); + double* rpv=REAL(rpv_R); + + for(int 
i=0;i<ppos.size();i++) { + rpp[i]=ppos[i]; + rpv[i]=pval[i]; + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("x")); + SET_STRING_ELT(names_R, 1, mkChar("v")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, rpp_R); + SET_VECTOR_ELT(ans_R, 1, rpv_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + } else { + UNPROTECT(1); + return(nv); + } + + } + + +} + +