Mercurial > repos > zzhou > spp_phantompeak
diff spp/src/wdl.cpp @ 15:e689b83b0257 draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:15:21 -0500 |
parents | ce08b0efa3fd |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/spp/src/wdl.cpp Tue Nov 27 16:15:21 2012 -0500 @@ -0,0 +1,657 @@ +#include <vector> +#include <string.h> +#include <iostream> +#include <string> +#include <set> + +extern "C" { +#include "R.h" +#include "Rmath.h" +#include "Rinternals.h" +#include "Rdefines.h" +} + +using namespace std; +using namespace __gnu_cxx; + +//#define DEBUG 1 + +extern "C" { + + /************************************************************************/ + /* + * lwcc - calculate local window cross-correlation + */ + + SEXP lwcc(SEXP x_R, // positive strand hist + SEXP y_R, // negative strand hist of the same length + SEXP osize_R, // outer boundary distance + SEXP isize_R, // inner boundary distance + SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned + SEXP min_peak_dist_R, // distance between closest peaks + SEXP min_peak_val_R, // min peak threshold + SEXP tag_weight_R, // tag weight + SEXP bg_subtract_R, // a flag whether do background subtractio + SEXP bgp_R, // optional background hist for positive strand + SEXP bgn_R, // optional background hist for negative strand + SEXP bg_wsize_R, // window size for the background counts + SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout) + SEXP round_up_R // whether to round up fractional signal tag counts + ) + { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + + int is=INTEGER(isize_R)[0]; + int os=INTEGER(osize_R)[0]; + double rs=((double)(2*os+1)); + int* x=INTEGER(x_R); + int* y=INTEGER(y_R); + int n_x=LENGTH(x_R); + + // background-related + int* bgp=INTEGER(bgp_R); + int* bgn=INTEGER(bgn_R); + int bg_whs=INTEGER(bg_wsize_R)[0]; + + int return_peaks=*(INTEGER(return_peaks_R)); + double min_peak_val=*(REAL(min_peak_val_R)); + int min_peak_dist=*(INTEGER(min_peak_dist_R)); + double tag_weight=*(REAL(tag_weight_R)); + + const int round_up=*(INTEGER(round_up_R)); + const int bg_subtract=*(INTEGER(bg_subtract_R)); + const double bg_weight=*(REAL(bg_weight_R)); + + int i; // point at which the value is being calculated + int start=os; + int end=n_x-os-1; + + // bg tag counts within bg window + int bg_pn1=0; + int bg_nn1=0; + int bg_pn2=0; + int bg_nn2=0; + + + + // illustration for counting: + // + // 012345678901234567890123456789012 + // ==========------|------========== + // + // osize=16; isize=6; + + + SEXP nv; + double *d_nv; + vector<int> ppos; + vector<double> pval; + if(!return_peaks) { + PROTECT(nv=allocVector(REALSXP,n_x)); + d_nv=REAL(nv); + for(int i=0;i<n_x;i++) { + d_nv[i]=0; + } + } + +#ifdef DEBUG + Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight); + Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); +#endif + + int lpp=-1; // last peak position + double lpv=-1e3; // last peak value + + double ppv=-1e3; // last value + double pppv=-11e-3; // value before last + + int pn1,pn2,nn1,nn2; + + + if(bg_subtract) { + // pre-initialize background tag counts, + for(int i=0;i<bg_whs;i++) { + if(i<n_x) { + bg_pn2+=bgp[i]; + bg_nn2+=bgn[i]; + } + } + } + + + for(i=0;i<end;i++) { +#ifdef DEBUG + //Rprintf("i=%d ", i); +#endif + + if(bg_subtract) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + if(i >= start) { + // update counts, taking into account masked out regions + pn1=pn2=nn1=nn2=0; + + for(int k=0;k<=(os-is);k++) { + int xp1=x[i-os+k]; + int xp2=x[i+os-k]; + int xn1=y[i+os-k]; + int xn2=y[i-os+k]; + + if(xp1!=-1 && xn1!=-1) { + pn1+=xp1; + nn1+=xn1; + } + if(xp2!=-1 && xn2!=-1) { + pn2+=xp2; + nn2+=xn2; + } + } + + // calculate the means + double mp=((double)(pn1+pn2))/rs; + double mn=((double)(pn1+pn2))/rs; +#ifdef DEBUG + Rprintf("mp=%f mn=%f\n",mp,mn); +#endif + // calculate correlation + double varp=0; + double varn=0; + double num=0; + double val=-1e3; + if(mp>0 & mn>0) { + for(int k=0;k<=(os-is);k++) { + int xp1=x[i-os+k]; + int xp2=x[i+os-k]; + int xn1=y[i+os-k]; + int xn2=y[i-os+k]; + + + if(xp1!=-1 && xn1!=-1) { + double nnp1=((double) xp1)-mp; + double nnn1=((double) xn1)-mn; + num+=nnp1*nnn1; + varp+=nnp1*nnp1; + varn+=nnn1*nnn1; + } + + if(xp2!=-1 && xn2!=-1) { + double nnp2=((double) xp2)-mp; + double nnn2=((double) xn2)-mn; + num+=nnp2*nnn2; + varp+=nnp2*nnp2; + varn+=nnn2*nnn2; + } + + } + double tagw; + double spn1=((double)pn1)*tag_weight; + double snn1=((double)nn1)*tag_weight; + double spn2=((double)pn2)*tag_weight; + double snn2=((double)nn2)*tag_weight; + if(round_up) { + if(pn1>0 && spn1<1) { spn1=1.0; } + //if(pn2>0 && spn2<1) { spn2=1.0; } + if(nn1>0 && snn1<1) { snn1=1.0; } + //if(nn2>0 && snn2<1) { snn2=1.0; } + } + + if(bg_subtract) { + spn1-=((double)bg_pn1)*bg_weight; + snn1-=((double)bg_nn2)*bg_weight; + spn2-=((double)bg_pn2)*bg_weight; + snn2-=((double)bg_nn1)*bg_weight; + + if(spn2<0) spn2=0; + if(snn2<0) snn2=0; + + if(spn1>0 && snn1>0) { + tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0)); + } else { + tagw=-(spn2+snn2+1.0); + } + //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl; + } else { + tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2); + } + + if(tagw<0) { + val=0.0; + } else { + if(num==0.0) { + val=0; + } else { + val=num/(sqrt(varp*varn)); + } + val=val*sqrt(tagw) + tagw; + + } + //cout<<"val="<<val<<endl; + +#ifdef DEBUG + Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw); + Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val); +#endif + } + + + + if(return_peaks) { + // determine if previous position was a peak + if(ppv>min_peak_val && ppv>val && ppv>pppv) { + if(lpp>0 && (i-lpp+1)>min_peak_dist) { + // record previous peak position + ppos.push_back(lpp); + pval.push_back(lpv); +#ifdef DEBUG + Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); +#endif + lpp=i-1; lpv=ppv; +#ifdef DEBUG + Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); +#endif + } else { + if(ppv>lpv) { + // update last peak positions +#ifdef DEBUG + Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); +#endif + lpp=i-1; lpv=ppv; + } + } + } + + // update previous values + if(val!=ppv) { + pppv=ppv; ppv=val; + } + } else { + d_nv[i]=val; + } + } + } + + if(return_peaks) { + // record last position + if(lpp>0) { +#ifdef DEBUG + Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); +#endif + ppos.push_back(lpp); + pval.push_back(lpv); + } + + SEXP rpp_R,rpv_R; + PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); + PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); + int* rpp=INTEGER(rpp_R); + double* rpv=REAL(rpv_R); + + for(int i=0;i<ppos.size();i++) { + rpp[i]=ppos[i]; + rpv[i]=pval[i]; + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("x")); + SET_STRING_ELT(names_R, 1, mkChar("v")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, rpp_R); + SET_VECTOR_ELT(ans_R, 1, rpv_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + } else { + UNPROTECT(1); + return(nv); + } + + } + + + + /************************************************************************/ + /* + * wtd - window tag difference implementation + */ + + SEXP wtd(SEXP x_R, // positive strand hist + SEXP y_R, // negative strand hist of the same length + SEXP wsize_R, // outer boundary distance + SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned + SEXP min_peak_dist_R, // distance between closest peaks + SEXP min_peak_val_R, // min peak threshold + SEXP direct_count_R, // whether tag weighting should not be done + SEXP tag_weight_R, // tag weight + SEXP ignore_masking_R, // whether to ignore masked regions + SEXP bg_subtract_R, // a flag whether do background subtractio + SEXP bgp_R, // optional background hist for positive strand + SEXP bgn_R, // optional background hist for negative strand + SEXP bg_wsize_R, // window size for the background counts + SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference + SEXP round_up_R // whether to round up fractional signal tag counts + ) + { + +#ifdef DEBUG + Rprintf("start\n"); +#endif + + int whs=INTEGER(wsize_R)[0]; + int* x=INTEGER(x_R); + int* y=INTEGER(y_R); + int n_x=LENGTH(x_R); + + // background-related + int* bgp=INTEGER(bgp_R); + int* bgn=INTEGER(bgn_R); + int bg_whs=INTEGER(bg_wsize_R)[0]; + + + const int return_peaks=*(INTEGER(return_peaks_R)); + const int direct_count=*(INTEGER(direct_count_R)); + const int ignore_masking=*(INTEGER(ignore_masking_R)); + const double min_peak_val=*(REAL(min_peak_val_R)); + const int min_peak_dist=*(INTEGER(min_peak_dist_R)); + const double tag_weight=*(REAL(tag_weight_R)); + + const int round_up=*(INTEGER(round_up_R)); + const int bg_subtract=*(INTEGER(bg_subtract_R)); + const double bg_weight=*(REAL(bg_weight_R)); + + int i; // point at which the value is being calculated + int start=whs+1; + int end=n_x-whs-1; + + // tag counts to calculate the means + int pn1=0; + int pn2=0; + int nn1=0; + int nn2=0; + + // bg tag counts within bg window + int bg_pn1=0; + int bg_pn2=0; + int bg_nn1=0; + int bg_nn2=0; + + SEXP nv; + double *d_nv; + vector<int> ppos; + vector<double> pval; + if(!return_peaks) { + PROTECT(nv=allocVector(REALSXP,n_x)); + d_nv=REAL(nv); + for(int i=0;i<n_x;i++) { + d_nv[i]=0; + } + } + +#ifdef DEBUG + Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking); + Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]); +#endif + + int lpp=-1; // last peak position + double lpv=-1000; // last peak value + + double ppv=-1000; // last value + int ppl=-1; // position of the last value + double pppv=-1000; // value before last + + + if(ignore_masking==1) { + for(int i=0;i<whs;i++) { + pn1+=x[i]; + pn2+=x[i+whs+1]; + nn1+=y[i]; + nn2+=y[i+whs+1]; + + } + } + + if(bg_subtract) { + // pre-initialize background tag counts, + for(int i=0;i<bg_whs;i++) { + if(i<n_x) { + bg_pn2+=bgp[i]; + bg_nn2+=bgn[i]; + } + } + // increment center of background count window to the start position + for(int i=0;i<start;i++) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + } + + +#ifdef DEBUG + Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2); +#endif + + for(i=start;i<end;i++) { + if(bg_subtract) { + // update background counts + int nl=i-bg_whs-1; + + if(nl>=0) { + bg_pn1-=bgp[nl]; + bg_nn1-=bgn[nl]; + } + bg_pn1+=bgp[i]; + bg_nn1+=bgn[i]; + + if(i>0) { + bg_pn2-=bgp[i-1]; + bg_nn2-=bgn[i-1]; + } + int nr=i+bg_whs; + if(nr<n_x) { + bg_pn2+=bgp[nr]; + bg_nn2+=bgn[nr]; + } + } + + // update counts + if(ignore_masking==1) { + pn1+=x[i-1]-x[i-whs-1]; + pn2+=x[i+whs]-x[i-1]; + nn1+=y[i-1]-y[i-whs-1]; + nn2+=y[i+whs]-y[i-1]; + + } else { + + pn1=pn2=nn1=nn2=0; + + for(int k=0;k<whs;k++) { + int xp1=x[i-k-1]; + int xp2=x[i+k]; + int xn1=y[i-k-1]; + int xn2=y[i+k]; + + // omit masked positions + if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) { + pn1+=xp1; + nn1+=xn1; + pn2+=xp2; + nn2+=xn2; + } + } + } + + double val; + double spn1=((double)pn1)*tag_weight; + double snn1=((double)nn1)*tag_weight; + double spn2=((double)pn2)*tag_weight; + double snn2=((double)nn2)*tag_weight; + if(round_up) { + if(pn1>0 && spn1<1) { spn1=1.0; } + //if(pn2>0 && spn2<1) { spn2=1.0; } + //if(nn1>0 && snn1<1) { snn1=1.0; } + if(nn2>0 && snn2<1) { snn2=1.0; } + } + + if(direct_count) { + val=spn1+snn2; + if(round_up && val<1) { + val=1.0; + } + if(bg_subtract) { + val-=((double) (bg_pn1+bg_nn2))*bg_weight; + } + } else { + if(bg_subtract) { + spn1-=((double)bg_pn1)*bg_weight; + snn1-=((double)bg_nn1)*bg_weight; + spn2-=((double)bg_pn2)*bg_weight; + snn2-=((double)bg_nn2)*bg_weight; + + if(spn2<0) spn2=0; + if(snn1<0) snn1=0; + + if(spn1>0 && snn2>0) { + val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0)); + } else { + val=-(spn2+snn1+1.0); + } + } else { + val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight); + } + } + //double val=sqrt(pn1*nn2); + //if(pn2>nn1) { val-=pn2; } else { val-=pn1; } +#ifdef DEBUG + Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val); +#endif + + if(return_peaks) { + // determine if previous position was a peak + if(ppv>min_peak_val && ppv>val && ppv>pppv) { + if(lpp>0 && (i-lpp+1)>min_peak_dist) { + // record previous peak position + ppos.push_back(lpp); + pval.push_back(lpv); +#ifdef DEBUG + Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp)); +#endif + if(ppl!=-1 && ppl!=i-1) { + lpp=(int) round((ppl+i-1)/2); + } else { + lpp=i-1; + } + lpv=ppv; +#ifdef DEBUG + Rprintf("updated peak to x=%d y=%f\n",lpp,lpv); +#endif + } else { + if(ppv>lpv) { + // update last peak positions +#ifdef DEBUG + Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv); +#endif + if(ppl!=-1 && ppl!=i-1) { + lpp=(int) round((ppl+i-1)/2); + } else { + lpp=i-1; + } + lpv=ppv; + } + } + } + + // update previous values + if(val!=ppv) { + pppv=ppv; ppv=val; ppl=i; + } + } else { + d_nv[i]=val; + } + } + + if(return_peaks) { + // record last position + if(lpp>0) { +#ifdef DEBUG + Rprintf("recording last peak x=%d y=%f\n",lpp,lpv); +#endif + ppos.push_back(lpp); + pval.push_back(lpv); + } + + SEXP rpp_R,rpv_R; + PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); + PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); + int* rpp=INTEGER(rpp_R); + double* rpv=REAL(rpv_R); + + for(int i=0;i<ppos.size();i++) { + rpp[i]=ppos[i]; + rpv[i]=pval[i]; + } + + SEXP ans_R, names_R; + PROTECT(names_R = allocVector(STRSXP, 2)); + SET_STRING_ELT(names_R, 0, mkChar("x")); + SET_STRING_ELT(names_R, 1, mkChar("v")); + + PROTECT(ans_R = allocVector(VECSXP, 2)); + SET_VECTOR_ELT(ans_R, 0, rpp_R); + SET_VECTOR_ELT(ans_R, 1, rpv_R); + setAttrib(ans_R, R_NamesSymbol, names_R); + + UNPROTECT(4); + return(ans_R); + } else { + UNPROTECT(1); + return(nv); + } + + } + + +} + +