diff spp/src/wdl.cpp @ 15:e689b83b0257 draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:15:21 -0500
parents ce08b0efa3fd
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spp/src/wdl.cpp	Tue Nov 27 16:15:21 2012 -0500
@@ -0,0 +1,657 @@
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <string>
+#include <set>
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx; 
+
+//#define DEBUG 1
+
+extern "C" {
+
+  /************************************************************************/
+  /*
+   * lwcc - calculate local window cross-correlation
+   */
+
+  SEXP lwcc(SEXP x_R, // positive strand hist 
+	    SEXP y_R, // negative strand hist of the same length
+	    SEXP osize_R,       // outer boundary distance
+	    SEXP isize_R,        // inner boundary distance
+	    SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+	    SEXP min_peak_dist_R, // distance between closest peaks
+	    SEXP min_peak_val_R, // min peak threshold
+	    SEXP tag_weight_R,  // tag weight
+	    SEXP bg_subtract_R, // a flag whether do background subtractio
+	    SEXP bgp_R, // optional background hist for positive strand
+	    SEXP bgn_R, // optional background hist for negative strand
+	    SEXP bg_wsize_R, // window size for the background counts
+	    SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout)
+	    SEXP round_up_R // whether to round up fractional signal tag counts
+	    )
+  {
+
+#ifdef DEBUG  
+    Rprintf("start\n");
+#endif
+    
+    int is=INTEGER(isize_R)[0];
+    int os=INTEGER(osize_R)[0];
+    double rs=((double)(2*os+1));
+    int* x=INTEGER(x_R);
+    int* y=INTEGER(y_R);
+    int n_x=LENGTH(x_R);
+
+    // background-related 
+    int* bgp=INTEGER(bgp_R);
+    int* bgn=INTEGER(bgn_R);
+    int bg_whs=INTEGER(bg_wsize_R)[0];
+
+    int return_peaks=*(INTEGER(return_peaks_R));
+    double min_peak_val=*(REAL(min_peak_val_R));
+    int min_peak_dist=*(INTEGER(min_peak_dist_R));
+    double tag_weight=*(REAL(tag_weight_R));
+
+    const int round_up=*(INTEGER(round_up_R));
+    const int bg_subtract=*(INTEGER(bg_subtract_R));
+    const double bg_weight=*(REAL(bg_weight_R));
+
+    int i; // point at which the value is being calculated
+    int start=os;
+    int end=n_x-os-1;
+
+    // bg tag counts within bg window
+    int bg_pn1=0;
+    int bg_nn1=0;
+    int bg_pn2=0;
+    int bg_nn2=0;
+
+
+  
+    // illustration for counting:
+    //
+    // 012345678901234567890123456789012
+    // ==========------|------==========
+    //
+    //  osize=16; isize=6; 
+
+
+    SEXP nv;
+    double *d_nv;
+    vector<int> ppos;
+    vector<double> pval;
+    if(!return_peaks) {
+      PROTECT(nv=allocVector(REALSXP,n_x)); 
+      d_nv=REAL(nv);
+      for(int i=0;i<n_x;i++) {
+	d_nv[i]=0;
+      }
+    }
+
+#ifdef DEBUG  
+    Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight);
+    Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+    int lpp=-1; // last peak position
+    double lpv=-1e3; // last peak value
+    
+    double ppv=-1e3; // last value
+    double pppv=-11e-3; // value before last
+
+    int pn1,pn2,nn1,nn2;
+
+    
+    if(bg_subtract) {
+      // pre-initialize background tag counts, 
+      for(int i=0;i<bg_whs;i++) {
+	if(i<n_x) {
+	  bg_pn2+=bgp[i];
+	  bg_nn2+=bgn[i];
+	}
+      }
+    }
+
+
+    for(i=0;i<end;i++) {
+#ifdef DEBUG  
+      //Rprintf("i=%d ", i);
+#endif
+      
+      if(bg_subtract) {
+	// update background counts
+	int nl=i-bg_whs-1;
+
+	if(nl>=0) {
+	  bg_pn1-=bgp[nl];
+	  bg_nn1-=bgn[nl];
+	}
+	bg_pn1+=bgp[i];
+	bg_nn1+=bgn[i];
+
+	if(i>0) {
+	  bg_pn2-=bgp[i-1];
+	  bg_nn2-=bgn[i-1];
+	}
+	int nr=i+bg_whs;
+	if(nr<n_x) {
+	  bg_pn2+=bgp[nr];
+	  bg_nn2+=bgn[nr];
+	}
+      }
+
+      if(i >= start) {
+	// update counts, taking into account masked out regions
+	pn1=pn2=nn1=nn2=0;
+	
+	for(int k=0;k<=(os-is);k++) {
+	  int xp1=x[i-os+k];
+	  int xp2=x[i+os-k];
+	  int xn1=y[i+os-k];
+	  int xn2=y[i-os+k];
+
+	  if(xp1!=-1 && xn1!=-1) {
+	    pn1+=xp1;
+	    nn1+=xn1;
+	  }
+	  if(xp2!=-1 && xn2!=-1) {
+	    pn2+=xp2;
+	    nn2+=xn2;
+	  }
+	}
+      
+	// calculate the means
+	double mp=((double)(pn1+pn2))/rs;
+	double mn=((double)(pn1+pn2))/rs;
+#ifdef DEBUG  
+	Rprintf("mp=%f mn=%f\n",mp,mn);
+#endif
+	// calculate correlation
+	double varp=0;
+	double varn=0;
+	double num=0;
+	double val=-1e3;
+	if(mp>0 & mn>0) {
+	  for(int k=0;k<=(os-is);k++) {
+	    int xp1=x[i-os+k];
+	    int xp2=x[i+os-k];
+	    int xn1=y[i+os-k];
+	    int xn2=y[i-os+k];
+
+	    
+	    if(xp1!=-1 && xn1!=-1) {  
+	      double nnp1=((double) xp1)-mp;
+	      double nnn1=((double) xn1)-mn;
+	      num+=nnp1*nnn1;
+	      varp+=nnp1*nnp1;
+	      varn+=nnn1*nnn1;
+	    }
+	    
+	    if(xp2!=-1 && xn2!=-1) {
+	      double nnp2=((double) xp2)-mp;
+	      double nnn2=((double) xn2)-mn;
+	      num+=nnp2*nnn2;
+	      varp+=nnp2*nnp2;
+	      varn+=nnn2*nnn2;
+	    }
+
+	  }
+	  double tagw;
+	  double spn1=((double)pn1)*tag_weight;
+	  double snn1=((double)nn1)*tag_weight;
+	  double spn2=((double)pn2)*tag_weight;
+	  double snn2=((double)nn2)*tag_weight;
+	  if(round_up) {
+	    if(pn1>0 && spn1<1) { spn1=1.0; }
+	    //if(pn2>0 && spn2<1) { spn2=1.0; }
+	    if(nn1>0 && snn1<1) { snn1=1.0; }
+	    //if(nn2>0 && snn2<1) { snn2=1.0; }
+	  }
+
+	  if(bg_subtract) {
+	    spn1-=((double)bg_pn1)*bg_weight;
+	    snn1-=((double)bg_nn2)*bg_weight;
+	    spn2-=((double)bg_pn2)*bg_weight;
+	    snn2-=((double)bg_nn1)*bg_weight;
+
+	    if(spn2<0) spn2=0;
+	    if(snn2<0) snn2=0;
+	    
+	    if(spn1>0 && snn1>0) {
+	      tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0));
+	    } else {
+	      tagw=-(spn2+snn2+1.0);
+	    }
+	    //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl;
+	  } else {
+	    tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2);
+	  }
+
+	  if(tagw<0) {
+	    val=0.0; 
+	  } else {
+	    if(num==0.0) {
+	      val=0;
+	    } else {
+	      val=num/(sqrt(varp*varn));
+	    }
+	    val=val*sqrt(tagw) + tagw;
+
+	  }
+	  //cout<<"val="<<val<<endl;
+
+#ifdef DEBUG  
+        Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw);
+	Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val);
+#endif
+	}
+
+
+	
+	if(return_peaks) {
+	  // determine if previous position was a peak
+	  if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+	    if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+	      // record previous peak position
+	      ppos.push_back(lpp);
+	      pval.push_back(lpv);
+#ifdef DEBUG  
+	      Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif	    
+	      lpp=i-1; lpv=ppv;
+#ifdef DEBUG  
+	      Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif	    
+	    } else {
+	      if(ppv>lpv) {
+		// update last peak positions
+#ifdef DEBUG  
+		Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+		lpp=i-1; lpv=ppv;
+	      }
+	    }
+	  }
+
+	  // update previous values
+	  if(val!=ppv) {
+	    pppv=ppv; ppv=val;
+	  }
+	} else {
+	  d_nv[i]=val;
+	}
+      }
+    }
+
+    if(return_peaks) {
+      // record last position
+      if(lpp>0) {
+#ifdef DEBUG  
+	Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+	ppos.push_back(lpp);
+	pval.push_back(lpv);
+      }
+
+      SEXP rpp_R,rpv_R;
+      PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); 
+      PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); 
+      int* rpp=INTEGER(rpp_R);
+      double* rpv=REAL(rpv_R);
+
+      for(int i=0;i<ppos.size();i++) {
+	rpp[i]=ppos[i];
+	rpv[i]=pval[i];
+      }
+    
+      SEXP ans_R, names_R;
+      PROTECT(names_R = allocVector(STRSXP, 2));
+      SET_STRING_ELT(names_R, 0, mkChar("x"));
+      SET_STRING_ELT(names_R, 1, mkChar("v"));
+    
+      PROTECT(ans_R = allocVector(VECSXP, 2));
+      SET_VECTOR_ELT(ans_R, 0, rpp_R);
+      SET_VECTOR_ELT(ans_R, 1, rpv_R);
+      setAttrib(ans_R, R_NamesSymbol, names_R);
+  
+      UNPROTECT(4);
+      return(ans_R);
+    } else {
+      UNPROTECT(1);
+      return(nv);
+    }
+
+  }
+
+
+
+  /************************************************************************/
+  /*
+   * wtd - window tag difference implementation
+   */
+
+  SEXP wtd(SEXP x_R, // positive strand hist 
+	   SEXP y_R, // negative strand hist of the same length
+	   SEXP wsize_R,       // outer boundary distance
+	   SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
+	   SEXP min_peak_dist_R, // distance between closest peaks
+	   SEXP min_peak_val_R, // min peak threshold
+	   SEXP direct_count_R, // whether tag weighting should not be done
+	   SEXP tag_weight_R,  // tag weight
+	   SEXP ignore_masking_R,  // whether to ignore masked regions
+	   SEXP bg_subtract_R, // a flag whether do background subtractio
+	   SEXP bgp_R, // optional background hist for positive strand
+	   SEXP bgn_R, // optional background hist for negative strand
+	   SEXP bg_wsize_R, // window size for the background counts
+	   SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference
+	   SEXP round_up_R // whether to round up fractional signal tag counts
+	   )
+  {
+
+#ifdef DEBUG  
+    Rprintf("start\n");
+#endif
+    
+    int whs=INTEGER(wsize_R)[0];
+    int* x=INTEGER(x_R);
+    int* y=INTEGER(y_R);
+    int n_x=LENGTH(x_R);
+    
+    // background-related 
+    int* bgp=INTEGER(bgp_R);
+    int* bgn=INTEGER(bgn_R);
+    int bg_whs=INTEGER(bg_wsize_R)[0];
+    
+
+    const int return_peaks=*(INTEGER(return_peaks_R));
+    const int direct_count=*(INTEGER(direct_count_R));
+    const int ignore_masking=*(INTEGER(ignore_masking_R));
+    const double min_peak_val=*(REAL(min_peak_val_R));
+    const int min_peak_dist=*(INTEGER(min_peak_dist_R));
+    const double tag_weight=*(REAL(tag_weight_R));
+    
+    const int round_up=*(INTEGER(round_up_R));
+    const int bg_subtract=*(INTEGER(bg_subtract_R));
+    const double bg_weight=*(REAL(bg_weight_R));
+    
+    int i; // point at which the value is being calculated
+    int start=whs+1;
+    int end=n_x-whs-1;
+
+    // tag counts to calculate the means
+    int pn1=0;
+    int pn2=0;
+    int nn1=0;
+    int nn2=0;
+
+    // bg tag counts within bg window
+    int bg_pn1=0;
+    int bg_pn2=0;
+    int bg_nn1=0;
+    int bg_nn2=0;
+    
+    SEXP nv;
+    double *d_nv;
+    vector<int> ppos;
+    vector<double> pval;
+    if(!return_peaks) {
+      PROTECT(nv=allocVector(REALSXP,n_x)); 
+      d_nv=REAL(nv);
+      for(int i=0;i<n_x;i++) {
+	d_nv[i]=0;
+      }
+    }
+
+#ifdef DEBUG  
+    Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking);
+    Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
+#endif
+
+    int lpp=-1; // last peak position
+    double lpv=-1000; // last peak value
+    
+    double ppv=-1000; // last value
+    int ppl=-1; // position of the last value
+    double pppv=-1000; // value before last
+
+
+    if(ignore_masking==1) {
+      for(int i=0;i<whs;i++) {
+	pn1+=x[i];
+	pn2+=x[i+whs+1];
+	nn1+=y[i];
+	nn2+=y[i+whs+1];
+
+      }
+    }
+
+    if(bg_subtract) {
+      // pre-initialize background tag counts, 
+      for(int i=0;i<bg_whs;i++) {
+	if(i<n_x) {
+	  bg_pn2+=bgp[i];
+	  bg_nn2+=bgn[i];
+	}
+      }
+      // increment center of background count window to the start position
+      for(int i=0;i<start;i++) {
+	// update background counts
+	int nl=i-bg_whs-1;
+
+	if(nl>=0) {
+	  bg_pn1-=bgp[nl];
+	  bg_nn1-=bgn[nl];
+	}
+	bg_pn1+=bgp[i];
+	bg_nn1+=bgn[i];
+
+	if(i>0) {
+	  bg_pn2-=bgp[i-1];
+	  bg_nn2-=bgn[i-1];
+	}
+	int nr=i+bg_whs;
+	if(nr<n_x) {
+	  bg_pn2+=bgp[nr];
+	  bg_nn2+=bgn[nr];
+	}
+      }
+
+    }
+
+    
+#ifdef DEBUG  
+    Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2);
+#endif
+
+    for(i=start;i<end;i++) {
+      if(bg_subtract) {
+	// update background counts
+	int nl=i-bg_whs-1;
+
+	if(nl>=0) {
+	  bg_pn1-=bgp[nl];
+	  bg_nn1-=bgn[nl];
+	}
+	bg_pn1+=bgp[i];
+	bg_nn1+=bgn[i];
+
+	if(i>0) {
+	  bg_pn2-=bgp[i-1];
+	  bg_nn2-=bgn[i-1];
+	}
+	int nr=i+bg_whs;
+	if(nr<n_x) {
+	  bg_pn2+=bgp[nr];
+	  bg_nn2+=bgn[nr];
+	}
+      }
+
+      // update counts
+      if(ignore_masking==1) {
+	pn1+=x[i-1]-x[i-whs-1];
+	pn2+=x[i+whs]-x[i-1];
+	nn1+=y[i-1]-y[i-whs-1];
+	nn2+=y[i+whs]-y[i-1];
+
+      } else {
+
+	pn1=pn2=nn1=nn2=0;
+	
+	for(int k=0;k<whs;k++) {
+	  int xp1=x[i-k-1];
+	  int xp2=x[i+k];
+	  int xn1=y[i-k-1];
+	  int xn2=y[i+k];
+
+	  // omit masked positions
+	  if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) {
+	    pn1+=xp1;
+	    nn1+=xn1;
+	    pn2+=xp2;
+	    nn2+=xn2;
+	  }
+	}
+      }
+
+      double val;
+      double spn1=((double)pn1)*tag_weight;
+      double snn1=((double)nn1)*tag_weight;
+      double spn2=((double)pn2)*tag_weight;
+      double snn2=((double)nn2)*tag_weight;
+      if(round_up) {
+	if(pn1>0 && spn1<1) { spn1=1.0; }
+	//if(pn2>0 && spn2<1) { spn2=1.0; }
+	//if(nn1>0 && snn1<1) { snn1=1.0; }
+	if(nn2>0 && snn2<1) { snn2=1.0; }
+      }
+
+      if(direct_count) {
+	val=spn1+snn2;
+	if(round_up && val<1) {
+	  val=1.0;
+	}
+	if(bg_subtract) {
+	  val-=((double) (bg_pn1+bg_nn2))*bg_weight;
+	}
+      } else {
+	if(bg_subtract) {
+	  spn1-=((double)bg_pn1)*bg_weight;
+	  snn1-=((double)bg_nn1)*bg_weight;
+	  spn2-=((double)bg_pn2)*bg_weight;
+	  snn2-=((double)bg_nn2)*bg_weight;
+
+	  if(spn2<0) spn2=0;
+	  if(snn1<0) snn1=0;
+
+	  if(spn1>0 && snn2>0) {
+	    val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0));
+	  } else {
+	    val=-(spn2+snn1+1.0);
+	  }
+	} else {
+	  val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight);
+	}
+      }	
+      //double val=sqrt(pn1*nn2);
+      //if(pn2>nn1) { val-=pn2; } else { val-=pn1; }
+#ifdef DEBUG  
+      Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val);
+#endif
+      
+      if(return_peaks) {
+	// determine if previous position was a peak
+	if(ppv>min_peak_val && ppv>val && ppv>pppv) {
+	  if(lpp>0 && (i-lpp+1)>min_peak_dist) {
+	    // record previous peak position
+	    ppos.push_back(lpp);
+	    pval.push_back(lpv);
+#ifdef DEBUG  
+	    Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
+#endif	    
+	    if(ppl!=-1 && ppl!=i-1) {
+	      lpp=(int) round((ppl+i-1)/2);
+	    } else {
+	      lpp=i-1;
+	    }
+	    lpv=ppv;
+#ifdef DEBUG  
+	    Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
+#endif	    
+	  } else {
+	    if(ppv>lpv) {
+	      // update last peak positions
+#ifdef DEBUG  
+	      Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
+#endif
+	      if(ppl!=-1 && ppl!=i-1) {
+		lpp=(int) round((ppl+i-1)/2);
+	      } else {
+		lpp=i-1;
+	      }
+	      lpv=ppv;
+	    }
+	  }
+	}
+	
+	// update previous values
+	if(val!=ppv) {
+	  pppv=ppv; ppv=val; ppl=i;
+	}
+      } else {
+	d_nv[i]=val;
+      }
+    }
+
+    if(return_peaks) {
+      // record last position
+      if(lpp>0) {
+#ifdef DEBUG  
+	Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
+#endif
+	ppos.push_back(lpp);
+	pval.push_back(lpv);
+      }
+
+      SEXP rpp_R,rpv_R;
+      PROTECT(rpp_R=allocVector(INTSXP,ppos.size())); 
+      PROTECT(rpv_R=allocVector(REALSXP,ppos.size())); 
+      int* rpp=INTEGER(rpp_R);
+      double* rpv=REAL(rpv_R);
+
+      for(int i=0;i<ppos.size();i++) {
+	rpp[i]=ppos[i];
+	rpv[i]=pval[i];
+      }
+    
+      SEXP ans_R, names_R;
+      PROTECT(names_R = allocVector(STRSXP, 2));
+      SET_STRING_ELT(names_R, 0, mkChar("x"));
+      SET_STRING_ELT(names_R, 1, mkChar("v"));
+    
+      PROTECT(ans_R = allocVector(VECSXP, 2));
+      SET_VECTOR_ELT(ans_R, 0, rpp_R);
+      SET_VECTOR_ELT(ans_R, 1, rpv_R);
+      setAttrib(ans_R, R_NamesSymbol, names_R);
+  
+      UNPROTECT(4);
+      return(ans_R);
+    } else {
+      UNPROTECT(1);
+      return(nv);
+    }
+
+  }
+
+
+}
+
+