comparison spp/src/wdl.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
comparison
equal deleted inserted replaced
5:608a8e0eac56 6:ce08b0efa3fd
1 #include <vector>
2 #include <string.h>
3 #include <iostream>
4 #include <string>
5 #include <set>
6
7 extern "C" {
8 #include "R.h"
9 #include "Rmath.h"
10 #include "Rinternals.h"
11 #include "Rdefines.h"
12 }
13
14 using namespace std;
15 using namespace __gnu_cxx;
16
17 //#define DEBUG 1
18
19 extern "C" {
20
21 /************************************************************************/
22 /*
23 * lwcc - calculate local window cross-correlation
24 */
25
26 SEXP lwcc(SEXP x_R, // positive strand hist
27 SEXP y_R, // negative strand hist of the same length
28 SEXP osize_R, // outer boundary distance
29 SEXP isize_R, // inner boundary distance
30 SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
31 SEXP min_peak_dist_R, // distance between closest peaks
32 SEXP min_peak_val_R, // min peak threshold
33 SEXP tag_weight_R, // tag weight
34 SEXP bg_subtract_R, // a flag whether do background subtractio
35 SEXP bgp_R, // optional background hist for positive strand
36 SEXP bgn_R, // optional background hist for negative strand
37 SEXP bg_wsize_R, // window size for the background counts
38 SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference (including is cutout)
39 SEXP round_up_R // whether to round up fractional signal tag counts
40 )
41 {
42
43 #ifdef DEBUG
44 Rprintf("start\n");
45 #endif
46
47 int is=INTEGER(isize_R)[0];
48 int os=INTEGER(osize_R)[0];
49 double rs=((double)(2*os+1));
50 int* x=INTEGER(x_R);
51 int* y=INTEGER(y_R);
52 int n_x=LENGTH(x_R);
53
54 // background-related
55 int* bgp=INTEGER(bgp_R);
56 int* bgn=INTEGER(bgn_R);
57 int bg_whs=INTEGER(bg_wsize_R)[0];
58
59 int return_peaks=*(INTEGER(return_peaks_R));
60 double min_peak_val=*(REAL(min_peak_val_R));
61 int min_peak_dist=*(INTEGER(min_peak_dist_R));
62 double tag_weight=*(REAL(tag_weight_R));
63
64 const int round_up=*(INTEGER(round_up_R));
65 const int bg_subtract=*(INTEGER(bg_subtract_R));
66 const double bg_weight=*(REAL(bg_weight_R));
67
68 int i; // point at which the value is being calculated
69 int start=os;
70 int end=n_x-os-1;
71
72 // bg tag counts within bg window
73 int bg_pn1=0;
74 int bg_nn1=0;
75 int bg_pn2=0;
76 int bg_nn2=0;
77
78
79
80 // illustration for counting:
81 //
82 // 012345678901234567890123456789012
83 // ==========------|------==========
84 //
85 // osize=16; isize=6;
86
87
88 SEXP nv;
89 double *d_nv;
90 vector<int> ppos;
91 vector<double> pval;
92 if(!return_peaks) {
93 PROTECT(nv=allocVector(REALSXP,n_x));
94 d_nv=REAL(nv);
95 for(int i=0;i<n_x;i++) {
96 d_nv[i]=0;
97 }
98 }
99
100 #ifdef DEBUG
101 Rprintf("start=%d end=%d tag_weight=%f\n", start,end,tag_weight);
102 Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
103 #endif
104
105 int lpp=-1; // last peak position
106 double lpv=-1e3; // last peak value
107
108 double ppv=-1e3; // last value
109 double pppv=-11e-3; // value before last
110
111 int pn1,pn2,nn1,nn2;
112
113
114 if(bg_subtract) {
115 // pre-initialize background tag counts,
116 for(int i=0;i<bg_whs;i++) {
117 if(i<n_x) {
118 bg_pn2+=bgp[i];
119 bg_nn2+=bgn[i];
120 }
121 }
122 }
123
124
125 for(i=0;i<end;i++) {
126 #ifdef DEBUG
127 //Rprintf("i=%d ", i);
128 #endif
129
130 if(bg_subtract) {
131 // update background counts
132 int nl=i-bg_whs-1;
133
134 if(nl>=0) {
135 bg_pn1-=bgp[nl];
136 bg_nn1-=bgn[nl];
137 }
138 bg_pn1+=bgp[i];
139 bg_nn1+=bgn[i];
140
141 if(i>0) {
142 bg_pn2-=bgp[i-1];
143 bg_nn2-=bgn[i-1];
144 }
145 int nr=i+bg_whs;
146 if(nr<n_x) {
147 bg_pn2+=bgp[nr];
148 bg_nn2+=bgn[nr];
149 }
150 }
151
152 if(i >= start) {
153 // update counts, taking into account masked out regions
154 pn1=pn2=nn1=nn2=0;
155
156 for(int k=0;k<=(os-is);k++) {
157 int xp1=x[i-os+k];
158 int xp2=x[i+os-k];
159 int xn1=y[i+os-k];
160 int xn2=y[i-os+k];
161
162 if(xp1!=-1 && xn1!=-1) {
163 pn1+=xp1;
164 nn1+=xn1;
165 }
166 if(xp2!=-1 && xn2!=-1) {
167 pn2+=xp2;
168 nn2+=xn2;
169 }
170 }
171
172 // calculate the means
173 double mp=((double)(pn1+pn2))/rs;
174 double mn=((double)(pn1+pn2))/rs;
175 #ifdef DEBUG
176 Rprintf("mp=%f mn=%f\n",mp,mn);
177 #endif
178 // calculate correlation
179 double varp=0;
180 double varn=0;
181 double num=0;
182 double val=-1e3;
183 if(mp>0 & mn>0) {
184 for(int k=0;k<=(os-is);k++) {
185 int xp1=x[i-os+k];
186 int xp2=x[i+os-k];
187 int xn1=y[i+os-k];
188 int xn2=y[i-os+k];
189
190
191 if(xp1!=-1 && xn1!=-1) {
192 double nnp1=((double) xp1)-mp;
193 double nnn1=((double) xn1)-mn;
194 num+=nnp1*nnn1;
195 varp+=nnp1*nnp1;
196 varn+=nnn1*nnn1;
197 }
198
199 if(xp2!=-1 && xn2!=-1) {
200 double nnp2=((double) xp2)-mp;
201 double nnn2=((double) xn2)-mn;
202 num+=nnp2*nnn2;
203 varp+=nnp2*nnp2;
204 varn+=nnn2*nnn2;
205 }
206
207 }
208 double tagw;
209 double spn1=((double)pn1)*tag_weight;
210 double snn1=((double)nn1)*tag_weight;
211 double spn2=((double)pn2)*tag_weight;
212 double snn2=((double)nn2)*tag_weight;
213 if(round_up) {
214 if(pn1>0 && spn1<1) { spn1=1.0; }
215 //if(pn2>0 && spn2<1) { spn2=1.0; }
216 if(nn1>0 && snn1<1) { snn1=1.0; }
217 //if(nn2>0 && snn2<1) { snn2=1.0; }
218 }
219
220 if(bg_subtract) {
221 spn1-=((double)bg_pn1)*bg_weight;
222 snn1-=((double)bg_nn2)*bg_weight;
223 spn2-=((double)bg_pn2)*bg_weight;
224 snn2-=((double)bg_nn1)*bg_weight;
225
226 if(spn2<0) spn2=0;
227 if(snn2<0) snn2=0;
228
229 if(spn1>0 && snn1>0) {
230 tagw=(2.0*sqrt(spn1*snn1)-(spn2+snn2+1.0));
231 } else {
232 tagw=-(spn2+snn2+1.0);
233 }
234 //cout<<"bg_pn1="<<bg_pn1<<"; bg_pn2="<<bg_pn2<<"; bg_nn1="<<bg_nn1<<"; bg_nn2="<<bg_nn2<<endl;
235 } else {
236 tagw=2.0*sqrt(spn1*snn1)-(spn2+snn2);
237 }
238
239 if(tagw<0) {
240 val=0.0;
241 } else {
242 if(num==0.0) {
243 val=0;
244 } else {
245 val=num/(sqrt(varp*varn));
246 }
247 val=val*sqrt(tagw) + tagw;
248
249 }
250 //cout<<"val="<<val<<endl;
251
252 #ifdef DEBUG
253 Rprintf("pn1=%d pn2=%d nn1=%d nn2=%d tag.weight=%f tagw=%f\n",pn1,pn2,nn1,nn2,tag_weight,tagw);
254 Rprintf("tagw=%f varp=%f varn=%f num=%f cor=%f val=%f\n",tagw,varp,varn,num,num/sqrt(varp*varn),val);
255 #endif
256 }
257
258
259
260 if(return_peaks) {
261 // determine if previous position was a peak
262 if(ppv>min_peak_val && ppv>val && ppv>pppv) {
263 if(lpp>0 && (i-lpp+1)>min_peak_dist) {
264 // record previous peak position
265 ppos.push_back(lpp);
266 pval.push_back(lpv);
267 #ifdef DEBUG
268 Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
269 #endif
270 lpp=i-1; lpv=ppv;
271 #ifdef DEBUG
272 Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
273 #endif
274 } else {
275 if(ppv>lpv) {
276 // update last peak positions
277 #ifdef DEBUG
278 Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
279 #endif
280 lpp=i-1; lpv=ppv;
281 }
282 }
283 }
284
285 // update previous values
286 if(val!=ppv) {
287 pppv=ppv; ppv=val;
288 }
289 } else {
290 d_nv[i]=val;
291 }
292 }
293 }
294
295 if(return_peaks) {
296 // record last position
297 if(lpp>0) {
298 #ifdef DEBUG
299 Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
300 #endif
301 ppos.push_back(lpp);
302 pval.push_back(lpv);
303 }
304
305 SEXP rpp_R,rpv_R;
306 PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
307 PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
308 int* rpp=INTEGER(rpp_R);
309 double* rpv=REAL(rpv_R);
310
311 for(int i=0;i<ppos.size();i++) {
312 rpp[i]=ppos[i];
313 rpv[i]=pval[i];
314 }
315
316 SEXP ans_R, names_R;
317 PROTECT(names_R = allocVector(STRSXP, 2));
318 SET_STRING_ELT(names_R, 0, mkChar("x"));
319 SET_STRING_ELT(names_R, 1, mkChar("v"));
320
321 PROTECT(ans_R = allocVector(VECSXP, 2));
322 SET_VECTOR_ELT(ans_R, 0, rpp_R);
323 SET_VECTOR_ELT(ans_R, 1, rpv_R);
324 setAttrib(ans_R, R_NamesSymbol, names_R);
325
326 UNPROTECT(4);
327 return(ans_R);
328 } else {
329 UNPROTECT(1);
330 return(nv);
331 }
332
333 }
334
335
336
337 /************************************************************************/
338 /*
339 * wtd - window tag difference implementation
340 */
341
342 SEXP wtd(SEXP x_R, // positive strand hist
343 SEXP y_R, // negative strand hist of the same length
344 SEXP wsize_R, // outer boundary distance
345 SEXP return_peaks_R, // whether all correlation values, or just peaks should be returned
346 SEXP min_peak_dist_R, // distance between closest peaks
347 SEXP min_peak_val_R, // min peak threshold
348 SEXP direct_count_R, // whether tag weighting should not be done
349 SEXP tag_weight_R, // tag weight
350 SEXP ignore_masking_R, // whether to ignore masked regions
351 SEXP bg_subtract_R, // a flag whether do background subtractio
352 SEXP bgp_R, // optional background hist for positive strand
353 SEXP bgn_R, // optional background hist for negative strand
354 SEXP bg_wsize_R, // window size for the background counts
355 SEXP bg_weight_R, // optional weighting for the background tags, must compensate for window size difference
356 SEXP round_up_R // whether to round up fractional signal tag counts
357 )
358 {
359
360 #ifdef DEBUG
361 Rprintf("start\n");
362 #endif
363
364 int whs=INTEGER(wsize_R)[0];
365 int* x=INTEGER(x_R);
366 int* y=INTEGER(y_R);
367 int n_x=LENGTH(x_R);
368
369 // background-related
370 int* bgp=INTEGER(bgp_R);
371 int* bgn=INTEGER(bgn_R);
372 int bg_whs=INTEGER(bg_wsize_R)[0];
373
374
375 const int return_peaks=*(INTEGER(return_peaks_R));
376 const int direct_count=*(INTEGER(direct_count_R));
377 const int ignore_masking=*(INTEGER(ignore_masking_R));
378 const double min_peak_val=*(REAL(min_peak_val_R));
379 const int min_peak_dist=*(INTEGER(min_peak_dist_R));
380 const double tag_weight=*(REAL(tag_weight_R));
381
382 const int round_up=*(INTEGER(round_up_R));
383 const int bg_subtract=*(INTEGER(bg_subtract_R));
384 const double bg_weight=*(REAL(bg_weight_R));
385
386 int i; // point at which the value is being calculated
387 int start=whs+1;
388 int end=n_x-whs-1;
389
390 // tag counts to calculate the means
391 int pn1=0;
392 int pn2=0;
393 int nn1=0;
394 int nn2=0;
395
396 // bg tag counts within bg window
397 int bg_pn1=0;
398 int bg_pn2=0;
399 int bg_nn1=0;
400 int bg_nn2=0;
401
402 SEXP nv;
403 double *d_nv;
404 vector<int> ppos;
405 vector<double> pval;
406 if(!return_peaks) {
407 PROTECT(nv=allocVector(REALSXP,n_x));
408 d_nv=REAL(nv);
409 for(int i=0;i<n_x;i++) {
410 d_nv[i]=0;
411 }
412 }
413
414 #ifdef DEBUG
415 Rprintf("whs=%d start=%d end=%d tag_weight=%f ignore_masing=%d\n", whs, start,end,tag_weight,ignore_masking);
416 Rprintf("x[1]=%d x[2]=%d y[1]=%d y[2]=%d\n",x[1],x[2],y[1],y[2]);
417 #endif
418
419 int lpp=-1; // last peak position
420 double lpv=-1000; // last peak value
421
422 double ppv=-1000; // last value
423 int ppl=-1; // position of the last value
424 double pppv=-1000; // value before last
425
426
427 if(ignore_masking==1) {
428 for(int i=0;i<whs;i++) {
429 pn1+=x[i];
430 pn2+=x[i+whs+1];
431 nn1+=y[i];
432 nn2+=y[i+whs+1];
433
434 }
435 }
436
437 if(bg_subtract) {
438 // pre-initialize background tag counts,
439 for(int i=0;i<bg_whs;i++) {
440 if(i<n_x) {
441 bg_pn2+=bgp[i];
442 bg_nn2+=bgn[i];
443 }
444 }
445 // increment center of background count window to the start position
446 for(int i=0;i<start;i++) {
447 // update background counts
448 int nl=i-bg_whs-1;
449
450 if(nl>=0) {
451 bg_pn1-=bgp[nl];
452 bg_nn1-=bgn[nl];
453 }
454 bg_pn1+=bgp[i];
455 bg_nn1+=bgn[i];
456
457 if(i>0) {
458 bg_pn2-=bgp[i-1];
459 bg_nn2-=bgn[i-1];
460 }
461 int nr=i+bg_whs;
462 if(nr<n_x) {
463 bg_pn2+=bgp[nr];
464 bg_nn2+=bgn[nr];
465 }
466 }
467
468 }
469
470
471 #ifdef DEBUG
472 Rprintf("initialization: i=%d pn1=%d, pn2=%d, nn1=%d, nn2=%d", i,pn1,pn2,nn1,nn2);
473 #endif
474
475 for(i=start;i<end;i++) {
476 if(bg_subtract) {
477 // update background counts
478 int nl=i-bg_whs-1;
479
480 if(nl>=0) {
481 bg_pn1-=bgp[nl];
482 bg_nn1-=bgn[nl];
483 }
484 bg_pn1+=bgp[i];
485 bg_nn1+=bgn[i];
486
487 if(i>0) {
488 bg_pn2-=bgp[i-1];
489 bg_nn2-=bgn[i-1];
490 }
491 int nr=i+bg_whs;
492 if(nr<n_x) {
493 bg_pn2+=bgp[nr];
494 bg_nn2+=bgn[nr];
495 }
496 }
497
498 // update counts
499 if(ignore_masking==1) {
500 pn1+=x[i-1]-x[i-whs-1];
501 pn2+=x[i+whs]-x[i-1];
502 nn1+=y[i-1]-y[i-whs-1];
503 nn2+=y[i+whs]-y[i-1];
504
505 } else {
506
507 pn1=pn2=nn1=nn2=0;
508
509 for(int k=0;k<whs;k++) {
510 int xp1=x[i-k-1];
511 int xp2=x[i+k];
512 int xn1=y[i-k-1];
513 int xn2=y[i+k];
514
515 // omit masked positions
516 if(xp1!=-1 && xn1!=-1 && xp2!=-1 && xn2!=-1) {
517 pn1+=xp1;
518 nn1+=xn1;
519 pn2+=xp2;
520 nn2+=xn2;
521 }
522 }
523 }
524
525 double val;
526 double spn1=((double)pn1)*tag_weight;
527 double snn1=((double)nn1)*tag_weight;
528 double spn2=((double)pn2)*tag_weight;
529 double snn2=((double)nn2)*tag_weight;
530 if(round_up) {
531 if(pn1>0 && spn1<1) { spn1=1.0; }
532 //if(pn2>0 && spn2<1) { spn2=1.0; }
533 //if(nn1>0 && snn1<1) { snn1=1.0; }
534 if(nn2>0 && snn2<1) { snn2=1.0; }
535 }
536
537 if(direct_count) {
538 val=spn1+snn2;
539 if(round_up && val<1) {
540 val=1.0;
541 }
542 if(bg_subtract) {
543 val-=((double) (bg_pn1+bg_nn2))*bg_weight;
544 }
545 } else {
546 if(bg_subtract) {
547 spn1-=((double)bg_pn1)*bg_weight;
548 snn1-=((double)bg_nn1)*bg_weight;
549 spn2-=((double)bg_pn2)*bg_weight;
550 snn2-=((double)bg_nn2)*bg_weight;
551
552 if(spn2<0) spn2=0;
553 if(snn1<0) snn1=0;
554
555 if(spn1>0 && snn2>0) {
556 val=(2.0*sqrt(spn1*snn2)-(spn2+snn1+1.0));
557 } else {
558 val=-(spn2+snn1+1.0);
559 }
560 } else {
561 val=2.0*sqrt(spn1*snn2)-(spn2+snn1+tag_weight);
562 }
563 }
564 //double val=sqrt(pn1*nn2);
565 //if(pn2>nn1) { val-=pn2; } else { val-=pn1; }
566 #ifdef DEBUG
567 Rprintf("update: i=%d pn1=%d pn2=%d nn1=%d nn2=%d val=%f\n",i,pn1,pn2,nn1,nn2,val);
568 #endif
569
570 if(return_peaks) {
571 // determine if previous position was a peak
572 if(ppv>min_peak_val && ppv>val && ppv>pppv) {
573 if(lpp>0 && (i-lpp+1)>min_peak_dist) {
574 // record previous peak position
575 ppos.push_back(lpp);
576 pval.push_back(lpv);
577 #ifdef DEBUG
578 Rprintf("recording peak x=%d y=%f d=%d\n",lpp,lpv,(i-lpp));
579 #endif
580 if(ppl!=-1 && ppl!=i-1) {
581 lpp=(int) round((ppl+i-1)/2);
582 } else {
583 lpp=i-1;
584 }
585 lpv=ppv;
586 #ifdef DEBUG
587 Rprintf("updated peak to x=%d y=%f\n",lpp,lpv);
588 #endif
589 } else {
590 if(ppv>lpv) {
591 // update last peak positions
592 #ifdef DEBUG
593 Rprintf("skipping peak x=%d y=%f d=%d in favor of x=%d y=%f\n",lpp,lpv,(i-lpp),i-1,ppv);
594 #endif
595 if(ppl!=-1 && ppl!=i-1) {
596 lpp=(int) round((ppl+i-1)/2);
597 } else {
598 lpp=i-1;
599 }
600 lpv=ppv;
601 }
602 }
603 }
604
605 // update previous values
606 if(val!=ppv) {
607 pppv=ppv; ppv=val; ppl=i;
608 }
609 } else {
610 d_nv[i]=val;
611 }
612 }
613
614 if(return_peaks) {
615 // record last position
616 if(lpp>0) {
617 #ifdef DEBUG
618 Rprintf("recording last peak x=%d y=%f\n",lpp,lpv);
619 #endif
620 ppos.push_back(lpp);
621 pval.push_back(lpv);
622 }
623
624 SEXP rpp_R,rpv_R;
625 PROTECT(rpp_R=allocVector(INTSXP,ppos.size()));
626 PROTECT(rpv_R=allocVector(REALSXP,ppos.size()));
627 int* rpp=INTEGER(rpp_R);
628 double* rpv=REAL(rpv_R);
629
630 for(int i=0;i<ppos.size();i++) {
631 rpp[i]=ppos[i];
632 rpv[i]=pval[i];
633 }
634
635 SEXP ans_R, names_R;
636 PROTECT(names_R = allocVector(STRSXP, 2));
637 SET_STRING_ELT(names_R, 0, mkChar("x"));
638 SET_STRING_ELT(names_R, 1, mkChar("v"));
639
640 PROTECT(ans_R = allocVector(VECSXP, 2));
641 SET_VECTOR_ELT(ans_R, 0, rpp_R);
642 SET_VECTOR_ELT(ans_R, 1, rpv_R);
643 setAttrib(ans_R, R_NamesSymbol, names_R);
644
645 UNPROTECT(4);
646 return(ans_R);
647 } else {
648 UNPROTECT(1);
649 return(nv);
650 }
651
652 }
653
654
655 }
656
657