comparison spp/src/bed2vector.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
comparison
equal deleted inserted replaced
5:608a8e0eac56 6:ce08b0efa3fd
1 #include "pc.h"
2 #include "config.h"
3 #include <vector>
4 #include <string.h>
5 #include <iostream>
6 #include <fstream>
7 #include <sstream>
8 #include <strstream>
9 #include <algorithm>
10 #include <string>
11 #include <functional>
12 #include <utility>
13 #include <ext/hash_map>
14 #include <boost/tokenizer.hpp>
15
16 #ifdef HAVE_LIBBZ2
17 #include <bzlib.h>
18 #endif
19
20 extern "C" {
21 #include "R.h"
22 #include "Rmath.h"
23 #include "Rinternals.h"
24 #include "Rdefines.h"
25 }
26
27 using namespace std;
28 using namespace __gnu_cxx;
29
30
31 class lessAbsoluteValue {
32 public:
33 bool operator()(int a, int b) const {
34 return abs(a) < abs(b);
35 }
36 };
37
38
39
40 #ifdef HAVE_LIBBZ2
41 int get_bzline(BZFILE* b,string& line) {
42 char c;
43 int nBuf;
44 int bzerror=BZ_OK;
45
46 while(bzerror == BZ_OK) {
47 nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
48 if(bzerror==BZ_OK) {
49 if(c=='\n') {
50 return bzerror;
51 } else {
52 line+=c;
53 }
54 }
55 }
56 return bzerror;
57 }
58
59 int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
60 line="";
61 if(bz2file) {
62 int bzerror=get_bzline(b,line);
63 if(bzerror==BZ_OK) {
64 return(1);
65 } else {
66 if(bzerror!=BZ_STREAM_END) {
67 cerr<<"encountered BZERROR="<<bzerror<<endl;
68 }
69 return(0);
70 }
71 } else {
72 char *cline=NULL;
73 size_t n;
74 if(getline(&cline,&n,f) != -1) {
75 if(cline) {
76 cline[strlen(cline)-1]='\0';
77 line+=cline;
78 free(cline);
79 }
80 return(1);
81 } else {
82 return(0);
83 }
84 }
85 }
86 #endif
87
88
89 /**
90 * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign
91 * corresponding to the strand.
92 */
93
94 //#define DEBUG 1
95
96 extern "C" {
97 SEXP read_bed_ends(SEXP filename) {
98
99 #ifdef DEBUG
100 Rprintf("start\n");
101 #endif
102 const char* fname=CHAR(asChar(filename));
103 #ifdef DEBUG
104 Rprintf("fname=%s\n",fname);
105 #endif
106
107 // main data vector
108 // chr - pos
109 vector< vector<int> > pos;
110
111 // chromosome map
112 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
113 vector<string> cnames;
114
115 typedef boost::tokenizer<boost::char_separator<char> > tokType;
116 boost::char_separator<char> sep(" \t");
117
118
119 ifstream bed_file(fname);
120
121 #ifdef DEBUG
122 Rprintf("opened %s\n",fname);
123 #endif
124
125 Rprintf("opened %s\n",fname);
126
127 // read in bed line
128 string line;
129
130 int fcount=0;
131 while(getline(bed_file,line)) {
132
133 #ifdef DEBUG
134 Rprintf("line: %s\n",line.c_str());
135 #endif
136
137
138 tokType tok(line, sep);
139 tokType::iterator sit=tok.begin();
140 if(sit!=tok.end()) {
141 string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str()));
142 string str_start=*sit++;
143 int fstart=atoi(str_start.c_str());
144 string str_end=*sit++;
145 int fend=atoi(str_end.c_str());
146 int fpos=fstart;
147 if(sit!=tok.end()) {
148 string u0=*sit++;
149 string nfield=*sit++;
150 string strand=*sit++;
151 if(strand=="-") {
152 fpos=-1*fend;
153 }
154 }
155
156 // determine the chromosome index
157 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
158 int cind=-1;
159 if(li==cind_map.end()) {
160 // register new chromosome
161 cind=cnames.size();
162 cnames.push_back(chr);
163 cind_map[chr]=cind;
164 // allocate new pos vector
165 pos.push_back(vector<int>());
166 #ifdef DEBUG
167 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
168 #endif
169 } else {
170 cind=li->second;
171 }
172 fcount++;
173 (pos[cind]).push_back(fpos);
174 #ifdef DEBUG
175 Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
176 if(fcount>30) {
177 break;
178 }
179 #endif
180
181 }
182 }
183 bed_file.close();
184
185
186 #ifdef DEBUG
187 Rprintf("done. read %d fragments\n",fcount);
188 #endif
189
190 Rprintf("done. read %d fragments\n",fcount);
191
192 // construct output structures
193 SEXP chnames;
194 int np=0; // number of protections
195 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
196 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
197 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
198 }
199 np++;
200
201 // sort
202 for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
203 sort(csi->begin(), csi->end(), lessAbsoluteValue());
204 }
205
206 SEXP ans;
207 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
208 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
209 SEXP nv;
210 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
211 int* i_nv=INTEGER(nv);
212 int i=0;
213 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
214 i_nv[i++]=*pi;
215 }
216 SET_VECTOR_ELT(ans, csi-pos.begin(), nv);
217 }
218
219 setAttrib(ans,R_NamesSymbol,chnames);
220
221 #ifdef DEBUG
222 Rprintf("unprotecting %d elements\n",np);
223 #endif
224
225 UNPROTECT(np);
226 return(ans);
227 }
228
229
230
231 SEXP read_meland_old(SEXP filename) {
232
233 #ifdef DEBUG
234 Rprintf("start\n");
235 #endif
236 const char* fname=CHAR(asChar(filename));
237 #ifdef DEBUG
238 Rprintf("fname=%s\n",fname);
239 #endif
240
241 // main data vector
242 // chr - pos
243 vector< vector<int> > pos;
244 vector< vector<int> > posnm; // number of mismatches
245 vector< vector<int> > poslen; // length
246
247 // chromosome map
248 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
249 vector<string> cnames;
250
251
252 typedef boost::tokenizer<boost::char_separator<char> > tokType;
253 boost::char_separator<char> sep(" \t");
254
255
256 ifstream bed_file(fname);
257
258 Rprintf("opened %s\n",fname);
259
260 // read in bed line
261 string line;
262
263 int fcount=0;
264 while(getline(bed_file,line)) {
265
266 #ifdef DEBUG
267 Rprintf("line: %s\n",line.c_str());
268 #endif
269
270
271 tokType tok(line, sep);
272 tokType::iterator sit=tok.begin();
273 if(sit!=tok.end()) {
274 sit++; sit++;
275 string str_nm=*sit++;
276 int nm=0;
277 if(str_nm[0]=='U') {
278 nm=atoi((str_nm.c_str()+1));
279 } else {
280 continue;
281 }
282 sit++; sit++; sit++;
283 string str_len=*sit++;
284 int len=atoi(str_len.c_str());
285 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
286 string str_pos=*sit++;
287 int fpos=atoi(str_pos.c_str());
288
289 // determine the chromosome index
290 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
291 int cind=-1;
292 if(li==cind_map.end()) {
293 // register new chromosome
294 cind=cnames.size();
295 cnames.push_back(chr);
296 cind_map[chr]=cind;
297 // allocate new pos vector
298 pos.push_back(vector<int>());
299 posnm.push_back(vector<int>());
300 poslen.push_back(vector<int>());
301 #ifdef DEBUG
302 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
303 #endif
304 } else {
305 cind=li->second;
306 }
307 fcount++;
308 (pos[cind]).push_back(fpos);
309 (posnm[cind]).push_back(nm);
310 (poslen[cind]).push_back(len);
311 #ifdef DEBUG
312 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
313 if(fcount>30) {
314 break;
315 }
316 #endif
317
318 }
319 }
320 bed_file.close();
321
322
323 #ifdef DEBUG
324 Rprintf("done. read %d fragments\n",fcount);
325 #endif
326
327 Rprintf("done. read %d fragments\n",fcount);
328
329 // construct output structures
330 SEXP chnames;
331 int np=0; // number of protections
332 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
333 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
334 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
335 }
336 np++;
337
338 // sort
339 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
340 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
341 //}
342
343 SEXP ans;
344 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
345 vector<vector<int> >::const_iterator nsi,lsi;
346 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
347 nsi=posnm.begin()+(csi-pos.begin());
348 lsi=poslen.begin()+(csi-pos.begin());
349
350 SEXP dv,dnames_R;
351 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
352 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
353 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
354 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
355
356
357
358 SEXP tv,nv,lv;
359 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
360 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
361 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
362 int* i_tv=INTEGER(tv);
363 int* i_nv=INTEGER(nv);
364 int* i_lv=INTEGER(lv);
365
366 int i=0;
367 vector<int>::const_iterator ini=nsi->begin();
368 vector<int>::const_iterator ili=lsi->begin();
369 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
370 i_tv[i]=*pi;
371 i_nv[i]=*ini++;
372 i_lv[i]=*ili++;
373 i++;
374 }
375 PROTECT(dv = allocVector(VECSXP, 3)); np++;
376 SET_VECTOR_ELT(dv, 0, tv);
377 SET_VECTOR_ELT(dv, 1, nv);
378 SET_VECTOR_ELT(dv, 2, lv);
379 setAttrib(dv, R_NamesSymbol, dnames_R);
380
381 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
382 }
383
384 setAttrib(ans,R_NamesSymbol,chnames);
385
386 #ifdef DEBUG
387 Rprintf("unprotecting %d elements\n",np);
388 #endif
389
390 UNPROTECT(np);
391 return(ans);
392 }
393
394
395 int get_a_line(FILE *f,string& line) {
396 line="";
397 char cline[1024];
398 if(fgets(cline,1024,f)) {
399 line+=cline;
400 return(1);
401 } else {
402 return(0);
403 }
404 }
405
406
407 SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
408
409 #ifdef DEBUG
410 Rprintf("start\n");
411 #endif
412 const char* fname=CHAR(asChar(filename));
413 int read_names=*(INTEGER(read_tag_names_R));
414 #ifdef DEBUG
415 Rprintf("fname=%s\n",fname);
416 #endif
417
418 // main data vector
419 // chr - pos
420 vector< vector<int> > pos;
421 vector< vector<int> > posnm; // number of mismatches
422 vector< vector<int> > poslen; // length
423 vector< vector<string> > tagnames;
424
425 // chromosome map
426 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
427 vector<string> cnames;
428
429
430 typedef boost::tokenizer<boost::char_separator<char> > tokType;
431 boost::char_separator<char> sep(" \t");
432
433
434 FILE *f=fopen(fname,"rb");
435 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
436
437 Rprintf("opened %s\n",fname);
438
439
440 // read in bed line
441 string line;
442 int fcount=0;
443 while(get_a_line(f,line)) {
444
445 #ifdef DEBUG
446 Rprintf("line: %s\n",line.c_str());
447 #endif
448
449
450 tokType tok(line, sep);
451 tokType::iterator sit=tok.begin();
452 if(sit!=tok.end()) {
453 string tagname=*sit++;
454 sit++;
455 string str_nm=*sit++;
456 int nm=0;
457 if(str_nm[0]=='U') {
458 nm=atoi((str_nm.c_str()+1));
459 } else {
460 continue;
461 }
462 sit++; sit++; sit++;
463 string str_len=*sit++;
464 int len=atoi(str_len.c_str());
465 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
466 string str_pos=*sit++;
467 int fpos=atoi(str_pos.c_str());
468
469 // determine the chromosome index
470 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
471 int cind=-1;
472 if(li==cind_map.end()) {
473 // register new chromosome
474 cind=cnames.size();
475 cnames.push_back(chr);
476 cind_map[chr]=cind;
477 // allocate new pos vector
478 pos.push_back(vector<int>());
479 posnm.push_back(vector<int>());
480 poslen.push_back(vector<int>());
481 if(read_names) {
482 tagnames.push_back(vector<string>());
483 }
484 #ifdef DEBUG
485 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
486 #endif
487 } else {
488 cind=li->second;
489 }
490 fcount++;
491 (pos[cind]).push_back(fpos);
492 (posnm[cind]).push_back(nm);
493 (poslen[cind]).push_back(len);
494 if(read_names) {
495 (tagnames[cind]).push_back(tagname);
496 }
497 #ifdef DEBUG
498 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
499 if(fcount>30) {
500 break;
501 }
502 #endif
503
504 }
505 }
506 fclose(f);
507
508
509 #ifdef DEBUG
510 Rprintf("done. read %d fragments\n",fcount);
511 #endif
512
513 Rprintf("done. read %d fragments\n",fcount);
514
515 // construct output structures
516 SEXP chnames;
517 int np=0; // number of protections
518 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
519 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
520 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
521 }
522 np++;
523
524 // sort
525 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
526 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
527 //}
528
529 SEXP ans;
530 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
531 vector<vector<int> >::const_iterator nsi,lsi;
532 vector<vector<string> >::const_iterator ssi;
533 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
534 nsi=posnm.begin()+(csi-pos.begin());
535 lsi=poslen.begin()+(csi-pos.begin());
536
537 SEXP dv,dnames_R;
538 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
539 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
540 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
541 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
542 if(read_names) {
543 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
544 }
545
546
547
548 SEXP tv,nv,lv,sv;
549 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
550 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
551 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
552 if(read_names) {
553 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
554 }
555 int* i_tv=INTEGER(tv);
556 int* i_nv=INTEGER(nv);
557 int* i_lv=INTEGER(lv);
558
559 int i=0;
560 vector<int>::const_iterator ini=nsi->begin();
561 vector<int>::const_iterator ili=lsi->begin();
562 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
563 i_tv[i]=*pi;
564 i_nv[i]=*ini++;
565 i_lv[i]=*ili++;
566 i++;
567 }
568 if(read_names) {
569 int i=0;
570 ssi=tagnames.begin()+(csi-pos.begin());
571 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
572 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
573 i++;
574 }
575 }
576 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
577 SET_VECTOR_ELT(dv, 0, tv);
578 SET_VECTOR_ELT(dv, 1, nv);
579 SET_VECTOR_ELT(dv, 2, lv);
580 if(read_names) {
581 SET_VECTOR_ELT(dv, 3, sv);
582 }
583 setAttrib(dv, R_NamesSymbol, dnames_R);
584
585 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
586 }
587
588 setAttrib(ans,R_NamesSymbol,chnames);
589
590 #ifdef DEBUG
591 Rprintf("unprotecting %d elements\n",np);
592 #endif
593
594 UNPROTECT(np);
595 return(ans);
596 }
597
598
599
600 // reads regular eland files, recording mismatch positions
601 SEXP read_eland_mismatches(SEXP filename) {
602
603 #ifdef DEBUG
604 Rprintf("start\n");
605 #endif
606 const char* fname=CHAR(asChar(filename));
607 #ifdef DEBUG
608 Rprintf("fname=%s\n",fname);
609 #endif
610
611 // main data vector
612 // chr - pos
613 vector< vector<int> > pos;
614 vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
615 vector< vector<int> > mm2; // position of the second mismatch
616
617 // chromosome map
618 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
619 vector<string> cnames;
620
621
622 typedef boost::tokenizer<boost::char_separator<char> > tokType;
623 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
624
625
626 FILE *f=fopen(fname,"rb");
627 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
628
629 Rprintf("opened %s\n",fname);
630
631 // read in bed line
632 string line;
633 int fcount=0;
634 while(get_a_line(f,line)) {
635
636 #ifdef DEBUG
637 Rprintf("line: %s\n",line.c_str());
638 #endif
639
640
641 tokType tok(line, sep);
642 tokType::iterator sit=tok.begin();
643 if(sit!=tok.end()) {
644 sit++;
645 string seq=*sit++;
646 string str_nm=*sit++;
647 int nm=0;
648 if(str_nm[0]=='U') {
649 nm=atoi((str_nm.c_str()+1));
650 } else {
651 continue;
652 }
653 sit++; sit++; sit++;
654 string chr=*sit++;
655 // extract chromosome name from this
656 int chrp=chr.find("chr");
657 int pp=chr.find('.');
658 chr=chr.substr(chrp+3,pp-chrp-3);
659
660 string str_pos=*sit++;
661 int fpos=atoi(str_pos.c_str());
662
663
664 string strand=*sit++;
665 int nstrand=0;
666 if(strand=="R") {
667 fpos=-1*(fpos+seq.size()-1);
668 nstrand=1;
669 }
670
671 sit++;
672
673 int nm1=0; int nm2=0;
674 if(sit!=tok.end()) {
675 string nms=*sit++;
676 nm1=atoi(nms.substr(0,nms.size()-1).c_str());
677 if(nstrand) { nm1=seq.size()-nm1+1; }
678 }
679 if(sit!=tok.end()) {
680 string nms=*sit++;
681 nm2=atoi(nms.substr(0,nms.size()-1).c_str());
682 if(nstrand) { nm2=seq.size()-nm2+1; }
683 }
684
685 // determine the chromosome index
686 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
687 int cind=-1;
688 if(li==cind_map.end()) {
689 // register new chromosome
690 cind=cnames.size();
691 cnames.push_back(chr);
692 cind_map[chr]=cind;
693 // allocate new pos vector
694 pos.push_back(vector<int>());
695 mm1.push_back(vector<int>());
696 mm2.push_back(vector<int>());
697 #ifdef DEBUG
698 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
699 #endif
700 } else {
701 cind=li->second;
702 }
703 fcount++;
704 (pos[cind]).push_back(fpos);
705 (mm1[cind]).push_back(nm1);
706 (mm2[cind]).push_back(nm2);
707 #ifdef DEBUG
708 Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
709 if(fcount>30) {
710 break;
711 }
712 #endif
713
714 }
715 }
716 fclose(f);
717
718
719 #ifdef DEBUG
720 Rprintf("done. read %d fragments\n",fcount);
721 #endif
722
723 Rprintf("done. read %d fragments\n",fcount);
724
725 // construct output structures
726 SEXP chnames;
727 int np=0; // number of protections
728 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
729 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
730 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
731 }
732 np++;
733
734 // sort
735 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
736 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
737 //}
738
739 SEXP ans;
740 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
741 vector<vector<int> >::const_iterator nsi,lsi;
742 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
743 nsi=mm1.begin()+(csi-pos.begin());
744 lsi=mm2.begin()+(csi-pos.begin());
745
746 SEXP dv,dnames_R;
747 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
748 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
749 SET_STRING_ELT(dnames_R, 1, mkChar("f"));
750 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
751
752
753
754 SEXP tv,nv,lv;
755 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
756 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
757 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
758 int* i_tv=INTEGER(tv);
759 int* i_nv=INTEGER(nv);
760 int* i_lv=INTEGER(lv);
761
762 int i=0;
763 vector<int>::const_iterator ini=nsi->begin();
764 vector<int>::const_iterator ili=lsi->begin();
765 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
766 i_tv[i]=*pi;
767 i_nv[i]=*ini++;
768 i_lv[i]=*ili++;
769 i++;
770 }
771 PROTECT(dv = allocVector(VECSXP, 3)); np++;
772 SET_VECTOR_ELT(dv, 0, tv);
773 SET_VECTOR_ELT(dv, 1, nv);
774 SET_VECTOR_ELT(dv, 2, lv);
775 setAttrib(dv, R_NamesSymbol, dnames_R);
776
777 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
778 }
779
780 setAttrib(ans,R_NamesSymbol,chnames);
781
782 #ifdef DEBUG
783 Rprintf("unprotecting %d elements\n",np);
784 #endif
785
786 UNPROTECT(np);
787 return(ans);
788 }
789
790
791 // read in regular eland files, adjusting the negative strand coordinate by sequence length
792 SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
793
794 #ifdef DEBUG
795 Rprintf("start\n");
796 #endif
797 const char* fname=CHAR(asChar(filename));
798 int read_names=*(INTEGER(read_tag_names_R));
799 int eland_tag_length=*(INTEGER(eland_tag_length_R));
800 #ifdef DEBUG
801 Rprintf("fname=%s\n",fname);
802 #endif
803
804 // main data vector
805 // chr - pos
806 vector< vector<int> > pos;
807 vector< vector<int> > posnm; // number of mismatches
808 vector< vector<string> > tagnames;
809
810 // chromosome map
811 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
812 vector<string> cnames;
813
814
815 typedef boost::tokenizer<boost::char_separator<char> > tokType;
816 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
817
818
819 FILE *f=fopen(fname,"rb");
820 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
821 else {
822 Rprintf("opened %s\n",fname);
823
824 // read in bed line
825 string line;
826 int fcount=0;
827 while(get_a_line(f,line)) {
828
829 #ifdef DEBUG
830 Rprintf("line: %s\n",line.c_str());
831 #endif
832
833
834 tokType tok(line, sep);
835 tokType::iterator sit=tok.begin();
836 if(sit!=tok.end()) {
837 string tagname=*sit++;
838 string sequence=*sit++;
839 int len=sequence.size();
840 // adjust probe length if eland length limit was specified
841 if(eland_tag_length>0 && len>eland_tag_length) {
842 len=eland_tag_length;
843 }
844 string str_nm=*sit++;
845 int nm=0;
846 if(str_nm[0]=='U') {
847 nm=atoi((str_nm.c_str()+1));
848 } else {
849 continue;
850 }
851 sit++; sit++; sit++;
852 string chr=*sit++;
853 string str_pos=*sit++;
854 int fpos=atoi(str_pos.c_str());
855 string str_strand=*sit++;
856
857 if(str_strand[0]=='R') {
858 fpos=-1*(fpos+len-1);
859 }
860
861 // determine the chromosome index
862 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
863 int cind=-1;
864 if(li==cind_map.end()) {
865 // register new chromosome
866 cind=cnames.size();
867 cnames.push_back(chr);
868 cind_map[chr]=cind;
869 // allocate new pos vector
870 pos.push_back(vector<int>());
871 posnm.push_back(vector<int>());
872 if(read_names) {
873 tagnames.push_back(vector<string>());
874 }
875 #ifdef DEBUG
876 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
877 #endif
878 } else {
879 cind=li->second;
880 }
881 fcount++;
882 (pos[cind]).push_back(fpos);
883 (posnm[cind]).push_back(nm);
884 if(read_names) {
885 (tagnames[cind]).push_back(tagname);
886 }
887 #ifdef DEBUG
888 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
889 if(fcount>30) {
890 break;
891 }
892 #endif
893
894 }
895 }
896 fclose(f);
897
898 Rprintf("done. read %d fragments\n",fcount);
899 }
900 // construct output structures
901 SEXP chnames;
902 int np=0; // number of protections
903 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
904 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
905 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
906 }
907 np++;
908
909 // sort
910 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
911 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
912 //}
913
914 SEXP ans;
915 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
916 vector<vector<int> >::const_iterator nsi;
917 vector<vector<string> >::const_iterator ssi;
918 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
919 nsi=posnm.begin()+(csi-pos.begin());
920
921 SEXP dv,dnames_R;
922 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
923 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
924 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
925 if(read_names) {
926 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
927 }
928
929
930
931 SEXP tv,nv,sv;
932 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
933 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
934 if(read_names) {
935 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
936 }
937 int* i_tv=INTEGER(tv);
938 int* i_nv=INTEGER(nv);
939
940 int i=0;
941 vector<int>::const_iterator ini=nsi->begin();
942 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
943 i_tv[i]=*pi;
944 i_nv[i]=*ini++;
945 i++;
946 }
947 if(read_names) {
948 int i=0;
949 ssi=tagnames.begin()+(csi-pos.begin());
950 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
951 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
952 i++;
953 }
954 }
955 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
956 SET_VECTOR_ELT(dv, 0, tv);
957 SET_VECTOR_ELT(dv, 1, nv);
958 if(read_names) {
959 SET_VECTOR_ELT(dv, 2, sv);
960 }
961 setAttrib(dv, R_NamesSymbol, dnames_R);
962
963 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
964 }
965
966 setAttrib(ans,R_NamesSymbol,chnames);
967
968 #ifdef DEBUG
969 Rprintf("unprotecting %d elements\n",np);
970 #endif
971
972 UNPROTECT(np);
973 return(ans);
974 }
975
976
977
978 // read in extended eland files, adjusting the negative strand coordinate by sequence length
979 SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
980
981 #ifdef DEBUG
982 Rprintf("start\n");
983 #endif
984 const char* fname=CHAR(asChar(filename));
985 int read_names=*(INTEGER(read_tag_names_R));
986 int eland_tag_length=*(INTEGER(eland_tag_length_R));
987 #ifdef DEBUG
988 Rprintf("fname=%s\n",fname);
989 #endif
990
991 // main data vector
992 // chr - pos
993 vector< vector<int> > pos;
994 vector< vector<int> > posnm; // number of mismatches
995 vector< vector<string> > tagnames;
996
997 // chromosome map
998 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
999 vector<string> cnames;
1000
1001
1002 typedef boost::tokenizer<boost::char_separator<char> > tokType;
1003 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
1004
1005
1006 FILE *f=fopen(fname,"rb");
1007 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
1008 else {
1009 Rprintf("opened %s\n",fname);
1010
1011 // read in bed line
1012 string line;
1013 int fcount=0;
1014 while(get_a_line(f,line)) {
1015
1016 #ifdef DEBUG
1017 Rprintf("line: %s\n",line.c_str());
1018 #endif
1019
1020
1021 tokType tok(line, sep);
1022 tokType::iterator sit=tok.begin();
1023 if(sit!=tok.end()) {
1024 string machinename=*sit++;
1025 string runnumber=*sit++;
1026 string lanenumber=*sit++;
1027 *sit++;
1028
1029 string str_x=*sit++;
1030 string str_y=*sit++;
1031
1032 string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y;
1033
1034
1035
1036 *sit++;
1037 *sit++;
1038
1039
1040 string sequence=*sit++;
1041 *sit++;
1042
1043 string chr=*sit++;
1044 string contig=*sit++;
1045 chr=chr+contig;
1046
1047 int len=sequence.size();
1048 // adjust probe length if eland length limit was specified
1049 if(eland_tag_length>0 && len>eland_tag_length) {
1050 len=eland_tag_length;
1051 }
1052
1053
1054
1055 string str_pos=*sit++;
1056 if(str_pos.size()<1) { continue; }
1057 int fpos=atoi(str_pos.c_str());
1058 string str_strand=*sit++;
1059
1060 if(str_strand[0]=='R') {
1061 fpos=-1*(fpos+len-1);
1062 }
1063
1064 string str_nm=*sit++;
1065 // count non-digit characters
1066 int nm=0;
1067 for(int i=0;i<str_nm.size();i++) {
1068 if(!isdigit(str_nm[i])) { nm++; }
1069 }
1070
1071 // determine the chromosome index
1072 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
1073 int cind=-1;
1074 if(li==cind_map.end()) {
1075 // register new chromosome
1076 cind=cnames.size();
1077 cnames.push_back(chr);
1078 cind_map[chr]=cind;
1079 // allocate new pos vector
1080 pos.push_back(vector<int>());
1081 posnm.push_back(vector<int>());
1082 if(read_names) {
1083 tagnames.push_back(vector<string>());
1084 }
1085 #ifdef DEBUG
1086 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
1087 #endif
1088 } else {
1089 cind=li->second;
1090 }
1091 fcount++;
1092 (pos[cind]).push_back(fpos);
1093 (posnm[cind]).push_back(nm);
1094 if(read_names) {
1095 (tagnames[cind]).push_back(tagname);
1096 }
1097 #ifdef DEBUG
1098 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
1099 if(fcount>30) {
1100 break;
1101 }
1102 #endif
1103
1104 }
1105 }
1106 fclose(f);
1107
1108 Rprintf("done. read %d fragments\n",fcount);
1109 }
1110 // construct output structures
1111 SEXP chnames;
1112 int np=0; // number of protections
1113 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
1114 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
1115 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
1116 }
1117 np++;
1118
1119 // sort
1120 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
1121 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
1122 //}
1123
1124 SEXP ans;
1125 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
1126 vector<vector<int> >::const_iterator nsi;
1127 vector<vector<string> >::const_iterator ssi;
1128 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
1129 nsi=posnm.begin()+(csi-pos.begin());
1130
1131 SEXP dv,dnames_R;
1132 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
1133 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
1134 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
1135 if(read_names) {
1136 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
1137 }
1138
1139
1140
1141 SEXP tv,nv,sv;
1142 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
1143 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
1144 if(read_names) {
1145 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
1146 }
1147 int* i_tv=INTEGER(tv);
1148 int* i_nv=INTEGER(nv);
1149
1150 int i=0;
1151 vector<int>::const_iterator ini=nsi->begin();
1152 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
1153 i_tv[i]=*pi;
1154 i_nv[i]=*ini++;
1155 i++;
1156 }
1157 if(read_names) {
1158 int i=0;
1159 ssi=tagnames.begin()+(csi-pos.begin());
1160 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
1161 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
1162 i++;
1163 }
1164 }
1165 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
1166 SET_VECTOR_ELT(dv, 0, tv);
1167 SET_VECTOR_ELT(dv, 1, nv);
1168 if(read_names) {
1169 SET_VECTOR_ELT(dv, 2, sv);
1170 }
1171 setAttrib(dv, R_NamesSymbol, dnames_R);
1172
1173 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
1174 }
1175
1176 setAttrib(ans,R_NamesSymbol,chnames);
1177
1178 #ifdef DEBUG
1179 Rprintf("unprotecting %d elements\n",np);
1180 #endif
1181
1182 UNPROTECT(np);
1183 return(ans);
1184 }
1185
1186
1187 // read in eland multi files, adjusting the negative strand coordinate by sequence length
1188 SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
1189
1190 #ifdef DEBUG
1191 Rprintf("read_eland_muti() : start\n");
1192 #endif
1193 const char* fname=CHAR(asChar(filename));
1194 int read_names=*(INTEGER(read_tag_names_R));
1195 int eland_tag_length=*(INTEGER(eland_tag_length_R));
1196 #ifdef DEBUG
1197 Rprintf("fname=%s\n",fname);
1198 #endif
1199
1200 // main data vector
1201 // chr - pos
1202 vector< vector<int> > pos;
1203 vector< vector<int> > posnm; // number of mismatches
1204 vector< vector<string> > tagnames;
1205
1206 // chromosome map
1207 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
1208 vector<string> cnames;
1209
1210
1211 typedef boost::tokenizer<boost::char_separator<char> > tokType;
1212 boost::char_separator<char> sep(" \t","");
1213 boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
1214 boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
1215
1216
1217 FILE *f=fopen(fname,"rb");
1218 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
1219 else {
1220 Rprintf("opened %s\n",fname);
1221
1222 // read in bed line
1223 string line;
1224 int nline=0;
1225 int fcount=0;
1226 while(get_a_line(f,line)) {
1227 nline++;
1228 // chomp
1229 size_t elpos = line.find_last_not_of("\n");
1230 if(elpos != string::npos) {
1231 line = line.substr(0, elpos+1);
1232 }
1233 #ifdef DEBUG
1234 Rprintf("line %d: %s\n",nline,line.c_str());
1235 #endif
1236
1237 tokType tok(line, sep);
1238 tokType::iterator sit=tok.begin();
1239 if(sit!=tok.end()) {
1240 string tagname=*sit++;
1241 string sequence=*sit++;
1242 string mspec=*sit++;
1243 // parse out match spec
1244
1245 if(mspec=="NM" || mspec=="QC") { continue; }
1246 #ifdef DEBUG
1247 Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
1248 #endif
1249
1250 tokType stok(mspec, colsep);
1251 tokType::iterator ssit=stok.begin();
1252 string str_nm0=*ssit++;
1253
1254 int nm=0;
1255 int nm0=atoi(str_nm0.c_str());
1256 if(nm0>1) {
1257 #ifdef DEBUG
1258 Rprintf("rejected for nm0\n");
1259 #endif
1260 continue;
1261 }
1262 if(nm0==0) {
1263 string str_nm1=*ssit++;
1264 int nm1=atoi(str_nm1.c_str());
1265 if(nm1>1) {
1266 #ifdef DEBUG
1267 Rprintf("rejected for nm1\n");
1268 #endif
1269 continue;
1270 }
1271 if(nm1==0) {
1272 string str_nm2=*ssit++;
1273 int nm2=atoi(str_nm2.c_str());
1274 if(nm2>1) {
1275 #ifdef DEBUG
1276 Rprintf("rejected for nm2\n");
1277 #endif
1278 continue;
1279 }
1280 nm=2;
1281 } else {
1282 nm=1;
1283 }
1284 }
1285
1286 #ifdef DEBUG
1287 Rprintf("accepted (nm=%d)\n",nm);
1288 #endif
1289 int npos=0;
1290 string mpos=*sit++;
1291 vector<string> mposc;
1292 vector<int> mposp;
1293 tokType ptok(mpos, comsep);
1294 string prevchr;
1295 for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) {
1296 string cpos=*psit;
1297 npos++;
1298 int strand=1;
1299 if(cpos.size()<5) {
1300 Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
1301 }
1302 char lc=cpos.at(cpos.size()-1);
1303
1304 if(atoi(&lc)==nm) {
1305 switch(cpos.at(cpos.size()-2)) {
1306 case 'R': strand=-1; break;
1307 case 'F': strand=1; break;
1308 default:
1309 Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break;
1310 continue;
1311 }
1312 string chr,str_pos;
1313 size_t colpos=cpos.find(":");
1314 if(colpos==string::npos) {
1315 if(npos>1) {
1316 chr=prevchr;
1317 str_pos=cpos.substr(0,cpos.size()-2);
1318 } else {
1319 Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str());
1320 continue;
1321 }
1322 } else {
1323 chr=cpos.substr(0,colpos);
1324 str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
1325 }
1326 #ifdef DEBUG
1327 Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
1328 #endif
1329 int pos=strand*atoi(str_pos.c_str());
1330 mposc.push_back(chr);
1331 mposp.push_back(pos);
1332 }
1333 }
1334
1335 string chr;
1336 int fpos;
1337 if(mposc.size()!=1) {
1338 if(mposc.size()==0) {
1339 Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str());
1340 } else {
1341 Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str());
1342 }
1343 continue;
1344 } else {
1345 chr=*mposc.begin();
1346 fpos=*mposp.begin();
1347 }
1348
1349 int len=sequence.size();
1350 // adjust probe length if eland length limit was specified
1351 if(eland_tag_length>0 && len>eland_tag_length) {
1352 len=eland_tag_length;
1353 }
1354
1355 if(fpos<0) {
1356 fpos=-1*(-1*fpos+len-1);
1357 }
1358
1359 // determine the chromosome index
1360 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
1361 int cind=-1;
1362 if(li==cind_map.end()) {
1363 // register new chromosome
1364 cind=cnames.size();
1365 cnames.push_back(chr);
1366 cind_map[chr]=cind;
1367 // allocate new pos vector
1368 pos.push_back(vector<int>());
1369 posnm.push_back(vector<int>());
1370 if(read_names) {
1371 tagnames.push_back(vector<string>());
1372 }
1373 #ifdef DEBUG
1374 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
1375 #endif
1376 } else {
1377 cind=li->second;
1378 }
1379 fcount++;
1380 (pos[cind]).push_back(fpos);
1381 (posnm[cind]).push_back(nm);
1382 if(read_names) {
1383 (tagnames[cind]).push_back(tagname);
1384 }
1385 #ifdef DEBUG
1386 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
1387 if(fcount>30) {
1388 break;
1389 }
1390 #endif
1391
1392 }
1393 }
1394 fclose(f);
1395
1396 Rprintf("done. read %d fragments\n",fcount);
1397 }
1398 // construct output structures
1399 SEXP chnames;
1400 int np=0; // number of protections
1401 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
1402 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
1403 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
1404 }
1405 np++;
1406
1407 // sort
1408 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
1409 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
1410 //}
1411
1412 SEXP ans;
1413 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
1414 vector<vector<int> >::const_iterator nsi;
1415 vector<vector<string> >::const_iterator ssi;
1416 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
1417 nsi=posnm.begin()+(csi-pos.begin());
1418
1419 SEXP dv,dnames_R;
1420 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
1421 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
1422 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
1423 if(read_names) {
1424 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
1425 }
1426
1427
1428
1429 SEXP tv,nv,sv;
1430 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
1431 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
1432 if(read_names) {
1433 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
1434 }
1435 int* i_tv=INTEGER(tv);
1436 int* i_nv=INTEGER(nv);
1437
1438 int i=0;
1439 vector<int>::const_iterator ini=nsi->begin();
1440 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
1441 i_tv[i]=*pi;
1442 i_nv[i]=*ini++;
1443 i++;
1444 }
1445 if(read_names) {
1446 int i=0;
1447 ssi=tagnames.begin()+(csi-pos.begin());
1448 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
1449 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
1450 i++;
1451 }
1452 }
1453 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
1454 SET_VECTOR_ELT(dv, 0, tv);
1455 SET_VECTOR_ELT(dv, 1, nv);
1456 if(read_names) {
1457 SET_VECTOR_ELT(dv, 2, sv);
1458 }
1459 setAttrib(dv, R_NamesSymbol, dnames_R);
1460
1461 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
1462 }
1463
1464 setAttrib(ans,R_NamesSymbol,chnames);
1465
1466 #ifdef DEBUG
1467 Rprintf("unprotecting %d elements\n",np);
1468 #endif
1469
1470 UNPROTECT(np);
1471 return(ans);
1472 }
1473
1474
1475 // read in regular eland files, adjusting the negative strand coordinate by sequence length
1476 SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
1477
1478 #ifdef DEBUG
1479 Rprintf("start\n");
1480 #endif
1481 const char* fname=CHAR(asChar(filename));
1482 int read_names=*(INTEGER(read_tag_names_R));
1483 #ifdef DEBUG
1484 Rprintf("fname=%s\n",fname);
1485 #endif
1486
1487 // main data vector
1488 // chr - pos
1489 vector< vector<int> > pos;
1490 vector< vector<int> > posnm; // number of mismatches
1491 vector< vector<string> > tagnames;
1492
1493 // chromosome map
1494 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
1495 vector<string> cnames;
1496
1497
1498 typedef boost::tokenizer<boost::char_separator<char> > tokType;
1499 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
1500 boost::char_separator<char> sep2(",");
1501
1502
1503 FILE *f=fopen(fname,"rb");
1504 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
1505 } else {
1506 #ifdef HAVE_LIBBZ2
1507 BZFILE* b;
1508 int bzerror;
1509
1510 int bz2file=0;
1511 if(strstr(fname,".bz2")) {
1512 bz2file=1;
1513 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
1514 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
1515 }
1516 #endif
1517
1518 Rprintf("opened %s\n",fname);
1519
1520 // read in bed line
1521 string line;
1522 int fcount=0;
1523 #ifdef HAVE_LIBBZ2
1524 while(get_a_line(f,b,bz2file,line)) {
1525 #else
1526 while(get_a_line(f,line)) {
1527 #endif
1528
1529 #ifdef DEBUG
1530 Rprintf("line: %s\n",line.c_str());
1531 #endif
1532
1533
1534 tokType tok(line, sep);
1535 tokType::iterator sit=tok.begin();
1536 if(sit!=tok.end()) {
1537 string tagname=*sit++;
1538 string str_strand=*sit++;
1539 string chr=*sit++;
1540
1541 string str_pos=*sit++;
1542 int fpos=atoi(str_pos.c_str());
1543
1544 string sequence=*sit++;
1545 sit++; sit++;
1546 string mm=*sit++;
1547
1548 int len=sequence.size();
1549 if(str_strand[0]=='-') {
1550 fpos=-1*(fpos+len-1);
1551 }
1552 // determine number of mismatches
1553 int nm=0;
1554 if(mm.size()>0) {
1555 nm++;
1556 string::size_type tp(0);
1557 while(tp!=string::npos) {
1558 tp = mm.find(",",tp);
1559 if(tp!=string::npos) {
1560 tp++;
1561 ++nm;
1562 }
1563 }
1564 }
1565
1566
1567
1568 // determine the chromosome index
1569 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
1570 int cind=-1;
1571 if(li==cind_map.end()) {
1572 // register new chromosome
1573 cind=cnames.size();
1574 cnames.push_back(chr);
1575 cind_map[chr]=cind;
1576 // allocate new pos vector
1577 pos.push_back(vector<int>());
1578 posnm.push_back(vector<int>());
1579 if(read_names) {
1580 tagnames.push_back(vector<string>());
1581 }
1582 #ifdef DEBUG
1583 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
1584 #endif
1585 } else {
1586 cind=li->second;
1587 }
1588 fcount++;
1589 (pos[cind]).push_back(fpos);
1590 (posnm[cind]).push_back(nm);
1591 if(read_names) {
1592 (tagnames[cind]).push_back(tagname);
1593 }
1594 #ifdef DEBUG
1595 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
1596 if(fcount>30) {
1597 break;
1598 }
1599 #endif
1600
1601 }
1602 }
1603
1604 #ifdef HAVE_LIBBZ2
1605 BZ2_bzReadClose( &bzerror, b);
1606 #endif
1607 fclose(f);
1608
1609 Rprintf("done. read %d fragments\n",fcount);
1610 }
1611 // construct output structures
1612 SEXP chnames;
1613 int np=0; // number of protections
1614 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
1615 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
1616 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
1617 }
1618 np++;
1619
1620 // sort
1621 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
1622 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
1623 //}
1624
1625 SEXP ans;
1626 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
1627 vector<vector<int> >::const_iterator nsi;
1628 vector<vector<string> >::const_iterator ssi;
1629 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
1630 nsi=posnm.begin()+(csi-pos.begin());
1631
1632 SEXP dv,dnames_R;
1633 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
1634 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
1635 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
1636 if(read_names) {
1637 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
1638 }
1639
1640
1641
1642 SEXP tv,nv,sv;
1643 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
1644 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
1645 if(read_names) {
1646 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
1647 }
1648 int* i_tv=INTEGER(tv);
1649 int* i_nv=INTEGER(nv);
1650
1651 int i=0;
1652 vector<int>::const_iterator ini=nsi->begin();
1653 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
1654 i_tv[i]=*pi;
1655 i_nv[i]=*ini++;
1656 i++;
1657 }
1658 if(read_names) {
1659 int i=0;
1660 ssi=tagnames.begin()+(csi-pos.begin());
1661 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
1662 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
1663 i++;
1664 }
1665 }
1666 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
1667 SET_VECTOR_ELT(dv, 0, tv);
1668 SET_VECTOR_ELT(dv, 1, nv);
1669 if(read_names) {
1670 SET_VECTOR_ELT(dv, 2, sv);
1671 }
1672 setAttrib(dv, R_NamesSymbol, dnames_R);
1673
1674 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
1675 }
1676
1677 setAttrib(ans,R_NamesSymbol,chnames);
1678
1679 #ifdef DEBUG
1680 Rprintf("unprotecting %d elements\n",np);
1681 #endif
1682
1683 UNPROTECT(np);
1684 return(ans);
1685 }
1686
1687
1688 // read in helicos tab-separated alignment output (regular or bz2)
1689 SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
1690
1691 #ifdef DEBUG
1692 Rprintf("start\n");
1693 #endif
1694 const char* fname=CHAR(asChar(filename));
1695 int read_names=*(INTEGER(read_tag_names_R));
1696 #ifdef DEBUG
1697 Rprintf("fname=%s\n",fname);
1698 #endif
1699
1700 // main data vector
1701 // chr - pos
1702 vector< vector<int> > pos;
1703 vector< vector<int> > posnm; // number of mismatches
1704 vector< vector<int> > poslen; // length of the match
1705 vector< vector<string> > tagnames;
1706
1707 // chromosome map
1708 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
1709 vector<string> cnames;
1710
1711
1712 typedef boost::tokenizer<boost::char_separator<char> > tokType;
1713 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
1714 boost::char_separator<char> sep2(",");
1715
1716
1717 FILE *f=fopen(fname,"rb");
1718 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
1719 } else {
1720 #ifdef HAVE_LIBBZ2
1721 BZFILE* b;
1722 int bzerror;
1723
1724 int bz2file=0;
1725 if(strstr(fname,".bz2")) {
1726 bz2file=1;
1727 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
1728 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
1729 }
1730 #endif
1731
1732 Rprintf("opened %s\n",fname);
1733
1734 // read in bed line
1735 string line;
1736 int fcount=0;
1737 int nlines=0;
1738 #ifdef HAVE_LIBBZ2
1739 while(get_a_line(f,b,bz2file,line)) {
1740 #else
1741 while(get_a_line(f,line)) {
1742 #endif
1743
1744 #ifdef DEBUG
1745 Rprintf("line: %s\n",line.c_str());
1746 #endif
1747 nlines++;
1748 // skip comments
1749 if(line[0]=='#') { continue; }
1750 if(line.compare(0,12,"Reference_ID")==0) {
1751 #ifdef DEBUG
1752 Rprintf("matched header on line %d\n",nlines);
1753 #endif
1754 continue;
1755 }
1756
1757 tokType tok(line, sep);
1758 tokType::iterator sit=tok.begin();
1759 if(sit!=tok.end()) {
1760 string chr=*sit++;
1761 string tagname=*sit++;
1762 string str_startpos=*sit++;
1763 string str_endpos=*sit++;
1764
1765 string str_tstart=*sit++;
1766 string str_tend=*sit++;
1767 int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str());
1768
1769 sit++; sit++;
1770 string str_ndel=*sit++;
1771 string str_nins=*sit++;
1772 string str_nsub=*sit++;
1773
1774 string str_strand=*sit++;
1775 int fpos;
1776 if(str_strand[0]=='-') {
1777 fpos=-1*atoi(str_endpos.c_str());
1778 } else {
1779 fpos=atoi(str_startpos.c_str());
1780 }
1781
1782 // determine number of mismatches
1783 int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str());
1784
1785 // determine the chromosome index
1786 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
1787 int cind=-1;
1788 if(li==cind_map.end()) {
1789 // register new chromosome
1790 cind=cnames.size();
1791 cnames.push_back(chr);
1792 cind_map[chr]=cind;
1793 // allocate new pos vector
1794 pos.push_back(vector<int>());
1795 posnm.push_back(vector<int>());
1796 poslen.push_back(vector<int>());
1797 if(read_names) {
1798 tagnames.push_back(vector<string>());
1799 }
1800 #ifdef DEBUG
1801 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
1802 #endif
1803 } else {
1804 cind=li->second;
1805 }
1806 fcount++;
1807 (pos[cind]).push_back(fpos);
1808 (posnm[cind]).push_back(nm);
1809 (poslen[cind]).push_back(len);
1810 if(read_names) {
1811 (tagnames[cind]).push_back(tagname);
1812 }
1813 #ifdef DEBUG
1814 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
1815 if(fcount>30) {
1816 break;
1817 }
1818 #endif
1819
1820 }
1821 }
1822
1823 #ifdef HAVE_LIBBZ2
1824 BZ2_bzReadClose( &bzerror, b);
1825 #endif
1826 fclose(f);
1827
1828 Rprintf("done. read %d fragments\n",fcount);
1829 }
1830 // construct output structures
1831 SEXP chnames;
1832 int np=0; // number of protections
1833 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
1834 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
1835 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
1836 }
1837 np++;
1838
1839 // sort
1840 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
1841 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
1842 //}
1843
1844 SEXP ans;
1845 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
1846 vector<vector<int> >::const_iterator nsi;
1847 vector<vector<int> >::const_iterator lsi;
1848 vector<vector<string> >::const_iterator ssi;
1849 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
1850 nsi=posnm.begin()+(csi-pos.begin());
1851 lsi=poslen.begin()+(csi-pos.begin());
1852
1853 SEXP dv,dnames_R;
1854 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
1855 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
1856 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
1857 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
1858 if(read_names) {
1859 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
1860 }
1861
1862
1863
1864 SEXP tv,nv,lv,sv;
1865 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
1866 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
1867 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
1868 if(read_names) {
1869 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
1870 }
1871 int* i_tv=INTEGER(tv);
1872 int* i_nv=INTEGER(nv);
1873 int* i_lv=INTEGER(lv);
1874
1875 int i=0;
1876 vector<int>::const_iterator ini=nsi->begin();
1877 vector<int>::const_iterator lni=lsi->begin();
1878 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
1879 i_tv[i]=*pi;
1880 i_nv[i]=*ini++;
1881 i_lv[i]=*lni++;
1882 i++;
1883 }
1884 if(read_names) {
1885 int i=0;
1886 ssi=tagnames.begin()+(csi-pos.begin());
1887 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
1888 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
1889 i++;
1890 }
1891 }
1892 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
1893 SET_VECTOR_ELT(dv, 0, tv);
1894 SET_VECTOR_ELT(dv, 1, nv);
1895 SET_VECTOR_ELT(dv, 2, lv);
1896 if(read_names) {
1897 SET_VECTOR_ELT(dv, 3, sv);
1898 }
1899 setAttrib(dv, R_NamesSymbol, dnames_R);
1900
1901 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
1902 }
1903
1904 setAttrib(ans,R_NamesSymbol,chnames);
1905
1906 #ifdef DEBUG
1907 Rprintf("unprotecting %d elements\n",np);
1908 #endif
1909
1910 UNPROTECT(np);
1911 return(ans);
1912 }
1913
1914
1915
1916 // read in text version of maq map
1917 SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
1918
1919 #ifdef DEBUG
1920 Rprintf("start\n");
1921 #endif
1922 const char* fname=CHAR(asChar(filename));
1923 int read_names=*(INTEGER(read_tag_names_R));
1924 #ifdef DEBUG
1925 Rprintf("fname=%s\n",fname);
1926 #endif
1927
1928 // main data vector
1929 // chr - pos
1930 vector< vector<int> > pos;
1931 vector< vector<int> > posnm; // number of mismatches
1932 vector< vector<string> > tagnames;
1933
1934 // chromosome map
1935 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
1936 vector<string> cnames;
1937
1938
1939 typedef boost::tokenizer<boost::char_separator<char> > tokType;
1940 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
1941
1942
1943 FILE *f=fopen(fname,"rb");
1944 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
1945 else {
1946 Rprintf("opened %s\n",fname);
1947
1948 // read in bed line
1949 string line;
1950 int fcount=0;
1951 while(get_a_line(f,line)) {
1952
1953 #ifdef DEBUG
1954 Rprintf("line: %s\n",line.c_str());
1955 #endif
1956
1957
1958 tokType tok(line, sep);
1959 tokType::iterator sit=tok.begin();
1960 if(sit!=tok.end()) {
1961 string tagname=*sit++;
1962 string chr=*sit++;
1963 string str_pos=*sit++;
1964 int fpos=atoi(str_pos.c_str());
1965 string str_strand=*sit++;
1966 sit++; sit++; sit++; sit++; sit++;
1967 string str_nm=*sit++;
1968 sit++; sit++; sit++;
1969 string str_len=*sit++;
1970 int nm=atoi(str_nm.c_str());
1971 int len=atoi(str_len.c_str());
1972
1973 if(str_strand[0]=='-') {
1974 fpos=-1*(fpos+len-1);
1975 }
1976
1977 // determine the chromosome index
1978 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
1979 int cind=-1;
1980 if(li==cind_map.end()) {
1981 // register new chromosome
1982 cind=cnames.size();
1983 cnames.push_back(chr);
1984 cind_map[chr]=cind;
1985 // allocate new pos vector
1986 pos.push_back(vector<int>());
1987 posnm.push_back(vector<int>());
1988 if(read_names) {
1989 tagnames.push_back(vector<string>());
1990 }
1991 #ifdef DEBUG
1992 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
1993 #endif
1994 } else {
1995 cind=li->second;
1996 }
1997 fcount++;
1998 (pos[cind]).push_back(fpos);
1999 (posnm[cind]).push_back(nm);
2000 if(read_names) {
2001 (tagnames[cind]).push_back(tagname);
2002 }
2003 #ifdef DEBUG
2004 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
2005 if(fcount>30) {
2006 break;
2007 }
2008 #endif
2009
2010 }
2011 }
2012 fclose(f);
2013
2014 Rprintf("done. read %d fragments\n",fcount);
2015 }
2016 // construct output structures
2017 SEXP chnames;
2018 int np=0; // number of protections
2019 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
2020 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
2021 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
2022 }
2023 np++;
2024
2025 // sort
2026 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
2027 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
2028 //}
2029
2030 SEXP ans;
2031 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
2032 vector<vector<int> >::const_iterator nsi;
2033 vector<vector<string> >::const_iterator ssi;
2034 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
2035 nsi=posnm.begin()+(csi-pos.begin());
2036
2037 SEXP dv,dnames_R;
2038 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
2039 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
2040 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
2041 if(read_names) {
2042 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
2043 }
2044
2045
2046
2047 SEXP tv,nv,sv;
2048 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
2049 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
2050 if(read_names) {
2051 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
2052 }
2053 int* i_tv=INTEGER(tv);
2054 int* i_nv=INTEGER(nv);
2055
2056 int i=0;
2057 vector<int>::const_iterator ini=nsi->begin();
2058 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
2059 i_tv[i]=*pi;
2060 i_nv[i]=*ini++;
2061 i++;
2062 }
2063 if(read_names) {
2064 int i=0;
2065 ssi=tagnames.begin()+(csi-pos.begin());
2066 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
2067 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
2068 i++;
2069 }
2070 }
2071 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
2072 SET_VECTOR_ELT(dv, 0, tv);
2073 SET_VECTOR_ELT(dv, 1, nv);
2074 if(read_names) {
2075 SET_VECTOR_ELT(dv, 2, sv);
2076 }
2077 setAttrib(dv, R_NamesSymbol, dnames_R);
2078
2079 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
2080 }
2081
2082 setAttrib(ans,R_NamesSymbol,chnames);
2083
2084 #ifdef DEBUG
2085 Rprintf("unprotecting %d elements\n",np);
2086 #endif
2087
2088 UNPROTECT(np);
2089 return(ans);
2090 }
2091
2092
2093
2094
2095
2096 // read in tagalign file
2097 SEXP read_tagalign(SEXP filename) {
2098
2099 #ifdef DEBUG
2100 Rprintf("start\n");
2101 #endif
2102 const char* fname=CHAR(asChar(filename));
2103 #ifdef DEBUG
2104 Rprintf("fname=%s\n",fname);
2105 #endif
2106
2107 // main data vector
2108 // chr - pos
2109 vector< vector<int> > pos;
2110 vector< vector<int> > posnm; // number of mismatches
2111
2112 // chromosome map
2113 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
2114 vector<string> cnames;
2115
2116
2117 typedef boost::tokenizer<boost::char_separator<char> > tokType;
2118 boost::char_separator<char> sep(" \t");
2119
2120
2121 FILE *f=fopen(fname,"rb");
2122 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
2123 else {
2124 Rprintf("opened %s\n",fname);
2125
2126 // read in bed line
2127 string line;
2128 int fcount=0;
2129 while(get_a_line(f,line)) {
2130
2131 #ifdef DEBUG
2132 Rprintf("line: %s\n",line.c_str());
2133 #endif
2134
2135
2136 tokType tok(line, sep);
2137 tokType::iterator sit=tok.begin();
2138 if(sit!=tok.end()) {
2139 string chr=*sit++;
2140 string str_spos=*sit++;
2141 string str_epos=*sit++;
2142 sit++;
2143 string str_qual=*sit++;
2144 string str_strand=*sit;
2145
2146 int fpos;
2147 if(str_strand[0]=='+') {
2148 fpos=atoi(str_spos.c_str());
2149 } else {
2150 fpos=-1*atoi(str_epos.c_str());
2151 }
2152 int nm=atoi(str_qual.c_str());
2153
2154 // determine the chromosome index
2155 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
2156 int cind=-1;
2157 if(li==cind_map.end()) {
2158 // register new chromosome
2159 cind=cnames.size();
2160 cnames.push_back(chr);
2161 cind_map[chr]=cind;
2162 // allocate new pos vector
2163 pos.push_back(vector<int>());
2164 posnm.push_back(vector<int>());
2165 #ifdef DEBUG
2166 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
2167 #endif
2168 } else {
2169 cind=li->second;
2170 }
2171 fcount++;
2172 (pos[cind]).push_back(fpos);
2173 (posnm[cind]).push_back(nm);
2174 #ifdef DEBUG
2175 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
2176 if(fcount>30) {
2177 break;
2178 }
2179 #endif
2180
2181 }
2182 }
2183 fclose(f);
2184
2185 Rprintf("done. read %d fragments\n",fcount);
2186 }
2187 // construct output structures
2188 SEXP chnames;
2189 int np=0; // number of protections
2190 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
2191 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
2192 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
2193 }
2194 np++;
2195
2196 // sort
2197 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
2198 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
2199 //}
2200
2201 SEXP ans;
2202 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
2203 vector<vector<int> >::const_iterator nsi;
2204 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
2205 nsi=posnm.begin()+(csi-pos.begin());
2206
2207 SEXP dv,dnames_R;
2208 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
2209 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
2210 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
2211
2212
2213 SEXP tv,nv;
2214 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
2215 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
2216 int* i_tv=INTEGER(tv);
2217 int* i_nv=INTEGER(nv);
2218
2219 int i=0;
2220 vector<int>::const_iterator ini=nsi->begin();
2221 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
2222 i_tv[i]=*pi;
2223 i_nv[i]=*ini++;
2224 i++;
2225 }
2226 PROTECT(dv = allocVector(VECSXP, 2)); np++;
2227 SET_VECTOR_ELT(dv, 0, tv);
2228 SET_VECTOR_ELT(dv, 1, nv);
2229 setAttrib(dv, R_NamesSymbol, dnames_R);
2230
2231 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
2232 }
2233
2234 setAttrib(ans,R_NamesSymbol,chnames);
2235
2236 #ifdef DEBUG
2237 Rprintf("unprotecting %d elements\n",np);
2238 #endif
2239
2240 UNPROTECT(np);
2241 return(ans);
2242 }
2243
2244
2245
2246
2247 // arachne madness
2248 SEXP read_arachne(SEXP filename) {
2249
2250 #ifdef DEBUG
2251 Rprintf("start\n");
2252 #endif
2253 const char* fname=CHAR(asChar(filename));
2254 #ifdef DEBUG
2255 Rprintf("fname=%s\n",fname);
2256 #endif
2257
2258 // main data vector
2259 // chr - pos
2260 vector< vector<int> > pos;
2261 vector< vector<int> > posnm; // number of mismatches
2262
2263 // chromosome map
2264 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
2265 vector<string> cnames;
2266
2267
2268 typedef boost::tokenizer<boost::char_separator<char> > tokType;
2269 boost::char_separator<char> sep(" \t");
2270
2271
2272
2273
2274
2275 FILE *f=fopen(fname,"rb");
2276 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
2277 else {
2278
2279 #ifdef HAVE_LIBBZ2
2280 BZFILE* b;
2281 int bzerror;
2282
2283 int bz2file=0;
2284 if(strstr(fname,".bz2")) {
2285 bz2file=1;
2286 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
2287 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
2288 }
2289 #endif
2290
2291
2292 Rprintf("opened %s\n",fname);
2293
2294 // read in bed line
2295 string line;
2296 int fcount=0;
2297 #ifdef HAVE_LIBBZ2
2298 while(get_a_line(f,b,bz2file,line)) {
2299 #else
2300 while(get_a_line(f,line)) {
2301 #endif
2302
2303 #ifdef DEBUG
2304 Rprintf("line: %s\n",line.c_str());
2305 #endif
2306
2307
2308 tokType tok(line, sep);
2309 tokType::iterator sit=tok.begin();
2310 if(sit!=tok.end()) {
2311 string chr=*sit++;
2312 string str_spos=*sit++;
2313 int nm=0;
2314 if(sit!=tok.end()) {
2315 string str_mm=*sit;
2316 nm=atoi(str_mm.c_str());
2317 }
2318
2319 int fpos=atoi(str_spos.c_str());;
2320
2321
2322 // determine the chromosome index
2323 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
2324 int cind=-1;
2325 if(li==cind_map.end()) {
2326 // register new chromosome
2327 cind=cnames.size();
2328 cnames.push_back(chr);
2329 cind_map[chr]=cind;
2330 // allocate new pos vector
2331 pos.push_back(vector<int>());
2332 posnm.push_back(vector<int>());
2333 #ifdef DEBUG
2334 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
2335 #endif
2336 } else {
2337 cind=li->second;
2338 }
2339 fcount++;
2340 (pos[cind]).push_back(fpos);
2341 (posnm[cind]).push_back(nm);
2342 #ifdef DEBUG
2343 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
2344 if(fcount>30) {
2345 break;
2346 }
2347 #endif
2348
2349 }
2350 }
2351 #ifdef HAVE_LIBBZ2
2352 BZ2_bzReadClose( &bzerror, b);
2353 #endif
2354
2355 fclose(f);
2356
2357 Rprintf("done. read %d fragments\n",fcount);
2358 }
2359 // construct output structures
2360 SEXP chnames;
2361 int np=0; // number of protections
2362 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
2363 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
2364 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
2365 }
2366 np++;
2367
2368 // sort
2369 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
2370 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
2371 //}
2372
2373 SEXP ans;
2374 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
2375 vector<vector<int> >::const_iterator nsi;
2376 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
2377 nsi=posnm.begin()+(csi-pos.begin());
2378
2379 SEXP dv,dnames_R;
2380 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
2381 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
2382 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
2383
2384
2385 SEXP tv,nv;
2386 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
2387 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
2388 int* i_tv=INTEGER(tv);
2389 int* i_nv=INTEGER(nv);
2390
2391 int i=0;
2392 vector<int>::const_iterator ini=nsi->begin();
2393 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
2394 i_tv[i]=*pi;
2395 i_nv[i]=*ini++;
2396 i++;
2397 }
2398 PROTECT(dv = allocVector(VECSXP, 2)); np++;
2399 SET_VECTOR_ELT(dv, 0, tv);
2400 SET_VECTOR_ELT(dv, 1, nv);
2401 setAttrib(dv, R_NamesSymbol, dnames_R);
2402
2403 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
2404 }
2405
2406 setAttrib(ans,R_NamesSymbol,chnames);
2407
2408 #ifdef DEBUG
2409 Rprintf("unprotecting %d elements\n",np);
2410 #endif
2411
2412 UNPROTECT(np);
2413 return(ans);
2414 }
2415
2416
2417 // arachne madness
2418 SEXP read_arachne_long(SEXP filename) {
2419
2420 #ifdef DEBUG
2421 Rprintf("start\n");
2422 #endif
2423 const char* fname=CHAR(asChar(filename));
2424 #ifdef DEBUG
2425 Rprintf("fname=%s\n",fname);
2426 #endif
2427
2428 // main data vector
2429 // chr - pos
2430 vector< vector<int> > pos;
2431 vector< vector<int> > posnm; // number of mismatches
2432 vector< vector<int> > poslen; // length of the match
2433
2434 // chromosome map
2435 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
2436 vector<string> cnames;
2437
2438
2439 typedef boost::tokenizer<boost::char_separator<char> > tokType;
2440 boost::char_separator<char> sep(" \t");
2441
2442
2443
2444
2445
2446 FILE *f=fopen(fname,"rb");
2447 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
2448 else {
2449
2450 #ifdef HAVE_LIBBZ2
2451 BZFILE* b;
2452 int bzerror;
2453
2454 int bz2file=0;
2455 if(strstr(fname,".bz2")) {
2456 bz2file=1;
2457 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
2458 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
2459 }
2460 #endif
2461
2462
2463 Rprintf("opened %s\n",fname);
2464
2465 // read in bed line
2466 string line;
2467 int fcount=0;
2468 #ifdef HAVE_LIBBZ2
2469 while(get_a_line(f,b,bz2file,line)) {
2470 #else
2471 while(get_a_line(f,line)) {
2472 #endif
2473
2474 #ifdef DEBUG
2475 Rprintf("line: %s\n",line.c_str());
2476 #endif
2477
2478
2479 tokType tok(line, sep);
2480 tokType::iterator sit=tok.begin();
2481 if(sit!=tok.end()) {
2482 string query=*sit++;
2483 if(query!="QUERY") { continue; }
2484 *sit++; *sit++; *sit++; *sit++;
2485 string str_strand=*sit++;
2486 string chr=*sit++;
2487 string str_startpos=*sit++;
2488 string str_endpos=*sit++;
2489
2490 int fpos;
2491 if(str_strand[0]=='1') {
2492 fpos=-1*atoi(str_endpos.c_str());
2493 } else {
2494 fpos=atoi(str_startpos.c_str());
2495 }
2496 #ifdef DEBUG
2497 Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
2498 #endif
2499 *sit++;
2500 string str_nblocks=*sit++;
2501 int nblocks=atoi(str_nblocks.c_str());
2502 #ifdef DEBUG
2503 Rprintf("nblocks=%d\n",nblocks);
2504 #endif
2505 // tally up the read length and the number of mismatches for all blocks
2506 int len=0; int nm=0;
2507 for(int i=0;i<nblocks;i++) {
2508 string str_sgs=*sit++;
2509 int sgs=atoi(str_sgs.c_str());
2510 string str_slen=*sit++;
2511 int slen=atoi(str_slen.c_str());
2512 string str_snm=*sit++;
2513 int snm=atoi(str_snm.c_str());
2514 #ifdef DEBUG
2515 Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
2516 #endif
2517 len+=slen;
2518 nm+=abs(sgs)+snm;
2519 }
2520 nm+=nblocks-1;
2521
2522
2523 // determine the chromosome index
2524 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
2525 int cind=-1;
2526 if(li==cind_map.end()) {
2527 // register new chromosome
2528 cind=cnames.size();
2529 cnames.push_back(chr);
2530 cind_map[chr]=cind;
2531 // allocate new pos vector
2532 pos.push_back(vector<int>());
2533 posnm.push_back(vector<int>());
2534 poslen.push_back(vector<int>());
2535 #ifdef DEBUG
2536 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
2537 #endif
2538 } else {
2539 cind=li->second;
2540 }
2541 fcount++;
2542 (pos[cind]).push_back(fpos);
2543 (posnm[cind]).push_back(nm);
2544 (poslen[cind]).push_back(len);
2545 #ifdef DEBUG
2546 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
2547 if(fcount>30) {
2548 break;
2549 }
2550 #endif
2551
2552 }
2553 }
2554 #ifdef HAVE_LIBBZ2
2555 BZ2_bzReadClose( &bzerror, b);
2556 #endif
2557
2558 fclose(f);
2559
2560 Rprintf("done. read %d fragments\n",fcount);
2561 }
2562 // construct output structures
2563 SEXP chnames;
2564 int np=0; // number of protections
2565 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
2566 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
2567 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
2568 }
2569 np++;
2570
2571 // sort
2572 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
2573 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
2574 //}
2575
2576 SEXP ans;
2577 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
2578 vector<vector<int> >::const_iterator nsi;
2579 vector<vector<int> >::const_iterator lsi;
2580 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
2581 nsi=posnm.begin()+(csi-pos.begin());
2582 lsi=poslen.begin()+(csi-pos.begin());
2583
2584 SEXP dv,dnames_R;
2585 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
2586 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
2587 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
2588 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
2589
2590
2591 SEXP tv,nv,lv;
2592 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
2593 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
2594 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
2595 int* i_tv=INTEGER(tv);
2596 int* i_nv=INTEGER(nv);
2597 int* i_lv=INTEGER(lv);
2598
2599 int i=0;
2600 vector<int>::const_iterator ini=nsi->begin();
2601 vector<int>::const_iterator lni=lsi->begin();
2602 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
2603 i_tv[i]=*pi;
2604 i_nv[i]=*ini++;
2605 i_lv[i]=*lni++;
2606 i++;
2607 }
2608 PROTECT(dv = allocVector(VECSXP, 3)); np++;
2609 SET_VECTOR_ELT(dv, 0, tv);
2610 SET_VECTOR_ELT(dv, 1, nv);
2611 SET_VECTOR_ELT(dv, 2, lv);
2612 setAttrib(dv, R_NamesSymbol, dnames_R);
2613
2614 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
2615 }
2616
2617 setAttrib(ans,R_NamesSymbol,chnames);
2618
2619 #ifdef DEBUG
2620 Rprintf("unprotecting %d elements\n",np);
2621 #endif
2622
2623 UNPROTECT(np);
2624 return(ans);
2625 }
2626
2627
2628 }