Mercurial > repos > zzhou > spp_phantompeak
comparison spp/src/bed2vector.cpp @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 #include "pc.h" | |
2 #include "config.h" | |
3 #include <vector> | |
4 #include <string.h> | |
5 #include <iostream> | |
6 #include <fstream> | |
7 #include <sstream> | |
8 #include <strstream> | |
9 #include <algorithm> | |
10 #include <string> | |
11 #include <functional> | |
12 #include <utility> | |
13 #include <ext/hash_map> | |
14 #include <boost/tokenizer.hpp> | |
15 | |
16 #ifdef HAVE_LIBBZ2 | |
17 #include <bzlib.h> | |
18 #endif | |
19 | |
20 extern "C" { | |
21 #include "R.h" | |
22 #include "Rmath.h" | |
23 #include "Rinternals.h" | |
24 #include "Rdefines.h" | |
25 } | |
26 | |
27 using namespace std; | |
28 using namespace __gnu_cxx; | |
29 | |
30 | |
31 class lessAbsoluteValue { | |
32 public: | |
33 bool operator()(int a, int b) const { | |
34 return abs(a) < abs(b); | |
35 } | |
36 }; | |
37 | |
38 | |
39 | |
40 #ifdef HAVE_LIBBZ2 | |
41 int get_bzline(BZFILE* b,string& line) { | |
42 char c; | |
43 int nBuf; | |
44 int bzerror=BZ_OK; | |
45 | |
46 while(bzerror == BZ_OK) { | |
47 nBuf=BZ2_bzRead(&bzerror, b, &c, 1); | |
48 if(bzerror==BZ_OK) { | |
49 if(c=='\n') { | |
50 return bzerror; | |
51 } else { | |
52 line+=c; | |
53 } | |
54 } | |
55 } | |
56 return bzerror; | |
57 } | |
58 | |
59 int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) { | |
60 line=""; | |
61 if(bz2file) { | |
62 int bzerror=get_bzline(b,line); | |
63 if(bzerror==BZ_OK) { | |
64 return(1); | |
65 } else { | |
66 if(bzerror!=BZ_STREAM_END) { | |
67 cerr<<"encountered BZERROR="<<bzerror<<endl; | |
68 } | |
69 return(0); | |
70 } | |
71 } else { | |
72 char *cline=NULL; | |
73 size_t n; | |
74 if(getline(&cline,&n,f) != -1) { | |
75 if(cline) { | |
76 cline[strlen(cline)-1]='\0'; | |
77 line+=cline; | |
78 free(cline); | |
79 } | |
80 return(1); | |
81 } else { | |
82 return(0); | |
83 } | |
84 } | |
85 } | |
86 #endif | |
87 | |
88 | |
89 /** | |
90 * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign | |
91 * corresponding to the strand. | |
92 */ | |
93 | |
94 //#define DEBUG 1 | |
95 | |
96 extern "C" { | |
97 SEXP read_bed_ends(SEXP filename) { | |
98 | |
99 #ifdef DEBUG | |
100 Rprintf("start\n"); | |
101 #endif | |
102 const char* fname=CHAR(asChar(filename)); | |
103 #ifdef DEBUG | |
104 Rprintf("fname=%s\n",fname); | |
105 #endif | |
106 | |
107 // main data vector | |
108 // chr - pos | |
109 vector< vector<int> > pos; | |
110 | |
111 // chromosome map | |
112 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
113 vector<string> cnames; | |
114 | |
115 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
116 boost::char_separator<char> sep(" \t"); | |
117 | |
118 | |
119 ifstream bed_file(fname); | |
120 | |
121 #ifdef DEBUG | |
122 Rprintf("opened %s\n",fname); | |
123 #endif | |
124 | |
125 Rprintf("opened %s\n",fname); | |
126 | |
127 // read in bed line | |
128 string line; | |
129 | |
130 int fcount=0; | |
131 while(getline(bed_file,line)) { | |
132 | |
133 #ifdef DEBUG | |
134 Rprintf("line: %s\n",line.c_str()); | |
135 #endif | |
136 | |
137 | |
138 tokType tok(line, sep); | |
139 tokType::iterator sit=tok.begin(); | |
140 if(sit!=tok.end()) { | |
141 string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str())); | |
142 string str_start=*sit++; | |
143 int fstart=atoi(str_start.c_str()); | |
144 string str_end=*sit++; | |
145 int fend=atoi(str_end.c_str()); | |
146 int fpos=fstart; | |
147 if(sit!=tok.end()) { | |
148 string u0=*sit++; | |
149 string nfield=*sit++; | |
150 string strand=*sit++; | |
151 if(strand=="-") { | |
152 fpos=-1*fend; | |
153 } | |
154 } | |
155 | |
156 // determine the chromosome index | |
157 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
158 int cind=-1; | |
159 if(li==cind_map.end()) { | |
160 // register new chromosome | |
161 cind=cnames.size(); | |
162 cnames.push_back(chr); | |
163 cind_map[chr]=cind; | |
164 // allocate new pos vector | |
165 pos.push_back(vector<int>()); | |
166 #ifdef DEBUG | |
167 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
168 #endif | |
169 } else { | |
170 cind=li->second; | |
171 } | |
172 fcount++; | |
173 (pos[cind]).push_back(fpos); | |
174 #ifdef DEBUG | |
175 Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos); | |
176 if(fcount>30) { | |
177 break; | |
178 } | |
179 #endif | |
180 | |
181 } | |
182 } | |
183 bed_file.close(); | |
184 | |
185 | |
186 #ifdef DEBUG | |
187 Rprintf("done. read %d fragments\n",fcount); | |
188 #endif | |
189 | |
190 Rprintf("done. read %d fragments\n",fcount); | |
191 | |
192 // construct output structures | |
193 SEXP chnames; | |
194 int np=0; // number of protections | |
195 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
196 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
197 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
198 } | |
199 np++; | |
200 | |
201 // sort | |
202 for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
203 sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
204 } | |
205 | |
206 SEXP ans; | |
207 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
208 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
209 SEXP nv; | |
210 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
211 int* i_nv=INTEGER(nv); | |
212 int i=0; | |
213 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
214 i_nv[i++]=*pi; | |
215 } | |
216 SET_VECTOR_ELT(ans, csi-pos.begin(), nv); | |
217 } | |
218 | |
219 setAttrib(ans,R_NamesSymbol,chnames); | |
220 | |
221 #ifdef DEBUG | |
222 Rprintf("unprotecting %d elements\n",np); | |
223 #endif | |
224 | |
225 UNPROTECT(np); | |
226 return(ans); | |
227 } | |
228 | |
229 | |
230 | |
231 SEXP read_meland_old(SEXP filename) { | |
232 | |
233 #ifdef DEBUG | |
234 Rprintf("start\n"); | |
235 #endif | |
236 const char* fname=CHAR(asChar(filename)); | |
237 #ifdef DEBUG | |
238 Rprintf("fname=%s\n",fname); | |
239 #endif | |
240 | |
241 // main data vector | |
242 // chr - pos | |
243 vector< vector<int> > pos; | |
244 vector< vector<int> > posnm; // number of mismatches | |
245 vector< vector<int> > poslen; // length | |
246 | |
247 // chromosome map | |
248 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
249 vector<string> cnames; | |
250 | |
251 | |
252 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
253 boost::char_separator<char> sep(" \t"); | |
254 | |
255 | |
256 ifstream bed_file(fname); | |
257 | |
258 Rprintf("opened %s\n",fname); | |
259 | |
260 // read in bed line | |
261 string line; | |
262 | |
263 int fcount=0; | |
264 while(getline(bed_file,line)) { | |
265 | |
266 #ifdef DEBUG | |
267 Rprintf("line: %s\n",line.c_str()); | |
268 #endif | |
269 | |
270 | |
271 tokType tok(line, sep); | |
272 tokType::iterator sit=tok.begin(); | |
273 if(sit!=tok.end()) { | |
274 sit++; sit++; | |
275 string str_nm=*sit++; | |
276 int nm=0; | |
277 if(str_nm[0]=='U') { | |
278 nm=atoi((str_nm.c_str()+1)); | |
279 } else { | |
280 continue; | |
281 } | |
282 sit++; sit++; sit++; | |
283 string str_len=*sit++; | |
284 int len=atoi(str_len.c_str()); | |
285 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); | |
286 string str_pos=*sit++; | |
287 int fpos=atoi(str_pos.c_str()); | |
288 | |
289 // determine the chromosome index | |
290 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
291 int cind=-1; | |
292 if(li==cind_map.end()) { | |
293 // register new chromosome | |
294 cind=cnames.size(); | |
295 cnames.push_back(chr); | |
296 cind_map[chr]=cind; | |
297 // allocate new pos vector | |
298 pos.push_back(vector<int>()); | |
299 posnm.push_back(vector<int>()); | |
300 poslen.push_back(vector<int>()); | |
301 #ifdef DEBUG | |
302 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
303 #endif | |
304 } else { | |
305 cind=li->second; | |
306 } | |
307 fcount++; | |
308 (pos[cind]).push_back(fpos); | |
309 (posnm[cind]).push_back(nm); | |
310 (poslen[cind]).push_back(len); | |
311 #ifdef DEBUG | |
312 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
313 if(fcount>30) { | |
314 break; | |
315 } | |
316 #endif | |
317 | |
318 } | |
319 } | |
320 bed_file.close(); | |
321 | |
322 | |
323 #ifdef DEBUG | |
324 Rprintf("done. read %d fragments\n",fcount); | |
325 #endif | |
326 | |
327 Rprintf("done. read %d fragments\n",fcount); | |
328 | |
329 // construct output structures | |
330 SEXP chnames; | |
331 int np=0; // number of protections | |
332 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
333 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
334 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
335 } | |
336 np++; | |
337 | |
338 // sort | |
339 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
340 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
341 //} | |
342 | |
343 SEXP ans; | |
344 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
345 vector<vector<int> >::const_iterator nsi,lsi; | |
346 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
347 nsi=posnm.begin()+(csi-pos.begin()); | |
348 lsi=poslen.begin()+(csi-pos.begin()); | |
349 | |
350 SEXP dv,dnames_R; | |
351 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; | |
352 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
353 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
354 SET_STRING_ELT(dnames_R, 2, mkChar("l")); | |
355 | |
356 | |
357 | |
358 SEXP tv,nv,lv; | |
359 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
360 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
361 PROTECT(lv=allocVector(INTSXP,csi->size())); np++; | |
362 int* i_tv=INTEGER(tv); | |
363 int* i_nv=INTEGER(nv); | |
364 int* i_lv=INTEGER(lv); | |
365 | |
366 int i=0; | |
367 vector<int>::const_iterator ini=nsi->begin(); | |
368 vector<int>::const_iterator ili=lsi->begin(); | |
369 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
370 i_tv[i]=*pi; | |
371 i_nv[i]=*ini++; | |
372 i_lv[i]=*ili++; | |
373 i++; | |
374 } | |
375 PROTECT(dv = allocVector(VECSXP, 3)); np++; | |
376 SET_VECTOR_ELT(dv, 0, tv); | |
377 SET_VECTOR_ELT(dv, 1, nv); | |
378 SET_VECTOR_ELT(dv, 2, lv); | |
379 setAttrib(dv, R_NamesSymbol, dnames_R); | |
380 | |
381 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
382 } | |
383 | |
384 setAttrib(ans,R_NamesSymbol,chnames); | |
385 | |
386 #ifdef DEBUG | |
387 Rprintf("unprotecting %d elements\n",np); | |
388 #endif | |
389 | |
390 UNPROTECT(np); | |
391 return(ans); | |
392 } | |
393 | |
394 | |
395 int get_a_line(FILE *f,string& line) { | |
396 line=""; | |
397 char cline[1024]; | |
398 if(fgets(cline,1024,f)) { | |
399 line+=cline; | |
400 return(1); | |
401 } else { | |
402 return(0); | |
403 } | |
404 } | |
405 | |
406 | |
407 SEXP read_meland(SEXP filename,SEXP read_tag_names_R) { | |
408 | |
409 #ifdef DEBUG | |
410 Rprintf("start\n"); | |
411 #endif | |
412 const char* fname=CHAR(asChar(filename)); | |
413 int read_names=*(INTEGER(read_tag_names_R)); | |
414 #ifdef DEBUG | |
415 Rprintf("fname=%s\n",fname); | |
416 #endif | |
417 | |
418 // main data vector | |
419 // chr - pos | |
420 vector< vector<int> > pos; | |
421 vector< vector<int> > posnm; // number of mismatches | |
422 vector< vector<int> > poslen; // length | |
423 vector< vector<string> > tagnames; | |
424 | |
425 // chromosome map | |
426 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
427 vector<string> cnames; | |
428 | |
429 | |
430 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
431 boost::char_separator<char> sep(" \t"); | |
432 | |
433 | |
434 FILE *f=fopen(fname,"rb"); | |
435 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
436 | |
437 Rprintf("opened %s\n",fname); | |
438 | |
439 | |
440 // read in bed line | |
441 string line; | |
442 int fcount=0; | |
443 while(get_a_line(f,line)) { | |
444 | |
445 #ifdef DEBUG | |
446 Rprintf("line: %s\n",line.c_str()); | |
447 #endif | |
448 | |
449 | |
450 tokType tok(line, sep); | |
451 tokType::iterator sit=tok.begin(); | |
452 if(sit!=tok.end()) { | |
453 string tagname=*sit++; | |
454 sit++; | |
455 string str_nm=*sit++; | |
456 int nm=0; | |
457 if(str_nm[0]=='U') { | |
458 nm=atoi((str_nm.c_str()+1)); | |
459 } else { | |
460 continue; | |
461 } | |
462 sit++; sit++; sit++; | |
463 string str_len=*sit++; | |
464 int len=atoi(str_len.c_str()); | |
465 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); | |
466 string str_pos=*sit++; | |
467 int fpos=atoi(str_pos.c_str()); | |
468 | |
469 // determine the chromosome index | |
470 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
471 int cind=-1; | |
472 if(li==cind_map.end()) { | |
473 // register new chromosome | |
474 cind=cnames.size(); | |
475 cnames.push_back(chr); | |
476 cind_map[chr]=cind; | |
477 // allocate new pos vector | |
478 pos.push_back(vector<int>()); | |
479 posnm.push_back(vector<int>()); | |
480 poslen.push_back(vector<int>()); | |
481 if(read_names) { | |
482 tagnames.push_back(vector<string>()); | |
483 } | |
484 #ifdef DEBUG | |
485 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
486 #endif | |
487 } else { | |
488 cind=li->second; | |
489 } | |
490 fcount++; | |
491 (pos[cind]).push_back(fpos); | |
492 (posnm[cind]).push_back(nm); | |
493 (poslen[cind]).push_back(len); | |
494 if(read_names) { | |
495 (tagnames[cind]).push_back(tagname); | |
496 } | |
497 #ifdef DEBUG | |
498 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
499 if(fcount>30) { | |
500 break; | |
501 } | |
502 #endif | |
503 | |
504 } | |
505 } | |
506 fclose(f); | |
507 | |
508 | |
509 #ifdef DEBUG | |
510 Rprintf("done. read %d fragments\n",fcount); | |
511 #endif | |
512 | |
513 Rprintf("done. read %d fragments\n",fcount); | |
514 | |
515 // construct output structures | |
516 SEXP chnames; | |
517 int np=0; // number of protections | |
518 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
519 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
520 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
521 } | |
522 np++; | |
523 | |
524 // sort | |
525 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
526 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
527 //} | |
528 | |
529 SEXP ans; | |
530 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
531 vector<vector<int> >::const_iterator nsi,lsi; | |
532 vector<vector<string> >::const_iterator ssi; | |
533 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
534 nsi=posnm.begin()+(csi-pos.begin()); | |
535 lsi=poslen.begin()+(csi-pos.begin()); | |
536 | |
537 SEXP dv,dnames_R; | |
538 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; | |
539 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
540 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
541 SET_STRING_ELT(dnames_R, 2, mkChar("l")); | |
542 if(read_names) { | |
543 SET_STRING_ELT(dnames_R, 3, mkChar("s")); | |
544 } | |
545 | |
546 | |
547 | |
548 SEXP tv,nv,lv,sv; | |
549 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
550 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
551 PROTECT(lv=allocVector(INTSXP,csi->size())); np++; | |
552 if(read_names) { | |
553 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
554 } | |
555 int* i_tv=INTEGER(tv); | |
556 int* i_nv=INTEGER(nv); | |
557 int* i_lv=INTEGER(lv); | |
558 | |
559 int i=0; | |
560 vector<int>::const_iterator ini=nsi->begin(); | |
561 vector<int>::const_iterator ili=lsi->begin(); | |
562 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
563 i_tv[i]=*pi; | |
564 i_nv[i]=*ini++; | |
565 i_lv[i]=*ili++; | |
566 i++; | |
567 } | |
568 if(read_names) { | |
569 int i=0; | |
570 ssi=tagnames.begin()+(csi-pos.begin()); | |
571 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
572 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
573 i++; | |
574 } | |
575 } | |
576 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; | |
577 SET_VECTOR_ELT(dv, 0, tv); | |
578 SET_VECTOR_ELT(dv, 1, nv); | |
579 SET_VECTOR_ELT(dv, 2, lv); | |
580 if(read_names) { | |
581 SET_VECTOR_ELT(dv, 3, sv); | |
582 } | |
583 setAttrib(dv, R_NamesSymbol, dnames_R); | |
584 | |
585 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
586 } | |
587 | |
588 setAttrib(ans,R_NamesSymbol,chnames); | |
589 | |
590 #ifdef DEBUG | |
591 Rprintf("unprotecting %d elements\n",np); | |
592 #endif | |
593 | |
594 UNPROTECT(np); | |
595 return(ans); | |
596 } | |
597 | |
598 | |
599 | |
600 // reads regular eland files, recording mismatch positions | |
601 SEXP read_eland_mismatches(SEXP filename) { | |
602 | |
603 #ifdef DEBUG | |
604 Rprintf("start\n"); | |
605 #endif | |
606 const char* fname=CHAR(asChar(filename)); | |
607 #ifdef DEBUG | |
608 Rprintf("fname=%s\n",fname); | |
609 #endif | |
610 | |
611 // main data vector | |
612 // chr - pos | |
613 vector< vector<int> > pos; | |
614 vector< vector<int> > mm1; // position of the first mismatch (or 0 for none) | |
615 vector< vector<int> > mm2; // position of the second mismatch | |
616 | |
617 // chromosome map | |
618 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
619 vector<string> cnames; | |
620 | |
621 | |
622 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
623 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
624 | |
625 | |
626 FILE *f=fopen(fname,"rb"); | |
627 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
628 | |
629 Rprintf("opened %s\n",fname); | |
630 | |
631 // read in bed line | |
632 string line; | |
633 int fcount=0; | |
634 while(get_a_line(f,line)) { | |
635 | |
636 #ifdef DEBUG | |
637 Rprintf("line: %s\n",line.c_str()); | |
638 #endif | |
639 | |
640 | |
641 tokType tok(line, sep); | |
642 tokType::iterator sit=tok.begin(); | |
643 if(sit!=tok.end()) { | |
644 sit++; | |
645 string seq=*sit++; | |
646 string str_nm=*sit++; | |
647 int nm=0; | |
648 if(str_nm[0]=='U') { | |
649 nm=atoi((str_nm.c_str()+1)); | |
650 } else { | |
651 continue; | |
652 } | |
653 sit++; sit++; sit++; | |
654 string chr=*sit++; | |
655 // extract chromosome name from this | |
656 int chrp=chr.find("chr"); | |
657 int pp=chr.find('.'); | |
658 chr=chr.substr(chrp+3,pp-chrp-3); | |
659 | |
660 string str_pos=*sit++; | |
661 int fpos=atoi(str_pos.c_str()); | |
662 | |
663 | |
664 string strand=*sit++; | |
665 int nstrand=0; | |
666 if(strand=="R") { | |
667 fpos=-1*(fpos+seq.size()-1); | |
668 nstrand=1; | |
669 } | |
670 | |
671 sit++; | |
672 | |
673 int nm1=0; int nm2=0; | |
674 if(sit!=tok.end()) { | |
675 string nms=*sit++; | |
676 nm1=atoi(nms.substr(0,nms.size()-1).c_str()); | |
677 if(nstrand) { nm1=seq.size()-nm1+1; } | |
678 } | |
679 if(sit!=tok.end()) { | |
680 string nms=*sit++; | |
681 nm2=atoi(nms.substr(0,nms.size()-1).c_str()); | |
682 if(nstrand) { nm2=seq.size()-nm2+1; } | |
683 } | |
684 | |
685 // determine the chromosome index | |
686 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
687 int cind=-1; | |
688 if(li==cind_map.end()) { | |
689 // register new chromosome | |
690 cind=cnames.size(); | |
691 cnames.push_back(chr); | |
692 cind_map[chr]=cind; | |
693 // allocate new pos vector | |
694 pos.push_back(vector<int>()); | |
695 mm1.push_back(vector<int>()); | |
696 mm2.push_back(vector<int>()); | |
697 #ifdef DEBUG | |
698 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
699 #endif | |
700 } else { | |
701 cind=li->second; | |
702 } | |
703 fcount++; | |
704 (pos[cind]).push_back(fpos); | |
705 (mm1[cind]).push_back(nm1); | |
706 (mm2[cind]).push_back(nm2); | |
707 #ifdef DEBUG | |
708 Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2); | |
709 if(fcount>30) { | |
710 break; | |
711 } | |
712 #endif | |
713 | |
714 } | |
715 } | |
716 fclose(f); | |
717 | |
718 | |
719 #ifdef DEBUG | |
720 Rprintf("done. read %d fragments\n",fcount); | |
721 #endif | |
722 | |
723 Rprintf("done. read %d fragments\n",fcount); | |
724 | |
725 // construct output structures | |
726 SEXP chnames; | |
727 int np=0; // number of protections | |
728 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
729 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
730 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
731 } | |
732 np++; | |
733 | |
734 // sort | |
735 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
736 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
737 //} | |
738 | |
739 SEXP ans; | |
740 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
741 vector<vector<int> >::const_iterator nsi,lsi; | |
742 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
743 nsi=mm1.begin()+(csi-pos.begin()); | |
744 lsi=mm2.begin()+(csi-pos.begin()); | |
745 | |
746 SEXP dv,dnames_R; | |
747 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; | |
748 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
749 SET_STRING_ELT(dnames_R, 1, mkChar("f")); | |
750 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
751 | |
752 | |
753 | |
754 SEXP tv,nv,lv; | |
755 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
756 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
757 PROTECT(lv=allocVector(INTSXP,csi->size())); np++; | |
758 int* i_tv=INTEGER(tv); | |
759 int* i_nv=INTEGER(nv); | |
760 int* i_lv=INTEGER(lv); | |
761 | |
762 int i=0; | |
763 vector<int>::const_iterator ini=nsi->begin(); | |
764 vector<int>::const_iterator ili=lsi->begin(); | |
765 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
766 i_tv[i]=*pi; | |
767 i_nv[i]=*ini++; | |
768 i_lv[i]=*ili++; | |
769 i++; | |
770 } | |
771 PROTECT(dv = allocVector(VECSXP, 3)); np++; | |
772 SET_VECTOR_ELT(dv, 0, tv); | |
773 SET_VECTOR_ELT(dv, 1, nv); | |
774 SET_VECTOR_ELT(dv, 2, lv); | |
775 setAttrib(dv, R_NamesSymbol, dnames_R); | |
776 | |
777 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
778 } | |
779 | |
780 setAttrib(ans,R_NamesSymbol,chnames); | |
781 | |
782 #ifdef DEBUG | |
783 Rprintf("unprotecting %d elements\n",np); | |
784 #endif | |
785 | |
786 UNPROTECT(np); | |
787 return(ans); | |
788 } | |
789 | |
790 | |
791 // read in regular eland files, adjusting the negative strand coordinate by sequence length | |
792 SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { | |
793 | |
794 #ifdef DEBUG | |
795 Rprintf("start\n"); | |
796 #endif | |
797 const char* fname=CHAR(asChar(filename)); | |
798 int read_names=*(INTEGER(read_tag_names_R)); | |
799 int eland_tag_length=*(INTEGER(eland_tag_length_R)); | |
800 #ifdef DEBUG | |
801 Rprintf("fname=%s\n",fname); | |
802 #endif | |
803 | |
804 // main data vector | |
805 // chr - pos | |
806 vector< vector<int> > pos; | |
807 vector< vector<int> > posnm; // number of mismatches | |
808 vector< vector<string> > tagnames; | |
809 | |
810 // chromosome map | |
811 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
812 vector<string> cnames; | |
813 | |
814 | |
815 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
816 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
817 | |
818 | |
819 FILE *f=fopen(fname,"rb"); | |
820 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
821 else { | |
822 Rprintf("opened %s\n",fname); | |
823 | |
824 // read in bed line | |
825 string line; | |
826 int fcount=0; | |
827 while(get_a_line(f,line)) { | |
828 | |
829 #ifdef DEBUG | |
830 Rprintf("line: %s\n",line.c_str()); | |
831 #endif | |
832 | |
833 | |
834 tokType tok(line, sep); | |
835 tokType::iterator sit=tok.begin(); | |
836 if(sit!=tok.end()) { | |
837 string tagname=*sit++; | |
838 string sequence=*sit++; | |
839 int len=sequence.size(); | |
840 // adjust probe length if eland length limit was specified | |
841 if(eland_tag_length>0 && len>eland_tag_length) { | |
842 len=eland_tag_length; | |
843 } | |
844 string str_nm=*sit++; | |
845 int nm=0; | |
846 if(str_nm[0]=='U') { | |
847 nm=atoi((str_nm.c_str()+1)); | |
848 } else { | |
849 continue; | |
850 } | |
851 sit++; sit++; sit++; | |
852 string chr=*sit++; | |
853 string str_pos=*sit++; | |
854 int fpos=atoi(str_pos.c_str()); | |
855 string str_strand=*sit++; | |
856 | |
857 if(str_strand[0]=='R') { | |
858 fpos=-1*(fpos+len-1); | |
859 } | |
860 | |
861 // determine the chromosome index | |
862 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
863 int cind=-1; | |
864 if(li==cind_map.end()) { | |
865 // register new chromosome | |
866 cind=cnames.size(); | |
867 cnames.push_back(chr); | |
868 cind_map[chr]=cind; | |
869 // allocate new pos vector | |
870 pos.push_back(vector<int>()); | |
871 posnm.push_back(vector<int>()); | |
872 if(read_names) { | |
873 tagnames.push_back(vector<string>()); | |
874 } | |
875 #ifdef DEBUG | |
876 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
877 #endif | |
878 } else { | |
879 cind=li->second; | |
880 } | |
881 fcount++; | |
882 (pos[cind]).push_back(fpos); | |
883 (posnm[cind]).push_back(nm); | |
884 if(read_names) { | |
885 (tagnames[cind]).push_back(tagname); | |
886 } | |
887 #ifdef DEBUG | |
888 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
889 if(fcount>30) { | |
890 break; | |
891 } | |
892 #endif | |
893 | |
894 } | |
895 } | |
896 fclose(f); | |
897 | |
898 Rprintf("done. read %d fragments\n",fcount); | |
899 } | |
900 // construct output structures | |
901 SEXP chnames; | |
902 int np=0; // number of protections | |
903 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
904 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
905 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
906 } | |
907 np++; | |
908 | |
909 // sort | |
910 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
911 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
912 //} | |
913 | |
914 SEXP ans; | |
915 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
916 vector<vector<int> >::const_iterator nsi; | |
917 vector<vector<string> >::const_iterator ssi; | |
918 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
919 nsi=posnm.begin()+(csi-pos.begin()); | |
920 | |
921 SEXP dv,dnames_R; | |
922 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
923 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
924 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
925 if(read_names) { | |
926 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
927 } | |
928 | |
929 | |
930 | |
931 SEXP tv,nv,sv; | |
932 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
933 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
934 if(read_names) { | |
935 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
936 } | |
937 int* i_tv=INTEGER(tv); | |
938 int* i_nv=INTEGER(nv); | |
939 | |
940 int i=0; | |
941 vector<int>::const_iterator ini=nsi->begin(); | |
942 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
943 i_tv[i]=*pi; | |
944 i_nv[i]=*ini++; | |
945 i++; | |
946 } | |
947 if(read_names) { | |
948 int i=0; | |
949 ssi=tagnames.begin()+(csi-pos.begin()); | |
950 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
951 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
952 i++; | |
953 } | |
954 } | |
955 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
956 SET_VECTOR_ELT(dv, 0, tv); | |
957 SET_VECTOR_ELT(dv, 1, nv); | |
958 if(read_names) { | |
959 SET_VECTOR_ELT(dv, 2, sv); | |
960 } | |
961 setAttrib(dv, R_NamesSymbol, dnames_R); | |
962 | |
963 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
964 } | |
965 | |
966 setAttrib(ans,R_NamesSymbol,chnames); | |
967 | |
968 #ifdef DEBUG | |
969 Rprintf("unprotecting %d elements\n",np); | |
970 #endif | |
971 | |
972 UNPROTECT(np); | |
973 return(ans); | |
974 } | |
975 | |
976 | |
977 | |
978 // read in extended eland files, adjusting the negative strand coordinate by sequence length | |
979 SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { | |
980 | |
981 #ifdef DEBUG | |
982 Rprintf("start\n"); | |
983 #endif | |
984 const char* fname=CHAR(asChar(filename)); | |
985 int read_names=*(INTEGER(read_tag_names_R)); | |
986 int eland_tag_length=*(INTEGER(eland_tag_length_R)); | |
987 #ifdef DEBUG | |
988 Rprintf("fname=%s\n",fname); | |
989 #endif | |
990 | |
991 // main data vector | |
992 // chr - pos | |
993 vector< vector<int> > pos; | |
994 vector< vector<int> > posnm; // number of mismatches | |
995 vector< vector<string> > tagnames; | |
996 | |
997 // chromosome map | |
998 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
999 vector<string> cnames; | |
1000 | |
1001 | |
1002 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
1003 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
1004 | |
1005 | |
1006 FILE *f=fopen(fname,"rb"); | |
1007 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
1008 else { | |
1009 Rprintf("opened %s\n",fname); | |
1010 | |
1011 // read in bed line | |
1012 string line; | |
1013 int fcount=0; | |
1014 while(get_a_line(f,line)) { | |
1015 | |
1016 #ifdef DEBUG | |
1017 Rprintf("line: %s\n",line.c_str()); | |
1018 #endif | |
1019 | |
1020 | |
1021 tokType tok(line, sep); | |
1022 tokType::iterator sit=tok.begin(); | |
1023 if(sit!=tok.end()) { | |
1024 string machinename=*sit++; | |
1025 string runnumber=*sit++; | |
1026 string lanenumber=*sit++; | |
1027 *sit++; | |
1028 | |
1029 string str_x=*sit++; | |
1030 string str_y=*sit++; | |
1031 | |
1032 string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y; | |
1033 | |
1034 | |
1035 | |
1036 *sit++; | |
1037 *sit++; | |
1038 | |
1039 | |
1040 string sequence=*sit++; | |
1041 *sit++; | |
1042 | |
1043 string chr=*sit++; | |
1044 string contig=*sit++; | |
1045 chr=chr+contig; | |
1046 | |
1047 int len=sequence.size(); | |
1048 // adjust probe length if eland length limit was specified | |
1049 if(eland_tag_length>0 && len>eland_tag_length) { | |
1050 len=eland_tag_length; | |
1051 } | |
1052 | |
1053 | |
1054 | |
1055 string str_pos=*sit++; | |
1056 if(str_pos.size()<1) { continue; } | |
1057 int fpos=atoi(str_pos.c_str()); | |
1058 string str_strand=*sit++; | |
1059 | |
1060 if(str_strand[0]=='R') { | |
1061 fpos=-1*(fpos+len-1); | |
1062 } | |
1063 | |
1064 string str_nm=*sit++; | |
1065 // count non-digit characters | |
1066 int nm=0; | |
1067 for(int i=0;i<str_nm.size();i++) { | |
1068 if(!isdigit(str_nm[i])) { nm++; } | |
1069 } | |
1070 | |
1071 // determine the chromosome index | |
1072 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
1073 int cind=-1; | |
1074 if(li==cind_map.end()) { | |
1075 // register new chromosome | |
1076 cind=cnames.size(); | |
1077 cnames.push_back(chr); | |
1078 cind_map[chr]=cind; | |
1079 // allocate new pos vector | |
1080 pos.push_back(vector<int>()); | |
1081 posnm.push_back(vector<int>()); | |
1082 if(read_names) { | |
1083 tagnames.push_back(vector<string>()); | |
1084 } | |
1085 #ifdef DEBUG | |
1086 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
1087 #endif | |
1088 } else { | |
1089 cind=li->second; | |
1090 } | |
1091 fcount++; | |
1092 (pos[cind]).push_back(fpos); | |
1093 (posnm[cind]).push_back(nm); | |
1094 if(read_names) { | |
1095 (tagnames[cind]).push_back(tagname); | |
1096 } | |
1097 #ifdef DEBUG | |
1098 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
1099 if(fcount>30) { | |
1100 break; | |
1101 } | |
1102 #endif | |
1103 | |
1104 } | |
1105 } | |
1106 fclose(f); | |
1107 | |
1108 Rprintf("done. read %d fragments\n",fcount); | |
1109 } | |
1110 // construct output structures | |
1111 SEXP chnames; | |
1112 int np=0; // number of protections | |
1113 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
1114 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
1115 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
1116 } | |
1117 np++; | |
1118 | |
1119 // sort | |
1120 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1121 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
1122 //} | |
1123 | |
1124 SEXP ans; | |
1125 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
1126 vector<vector<int> >::const_iterator nsi; | |
1127 vector<vector<string> >::const_iterator ssi; | |
1128 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1129 nsi=posnm.begin()+(csi-pos.begin()); | |
1130 | |
1131 SEXP dv,dnames_R; | |
1132 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
1133 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
1134 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
1135 if(read_names) { | |
1136 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
1137 } | |
1138 | |
1139 | |
1140 | |
1141 SEXP tv,nv,sv; | |
1142 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
1143 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
1144 if(read_names) { | |
1145 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
1146 } | |
1147 int* i_tv=INTEGER(tv); | |
1148 int* i_nv=INTEGER(nv); | |
1149 | |
1150 int i=0; | |
1151 vector<int>::const_iterator ini=nsi->begin(); | |
1152 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
1153 i_tv[i]=*pi; | |
1154 i_nv[i]=*ini++; | |
1155 i++; | |
1156 } | |
1157 if(read_names) { | |
1158 int i=0; | |
1159 ssi=tagnames.begin()+(csi-pos.begin()); | |
1160 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
1161 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
1162 i++; | |
1163 } | |
1164 } | |
1165 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
1166 SET_VECTOR_ELT(dv, 0, tv); | |
1167 SET_VECTOR_ELT(dv, 1, nv); | |
1168 if(read_names) { | |
1169 SET_VECTOR_ELT(dv, 2, sv); | |
1170 } | |
1171 setAttrib(dv, R_NamesSymbol, dnames_R); | |
1172 | |
1173 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
1174 } | |
1175 | |
1176 setAttrib(ans,R_NamesSymbol,chnames); | |
1177 | |
1178 #ifdef DEBUG | |
1179 Rprintf("unprotecting %d elements\n",np); | |
1180 #endif | |
1181 | |
1182 UNPROTECT(np); | |
1183 return(ans); | |
1184 } | |
1185 | |
1186 | |
1187 // read in eland multi files, adjusting the negative strand coordinate by sequence length | |
1188 SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { | |
1189 | |
1190 #ifdef DEBUG | |
1191 Rprintf("read_eland_muti() : start\n"); | |
1192 #endif | |
1193 const char* fname=CHAR(asChar(filename)); | |
1194 int read_names=*(INTEGER(read_tag_names_R)); | |
1195 int eland_tag_length=*(INTEGER(eland_tag_length_R)); | |
1196 #ifdef DEBUG | |
1197 Rprintf("fname=%s\n",fname); | |
1198 #endif | |
1199 | |
1200 // main data vector | |
1201 // chr - pos | |
1202 vector< vector<int> > pos; | |
1203 vector< vector<int> > posnm; // number of mismatches | |
1204 vector< vector<string> > tagnames; | |
1205 | |
1206 // chromosome map | |
1207 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
1208 vector<string> cnames; | |
1209 | |
1210 | |
1211 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
1212 boost::char_separator<char> sep(" \t",""); | |
1213 boost::char_separator<char> comsep(",","",boost::keep_empty_tokens); | |
1214 boost::char_separator<char> colsep(":","",boost::keep_empty_tokens); | |
1215 | |
1216 | |
1217 FILE *f=fopen(fname,"rb"); | |
1218 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
1219 else { | |
1220 Rprintf("opened %s\n",fname); | |
1221 | |
1222 // read in bed line | |
1223 string line; | |
1224 int nline=0; | |
1225 int fcount=0; | |
1226 while(get_a_line(f,line)) { | |
1227 nline++; | |
1228 // chomp | |
1229 size_t elpos = line.find_last_not_of("\n"); | |
1230 if(elpos != string::npos) { | |
1231 line = line.substr(0, elpos+1); | |
1232 } | |
1233 #ifdef DEBUG | |
1234 Rprintf("line %d: %s\n",nline,line.c_str()); | |
1235 #endif | |
1236 | |
1237 tokType tok(line, sep); | |
1238 tokType::iterator sit=tok.begin(); | |
1239 if(sit!=tok.end()) { | |
1240 string tagname=*sit++; | |
1241 string sequence=*sit++; | |
1242 string mspec=*sit++; | |
1243 // parse out match spec | |
1244 | |
1245 if(mspec=="NM" || mspec=="QC") { continue; } | |
1246 #ifdef DEBUG | |
1247 Rprintf("parsing out spec \"%s\" : ",mspec.c_str()); | |
1248 #endif | |
1249 | |
1250 tokType stok(mspec, colsep); | |
1251 tokType::iterator ssit=stok.begin(); | |
1252 string str_nm0=*ssit++; | |
1253 | |
1254 int nm=0; | |
1255 int nm0=atoi(str_nm0.c_str()); | |
1256 if(nm0>1) { | |
1257 #ifdef DEBUG | |
1258 Rprintf("rejected for nm0\n"); | |
1259 #endif | |
1260 continue; | |
1261 } | |
1262 if(nm0==0) { | |
1263 string str_nm1=*ssit++; | |
1264 int nm1=atoi(str_nm1.c_str()); | |
1265 if(nm1>1) { | |
1266 #ifdef DEBUG | |
1267 Rprintf("rejected for nm1\n"); | |
1268 #endif | |
1269 continue; | |
1270 } | |
1271 if(nm1==0) { | |
1272 string str_nm2=*ssit++; | |
1273 int nm2=atoi(str_nm2.c_str()); | |
1274 if(nm2>1) { | |
1275 #ifdef DEBUG | |
1276 Rprintf("rejected for nm2\n"); | |
1277 #endif | |
1278 continue; | |
1279 } | |
1280 nm=2; | |
1281 } else { | |
1282 nm=1; | |
1283 } | |
1284 } | |
1285 | |
1286 #ifdef DEBUG | |
1287 Rprintf("accepted (nm=%d)\n",nm); | |
1288 #endif | |
1289 int npos=0; | |
1290 string mpos=*sit++; | |
1291 vector<string> mposc; | |
1292 vector<int> mposp; | |
1293 tokType ptok(mpos, comsep); | |
1294 string prevchr; | |
1295 for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) { | |
1296 string cpos=*psit; | |
1297 npos++; | |
1298 int strand=1; | |
1299 if(cpos.size()<5) { | |
1300 Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str()); | |
1301 } | |
1302 char lc=cpos.at(cpos.size()-1); | |
1303 | |
1304 if(atoi(&lc)==nm) { | |
1305 switch(cpos.at(cpos.size()-2)) { | |
1306 case 'R': strand=-1; break; | |
1307 case 'F': strand=1; break; | |
1308 default: | |
1309 Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break; | |
1310 continue; | |
1311 } | |
1312 string chr,str_pos; | |
1313 size_t colpos=cpos.find(":"); | |
1314 if(colpos==string::npos) { | |
1315 if(npos>1) { | |
1316 chr=prevchr; | |
1317 str_pos=cpos.substr(0,cpos.size()-2); | |
1318 } else { | |
1319 Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str()); | |
1320 continue; | |
1321 } | |
1322 } else { | |
1323 chr=cpos.substr(0,colpos); | |
1324 str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos); | |
1325 } | |
1326 #ifdef DEBUG | |
1327 Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand); | |
1328 #endif | |
1329 int pos=strand*atoi(str_pos.c_str()); | |
1330 mposc.push_back(chr); | |
1331 mposp.push_back(pos); | |
1332 } | |
1333 } | |
1334 | |
1335 string chr; | |
1336 int fpos; | |
1337 if(mposc.size()!=1) { | |
1338 if(mposc.size()==0) { | |
1339 Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str()); | |
1340 } else { | |
1341 Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str()); | |
1342 } | |
1343 continue; | |
1344 } else { | |
1345 chr=*mposc.begin(); | |
1346 fpos=*mposp.begin(); | |
1347 } | |
1348 | |
1349 int len=sequence.size(); | |
1350 // adjust probe length if eland length limit was specified | |
1351 if(eland_tag_length>0 && len>eland_tag_length) { | |
1352 len=eland_tag_length; | |
1353 } | |
1354 | |
1355 if(fpos<0) { | |
1356 fpos=-1*(-1*fpos+len-1); | |
1357 } | |
1358 | |
1359 // determine the chromosome index | |
1360 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
1361 int cind=-1; | |
1362 if(li==cind_map.end()) { | |
1363 // register new chromosome | |
1364 cind=cnames.size(); | |
1365 cnames.push_back(chr); | |
1366 cind_map[chr]=cind; | |
1367 // allocate new pos vector | |
1368 pos.push_back(vector<int>()); | |
1369 posnm.push_back(vector<int>()); | |
1370 if(read_names) { | |
1371 tagnames.push_back(vector<string>()); | |
1372 } | |
1373 #ifdef DEBUG | |
1374 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
1375 #endif | |
1376 } else { | |
1377 cind=li->second; | |
1378 } | |
1379 fcount++; | |
1380 (pos[cind]).push_back(fpos); | |
1381 (posnm[cind]).push_back(nm); | |
1382 if(read_names) { | |
1383 (tagnames[cind]).push_back(tagname); | |
1384 } | |
1385 #ifdef DEBUG | |
1386 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
1387 if(fcount>30) { | |
1388 break; | |
1389 } | |
1390 #endif | |
1391 | |
1392 } | |
1393 } | |
1394 fclose(f); | |
1395 | |
1396 Rprintf("done. read %d fragments\n",fcount); | |
1397 } | |
1398 // construct output structures | |
1399 SEXP chnames; | |
1400 int np=0; // number of protections | |
1401 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
1402 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
1403 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
1404 } | |
1405 np++; | |
1406 | |
1407 // sort | |
1408 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1409 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
1410 //} | |
1411 | |
1412 SEXP ans; | |
1413 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
1414 vector<vector<int> >::const_iterator nsi; | |
1415 vector<vector<string> >::const_iterator ssi; | |
1416 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1417 nsi=posnm.begin()+(csi-pos.begin()); | |
1418 | |
1419 SEXP dv,dnames_R; | |
1420 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
1421 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
1422 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
1423 if(read_names) { | |
1424 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
1425 } | |
1426 | |
1427 | |
1428 | |
1429 SEXP tv,nv,sv; | |
1430 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
1431 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
1432 if(read_names) { | |
1433 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
1434 } | |
1435 int* i_tv=INTEGER(tv); | |
1436 int* i_nv=INTEGER(nv); | |
1437 | |
1438 int i=0; | |
1439 vector<int>::const_iterator ini=nsi->begin(); | |
1440 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
1441 i_tv[i]=*pi; | |
1442 i_nv[i]=*ini++; | |
1443 i++; | |
1444 } | |
1445 if(read_names) { | |
1446 int i=0; | |
1447 ssi=tagnames.begin()+(csi-pos.begin()); | |
1448 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
1449 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
1450 i++; | |
1451 } | |
1452 } | |
1453 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
1454 SET_VECTOR_ELT(dv, 0, tv); | |
1455 SET_VECTOR_ELT(dv, 1, nv); | |
1456 if(read_names) { | |
1457 SET_VECTOR_ELT(dv, 2, sv); | |
1458 } | |
1459 setAttrib(dv, R_NamesSymbol, dnames_R); | |
1460 | |
1461 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
1462 } | |
1463 | |
1464 setAttrib(ans,R_NamesSymbol,chnames); | |
1465 | |
1466 #ifdef DEBUG | |
1467 Rprintf("unprotecting %d elements\n",np); | |
1468 #endif | |
1469 | |
1470 UNPROTECT(np); | |
1471 return(ans); | |
1472 } | |
1473 | |
1474 | |
1475 // read in regular eland files, adjusting the negative strand coordinate by sequence length | |
1476 SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) { | |
1477 | |
1478 #ifdef DEBUG | |
1479 Rprintf("start\n"); | |
1480 #endif | |
1481 const char* fname=CHAR(asChar(filename)); | |
1482 int read_names=*(INTEGER(read_tag_names_R)); | |
1483 #ifdef DEBUG | |
1484 Rprintf("fname=%s\n",fname); | |
1485 #endif | |
1486 | |
1487 // main data vector | |
1488 // chr - pos | |
1489 vector< vector<int> > pos; | |
1490 vector< vector<int> > posnm; // number of mismatches | |
1491 vector< vector<string> > tagnames; | |
1492 | |
1493 // chromosome map | |
1494 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
1495 vector<string> cnames; | |
1496 | |
1497 | |
1498 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
1499 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
1500 boost::char_separator<char> sep2(","); | |
1501 | |
1502 | |
1503 FILE *f=fopen(fname,"rb"); | |
1504 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; | |
1505 } else { | |
1506 #ifdef HAVE_LIBBZ2 | |
1507 BZFILE* b; | |
1508 int bzerror; | |
1509 | |
1510 int bz2file=0; | |
1511 if(strstr(fname,".bz2")) { | |
1512 bz2file=1; | |
1513 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); | |
1514 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } | |
1515 } | |
1516 #endif | |
1517 | |
1518 Rprintf("opened %s\n",fname); | |
1519 | |
1520 // read in bed line | |
1521 string line; | |
1522 int fcount=0; | |
1523 #ifdef HAVE_LIBBZ2 | |
1524 while(get_a_line(f,b,bz2file,line)) { | |
1525 #else | |
1526 while(get_a_line(f,line)) { | |
1527 #endif | |
1528 | |
1529 #ifdef DEBUG | |
1530 Rprintf("line: %s\n",line.c_str()); | |
1531 #endif | |
1532 | |
1533 | |
1534 tokType tok(line, sep); | |
1535 tokType::iterator sit=tok.begin(); | |
1536 if(sit!=tok.end()) { | |
1537 string tagname=*sit++; | |
1538 string str_strand=*sit++; | |
1539 string chr=*sit++; | |
1540 | |
1541 string str_pos=*sit++; | |
1542 int fpos=atoi(str_pos.c_str()); | |
1543 | |
1544 string sequence=*sit++; | |
1545 sit++; sit++; | |
1546 string mm=*sit++; | |
1547 | |
1548 int len=sequence.size(); | |
1549 if(str_strand[0]=='-') { | |
1550 fpos=-1*(fpos+len-1); | |
1551 } | |
1552 // determine number of mismatches | |
1553 int nm=0; | |
1554 if(mm.size()>0) { | |
1555 nm++; | |
1556 string::size_type tp(0); | |
1557 while(tp!=string::npos) { | |
1558 tp = mm.find(",",tp); | |
1559 if(tp!=string::npos) { | |
1560 tp++; | |
1561 ++nm; | |
1562 } | |
1563 } | |
1564 } | |
1565 | |
1566 | |
1567 | |
1568 // determine the chromosome index | |
1569 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
1570 int cind=-1; | |
1571 if(li==cind_map.end()) { | |
1572 // register new chromosome | |
1573 cind=cnames.size(); | |
1574 cnames.push_back(chr); | |
1575 cind_map[chr]=cind; | |
1576 // allocate new pos vector | |
1577 pos.push_back(vector<int>()); | |
1578 posnm.push_back(vector<int>()); | |
1579 if(read_names) { | |
1580 tagnames.push_back(vector<string>()); | |
1581 } | |
1582 #ifdef DEBUG | |
1583 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
1584 #endif | |
1585 } else { | |
1586 cind=li->second; | |
1587 } | |
1588 fcount++; | |
1589 (pos[cind]).push_back(fpos); | |
1590 (posnm[cind]).push_back(nm); | |
1591 if(read_names) { | |
1592 (tagnames[cind]).push_back(tagname); | |
1593 } | |
1594 #ifdef DEBUG | |
1595 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
1596 if(fcount>30) { | |
1597 break; | |
1598 } | |
1599 #endif | |
1600 | |
1601 } | |
1602 } | |
1603 | |
1604 #ifdef HAVE_LIBBZ2 | |
1605 BZ2_bzReadClose( &bzerror, b); | |
1606 #endif | |
1607 fclose(f); | |
1608 | |
1609 Rprintf("done. read %d fragments\n",fcount); | |
1610 } | |
1611 // construct output structures | |
1612 SEXP chnames; | |
1613 int np=0; // number of protections | |
1614 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
1615 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
1616 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
1617 } | |
1618 np++; | |
1619 | |
1620 // sort | |
1621 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1622 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
1623 //} | |
1624 | |
1625 SEXP ans; | |
1626 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
1627 vector<vector<int> >::const_iterator nsi; | |
1628 vector<vector<string> >::const_iterator ssi; | |
1629 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1630 nsi=posnm.begin()+(csi-pos.begin()); | |
1631 | |
1632 SEXP dv,dnames_R; | |
1633 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
1634 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
1635 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
1636 if(read_names) { | |
1637 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
1638 } | |
1639 | |
1640 | |
1641 | |
1642 SEXP tv,nv,sv; | |
1643 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
1644 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
1645 if(read_names) { | |
1646 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
1647 } | |
1648 int* i_tv=INTEGER(tv); | |
1649 int* i_nv=INTEGER(nv); | |
1650 | |
1651 int i=0; | |
1652 vector<int>::const_iterator ini=nsi->begin(); | |
1653 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
1654 i_tv[i]=*pi; | |
1655 i_nv[i]=*ini++; | |
1656 i++; | |
1657 } | |
1658 if(read_names) { | |
1659 int i=0; | |
1660 ssi=tagnames.begin()+(csi-pos.begin()); | |
1661 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
1662 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
1663 i++; | |
1664 } | |
1665 } | |
1666 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
1667 SET_VECTOR_ELT(dv, 0, tv); | |
1668 SET_VECTOR_ELT(dv, 1, nv); | |
1669 if(read_names) { | |
1670 SET_VECTOR_ELT(dv, 2, sv); | |
1671 } | |
1672 setAttrib(dv, R_NamesSymbol, dnames_R); | |
1673 | |
1674 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
1675 } | |
1676 | |
1677 setAttrib(ans,R_NamesSymbol,chnames); | |
1678 | |
1679 #ifdef DEBUG | |
1680 Rprintf("unprotecting %d elements\n",np); | |
1681 #endif | |
1682 | |
1683 UNPROTECT(np); | |
1684 return(ans); | |
1685 } | |
1686 | |
1687 | |
1688 // read in helicos tab-separated alignment output (regular or bz2) | |
1689 SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) { | |
1690 | |
1691 #ifdef DEBUG | |
1692 Rprintf("start\n"); | |
1693 #endif | |
1694 const char* fname=CHAR(asChar(filename)); | |
1695 int read_names=*(INTEGER(read_tag_names_R)); | |
1696 #ifdef DEBUG | |
1697 Rprintf("fname=%s\n",fname); | |
1698 #endif | |
1699 | |
1700 // main data vector | |
1701 // chr - pos | |
1702 vector< vector<int> > pos; | |
1703 vector< vector<int> > posnm; // number of mismatches | |
1704 vector< vector<int> > poslen; // length of the match | |
1705 vector< vector<string> > tagnames; | |
1706 | |
1707 // chromosome map | |
1708 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
1709 vector<string> cnames; | |
1710 | |
1711 | |
1712 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
1713 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
1714 boost::char_separator<char> sep2(","); | |
1715 | |
1716 | |
1717 FILE *f=fopen(fname,"rb"); | |
1718 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; | |
1719 } else { | |
1720 #ifdef HAVE_LIBBZ2 | |
1721 BZFILE* b; | |
1722 int bzerror; | |
1723 | |
1724 int bz2file=0; | |
1725 if(strstr(fname,".bz2")) { | |
1726 bz2file=1; | |
1727 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); | |
1728 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } | |
1729 } | |
1730 #endif | |
1731 | |
1732 Rprintf("opened %s\n",fname); | |
1733 | |
1734 // read in bed line | |
1735 string line; | |
1736 int fcount=0; | |
1737 int nlines=0; | |
1738 #ifdef HAVE_LIBBZ2 | |
1739 while(get_a_line(f,b,bz2file,line)) { | |
1740 #else | |
1741 while(get_a_line(f,line)) { | |
1742 #endif | |
1743 | |
1744 #ifdef DEBUG | |
1745 Rprintf("line: %s\n",line.c_str()); | |
1746 #endif | |
1747 nlines++; | |
1748 // skip comments | |
1749 if(line[0]=='#') { continue; } | |
1750 if(line.compare(0,12,"Reference_ID")==0) { | |
1751 #ifdef DEBUG | |
1752 Rprintf("matched header on line %d\n",nlines); | |
1753 #endif | |
1754 continue; | |
1755 } | |
1756 | |
1757 tokType tok(line, sep); | |
1758 tokType::iterator sit=tok.begin(); | |
1759 if(sit!=tok.end()) { | |
1760 string chr=*sit++; | |
1761 string tagname=*sit++; | |
1762 string str_startpos=*sit++; | |
1763 string str_endpos=*sit++; | |
1764 | |
1765 string str_tstart=*sit++; | |
1766 string str_tend=*sit++; | |
1767 int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str()); | |
1768 | |
1769 sit++; sit++; | |
1770 string str_ndel=*sit++; | |
1771 string str_nins=*sit++; | |
1772 string str_nsub=*sit++; | |
1773 | |
1774 string str_strand=*sit++; | |
1775 int fpos; | |
1776 if(str_strand[0]=='-') { | |
1777 fpos=-1*atoi(str_endpos.c_str()); | |
1778 } else { | |
1779 fpos=atoi(str_startpos.c_str()); | |
1780 } | |
1781 | |
1782 // determine number of mismatches | |
1783 int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str()); | |
1784 | |
1785 // determine the chromosome index | |
1786 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
1787 int cind=-1; | |
1788 if(li==cind_map.end()) { | |
1789 // register new chromosome | |
1790 cind=cnames.size(); | |
1791 cnames.push_back(chr); | |
1792 cind_map[chr]=cind; | |
1793 // allocate new pos vector | |
1794 pos.push_back(vector<int>()); | |
1795 posnm.push_back(vector<int>()); | |
1796 poslen.push_back(vector<int>()); | |
1797 if(read_names) { | |
1798 tagnames.push_back(vector<string>()); | |
1799 } | |
1800 #ifdef DEBUG | |
1801 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
1802 #endif | |
1803 } else { | |
1804 cind=li->second; | |
1805 } | |
1806 fcount++; | |
1807 (pos[cind]).push_back(fpos); | |
1808 (posnm[cind]).push_back(nm); | |
1809 (poslen[cind]).push_back(len); | |
1810 if(read_names) { | |
1811 (tagnames[cind]).push_back(tagname); | |
1812 } | |
1813 #ifdef DEBUG | |
1814 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm); | |
1815 if(fcount>30) { | |
1816 break; | |
1817 } | |
1818 #endif | |
1819 | |
1820 } | |
1821 } | |
1822 | |
1823 #ifdef HAVE_LIBBZ2 | |
1824 BZ2_bzReadClose( &bzerror, b); | |
1825 #endif | |
1826 fclose(f); | |
1827 | |
1828 Rprintf("done. read %d fragments\n",fcount); | |
1829 } | |
1830 // construct output structures | |
1831 SEXP chnames; | |
1832 int np=0; // number of protections | |
1833 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
1834 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
1835 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
1836 } | |
1837 np++; | |
1838 | |
1839 // sort | |
1840 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1841 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
1842 //} | |
1843 | |
1844 SEXP ans; | |
1845 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
1846 vector<vector<int> >::const_iterator nsi; | |
1847 vector<vector<int> >::const_iterator lsi; | |
1848 vector<vector<string> >::const_iterator ssi; | |
1849 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
1850 nsi=posnm.begin()+(csi-pos.begin()); | |
1851 lsi=poslen.begin()+(csi-pos.begin()); | |
1852 | |
1853 SEXP dv,dnames_R; | |
1854 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; | |
1855 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
1856 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
1857 SET_STRING_ELT(dnames_R, 2, mkChar("l")); | |
1858 if(read_names) { | |
1859 SET_STRING_ELT(dnames_R, 3, mkChar("s")); | |
1860 } | |
1861 | |
1862 | |
1863 | |
1864 SEXP tv,nv,lv,sv; | |
1865 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
1866 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
1867 PROTECT(lv=allocVector(INTSXP,csi->size())); np++; | |
1868 if(read_names) { | |
1869 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
1870 } | |
1871 int* i_tv=INTEGER(tv); | |
1872 int* i_nv=INTEGER(nv); | |
1873 int* i_lv=INTEGER(lv); | |
1874 | |
1875 int i=0; | |
1876 vector<int>::const_iterator ini=nsi->begin(); | |
1877 vector<int>::const_iterator lni=lsi->begin(); | |
1878 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
1879 i_tv[i]=*pi; | |
1880 i_nv[i]=*ini++; | |
1881 i_lv[i]=*lni++; | |
1882 i++; | |
1883 } | |
1884 if(read_names) { | |
1885 int i=0; | |
1886 ssi=tagnames.begin()+(csi-pos.begin()); | |
1887 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
1888 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
1889 i++; | |
1890 } | |
1891 } | |
1892 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; | |
1893 SET_VECTOR_ELT(dv, 0, tv); | |
1894 SET_VECTOR_ELT(dv, 1, nv); | |
1895 SET_VECTOR_ELT(dv, 2, lv); | |
1896 if(read_names) { | |
1897 SET_VECTOR_ELT(dv, 3, sv); | |
1898 } | |
1899 setAttrib(dv, R_NamesSymbol, dnames_R); | |
1900 | |
1901 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
1902 } | |
1903 | |
1904 setAttrib(ans,R_NamesSymbol,chnames); | |
1905 | |
1906 #ifdef DEBUG | |
1907 Rprintf("unprotecting %d elements\n",np); | |
1908 #endif | |
1909 | |
1910 UNPROTECT(np); | |
1911 return(ans); | |
1912 } | |
1913 | |
1914 | |
1915 | |
1916 // read in text version of maq map | |
1917 SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) { | |
1918 | |
1919 #ifdef DEBUG | |
1920 Rprintf("start\n"); | |
1921 #endif | |
1922 const char* fname=CHAR(asChar(filename)); | |
1923 int read_names=*(INTEGER(read_tag_names_R)); | |
1924 #ifdef DEBUG | |
1925 Rprintf("fname=%s\n",fname); | |
1926 #endif | |
1927 | |
1928 // main data vector | |
1929 // chr - pos | |
1930 vector< vector<int> > pos; | |
1931 vector< vector<int> > posnm; // number of mismatches | |
1932 vector< vector<string> > tagnames; | |
1933 | |
1934 // chromosome map | |
1935 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
1936 vector<string> cnames; | |
1937 | |
1938 | |
1939 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
1940 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
1941 | |
1942 | |
1943 FILE *f=fopen(fname,"rb"); | |
1944 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
1945 else { | |
1946 Rprintf("opened %s\n",fname); | |
1947 | |
1948 // read in bed line | |
1949 string line; | |
1950 int fcount=0; | |
1951 while(get_a_line(f,line)) { | |
1952 | |
1953 #ifdef DEBUG | |
1954 Rprintf("line: %s\n",line.c_str()); | |
1955 #endif | |
1956 | |
1957 | |
1958 tokType tok(line, sep); | |
1959 tokType::iterator sit=tok.begin(); | |
1960 if(sit!=tok.end()) { | |
1961 string tagname=*sit++; | |
1962 string chr=*sit++; | |
1963 string str_pos=*sit++; | |
1964 int fpos=atoi(str_pos.c_str()); | |
1965 string str_strand=*sit++; | |
1966 sit++; sit++; sit++; sit++; sit++; | |
1967 string str_nm=*sit++; | |
1968 sit++; sit++; sit++; | |
1969 string str_len=*sit++; | |
1970 int nm=atoi(str_nm.c_str()); | |
1971 int len=atoi(str_len.c_str()); | |
1972 | |
1973 if(str_strand[0]=='-') { | |
1974 fpos=-1*(fpos+len-1); | |
1975 } | |
1976 | |
1977 // determine the chromosome index | |
1978 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
1979 int cind=-1; | |
1980 if(li==cind_map.end()) { | |
1981 // register new chromosome | |
1982 cind=cnames.size(); | |
1983 cnames.push_back(chr); | |
1984 cind_map[chr]=cind; | |
1985 // allocate new pos vector | |
1986 pos.push_back(vector<int>()); | |
1987 posnm.push_back(vector<int>()); | |
1988 if(read_names) { | |
1989 tagnames.push_back(vector<string>()); | |
1990 } | |
1991 #ifdef DEBUG | |
1992 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
1993 #endif | |
1994 } else { | |
1995 cind=li->second; | |
1996 } | |
1997 fcount++; | |
1998 (pos[cind]).push_back(fpos); | |
1999 (posnm[cind]).push_back(nm); | |
2000 if(read_names) { | |
2001 (tagnames[cind]).push_back(tagname); | |
2002 } | |
2003 #ifdef DEBUG | |
2004 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
2005 if(fcount>30) { | |
2006 break; | |
2007 } | |
2008 #endif | |
2009 | |
2010 } | |
2011 } | |
2012 fclose(f); | |
2013 | |
2014 Rprintf("done. read %d fragments\n",fcount); | |
2015 } | |
2016 // construct output structures | |
2017 SEXP chnames; | |
2018 int np=0; // number of protections | |
2019 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
2020 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
2021 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
2022 } | |
2023 np++; | |
2024 | |
2025 // sort | |
2026 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2027 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
2028 //} | |
2029 | |
2030 SEXP ans; | |
2031 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
2032 vector<vector<int> >::const_iterator nsi; | |
2033 vector<vector<string> >::const_iterator ssi; | |
2034 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2035 nsi=posnm.begin()+(csi-pos.begin()); | |
2036 | |
2037 SEXP dv,dnames_R; | |
2038 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
2039 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
2040 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
2041 if(read_names) { | |
2042 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
2043 } | |
2044 | |
2045 | |
2046 | |
2047 SEXP tv,nv,sv; | |
2048 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
2049 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
2050 if(read_names) { | |
2051 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
2052 } | |
2053 int* i_tv=INTEGER(tv); | |
2054 int* i_nv=INTEGER(nv); | |
2055 | |
2056 int i=0; | |
2057 vector<int>::const_iterator ini=nsi->begin(); | |
2058 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
2059 i_tv[i]=*pi; | |
2060 i_nv[i]=*ini++; | |
2061 i++; | |
2062 } | |
2063 if(read_names) { | |
2064 int i=0; | |
2065 ssi=tagnames.begin()+(csi-pos.begin()); | |
2066 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
2067 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
2068 i++; | |
2069 } | |
2070 } | |
2071 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
2072 SET_VECTOR_ELT(dv, 0, tv); | |
2073 SET_VECTOR_ELT(dv, 1, nv); | |
2074 if(read_names) { | |
2075 SET_VECTOR_ELT(dv, 2, sv); | |
2076 } | |
2077 setAttrib(dv, R_NamesSymbol, dnames_R); | |
2078 | |
2079 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
2080 } | |
2081 | |
2082 setAttrib(ans,R_NamesSymbol,chnames); | |
2083 | |
2084 #ifdef DEBUG | |
2085 Rprintf("unprotecting %d elements\n",np); | |
2086 #endif | |
2087 | |
2088 UNPROTECT(np); | |
2089 return(ans); | |
2090 } | |
2091 | |
2092 | |
2093 | |
2094 | |
2095 | |
2096 // read in tagalign file | |
2097 SEXP read_tagalign(SEXP filename) { | |
2098 | |
2099 #ifdef DEBUG | |
2100 Rprintf("start\n"); | |
2101 #endif | |
2102 const char* fname=CHAR(asChar(filename)); | |
2103 #ifdef DEBUG | |
2104 Rprintf("fname=%s\n",fname); | |
2105 #endif | |
2106 | |
2107 // main data vector | |
2108 // chr - pos | |
2109 vector< vector<int> > pos; | |
2110 vector< vector<int> > posnm; // number of mismatches | |
2111 | |
2112 // chromosome map | |
2113 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
2114 vector<string> cnames; | |
2115 | |
2116 | |
2117 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
2118 boost::char_separator<char> sep(" \t"); | |
2119 | |
2120 | |
2121 FILE *f=fopen(fname,"rb"); | |
2122 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
2123 else { | |
2124 Rprintf("opened %s\n",fname); | |
2125 | |
2126 // read in bed line | |
2127 string line; | |
2128 int fcount=0; | |
2129 while(get_a_line(f,line)) { | |
2130 | |
2131 #ifdef DEBUG | |
2132 Rprintf("line: %s\n",line.c_str()); | |
2133 #endif | |
2134 | |
2135 | |
2136 tokType tok(line, sep); | |
2137 tokType::iterator sit=tok.begin(); | |
2138 if(sit!=tok.end()) { | |
2139 string chr=*sit++; | |
2140 string str_spos=*sit++; | |
2141 string str_epos=*sit++; | |
2142 sit++; | |
2143 string str_qual=*sit++; | |
2144 string str_strand=*sit; | |
2145 | |
2146 int fpos; | |
2147 if(str_strand[0]=='+') { | |
2148 fpos=atoi(str_spos.c_str()); | |
2149 } else { | |
2150 fpos=-1*atoi(str_epos.c_str()); | |
2151 } | |
2152 int nm=atoi(str_qual.c_str()); | |
2153 | |
2154 // determine the chromosome index | |
2155 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
2156 int cind=-1; | |
2157 if(li==cind_map.end()) { | |
2158 // register new chromosome | |
2159 cind=cnames.size(); | |
2160 cnames.push_back(chr); | |
2161 cind_map[chr]=cind; | |
2162 // allocate new pos vector | |
2163 pos.push_back(vector<int>()); | |
2164 posnm.push_back(vector<int>()); | |
2165 #ifdef DEBUG | |
2166 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
2167 #endif | |
2168 } else { | |
2169 cind=li->second; | |
2170 } | |
2171 fcount++; | |
2172 (pos[cind]).push_back(fpos); | |
2173 (posnm[cind]).push_back(nm); | |
2174 #ifdef DEBUG | |
2175 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); | |
2176 if(fcount>30) { | |
2177 break; | |
2178 } | |
2179 #endif | |
2180 | |
2181 } | |
2182 } | |
2183 fclose(f); | |
2184 | |
2185 Rprintf("done. read %d fragments\n",fcount); | |
2186 } | |
2187 // construct output structures | |
2188 SEXP chnames; | |
2189 int np=0; // number of protections | |
2190 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
2191 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
2192 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
2193 } | |
2194 np++; | |
2195 | |
2196 // sort | |
2197 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2198 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
2199 //} | |
2200 | |
2201 SEXP ans; | |
2202 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
2203 vector<vector<int> >::const_iterator nsi; | |
2204 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2205 nsi=posnm.begin()+(csi-pos.begin()); | |
2206 | |
2207 SEXP dv,dnames_R; | |
2208 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; | |
2209 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
2210 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
2211 | |
2212 | |
2213 SEXP tv,nv; | |
2214 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
2215 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
2216 int* i_tv=INTEGER(tv); | |
2217 int* i_nv=INTEGER(nv); | |
2218 | |
2219 int i=0; | |
2220 vector<int>::const_iterator ini=nsi->begin(); | |
2221 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
2222 i_tv[i]=*pi; | |
2223 i_nv[i]=*ini++; | |
2224 i++; | |
2225 } | |
2226 PROTECT(dv = allocVector(VECSXP, 2)); np++; | |
2227 SET_VECTOR_ELT(dv, 0, tv); | |
2228 SET_VECTOR_ELT(dv, 1, nv); | |
2229 setAttrib(dv, R_NamesSymbol, dnames_R); | |
2230 | |
2231 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
2232 } | |
2233 | |
2234 setAttrib(ans,R_NamesSymbol,chnames); | |
2235 | |
2236 #ifdef DEBUG | |
2237 Rprintf("unprotecting %d elements\n",np); | |
2238 #endif | |
2239 | |
2240 UNPROTECT(np); | |
2241 return(ans); | |
2242 } | |
2243 | |
2244 | |
2245 | |
2246 | |
2247 // arachne madness | |
2248 SEXP read_arachne(SEXP filename) { | |
2249 | |
2250 #ifdef DEBUG | |
2251 Rprintf("start\n"); | |
2252 #endif | |
2253 const char* fname=CHAR(asChar(filename)); | |
2254 #ifdef DEBUG | |
2255 Rprintf("fname=%s\n",fname); | |
2256 #endif | |
2257 | |
2258 // main data vector | |
2259 // chr - pos | |
2260 vector< vector<int> > pos; | |
2261 vector< vector<int> > posnm; // number of mismatches | |
2262 | |
2263 // chromosome map | |
2264 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
2265 vector<string> cnames; | |
2266 | |
2267 | |
2268 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
2269 boost::char_separator<char> sep(" \t"); | |
2270 | |
2271 | |
2272 | |
2273 | |
2274 | |
2275 FILE *f=fopen(fname,"rb"); | |
2276 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
2277 else { | |
2278 | |
2279 #ifdef HAVE_LIBBZ2 | |
2280 BZFILE* b; | |
2281 int bzerror; | |
2282 | |
2283 int bz2file=0; | |
2284 if(strstr(fname,".bz2")) { | |
2285 bz2file=1; | |
2286 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); | |
2287 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } | |
2288 } | |
2289 #endif | |
2290 | |
2291 | |
2292 Rprintf("opened %s\n",fname); | |
2293 | |
2294 // read in bed line | |
2295 string line; | |
2296 int fcount=0; | |
2297 #ifdef HAVE_LIBBZ2 | |
2298 while(get_a_line(f,b,bz2file,line)) { | |
2299 #else | |
2300 while(get_a_line(f,line)) { | |
2301 #endif | |
2302 | |
2303 #ifdef DEBUG | |
2304 Rprintf("line: %s\n",line.c_str()); | |
2305 #endif | |
2306 | |
2307 | |
2308 tokType tok(line, sep); | |
2309 tokType::iterator sit=tok.begin(); | |
2310 if(sit!=tok.end()) { | |
2311 string chr=*sit++; | |
2312 string str_spos=*sit++; | |
2313 int nm=0; | |
2314 if(sit!=tok.end()) { | |
2315 string str_mm=*sit; | |
2316 nm=atoi(str_mm.c_str()); | |
2317 } | |
2318 | |
2319 int fpos=atoi(str_spos.c_str());; | |
2320 | |
2321 | |
2322 // determine the chromosome index | |
2323 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
2324 int cind=-1; | |
2325 if(li==cind_map.end()) { | |
2326 // register new chromosome | |
2327 cind=cnames.size(); | |
2328 cnames.push_back(chr); | |
2329 cind_map[chr]=cind; | |
2330 // allocate new pos vector | |
2331 pos.push_back(vector<int>()); | |
2332 posnm.push_back(vector<int>()); | |
2333 #ifdef DEBUG | |
2334 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
2335 #endif | |
2336 } else { | |
2337 cind=li->second; | |
2338 } | |
2339 fcount++; | |
2340 (pos[cind]).push_back(fpos); | |
2341 (posnm[cind]).push_back(nm); | |
2342 #ifdef DEBUG | |
2343 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); | |
2344 if(fcount>30) { | |
2345 break; | |
2346 } | |
2347 #endif | |
2348 | |
2349 } | |
2350 } | |
2351 #ifdef HAVE_LIBBZ2 | |
2352 BZ2_bzReadClose( &bzerror, b); | |
2353 #endif | |
2354 | |
2355 fclose(f); | |
2356 | |
2357 Rprintf("done. read %d fragments\n",fcount); | |
2358 } | |
2359 // construct output structures | |
2360 SEXP chnames; | |
2361 int np=0; // number of protections | |
2362 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
2363 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
2364 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
2365 } | |
2366 np++; | |
2367 | |
2368 // sort | |
2369 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2370 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
2371 //} | |
2372 | |
2373 SEXP ans; | |
2374 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
2375 vector<vector<int> >::const_iterator nsi; | |
2376 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2377 nsi=posnm.begin()+(csi-pos.begin()); | |
2378 | |
2379 SEXP dv,dnames_R; | |
2380 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; | |
2381 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
2382 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
2383 | |
2384 | |
2385 SEXP tv,nv; | |
2386 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
2387 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
2388 int* i_tv=INTEGER(tv); | |
2389 int* i_nv=INTEGER(nv); | |
2390 | |
2391 int i=0; | |
2392 vector<int>::const_iterator ini=nsi->begin(); | |
2393 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
2394 i_tv[i]=*pi; | |
2395 i_nv[i]=*ini++; | |
2396 i++; | |
2397 } | |
2398 PROTECT(dv = allocVector(VECSXP, 2)); np++; | |
2399 SET_VECTOR_ELT(dv, 0, tv); | |
2400 SET_VECTOR_ELT(dv, 1, nv); | |
2401 setAttrib(dv, R_NamesSymbol, dnames_R); | |
2402 | |
2403 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
2404 } | |
2405 | |
2406 setAttrib(ans,R_NamesSymbol,chnames); | |
2407 | |
2408 #ifdef DEBUG | |
2409 Rprintf("unprotecting %d elements\n",np); | |
2410 #endif | |
2411 | |
2412 UNPROTECT(np); | |
2413 return(ans); | |
2414 } | |
2415 | |
2416 | |
2417 // arachne madness | |
2418 SEXP read_arachne_long(SEXP filename) { | |
2419 | |
2420 #ifdef DEBUG | |
2421 Rprintf("start\n"); | |
2422 #endif | |
2423 const char* fname=CHAR(asChar(filename)); | |
2424 #ifdef DEBUG | |
2425 Rprintf("fname=%s\n",fname); | |
2426 #endif | |
2427 | |
2428 // main data vector | |
2429 // chr - pos | |
2430 vector< vector<int> > pos; | |
2431 vector< vector<int> > posnm; // number of mismatches | |
2432 vector< vector<int> > poslen; // length of the match | |
2433 | |
2434 // chromosome map | |
2435 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
2436 vector<string> cnames; | |
2437 | |
2438 | |
2439 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
2440 boost::char_separator<char> sep(" \t"); | |
2441 | |
2442 | |
2443 | |
2444 | |
2445 | |
2446 FILE *f=fopen(fname,"rb"); | |
2447 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } | |
2448 else { | |
2449 | |
2450 #ifdef HAVE_LIBBZ2 | |
2451 BZFILE* b; | |
2452 int bzerror; | |
2453 | |
2454 int bz2file=0; | |
2455 if(strstr(fname,".bz2")) { | |
2456 bz2file=1; | |
2457 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); | |
2458 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } | |
2459 } | |
2460 #endif | |
2461 | |
2462 | |
2463 Rprintf("opened %s\n",fname); | |
2464 | |
2465 // read in bed line | |
2466 string line; | |
2467 int fcount=0; | |
2468 #ifdef HAVE_LIBBZ2 | |
2469 while(get_a_line(f,b,bz2file,line)) { | |
2470 #else | |
2471 while(get_a_line(f,line)) { | |
2472 #endif | |
2473 | |
2474 #ifdef DEBUG | |
2475 Rprintf("line: %s\n",line.c_str()); | |
2476 #endif | |
2477 | |
2478 | |
2479 tokType tok(line, sep); | |
2480 tokType::iterator sit=tok.begin(); | |
2481 if(sit!=tok.end()) { | |
2482 string query=*sit++; | |
2483 if(query!="QUERY") { continue; } | |
2484 *sit++; *sit++; *sit++; *sit++; | |
2485 string str_strand=*sit++; | |
2486 string chr=*sit++; | |
2487 string str_startpos=*sit++; | |
2488 string str_endpos=*sit++; | |
2489 | |
2490 int fpos; | |
2491 if(str_strand[0]=='1') { | |
2492 fpos=-1*atoi(str_endpos.c_str()); | |
2493 } else { | |
2494 fpos=atoi(str_startpos.c_str()); | |
2495 } | |
2496 #ifdef DEBUG | |
2497 Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos); | |
2498 #endif | |
2499 *sit++; | |
2500 string str_nblocks=*sit++; | |
2501 int nblocks=atoi(str_nblocks.c_str()); | |
2502 #ifdef DEBUG | |
2503 Rprintf("nblocks=%d\n",nblocks); | |
2504 #endif | |
2505 // tally up the read length and the number of mismatches for all blocks | |
2506 int len=0; int nm=0; | |
2507 for(int i=0;i<nblocks;i++) { | |
2508 string str_sgs=*sit++; | |
2509 int sgs=atoi(str_sgs.c_str()); | |
2510 string str_slen=*sit++; | |
2511 int slen=atoi(str_slen.c_str()); | |
2512 string str_snm=*sit++; | |
2513 int snm=atoi(str_snm.c_str()); | |
2514 #ifdef DEBUG | |
2515 Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm); | |
2516 #endif | |
2517 len+=slen; | |
2518 nm+=abs(sgs)+snm; | |
2519 } | |
2520 nm+=nblocks-1; | |
2521 | |
2522 | |
2523 // determine the chromosome index | |
2524 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
2525 int cind=-1; | |
2526 if(li==cind_map.end()) { | |
2527 // register new chromosome | |
2528 cind=cnames.size(); | |
2529 cnames.push_back(chr); | |
2530 cind_map[chr]=cind; | |
2531 // allocate new pos vector | |
2532 pos.push_back(vector<int>()); | |
2533 posnm.push_back(vector<int>()); | |
2534 poslen.push_back(vector<int>()); | |
2535 #ifdef DEBUG | |
2536 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
2537 #endif | |
2538 } else { | |
2539 cind=li->second; | |
2540 } | |
2541 fcount++; | |
2542 (pos[cind]).push_back(fpos); | |
2543 (posnm[cind]).push_back(nm); | |
2544 (poslen[cind]).push_back(len); | |
2545 #ifdef DEBUG | |
2546 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len); | |
2547 if(fcount>30) { | |
2548 break; | |
2549 } | |
2550 #endif | |
2551 | |
2552 } | |
2553 } | |
2554 #ifdef HAVE_LIBBZ2 | |
2555 BZ2_bzReadClose( &bzerror, b); | |
2556 #endif | |
2557 | |
2558 fclose(f); | |
2559 | |
2560 Rprintf("done. read %d fragments\n",fcount); | |
2561 } | |
2562 // construct output structures | |
2563 SEXP chnames; | |
2564 int np=0; // number of protections | |
2565 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
2566 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
2567 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
2568 } | |
2569 np++; | |
2570 | |
2571 // sort | |
2572 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2573 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
2574 //} | |
2575 | |
2576 SEXP ans; | |
2577 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
2578 vector<vector<int> >::const_iterator nsi; | |
2579 vector<vector<int> >::const_iterator lsi; | |
2580 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
2581 nsi=posnm.begin()+(csi-pos.begin()); | |
2582 lsi=poslen.begin()+(csi-pos.begin()); | |
2583 | |
2584 SEXP dv,dnames_R; | |
2585 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; | |
2586 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
2587 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
2588 SET_STRING_ELT(dnames_R, 2, mkChar("l")); | |
2589 | |
2590 | |
2591 SEXP tv,nv,lv; | |
2592 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
2593 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
2594 PROTECT(lv=allocVector(INTSXP,csi->size())); np++; | |
2595 int* i_tv=INTEGER(tv); | |
2596 int* i_nv=INTEGER(nv); | |
2597 int* i_lv=INTEGER(lv); | |
2598 | |
2599 int i=0; | |
2600 vector<int>::const_iterator ini=nsi->begin(); | |
2601 vector<int>::const_iterator lni=lsi->begin(); | |
2602 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
2603 i_tv[i]=*pi; | |
2604 i_nv[i]=*ini++; | |
2605 i_lv[i]=*lni++; | |
2606 i++; | |
2607 } | |
2608 PROTECT(dv = allocVector(VECSXP, 3)); np++; | |
2609 SET_VECTOR_ELT(dv, 0, tv); | |
2610 SET_VECTOR_ELT(dv, 1, nv); | |
2611 SET_VECTOR_ELT(dv, 2, lv); | |
2612 setAttrib(dv, R_NamesSymbol, dnames_R); | |
2613 | |
2614 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
2615 } | |
2616 | |
2617 setAttrib(ans,R_NamesSymbol,chnames); | |
2618 | |
2619 #ifdef DEBUG | |
2620 Rprintf("unprotecting %d elements\n",np); | |
2621 #endif | |
2622 | |
2623 UNPROTECT(np); | |
2624 return(ans); | |
2625 } | |
2626 | |
2627 | |
2628 } |