6
|
1 #include "pc.h"
|
|
2 #include "config.h"
|
|
3 #include <vector>
|
|
4 #include <string.h>
|
|
5 #include <iostream>
|
|
6 #include <fstream>
|
|
7 #include <sstream>
|
|
8 #include <strstream>
|
|
9 #include <algorithm>
|
|
10 #include <string>
|
|
11 #include <functional>
|
|
12 #include <utility>
|
|
13 #include <ext/hash_map>
|
|
14 #include <boost/tokenizer.hpp>
|
|
15
|
|
16 #ifdef HAVE_LIBBZ2
|
|
17 #include <bzlib.h>
|
|
18 #endif
|
|
19
|
|
20 extern "C" {
|
|
21 #include "R.h"
|
|
22 #include "Rmath.h"
|
|
23 #include "Rinternals.h"
|
|
24 #include "Rdefines.h"
|
|
25 }
|
|
26
|
|
27 using namespace std;
|
|
28 using namespace __gnu_cxx;
|
|
29
|
|
30
|
|
31 class lessAbsoluteValue {
|
|
32 public:
|
|
33 bool operator()(int a, int b) const {
|
|
34 return abs(a) < abs(b);
|
|
35 }
|
|
36 };
|
|
37
|
|
38
|
|
39
|
|
40 #ifdef HAVE_LIBBZ2
|
|
41 int get_bzline(BZFILE* b,string& line) {
|
|
42 char c;
|
|
43 int nBuf;
|
|
44 int bzerror=BZ_OK;
|
|
45
|
|
46 while(bzerror == BZ_OK) {
|
|
47 nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
|
|
48 if(bzerror==BZ_OK) {
|
|
49 if(c=='\n') {
|
|
50 return bzerror;
|
|
51 } else {
|
|
52 line+=c;
|
|
53 }
|
|
54 }
|
|
55 }
|
|
56 return bzerror;
|
|
57 }
|
|
58
|
|
59 int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
|
|
60 line="";
|
|
61 if(bz2file) {
|
|
62 int bzerror=get_bzline(b,line);
|
|
63 if(bzerror==BZ_OK) {
|
|
64 return(1);
|
|
65 } else {
|
|
66 if(bzerror!=BZ_STREAM_END) {
|
|
67 cerr<<"encountered BZERROR="<<bzerror<<endl;
|
|
68 }
|
|
69 return(0);
|
|
70 }
|
|
71 } else {
|
|
72 char *cline=NULL;
|
|
73 size_t n;
|
|
74 if(getline(&cline,&n,f) != -1) {
|
|
75 if(cline) {
|
|
76 cline[strlen(cline)-1]='\0';
|
|
77 line+=cline;
|
|
78 free(cline);
|
|
79 }
|
|
80 return(1);
|
|
81 } else {
|
|
82 return(0);
|
|
83 }
|
|
84 }
|
|
85 }
|
|
86 #endif
|
|
87
|
|
88
|
|
89 /**
|
|
90 * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign
|
|
91 * corresponding to the strand.
|
|
92 */
|
|
93
|
|
94 //#define DEBUG 1
|
|
95
|
|
96 extern "C" {
|
|
97 SEXP read_bed_ends(SEXP filename) {
|
|
98
|
|
99 #ifdef DEBUG
|
|
100 Rprintf("start\n");
|
|
101 #endif
|
|
102 const char* fname=CHAR(asChar(filename));
|
|
103 #ifdef DEBUG
|
|
104 Rprintf("fname=%s\n",fname);
|
|
105 #endif
|
|
106
|
|
107 // main data vector
|
|
108 // chr - pos
|
|
109 vector< vector<int> > pos;
|
|
110
|
|
111 // chromosome map
|
|
112 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
113 vector<string> cnames;
|
|
114
|
|
115 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
116 boost::char_separator<char> sep(" \t");
|
|
117
|
|
118
|
|
119 ifstream bed_file(fname);
|
|
120
|
|
121 #ifdef DEBUG
|
|
122 Rprintf("opened %s\n",fname);
|
|
123 #endif
|
|
124
|
|
125 Rprintf("opened %s\n",fname);
|
|
126
|
|
127 // read in bed line
|
|
128 string line;
|
|
129
|
|
130 int fcount=0;
|
|
131 while(getline(bed_file,line)) {
|
|
132
|
|
133 #ifdef DEBUG
|
|
134 Rprintf("line: %s\n",line.c_str());
|
|
135 #endif
|
|
136
|
|
137
|
|
138 tokType tok(line, sep);
|
|
139 tokType::iterator sit=tok.begin();
|
|
140 if(sit!=tok.end()) {
|
|
141 string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str()));
|
|
142 string str_start=*sit++;
|
|
143 int fstart=atoi(str_start.c_str());
|
|
144 string str_end=*sit++;
|
|
145 int fend=atoi(str_end.c_str());
|
|
146 int fpos=fstart;
|
|
147 if(sit!=tok.end()) {
|
|
148 string u0=*sit++;
|
|
149 string nfield=*sit++;
|
|
150 string strand=*sit++;
|
|
151 if(strand=="-") {
|
|
152 fpos=-1*fend;
|
|
153 }
|
|
154 }
|
|
155
|
|
156 // determine the chromosome index
|
|
157 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
158 int cind=-1;
|
|
159 if(li==cind_map.end()) {
|
|
160 // register new chromosome
|
|
161 cind=cnames.size();
|
|
162 cnames.push_back(chr);
|
|
163 cind_map[chr]=cind;
|
|
164 // allocate new pos vector
|
|
165 pos.push_back(vector<int>());
|
|
166 #ifdef DEBUG
|
|
167 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
168 #endif
|
|
169 } else {
|
|
170 cind=li->second;
|
|
171 }
|
|
172 fcount++;
|
|
173 (pos[cind]).push_back(fpos);
|
|
174 #ifdef DEBUG
|
|
175 Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
|
|
176 if(fcount>30) {
|
|
177 break;
|
|
178 }
|
|
179 #endif
|
|
180
|
|
181 }
|
|
182 }
|
|
183 bed_file.close();
|
|
184
|
|
185
|
|
186 #ifdef DEBUG
|
|
187 Rprintf("done. read %d fragments\n",fcount);
|
|
188 #endif
|
|
189
|
|
190 Rprintf("done. read %d fragments\n",fcount);
|
|
191
|
|
192 // construct output structures
|
|
193 SEXP chnames;
|
|
194 int np=0; // number of protections
|
|
195 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
196 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
197 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
198 }
|
|
199 np++;
|
|
200
|
|
201 // sort
|
|
202 for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
203 sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
204 }
|
|
205
|
|
206 SEXP ans;
|
|
207 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
208 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
209 SEXP nv;
|
|
210 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
211 int* i_nv=INTEGER(nv);
|
|
212 int i=0;
|
|
213 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
214 i_nv[i++]=*pi;
|
|
215 }
|
|
216 SET_VECTOR_ELT(ans, csi-pos.begin(), nv);
|
|
217 }
|
|
218
|
|
219 setAttrib(ans,R_NamesSymbol,chnames);
|
|
220
|
|
221 #ifdef DEBUG
|
|
222 Rprintf("unprotecting %d elements\n",np);
|
|
223 #endif
|
|
224
|
|
225 UNPROTECT(np);
|
|
226 return(ans);
|
|
227 }
|
|
228
|
|
229
|
|
230
|
|
231 SEXP read_meland_old(SEXP filename) {
|
|
232
|
|
233 #ifdef DEBUG
|
|
234 Rprintf("start\n");
|
|
235 #endif
|
|
236 const char* fname=CHAR(asChar(filename));
|
|
237 #ifdef DEBUG
|
|
238 Rprintf("fname=%s\n",fname);
|
|
239 #endif
|
|
240
|
|
241 // main data vector
|
|
242 // chr - pos
|
|
243 vector< vector<int> > pos;
|
|
244 vector< vector<int> > posnm; // number of mismatches
|
|
245 vector< vector<int> > poslen; // length
|
|
246
|
|
247 // chromosome map
|
|
248 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
249 vector<string> cnames;
|
|
250
|
|
251
|
|
252 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
253 boost::char_separator<char> sep(" \t");
|
|
254
|
|
255
|
|
256 ifstream bed_file(fname);
|
|
257
|
|
258 Rprintf("opened %s\n",fname);
|
|
259
|
|
260 // read in bed line
|
|
261 string line;
|
|
262
|
|
263 int fcount=0;
|
|
264 while(getline(bed_file,line)) {
|
|
265
|
|
266 #ifdef DEBUG
|
|
267 Rprintf("line: %s\n",line.c_str());
|
|
268 #endif
|
|
269
|
|
270
|
|
271 tokType tok(line, sep);
|
|
272 tokType::iterator sit=tok.begin();
|
|
273 if(sit!=tok.end()) {
|
|
274 sit++; sit++;
|
|
275 string str_nm=*sit++;
|
|
276 int nm=0;
|
|
277 if(str_nm[0]=='U') {
|
|
278 nm=atoi((str_nm.c_str()+1));
|
|
279 } else {
|
|
280 continue;
|
|
281 }
|
|
282 sit++; sit++; sit++;
|
|
283 string str_len=*sit++;
|
|
284 int len=atoi(str_len.c_str());
|
|
285 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
|
|
286 string str_pos=*sit++;
|
|
287 int fpos=atoi(str_pos.c_str());
|
|
288
|
|
289 // determine the chromosome index
|
|
290 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
291 int cind=-1;
|
|
292 if(li==cind_map.end()) {
|
|
293 // register new chromosome
|
|
294 cind=cnames.size();
|
|
295 cnames.push_back(chr);
|
|
296 cind_map[chr]=cind;
|
|
297 // allocate new pos vector
|
|
298 pos.push_back(vector<int>());
|
|
299 posnm.push_back(vector<int>());
|
|
300 poslen.push_back(vector<int>());
|
|
301 #ifdef DEBUG
|
|
302 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
303 #endif
|
|
304 } else {
|
|
305 cind=li->second;
|
|
306 }
|
|
307 fcount++;
|
|
308 (pos[cind]).push_back(fpos);
|
|
309 (posnm[cind]).push_back(nm);
|
|
310 (poslen[cind]).push_back(len);
|
|
311 #ifdef DEBUG
|
|
312 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
313 if(fcount>30) {
|
|
314 break;
|
|
315 }
|
|
316 #endif
|
|
317
|
|
318 }
|
|
319 }
|
|
320 bed_file.close();
|
|
321
|
|
322
|
|
323 #ifdef DEBUG
|
|
324 Rprintf("done. read %d fragments\n",fcount);
|
|
325 #endif
|
|
326
|
|
327 Rprintf("done. read %d fragments\n",fcount);
|
|
328
|
|
329 // construct output structures
|
|
330 SEXP chnames;
|
|
331 int np=0; // number of protections
|
|
332 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
333 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
334 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
335 }
|
|
336 np++;
|
|
337
|
|
338 // sort
|
|
339 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
340 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
341 //}
|
|
342
|
|
343 SEXP ans;
|
|
344 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
345 vector<vector<int> >::const_iterator nsi,lsi;
|
|
346 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
347 nsi=posnm.begin()+(csi-pos.begin());
|
|
348 lsi=poslen.begin()+(csi-pos.begin());
|
|
349
|
|
350 SEXP dv,dnames_R;
|
|
351 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
|
|
352 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
353 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
354 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
|
|
355
|
|
356
|
|
357
|
|
358 SEXP tv,nv,lv;
|
|
359 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
360 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
361 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
|
|
362 int* i_tv=INTEGER(tv);
|
|
363 int* i_nv=INTEGER(nv);
|
|
364 int* i_lv=INTEGER(lv);
|
|
365
|
|
366 int i=0;
|
|
367 vector<int>::const_iterator ini=nsi->begin();
|
|
368 vector<int>::const_iterator ili=lsi->begin();
|
|
369 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
370 i_tv[i]=*pi;
|
|
371 i_nv[i]=*ini++;
|
|
372 i_lv[i]=*ili++;
|
|
373 i++;
|
|
374 }
|
|
375 PROTECT(dv = allocVector(VECSXP, 3)); np++;
|
|
376 SET_VECTOR_ELT(dv, 0, tv);
|
|
377 SET_VECTOR_ELT(dv, 1, nv);
|
|
378 SET_VECTOR_ELT(dv, 2, lv);
|
|
379 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
380
|
|
381 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
382 }
|
|
383
|
|
384 setAttrib(ans,R_NamesSymbol,chnames);
|
|
385
|
|
386 #ifdef DEBUG
|
|
387 Rprintf("unprotecting %d elements\n",np);
|
|
388 #endif
|
|
389
|
|
390 UNPROTECT(np);
|
|
391 return(ans);
|
|
392 }
|
|
393
|
|
394
|
|
395 int get_a_line(FILE *f,string& line) {
|
|
396 line="";
|
|
397 char cline[1024];
|
|
398 if(fgets(cline,1024,f)) {
|
|
399 line+=cline;
|
|
400 return(1);
|
|
401 } else {
|
|
402 return(0);
|
|
403 }
|
|
404 }
|
|
405
|
|
406
|
|
407 SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
|
|
408
|
|
409 #ifdef DEBUG
|
|
410 Rprintf("start\n");
|
|
411 #endif
|
|
412 const char* fname=CHAR(asChar(filename));
|
|
413 int read_names=*(INTEGER(read_tag_names_R));
|
|
414 #ifdef DEBUG
|
|
415 Rprintf("fname=%s\n",fname);
|
|
416 #endif
|
|
417
|
|
418 // main data vector
|
|
419 // chr - pos
|
|
420 vector< vector<int> > pos;
|
|
421 vector< vector<int> > posnm; // number of mismatches
|
|
422 vector< vector<int> > poslen; // length
|
|
423 vector< vector<string> > tagnames;
|
|
424
|
|
425 // chromosome map
|
|
426 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
427 vector<string> cnames;
|
|
428
|
|
429
|
|
430 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
431 boost::char_separator<char> sep(" \t");
|
|
432
|
|
433
|
|
434 FILE *f=fopen(fname,"rb");
|
|
435 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
436
|
|
437 Rprintf("opened %s\n",fname);
|
|
438
|
|
439
|
|
440 // read in bed line
|
|
441 string line;
|
|
442 int fcount=0;
|
|
443 while(get_a_line(f,line)) {
|
|
444
|
|
445 #ifdef DEBUG
|
|
446 Rprintf("line: %s\n",line.c_str());
|
|
447 #endif
|
|
448
|
|
449
|
|
450 tokType tok(line, sep);
|
|
451 tokType::iterator sit=tok.begin();
|
|
452 if(sit!=tok.end()) {
|
|
453 string tagname=*sit++;
|
|
454 sit++;
|
|
455 string str_nm=*sit++;
|
|
456 int nm=0;
|
|
457 if(str_nm[0]=='U') {
|
|
458 nm=atoi((str_nm.c_str()+1));
|
|
459 } else {
|
|
460 continue;
|
|
461 }
|
|
462 sit++; sit++; sit++;
|
|
463 string str_len=*sit++;
|
|
464 int len=atoi(str_len.c_str());
|
|
465 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
|
|
466 string str_pos=*sit++;
|
|
467 int fpos=atoi(str_pos.c_str());
|
|
468
|
|
469 // determine the chromosome index
|
|
470 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
471 int cind=-1;
|
|
472 if(li==cind_map.end()) {
|
|
473 // register new chromosome
|
|
474 cind=cnames.size();
|
|
475 cnames.push_back(chr);
|
|
476 cind_map[chr]=cind;
|
|
477 // allocate new pos vector
|
|
478 pos.push_back(vector<int>());
|
|
479 posnm.push_back(vector<int>());
|
|
480 poslen.push_back(vector<int>());
|
|
481 if(read_names) {
|
|
482 tagnames.push_back(vector<string>());
|
|
483 }
|
|
484 #ifdef DEBUG
|
|
485 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
486 #endif
|
|
487 } else {
|
|
488 cind=li->second;
|
|
489 }
|
|
490 fcount++;
|
|
491 (pos[cind]).push_back(fpos);
|
|
492 (posnm[cind]).push_back(nm);
|
|
493 (poslen[cind]).push_back(len);
|
|
494 if(read_names) {
|
|
495 (tagnames[cind]).push_back(tagname);
|
|
496 }
|
|
497 #ifdef DEBUG
|
|
498 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
499 if(fcount>30) {
|
|
500 break;
|
|
501 }
|
|
502 #endif
|
|
503
|
|
504 }
|
|
505 }
|
|
506 fclose(f);
|
|
507
|
|
508
|
|
509 #ifdef DEBUG
|
|
510 Rprintf("done. read %d fragments\n",fcount);
|
|
511 #endif
|
|
512
|
|
513 Rprintf("done. read %d fragments\n",fcount);
|
|
514
|
|
515 // construct output structures
|
|
516 SEXP chnames;
|
|
517 int np=0; // number of protections
|
|
518 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
519 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
520 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
521 }
|
|
522 np++;
|
|
523
|
|
524 // sort
|
|
525 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
526 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
527 //}
|
|
528
|
|
529 SEXP ans;
|
|
530 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
531 vector<vector<int> >::const_iterator nsi,lsi;
|
|
532 vector<vector<string> >::const_iterator ssi;
|
|
533 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
534 nsi=posnm.begin()+(csi-pos.begin());
|
|
535 lsi=poslen.begin()+(csi-pos.begin());
|
|
536
|
|
537 SEXP dv,dnames_R;
|
|
538 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
|
|
539 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
540 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
541 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
|
|
542 if(read_names) {
|
|
543 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
|
|
544 }
|
|
545
|
|
546
|
|
547
|
|
548 SEXP tv,nv,lv,sv;
|
|
549 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
550 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
551 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
|
|
552 if(read_names) {
|
|
553 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
554 }
|
|
555 int* i_tv=INTEGER(tv);
|
|
556 int* i_nv=INTEGER(nv);
|
|
557 int* i_lv=INTEGER(lv);
|
|
558
|
|
559 int i=0;
|
|
560 vector<int>::const_iterator ini=nsi->begin();
|
|
561 vector<int>::const_iterator ili=lsi->begin();
|
|
562 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
563 i_tv[i]=*pi;
|
|
564 i_nv[i]=*ini++;
|
|
565 i_lv[i]=*ili++;
|
|
566 i++;
|
|
567 }
|
|
568 if(read_names) {
|
|
569 int i=0;
|
|
570 ssi=tagnames.begin()+(csi-pos.begin());
|
|
571 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
572 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
573 i++;
|
|
574 }
|
|
575 }
|
|
576 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
|
|
577 SET_VECTOR_ELT(dv, 0, tv);
|
|
578 SET_VECTOR_ELT(dv, 1, nv);
|
|
579 SET_VECTOR_ELT(dv, 2, lv);
|
|
580 if(read_names) {
|
|
581 SET_VECTOR_ELT(dv, 3, sv);
|
|
582 }
|
|
583 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
584
|
|
585 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
586 }
|
|
587
|
|
588 setAttrib(ans,R_NamesSymbol,chnames);
|
|
589
|
|
590 #ifdef DEBUG
|
|
591 Rprintf("unprotecting %d elements\n",np);
|
|
592 #endif
|
|
593
|
|
594 UNPROTECT(np);
|
|
595 return(ans);
|
|
596 }
|
|
597
|
|
598
|
|
599
|
|
600 // reads regular eland files, recording mismatch positions
|
|
601 SEXP read_eland_mismatches(SEXP filename) {
|
|
602
|
|
603 #ifdef DEBUG
|
|
604 Rprintf("start\n");
|
|
605 #endif
|
|
606 const char* fname=CHAR(asChar(filename));
|
|
607 #ifdef DEBUG
|
|
608 Rprintf("fname=%s\n",fname);
|
|
609 #endif
|
|
610
|
|
611 // main data vector
|
|
612 // chr - pos
|
|
613 vector< vector<int> > pos;
|
|
614 vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
|
|
615 vector< vector<int> > mm2; // position of the second mismatch
|
|
616
|
|
617 // chromosome map
|
|
618 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
619 vector<string> cnames;
|
|
620
|
|
621
|
|
622 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
623 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
624
|
|
625
|
|
626 FILE *f=fopen(fname,"rb");
|
|
627 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
628
|
|
629 Rprintf("opened %s\n",fname);
|
|
630
|
|
631 // read in bed line
|
|
632 string line;
|
|
633 int fcount=0;
|
|
634 while(get_a_line(f,line)) {
|
|
635
|
|
636 #ifdef DEBUG
|
|
637 Rprintf("line: %s\n",line.c_str());
|
|
638 #endif
|
|
639
|
|
640
|
|
641 tokType tok(line, sep);
|
|
642 tokType::iterator sit=tok.begin();
|
|
643 if(sit!=tok.end()) {
|
|
644 sit++;
|
|
645 string seq=*sit++;
|
|
646 string str_nm=*sit++;
|
|
647 int nm=0;
|
|
648 if(str_nm[0]=='U') {
|
|
649 nm=atoi((str_nm.c_str()+1));
|
|
650 } else {
|
|
651 continue;
|
|
652 }
|
|
653 sit++; sit++; sit++;
|
|
654 string chr=*sit++;
|
|
655 // extract chromosome name from this
|
|
656 int chrp=chr.find("chr");
|
|
657 int pp=chr.find('.');
|
|
658 chr=chr.substr(chrp+3,pp-chrp-3);
|
|
659
|
|
660 string str_pos=*sit++;
|
|
661 int fpos=atoi(str_pos.c_str());
|
|
662
|
|
663
|
|
664 string strand=*sit++;
|
|
665 int nstrand=0;
|
|
666 if(strand=="R") {
|
|
667 fpos=-1*(fpos+seq.size()-1);
|
|
668 nstrand=1;
|
|
669 }
|
|
670
|
|
671 sit++;
|
|
672
|
|
673 int nm1=0; int nm2=0;
|
|
674 if(sit!=tok.end()) {
|
|
675 string nms=*sit++;
|
|
676 nm1=atoi(nms.substr(0,nms.size()-1).c_str());
|
|
677 if(nstrand) { nm1=seq.size()-nm1+1; }
|
|
678 }
|
|
679 if(sit!=tok.end()) {
|
|
680 string nms=*sit++;
|
|
681 nm2=atoi(nms.substr(0,nms.size()-1).c_str());
|
|
682 if(nstrand) { nm2=seq.size()-nm2+1; }
|
|
683 }
|
|
684
|
|
685 // determine the chromosome index
|
|
686 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
687 int cind=-1;
|
|
688 if(li==cind_map.end()) {
|
|
689 // register new chromosome
|
|
690 cind=cnames.size();
|
|
691 cnames.push_back(chr);
|
|
692 cind_map[chr]=cind;
|
|
693 // allocate new pos vector
|
|
694 pos.push_back(vector<int>());
|
|
695 mm1.push_back(vector<int>());
|
|
696 mm2.push_back(vector<int>());
|
|
697 #ifdef DEBUG
|
|
698 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
699 #endif
|
|
700 } else {
|
|
701 cind=li->second;
|
|
702 }
|
|
703 fcount++;
|
|
704 (pos[cind]).push_back(fpos);
|
|
705 (mm1[cind]).push_back(nm1);
|
|
706 (mm2[cind]).push_back(nm2);
|
|
707 #ifdef DEBUG
|
|
708 Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
|
|
709 if(fcount>30) {
|
|
710 break;
|
|
711 }
|
|
712 #endif
|
|
713
|
|
714 }
|
|
715 }
|
|
716 fclose(f);
|
|
717
|
|
718
|
|
719 #ifdef DEBUG
|
|
720 Rprintf("done. read %d fragments\n",fcount);
|
|
721 #endif
|
|
722
|
|
723 Rprintf("done. read %d fragments\n",fcount);
|
|
724
|
|
725 // construct output structures
|
|
726 SEXP chnames;
|
|
727 int np=0; // number of protections
|
|
728 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
729 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
730 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
731 }
|
|
732 np++;
|
|
733
|
|
734 // sort
|
|
735 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
736 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
737 //}
|
|
738
|
|
739 SEXP ans;
|
|
740 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
741 vector<vector<int> >::const_iterator nsi,lsi;
|
|
742 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
743 nsi=mm1.begin()+(csi-pos.begin());
|
|
744 lsi=mm2.begin()+(csi-pos.begin());
|
|
745
|
|
746 SEXP dv,dnames_R;
|
|
747 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
|
|
748 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
749 SET_STRING_ELT(dnames_R, 1, mkChar("f"));
|
|
750 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
751
|
|
752
|
|
753
|
|
754 SEXP tv,nv,lv;
|
|
755 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
756 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
757 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
|
|
758 int* i_tv=INTEGER(tv);
|
|
759 int* i_nv=INTEGER(nv);
|
|
760 int* i_lv=INTEGER(lv);
|
|
761
|
|
762 int i=0;
|
|
763 vector<int>::const_iterator ini=nsi->begin();
|
|
764 vector<int>::const_iterator ili=lsi->begin();
|
|
765 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
766 i_tv[i]=*pi;
|
|
767 i_nv[i]=*ini++;
|
|
768 i_lv[i]=*ili++;
|
|
769 i++;
|
|
770 }
|
|
771 PROTECT(dv = allocVector(VECSXP, 3)); np++;
|
|
772 SET_VECTOR_ELT(dv, 0, tv);
|
|
773 SET_VECTOR_ELT(dv, 1, nv);
|
|
774 SET_VECTOR_ELT(dv, 2, lv);
|
|
775 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
776
|
|
777 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
778 }
|
|
779
|
|
780 setAttrib(ans,R_NamesSymbol,chnames);
|
|
781
|
|
782 #ifdef DEBUG
|
|
783 Rprintf("unprotecting %d elements\n",np);
|
|
784 #endif
|
|
785
|
|
786 UNPROTECT(np);
|
|
787 return(ans);
|
|
788 }
|
|
789
|
|
790
|
|
791 // read in regular eland files, adjusting the negative strand coordinate by sequence length
|
|
792 SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
|
|
793
|
|
794 #ifdef DEBUG
|
|
795 Rprintf("start\n");
|
|
796 #endif
|
|
797 const char* fname=CHAR(asChar(filename));
|
|
798 int read_names=*(INTEGER(read_tag_names_R));
|
|
799 int eland_tag_length=*(INTEGER(eland_tag_length_R));
|
|
800 #ifdef DEBUG
|
|
801 Rprintf("fname=%s\n",fname);
|
|
802 #endif
|
|
803
|
|
804 // main data vector
|
|
805 // chr - pos
|
|
806 vector< vector<int> > pos;
|
|
807 vector< vector<int> > posnm; // number of mismatches
|
|
808 vector< vector<string> > tagnames;
|
|
809
|
|
810 // chromosome map
|
|
811 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
812 vector<string> cnames;
|
|
813
|
|
814
|
|
815 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
816 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
817
|
|
818
|
|
819 FILE *f=fopen(fname,"rb");
|
|
820 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
821 else {
|
|
822 Rprintf("opened %s\n",fname);
|
|
823
|
|
824 // read in bed line
|
|
825 string line;
|
|
826 int fcount=0;
|
|
827 while(get_a_line(f,line)) {
|
|
828
|
|
829 #ifdef DEBUG
|
|
830 Rprintf("line: %s\n",line.c_str());
|
|
831 #endif
|
|
832
|
|
833
|
|
834 tokType tok(line, sep);
|
|
835 tokType::iterator sit=tok.begin();
|
|
836 if(sit!=tok.end()) {
|
|
837 string tagname=*sit++;
|
|
838 string sequence=*sit++;
|
|
839 int len=sequence.size();
|
|
840 // adjust probe length if eland length limit was specified
|
|
841 if(eland_tag_length>0 && len>eland_tag_length) {
|
|
842 len=eland_tag_length;
|
|
843 }
|
|
844 string str_nm=*sit++;
|
|
845 int nm=0;
|
|
846 if(str_nm[0]=='U') {
|
|
847 nm=atoi((str_nm.c_str()+1));
|
|
848 } else {
|
|
849 continue;
|
|
850 }
|
|
851 sit++; sit++; sit++;
|
|
852 string chr=*sit++;
|
|
853 string str_pos=*sit++;
|
|
854 int fpos=atoi(str_pos.c_str());
|
|
855 string str_strand=*sit++;
|
|
856
|
|
857 if(str_strand[0]=='R') {
|
|
858 fpos=-1*(fpos+len-1);
|
|
859 }
|
|
860
|
|
861 // determine the chromosome index
|
|
862 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
863 int cind=-1;
|
|
864 if(li==cind_map.end()) {
|
|
865 // register new chromosome
|
|
866 cind=cnames.size();
|
|
867 cnames.push_back(chr);
|
|
868 cind_map[chr]=cind;
|
|
869 // allocate new pos vector
|
|
870 pos.push_back(vector<int>());
|
|
871 posnm.push_back(vector<int>());
|
|
872 if(read_names) {
|
|
873 tagnames.push_back(vector<string>());
|
|
874 }
|
|
875 #ifdef DEBUG
|
|
876 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
877 #endif
|
|
878 } else {
|
|
879 cind=li->second;
|
|
880 }
|
|
881 fcount++;
|
|
882 (pos[cind]).push_back(fpos);
|
|
883 (posnm[cind]).push_back(nm);
|
|
884 if(read_names) {
|
|
885 (tagnames[cind]).push_back(tagname);
|
|
886 }
|
|
887 #ifdef DEBUG
|
|
888 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
889 if(fcount>30) {
|
|
890 break;
|
|
891 }
|
|
892 #endif
|
|
893
|
|
894 }
|
|
895 }
|
|
896 fclose(f);
|
|
897
|
|
898 Rprintf("done. read %d fragments\n",fcount);
|
|
899 }
|
|
900 // construct output structures
|
|
901 SEXP chnames;
|
|
902 int np=0; // number of protections
|
|
903 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
904 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
905 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
906 }
|
|
907 np++;
|
|
908
|
|
909 // sort
|
|
910 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
911 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
912 //}
|
|
913
|
|
914 SEXP ans;
|
|
915 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
916 vector<vector<int> >::const_iterator nsi;
|
|
917 vector<vector<string> >::const_iterator ssi;
|
|
918 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
919 nsi=posnm.begin()+(csi-pos.begin());
|
|
920
|
|
921 SEXP dv,dnames_R;
|
|
922 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
923 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
924 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
925 if(read_names) {
|
|
926 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
927 }
|
|
928
|
|
929
|
|
930
|
|
931 SEXP tv,nv,sv;
|
|
932 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
933 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
934 if(read_names) {
|
|
935 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
936 }
|
|
937 int* i_tv=INTEGER(tv);
|
|
938 int* i_nv=INTEGER(nv);
|
|
939
|
|
940 int i=0;
|
|
941 vector<int>::const_iterator ini=nsi->begin();
|
|
942 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
943 i_tv[i]=*pi;
|
|
944 i_nv[i]=*ini++;
|
|
945 i++;
|
|
946 }
|
|
947 if(read_names) {
|
|
948 int i=0;
|
|
949 ssi=tagnames.begin()+(csi-pos.begin());
|
|
950 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
951 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
952 i++;
|
|
953 }
|
|
954 }
|
|
955 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
956 SET_VECTOR_ELT(dv, 0, tv);
|
|
957 SET_VECTOR_ELT(dv, 1, nv);
|
|
958 if(read_names) {
|
|
959 SET_VECTOR_ELT(dv, 2, sv);
|
|
960 }
|
|
961 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
962
|
|
963 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
964 }
|
|
965
|
|
966 setAttrib(ans,R_NamesSymbol,chnames);
|
|
967
|
|
968 #ifdef DEBUG
|
|
969 Rprintf("unprotecting %d elements\n",np);
|
|
970 #endif
|
|
971
|
|
972 UNPROTECT(np);
|
|
973 return(ans);
|
|
974 }
|
|
975
|
|
976
|
|
977
|
|
978 // read in extended eland files, adjusting the negative strand coordinate by sequence length
|
|
979 SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
|
|
980
|
|
981 #ifdef DEBUG
|
|
982 Rprintf("start\n");
|
|
983 #endif
|
|
984 const char* fname=CHAR(asChar(filename));
|
|
985 int read_names=*(INTEGER(read_tag_names_R));
|
|
986 int eland_tag_length=*(INTEGER(eland_tag_length_R));
|
|
987 #ifdef DEBUG
|
|
988 Rprintf("fname=%s\n",fname);
|
|
989 #endif
|
|
990
|
|
991 // main data vector
|
|
992 // chr - pos
|
|
993 vector< vector<int> > pos;
|
|
994 vector< vector<int> > posnm; // number of mismatches
|
|
995 vector< vector<string> > tagnames;
|
|
996
|
|
997 // chromosome map
|
|
998 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
999 vector<string> cnames;
|
|
1000
|
|
1001
|
|
1002 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
1003 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
1004
|
|
1005
|
|
1006 FILE *f=fopen(fname,"rb");
|
|
1007 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
1008 else {
|
|
1009 Rprintf("opened %s\n",fname);
|
|
1010
|
|
1011 // read in bed line
|
|
1012 string line;
|
|
1013 int fcount=0;
|
|
1014 while(get_a_line(f,line)) {
|
|
1015
|
|
1016 #ifdef DEBUG
|
|
1017 Rprintf("line: %s\n",line.c_str());
|
|
1018 #endif
|
|
1019
|
|
1020
|
|
1021 tokType tok(line, sep);
|
|
1022 tokType::iterator sit=tok.begin();
|
|
1023 if(sit!=tok.end()) {
|
|
1024 string machinename=*sit++;
|
|
1025 string runnumber=*sit++;
|
|
1026 string lanenumber=*sit++;
|
|
1027 *sit++;
|
|
1028
|
|
1029 string str_x=*sit++;
|
|
1030 string str_y=*sit++;
|
|
1031
|
|
1032 string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y;
|
|
1033
|
|
1034
|
|
1035
|
|
1036 *sit++;
|
|
1037 *sit++;
|
|
1038
|
|
1039
|
|
1040 string sequence=*sit++;
|
|
1041 *sit++;
|
|
1042
|
|
1043 string chr=*sit++;
|
|
1044 string contig=*sit++;
|
|
1045 chr=chr+contig;
|
|
1046
|
|
1047 int len=sequence.size();
|
|
1048 // adjust probe length if eland length limit was specified
|
|
1049 if(eland_tag_length>0 && len>eland_tag_length) {
|
|
1050 len=eland_tag_length;
|
|
1051 }
|
|
1052
|
|
1053
|
|
1054
|
|
1055 string str_pos=*sit++;
|
|
1056 if(str_pos.size()<1) { continue; }
|
|
1057 int fpos=atoi(str_pos.c_str());
|
|
1058 string str_strand=*sit++;
|
|
1059
|
|
1060 if(str_strand[0]=='R') {
|
|
1061 fpos=-1*(fpos+len-1);
|
|
1062 }
|
|
1063
|
|
1064 string str_nm=*sit++;
|
|
1065 // count non-digit characters
|
|
1066 int nm=0;
|
|
1067 for(int i=0;i<str_nm.size();i++) {
|
|
1068 if(!isdigit(str_nm[i])) { nm++; }
|
|
1069 }
|
|
1070
|
|
1071 // determine the chromosome index
|
|
1072 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
1073 int cind=-1;
|
|
1074 if(li==cind_map.end()) {
|
|
1075 // register new chromosome
|
|
1076 cind=cnames.size();
|
|
1077 cnames.push_back(chr);
|
|
1078 cind_map[chr]=cind;
|
|
1079 // allocate new pos vector
|
|
1080 pos.push_back(vector<int>());
|
|
1081 posnm.push_back(vector<int>());
|
|
1082 if(read_names) {
|
|
1083 tagnames.push_back(vector<string>());
|
|
1084 }
|
|
1085 #ifdef DEBUG
|
|
1086 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
1087 #endif
|
|
1088 } else {
|
|
1089 cind=li->second;
|
|
1090 }
|
|
1091 fcount++;
|
|
1092 (pos[cind]).push_back(fpos);
|
|
1093 (posnm[cind]).push_back(nm);
|
|
1094 if(read_names) {
|
|
1095 (tagnames[cind]).push_back(tagname);
|
|
1096 }
|
|
1097 #ifdef DEBUG
|
|
1098 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
1099 if(fcount>30) {
|
|
1100 break;
|
|
1101 }
|
|
1102 #endif
|
|
1103
|
|
1104 }
|
|
1105 }
|
|
1106 fclose(f);
|
|
1107
|
|
1108 Rprintf("done. read %d fragments\n",fcount);
|
|
1109 }
|
|
1110 // construct output structures
|
|
1111 SEXP chnames;
|
|
1112 int np=0; // number of protections
|
|
1113 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
1114 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
1115 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
1116 }
|
|
1117 np++;
|
|
1118
|
|
1119 // sort
|
|
1120 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1121 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
1122 //}
|
|
1123
|
|
1124 SEXP ans;
|
|
1125 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
1126 vector<vector<int> >::const_iterator nsi;
|
|
1127 vector<vector<string> >::const_iterator ssi;
|
|
1128 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1129 nsi=posnm.begin()+(csi-pos.begin());
|
|
1130
|
|
1131 SEXP dv,dnames_R;
|
|
1132 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
1133 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
1134 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
1135 if(read_names) {
|
|
1136 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
1137 }
|
|
1138
|
|
1139
|
|
1140
|
|
1141 SEXP tv,nv,sv;
|
|
1142 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
1143 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
1144 if(read_names) {
|
|
1145 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
1146 }
|
|
1147 int* i_tv=INTEGER(tv);
|
|
1148 int* i_nv=INTEGER(nv);
|
|
1149
|
|
1150 int i=0;
|
|
1151 vector<int>::const_iterator ini=nsi->begin();
|
|
1152 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
1153 i_tv[i]=*pi;
|
|
1154 i_nv[i]=*ini++;
|
|
1155 i++;
|
|
1156 }
|
|
1157 if(read_names) {
|
|
1158 int i=0;
|
|
1159 ssi=tagnames.begin()+(csi-pos.begin());
|
|
1160 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
1161 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
1162 i++;
|
|
1163 }
|
|
1164 }
|
|
1165 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
1166 SET_VECTOR_ELT(dv, 0, tv);
|
|
1167 SET_VECTOR_ELT(dv, 1, nv);
|
|
1168 if(read_names) {
|
|
1169 SET_VECTOR_ELT(dv, 2, sv);
|
|
1170 }
|
|
1171 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
1172
|
|
1173 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
1174 }
|
|
1175
|
|
1176 setAttrib(ans,R_NamesSymbol,chnames);
|
|
1177
|
|
1178 #ifdef DEBUG
|
|
1179 Rprintf("unprotecting %d elements\n",np);
|
|
1180 #endif
|
|
1181
|
|
1182 UNPROTECT(np);
|
|
1183 return(ans);
|
|
1184 }
|
|
1185
|
|
1186
|
|
1187 // read in eland multi files, adjusting the negative strand coordinate by sequence length
|
|
1188 SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
|
|
1189
|
|
1190 #ifdef DEBUG
|
|
1191 Rprintf("read_eland_muti() : start\n");
|
|
1192 #endif
|
|
1193 const char* fname=CHAR(asChar(filename));
|
|
1194 int read_names=*(INTEGER(read_tag_names_R));
|
|
1195 int eland_tag_length=*(INTEGER(eland_tag_length_R));
|
|
1196 #ifdef DEBUG
|
|
1197 Rprintf("fname=%s\n",fname);
|
|
1198 #endif
|
|
1199
|
|
1200 // main data vector
|
|
1201 // chr - pos
|
|
1202 vector< vector<int> > pos;
|
|
1203 vector< vector<int> > posnm; // number of mismatches
|
|
1204 vector< vector<string> > tagnames;
|
|
1205
|
|
1206 // chromosome map
|
|
1207 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
1208 vector<string> cnames;
|
|
1209
|
|
1210
|
|
1211 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
1212 boost::char_separator<char> sep(" \t","");
|
|
1213 boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
|
|
1214 boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
|
|
1215
|
|
1216
|
|
1217 FILE *f=fopen(fname,"rb");
|
|
1218 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
1219 else {
|
|
1220 Rprintf("opened %s\n",fname);
|
|
1221
|
|
1222 // read in bed line
|
|
1223 string line;
|
|
1224 int nline=0;
|
|
1225 int fcount=0;
|
|
1226 while(get_a_line(f,line)) {
|
|
1227 nline++;
|
|
1228 // chomp
|
|
1229 size_t elpos = line.find_last_not_of("\n");
|
|
1230 if(elpos != string::npos) {
|
|
1231 line = line.substr(0, elpos+1);
|
|
1232 }
|
|
1233 #ifdef DEBUG
|
|
1234 Rprintf("line %d: %s\n",nline,line.c_str());
|
|
1235 #endif
|
|
1236
|
|
1237 tokType tok(line, sep);
|
|
1238 tokType::iterator sit=tok.begin();
|
|
1239 if(sit!=tok.end()) {
|
|
1240 string tagname=*sit++;
|
|
1241 string sequence=*sit++;
|
|
1242 string mspec=*sit++;
|
|
1243 // parse out match spec
|
|
1244
|
|
1245 if(mspec=="NM" || mspec=="QC") { continue; }
|
|
1246 #ifdef DEBUG
|
|
1247 Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
|
|
1248 #endif
|
|
1249
|
|
1250 tokType stok(mspec, colsep);
|
|
1251 tokType::iterator ssit=stok.begin();
|
|
1252 string str_nm0=*ssit++;
|
|
1253
|
|
1254 int nm=0;
|
|
1255 int nm0=atoi(str_nm0.c_str());
|
|
1256 if(nm0>1) {
|
|
1257 #ifdef DEBUG
|
|
1258 Rprintf("rejected for nm0\n");
|
|
1259 #endif
|
|
1260 continue;
|
|
1261 }
|
|
1262 if(nm0==0) {
|
|
1263 string str_nm1=*ssit++;
|
|
1264 int nm1=atoi(str_nm1.c_str());
|
|
1265 if(nm1>1) {
|
|
1266 #ifdef DEBUG
|
|
1267 Rprintf("rejected for nm1\n");
|
|
1268 #endif
|
|
1269 continue;
|
|
1270 }
|
|
1271 if(nm1==0) {
|
|
1272 string str_nm2=*ssit++;
|
|
1273 int nm2=atoi(str_nm2.c_str());
|
|
1274 if(nm2>1) {
|
|
1275 #ifdef DEBUG
|
|
1276 Rprintf("rejected for nm2\n");
|
|
1277 #endif
|
|
1278 continue;
|
|
1279 }
|
|
1280 nm=2;
|
|
1281 } else {
|
|
1282 nm=1;
|
|
1283 }
|
|
1284 }
|
|
1285
|
|
1286 #ifdef DEBUG
|
|
1287 Rprintf("accepted (nm=%d)\n",nm);
|
|
1288 #endif
|
|
1289 int npos=0;
|
|
1290 string mpos=*sit++;
|
|
1291 vector<string> mposc;
|
|
1292 vector<int> mposp;
|
|
1293 tokType ptok(mpos, comsep);
|
|
1294 string prevchr;
|
|
1295 for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) {
|
|
1296 string cpos=*psit;
|
|
1297 npos++;
|
|
1298 int strand=1;
|
|
1299 if(cpos.size()<5) {
|
|
1300 Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
|
|
1301 }
|
|
1302 char lc=cpos.at(cpos.size()-1);
|
|
1303
|
|
1304 if(atoi(&lc)==nm) {
|
|
1305 switch(cpos.at(cpos.size()-2)) {
|
|
1306 case 'R': strand=-1; break;
|
|
1307 case 'F': strand=1; break;
|
|
1308 default:
|
|
1309 Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break;
|
|
1310 continue;
|
|
1311 }
|
|
1312 string chr,str_pos;
|
|
1313 size_t colpos=cpos.find(":");
|
|
1314 if(colpos==string::npos) {
|
|
1315 if(npos>1) {
|
|
1316 chr=prevchr;
|
|
1317 str_pos=cpos.substr(0,cpos.size()-2);
|
|
1318 } else {
|
|
1319 Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str());
|
|
1320 continue;
|
|
1321 }
|
|
1322 } else {
|
|
1323 chr=cpos.substr(0,colpos);
|
|
1324 str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
|
|
1325 }
|
|
1326 #ifdef DEBUG
|
|
1327 Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
|
|
1328 #endif
|
|
1329 int pos=strand*atoi(str_pos.c_str());
|
|
1330 mposc.push_back(chr);
|
|
1331 mposp.push_back(pos);
|
|
1332 }
|
|
1333 }
|
|
1334
|
|
1335 string chr;
|
|
1336 int fpos;
|
|
1337 if(mposc.size()!=1) {
|
|
1338 if(mposc.size()==0) {
|
|
1339 Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str());
|
|
1340 } else {
|
|
1341 Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str());
|
|
1342 }
|
|
1343 continue;
|
|
1344 } else {
|
|
1345 chr=*mposc.begin();
|
|
1346 fpos=*mposp.begin();
|
|
1347 }
|
|
1348
|
|
1349 int len=sequence.size();
|
|
1350 // adjust probe length if eland length limit was specified
|
|
1351 if(eland_tag_length>0 && len>eland_tag_length) {
|
|
1352 len=eland_tag_length;
|
|
1353 }
|
|
1354
|
|
1355 if(fpos<0) {
|
|
1356 fpos=-1*(-1*fpos+len-1);
|
|
1357 }
|
|
1358
|
|
1359 // determine the chromosome index
|
|
1360 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
1361 int cind=-1;
|
|
1362 if(li==cind_map.end()) {
|
|
1363 // register new chromosome
|
|
1364 cind=cnames.size();
|
|
1365 cnames.push_back(chr);
|
|
1366 cind_map[chr]=cind;
|
|
1367 // allocate new pos vector
|
|
1368 pos.push_back(vector<int>());
|
|
1369 posnm.push_back(vector<int>());
|
|
1370 if(read_names) {
|
|
1371 tagnames.push_back(vector<string>());
|
|
1372 }
|
|
1373 #ifdef DEBUG
|
|
1374 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
1375 #endif
|
|
1376 } else {
|
|
1377 cind=li->second;
|
|
1378 }
|
|
1379 fcount++;
|
|
1380 (pos[cind]).push_back(fpos);
|
|
1381 (posnm[cind]).push_back(nm);
|
|
1382 if(read_names) {
|
|
1383 (tagnames[cind]).push_back(tagname);
|
|
1384 }
|
|
1385 #ifdef DEBUG
|
|
1386 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
1387 if(fcount>30) {
|
|
1388 break;
|
|
1389 }
|
|
1390 #endif
|
|
1391
|
|
1392 }
|
|
1393 }
|
|
1394 fclose(f);
|
|
1395
|
|
1396 Rprintf("done. read %d fragments\n",fcount);
|
|
1397 }
|
|
1398 // construct output structures
|
|
1399 SEXP chnames;
|
|
1400 int np=0; // number of protections
|
|
1401 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
1402 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
1403 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
1404 }
|
|
1405 np++;
|
|
1406
|
|
1407 // sort
|
|
1408 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1409 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
1410 //}
|
|
1411
|
|
1412 SEXP ans;
|
|
1413 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
1414 vector<vector<int> >::const_iterator nsi;
|
|
1415 vector<vector<string> >::const_iterator ssi;
|
|
1416 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1417 nsi=posnm.begin()+(csi-pos.begin());
|
|
1418
|
|
1419 SEXP dv,dnames_R;
|
|
1420 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
1421 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
1422 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
1423 if(read_names) {
|
|
1424 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
1425 }
|
|
1426
|
|
1427
|
|
1428
|
|
1429 SEXP tv,nv,sv;
|
|
1430 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
1431 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
1432 if(read_names) {
|
|
1433 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
1434 }
|
|
1435 int* i_tv=INTEGER(tv);
|
|
1436 int* i_nv=INTEGER(nv);
|
|
1437
|
|
1438 int i=0;
|
|
1439 vector<int>::const_iterator ini=nsi->begin();
|
|
1440 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
1441 i_tv[i]=*pi;
|
|
1442 i_nv[i]=*ini++;
|
|
1443 i++;
|
|
1444 }
|
|
1445 if(read_names) {
|
|
1446 int i=0;
|
|
1447 ssi=tagnames.begin()+(csi-pos.begin());
|
|
1448 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
1449 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
1450 i++;
|
|
1451 }
|
|
1452 }
|
|
1453 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
1454 SET_VECTOR_ELT(dv, 0, tv);
|
|
1455 SET_VECTOR_ELT(dv, 1, nv);
|
|
1456 if(read_names) {
|
|
1457 SET_VECTOR_ELT(dv, 2, sv);
|
|
1458 }
|
|
1459 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
1460
|
|
1461 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
1462 }
|
|
1463
|
|
1464 setAttrib(ans,R_NamesSymbol,chnames);
|
|
1465
|
|
1466 #ifdef DEBUG
|
|
1467 Rprintf("unprotecting %d elements\n",np);
|
|
1468 #endif
|
|
1469
|
|
1470 UNPROTECT(np);
|
|
1471 return(ans);
|
|
1472 }
|
|
1473
|
|
1474
|
|
1475 // read in regular eland files, adjusting the negative strand coordinate by sequence length
|
|
1476 SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
|
|
1477
|
|
1478 #ifdef DEBUG
|
|
1479 Rprintf("start\n");
|
|
1480 #endif
|
|
1481 const char* fname=CHAR(asChar(filename));
|
|
1482 int read_names=*(INTEGER(read_tag_names_R));
|
|
1483 #ifdef DEBUG
|
|
1484 Rprintf("fname=%s\n",fname);
|
|
1485 #endif
|
|
1486
|
|
1487 // main data vector
|
|
1488 // chr - pos
|
|
1489 vector< vector<int> > pos;
|
|
1490 vector< vector<int> > posnm; // number of mismatches
|
|
1491 vector< vector<string> > tagnames;
|
|
1492
|
|
1493 // chromosome map
|
|
1494 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
1495 vector<string> cnames;
|
|
1496
|
|
1497
|
|
1498 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
1499 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
1500 boost::char_separator<char> sep2(",");
|
|
1501
|
|
1502
|
|
1503 FILE *f=fopen(fname,"rb");
|
|
1504 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
|
|
1505 } else {
|
|
1506 #ifdef HAVE_LIBBZ2
|
|
1507 BZFILE* b;
|
|
1508 int bzerror;
|
|
1509
|
|
1510 int bz2file=0;
|
|
1511 if(strstr(fname,".bz2")) {
|
|
1512 bz2file=1;
|
|
1513 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
|
|
1514 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
|
|
1515 }
|
|
1516 #endif
|
|
1517
|
|
1518 Rprintf("opened %s\n",fname);
|
|
1519
|
|
1520 // read in bed line
|
|
1521 string line;
|
|
1522 int fcount=0;
|
|
1523 #ifdef HAVE_LIBBZ2
|
|
1524 while(get_a_line(f,b,bz2file,line)) {
|
|
1525 #else
|
|
1526 while(get_a_line(f,line)) {
|
|
1527 #endif
|
|
1528
|
|
1529 #ifdef DEBUG
|
|
1530 Rprintf("line: %s\n",line.c_str());
|
|
1531 #endif
|
|
1532
|
|
1533
|
|
1534 tokType tok(line, sep);
|
|
1535 tokType::iterator sit=tok.begin();
|
|
1536 if(sit!=tok.end()) {
|
|
1537 string tagname=*sit++;
|
|
1538 string str_strand=*sit++;
|
|
1539 string chr=*sit++;
|
|
1540
|
|
1541 string str_pos=*sit++;
|
|
1542 int fpos=atoi(str_pos.c_str());
|
|
1543
|
|
1544 string sequence=*sit++;
|
|
1545 sit++; sit++;
|
|
1546 string mm=*sit++;
|
|
1547
|
|
1548 int len=sequence.size();
|
|
1549 if(str_strand[0]=='-') {
|
|
1550 fpos=-1*(fpos+len-1);
|
|
1551 }
|
|
1552 // determine number of mismatches
|
|
1553 int nm=0;
|
|
1554 if(mm.size()>0) {
|
|
1555 nm++;
|
|
1556 string::size_type tp(0);
|
|
1557 while(tp!=string::npos) {
|
|
1558 tp = mm.find(",",tp);
|
|
1559 if(tp!=string::npos) {
|
|
1560 tp++;
|
|
1561 ++nm;
|
|
1562 }
|
|
1563 }
|
|
1564 }
|
|
1565
|
|
1566
|
|
1567
|
|
1568 // determine the chromosome index
|
|
1569 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
1570 int cind=-1;
|
|
1571 if(li==cind_map.end()) {
|
|
1572 // register new chromosome
|
|
1573 cind=cnames.size();
|
|
1574 cnames.push_back(chr);
|
|
1575 cind_map[chr]=cind;
|
|
1576 // allocate new pos vector
|
|
1577 pos.push_back(vector<int>());
|
|
1578 posnm.push_back(vector<int>());
|
|
1579 if(read_names) {
|
|
1580 tagnames.push_back(vector<string>());
|
|
1581 }
|
|
1582 #ifdef DEBUG
|
|
1583 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
1584 #endif
|
|
1585 } else {
|
|
1586 cind=li->second;
|
|
1587 }
|
|
1588 fcount++;
|
|
1589 (pos[cind]).push_back(fpos);
|
|
1590 (posnm[cind]).push_back(nm);
|
|
1591 if(read_names) {
|
|
1592 (tagnames[cind]).push_back(tagname);
|
|
1593 }
|
|
1594 #ifdef DEBUG
|
|
1595 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
1596 if(fcount>30) {
|
|
1597 break;
|
|
1598 }
|
|
1599 #endif
|
|
1600
|
|
1601 }
|
|
1602 }
|
|
1603
|
|
1604 #ifdef HAVE_LIBBZ2
|
|
1605 BZ2_bzReadClose( &bzerror, b);
|
|
1606 #endif
|
|
1607 fclose(f);
|
|
1608
|
|
1609 Rprintf("done. read %d fragments\n",fcount);
|
|
1610 }
|
|
1611 // construct output structures
|
|
1612 SEXP chnames;
|
|
1613 int np=0; // number of protections
|
|
1614 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
1615 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
1616 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
1617 }
|
|
1618 np++;
|
|
1619
|
|
1620 // sort
|
|
1621 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1622 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
1623 //}
|
|
1624
|
|
1625 SEXP ans;
|
|
1626 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
1627 vector<vector<int> >::const_iterator nsi;
|
|
1628 vector<vector<string> >::const_iterator ssi;
|
|
1629 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1630 nsi=posnm.begin()+(csi-pos.begin());
|
|
1631
|
|
1632 SEXP dv,dnames_R;
|
|
1633 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
1634 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
1635 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
1636 if(read_names) {
|
|
1637 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
1638 }
|
|
1639
|
|
1640
|
|
1641
|
|
1642 SEXP tv,nv,sv;
|
|
1643 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
1644 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
1645 if(read_names) {
|
|
1646 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
1647 }
|
|
1648 int* i_tv=INTEGER(tv);
|
|
1649 int* i_nv=INTEGER(nv);
|
|
1650
|
|
1651 int i=0;
|
|
1652 vector<int>::const_iterator ini=nsi->begin();
|
|
1653 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
1654 i_tv[i]=*pi;
|
|
1655 i_nv[i]=*ini++;
|
|
1656 i++;
|
|
1657 }
|
|
1658 if(read_names) {
|
|
1659 int i=0;
|
|
1660 ssi=tagnames.begin()+(csi-pos.begin());
|
|
1661 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
1662 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
1663 i++;
|
|
1664 }
|
|
1665 }
|
|
1666 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
1667 SET_VECTOR_ELT(dv, 0, tv);
|
|
1668 SET_VECTOR_ELT(dv, 1, nv);
|
|
1669 if(read_names) {
|
|
1670 SET_VECTOR_ELT(dv, 2, sv);
|
|
1671 }
|
|
1672 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
1673
|
|
1674 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
1675 }
|
|
1676
|
|
1677 setAttrib(ans,R_NamesSymbol,chnames);
|
|
1678
|
|
1679 #ifdef DEBUG
|
|
1680 Rprintf("unprotecting %d elements\n",np);
|
|
1681 #endif
|
|
1682
|
|
1683 UNPROTECT(np);
|
|
1684 return(ans);
|
|
1685 }
|
|
1686
|
|
1687
|
|
1688 // read in helicos tab-separated alignment output (regular or bz2)
|
|
1689 SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
|
|
1690
|
|
1691 #ifdef DEBUG
|
|
1692 Rprintf("start\n");
|
|
1693 #endif
|
|
1694 const char* fname=CHAR(asChar(filename));
|
|
1695 int read_names=*(INTEGER(read_tag_names_R));
|
|
1696 #ifdef DEBUG
|
|
1697 Rprintf("fname=%s\n",fname);
|
|
1698 #endif
|
|
1699
|
|
1700 // main data vector
|
|
1701 // chr - pos
|
|
1702 vector< vector<int> > pos;
|
|
1703 vector< vector<int> > posnm; // number of mismatches
|
|
1704 vector< vector<int> > poslen; // length of the match
|
|
1705 vector< vector<string> > tagnames;
|
|
1706
|
|
1707 // chromosome map
|
|
1708 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
1709 vector<string> cnames;
|
|
1710
|
|
1711
|
|
1712 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
1713 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
1714 boost::char_separator<char> sep2(",");
|
|
1715
|
|
1716
|
|
1717 FILE *f=fopen(fname,"rb");
|
|
1718 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
|
|
1719 } else {
|
|
1720 #ifdef HAVE_LIBBZ2
|
|
1721 BZFILE* b;
|
|
1722 int bzerror;
|
|
1723
|
|
1724 int bz2file=0;
|
|
1725 if(strstr(fname,".bz2")) {
|
|
1726 bz2file=1;
|
|
1727 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
|
|
1728 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
|
|
1729 }
|
|
1730 #endif
|
|
1731
|
|
1732 Rprintf("opened %s\n",fname);
|
|
1733
|
|
1734 // read in bed line
|
|
1735 string line;
|
|
1736 int fcount=0;
|
|
1737 int nlines=0;
|
|
1738 #ifdef HAVE_LIBBZ2
|
|
1739 while(get_a_line(f,b,bz2file,line)) {
|
|
1740 #else
|
|
1741 while(get_a_line(f,line)) {
|
|
1742 #endif
|
|
1743
|
|
1744 #ifdef DEBUG
|
|
1745 Rprintf("line: %s\n",line.c_str());
|
|
1746 #endif
|
|
1747 nlines++;
|
|
1748 // skip comments
|
|
1749 if(line[0]=='#') { continue; }
|
|
1750 if(line.compare(0,12,"Reference_ID")==0) {
|
|
1751 #ifdef DEBUG
|
|
1752 Rprintf("matched header on line %d\n",nlines);
|
|
1753 #endif
|
|
1754 continue;
|
|
1755 }
|
|
1756
|
|
1757 tokType tok(line, sep);
|
|
1758 tokType::iterator sit=tok.begin();
|
|
1759 if(sit!=tok.end()) {
|
|
1760 string chr=*sit++;
|
|
1761 string tagname=*sit++;
|
|
1762 string str_startpos=*sit++;
|
|
1763 string str_endpos=*sit++;
|
|
1764
|
|
1765 string str_tstart=*sit++;
|
|
1766 string str_tend=*sit++;
|
|
1767 int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str());
|
|
1768
|
|
1769 sit++; sit++;
|
|
1770 string str_ndel=*sit++;
|
|
1771 string str_nins=*sit++;
|
|
1772 string str_nsub=*sit++;
|
|
1773
|
|
1774 string str_strand=*sit++;
|
|
1775 int fpos;
|
|
1776 if(str_strand[0]=='-') {
|
|
1777 fpos=-1*atoi(str_endpos.c_str());
|
|
1778 } else {
|
|
1779 fpos=atoi(str_startpos.c_str());
|
|
1780 }
|
|
1781
|
|
1782 // determine number of mismatches
|
|
1783 int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str());
|
|
1784
|
|
1785 // determine the chromosome index
|
|
1786 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
1787 int cind=-1;
|
|
1788 if(li==cind_map.end()) {
|
|
1789 // register new chromosome
|
|
1790 cind=cnames.size();
|
|
1791 cnames.push_back(chr);
|
|
1792 cind_map[chr]=cind;
|
|
1793 // allocate new pos vector
|
|
1794 pos.push_back(vector<int>());
|
|
1795 posnm.push_back(vector<int>());
|
|
1796 poslen.push_back(vector<int>());
|
|
1797 if(read_names) {
|
|
1798 tagnames.push_back(vector<string>());
|
|
1799 }
|
|
1800 #ifdef DEBUG
|
|
1801 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
1802 #endif
|
|
1803 } else {
|
|
1804 cind=li->second;
|
|
1805 }
|
|
1806 fcount++;
|
|
1807 (pos[cind]).push_back(fpos);
|
|
1808 (posnm[cind]).push_back(nm);
|
|
1809 (poslen[cind]).push_back(len);
|
|
1810 if(read_names) {
|
|
1811 (tagnames[cind]).push_back(tagname);
|
|
1812 }
|
|
1813 #ifdef DEBUG
|
|
1814 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
|
|
1815 if(fcount>30) {
|
|
1816 break;
|
|
1817 }
|
|
1818 #endif
|
|
1819
|
|
1820 }
|
|
1821 }
|
|
1822
|
|
1823 #ifdef HAVE_LIBBZ2
|
|
1824 BZ2_bzReadClose( &bzerror, b);
|
|
1825 #endif
|
|
1826 fclose(f);
|
|
1827
|
|
1828 Rprintf("done. read %d fragments\n",fcount);
|
|
1829 }
|
|
1830 // construct output structures
|
|
1831 SEXP chnames;
|
|
1832 int np=0; // number of protections
|
|
1833 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
1834 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
1835 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
1836 }
|
|
1837 np++;
|
|
1838
|
|
1839 // sort
|
|
1840 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1841 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
1842 //}
|
|
1843
|
|
1844 SEXP ans;
|
|
1845 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
1846 vector<vector<int> >::const_iterator nsi;
|
|
1847 vector<vector<int> >::const_iterator lsi;
|
|
1848 vector<vector<string> >::const_iterator ssi;
|
|
1849 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
1850 nsi=posnm.begin()+(csi-pos.begin());
|
|
1851 lsi=poslen.begin()+(csi-pos.begin());
|
|
1852
|
|
1853 SEXP dv,dnames_R;
|
|
1854 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
|
|
1855 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
1856 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
1857 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
|
|
1858 if(read_names) {
|
|
1859 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
|
|
1860 }
|
|
1861
|
|
1862
|
|
1863
|
|
1864 SEXP tv,nv,lv,sv;
|
|
1865 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
1866 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
1867 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
|
|
1868 if(read_names) {
|
|
1869 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
1870 }
|
|
1871 int* i_tv=INTEGER(tv);
|
|
1872 int* i_nv=INTEGER(nv);
|
|
1873 int* i_lv=INTEGER(lv);
|
|
1874
|
|
1875 int i=0;
|
|
1876 vector<int>::const_iterator ini=nsi->begin();
|
|
1877 vector<int>::const_iterator lni=lsi->begin();
|
|
1878 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
1879 i_tv[i]=*pi;
|
|
1880 i_nv[i]=*ini++;
|
|
1881 i_lv[i]=*lni++;
|
|
1882 i++;
|
|
1883 }
|
|
1884 if(read_names) {
|
|
1885 int i=0;
|
|
1886 ssi=tagnames.begin()+(csi-pos.begin());
|
|
1887 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
1888 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
1889 i++;
|
|
1890 }
|
|
1891 }
|
|
1892 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
|
|
1893 SET_VECTOR_ELT(dv, 0, tv);
|
|
1894 SET_VECTOR_ELT(dv, 1, nv);
|
|
1895 SET_VECTOR_ELT(dv, 2, lv);
|
|
1896 if(read_names) {
|
|
1897 SET_VECTOR_ELT(dv, 3, sv);
|
|
1898 }
|
|
1899 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
1900
|
|
1901 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
1902 }
|
|
1903
|
|
1904 setAttrib(ans,R_NamesSymbol,chnames);
|
|
1905
|
|
1906 #ifdef DEBUG
|
|
1907 Rprintf("unprotecting %d elements\n",np);
|
|
1908 #endif
|
|
1909
|
|
1910 UNPROTECT(np);
|
|
1911 return(ans);
|
|
1912 }
|
|
1913
|
|
1914
|
|
1915
|
|
1916 // read in text version of maq map
|
|
1917 SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
|
|
1918
|
|
1919 #ifdef DEBUG
|
|
1920 Rprintf("start\n");
|
|
1921 #endif
|
|
1922 const char* fname=CHAR(asChar(filename));
|
|
1923 int read_names=*(INTEGER(read_tag_names_R));
|
|
1924 #ifdef DEBUG
|
|
1925 Rprintf("fname=%s\n",fname);
|
|
1926 #endif
|
|
1927
|
|
1928 // main data vector
|
|
1929 // chr - pos
|
|
1930 vector< vector<int> > pos;
|
|
1931 vector< vector<int> > posnm; // number of mismatches
|
|
1932 vector< vector<string> > tagnames;
|
|
1933
|
|
1934 // chromosome map
|
|
1935 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
1936 vector<string> cnames;
|
|
1937
|
|
1938
|
|
1939 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
1940 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
1941
|
|
1942
|
|
1943 FILE *f=fopen(fname,"rb");
|
|
1944 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
1945 else {
|
|
1946 Rprintf("opened %s\n",fname);
|
|
1947
|
|
1948 // read in bed line
|
|
1949 string line;
|
|
1950 int fcount=0;
|
|
1951 while(get_a_line(f,line)) {
|
|
1952
|
|
1953 #ifdef DEBUG
|
|
1954 Rprintf("line: %s\n",line.c_str());
|
|
1955 #endif
|
|
1956
|
|
1957
|
|
1958 tokType tok(line, sep);
|
|
1959 tokType::iterator sit=tok.begin();
|
|
1960 if(sit!=tok.end()) {
|
|
1961 string tagname=*sit++;
|
|
1962 string chr=*sit++;
|
|
1963 string str_pos=*sit++;
|
|
1964 int fpos=atoi(str_pos.c_str());
|
|
1965 string str_strand=*sit++;
|
|
1966 sit++; sit++; sit++; sit++; sit++;
|
|
1967 string str_nm=*sit++;
|
|
1968 sit++; sit++; sit++;
|
|
1969 string str_len=*sit++;
|
|
1970 int nm=atoi(str_nm.c_str());
|
|
1971 int len=atoi(str_len.c_str());
|
|
1972
|
|
1973 if(str_strand[0]=='-') {
|
|
1974 fpos=-1*(fpos+len-1);
|
|
1975 }
|
|
1976
|
|
1977 // determine the chromosome index
|
|
1978 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
1979 int cind=-1;
|
|
1980 if(li==cind_map.end()) {
|
|
1981 // register new chromosome
|
|
1982 cind=cnames.size();
|
|
1983 cnames.push_back(chr);
|
|
1984 cind_map[chr]=cind;
|
|
1985 // allocate new pos vector
|
|
1986 pos.push_back(vector<int>());
|
|
1987 posnm.push_back(vector<int>());
|
|
1988 if(read_names) {
|
|
1989 tagnames.push_back(vector<string>());
|
|
1990 }
|
|
1991 #ifdef DEBUG
|
|
1992 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
1993 #endif
|
|
1994 } else {
|
|
1995 cind=li->second;
|
|
1996 }
|
|
1997 fcount++;
|
|
1998 (pos[cind]).push_back(fpos);
|
|
1999 (posnm[cind]).push_back(nm);
|
|
2000 if(read_names) {
|
|
2001 (tagnames[cind]).push_back(tagname);
|
|
2002 }
|
|
2003 #ifdef DEBUG
|
|
2004 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
2005 if(fcount>30) {
|
|
2006 break;
|
|
2007 }
|
|
2008 #endif
|
|
2009
|
|
2010 }
|
|
2011 }
|
|
2012 fclose(f);
|
|
2013
|
|
2014 Rprintf("done. read %d fragments\n",fcount);
|
|
2015 }
|
|
2016 // construct output structures
|
|
2017 SEXP chnames;
|
|
2018 int np=0; // number of protections
|
|
2019 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
2020 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
2021 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
2022 }
|
|
2023 np++;
|
|
2024
|
|
2025 // sort
|
|
2026 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2027 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
2028 //}
|
|
2029
|
|
2030 SEXP ans;
|
|
2031 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
2032 vector<vector<int> >::const_iterator nsi;
|
|
2033 vector<vector<string> >::const_iterator ssi;
|
|
2034 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2035 nsi=posnm.begin()+(csi-pos.begin());
|
|
2036
|
|
2037 SEXP dv,dnames_R;
|
|
2038 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
2039 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
2040 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
2041 if(read_names) {
|
|
2042 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
2043 }
|
|
2044
|
|
2045
|
|
2046
|
|
2047 SEXP tv,nv,sv;
|
|
2048 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
2049 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
2050 if(read_names) {
|
|
2051 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
2052 }
|
|
2053 int* i_tv=INTEGER(tv);
|
|
2054 int* i_nv=INTEGER(nv);
|
|
2055
|
|
2056 int i=0;
|
|
2057 vector<int>::const_iterator ini=nsi->begin();
|
|
2058 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
2059 i_tv[i]=*pi;
|
|
2060 i_nv[i]=*ini++;
|
|
2061 i++;
|
|
2062 }
|
|
2063 if(read_names) {
|
|
2064 int i=0;
|
|
2065 ssi=tagnames.begin()+(csi-pos.begin());
|
|
2066 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
2067 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
2068 i++;
|
|
2069 }
|
|
2070 }
|
|
2071 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
2072 SET_VECTOR_ELT(dv, 0, tv);
|
|
2073 SET_VECTOR_ELT(dv, 1, nv);
|
|
2074 if(read_names) {
|
|
2075 SET_VECTOR_ELT(dv, 2, sv);
|
|
2076 }
|
|
2077 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
2078
|
|
2079 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
2080 }
|
|
2081
|
|
2082 setAttrib(ans,R_NamesSymbol,chnames);
|
|
2083
|
|
2084 #ifdef DEBUG
|
|
2085 Rprintf("unprotecting %d elements\n",np);
|
|
2086 #endif
|
|
2087
|
|
2088 UNPROTECT(np);
|
|
2089 return(ans);
|
|
2090 }
|
|
2091
|
|
2092
|
|
2093
|
|
2094
|
|
2095
|
|
2096 // read in tagalign file
|
|
2097 SEXP read_tagalign(SEXP filename) {
|
|
2098
|
|
2099 #ifdef DEBUG
|
|
2100 Rprintf("start\n");
|
|
2101 #endif
|
|
2102 const char* fname=CHAR(asChar(filename));
|
|
2103 #ifdef DEBUG
|
|
2104 Rprintf("fname=%s\n",fname);
|
|
2105 #endif
|
|
2106
|
|
2107 // main data vector
|
|
2108 // chr - pos
|
|
2109 vector< vector<int> > pos;
|
|
2110 vector< vector<int> > posnm; // number of mismatches
|
|
2111
|
|
2112 // chromosome map
|
|
2113 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
2114 vector<string> cnames;
|
|
2115
|
|
2116
|
|
2117 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
2118 boost::char_separator<char> sep(" \t");
|
|
2119
|
|
2120
|
|
2121 FILE *f=fopen(fname,"rb");
|
|
2122 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
2123 else {
|
|
2124 Rprintf("opened %s\n",fname);
|
|
2125
|
|
2126 // read in bed line
|
|
2127 string line;
|
|
2128 int fcount=0;
|
|
2129 while(get_a_line(f,line)) {
|
|
2130
|
|
2131 #ifdef DEBUG
|
|
2132 Rprintf("line: %s\n",line.c_str());
|
|
2133 #endif
|
|
2134
|
|
2135
|
|
2136 tokType tok(line, sep);
|
|
2137 tokType::iterator sit=tok.begin();
|
|
2138 if(sit!=tok.end()) {
|
|
2139 string chr=*sit++;
|
|
2140 string str_spos=*sit++;
|
|
2141 string str_epos=*sit++;
|
|
2142 sit++;
|
|
2143 string str_qual=*sit++;
|
|
2144 string str_strand=*sit;
|
|
2145
|
|
2146 int fpos;
|
|
2147 if(str_strand[0]=='+') {
|
|
2148 fpos=atoi(str_spos.c_str());
|
|
2149 } else {
|
|
2150 fpos=-1*atoi(str_epos.c_str());
|
|
2151 }
|
|
2152 int nm=atoi(str_qual.c_str());
|
|
2153
|
|
2154 // determine the chromosome index
|
|
2155 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
2156 int cind=-1;
|
|
2157 if(li==cind_map.end()) {
|
|
2158 // register new chromosome
|
|
2159 cind=cnames.size();
|
|
2160 cnames.push_back(chr);
|
|
2161 cind_map[chr]=cind;
|
|
2162 // allocate new pos vector
|
|
2163 pos.push_back(vector<int>());
|
|
2164 posnm.push_back(vector<int>());
|
|
2165 #ifdef DEBUG
|
|
2166 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
2167 #endif
|
|
2168 } else {
|
|
2169 cind=li->second;
|
|
2170 }
|
|
2171 fcount++;
|
|
2172 (pos[cind]).push_back(fpos);
|
|
2173 (posnm[cind]).push_back(nm);
|
|
2174 #ifdef DEBUG
|
|
2175 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
|
|
2176 if(fcount>30) {
|
|
2177 break;
|
|
2178 }
|
|
2179 #endif
|
|
2180
|
|
2181 }
|
|
2182 }
|
|
2183 fclose(f);
|
|
2184
|
|
2185 Rprintf("done. read %d fragments\n",fcount);
|
|
2186 }
|
|
2187 // construct output structures
|
|
2188 SEXP chnames;
|
|
2189 int np=0; // number of protections
|
|
2190 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
2191 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
2192 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
2193 }
|
|
2194 np++;
|
|
2195
|
|
2196 // sort
|
|
2197 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2198 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
2199 //}
|
|
2200
|
|
2201 SEXP ans;
|
|
2202 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
2203 vector<vector<int> >::const_iterator nsi;
|
|
2204 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2205 nsi=posnm.begin()+(csi-pos.begin());
|
|
2206
|
|
2207 SEXP dv,dnames_R;
|
|
2208 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
|
|
2209 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
2210 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
2211
|
|
2212
|
|
2213 SEXP tv,nv;
|
|
2214 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
2215 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
2216 int* i_tv=INTEGER(tv);
|
|
2217 int* i_nv=INTEGER(nv);
|
|
2218
|
|
2219 int i=0;
|
|
2220 vector<int>::const_iterator ini=nsi->begin();
|
|
2221 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
2222 i_tv[i]=*pi;
|
|
2223 i_nv[i]=*ini++;
|
|
2224 i++;
|
|
2225 }
|
|
2226 PROTECT(dv = allocVector(VECSXP, 2)); np++;
|
|
2227 SET_VECTOR_ELT(dv, 0, tv);
|
|
2228 SET_VECTOR_ELT(dv, 1, nv);
|
|
2229 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
2230
|
|
2231 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
2232 }
|
|
2233
|
|
2234 setAttrib(ans,R_NamesSymbol,chnames);
|
|
2235
|
|
2236 #ifdef DEBUG
|
|
2237 Rprintf("unprotecting %d elements\n",np);
|
|
2238 #endif
|
|
2239
|
|
2240 UNPROTECT(np);
|
|
2241 return(ans);
|
|
2242 }
|
|
2243
|
|
2244
|
|
2245
|
|
2246
|
|
2247 // arachne madness
|
|
2248 SEXP read_arachne(SEXP filename) {
|
|
2249
|
|
2250 #ifdef DEBUG
|
|
2251 Rprintf("start\n");
|
|
2252 #endif
|
|
2253 const char* fname=CHAR(asChar(filename));
|
|
2254 #ifdef DEBUG
|
|
2255 Rprintf("fname=%s\n",fname);
|
|
2256 #endif
|
|
2257
|
|
2258 // main data vector
|
|
2259 // chr - pos
|
|
2260 vector< vector<int> > pos;
|
|
2261 vector< vector<int> > posnm; // number of mismatches
|
|
2262
|
|
2263 // chromosome map
|
|
2264 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
2265 vector<string> cnames;
|
|
2266
|
|
2267
|
|
2268 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
2269 boost::char_separator<char> sep(" \t");
|
|
2270
|
|
2271
|
|
2272
|
|
2273
|
|
2274
|
|
2275 FILE *f=fopen(fname,"rb");
|
|
2276 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
2277 else {
|
|
2278
|
|
2279 #ifdef HAVE_LIBBZ2
|
|
2280 BZFILE* b;
|
|
2281 int bzerror;
|
|
2282
|
|
2283 int bz2file=0;
|
|
2284 if(strstr(fname,".bz2")) {
|
|
2285 bz2file=1;
|
|
2286 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
|
|
2287 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
|
|
2288 }
|
|
2289 #endif
|
|
2290
|
|
2291
|
|
2292 Rprintf("opened %s\n",fname);
|
|
2293
|
|
2294 // read in bed line
|
|
2295 string line;
|
|
2296 int fcount=0;
|
|
2297 #ifdef HAVE_LIBBZ2
|
|
2298 while(get_a_line(f,b,bz2file,line)) {
|
|
2299 #else
|
|
2300 while(get_a_line(f,line)) {
|
|
2301 #endif
|
|
2302
|
|
2303 #ifdef DEBUG
|
|
2304 Rprintf("line: %s\n",line.c_str());
|
|
2305 #endif
|
|
2306
|
|
2307
|
|
2308 tokType tok(line, sep);
|
|
2309 tokType::iterator sit=tok.begin();
|
|
2310 if(sit!=tok.end()) {
|
|
2311 string chr=*sit++;
|
|
2312 string str_spos=*sit++;
|
|
2313 int nm=0;
|
|
2314 if(sit!=tok.end()) {
|
|
2315 string str_mm=*sit;
|
|
2316 nm=atoi(str_mm.c_str());
|
|
2317 }
|
|
2318
|
|
2319 int fpos=atoi(str_spos.c_str());;
|
|
2320
|
|
2321
|
|
2322 // determine the chromosome index
|
|
2323 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
2324 int cind=-1;
|
|
2325 if(li==cind_map.end()) {
|
|
2326 // register new chromosome
|
|
2327 cind=cnames.size();
|
|
2328 cnames.push_back(chr);
|
|
2329 cind_map[chr]=cind;
|
|
2330 // allocate new pos vector
|
|
2331 pos.push_back(vector<int>());
|
|
2332 posnm.push_back(vector<int>());
|
|
2333 #ifdef DEBUG
|
|
2334 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
2335 #endif
|
|
2336 } else {
|
|
2337 cind=li->second;
|
|
2338 }
|
|
2339 fcount++;
|
|
2340 (pos[cind]).push_back(fpos);
|
|
2341 (posnm[cind]).push_back(nm);
|
|
2342 #ifdef DEBUG
|
|
2343 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
|
|
2344 if(fcount>30) {
|
|
2345 break;
|
|
2346 }
|
|
2347 #endif
|
|
2348
|
|
2349 }
|
|
2350 }
|
|
2351 #ifdef HAVE_LIBBZ2
|
|
2352 BZ2_bzReadClose( &bzerror, b);
|
|
2353 #endif
|
|
2354
|
|
2355 fclose(f);
|
|
2356
|
|
2357 Rprintf("done. read %d fragments\n",fcount);
|
|
2358 }
|
|
2359 // construct output structures
|
|
2360 SEXP chnames;
|
|
2361 int np=0; // number of protections
|
|
2362 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
2363 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
2364 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
2365 }
|
|
2366 np++;
|
|
2367
|
|
2368 // sort
|
|
2369 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2370 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
2371 //}
|
|
2372
|
|
2373 SEXP ans;
|
|
2374 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
2375 vector<vector<int> >::const_iterator nsi;
|
|
2376 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2377 nsi=posnm.begin()+(csi-pos.begin());
|
|
2378
|
|
2379 SEXP dv,dnames_R;
|
|
2380 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
|
|
2381 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
2382 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
2383
|
|
2384
|
|
2385 SEXP tv,nv;
|
|
2386 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
2387 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
2388 int* i_tv=INTEGER(tv);
|
|
2389 int* i_nv=INTEGER(nv);
|
|
2390
|
|
2391 int i=0;
|
|
2392 vector<int>::const_iterator ini=nsi->begin();
|
|
2393 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
2394 i_tv[i]=*pi;
|
|
2395 i_nv[i]=*ini++;
|
|
2396 i++;
|
|
2397 }
|
|
2398 PROTECT(dv = allocVector(VECSXP, 2)); np++;
|
|
2399 SET_VECTOR_ELT(dv, 0, tv);
|
|
2400 SET_VECTOR_ELT(dv, 1, nv);
|
|
2401 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
2402
|
|
2403 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
2404 }
|
|
2405
|
|
2406 setAttrib(ans,R_NamesSymbol,chnames);
|
|
2407
|
|
2408 #ifdef DEBUG
|
|
2409 Rprintf("unprotecting %d elements\n",np);
|
|
2410 #endif
|
|
2411
|
|
2412 UNPROTECT(np);
|
|
2413 return(ans);
|
|
2414 }
|
|
2415
|
|
2416
|
|
2417 // arachne madness
|
|
2418 SEXP read_arachne_long(SEXP filename) {
|
|
2419
|
|
2420 #ifdef DEBUG
|
|
2421 Rprintf("start\n");
|
|
2422 #endif
|
|
2423 const char* fname=CHAR(asChar(filename));
|
|
2424 #ifdef DEBUG
|
|
2425 Rprintf("fname=%s\n",fname);
|
|
2426 #endif
|
|
2427
|
|
2428 // main data vector
|
|
2429 // chr - pos
|
|
2430 vector< vector<int> > pos;
|
|
2431 vector< vector<int> > posnm; // number of mismatches
|
|
2432 vector< vector<int> > poslen; // length of the match
|
|
2433
|
|
2434 // chromosome map
|
|
2435 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
2436 vector<string> cnames;
|
|
2437
|
|
2438
|
|
2439 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
2440 boost::char_separator<char> sep(" \t");
|
|
2441
|
|
2442
|
|
2443
|
|
2444
|
|
2445
|
|
2446 FILE *f=fopen(fname,"rb");
|
|
2447 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
|
|
2448 else {
|
|
2449
|
|
2450 #ifdef HAVE_LIBBZ2
|
|
2451 BZFILE* b;
|
|
2452 int bzerror;
|
|
2453
|
|
2454 int bz2file=0;
|
|
2455 if(strstr(fname,".bz2")) {
|
|
2456 bz2file=1;
|
|
2457 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
|
|
2458 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
|
|
2459 }
|
|
2460 #endif
|
|
2461
|
|
2462
|
|
2463 Rprintf("opened %s\n",fname);
|
|
2464
|
|
2465 // read in bed line
|
|
2466 string line;
|
|
2467 int fcount=0;
|
|
2468 #ifdef HAVE_LIBBZ2
|
|
2469 while(get_a_line(f,b,bz2file,line)) {
|
|
2470 #else
|
|
2471 while(get_a_line(f,line)) {
|
|
2472 #endif
|
|
2473
|
|
2474 #ifdef DEBUG
|
|
2475 Rprintf("line: %s\n",line.c_str());
|
|
2476 #endif
|
|
2477
|
|
2478
|
|
2479 tokType tok(line, sep);
|
|
2480 tokType::iterator sit=tok.begin();
|
|
2481 if(sit!=tok.end()) {
|
|
2482 string query=*sit++;
|
|
2483 if(query!="QUERY") { continue; }
|
|
2484 *sit++; *sit++; *sit++; *sit++;
|
|
2485 string str_strand=*sit++;
|
|
2486 string chr=*sit++;
|
|
2487 string str_startpos=*sit++;
|
|
2488 string str_endpos=*sit++;
|
|
2489
|
|
2490 int fpos;
|
|
2491 if(str_strand[0]=='1') {
|
|
2492 fpos=-1*atoi(str_endpos.c_str());
|
|
2493 } else {
|
|
2494 fpos=atoi(str_startpos.c_str());
|
|
2495 }
|
|
2496 #ifdef DEBUG
|
|
2497 Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
|
|
2498 #endif
|
|
2499 *sit++;
|
|
2500 string str_nblocks=*sit++;
|
|
2501 int nblocks=atoi(str_nblocks.c_str());
|
|
2502 #ifdef DEBUG
|
|
2503 Rprintf("nblocks=%d\n",nblocks);
|
|
2504 #endif
|
|
2505 // tally up the read length and the number of mismatches for all blocks
|
|
2506 int len=0; int nm=0;
|
|
2507 for(int i=0;i<nblocks;i++) {
|
|
2508 string str_sgs=*sit++;
|
|
2509 int sgs=atoi(str_sgs.c_str());
|
|
2510 string str_slen=*sit++;
|
|
2511 int slen=atoi(str_slen.c_str());
|
|
2512 string str_snm=*sit++;
|
|
2513 int snm=atoi(str_snm.c_str());
|
|
2514 #ifdef DEBUG
|
|
2515 Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
|
|
2516 #endif
|
|
2517 len+=slen;
|
|
2518 nm+=abs(sgs)+snm;
|
|
2519 }
|
|
2520 nm+=nblocks-1;
|
|
2521
|
|
2522
|
|
2523 // determine the chromosome index
|
|
2524 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
2525 int cind=-1;
|
|
2526 if(li==cind_map.end()) {
|
|
2527 // register new chromosome
|
|
2528 cind=cnames.size();
|
|
2529 cnames.push_back(chr);
|
|
2530 cind_map[chr]=cind;
|
|
2531 // allocate new pos vector
|
|
2532 pos.push_back(vector<int>());
|
|
2533 posnm.push_back(vector<int>());
|
|
2534 poslen.push_back(vector<int>());
|
|
2535 #ifdef DEBUG
|
|
2536 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
2537 #endif
|
|
2538 } else {
|
|
2539 cind=li->second;
|
|
2540 }
|
|
2541 fcount++;
|
|
2542 (pos[cind]).push_back(fpos);
|
|
2543 (posnm[cind]).push_back(nm);
|
|
2544 (poslen[cind]).push_back(len);
|
|
2545 #ifdef DEBUG
|
|
2546 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
|
|
2547 if(fcount>30) {
|
|
2548 break;
|
|
2549 }
|
|
2550 #endif
|
|
2551
|
|
2552 }
|
|
2553 }
|
|
2554 #ifdef HAVE_LIBBZ2
|
|
2555 BZ2_bzReadClose( &bzerror, b);
|
|
2556 #endif
|
|
2557
|
|
2558 fclose(f);
|
|
2559
|
|
2560 Rprintf("done. read %d fragments\n",fcount);
|
|
2561 }
|
|
2562 // construct output structures
|
|
2563 SEXP chnames;
|
|
2564 int np=0; // number of protections
|
|
2565 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
2566 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
2567 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
2568 }
|
|
2569 np++;
|
|
2570
|
|
2571 // sort
|
|
2572 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2573 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
2574 //}
|
|
2575
|
|
2576 SEXP ans;
|
|
2577 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
2578 vector<vector<int> >::const_iterator nsi;
|
|
2579 vector<vector<int> >::const_iterator lsi;
|
|
2580 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
2581 nsi=posnm.begin()+(csi-pos.begin());
|
|
2582 lsi=poslen.begin()+(csi-pos.begin());
|
|
2583
|
|
2584 SEXP dv,dnames_R;
|
|
2585 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
|
|
2586 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
2587 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
2588 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
|
|
2589
|
|
2590
|
|
2591 SEXP tv,nv,lv;
|
|
2592 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
2593 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
2594 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
|
|
2595 int* i_tv=INTEGER(tv);
|
|
2596 int* i_nv=INTEGER(nv);
|
|
2597 int* i_lv=INTEGER(lv);
|
|
2598
|
|
2599 int i=0;
|
|
2600 vector<int>::const_iterator ini=nsi->begin();
|
|
2601 vector<int>::const_iterator lni=lsi->begin();
|
|
2602 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
2603 i_tv[i]=*pi;
|
|
2604 i_nv[i]=*ini++;
|
|
2605 i_lv[i]=*lni++;
|
|
2606 i++;
|
|
2607 }
|
|
2608 PROTECT(dv = allocVector(VECSXP, 3)); np++;
|
|
2609 SET_VECTOR_ELT(dv, 0, tv);
|
|
2610 SET_VECTOR_ELT(dv, 1, nv);
|
|
2611 SET_VECTOR_ELT(dv, 2, lv);
|
|
2612 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
2613
|
|
2614 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
2615 }
|
|
2616
|
|
2617 setAttrib(ans,R_NamesSymbol,chnames);
|
|
2618
|
|
2619 #ifdef DEBUG
|
|
2620 Rprintf("unprotecting %d elements\n",np);
|
|
2621 #endif
|
|
2622
|
|
2623 UNPROTECT(np);
|
|
2624 return(ans);
|
|
2625 }
|
|
2626
|
|
2627
|
|
2628 }
|