annotate spp/src/bed2vector.cpp @ 15:e689b83b0257 draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:15:21 -0500
parents ce08b0efa3fd
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1 #include "pc.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2 #include "config.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
3 #include <vector>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
4 #include <string.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
5 #include <iostream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
6 #include <fstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
7 #include <sstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
8 #include <strstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
9 #include <algorithm>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
10 #include <string>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
11 #include <functional>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
12 #include <utility>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
13 #include <ext/hash_map>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
14 #include <boost/tokenizer.hpp>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
15
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
16 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
17 #include <bzlib.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
18 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
19
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
20 extern "C" {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
21 #include "R.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
22 #include "Rmath.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
23 #include "Rinternals.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
24 #include "Rdefines.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
25 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
26
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
27 using namespace std;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
28 using namespace __gnu_cxx;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
29
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
30
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
31 class lessAbsoluteValue {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
32 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
33 bool operator()(int a, int b) const {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
34 return abs(a) < abs(b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
35 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
36 };
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
37
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
38
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
39
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
40 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
41 int get_bzline(BZFILE* b,string& line) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
42 char c;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
43 int nBuf;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
44 int bzerror=BZ_OK;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
45
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
46 while(bzerror == BZ_OK) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
47 nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
48 if(bzerror==BZ_OK) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
49 if(c=='\n') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
50 return bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
51 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
52 line+=c;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
53 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
54 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
55 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
56 return bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
57 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
58
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
59 int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
60 line="";
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
61 if(bz2file) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
62 int bzerror=get_bzline(b,line);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
63 if(bzerror==BZ_OK) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
64 return(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
65 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
66 if(bzerror!=BZ_STREAM_END) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
67 cerr<<"encountered BZERROR="<<bzerror<<endl;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
68 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
69 return(0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
70 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
71 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
72 char *cline=NULL;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
73 size_t n;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
74 if(getline(&cline,&n,f) != -1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
75 if(cline) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
76 cline[strlen(cline)-1]='\0';
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
77 line+=cline;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
78 free(cline);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
79 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
80 return(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
81 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
82 return(0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
83 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
84 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
85 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
86 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
87
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
88
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
89 /**
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
90 * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
91 * corresponding to the strand.
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
92 */
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
93
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
94 //#define DEBUG 1
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
95
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
96 extern "C" {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
97 SEXP read_bed_ends(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
98
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
99 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
100 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
101 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
102 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
103 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
104 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
105 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
106
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
107 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
108 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
109 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
110
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
111 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
112 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
113 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
114
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
115 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
116 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
117
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
118
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
119 ifstream bed_file(fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
120
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
121 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
122 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
123 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
124
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
125 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
126
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
127 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
128 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
129
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
130 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
131 while(getline(bed_file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
132
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
133 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
134 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
135 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
136
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
137
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
138 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
139 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
140 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
141 string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
142 string str_start=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
143 int fstart=atoi(str_start.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
144 string str_end=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
145 int fend=atoi(str_end.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
146 int fpos=fstart;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
147 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
148 string u0=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
149 string nfield=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
150 string strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
151 if(strand=="-") {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
152 fpos=-1*fend;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
153 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
154 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
155
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
156 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
157 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
158 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
159 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
160 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
161 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
162 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
163 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
164 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
165 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
166 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
167 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
168 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
169 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
170 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
171 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
172 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
173 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
174 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
175 Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
176 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
177 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
178 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
179 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
180
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
181 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
182 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
183 bed_file.close();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
184
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
185
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
186 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
187 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
188 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
189
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
190 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
191
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
192 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
193 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
194 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
195 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
196 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
197 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
198 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
199 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
200
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
201 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
202 for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
203 sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
204 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
205
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
206 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
207 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
208 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
209 SEXP nv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
210 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
211 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
212 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
213 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
214 i_nv[i++]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
215 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
216 SET_VECTOR_ELT(ans, csi-pos.begin(), nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
217 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
218
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
219 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
220
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
221 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
222 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
223 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
224
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
225 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
226 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
227 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
228
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
229
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
230
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
231 SEXP read_meland_old(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
232
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
233 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
234 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
235 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
236 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
237 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
238 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
239 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
240
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
241 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
242 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
243 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
244 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
245 vector< vector<int> > poslen; // length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
246
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
247 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
248 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
249 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
250
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
251
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
252 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
253 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
254
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
255
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
256 ifstream bed_file(fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
257
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
258 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
259
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
260 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
261 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
262
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
263 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
264 while(getline(bed_file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
265
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
266 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
267 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
268 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
269
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
270
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
271 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
272 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
273 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
274 sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
275 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
276 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
277 if(str_nm[0]=='U') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
278 nm=atoi((str_nm.c_str()+1));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
279 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
280 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
281 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
282 sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
283 string str_len=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
284 int len=atoi(str_len.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
285 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
286 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
287 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
288
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
289 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
290 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
291 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
292 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
293 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
294 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
295 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
296 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
297 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
298 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
299 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
300 poslen.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
301 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
302 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
303 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
304 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
305 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
306 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
307 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
308 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
309 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
310 (poslen[cind]).push_back(len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
311 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
312 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
313 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
314 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
315 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
316 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
317
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
318 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
319 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
320 bed_file.close();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
321
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
322
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
323 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
324 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
325 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
326
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
327 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
328
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
329 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
330 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
331 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
332 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
333 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
334 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
335 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
336 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
337
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
338 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
339 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
340 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
341 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
342
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
343 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
344 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
345 vector<vector<int> >::const_iterator nsi,lsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
346 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
347 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
348 lsi=poslen.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
349
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
350 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
351 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
352 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
353 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
354 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
355
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
356
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
357
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
358 SEXP tv,nv,lv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
359 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
360 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
361 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
362 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
363 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
364 int* i_lv=INTEGER(lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
365
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
366 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
367 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
368 vector<int>::const_iterator ili=lsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
369 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
370 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
371 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
372 i_lv[i]=*ili++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
373 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
374 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
375 PROTECT(dv = allocVector(VECSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
376 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
377 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
378 SET_VECTOR_ELT(dv, 2, lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
379 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
380
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
381 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
382 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
383
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
384 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
385
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
386 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
387 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
388 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
389
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
390 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
391 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
392 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
393
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
394
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
395 int get_a_line(FILE *f,string& line) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
396 line="";
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
397 char cline[1024];
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
398 if(fgets(cline,1024,f)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
399 line+=cline;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
400 return(1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
401 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
402 return(0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
403 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
404 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
405
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
406
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
407 SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
408
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
409 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
410 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
411 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
412 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
413 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
414 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
415 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
416 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
417
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
418 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
419 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
420 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
421 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
422 vector< vector<int> > poslen; // length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
423 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
424
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
425 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
426 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
427 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
428
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
429
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
430 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
431 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
432
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
433
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
434 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
435 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
436
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
437 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
438
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
439
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
440 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
441 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
442 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
443 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
444
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
445 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
446 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
447 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
448
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
449
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
450 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
451 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
452 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
453 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
454 sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
455 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
456 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
457 if(str_nm[0]=='U') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
458 nm=atoi((str_nm.c_str()+1));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
459 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
460 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
461 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
462 sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
463 string str_len=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
464 int len=atoi(str_len.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
465 string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
466 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
467 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
468
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
469 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
470 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
471 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
472 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
473 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
474 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
475 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
476 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
477 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
478 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
479 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
480 poslen.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
481 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
482 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
483 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
484 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
485 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
486 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
487 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
488 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
489 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
490 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
491 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
492 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
493 (poslen[cind]).push_back(len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
494 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
495 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
496 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
497 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
498 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
499 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
500 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
501 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
502 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
503
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
504 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
505 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
506 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
507
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
508
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
509 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
510 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
511 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
512
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
513 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
514
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
515 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
516 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
517 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
518 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
519 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
520 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
521 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
522 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
523
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
524 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
525 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
526 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
527 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
528
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
529 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
530 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
531 vector<vector<int> >::const_iterator nsi,lsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
532 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
533 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
534 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
535 lsi=poslen.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
536
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
537 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
538 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
539 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
540 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
541 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
542 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
543 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
544 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
545
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
546
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
547
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
548 SEXP tv,nv,lv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
549 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
550 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
551 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
552 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
553 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
554 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
555 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
556 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
557 int* i_lv=INTEGER(lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
558
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
559 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
560 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
561 vector<int>::const_iterator ili=lsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
562 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
563 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
564 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
565 i_lv[i]=*ili++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
566 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
567 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
568 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
569 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
570 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
571 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
572 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
573 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
574 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
575 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
576 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
577 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
578 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
579 SET_VECTOR_ELT(dv, 2, lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
580 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
581 SET_VECTOR_ELT(dv, 3, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
582 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
583 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
584
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
585 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
586 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
587
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
588 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
589
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
590 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
591 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
592 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
593
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
594 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
595 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
596 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
597
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
598
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
599
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
600 // reads regular eland files, recording mismatch positions
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
601 SEXP read_eland_mismatches(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
602
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
603 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
604 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
605 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
606 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
607 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
608 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
609 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
610
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
611 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
612 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
613 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
614 vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
615 vector< vector<int> > mm2; // position of the second mismatch
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
616
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
617 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
618 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
619 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
620
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
621
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
622 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
623 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
624
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
625
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
626 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
627 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
628
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
629 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
630
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
631 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
632 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
633 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
634 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
635
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
636 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
637 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
638 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
639
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
640
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
641 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
642 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
643 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
644 sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
645 string seq=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
646 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
647 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
648 if(str_nm[0]=='U') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
649 nm=atoi((str_nm.c_str()+1));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
650 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
651 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
652 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
653 sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
654 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
655 // extract chromosome name from this
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
656 int chrp=chr.find("chr");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
657 int pp=chr.find('.');
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
658 chr=chr.substr(chrp+3,pp-chrp-3);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
659
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
660 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
661 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
662
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
663
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
664 string strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
665 int nstrand=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
666 if(strand=="R") {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
667 fpos=-1*(fpos+seq.size()-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
668 nstrand=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
669 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
670
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
671 sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
672
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
673 int nm1=0; int nm2=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
674 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
675 string nms=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
676 nm1=atoi(nms.substr(0,nms.size()-1).c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
677 if(nstrand) { nm1=seq.size()-nm1+1; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
678 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
679 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
680 string nms=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
681 nm2=atoi(nms.substr(0,nms.size()-1).c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
682 if(nstrand) { nm2=seq.size()-nm2+1; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
683 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
684
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
685 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
686 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
687 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
688 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
689 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
690 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
691 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
692 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
693 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
694 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
695 mm1.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
696 mm2.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
697 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
698 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
699 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
700 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
701 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
702 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
703 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
704 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
705 (mm1[cind]).push_back(nm1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
706 (mm2[cind]).push_back(nm2);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
707 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
708 Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
709 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
710 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
711 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
712 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
713
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
714 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
715 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
716 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
717
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
718
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
719 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
720 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
721 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
722
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
723 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
724
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
725 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
726 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
727 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
728 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
729 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
730 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
731 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
732 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
733
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
734 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
735 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
736 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
737 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
738
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
739 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
740 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
741 vector<vector<int> >::const_iterator nsi,lsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
742 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
743 nsi=mm1.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
744 lsi=mm2.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
745
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
746 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
747 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
748 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
749 SET_STRING_ELT(dnames_R, 1, mkChar("f"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
750 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
751
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
752
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
753
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
754 SEXP tv,nv,lv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
755 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
756 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
757 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
758 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
759 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
760 int* i_lv=INTEGER(lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
761
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
762 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
763 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
764 vector<int>::const_iterator ili=lsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
765 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
766 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
767 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
768 i_lv[i]=*ili++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
769 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
770 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
771 PROTECT(dv = allocVector(VECSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
772 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
773 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
774 SET_VECTOR_ELT(dv, 2, lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
775 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
776
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
777 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
778 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
779
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
780 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
781
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
782 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
783 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
784 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
785
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
786 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
787 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
788 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
789
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
790
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
791 // read in regular eland files, adjusting the negative strand coordinate by sequence length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
792 SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
793
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
794 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
795 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
796 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
797 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
798 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
799 int eland_tag_length=*(INTEGER(eland_tag_length_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
800 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
801 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
802 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
803
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
804 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
805 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
806 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
807 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
808 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
809
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
810 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
811 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
812 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
813
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
814
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
815 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
816 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
817
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
818
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
819 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
820 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
821 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
822 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
823
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
824 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
825 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
826 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
827 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
828
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
829 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
830 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
831 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
832
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
833
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
834 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
835 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
836 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
837 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
838 string sequence=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
839 int len=sequence.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
840 // adjust probe length if eland length limit was specified
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
841 if(eland_tag_length>0 && len>eland_tag_length) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
842 len=eland_tag_length;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
843 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
844 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
845 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
846 if(str_nm[0]=='U') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
847 nm=atoi((str_nm.c_str()+1));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
848 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
849 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
850 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
851 sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
852 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
853 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
854 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
855 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
856
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
857 if(str_strand[0]=='R') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
858 fpos=-1*(fpos+len-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
859 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
860
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
861 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
862 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
863 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
864 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
865 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
866 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
867 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
868 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
869 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
870 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
871 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
872 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
873 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
874 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
875 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
876 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
877 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
878 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
879 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
880 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
881 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
882 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
883 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
884 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
885 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
886 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
887 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
888 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
889 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
890 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
891 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
892 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
893
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
894 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
895 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
896 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
897
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
898 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
899 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
900 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
901 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
902 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
903 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
904 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
905 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
906 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
907 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
908
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
909 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
910 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
911 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
912 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
913
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
914 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
915 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
916 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
917 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
918 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
919 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
920
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
921 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
922 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
923 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
924 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
925 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
926 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
927 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
928
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
929
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
930
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
931 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
932 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
933 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
934 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
935 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
936 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
937 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
938 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
939
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
940 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
941 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
942 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
943 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
944 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
945 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
946 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
947 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
948 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
949 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
950 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
951 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
952 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
953 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
954 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
955 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
956 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
957 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
958 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
959 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
960 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
961 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
962
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
963 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
964 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
965
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
966 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
967
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
968 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
969 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
970 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
971
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
972 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
973 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
974 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
975
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
976
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
977
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
978 // read in extended eland files, adjusting the negative strand coordinate by sequence length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
979 SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
980
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
981 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
982 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
983 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
984 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
985 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
986 int eland_tag_length=*(INTEGER(eland_tag_length_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
987 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
988 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
989 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
990
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
991 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
992 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
993 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
994 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
995 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
996
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
997 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
998 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
999 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1000
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1001
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1002 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1003 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1004
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1005
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1006 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1007 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1008 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1009 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1010
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1011 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1012 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1013 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1014 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1015
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1016 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1017 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1018 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1019
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1020
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1021 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1022 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1023 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1024 string machinename=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1025 string runnumber=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1026 string lanenumber=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1027 *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1028
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1029 string str_x=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1030 string str_y=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1031
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1032 string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1033
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1034
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1035
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1036 *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1037 *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1038
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1039
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1040 string sequence=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1041 *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1042
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1043 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1044 string contig=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1045 chr=chr+contig;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1046
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1047 int len=sequence.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1048 // adjust probe length if eland length limit was specified
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1049 if(eland_tag_length>0 && len>eland_tag_length) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1050 len=eland_tag_length;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1051 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1052
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1053
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1054
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1055 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1056 if(str_pos.size()<1) { continue; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1057 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1058 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1059
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1060 if(str_strand[0]=='R') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1061 fpos=-1*(fpos+len-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1062 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1063
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1064 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1065 // count non-digit characters
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1066 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1067 for(int i=0;i<str_nm.size();i++) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1068 if(!isdigit(str_nm[i])) { nm++; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1069 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1070
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1071 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1072 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1073 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1074 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1075 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1076 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1077 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1078 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1079 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1080 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1081 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1082 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1083 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1084 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1085 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1086 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1087 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1088 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1089 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1090 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1091 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1092 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1093 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1094 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1095 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1096 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1097 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1098 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1099 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1100 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1101 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1102 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1103
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1104 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1105 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1106 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1107
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1108 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1109 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1110 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1111 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1112 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1113 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1114 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1115 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1116 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1117 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1118
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1119 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1120 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1121 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1122 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1123
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1124 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1125 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1126 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1127 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1128 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1129 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1130
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1131 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1132 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1133 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1134 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1135 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1136 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1137 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1138
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1139
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1140
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1141 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1142 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1143 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1144 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1145 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1146 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1147 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1148 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1149
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1150 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1151 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1152 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1153 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1154 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1155 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1156 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1157 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1158 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1159 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1160 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1161 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1162 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1163 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1164 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1165 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1166 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1167 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1168 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1169 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1170 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1171 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1172
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1173 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1174 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1175
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1176 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1177
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1178 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1179 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1180 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1181
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1182 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1183 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1184 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1185
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1186
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1187 // read in eland multi files, adjusting the negative strand coordinate by sequence length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1188 SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1189
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1190 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1191 Rprintf("read_eland_muti() : start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1192 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1193 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1194 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1195 int eland_tag_length=*(INTEGER(eland_tag_length_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1196 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1197 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1198 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1199
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1200 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1201 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1202 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1203 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1204 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1205
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1206 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1207 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1208 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1209
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1210
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1211 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1212 boost::char_separator<char> sep(" \t","");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1213 boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1214 boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1215
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1216
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1217 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1218 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1219 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1220 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1221
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1222 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1223 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1224 int nline=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1225 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1226 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1227 nline++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1228 // chomp
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1229 size_t elpos = line.find_last_not_of("\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1230 if(elpos != string::npos) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1231 line = line.substr(0, elpos+1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1232 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1233 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1234 Rprintf("line %d: %s\n",nline,line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1235 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1236
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1237 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1238 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1239 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1240 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1241 string sequence=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1242 string mspec=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1243 // parse out match spec
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1244
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1245 if(mspec=="NM" || mspec=="QC") { continue; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1246 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1247 Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1248 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1249
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1250 tokType stok(mspec, colsep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1251 tokType::iterator ssit=stok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1252 string str_nm0=*ssit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1253
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1254 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1255 int nm0=atoi(str_nm0.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1256 if(nm0>1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1257 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1258 Rprintf("rejected for nm0\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1259 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1260 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1261 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1262 if(nm0==0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1263 string str_nm1=*ssit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1264 int nm1=atoi(str_nm1.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1265 if(nm1>1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1266 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1267 Rprintf("rejected for nm1\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1268 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1269 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1270 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1271 if(nm1==0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1272 string str_nm2=*ssit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1273 int nm2=atoi(str_nm2.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1274 if(nm2>1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1275 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1276 Rprintf("rejected for nm2\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1277 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1278 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1279 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1280 nm=2;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1281 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1282 nm=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1283 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1284 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1285
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1286 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1287 Rprintf("accepted (nm=%d)\n",nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1288 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1289 int npos=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1290 string mpos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1291 vector<string> mposc;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1292 vector<int> mposp;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1293 tokType ptok(mpos, comsep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1294 string prevchr;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1295 for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1296 string cpos=*psit;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1297 npos++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1298 int strand=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1299 if(cpos.size()<5) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1300 Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1301 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1302 char lc=cpos.at(cpos.size()-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1303
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1304 if(atoi(&lc)==nm) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1305 switch(cpos.at(cpos.size()-2)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1306 case 'R': strand=-1; break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1307 case 'F': strand=1; break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1308 default:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1309 Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1310 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1311 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1312 string chr,str_pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1313 size_t colpos=cpos.find(":");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1314 if(colpos==string::npos) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1315 if(npos>1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1316 chr=prevchr;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1317 str_pos=cpos.substr(0,cpos.size()-2);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1318 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1319 Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1320 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1321 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1322 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1323 chr=cpos.substr(0,colpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1324 str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1325 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1326 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1327 Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1328 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1329 int pos=strand*atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1330 mposc.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1331 mposp.push_back(pos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1332 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1333 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1334
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1335 string chr;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1336 int fpos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1337 if(mposc.size()!=1) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1338 if(mposc.size()==0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1339 Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1340 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1341 Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1342 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1343 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1344 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1345 chr=*mposc.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1346 fpos=*mposp.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1347 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1348
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1349 int len=sequence.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1350 // adjust probe length if eland length limit was specified
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1351 if(eland_tag_length>0 && len>eland_tag_length) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1352 len=eland_tag_length;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1353 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1354
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1355 if(fpos<0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1356 fpos=-1*(-1*fpos+len-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1357 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1358
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1359 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1360 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1361 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1362 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1363 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1364 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1365 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1366 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1367 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1368 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1369 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1370 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1371 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1372 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1373 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1374 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1375 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1376 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1377 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1378 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1379 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1380 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1381 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1382 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1383 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1384 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1385 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1386 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1387 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1388 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1389 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1390 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1391
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1392 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1393 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1394 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1395
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1396 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1397 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1398 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1399 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1400 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1401 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1402 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1403 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1404 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1405 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1406
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1407 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1408 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1409 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1410 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1411
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1412 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1413 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1414 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1415 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1416 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1417 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1418
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1419 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1420 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1421 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1422 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1423 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1424 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1425 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1426
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1427
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1428
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1429 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1430 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1431 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1432 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1433 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1434 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1435 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1436 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1437
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1438 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1439 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1440 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1441 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1442 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1443 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1444 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1445 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1446 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1447 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1448 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1449 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1450 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1451 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1452 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1453 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1454 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1455 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1456 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1457 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1458 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1459 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1460
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1461 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1462 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1463
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1464 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1465
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1466 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1467 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1468 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1469
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1470 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1471 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1472 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1473
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1474
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1475 // read in regular eland files, adjusting the negative strand coordinate by sequence length
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1476 SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1477
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1478 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1479 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1480 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1481 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1482 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1483 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1484 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1485 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1486
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1487 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1488 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1489 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1490 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1491 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1492
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1493 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1494 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1495 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1496
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1497
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1498 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1499 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1500 boost::char_separator<char> sep2(",");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1501
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1502
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1503 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1504 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1505 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1506 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1507 BZFILE* b;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1508 int bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1509
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1510 int bz2file=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1511 if(strstr(fname,".bz2")) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1512 bz2file=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1513 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1514 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1515 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1516 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1517
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1518 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1519
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1520 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1521 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1522 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1523 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1524 while(get_a_line(f,b,bz2file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1525 #else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1526 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1527 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1528
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1529 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1530 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1531 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1532
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1533
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1534 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1535 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1536 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1537 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1538 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1539 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1540
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1541 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1542 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1543
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1544 string sequence=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1545 sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1546 string mm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1547
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1548 int len=sequence.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1549 if(str_strand[0]=='-') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1550 fpos=-1*(fpos+len-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1551 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1552 // determine number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1553 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1554 if(mm.size()>0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1555 nm++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1556 string::size_type tp(0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1557 while(tp!=string::npos) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1558 tp = mm.find(",",tp);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1559 if(tp!=string::npos) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1560 tp++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1561 ++nm;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1562 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1563 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1564 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1565
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1566
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1567
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1568 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1569 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1570 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1571 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1572 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1573 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1574 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1575 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1576 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1577 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1578 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1579 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1580 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1581 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1582 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1583 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1584 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1585 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1586 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1587 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1588 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1589 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1590 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1591 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1592 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1593 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1594 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1595 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1596 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1597 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1598 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1599 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1600
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1601 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1602 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1603
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1604 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1605 BZ2_bzReadClose( &bzerror, b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1606 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1607 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1608
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1609 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1610 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1611 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1612 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1613 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1614 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1615 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1616 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1617 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1618 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1619
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1620 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1621 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1622 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1623 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1624
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1625 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1626 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1627 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1628 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1629 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1630 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1631
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1632 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1633 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1634 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1635 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1636 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1637 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1638 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1639
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1640
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1641
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1642 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1643 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1644 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1645 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1646 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1647 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1648 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1649 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1650
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1651 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1652 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1653 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1654 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1655 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1656 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1657 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1658 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1659 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1660 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1661 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1662 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1663 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1664 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1665 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1666 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1667 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1668 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1669 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1670 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1671 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1672 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1673
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1674 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1675 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1676
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1677 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1678
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1679 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1680 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1681 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1682
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1683 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1684 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1685 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1686
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1687
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1688 // read in helicos tab-separated alignment output (regular or bz2)
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1689 SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1690
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1691 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1692 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1693 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1694 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1695 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1696 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1697 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1698 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1699
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1700 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1701 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1702 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1703 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1704 vector< vector<int> > poslen; // length of the match
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1705 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1706
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1707 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1708 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1709 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1710
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1711
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1712 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1713 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1714 boost::char_separator<char> sep2(",");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1715
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1716
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1717 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1718 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n";
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1719 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1720 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1721 BZFILE* b;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1722 int bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1723
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1724 int bz2file=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1725 if(strstr(fname,".bz2")) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1726 bz2file=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1727 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1728 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1729 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1730 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1731
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1732 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1733
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1734 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1735 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1736 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1737 int nlines=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1738 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1739 while(get_a_line(f,b,bz2file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1740 #else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1741 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1742 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1743
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1744 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1745 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1746 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1747 nlines++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1748 // skip comments
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1749 if(line[0]=='#') { continue; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1750 if(line.compare(0,12,"Reference_ID")==0) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1751 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1752 Rprintf("matched header on line %d\n",nlines);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1753 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1754 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1755 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1756
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1757 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1758 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1759 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1760 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1761 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1762 string str_startpos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1763 string str_endpos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1764
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1765 string str_tstart=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1766 string str_tend=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1767 int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1768
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1769 sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1770 string str_ndel=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1771 string str_nins=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1772 string str_nsub=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1773
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1774 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1775 int fpos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1776 if(str_strand[0]=='-') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1777 fpos=-1*atoi(str_endpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1778 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1779 fpos=atoi(str_startpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1780 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1781
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1782 // determine number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1783 int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1784
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1785 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1786 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1787 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1788 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1789 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1790 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1791 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1792 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1793 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1794 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1795 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1796 poslen.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1797 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1798 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1799 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1800 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1801 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1802 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1803 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1804 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1805 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1806 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1807 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1808 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1809 (poslen[cind]).push_back(len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1810 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1811 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1812 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1813 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1814 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1815 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1816 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1817 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1818 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1819
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1820 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1821 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1822
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1823 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1824 BZ2_bzReadClose( &bzerror, b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1825 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1826 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1827
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1828 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1829 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1830 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1831 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1832 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1833 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1834 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1835 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1836 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1837 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1838
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1839 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1840 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1841 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1842 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1843
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1844 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1845 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1846 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1847 vector<vector<int> >::const_iterator lsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1848 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1849 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1850 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1851 lsi=poslen.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1852
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1853 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1854 PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1855 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1856 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1857 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1858 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1859 SET_STRING_ELT(dnames_R, 3, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1860 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1861
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1862
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1863
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1864 SEXP tv,nv,lv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1865 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1866 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1867 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1868 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1869 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1870 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1871 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1872 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1873 int* i_lv=INTEGER(lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1874
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1875 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1876 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1877 vector<int>::const_iterator lni=lsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1878 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1879 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1880 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1881 i_lv[i]=*lni++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1882 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1883 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1884 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1885 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1886 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1887 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1888 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1889 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1890 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1891 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1892 PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1893 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1894 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1895 SET_VECTOR_ELT(dv, 2, lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1896 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1897 SET_VECTOR_ELT(dv, 3, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1898 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1899 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1900
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1901 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1902 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1903
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1904 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1905
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1906 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1907 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1908 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1909
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1910 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1911 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1912 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1913
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1914
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1915
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1916 // read in text version of maq map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1917 SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1918
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1919 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1920 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1921 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1922 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1923 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1924 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1925 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1926 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1927
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1928 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1929 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1930 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1931 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1932 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1933
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1934 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1935 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1936 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1937
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1938
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1939 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1940 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1941
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1942
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1943 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1944 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1945 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1946 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1947
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1948 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1949 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1950 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1951 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1952
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1953 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1954 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1955 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1956
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1957
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1958 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1959 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1960 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1961 string tagname=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1962 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1963 string str_pos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1964 int fpos=atoi(str_pos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1965 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1966 sit++; sit++; sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1967 string str_nm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1968 sit++; sit++; sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1969 string str_len=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1970 int nm=atoi(str_nm.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1971 int len=atoi(str_len.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1972
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1973 if(str_strand[0]=='-') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1974 fpos=-1*(fpos+len-1);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1975 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1976
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1977 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1978 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1979 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1980 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1981 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1982 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1983 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1984 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1985 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1986 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1987 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1988 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1989 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1990 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1991 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1992 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1993 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1994 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1995 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1996 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1997 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1998 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1999 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2000 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2001 (tagnames[cind]).push_back(tagname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2002 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2003 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2004 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2005 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2006 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2007 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2008 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2009
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2010 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2011 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2012 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2013
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2014 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2015 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2016 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2017 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2018 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2019 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2020 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2021 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2022 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2023 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2024
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2025 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2026 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2027 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2028 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2029
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2030 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2031 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2032 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2033 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2034 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2035 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2036
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2037 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2038 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2039 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2040 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2041 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2042 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2043 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2044
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2045
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2046
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2047 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2048 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2049 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2050 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2051 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2052 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2053 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2054 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2055
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2056 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2057 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2058 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2059 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2060 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2061 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2062 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2063 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2064 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2065 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2066 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2067 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2068 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2069 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2070 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2071 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2072 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2073 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2074 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2075 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2076 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2077 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2078
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2079 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2080 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2081
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2082 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2083
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2084 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2085 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2086 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2087
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2088 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2089 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2090 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2091
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2092
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2093
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2094
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2095
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2096 // read in tagalign file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2097 SEXP read_tagalign(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2098
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2099 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2100 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2101 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2102 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2103 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2104 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2105 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2106
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2107 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2108 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2109 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2110 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2111
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2112 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2113 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2114 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2115
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2116
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2117 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2118 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2119
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2120
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2121 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2122 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2123 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2124 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2125
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2126 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2127 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2128 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2129 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2130
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2131 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2132 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2133 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2134
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2135
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2136 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2137 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2138 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2139 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2140 string str_spos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2141 string str_epos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2142 sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2143 string str_qual=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2144 string str_strand=*sit;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2145
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2146 int fpos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2147 if(str_strand[0]=='+') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2148 fpos=atoi(str_spos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2149 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2150 fpos=-1*atoi(str_epos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2151 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2152 int nm=atoi(str_qual.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2153
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2154 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2155 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2156 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2157 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2158 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2159 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2160 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2161 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2162 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2163 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2164 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2165 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2166 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2167 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2168 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2169 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2170 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2171 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2172 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2173 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2174 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2175 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2176 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2177 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2178 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2179 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2180
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2181 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2182 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2183 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2184
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2185 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2186 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2187 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2188 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2189 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2190 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2191 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2192 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2193 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2194 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2195
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2196 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2197 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2198 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2199 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2200
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2201 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2202 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2203 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2204 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2205 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2206
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2207 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2208 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2209 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2210 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2211
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2212
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2213 SEXP tv,nv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2214 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2215 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2216 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2217 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2218
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2219 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2220 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2221 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2222 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2223 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2224 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2225 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2226 PROTECT(dv = allocVector(VECSXP, 2)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2227 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2228 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2229 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2230
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2231 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2232 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2233
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2234 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2235
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2236 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2237 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2238 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2239
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2240 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2241 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2242 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2243
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2244
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2245
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2246
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2247 // arachne madness
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2248 SEXP read_arachne(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2249
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2250 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2251 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2252 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2253 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2254 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2255 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2256 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2257
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2258 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2259 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2260 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2261 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2262
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2263 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2264 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2265 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2266
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2267
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2268 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2269 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2270
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2271
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2272
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2273
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2274
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2275 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2276 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2277 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2278
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2279 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2280 BZFILE* b;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2281 int bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2282
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2283 int bz2file=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2284 if(strstr(fname,".bz2")) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2285 bz2file=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2286 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2287 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2288 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2289 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2290
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2291
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2292 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2293
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2294 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2295 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2296 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2297 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2298 while(get_a_line(f,b,bz2file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2299 #else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2300 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2301 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2302
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2303 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2304 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2305 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2306
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2307
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2308 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2309 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2310 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2311 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2312 string str_spos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2313 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2314 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2315 string str_mm=*sit;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2316 nm=atoi(str_mm.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2317 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2318
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2319 int fpos=atoi(str_spos.c_str());;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2320
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2321
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2322 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2323 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2324 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2325 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2326 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2327 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2328 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2329 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2330 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2331 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2332 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2333 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2334 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2335 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2336 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2337 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2338 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2339 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2340 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2341 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2342 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2343 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2344 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2345 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2346 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2347 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2348
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2349 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2350 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2351 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2352 BZ2_bzReadClose( &bzerror, b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2353 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2354
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2355 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2356
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2357 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2358 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2359 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2360 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2361 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2362 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2363 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2364 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2365 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2366 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2367
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2368 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2369 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2370 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2371 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2372
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2373 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2374 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2375 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2376 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2377 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2378
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2379 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2380 PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2381 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2382 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2383
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2384
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2385 SEXP tv,nv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2386 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2387 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2388 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2389 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2390
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2391 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2392 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2393 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2394 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2395 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2396 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2397 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2398 PROTECT(dv = allocVector(VECSXP, 2)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2399 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2400 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2401 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2402
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2403 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2404 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2405
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2406 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2407
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2408 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2409 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2410 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2411
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2412 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2413 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2414 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2415
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2416
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2417 // arachne madness
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2418 SEXP read_arachne_long(SEXP filename) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2419
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2420 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2421 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2422 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2423 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2424 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2425 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2426 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2427
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2428 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2429 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2430 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2431 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2432 vector< vector<int> > poslen; // length of the match
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2433
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2434 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2435 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2436 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2437
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2438
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2439 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2440 boost::char_separator<char> sep(" \t");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2441
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2442
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2443
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2444
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2445
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2446 FILE *f=fopen(fname,"rb");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2447 if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2448 else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2449
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2450 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2451 BZFILE* b;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2452 int bzerror;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2453
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2454 int bz2file=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2455 if(strstr(fname,".bz2")) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2456 bz2file=1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2457 b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2458 if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2459 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2460 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2461
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2462
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2463 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2464
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2465 // read in bed line
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2466 string line;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2467 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2468 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2469 while(get_a_line(f,b,bz2file,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2470 #else
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2471 while(get_a_line(f,line)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2472 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2473
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2474 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2475 Rprintf("line: %s\n",line.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2476 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2477
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2478
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2479 tokType tok(line, sep);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2480 tokType::iterator sit=tok.begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2481 if(sit!=tok.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2482 string query=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2483 if(query!="QUERY") { continue; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2484 *sit++; *sit++; *sit++; *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2485 string str_strand=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2486 string chr=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2487 string str_startpos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2488 string str_endpos=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2489
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2490 int fpos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2491 if(str_strand[0]=='1') {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2492 fpos=-1*atoi(str_endpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2493 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2494 fpos=atoi(str_startpos.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2495 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2496 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2497 Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2498 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2499 *sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2500 string str_nblocks=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2501 int nblocks=atoi(str_nblocks.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2502 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2503 Rprintf("nblocks=%d\n",nblocks);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2504 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2505 // tally up the read length and the number of mismatches for all blocks
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2506 int len=0; int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2507 for(int i=0;i<nblocks;i++) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2508 string str_sgs=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2509 int sgs=atoi(str_sgs.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2510 string str_slen=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2511 int slen=atoi(str_slen.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2512 string str_snm=*sit++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2513 int snm=atoi(str_snm.c_str());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2514 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2515 Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2516 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2517 len+=slen;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2518 nm+=abs(sgs)+snm;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2519 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2520 nm+=nblocks-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2521
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2522
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2523 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2524 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2525 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2526 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2527 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2528 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2529 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2530 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2531 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2532 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2533 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2534 poslen.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2535 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2536 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2537 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2538 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2539 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2540 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2541 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2542 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2543 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2544 (poslen[cind]).push_back(len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2545 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2546 Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2547 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2548 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2549 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2550 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2551
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2552 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2553 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2554 #ifdef HAVE_LIBBZ2
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2555 BZ2_bzReadClose( &bzerror, b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2556 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2557
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2558 fclose(f);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2559
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2560 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2561 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2562 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2563 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2564 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2565 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2566 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2567 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2568 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2569 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2570
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2571 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2572 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2573 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2574 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2575
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2576 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2577 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2578 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2579 vector<vector<int> >::const_iterator lsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2580 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2581 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2582 lsi=poslen.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2583
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2584 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2585 PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2586 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2587 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2588 SET_STRING_ELT(dnames_R, 2, mkChar("l"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2589
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2590
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2591 SEXP tv,nv,lv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2592 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2593 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2594 PROTECT(lv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2595 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2596 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2597 int* i_lv=INTEGER(lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2598
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2599 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2600 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2601 vector<int>::const_iterator lni=lsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2602 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2603 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2604 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2605 i_lv[i]=*lni++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2606 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2607 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2608 PROTECT(dv = allocVector(VECSXP, 3)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2609 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2610 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2611 SET_VECTOR_ELT(dv, 2, lv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2612 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2613
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2614 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2615 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2616
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2617 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2618
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2619 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2620 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2621 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2622
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2623 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2624 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2625 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2626
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2627
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2628 }