Mercurial > repos > zzhou > spp_phantompeak
comparison spp/src/bamread.cpp @ 6:ce08b0efa3fd draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:11:40 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:608a8e0eac56 | 6:ce08b0efa3fd |
---|---|
1 #include "pc.h" | |
2 #include "config.h" | |
3 #include <vector> | |
4 #include <string.h> | |
5 #include <iostream> | |
6 #include <fstream> | |
7 #include <sstream> | |
8 #include <strstream> | |
9 #include <algorithm> | |
10 #include <string> | |
11 #include <functional> | |
12 #include <utility> | |
13 #include <ext/hash_map> | |
14 #include <boost/tokenizer.hpp> | |
15 | |
16 #include "BamAlignment.h" | |
17 #include "BamAux.h" /* RefVector/RefData */ | |
18 #include "BamReader.h" | |
19 | |
20 | |
21 extern "C" { | |
22 #include "R.h" | |
23 #include "Rmath.h" | |
24 #include "Rinternals.h" | |
25 #include "Rdefines.h" | |
26 } | |
27 | |
28 using namespace std; | |
29 using namespace __gnu_cxx; | |
30 | |
31 | |
32 class lessAbsoluteValue { | |
33 public: | |
34 bool operator()(int a, int b) const { | |
35 return abs(a) < abs(b); | |
36 } | |
37 }; | |
38 | |
39 | |
40 | |
41 | |
42 | |
43 //#define DEBUG 1 | |
44 | |
45 extern "C" { | |
46 | |
47 | |
48 // read in bam file | |
49 SEXP read_bam(SEXP filename,SEXP read_tag_names_R) { | |
50 | |
51 #ifdef DEBUG | |
52 Rprintf("start\n"); | |
53 #endif | |
54 const char* fname=CHAR(asChar(filename)); | |
55 int read_names=*(INTEGER(read_tag_names_R)); | |
56 #ifdef DEBUG | |
57 Rprintf("fname=%s\n",fname); | |
58 #endif | |
59 | |
60 // main data vector | |
61 // chr - pos | |
62 vector< vector<int> > pos; | |
63 vector< vector<int> > posnm; // number of mismatches | |
64 vector< vector<string> > tagnames; | |
65 | |
66 // chromosome map | |
67 hash_map<string, int, hash<string>,equal_to<string> > cind_map; | |
68 vector<string> cnames; | |
69 | |
70 | |
71 typedef boost::tokenizer<boost::char_separator<char> > tokType; | |
72 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); | |
73 boost::char_separator<char> sep2(","); | |
74 | |
75 BamTools::BamReader bamf; | |
76 | |
77 if (!bamf.Open(fname)) { | |
78 cout << "ERROR: failed to open BAM file '" << fname << "'" << endl; | |
79 } else { | |
80 | |
81 Rprintf("opened %s\n",fname); | |
82 BamTools::RefVector refs = bamf.GetReferenceData(); | |
83 BamTools::BamAlignment al; | |
84 | |
85 int fcount=0; | |
86 while (bamf.GetNextAlignment(al)) { | |
87 if (!al.IsMapped() || !al.IsPrimaryAlignment()) { | |
88 continue; | |
89 } | |
90 | |
91 string tagname=al.Name; | |
92 string chr=refs[al.RefID].RefName; | |
93 int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0)); | |
94 if(al.IsReverseStrand()) { fpos=-1*fpos; } | |
95 | |
96 uint32_t nms; | |
97 int nm=0; | |
98 if (al.GetEditDistance(nms)) { | |
99 nm=nms; | |
100 } | |
101 | |
102 | |
103 // determine the chromosome index | |
104 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); | |
105 int cind=-1; | |
106 if(li==cind_map.end()) { | |
107 // register new chromosome | |
108 cind=cnames.size(); | |
109 cnames.push_back(chr); | |
110 cind_map[chr]=cind; | |
111 // allocate new pos vector | |
112 pos.push_back(vector<int>()); | |
113 posnm.push_back(vector<int>()); | |
114 if(read_names) { | |
115 tagnames.push_back(vector<string>()); | |
116 } | |
117 #ifdef DEBUG | |
118 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); | |
119 #endif | |
120 } else { | |
121 cind=li->second; | |
122 } | |
123 fcount++; | |
124 (pos[cind]).push_back(fpos); | |
125 (posnm[cind]).push_back(nm); | |
126 if(read_names) { | |
127 (tagnames[cind]).push_back(al.Name); | |
128 } | |
129 #ifdef DEBUG | |
130 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm); | |
131 if(fcount>30) { | |
132 break; | |
133 } | |
134 #endif | |
135 | |
136 } | |
137 bamf.Close(); | |
138 | |
139 Rprintf("done. read %d fragments\n",fcount); | |
140 } | |
141 | |
142 | |
143 | |
144 // construct output structures | |
145 SEXP chnames; | |
146 int np=0; // number of protections | |
147 PROTECT(chnames = allocVector(STRSXP, cnames.size())); | |
148 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { | |
149 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); | |
150 } | |
151 np++; | |
152 | |
153 // sort | |
154 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { | |
155 // sort(csi->begin(), csi->end(), lessAbsoluteValue()); | |
156 //} | |
157 | |
158 SEXP ans; | |
159 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; | |
160 vector<vector<int> >::const_iterator nsi; | |
161 vector<vector<string> >::const_iterator ssi; | |
162 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { | |
163 nsi=posnm.begin()+(csi-pos.begin()); | |
164 | |
165 SEXP dv,dnames_R; | |
166 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; | |
167 SET_STRING_ELT(dnames_R, 0, mkChar("t")); | |
168 SET_STRING_ELT(dnames_R, 1, mkChar("n")); | |
169 if(read_names) { | |
170 SET_STRING_ELT(dnames_R, 2, mkChar("s")); | |
171 } | |
172 | |
173 | |
174 | |
175 SEXP tv,nv,sv; | |
176 PROTECT(tv=allocVector(INTSXP,csi->size())); np++; | |
177 PROTECT(nv=allocVector(INTSXP,csi->size())); np++; | |
178 if(read_names) { | |
179 PROTECT(sv=allocVector(STRSXP,csi->size())); np++; | |
180 } | |
181 int* i_tv=INTEGER(tv); | |
182 int* i_nv=INTEGER(nv); | |
183 | |
184 int i=0; | |
185 vector<int>::const_iterator ini=nsi->begin(); | |
186 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { | |
187 i_tv[i]=*pi; | |
188 i_nv[i]=*ini++; | |
189 i++; | |
190 } | |
191 if(read_names) { | |
192 int i=0; | |
193 ssi=tagnames.begin()+(csi-pos.begin()); | |
194 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { | |
195 SET_STRING_ELT(sv,i,mkChar(si->c_str())); | |
196 i++; | |
197 } | |
198 } | |
199 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; | |
200 SET_VECTOR_ELT(dv, 0, tv); | |
201 SET_VECTOR_ELT(dv, 1, nv); | |
202 if(read_names) { | |
203 SET_VECTOR_ELT(dv, 2, sv); | |
204 } | |
205 setAttrib(dv, R_NamesSymbol, dnames_R); | |
206 | |
207 SET_VECTOR_ELT(ans, csi-pos.begin(), dv); | |
208 } | |
209 | |
210 setAttrib(ans,R_NamesSymbol,chnames); | |
211 | |
212 #ifdef DEBUG | |
213 Rprintf("unprotecting %d elements\n",np); | |
214 #endif | |
215 | |
216 UNPROTECT(np); | |
217 return(ans); | |
218 } | |
219 | |
220 | |
221 | |
222 } |