comparison spp/src/bamread.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
comparison
equal deleted inserted replaced
5:608a8e0eac56 6:ce08b0efa3fd
1 #include "pc.h"
2 #include "config.h"
3 #include <vector>
4 #include <string.h>
5 #include <iostream>
6 #include <fstream>
7 #include <sstream>
8 #include <strstream>
9 #include <algorithm>
10 #include <string>
11 #include <functional>
12 #include <utility>
13 #include <ext/hash_map>
14 #include <boost/tokenizer.hpp>
15
16 #include "BamAlignment.h"
17 #include "BamAux.h" /* RefVector/RefData */
18 #include "BamReader.h"
19
20
21 extern "C" {
22 #include "R.h"
23 #include "Rmath.h"
24 #include "Rinternals.h"
25 #include "Rdefines.h"
26 }
27
28 using namespace std;
29 using namespace __gnu_cxx;
30
31
32 class lessAbsoluteValue {
33 public:
34 bool operator()(int a, int b) const {
35 return abs(a) < abs(b);
36 }
37 };
38
39
40
41
42
43 //#define DEBUG 1
44
45 extern "C" {
46
47
48 // read in bam file
49 SEXP read_bam(SEXP filename,SEXP read_tag_names_R) {
50
51 #ifdef DEBUG
52 Rprintf("start\n");
53 #endif
54 const char* fname=CHAR(asChar(filename));
55 int read_names=*(INTEGER(read_tag_names_R));
56 #ifdef DEBUG
57 Rprintf("fname=%s\n",fname);
58 #endif
59
60 // main data vector
61 // chr - pos
62 vector< vector<int> > pos;
63 vector< vector<int> > posnm; // number of mismatches
64 vector< vector<string> > tagnames;
65
66 // chromosome map
67 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
68 vector<string> cnames;
69
70
71 typedef boost::tokenizer<boost::char_separator<char> > tokType;
72 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
73 boost::char_separator<char> sep2(",");
74
75 BamTools::BamReader bamf;
76
77 if (!bamf.Open(fname)) {
78 cout << "ERROR: failed to open BAM file '" << fname << "'" << endl;
79 } else {
80
81 Rprintf("opened %s\n",fname);
82 BamTools::RefVector refs = bamf.GetReferenceData();
83 BamTools::BamAlignment al;
84
85 int fcount=0;
86 while (bamf.GetNextAlignment(al)) {
87 if (!al.IsMapped() || !al.IsPrimaryAlignment()) {
88 continue;
89 }
90
91 string tagname=al.Name;
92 string chr=refs[al.RefID].RefName;
93 int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0));
94 if(al.IsReverseStrand()) { fpos=-1*fpos; }
95
96 uint32_t nms;
97 int nm=0;
98 if (al.GetEditDistance(nms)) {
99 nm=nms;
100 }
101
102
103 // determine the chromosome index
104 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
105 int cind=-1;
106 if(li==cind_map.end()) {
107 // register new chromosome
108 cind=cnames.size();
109 cnames.push_back(chr);
110 cind_map[chr]=cind;
111 // allocate new pos vector
112 pos.push_back(vector<int>());
113 posnm.push_back(vector<int>());
114 if(read_names) {
115 tagnames.push_back(vector<string>());
116 }
117 #ifdef DEBUG
118 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
119 #endif
120 } else {
121 cind=li->second;
122 }
123 fcount++;
124 (pos[cind]).push_back(fpos);
125 (posnm[cind]).push_back(nm);
126 if(read_names) {
127 (tagnames[cind]).push_back(al.Name);
128 }
129 #ifdef DEBUG
130 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm);
131 if(fcount>30) {
132 break;
133 }
134 #endif
135
136 }
137 bamf.Close();
138
139 Rprintf("done. read %d fragments\n",fcount);
140 }
141
142
143
144 // construct output structures
145 SEXP chnames;
146 int np=0; // number of protections
147 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
148 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
149 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
150 }
151 np++;
152
153 // sort
154 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
155 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
156 //}
157
158 SEXP ans;
159 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
160 vector<vector<int> >::const_iterator nsi;
161 vector<vector<string> >::const_iterator ssi;
162 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
163 nsi=posnm.begin()+(csi-pos.begin());
164
165 SEXP dv,dnames_R;
166 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
167 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
168 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
169 if(read_names) {
170 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
171 }
172
173
174
175 SEXP tv,nv,sv;
176 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
177 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
178 if(read_names) {
179 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
180 }
181 int* i_tv=INTEGER(tv);
182 int* i_nv=INTEGER(nv);
183
184 int i=0;
185 vector<int>::const_iterator ini=nsi->begin();
186 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
187 i_tv[i]=*pi;
188 i_nv[i]=*ini++;
189 i++;
190 }
191 if(read_names) {
192 int i=0;
193 ssi=tagnames.begin()+(csi-pos.begin());
194 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
195 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
196 i++;
197 }
198 }
199 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
200 SET_VECTOR_ELT(dv, 0, tv);
201 SET_VECTOR_ELT(dv, 1, nv);
202 if(read_names) {
203 SET_VECTOR_ELT(dv, 2, sv);
204 }
205 setAttrib(dv, R_NamesSymbol, dnames_R);
206
207 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
208 }
209
210 setAttrib(ans,R_NamesSymbol,chnames);
211
212 #ifdef DEBUG
213 Rprintf("unprotecting %d elements\n",np);
214 #endif
215
216 UNPROTECT(np);
217 return(ans);
218 }
219
220
221
222 }