annotate spp/src/bamread.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
1 #include "pc.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
2 #include "config.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
3 #include <vector>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
4 #include <string.h>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
5 #include <iostream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
6 #include <fstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
7 #include <sstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
8 #include <strstream>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
9 #include <algorithm>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
10 #include <string>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
11 #include <functional>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
12 #include <utility>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
13 #include <ext/hash_map>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
14 #include <boost/tokenizer.hpp>
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
15
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
16 #include "BamAlignment.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
17 #include "BamAux.h" /* RefVector/RefData */
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
18 #include "BamReader.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
19
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
20
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
21 extern "C" {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
22 #include "R.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
23 #include "Rmath.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
24 #include "Rinternals.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
25 #include "Rdefines.h"
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
26 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
27
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
28 using namespace std;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
29 using namespace __gnu_cxx;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
30
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
31
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
32 class lessAbsoluteValue {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
33 public:
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
34 bool operator()(int a, int b) const {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
35 return abs(a) < abs(b);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
36 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
37 };
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
38
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
39
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
40
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
41
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
42
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
43 //#define DEBUG 1
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
44
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
45 extern "C" {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
46
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
47
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
48 // read in bam file
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
49 SEXP read_bam(SEXP filename,SEXP read_tag_names_R) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
50
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
51 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
52 Rprintf("start\n");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
53 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
54 const char* fname=CHAR(asChar(filename));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
55 int read_names=*(INTEGER(read_tag_names_R));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
56 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
57 Rprintf("fname=%s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
58 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
59
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
60 // main data vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
61 // chr - pos
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
62 vector< vector<int> > pos;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
63 vector< vector<int> > posnm; // number of mismatches
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
64 vector< vector<string> > tagnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
65
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
66 // chromosome map
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
67 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
68 vector<string> cnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
69
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
70
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
71 typedef boost::tokenizer<boost::char_separator<char> > tokType;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
72 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
73 boost::char_separator<char> sep2(",");
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
74
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
75 BamTools::BamReader bamf;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
76
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
77 if (!bamf.Open(fname)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
78 cout << "ERROR: failed to open BAM file '" << fname << "'" << endl;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
79 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
80
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
81 Rprintf("opened %s\n",fname);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
82 BamTools::RefVector refs = bamf.GetReferenceData();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
83 BamTools::BamAlignment al;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
84
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
85 int fcount=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
86 while (bamf.GetNextAlignment(al)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
87 if (!al.IsMapped() || !al.IsPrimaryAlignment()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
88 continue;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
89 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
90
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
91 string tagname=al.Name;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
92 string chr=refs[al.RefID].RefName;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
93 int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
94 if(al.IsReverseStrand()) { fpos=-1*fpos; }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
95
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
96 uint32_t nms;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
97 int nm=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
98 if (al.GetEditDistance(nms)) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
99 nm=nms;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
100 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
101
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
102
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
103 // determine the chromosome index
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
104 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
105 int cind=-1;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
106 if(li==cind_map.end()) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
107 // register new chromosome
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
108 cind=cnames.size();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
109 cnames.push_back(chr);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
110 cind_map[chr]=cind;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
111 // allocate new pos vector
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
112 pos.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
113 posnm.push_back(vector<int>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
114 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
115 tagnames.push_back(vector<string>());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
116 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
117 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
118 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
119 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
120 } else {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
121 cind=li->second;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
122 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
123 fcount++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
124 (pos[cind]).push_back(fpos);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
125 (posnm[cind]).push_back(nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
126 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
127 (tagnames[cind]).push_back(al.Name);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
128 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
129 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
130 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
131 if(fcount>30) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
132 break;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
133 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
134 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
135
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
136 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
137 bamf.Close();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
138
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
139 Rprintf("done. read %d fragments\n",fcount);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
140 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
141
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
142
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
143
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
144 // construct output structures
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
145 SEXP chnames;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
146 int np=0; // number of protections
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
147 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
148 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
149 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
150 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
151 np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
152
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
153 // sort
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
154 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
155 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
156 //}
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
157
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
158 SEXP ans;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
159 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
160 vector<vector<int> >::const_iterator nsi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
161 vector<vector<string> >::const_iterator ssi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
162 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
163 nsi=posnm.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
164
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
165 SEXP dv,dnames_R;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
166 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
167 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
168 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
169 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
170 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
171 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
172
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
173
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
174
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
175 SEXP tv,nv,sv;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
176 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
177 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
178 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
179 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
180 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
181 int* i_tv=INTEGER(tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
182 int* i_nv=INTEGER(nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
183
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
184 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
185 vector<int>::const_iterator ini=nsi->begin();
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
186 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
187 i_tv[i]=*pi;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
188 i_nv[i]=*ini++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
189 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
190 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
191 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
192 int i=0;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
193 ssi=tagnames.begin()+(csi-pos.begin());
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
194 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
195 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
196 i++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
197 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
198 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
199 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
200 SET_VECTOR_ELT(dv, 0, tv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
201 SET_VECTOR_ELT(dv, 1, nv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
202 if(read_names) {
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
203 SET_VECTOR_ELT(dv, 2, sv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
204 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
205 setAttrib(dv, R_NamesSymbol, dnames_R);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
206
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
207 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
208 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
209
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
210 setAttrib(ans,R_NamesSymbol,chnames);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
211
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
212 #ifdef DEBUG
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
213 Rprintf("unprotecting %d elements\n",np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
214 #endif
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
215
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
216 UNPROTECT(np);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
217 return(ans);
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
218 }
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
219
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
220
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
221
ce08b0efa3fd Uploaded
zzhou
parents:
diff changeset
222 }