6
|
1 #include "pc.h"
|
|
2 #include "config.h"
|
|
3 #include <vector>
|
|
4 #include <string.h>
|
|
5 #include <iostream>
|
|
6 #include <fstream>
|
|
7 #include <sstream>
|
|
8 #include <strstream>
|
|
9 #include <algorithm>
|
|
10 #include <string>
|
|
11 #include <functional>
|
|
12 #include <utility>
|
|
13 #include <ext/hash_map>
|
|
14 #include <boost/tokenizer.hpp>
|
|
15
|
|
16 #include "BamAlignment.h"
|
|
17 #include "BamAux.h" /* RefVector/RefData */
|
|
18 #include "BamReader.h"
|
|
19
|
|
20
|
|
21 extern "C" {
|
|
22 #include "R.h"
|
|
23 #include "Rmath.h"
|
|
24 #include "Rinternals.h"
|
|
25 #include "Rdefines.h"
|
|
26 }
|
|
27
|
|
28 using namespace std;
|
|
29 using namespace __gnu_cxx;
|
|
30
|
|
31
|
|
32 class lessAbsoluteValue {
|
|
33 public:
|
|
34 bool operator()(int a, int b) const {
|
|
35 return abs(a) < abs(b);
|
|
36 }
|
|
37 };
|
|
38
|
|
39
|
|
40
|
|
41
|
|
42
|
|
43 //#define DEBUG 1
|
|
44
|
|
45 extern "C" {
|
|
46
|
|
47
|
|
48 // read in bam file
|
|
49 SEXP read_bam(SEXP filename,SEXP read_tag_names_R) {
|
|
50
|
|
51 #ifdef DEBUG
|
|
52 Rprintf("start\n");
|
|
53 #endif
|
|
54 const char* fname=CHAR(asChar(filename));
|
|
55 int read_names=*(INTEGER(read_tag_names_R));
|
|
56 #ifdef DEBUG
|
|
57 Rprintf("fname=%s\n",fname);
|
|
58 #endif
|
|
59
|
|
60 // main data vector
|
|
61 // chr - pos
|
|
62 vector< vector<int> > pos;
|
|
63 vector< vector<int> > posnm; // number of mismatches
|
|
64 vector< vector<string> > tagnames;
|
|
65
|
|
66 // chromosome map
|
|
67 hash_map<string, int, hash<string>,equal_to<string> > cind_map;
|
|
68 vector<string> cnames;
|
|
69
|
|
70
|
|
71 typedef boost::tokenizer<boost::char_separator<char> > tokType;
|
|
72 boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
|
|
73 boost::char_separator<char> sep2(",");
|
|
74
|
|
75 BamTools::BamReader bamf;
|
|
76
|
|
77 if (!bamf.Open(fname)) {
|
|
78 cout << "ERROR: failed to open BAM file '" << fname << "'" << endl;
|
|
79 } else {
|
|
80
|
|
81 Rprintf("opened %s\n",fname);
|
|
82 BamTools::RefVector refs = bamf.GetReferenceData();
|
|
83 BamTools::BamAlignment al;
|
|
84
|
|
85 int fcount=0;
|
|
86 while (bamf.GetNextAlignment(al)) {
|
|
87 if (!al.IsMapped() || !al.IsPrimaryAlignment()) {
|
|
88 continue;
|
|
89 }
|
|
90
|
|
91 string tagname=al.Name;
|
|
92 string chr=refs[al.RefID].RefName;
|
|
93 int fpos=(int) (al.Position + (al.IsReverseStrand() ? al.Length : 0));
|
|
94 if(al.IsReverseStrand()) { fpos=-1*fpos; }
|
|
95
|
|
96 uint32_t nms;
|
|
97 int nm=0;
|
|
98 if (al.GetEditDistance(nms)) {
|
|
99 nm=nms;
|
|
100 }
|
|
101
|
|
102
|
|
103 // determine the chromosome index
|
|
104 hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
|
|
105 int cind=-1;
|
|
106 if(li==cind_map.end()) {
|
|
107 // register new chromosome
|
|
108 cind=cnames.size();
|
|
109 cnames.push_back(chr);
|
|
110 cind_map[chr]=cind;
|
|
111 // allocate new pos vector
|
|
112 pos.push_back(vector<int>());
|
|
113 posnm.push_back(vector<int>());
|
|
114 if(read_names) {
|
|
115 tagnames.push_back(vector<string>());
|
|
116 }
|
|
117 #ifdef DEBUG
|
|
118 Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
|
|
119 #endif
|
|
120 } else {
|
|
121 cind=li->second;
|
|
122 }
|
|
123 fcount++;
|
|
124 (pos[cind]).push_back(fpos);
|
|
125 (posnm[cind]).push_back(nm);
|
|
126 if(read_names) {
|
|
127 (tagnames[cind]).push_back(al.Name);
|
|
128 }
|
|
129 #ifdef DEBUG
|
|
130 Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d",chr.c_str(),cind,fpos,nm);
|
|
131 if(fcount>30) {
|
|
132 break;
|
|
133 }
|
|
134 #endif
|
|
135
|
|
136 }
|
|
137 bamf.Close();
|
|
138
|
|
139 Rprintf("done. read %d fragments\n",fcount);
|
|
140 }
|
|
141
|
|
142
|
|
143
|
|
144 // construct output structures
|
|
145 SEXP chnames;
|
|
146 int np=0; // number of protections
|
|
147 PROTECT(chnames = allocVector(STRSXP, cnames.size()));
|
|
148 for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
|
|
149 SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
|
|
150 }
|
|
151 np++;
|
|
152
|
|
153 // sort
|
|
154 //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
155 // sort(csi->begin(), csi->end(), lessAbsoluteValue());
|
|
156 //}
|
|
157
|
|
158 SEXP ans;
|
|
159 PROTECT(ans = allocVector(VECSXP, cnames.size())); np++;
|
|
160 vector<vector<int> >::const_iterator nsi;
|
|
161 vector<vector<string> >::const_iterator ssi;
|
|
162 for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
|
|
163 nsi=posnm.begin()+(csi-pos.begin());
|
|
164
|
|
165 SEXP dv,dnames_R;
|
|
166 PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
|
|
167 SET_STRING_ELT(dnames_R, 0, mkChar("t"));
|
|
168 SET_STRING_ELT(dnames_R, 1, mkChar("n"));
|
|
169 if(read_names) {
|
|
170 SET_STRING_ELT(dnames_R, 2, mkChar("s"));
|
|
171 }
|
|
172
|
|
173
|
|
174
|
|
175 SEXP tv,nv,sv;
|
|
176 PROTECT(tv=allocVector(INTSXP,csi->size())); np++;
|
|
177 PROTECT(nv=allocVector(INTSXP,csi->size())); np++;
|
|
178 if(read_names) {
|
|
179 PROTECT(sv=allocVector(STRSXP,csi->size())); np++;
|
|
180 }
|
|
181 int* i_tv=INTEGER(tv);
|
|
182 int* i_nv=INTEGER(nv);
|
|
183
|
|
184 int i=0;
|
|
185 vector<int>::const_iterator ini=nsi->begin();
|
|
186 for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
|
|
187 i_tv[i]=*pi;
|
|
188 i_nv[i]=*ini++;
|
|
189 i++;
|
|
190 }
|
|
191 if(read_names) {
|
|
192 int i=0;
|
|
193 ssi=tagnames.begin()+(csi-pos.begin());
|
|
194 for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
|
|
195 SET_STRING_ELT(sv,i,mkChar(si->c_str()));
|
|
196 i++;
|
|
197 }
|
|
198 }
|
|
199 PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++;
|
|
200 SET_VECTOR_ELT(dv, 0, tv);
|
|
201 SET_VECTOR_ELT(dv, 1, nv);
|
|
202 if(read_names) {
|
|
203 SET_VECTOR_ELT(dv, 2, sv);
|
|
204 }
|
|
205 setAttrib(dv, R_NamesSymbol, dnames_R);
|
|
206
|
|
207 SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
|
|
208 }
|
|
209
|
|
210 setAttrib(ans,R_NamesSymbol,chnames);
|
|
211
|
|
212 #ifdef DEBUG
|
|
213 Rprintf("unprotecting %d elements\n",np);
|
|
214 #endif
|
|
215
|
|
216 UNPROTECT(np);
|
|
217 return(ans);
|
|
218 }
|
|
219
|
|
220
|
|
221
|
|
222 }
|