diff spp/src/maqread.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spp/src/maqread.cpp	Tue Nov 27 16:11:40 2012 -0500
@@ -0,0 +1,207 @@
+#include "pc.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <zlib.h>
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+#include "maqmap.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx; 
+
+
+class lessAbsoluteValue {
+public:
+  bool operator()(int a, int b) const {
+    return abs(a) < abs(b);
+  }
+};
+
+
+
+//#define DEBUG 1
+
+extern "C" {
+
+  // read in text version of maq map
+  SEXP read_binmaqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  gzFile f=gzopen(fname,"r");
+
+  maqmap_t *m = maqmap_read_header(f);  
+  maqmap1_t *m1, mm1;
+  m1 = &mm1;
+
+  if (!f)  { 
+    cout<<"can't open input file \""<<fname<<"\"\n"; 
+  }  else {
+    Rprintf("opened %s\n",fname);
+
+    // read in bed line
+    string line;
+    int fcount=0;
+    while(maqmap_read1(f, m1)) {
+      string tagname=string(m1->name);
+      string chr=string(m->ref_name[m1->seqid]);
+      int len=m1->size;
+      int fpos=(m1->pos>>1) + 1;
+      if(m1->pos&1) {
+	fpos=-1*(fpos+len-1);
+      }
+      int nm=m1->info1&0xf;
+
+#ifdef DEBUG  
+      Rprintf("read in map line chr=%s tagname=%s fpos=%d, nm=%d, len=%d\n",chr.c_str(),tagname.c_str(),fpos,nm,len);
+#endif
+    
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+    gzclose(f);
+    Rprintf("done. read %d fragments\n",fcount);
+  }
+  
+
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+}