diff spp/src/bed2vector.cpp @ 6:ce08b0efa3fd draft

Uploaded
author zzhou
date Tue, 27 Nov 2012 16:11:40 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/spp/src/bed2vector.cpp	Tue Nov 27 16:11:40 2012 -0500
@@ -0,0 +1,2628 @@
+#include "pc.h"
+#include "config.h"
+#include <vector>
+#include <string.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <strstream>
+#include <algorithm>
+#include <string>
+#include <functional>
+#include <utility>
+#include <ext/hash_map>
+#include <boost/tokenizer.hpp>
+
+#ifdef HAVE_LIBBZ2
+#include <bzlib.h>
+#endif 
+
+extern "C" {
+#include "R.h"
+#include "Rmath.h"
+#include "Rinternals.h"
+#include "Rdefines.h"
+}
+
+using namespace std;
+using namespace __gnu_cxx; 
+
+
+class lessAbsoluteValue {
+public:
+  bool operator()(int a, int b) const {
+    return abs(a) < abs(b);
+  }
+};
+
+
+
+#ifdef HAVE_LIBBZ2
+int get_bzline(BZFILE* b,string& line) {
+  char c;
+  int     nBuf;
+  int bzerror=BZ_OK;
+
+  while(bzerror == BZ_OK)  {  
+    nBuf=BZ2_bzRead(&bzerror, b, &c, 1);
+    if(bzerror==BZ_OK) {
+      if(c=='\n') {
+	return bzerror;
+      } else {
+	line+=c;
+      }
+    }
+  }
+  return bzerror;
+}
+
+int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) {
+  line="";
+  if(bz2file) {
+    int bzerror=get_bzline(b,line);
+    if(bzerror==BZ_OK) {
+      return(1);
+    } else {
+      if(bzerror!=BZ_STREAM_END) {
+	cerr<<"encountered BZERROR="<<bzerror<<endl;
+      }
+      return(0);
+    }
+  } else {
+    char *cline=NULL;
+    size_t n;
+    if(getline(&cline,&n,f) != -1) {
+      if(cline) {
+	cline[strlen(cline)-1]='\0';
+	line+=cline;
+	free(cline);
+      }
+      return(1);
+    } else {
+      return(0);
+    }
+  }
+}
+#endif
+
+
+/**
+ * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign
+ * corresponding to the strand.
+ */
+
+//#define DEBUG 1
+
+extern "C" {
+SEXP read_bed_ends(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  ifstream bed_file(fname);
+
+#ifdef DEBUG  
+  Rprintf("opened %s\n",fname);
+#endif
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+    
+  int fcount=0;
+  while(getline(bed_file,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str()));
+      string str_start=*sit++;
+      int fstart=atoi(str_start.c_str());
+      string str_end=*sit++;
+      int fend=atoi(str_end.c_str());
+      int fpos=fstart;
+      if(sit!=tok.end()) {
+         string u0=*sit++;
+         string nfield=*sit++;
+         string strand=*sit++;
+         if(strand=="-") { 
+	   fpos=-1*fend;
+         }
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  bed_file.close();
+    
+    
+#ifdef DEBUG  
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+    sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  }
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    SEXP nv;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_nv=INTEGER(nv);
+    int i=0;
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_nv[i++]=*pi;
+    }
+    SET_VECTOR_ELT(ans, csi-pos.begin(), nv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+SEXP read_meland_old(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+
+  ifstream bed_file(fname);
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+    
+  int fcount=0;
+  while(getline(bed_file,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      sit++; sit++; 
+      string str_nm=*sit++;
+      int nm=0;
+      if(str_nm[0]=='U') {
+	nm=atoi((str_nm.c_str()+1));
+      } else {
+	continue;
+      }
+      sit++; sit++; sit++;
+      string str_len=*sit++;
+      int len=atoi(str_len.c_str());
+      string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	poslen.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      (poslen[cind]).push_back(len);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  bed_file.close();
+    
+    
+#ifdef DEBUG  
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi,lsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    lsi=poslen.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    
+    
+    
+    SEXP tv,nv,lv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(lv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator ili=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*ili++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 3));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  int get_a_line(FILE *f,string& line) {
+    line="";
+    char cline[1024];
+    if(fgets(cline,1024,f)) {
+      line+=cline;
+      return(1);
+    } else {
+      return(0);
+    }
+  }
+
+
+  SEXP read_meland(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  
+  Rprintf("opened %s\n",fname);
+
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string tagname=*sit++;
+      sit++; 
+      string str_nm=*sit++;
+      int nm=0;
+      if(str_nm[0]=='U') {
+	nm=atoi((str_nm.c_str()+1));
+      } else {
+	continue;
+      }
+      sit++; sit++; sit++;
+      string str_len=*sit++;
+      int len=atoi(str_len.c_str());
+      string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str()));
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	poslen.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      (poslen[cind]).push_back(len);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+    
+    
+#ifdef DEBUG  
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi,lsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    lsi=poslen.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,lv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(lv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator ili=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*ili++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 3+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 3, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+// reads regular eland files, recording mismatch positions
+SEXP read_eland_mismatches(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > mm1; // position of the first mismatch (or 0 for none)
+  vector< vector<int> > mm2; // position of the second mismatch
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      sit++; 
+      string seq=*sit++; 
+      string str_nm=*sit++;
+      int nm=0;
+      if(str_nm[0]=='U') {
+	nm=atoi((str_nm.c_str()+1));
+      } else {
+	continue;
+      }
+      sit++; sit++; sit++;
+      string chr=*sit++; 
+      // extract chromosome name from this
+      int chrp=chr.find("chr");
+      int pp=chr.find('.');
+      chr=chr.substr(chrp+3,pp-chrp-3);
+      
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+
+
+      string strand=*sit++;
+      int nstrand=0;
+      if(strand=="R") { 
+	fpos=-1*(fpos+seq.size()-1);
+	nstrand=1;
+      }
+
+      sit++;
+      
+      int nm1=0; int nm2=0;
+      if(sit!=tok.end()) {
+	string nms=*sit++;
+	nm1=atoi(nms.substr(0,nms.size()-1).c_str());
+	if(nstrand) { nm1=seq.size()-nm1+1; }
+      }
+      if(sit!=tok.end()) {
+	string nms=*sit++;
+	nm2=atoi(nms.substr(0,nms.size()-1).c_str());
+	if(nstrand) { nm2=seq.size()-nm2+1; }
+      }
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	mm1.push_back(vector<int>());
+	mm2.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (mm1[cind]).push_back(nm1);
+      (mm2[cind]).push_back(nm2);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+    
+    
+#ifdef DEBUG  
+  Rprintf("done. read %d fragments\n",fcount);
+#endif
+
+  Rprintf("done. read %d fragments\n",fcount);
+
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi,lsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=mm1.begin()+(csi-pos.begin());
+    lsi=mm2.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("f"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    
+    
+    
+    SEXP tv,nv,lv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(lv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator ili=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*ili++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 3));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // read in regular eland files, adjusting the negative strand coordinate by sequence length
+  SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string tagname=*sit++;
+      string sequence=*sit++;
+      int len=sequence.size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+	len=eland_tag_length;
+      }
+      string str_nm=*sit++;
+      int nm=0;
+      if(str_nm[0]=='U') {
+	nm=atoi((str_nm.c_str()+1));
+      } else {
+	continue;
+      }
+      sit++; sit++; sit++;
+      string chr=*sit++; 
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+      string str_strand=*sit++;
+
+      if(str_strand[0]=='R') {
+	fpos=-1*(fpos+len-1);
+      }
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+  // read in extended eland files, adjusting the negative strand coordinate by sequence length
+  SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string machinename=*sit++;
+      string runnumber=*sit++;
+      string lanenumber=*sit++;
+      *sit++;
+      
+      string str_x=*sit++;
+      string str_y=*sit++;
+
+      string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y;
+
+      
+
+      *sit++;
+      *sit++;
+
+      
+      string sequence=*sit++;
+      *sit++;
+      
+      string chr=*sit++; 
+      string contig=*sit++; 
+      chr=chr+contig;
+      
+      int len=sequence.size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+	len=eland_tag_length;
+      }
+
+
+      
+      string str_pos=*sit++;
+      if(str_pos.size()<1) { continue; }
+      int fpos=atoi(str_pos.c_str());
+      string str_strand=*sit++;
+
+      if(str_strand[0]=='R') {
+	fpos=-1*(fpos+len-1);
+      }
+
+      string str_nm=*sit++;
+      // count non-digit characters
+      int nm=0;
+      for(int i=0;i<str_nm.size();i++) {
+	if(!isdigit(str_nm[i])) { nm++; }
+      }
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // read in eland multi files, adjusting the negative strand coordinate by sequence length
+SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) {
+  
+#ifdef DEBUG  
+  Rprintf("read_eland_muti() : start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+  int eland_tag_length=*(INTEGER(eland_tag_length_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t","");
+  boost::char_separator<char> comsep(",","",boost::keep_empty_tokens);
+  boost::char_separator<char> colsep(":","",boost::keep_empty_tokens);
+  
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int nline=0;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+    nline++;
+    // chomp
+    size_t elpos = line.find_last_not_of("\n");
+    if(elpos != string::npos) {
+      line = line.substr(0, elpos+1);
+    }
+#ifdef DEBUG  
+    Rprintf("line %d: %s\n",nline,line.c_str());
+#endif
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string tagname=*sit++;
+      string sequence=*sit++;
+      string mspec=*sit++;
+      // parse out match spec
+      
+      if(mspec=="NM" || mspec=="QC") { continue; }
+#ifdef DEBUG  
+      Rprintf("parsing out spec \"%s\" : ",mspec.c_str());
+#endif
+      
+      tokType stok(mspec, colsep);
+      tokType::iterator ssit=stok.begin();
+      string str_nm0=*ssit++;
+      
+      int nm=0;
+      int nm0=atoi(str_nm0.c_str());
+      if(nm0>1) { 
+#ifdef DEBUG  
+	Rprintf("rejected for nm0\n");
+#endif
+	continue; 
+      }
+      if(nm0==0) {
+	string str_nm1=*ssit++;
+	int nm1=atoi(str_nm1.c_str());
+	if(nm1>1) { 
+#ifdef DEBUG  
+	  Rprintf("rejected for nm1\n");
+#endif
+	  continue; 
+	}
+	if(nm1==0) {
+	  string str_nm2=*ssit++;
+	  int nm2=atoi(str_nm2.c_str());
+	  if(nm2>1) { 
+#ifdef DEBUG  
+	    Rprintf("rejected for nm2\n");
+#endif
+	    continue; 
+	  }
+	  nm=2;
+	} else {
+	  nm=1;
+	}
+      }
+
+#ifdef DEBUG  
+      Rprintf("accepted (nm=%d)\n",nm);
+#endif
+      int npos=0;
+      string mpos=*sit++;
+      vector<string> mposc;
+      vector<int> mposp;
+      tokType ptok(mpos, comsep);
+      string prevchr;
+      for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) {
+	string cpos=*psit;
+	npos++;
+	int strand=1;
+	if(cpos.size()<5) {
+	  Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str());
+	}
+	char lc=cpos.at(cpos.size()-1);
+	
+	if(atoi(&lc)==nm) {
+	  switch(cpos.at(cpos.size()-2)) {
+	  case 'R': strand=-1; break;
+	  case 'F': strand=1; break;
+	  default:
+	    Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break;
+	    continue;
+	  }
+          string chr,str_pos;
+	  size_t colpos=cpos.find(":");
+	  if(colpos==string::npos) {
+            if(npos>1) {
+              chr=prevchr;
+              str_pos=cpos.substr(0,cpos.size()-2);
+            } else {
+	      Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str()); 
+	      continue;
+            }
+	  } else {
+	      chr=cpos.substr(0,colpos);
+	      str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos);
+          }
+#ifdef DEBUG  
+	  Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand);
+#endif	  
+	  int pos=strand*atoi(str_pos.c_str());
+	  mposc.push_back(chr);
+	  mposp.push_back(pos);
+	}
+      }
+
+      string chr;
+      int fpos;
+      if(mposc.size()!=1) {
+	if(mposc.size()==0) {
+	  Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str()); 
+	} else {
+	  Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str()); 
+	}
+	continue;
+      } else {
+	chr=*mposc.begin();
+	fpos=*mposp.begin();
+      }
+      
+      int len=sequence.size();
+      // adjust probe length if eland length limit was specified
+      if(eland_tag_length>0 && len>eland_tag_length) {
+	len=eland_tag_length;
+      }
+
+      if(fpos<0) {
+	fpos=-1*(-1*fpos+len-1);
+      }
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // read in regular eland files, adjusting the negative strand coordinate by sequence length
+  SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+  boost::char_separator<char> sep2(",");
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; 
+  } else {
+#ifdef HAVE_LIBBZ2
+    BZFILE* b;  
+    int bzerror;
+    
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK)  { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+#ifdef HAVE_LIBBZ2
+  while(get_a_line(f,b,bz2file,line)) {
+#else
+  while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string tagname=*sit++;
+      string str_strand=*sit++;
+      string chr=*sit++; 
+
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+
+      string sequence=*sit++;
+      sit++; sit++;
+      string mm=*sit++;
+
+      int len=sequence.size();
+      if(str_strand[0]=='-') {
+	fpos=-1*(fpos+len-1);
+      }
+      // determine number of mismatches
+      int nm=0;
+      if(mm.size()>0) {
+	nm++;
+	string::size_type tp(0);
+	while(tp!=string::npos) {
+	  tp = mm.find(",",tp);
+	  if(tp!=string::npos) {
+	    tp++;
+	    ++nm;
+	  }
+	}
+      }
+
+
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+
+#ifdef HAVE_LIBBZ2
+  BZ2_bzReadClose( &bzerror, b);
+#endif
+ fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // read in helicos tab-separated alignment output (regular or bz2)
+  SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length of the match
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+  boost::char_separator<char> sep2(",");
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; 
+  } else {
+#ifdef HAVE_LIBBZ2
+    BZFILE* b;  
+    int bzerror;
+    
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK)  { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  int nlines=0;
+#ifdef HAVE_LIBBZ2
+  while(get_a_line(f,b,bz2file,line)) {
+#else
+  while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+    nlines++;
+    // skip comments
+    if(line[0]=='#') { continue; }
+    if(line.compare(0,12,"Reference_ID")==0) { 
+#ifdef DEBUG  
+      Rprintf("matched header on line %d\n",nlines); 
+#endif
+      continue; 
+    }
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string chr=*sit++; 
+      string tagname=*sit++;
+      string str_startpos=*sit++;
+      string str_endpos=*sit++;
+
+      string str_tstart=*sit++;
+      string str_tend=*sit++;
+      int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str());
+
+      sit++; sit++;
+      string str_ndel=*sit++;
+      string str_nins=*sit++;
+      string str_nsub=*sit++;
+      
+      string str_strand=*sit++;
+      int fpos;
+      if(str_strand[0]=='-') {
+	fpos=-1*atoi(str_endpos.c_str()); 
+      } else {
+	fpos=atoi(str_startpos.c_str()); 
+      }
+
+      // determine number of mismatches
+      int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str());
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	poslen.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      (poslen[cind]).push_back(len);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+
+#ifdef HAVE_LIBBZ2
+  BZ2_bzReadClose( &bzerror, b);
+#endif
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<int> >::const_iterator lsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    lsi=poslen.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 3, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,lv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(lv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator lni=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*lni++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 3+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 3, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+  // read in text version of maq map
+  SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+  int read_names=*(INTEGER(read_tag_names_R));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<string> > tagnames;
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep("\t","",boost::keep_empty_tokens);
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string tagname=*sit++;
+      string chr=*sit++;
+      string str_pos=*sit++;
+      int fpos=atoi(str_pos.c_str());
+      string str_strand=*sit++;
+      sit++; sit++; sit++; sit++; sit++; 
+      string str_nm=*sit++;
+      sit++; sit++; sit++; 
+      string str_len=*sit++;
+      int nm=atoi(str_nm.c_str());
+      int len=atoi(str_len.c_str());
+
+      if(str_strand[0]=='-') {
+	fpos=-1*(fpos+len-1);
+      }
+
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	if(read_names) {
+	  tagnames.push_back(vector<string>());
+	}
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      if(read_names) {
+	(tagnames[cind]).push_back(tagname);
+      }
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<string> >::const_iterator ssi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    if(read_names) {
+      SET_STRING_ELT(dnames_R, 2, mkChar("s"));
+    }
+    
+    
+    
+    SEXP tv,nv,sv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    if(read_names) {
+      PROTECT(sv=allocVector(STRSXP,csi->size()));   np++;
+    }
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    if(read_names) {
+      int i=0;
+      ssi=tagnames.begin()+(csi-pos.begin());
+      for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) {
+	SET_STRING_ELT(sv,i,mkChar(si->c_str()));
+	i++;
+      }
+    }
+    PROTECT(dv = allocVector(VECSXP, 2+read_names));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    if(read_names) {
+      SET_VECTOR_ELT(dv, 2, sv);
+    }
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+
+
+  // read in tagalign file
+  SEXP read_tagalign(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+  
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+  while(get_a_line(f,line)) {
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string chr=*sit++;
+      string str_spos=*sit++;
+      string str_epos=*sit++;
+      sit++; 
+      string str_qual=*sit++;
+      string str_strand=*sit;
+
+      int fpos;
+      if(str_strand[0]=='+') {
+	fpos=atoi(str_spos.c_str());
+      } else {
+	fpos=-1*atoi(str_epos.c_str());
+      }
+      int nm=atoi(str_qual.c_str());
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    
+    
+    SEXP tv,nv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 2));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+
+
+  // arachne madness
+  SEXP read_arachne(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+  
+
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+
+#ifdef HAVE_LIBBZ2
+    BZFILE* b;  
+    int bzerror;
+    
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK)  { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+#ifdef HAVE_LIBBZ2
+  while(get_a_line(f,b,bz2file,line)) {
+#else
+  while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string chr=*sit++;
+      string str_spos=*sit++;
+      int nm=0;
+      if(sit!=tok.end()) {
+	string str_mm=*sit;
+	nm=atoi(str_mm.c_str());
+      }
+      
+      int fpos=atoi(str_spos.c_str());;
+      
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+#ifdef HAVE_LIBBZ2
+  BZ2_bzReadClose( &bzerror, b);
+#endif
+
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 2)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    
+    
+    SEXP tv,nv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 2));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+  // arachne madness
+  SEXP read_arachne_long(SEXP filename) {
+
+#ifdef DEBUG  
+  Rprintf("start\n");
+#endif
+  const char* fname=CHAR(asChar(filename));
+#ifdef DEBUG  
+  Rprintf("fname=%s\n",fname);
+#endif
+
+  // main data vector
+  // chr - pos
+  vector< vector<int> > pos;
+  vector< vector<int> > posnm; // number of mismatches
+  vector< vector<int> > poslen; // length of the match
+
+  // chromosome map
+  hash_map<string, int, hash<string>,equal_to<string> > cind_map;
+  vector<string> cnames;
+  
+
+  typedef boost::tokenizer<boost::char_separator<char> >  tokType;
+  boost::char_separator<char> sep(" \t");
+
+  
+
+
+
+  FILE *f=fopen(fname,"rb");
+  if (!f)  { cout<<"can't open input file \""<<fname<<"\"\n"; }
+  else {
+
+#ifdef HAVE_LIBBZ2
+    BZFILE* b;  
+    int bzerror;
+    
+    int bz2file=0;
+    if(strstr(fname,".bz2")) {
+      bz2file=1;
+      b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0);
+      if (bzerror != BZ_OK)  { cout<<"bzerror="<<bzerror<<endl; }
+    }
+#endif
+
+
+  Rprintf("opened %s\n",fname);
+
+  // read in bed line
+  string line;
+  int fcount=0;
+#ifdef HAVE_LIBBZ2
+  while(get_a_line(f,b,bz2file,line)) {
+#else
+  while(get_a_line(f,line)) {
+#endif
+
+#ifdef DEBUG  
+    Rprintf("line: %s\n",line.c_str());
+#endif
+
+
+    tokType tok(line, sep);
+    tokType::iterator sit=tok.begin();
+    if(sit!=tok.end()) {
+      string query=*sit++;
+      if(query!="QUERY") { continue; }
+      *sit++; *sit++; *sit++; *sit++; 
+      string str_strand=*sit++;
+      string chr=*sit++;
+      string str_startpos=*sit++;
+      string str_endpos=*sit++;
+      
+      int fpos;
+      if(str_strand[0]=='1') {
+	fpos=-1*atoi(str_endpos.c_str()); 
+      } else {
+	fpos=atoi(str_startpos.c_str()); 
+      }
+#ifdef DEBUG  
+      Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos);
+#endif
+      *sit++;
+      string str_nblocks=*sit++;
+      int nblocks=atoi(str_nblocks.c_str());
+#ifdef DEBUG  
+      Rprintf("nblocks=%d\n",nblocks);
+#endif
+      // tally up the read length and the number of mismatches for all blocks
+      int len=0; int nm=0;
+      for(int i=0;i<nblocks;i++) {
+	string str_sgs=*sit++;
+	int sgs=atoi(str_sgs.c_str());
+	string str_slen=*sit++;
+	int slen=atoi(str_slen.c_str());
+	string str_snm=*sit++;
+	int snm=atoi(str_snm.c_str());
+#ifdef DEBUG  
+	Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm);
+#endif
+	len+=slen;
+	nm+=abs(sgs)+snm;
+      }
+      nm+=nblocks-1;
+      
+      
+      // determine the chromosome index
+      hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr);
+      int cind=-1;
+      if(li==cind_map.end()) {
+	// register new chromosome
+	cind=cnames.size();
+	cnames.push_back(chr);
+	cind_map[chr]=cind;
+	// allocate new pos vector
+	pos.push_back(vector<int>());
+	posnm.push_back(vector<int>());
+	poslen.push_back(vector<int>());
+#ifdef DEBUG  
+	Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size());
+#endif
+      } else {
+	cind=li->second;
+      }
+      fcount++;
+      (pos[cind]).push_back(fpos);
+      (posnm[cind]).push_back(nm);
+      (poslen[cind]).push_back(len);
+#ifdef DEBUG  
+      Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len);
+      if(fcount>30) {
+	break;
+      }
+#endif
+      
+    }
+  }
+#ifdef HAVE_LIBBZ2
+  BZ2_bzReadClose( &bzerror, b);
+#endif
+
+  fclose(f);
+     
+  Rprintf("done. read %d fragments\n",fcount);
+  }
+    // construct output structures
+  SEXP chnames;
+  int np=0; // number of protections
+  PROTECT(chnames = allocVector(STRSXP, cnames.size()));
+  for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) {
+    SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str()));
+  }
+  np++;
+
+  // sort
+  //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) {
+  //  sort(csi->begin(), csi->end(), lessAbsoluteValue());
+  //}
+
+  SEXP ans;
+  PROTECT(ans = allocVector(VECSXP, cnames.size()));   np++;
+  vector<vector<int> >::const_iterator nsi;
+  vector<vector<int> >::const_iterator lsi;
+  for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) {
+    nsi=posnm.begin()+(csi-pos.begin());
+    lsi=poslen.begin()+(csi-pos.begin());
+
+    SEXP dv,dnames_R;
+    PROTECT(dnames_R = allocVector(STRSXP, 3)); np++;
+    SET_STRING_ELT(dnames_R, 0, mkChar("t"));
+    SET_STRING_ELT(dnames_R, 1, mkChar("n"));
+    SET_STRING_ELT(dnames_R, 2, mkChar("l"));
+    
+    
+    SEXP tv,nv,lv;
+    PROTECT(tv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(nv=allocVector(INTSXP,csi->size()));   np++;
+    PROTECT(lv=allocVector(INTSXP,csi->size()));   np++;
+    int* i_tv=INTEGER(tv);
+    int* i_nv=INTEGER(nv);
+    int* i_lv=INTEGER(lv);
+    
+    int i=0;
+    vector<int>::const_iterator ini=nsi->begin();
+    vector<int>::const_iterator lni=lsi->begin();
+    for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) {
+      i_tv[i]=*pi;
+      i_nv[i]=*ini++;
+      i_lv[i]=*lni++;
+      i++;
+    }
+    PROTECT(dv = allocVector(VECSXP, 3));   np++;
+    SET_VECTOR_ELT(dv, 0, tv);
+    SET_VECTOR_ELT(dv, 1, nv);
+    SET_VECTOR_ELT(dv, 2, lv);
+    setAttrib(dv, R_NamesSymbol, dnames_R);
+    
+    SET_VECTOR_ELT(ans, csi-pos.begin(), dv);
+  }
+
+  setAttrib(ans,R_NamesSymbol,chnames);
+
+#ifdef DEBUG  
+  Rprintf("unprotecting %d elements\n",np);
+#endif
+  
+  UNPROTECT(np);
+  return(ans);
+}
+
+
+}