Mercurial > repos > zzhou > spp_phantompeak
view spp/src/bed2vector.cpp @ 15:e689b83b0257 draft
Uploaded
author | zzhou |
---|---|
date | Tue, 27 Nov 2012 16:15:21 -0500 |
parents | ce08b0efa3fd |
children |
line wrap: on
line source
#include "pc.h" #include "config.h" #include <vector> #include <string.h> #include <iostream> #include <fstream> #include <sstream> #include <strstream> #include <algorithm> #include <string> #include <functional> #include <utility> #include <ext/hash_map> #include <boost/tokenizer.hpp> #ifdef HAVE_LIBBZ2 #include <bzlib.h> #endif extern "C" { #include "R.h" #include "Rmath.h" #include "Rinternals.h" #include "Rdefines.h" } using namespace std; using namespace __gnu_cxx; class lessAbsoluteValue { public: bool operator()(int a, int b) const { return abs(a) < abs(b); } }; #ifdef HAVE_LIBBZ2 int get_bzline(BZFILE* b,string& line) { char c; int nBuf; int bzerror=BZ_OK; while(bzerror == BZ_OK) { nBuf=BZ2_bzRead(&bzerror, b, &c, 1); if(bzerror==BZ_OK) { if(c=='\n') { return bzerror; } else { line+=c; } } } return bzerror; } int get_a_line(FILE *f,BZFILE *b,int bz2file,string& line) { line=""; if(bz2file) { int bzerror=get_bzline(b,line); if(bzerror==BZ_OK) { return(1); } else { if(bzerror!=BZ_STREAM_END) { cerr<<"encountered BZERROR="<<bzerror<<endl; } return(0); } } else { char *cline=NULL; size_t n; if(getline(&cline,&n,f) != -1) { if(cline) { cline[strlen(cline)-1]='\0'; line+=cline; free(cline); } return(1); } else { return(0); } } } #endif /** * Read in .bed data into a list chromosome of vectors representing 5' positions, with sign * corresponding to the strand. */ //#define DEBUG 1 extern "C" { SEXP read_bed_ends(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); ifstream bed_file(fname); #ifdef DEBUG Rprintf("opened %s\n",fname); #endif Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(getline(bed_file,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string chr=*sit++; //chr=chr.substr(3,strlen(chr.c_str())); string str_start=*sit++; int fstart=atoi(str_start.c_str()); string str_end=*sit++; int fend=atoi(str_end.c_str()); int fpos=fstart; if(sit!=tok.end()) { string u0=*sit++; string nfield=*sit++; string strand=*sit++; if(strand=="-") { fpos=-1*fend; } } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d\n",chr.c_str(),cind,fpos); if(fcount>30) { break; } #endif } } bed_file.close(); #ifdef DEBUG Rprintf("done. read %d fragments\n",fcount); #endif Rprintf("done. read %d fragments\n",fcount); // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { sort(csi->begin(), csi->end(), lessAbsoluteValue()); } SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { SEXP nv; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; int* i_nv=INTEGER(nv); int i=0; for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_nv[i++]=*pi; } SET_VECTOR_ELT(ans, csi-pos.begin(), nv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } SEXP read_meland_old(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<int> > poslen; // length // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); ifstream bed_file(fname); Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(getline(bed_file,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { sit++; sit++; string str_nm=*sit++; int nm=0; if(str_nm[0]=='U') { nm=atoi((str_nm.c_str()+1)); } else { continue; } sit++; sit++; sit++; string str_len=*sit++; int len=atoi(str_len.c_str()); string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); poslen.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); (poslen[cind]).push_back(len); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } bed_file.close(); #ifdef DEBUG Rprintf("done. read %d fragments\n",fcount); #endif Rprintf("done. read %d fragments\n",fcount); // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi,lsi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); lsi=poslen.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SET_STRING_ELT(dnames_R, 2, mkChar("l")); SEXP tv,nv,lv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; PROTECT(lv=allocVector(INTSXP,csi->size())); np++; int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int* i_lv=INTEGER(lv); int i=0; vector<int>::const_iterator ini=nsi->begin(); vector<int>::const_iterator ili=lsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i_lv[i]=*ili++; i++; } PROTECT(dv = allocVector(VECSXP, 3)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); SET_VECTOR_ELT(dv, 2, lv); setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } int get_a_line(FILE *f,string& line) { line=""; char cline[1024]; if(fgets(cline,1024,f)) { line+=cline; return(1); } else { return(0); } } SEXP read_meland(SEXP filename,SEXP read_tag_names_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<int> > poslen; // length vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string tagname=*sit++; sit++; string str_nm=*sit++; int nm=0; if(str_nm[0]=='U') { nm=atoi((str_nm.c_str()+1)); } else { continue; } sit++; sit++; sit++; string str_len=*sit++; int len=atoi(str_len.c_str()); string chr=*sit++; chr=chr.substr(3,strlen(chr.c_str())); string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); poslen.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); (poslen[cind]).push_back(len); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } fclose(f); #ifdef DEBUG Rprintf("done. read %d fragments\n",fcount); #endif Rprintf("done. read %d fragments\n",fcount); // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi,lsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); lsi=poslen.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SET_STRING_ELT(dnames_R, 2, mkChar("l")); if(read_names) { SET_STRING_ELT(dnames_R, 3, mkChar("s")); } SEXP tv,nv,lv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; PROTECT(lv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int* i_lv=INTEGER(lv); int i=0; vector<int>::const_iterator ini=nsi->begin(); vector<int>::const_iterator ili=lsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i_lv[i]=*ili++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); SET_VECTOR_ELT(dv, 2, lv); if(read_names) { SET_VECTOR_ELT(dv, 3, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // reads regular eland files, recording mismatch positions SEXP read_eland_mismatches(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > mm1; // position of the first mismatch (or 0 for none) vector< vector<int> > mm2; // position of the second mismatch // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { sit++; string seq=*sit++; string str_nm=*sit++; int nm=0; if(str_nm[0]=='U') { nm=atoi((str_nm.c_str()+1)); } else { continue; } sit++; sit++; sit++; string chr=*sit++; // extract chromosome name from this int chrp=chr.find("chr"); int pp=chr.find('.'); chr=chr.substr(chrp+3,pp-chrp-3); string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); string strand=*sit++; int nstrand=0; if(strand=="R") { fpos=-1*(fpos+seq.size()-1); nstrand=1; } sit++; int nm1=0; int nm2=0; if(sit!=tok.end()) { string nms=*sit++; nm1=atoi(nms.substr(0,nms.size()-1).c_str()); if(nstrand) { nm1=seq.size()-nm1+1; } } if(sit!=tok.end()) { string nms=*sit++; nm2=atoi(nms.substr(0,nms.size()-1).c_str()); if(nstrand) { nm2=seq.size()-nm2+1; } } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); mm1.push_back(vector<int>()); mm2.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (mm1[cind]).push_back(nm1); (mm2[cind]).push_back(nm2); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm1=%d, nm2=%d\n",chr.c_str(),cind,fpos,nm1,nm2); if(fcount>30) { break; } #endif } } fclose(f); #ifdef DEBUG Rprintf("done. read %d fragments\n",fcount); #endif Rprintf("done. read %d fragments\n",fcount); // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi,lsi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=mm1.begin()+(csi-pos.begin()); lsi=mm2.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("f")); SET_STRING_ELT(dnames_R, 2, mkChar("s")); SEXP tv,nv,lv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; PROTECT(lv=allocVector(INTSXP,csi->size())); np++; int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int* i_lv=INTEGER(lv); int i=0; vector<int>::const_iterator ini=nsi->begin(); vector<int>::const_iterator ili=lsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i_lv[i]=*ili++; i++; } PROTECT(dv = allocVector(VECSXP, 3)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); SET_VECTOR_ELT(dv, 2, lv); setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in regular eland files, adjusting the negative strand coordinate by sequence length SEXP read_eland(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); int eland_tag_length=*(INTEGER(eland_tag_length_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string tagname=*sit++; string sequence=*sit++; int len=sequence.size(); // adjust probe length if eland length limit was specified if(eland_tag_length>0 && len>eland_tag_length) { len=eland_tag_length; } string str_nm=*sit++; int nm=0; if(str_nm[0]=='U') { nm=atoi((str_nm.c_str()+1)); } else { continue; } sit++; sit++; sit++; string chr=*sit++; string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); string str_strand=*sit++; if(str_strand[0]=='R') { fpos=-1*(fpos+len-1); } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); if(read_names) { SET_STRING_ELT(dnames_R, 2, mkChar("s")); } SEXP tv,nv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); if(read_names) { SET_VECTOR_ELT(dv, 2, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in extended eland files, adjusting the negative strand coordinate by sequence length SEXP read_eland_extended(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); int eland_tag_length=*(INTEGER(eland_tag_length_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string machinename=*sit++; string runnumber=*sit++; string lanenumber=*sit++; *sit++; string str_x=*sit++; string str_y=*sit++; string tagname=machinename+"."+runnumber+"."+lanenumber+"."+str_x+"."+str_y; *sit++; *sit++; string sequence=*sit++; *sit++; string chr=*sit++; string contig=*sit++; chr=chr+contig; int len=sequence.size(); // adjust probe length if eland length limit was specified if(eland_tag_length>0 && len>eland_tag_length) { len=eland_tag_length; } string str_pos=*sit++; if(str_pos.size()<1) { continue; } int fpos=atoi(str_pos.c_str()); string str_strand=*sit++; if(str_strand[0]=='R') { fpos=-1*(fpos+len-1); } string str_nm=*sit++; // count non-digit characters int nm=0; for(int i=0;i<str_nm.size();i++) { if(!isdigit(str_nm[i])) { nm++; } } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); if(read_names) { SET_STRING_ELT(dnames_R, 2, mkChar("s")); } SEXP tv,nv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); if(read_names) { SET_VECTOR_ELT(dv, 2, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in eland multi files, adjusting the negative strand coordinate by sequence length SEXP read_eland_multi(SEXP filename,SEXP read_tag_names_R,SEXP eland_tag_length_R) { #ifdef DEBUG Rprintf("read_eland_muti() : start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); int eland_tag_length=*(INTEGER(eland_tag_length_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t",""); boost::char_separator<char> comsep(",","",boost::keep_empty_tokens); boost::char_separator<char> colsep(":","",boost::keep_empty_tokens); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { Rprintf("opened %s\n",fname); // read in bed line string line; int nline=0; int fcount=0; while(get_a_line(f,line)) { nline++; // chomp size_t elpos = line.find_last_not_of("\n"); if(elpos != string::npos) { line = line.substr(0, elpos+1); } #ifdef DEBUG Rprintf("line %d: %s\n",nline,line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string tagname=*sit++; string sequence=*sit++; string mspec=*sit++; // parse out match spec if(mspec=="NM" || mspec=="QC") { continue; } #ifdef DEBUG Rprintf("parsing out spec \"%s\" : ",mspec.c_str()); #endif tokType stok(mspec, colsep); tokType::iterator ssit=stok.begin(); string str_nm0=*ssit++; int nm=0; int nm0=atoi(str_nm0.c_str()); if(nm0>1) { #ifdef DEBUG Rprintf("rejected for nm0\n"); #endif continue; } if(nm0==0) { string str_nm1=*ssit++; int nm1=atoi(str_nm1.c_str()); if(nm1>1) { #ifdef DEBUG Rprintf("rejected for nm1\n"); #endif continue; } if(nm1==0) { string str_nm2=*ssit++; int nm2=atoi(str_nm2.c_str()); if(nm2>1) { #ifdef DEBUG Rprintf("rejected for nm2\n"); #endif continue; } nm=2; } else { nm=1; } } #ifdef DEBUG Rprintf("accepted (nm=%d)\n",nm); #endif int npos=0; string mpos=*sit++; vector<string> mposc; vector<int> mposp; tokType ptok(mpos, comsep); string prevchr; for(tokType::iterator psit=ptok.begin();psit!=ptok.end();psit++) { string cpos=*psit; npos++; int strand=1; if(cpos.size()<5) { Rprintf("ERROR: line=%d, match %d is too short: \"%s\"; ",nline,npos,cpos.c_str()); } char lc=cpos.at(cpos.size()-1); if(atoi(&lc)==nm) { switch(cpos.at(cpos.size()-2)) { case 'R': strand=-1; break; case 'F': strand=1; break; default: Rprintf("ERROR: line=%d, match %d specifies an invalid strand %c\n",nline,npos,cpos.at(cpos.size()-2)); break; continue; } string chr,str_pos; size_t colpos=cpos.find(":"); if(colpos==string::npos) { if(npos>1) { chr=prevchr; str_pos=cpos.substr(0,cpos.size()-2); } else { Rprintf("ERROR: line=%d, match %d does not contain chromosome separator: \"%s\"\n",nline,npos,cpos.c_str()); continue; } } else { chr=cpos.substr(0,colpos); str_pos=cpos.substr(colpos+1,cpos.size()-3-colpos); } #ifdef DEBUG Rprintf("\"%s\" : chr=%s, pos=%s, strand=%d\n",cpos.c_str(),chr.c_str(),str_pos.c_str(),strand); #endif int pos=strand*atoi(str_pos.c_str()); mposc.push_back(chr); mposp.push_back(pos); } } string chr; int fpos; if(mposc.size()!=1) { if(mposc.size()==0) { Rprintf("ERROR: line=%d: no %d-mismatch matches were found in \"%s\"\n",nline,nm,mpos.c_str()); } else { Rprintf("ERROR: line=%d: more than one (%d) %d-mismatch matches were found in \"%s\"\n",nline,mposc.size(),nm,mpos.c_str()); } continue; } else { chr=*mposc.begin(); fpos=*mposp.begin(); } int len=sequence.size(); // adjust probe length if eland length limit was specified if(eland_tag_length>0 && len>eland_tag_length) { len=eland_tag_length; } if(fpos<0) { fpos=-1*(-1*fpos+len-1); } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); if(read_names) { SET_STRING_ELT(dnames_R, 2, mkChar("s")); } SEXP tv,nv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); if(read_names) { SET_VECTOR_ELT(dv, 2, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in regular eland files, adjusting the negative strand coordinate by sequence length SEXP read_bowtie(SEXP filename,SEXP read_tag_names_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); boost::char_separator<char> sep2(","); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { #ifdef HAVE_LIBBZ2 BZFILE* b; int bzerror; int bz2file=0; if(strstr(fname,".bz2")) { bz2file=1; b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } } #endif Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; #ifdef HAVE_LIBBZ2 while(get_a_line(f,b,bz2file,line)) { #else while(get_a_line(f,line)) { #endif #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string tagname=*sit++; string str_strand=*sit++; string chr=*sit++; string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); string sequence=*sit++; sit++; sit++; string mm=*sit++; int len=sequence.size(); if(str_strand[0]=='-') { fpos=-1*(fpos+len-1); } // determine number of mismatches int nm=0; if(mm.size()>0) { nm++; string::size_type tp(0); while(tp!=string::npos) { tp = mm.find(",",tp); if(tp!=string::npos) { tp++; ++nm; } } } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } #ifdef HAVE_LIBBZ2 BZ2_bzReadClose( &bzerror, b); #endif fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); if(read_names) { SET_STRING_ELT(dnames_R, 2, mkChar("s")); } SEXP tv,nv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); if(read_names) { SET_VECTOR_ELT(dv, 2, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in helicos tab-separated alignment output (regular or bz2) SEXP read_helicostabf(SEXP filename,SEXP read_tag_names_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<int> > poslen; // length of the match vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); boost::char_separator<char> sep2(","); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { #ifdef HAVE_LIBBZ2 BZFILE* b; int bzerror; int bz2file=0; if(strstr(fname,".bz2")) { bz2file=1; b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } } #endif Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; int nlines=0; #ifdef HAVE_LIBBZ2 while(get_a_line(f,b,bz2file,line)) { #else while(get_a_line(f,line)) { #endif #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif nlines++; // skip comments if(line[0]=='#') { continue; } if(line.compare(0,12,"Reference_ID")==0) { #ifdef DEBUG Rprintf("matched header on line %d\n",nlines); #endif continue; } tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string chr=*sit++; string tagname=*sit++; string str_startpos=*sit++; string str_endpos=*sit++; string str_tstart=*sit++; string str_tend=*sit++; int len=atoi(str_tend.c_str())-atoi(str_tstart.c_str()); sit++; sit++; string str_ndel=*sit++; string str_nins=*sit++; string str_nsub=*sit++; string str_strand=*sit++; int fpos; if(str_strand[0]=='-') { fpos=-1*atoi(str_endpos.c_str()); } else { fpos=atoi(str_startpos.c_str()); } // determine number of mismatches int nm=atoi(str_ndel.c_str())+atoi(str_nins.c_str())+atoi(str_nsub.c_str()); // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); poslen.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); (poslen[cind]).push_back(len); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d\n",chr.c_str(),cind,fpos,nm); if(fcount>30) { break; } #endif } } #ifdef HAVE_LIBBZ2 BZ2_bzReadClose( &bzerror, b); #endif fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<int> >::const_iterator lsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); lsi=poslen.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 3+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SET_STRING_ELT(dnames_R, 2, mkChar("l")); if(read_names) { SET_STRING_ELT(dnames_R, 3, mkChar("s")); } SEXP tv,nv,lv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; PROTECT(lv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int* i_lv=INTEGER(lv); int i=0; vector<int>::const_iterator ini=nsi->begin(); vector<int>::const_iterator lni=lsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i_lv[i]=*lni++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 3+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); SET_VECTOR_ELT(dv, 2, lv); if(read_names) { SET_VECTOR_ELT(dv, 3, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in text version of maq map SEXP read_maqmap(SEXP filename,SEXP read_tag_names_R) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); int read_names=*(INTEGER(read_tag_names_R)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<string> > tagnames; // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep("\t","",boost::keep_empty_tokens); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string tagname=*sit++; string chr=*sit++; string str_pos=*sit++; int fpos=atoi(str_pos.c_str()); string str_strand=*sit++; sit++; sit++; sit++; sit++; sit++; string str_nm=*sit++; sit++; sit++; sit++; string str_len=*sit++; int nm=atoi(str_nm.c_str()); int len=atoi(str_len.c_str()); if(str_strand[0]=='-') { fpos=-1*(fpos+len-1); } // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); if(read_names) { tagnames.push_back(vector<string>()); } #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); if(read_names) { (tagnames[cind]).push_back(tagname); } #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d, nm=%d, len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<string> >::const_iterator ssi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2+read_names)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); if(read_names) { SET_STRING_ELT(dnames_R, 2, mkChar("s")); } SEXP tv,nv,sv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; if(read_names) { PROTECT(sv=allocVector(STRSXP,csi->size())); np++; } int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } if(read_names) { int i=0; ssi=tagnames.begin()+(csi-pos.begin()); for(vector<string>::const_iterator si=ssi->begin();si!=ssi->end();++si) { SET_STRING_ELT(sv,i,mkChar(si->c_str())); i++; } } PROTECT(dv = allocVector(VECSXP, 2+read_names)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); if(read_names) { SET_VECTOR_ELT(dv, 2, sv); } setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // read in tagalign file SEXP read_tagalign(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; while(get_a_line(f,line)) { #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string chr=*sit++; string str_spos=*sit++; string str_epos=*sit++; sit++; string str_qual=*sit++; string str_strand=*sit; int fpos; if(str_strand[0]=='+') { fpos=atoi(str_spos.c_str()); } else { fpos=-1*atoi(str_epos.c_str()); } int nm=atoi(str_qual.c_str()); // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); if(fcount>30) { break; } #endif } } fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SEXP tv,nv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } PROTECT(dv = allocVector(VECSXP, 2)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // arachne madness SEXP read_arachne(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { #ifdef HAVE_LIBBZ2 BZFILE* b; int bzerror; int bz2file=0; if(strstr(fname,".bz2")) { bz2file=1; b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } } #endif Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; #ifdef HAVE_LIBBZ2 while(get_a_line(f,b,bz2file,line)) { #else while(get_a_line(f,line)) { #endif #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string chr=*sit++; string str_spos=*sit++; int nm=0; if(sit!=tok.end()) { string str_mm=*sit; nm=atoi(str_mm.c_str()); } int fpos=atoi(str_spos.c_str());; // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d\n",chr.c_str(),cind,fpos,nm); if(fcount>30) { break; } #endif } } #ifdef HAVE_LIBBZ2 BZ2_bzReadClose( &bzerror, b); #endif fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 2)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SEXP tv,nv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int i=0; vector<int>::const_iterator ini=nsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i++; } PROTECT(dv = allocVector(VECSXP, 2)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } // arachne madness SEXP read_arachne_long(SEXP filename) { #ifdef DEBUG Rprintf("start\n"); #endif const char* fname=CHAR(asChar(filename)); #ifdef DEBUG Rprintf("fname=%s\n",fname); #endif // main data vector // chr - pos vector< vector<int> > pos; vector< vector<int> > posnm; // number of mismatches vector< vector<int> > poslen; // length of the match // chromosome map hash_map<string, int, hash<string>,equal_to<string> > cind_map; vector<string> cnames; typedef boost::tokenizer<boost::char_separator<char> > tokType; boost::char_separator<char> sep(" \t"); FILE *f=fopen(fname,"rb"); if (!f) { cout<<"can't open input file \""<<fname<<"\"\n"; } else { #ifdef HAVE_LIBBZ2 BZFILE* b; int bzerror; int bz2file=0; if(strstr(fname,".bz2")) { bz2file=1; b=BZ2_bzReadOpen (&bzerror, f, 0, 0, NULL, 0); if (bzerror != BZ_OK) { cout<<"bzerror="<<bzerror<<endl; } } #endif Rprintf("opened %s\n",fname); // read in bed line string line; int fcount=0; #ifdef HAVE_LIBBZ2 while(get_a_line(f,b,bz2file,line)) { #else while(get_a_line(f,line)) { #endif #ifdef DEBUG Rprintf("line: %s\n",line.c_str()); #endif tokType tok(line, sep); tokType::iterator sit=tok.begin(); if(sit!=tok.end()) { string query=*sit++; if(query!="QUERY") { continue; } *sit++; *sit++; *sit++; *sit++; string str_strand=*sit++; string chr=*sit++; string str_startpos=*sit++; string str_endpos=*sit++; int fpos; if(str_strand[0]=='1') { fpos=-1*atoi(str_endpos.c_str()); } else { fpos=atoi(str_startpos.c_str()); } #ifdef DEBUG Rprintf("chr=%s, fpos=%d\n",chr.c_str(),fpos); #endif *sit++; string str_nblocks=*sit++; int nblocks=atoi(str_nblocks.c_str()); #ifdef DEBUG Rprintf("nblocks=%d\n",nblocks); #endif // tally up the read length and the number of mismatches for all blocks int len=0; int nm=0; for(int i=0;i<nblocks;i++) { string str_sgs=*sit++; int sgs=atoi(str_sgs.c_str()); string str_slen=*sit++; int slen=atoi(str_slen.c_str()); string str_snm=*sit++; int snm=atoi(str_snm.c_str()); #ifdef DEBUG Rprintf("sgs=%d, slen=%d, snm=%d\n",sgs,slen,snm); #endif len+=slen; nm+=abs(sgs)+snm; } nm+=nblocks-1; // determine the chromosome index hash_map<string, int, hash<string>,equal_to<string> >::const_iterator li=cind_map.find(chr); int cind=-1; if(li==cind_map.end()) { // register new chromosome cind=cnames.size(); cnames.push_back(chr); cind_map[chr]=cind; // allocate new pos vector pos.push_back(vector<int>()); posnm.push_back(vector<int>()); poslen.push_back(vector<int>()); #ifdef DEBUG Rprintf("registered new chromosome %s with cind=%d, pos.size=%d\n",chr.c_str(),cind,pos.size()); #endif } else { cind=li->second; } fcount++; (pos[cind]).push_back(fpos); (posnm[cind]).push_back(nm); (poslen[cind]).push_back(len); #ifdef DEBUG Rprintf("read in position chr=%s cind=%d fpos=%d nm=%d len=%d\n",chr.c_str(),cind,fpos,nm,len); if(fcount>30) { break; } #endif } } #ifdef HAVE_LIBBZ2 BZ2_bzReadClose( &bzerror, b); #endif fclose(f); Rprintf("done. read %d fragments\n",fcount); } // construct output structures SEXP chnames; int np=0; // number of protections PROTECT(chnames = allocVector(STRSXP, cnames.size())); for(vector<string>::const_iterator csi=cnames.begin();csi!=cnames.end();++csi) { SET_STRING_ELT(chnames, csi-cnames.begin(), mkChar(csi->c_str())); } np++; // sort //for(vector<vector<int> >::iterator csi=pos.begin();csi!=pos.end();++csi) { // sort(csi->begin(), csi->end(), lessAbsoluteValue()); //} SEXP ans; PROTECT(ans = allocVector(VECSXP, cnames.size())); np++; vector<vector<int> >::const_iterator nsi; vector<vector<int> >::const_iterator lsi; for(vector<vector<int> >::const_iterator csi=pos.begin();csi!=pos.end();++csi) { nsi=posnm.begin()+(csi-pos.begin()); lsi=poslen.begin()+(csi-pos.begin()); SEXP dv,dnames_R; PROTECT(dnames_R = allocVector(STRSXP, 3)); np++; SET_STRING_ELT(dnames_R, 0, mkChar("t")); SET_STRING_ELT(dnames_R, 1, mkChar("n")); SET_STRING_ELT(dnames_R, 2, mkChar("l")); SEXP tv,nv,lv; PROTECT(tv=allocVector(INTSXP,csi->size())); np++; PROTECT(nv=allocVector(INTSXP,csi->size())); np++; PROTECT(lv=allocVector(INTSXP,csi->size())); np++; int* i_tv=INTEGER(tv); int* i_nv=INTEGER(nv); int* i_lv=INTEGER(lv); int i=0; vector<int>::const_iterator ini=nsi->begin(); vector<int>::const_iterator lni=lsi->begin(); for(vector<int> ::const_iterator pi=csi->begin();pi!=csi->end();++pi) { i_tv[i]=*pi; i_nv[i]=*ini++; i_lv[i]=*lni++; i++; } PROTECT(dv = allocVector(VECSXP, 3)); np++; SET_VECTOR_ELT(dv, 0, tv); SET_VECTOR_ELT(dv, 1, nv); SET_VECTOR_ELT(dv, 2, lv); setAttrib(dv, R_NamesSymbol, dnames_R); SET_VECTOR_ELT(ans, csi-pos.begin(), dv); } setAttrib(ans,R_NamesSymbol,chnames); #ifdef DEBUG Rprintf("unprotecting %d elements\n",np); #endif UNPROTECT(np); return(ans); } }