Mercurial > repos > fz77 > pscan_chip_test
comparison pscan_chip/pscan_chip_g.cpp @ 9:6a5cfb8a872f draft
Uploaded
| author | fz77 |
|---|---|
| date | Mon, 17 Nov 2014 09:40:44 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 8:2aad8f91e6f2 | 9:6a5cfb8a872f |
|---|---|
| 1 // Pscan-ChIP 1.1 | |
| 2 // // Copyright (C) 2013 Federico Zambelli and Giulio Pavesi | |
| 3 // // | |
| 4 // // This program is free software: you can redistribute it and/or modify | |
| 5 // // it under the terms of the GNU General Public License as published by | |
| 6 // // the Free Software Foundation, either version 3 of the License, or | |
| 7 // // (at your option) any later version. | |
| 8 // // | |
| 9 // // This program is distributed in the hope that it will be useful, | |
| 10 // // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 // // GNU General Public License for more details. | |
| 13 // // | |
| 14 // // You should have received a copy of the GNU General Public License | |
| 15 // // along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 16 // // | |
| 17 // // If you find Pscan-ChIP useful in your research please cite our paper: | |
| 18 // // | |
| 19 // // Zambelli F, Pesole G, Pavesi G. PscanChIP: Finding over-represented | |
| 20 // // transcription factor-binding site motifs and their correlations in sequences from | |
| 21 // // ChIP-Seq experiments. | |
| 22 // // Nucleic Acids Res. 2013 Jul;41(Web Server issue):W535-43. | |
| 23 | |
| 24 #include <string> | |
| 25 #include <iostream> | |
| 26 #include <fstream> | |
| 27 #include <vector> | |
| 28 #include <sstream> | |
| 29 #include <math.h> | |
| 30 #include <map> | |
| 31 #include <cstdlib> | |
| 32 #include <gsl/gsl_randist.h> | |
| 33 #include <gsl/gsl_cdf.h> | |
| 34 #include <string.h> | |
| 35 #include <ctime> | |
| 36 | |
| 37 using namespace std; | |
| 38 | |
| 39 const double MIN_ADD_FREQ = 0.01, EPSILON = pow(10,-21), INF = pow(10,21), /*POS_VIS_THRESHOLD = pow(10,-5),*/ DIST_RATIO_THRESHOLD = 3, DIST_PV_THRESHOLD = pow(10,-5); | |
| 40 const unsigned int MAX_REG_SIZE = 3000, ALPHABET_SIZE = 4, TOP_P = 1, DIST_HIT_THRESHOLD = 10; | |
| 41 //const char ALPHABET[4] = {'A','C','G','T'}, RC_ALPHABET[4] = {'T','G','C','A'}; | |
| 42 char VI['T' + 1]; | |
| 43 string regfile, genome, matrixfile, additional_matrixfile, outfile, bgfile, selected, obstf, GALAXY_OUTPUT; | |
| 44 unsigned int RSIZE = 150, MAX_L = 0, W_SIZE = 10, W_STEP = 5; | |
| 45 bool SECOND_BEST_POS = false, SINGLE_STRAND = false, VERBOSE = false, GET_SEQUENCES = false, POSITIONAL = false, SINGLE_MATRIX = false, SPEARMAN = false, TWOBIT = false, QUIET = false; //NAIVE = false; | |
| 46 ofstream fasta_out; | |
| 47 double SINGLE_MATRIX_BG_AVG = 0; | |
| 48 | |
| 49 struct mpoint | |
| 50 { | |
| 51 unsigned int x; | |
| 52 unsigned int y; | |
| 53 }; | |
| 54 | |
| 55 struct bg | |
| 56 { | |
| 57 unsigned int Size; | |
| 58 double Avg, Var; | |
| 59 }; | |
| 60 | |
| 61 class sequence | |
| 62 { | |
| 63 private: | |
| 64 string Chr, Name; | |
| 65 char *Seq; | |
| 66 unsigned int Start, End, E_Rank; | |
| 67 char Strand, AntiStrand; | |
| 68 bool good; | |
| 69 public: | |
| 70 sequence(string*, ifstream*, unsigned int); | |
| 71 sequence(string*, unsigned int, string, unsigned int); | |
| 72 char strand(); | |
| 73 char antistrand(); | |
| 74 string chr(); | |
| 75 unsigned int start(); | |
| 76 unsigned int end(); | |
| 77 unsigned int e_rank(); | |
| 78 string name(); | |
| 79 char* seq(); | |
| 80 bool is_good(); | |
| 81 double score(); | |
| 82 int mpos(); | |
| 83 }; | |
| 84 | |
| 85 class matrix | |
| 86 { | |
| 87 friend void scanner(vector<matrix>*, vector<sequence>*); | |
| 88 private: | |
| 89 void normalizer(); | |
| 90 void to_log(); | |
| 91 void avg(); | |
| 92 void max_min(); | |
| 93 void rev_c(); | |
| 94 double get_M_value(int, char); | |
| 95 double get_rc_M_value(int, char); | |
| 96 void calc_var(double); | |
| 97 void calc_W_param(double); | |
| 98 void welch_test(double); | |
| 99 void welch_test_bg(double); | |
| 100 // void welch_test_bg_G(double); | |
| 101 void welch_test_pos(double); | |
| 102 void z_test_pos(double); | |
| 103 unsigned int L, bg_Size, BIN_NUM; | |
| 104 double **M; | |
| 105 double **rc_M; | |
| 106 double max_score, min_score, t, dof, t_bg, dof_bg; | |
| 107 double spearman; | |
| 108 string Name, Id; | |
| 109 bool O_U, O_U_bg; | |
| 110 vector<bool> W_O_U; | |
| 111 bool good, BG; | |
| 112 bool norm_sec_step; | |
| 113 vector<double> v_score, vF_score; | |
| 114 vector<unsigned int> v_pos; | |
| 115 vector<bool> v_strand; | |
| 116 vector<vector<double> > W_R_v_score; | |
| 117 vector<double> W_R_Avg, W_R_Var, W_R_pvalue, W_R_T_pvalue; | |
| 118 double R_Avg, F_Avg, R_Var, F_Var, /*G_Avg, G_Var,*/ pvalue, W_Avg, W_Var; | |
| 119 double bg_Avg, bg_Var, pvalue_bg; | |
| 120 public: | |
| 121 matrix(vector<string>*, map<string,bg>*); | |
| 122 void display(); | |
| 123 bool is_good(); | |
| 124 string name(); | |
| 125 string id(); | |
| 126 string o_or_u(); | |
| 127 string o_or_u_bg(); | |
| 128 int W_o_or_u(unsigned int); | |
| 129 void scan(vector<sequence>*); | |
| 130 double scan_seq_ds(char*, bool, unsigned int, vector<double>*); | |
| 131 double scan_seq_ss(char*, bool, unsigned int, char, vector<double>*); | |
| 132 double scan_seq_ds_second_best(char*, unsigned int, unsigned int *, bool *, int); | |
| 133 double scan_seq_ss_second_best(char*, unsigned int, unsigned int *, bool *, unsigned int, bool); | |
| 134 double get_score_at(unsigned int); | |
| 135 unsigned int get_pos_at(unsigned int); | |
| 136 unsigned int length(); | |
| 137 bool get_strand_at(unsigned int); | |
| 138 bool have_bg(); | |
| 139 unsigned int bin_num(); | |
| 140 double get_R_Avg(); | |
| 141 double get_F_Avg(); | |
| 142 double get_bg_Avg(); | |
| 143 double get_R_Var(); | |
| 144 double get_F_Var(); | |
| 145 double max(); | |
| 146 double min(); | |
| 147 double get_pvalue(); | |
| 148 double get_pvalue_bg(); | |
| 149 double get_W_pvalue(unsigned int); | |
| 150 double get_W_R_Avg_at_pos(unsigned int); | |
| 151 double get_t(); | |
| 152 double get_dof(); | |
| 153 double get_spearman(); | |
| 154 double get_W_Avg(); | |
| 155 double get_W_R_T_pvalue(unsigned int); | |
| 156 }; | |
| 157 | |
| 158 void command_line_parser(int, char**); | |
| 159 void acquire_regions(vector<sequence>*); | |
| 160 void acquire_matrices(vector<matrix>*, map<string,bg>*, string*); | |
| 161 void acquire_background(map<string,bg>*); | |
| 162 void scanner(vector<matrix>*, vector<sequence>*); | |
| 163 void main_output(vector<matrix>*, vector<sequence>*, unsigned int); | |
| 164 void main_output_single_matrix(vector<matrix>*, vector<sequence>*); | |
| 165 void distance_analysis(vector<matrix>*, vector<sequence>*, unsigned int, multimap <double, vector<matrix>::iterator>*); | |
| 166 void display_help(); | |
| 167 | |
| 168 int main(int argc, char **argv) | |
| 169 { | |
| 170 srand(time(NULL)); | |
| 171 | |
| 172 vector<sequence> Seq; | |
| 173 vector<matrix> Mat; | |
| 174 map<string,bg> Bg; | |
| 175 VI['A'] = 0; | |
| 176 VI['C'] = 1; | |
| 177 VI['G'] = 2; | |
| 178 VI['T'] = 3; | |
| 179 VI['N'] = 4; | |
| 180 | |
| 181 obstf.clear(); | |
| 182 | |
| 183 command_line_parser(argc, argv); | |
| 184 | |
| 185 if(GET_SEQUENCES) | |
| 186 { | |
| 187 string outfile = regfile; | |
| 188 outfile += ".fasta"; | |
| 189 fasta_out.open(outfile.c_str(), ios::out); | |
| 190 } | |
| 191 | |
| 192 if(!regfile.empty() && !matrixfile.empty()) | |
| 193 { | |
| 194 if(!bgfile.empty()) | |
| 195 { | |
| 196 if(!QUIET) cerr << "\nAcquiring background file... "; | |
| 197 acquire_background(&Bg); | |
| 198 if(!QUIET) cerr << "done" << endl; | |
| 199 } | |
| 200 | |
| 201 if(!QUIET) cerr << "\nReading binding profiles..."; | |
| 202 acquire_matrices(&Mat, &Bg, &matrixfile); | |
| 203 | |
| 204 if(!additional_matrixfile.empty()) | |
| 205 acquire_matrices(&Mat, &Bg, &additional_matrixfile); | |
| 206 | |
| 207 if(Mat.empty()) | |
| 208 { | |
| 209 if(!QUIET) cerr << "\nZero valid matrices acquired. Quitting..." << endl; | |
| 210 exit(1); | |
| 211 } | |
| 212 | |
| 213 if(!QUIET) cerr << "done: " << Mat.size() << " binding profiles acquired. Max matrix length = " << MAX_L << endl; | |
| 214 | |
| 215 if(Mat.size() == 1) | |
| 216 { | |
| 217 if(!QUIET) cerr << "\nSwitching to single matrix mode..." << endl; | |
| 218 SINGLE_MATRIX = true; | |
| 219 } | |
| 220 | |
| 221 if(!QUIET) cerr << "\nAcquiring regions..."; | |
| 222 acquire_regions(&Seq); | |
| 223 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl; | |
| 224 | |
| 225 if(Seq.size() <= 2) | |
| 226 POSITIONAL = false; | |
| 227 | |
| 228 if(Seq.empty()) | |
| 229 { | |
| 230 if(!QUIET) cerr << "\nZero regions to work on! Exiting..." << endl; | |
| 231 exit(1); | |
| 232 } | |
| 233 | |
| 234 if(GET_SEQUENCES) | |
| 235 { | |
| 236 fasta_out.close(); | |
| 237 exit(0); | |
| 238 } | |
| 239 | |
| 240 // for(unsigned int i = 0; i < Mat.size(); i++) | |
| 241 // Mat[i].display(); | |
| 242 | |
| 243 if(!QUIET) cerr << "\nScanning..."; | |
| 244 scanner(&Mat, &Seq); | |
| 245 | |
| 246 if(!QUIET) cerr << "\nWriting output..." << endl; | |
| 247 | |
| 248 if(!SINGLE_MATRIX) | |
| 249 main_output(&Mat, &Seq, Seq.size()); | |
| 250 else | |
| 251 main_output_single_matrix(&Mat, &Seq); | |
| 252 | |
| 253 if(!QUIET) cerr << "\nOutput written in file: " << outfile << endl; | |
| 254 | |
| 255 } | |
| 256 else if(GET_SEQUENCES && !regfile.empty()) | |
| 257 { | |
| 258 if(!QUIET) cerr << "\nAcquiring regions..."; | |
| 259 acquire_regions(&Seq); | |
| 260 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl; | |
| 261 | |
| 262 fasta_out.close(); | |
| 263 exit(0); | |
| 264 } | |
| 265 else | |
| 266 display_help(); | |
| 267 | |
| 268 | |
| 269 /* for(unsigned int i = 0; i < Seq.size(); i++) | |
| 270 { | |
| 271 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << Seq[i].start() << '\t' << Seq[i].end() << '\t' | |
| 272 << Mat[0].get_score_at(i) << '\t' << Mat[0].get_pos_at(i) << '\t' << Mat[0].get_strand_at(i) << '\t' | |
| 273 << "Max = " << Mat[0].max() << '\t' << "Min = " << Mat[0].min() | |
| 274 << endl << Seq[i].seq() << endl; | |
| 275 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "UPSTREAM" << endl << Seq[i].uf_seq() << endl; | |
| 276 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "DOWNSTREAM" << endl << Seq[i].df_seq() << endl; | |
| 277 } | |
| 278 */ | |
| 279 | |
| 280 return 0; | |
| 281 } | |
| 282 | |
| 283 void display_help() | |
| 284 { | |
| 285 if(!QUIET) cerr << endl << "Syntax: pscan_chip -r region_file.bed -M matrixfile.wil -g genome [-pos] [-wsize window_size] [-wstep window_step] [-bg background_file] [-s region_size]" << endl << endl; | |
| 286 exit(0); | |
| 287 } | |
| 288 | |
| 289 void main_output_single_matrix(vector<matrix> *Mat, vector<sequence> *Seq) | |
| 290 { | |
| 291 outfile = regfile; | |
| 292 outfile += "."; | |
| 293 outfile += Mat->at(0).id(); | |
| 294 | |
| 295 string outfile2 = outfile; | |
| 296 | |
| 297 outfile += ".ris"; | |
| 298 outfile2 += ".all.ris"; | |
| 299 | |
| 300 if(!GALAXY_OUTPUT.empty()) | |
| 301 outfile = GALAXY_OUTPUT; | |
| 302 | |
| 303 multimap <double, string> outmap; | |
| 304 | |
| 305 ofstream out(outfile.c_str(), ios::out); | |
| 306 ofstream out2(outfile2.c_str(), ios::out); | |
| 307 | |
| 308 if(!out) | |
| 309 { | |
| 310 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl; | |
| 311 exit(1); | |
| 312 } | |
| 313 | |
| 314 out << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE"; | |
| 315 out2 << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE"; | |
| 316 | |
| 317 // if(SPEARMAN) | |
| 318 // out << "\tE_RANK"; | |
| 319 | |
| 320 if(SECOND_BEST_POS) | |
| 321 { | |
| 322 out << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl; | |
| 323 out2 << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl; | |
| 324 } | |
| 325 else | |
| 326 { | |
| 327 out << endl; | |
| 328 out2 << endl; | |
| 329 } | |
| 330 | |
| 331 matrix *M = &Mat->at(0); | |
| 332 | |
| 333 /* if(SINGLE_MATRIX_BG_AVG == 0) //QUESTO SERVE PER QUANDO NON C'E' IL VALORE DI BACKGROUND! | |
| 334 { | |
| 335 double loc_avg = 0, loc_stdev = 0; | |
| 336 | |
| 337 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 338 loc_avg += M->get_score_at(i); | |
| 339 | |
| 340 loc_avg /= Seq->size(); | |
| 341 | |
| 342 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 343 loc_stdev += pow(M->get_score_at(i) - loc_avg, 2); | |
| 344 | |
| 345 loc_stdev /= (Seq->size() - 1); | |
| 346 | |
| 347 loc_stdev = sqrt(loc_stdev); | |
| 348 | |
| 349 SINGLE_MATRIX_BG_AVG = loc_avg - (2*loc_stdev); | |
| 350 | |
| 351 if(!QUIET) cerr << "\nSMBG = " << SINGLE_MATRIX_BG_AVG; | |
| 352 } | |
| 353 */ | |
| 354 char Oligo[M->length() + 1]; | |
| 355 | |
| 356 Oligo[M->length()] = '\0'; | |
| 357 | |
| 358 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 359 { | |
| 360 // if(!QUIET) cerr << endl << M->get_score_at(i) << '\t' << M->get_bg_Avg() << '\t'; | |
| 361 | |
| 362 // if(M->get_score_at(i) < SINGLE_MATRIX_BG_AVG) //ONLY REPORT OCCURRENCES WITH SCORE >= BG_AVG | |
| 363 // continue; | |
| 364 | |
| 365 // if(!QUIET) cerr << "passed"; | |
| 366 | |
| 367 ostringstream outbuf; | |
| 368 | |
| 369 sequence *S = &Seq->at(i); | |
| 370 | |
| 371 strncpy(Oligo, S->seq() + RSIZE/2 + (M->get_pos_at(i) - M->length()/2), M->length()); | |
| 372 | |
| 373 outbuf << S->chr() << '\t' << S->start() << '\t' << S->end() << '\t' << S->strand() << '\t' | |
| 374 << S->start() + M->get_pos_at(i) - M->length()/2 << '\t' << S->start() + M->get_pos_at(i) + M->length()/2 << '\t' | |
| 375 << (int)((M->get_pos_at(i) - M->length()/2) - RSIZE/2) << '\t' << (int)((M->get_pos_at(i) + M->length()/2 - 1) - RSIZE/2)<< '\t'; | |
| 376 | |
| 377 if(S->strand() == '+') | |
| 378 { | |
| 379 if(!M->get_strand_at(i)) | |
| 380 outbuf << S->strand(); | |
| 381 else | |
| 382 outbuf << S->antistrand(); | |
| 383 } | |
| 384 else | |
| 385 { | |
| 386 if(!M->get_strand_at(i)) | |
| 387 outbuf << S->antistrand(); | |
| 388 else | |
| 389 outbuf << S->strand(); | |
| 390 } | |
| 391 | |
| 392 outbuf << '\t' << M->get_score_at(i) << '\t' << Oligo; | |
| 393 | |
| 394 // if(SPEARMAN) | |
| 395 // outbuf << '\t' << S->e_rank(); | |
| 396 | |
| 397 if(SECOND_BEST_POS) | |
| 398 { | |
| 399 if(SINGLE_STRAND) | |
| 400 { | |
| 401 bool s; | |
| 402 unsigned int P; | |
| 403 double score; | |
| 404 | |
| 405 score = M->scan_seq_ss_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s,M->get_pos_at(i), S->strand()); | |
| 406 | |
| 407 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length()); | |
| 408 | |
| 409 outbuf << '\t' << P << '\t' << score << '\t' << Oligo; | |
| 410 } | |
| 411 else | |
| 412 { | |
| 413 bool s; | |
| 414 unsigned int P; | |
| 415 double score; | |
| 416 | |
| 417 score = M->scan_seq_ds_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s, (int)M->get_pos_at(i)); | |
| 418 | |
| 419 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length()); | |
| 420 | |
| 421 outbuf << '\t' << (int)P - (int)RSIZE/2 << '\t' << score << '\t' << Oligo << '\t'; | |
| 422 | |
| 423 if(s) | |
| 424 outbuf << '-'; | |
| 425 else | |
| 426 outbuf << '+'; | |
| 427 } | |
| 428 | |
| 429 outbuf << endl; | |
| 430 } | |
| 431 else | |
| 432 outbuf << endl; | |
| 433 | |
| 434 outmap.insert(make_pair(M->get_score_at(i),outbuf.str())); | |
| 435 } | |
| 436 | |
| 437 multimap<double,string>::reverse_iterator mi = outmap.rbegin(); | |
| 438 | |
| 439 // unsigned int count = 0; | |
| 440 | |
| 441 while(mi != outmap.rend()) | |
| 442 { | |
| 443 // if(count >= outmap.size()/2 && outmap.size() > 1) //SOLO LA MIGLIORE META' DELLE OCCORRENZE!!! | |
| 444 // break; | |
| 445 if(mi->first >= SINGLE_MATRIX_BG_AVG || mi == outmap.rbegin()) | |
| 446 out << mi->second; | |
| 447 | |
| 448 out2 << mi->second; | |
| 449 | |
| 450 mi++; | |
| 451 // count++; | |
| 452 } | |
| 453 | |
| 454 out.close(); | |
| 455 out2.close(); | |
| 456 return; | |
| 457 } | |
| 458 | |
| 459 void main_output(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE) | |
| 460 { | |
| 461 multimap <double, vector<matrix>::iterator> S_map; | |
| 462 outfile = regfile; | |
| 463 | |
| 464 outfile += ".res"; | |
| 465 | |
| 466 if(!GALAXY_OUTPUT.empty()) | |
| 467 outfile = GALAXY_OUTPUT; | |
| 468 | |
| 469 ofstream out(outfile.c_str(),ios::out); | |
| 470 | |
| 471 if(!out) | |
| 472 { | |
| 473 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl; | |
| 474 exit(1); | |
| 475 } | |
| 476 | |
| 477 out << "NAME\tID\tREG_N\tL_PVALUE O/U\tG_PVALUE O/U\tR_AVG\tR_STDEV\tF_AVG\tBG_AVG"; | |
| 478 | |
| 479 if(SPEARMAN) | |
| 480 out << "\tSP_COR"; | |
| 481 | |
| 482 if(POSITIONAL) | |
| 483 out << "\tPREF_POS\tPREF_POS_PV\n"; | |
| 484 else | |
| 485 out << endl; | |
| 486 | |
| 487 vector<matrix>::iterator vi = Mat->begin(); | |
| 488 | |
| 489 while(vi != Mat->end()) | |
| 490 { | |
| 491 if(S_map.find(vi->get_pvalue()) != S_map.end()) //GFI WORKAROUND | |
| 492 { | |
| 493 if(S_map.find(vi->get_pvalue())->second->get_pvalue_bg() < vi->get_pvalue_bg()) | |
| 494 { | |
| 495 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi)); | |
| 496 vi++; | |
| 497 continue; | |
| 498 } | |
| 499 else if((S_map.find(vi->get_pvalue())->second->get_pvalue_bg() > vi->get_pvalue_bg())) | |
| 500 { | |
| 501 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi)); | |
| 502 vi++; | |
| 503 continue; | |
| 504 } | |
| 505 else | |
| 506 { | |
| 507 if(S_map.find(vi->get_pvalue())->second->get_spearman() > vi->get_spearman()) | |
| 508 { | |
| 509 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi)); | |
| 510 vi++; | |
| 511 continue; | |
| 512 } | |
| 513 else if(S_map.find(vi->get_pvalue())->second->get_spearman() <= vi->get_spearman()) | |
| 514 { | |
| 515 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi)); | |
| 516 vi++; | |
| 517 continue; | |
| 518 } | |
| 519 } | |
| 520 | |
| 521 } | |
| 522 else | |
| 523 S_map.insert(make_pair(vi->get_pvalue(), vi)); | |
| 524 | |
| 525 vi++; | |
| 526 } | |
| 527 | |
| 528 multimap <double, vector<matrix>::iterator>::iterator mi = S_map.begin(); | |
| 529 | |
| 530 while(mi != S_map.end()) | |
| 531 { | |
| 532 double pv1, pv2; | |
| 533 | |
| 534 pv1 = mi->second->get_pvalue() * Mat->size(); | |
| 535 | |
| 536 if(pv1 > 1) | |
| 537 pv1 = 1; | |
| 538 | |
| 539 pv2 = mi->second->get_pvalue_bg() * Mat->size(); | |
| 540 | |
| 541 if(pv2 > 1) | |
| 542 pv2 = 1; | |
| 543 | |
| 544 out << mi->second->name() << '\t' << mi->second->id() << '\t' << SIZE << '\t' << pv1 << ' ' << mi->second->o_or_u() << '\t'; | |
| 545 | |
| 546 if(mi->second->have_bg()) | |
| 547 out << pv2 << ' ' << mi->second->o_or_u_bg() << '\t'; | |
| 548 else | |
| 549 out << "N/A N/A\t"; | |
| 550 | |
| 551 out << mi->second->get_R_Avg() << '\t' << pow(mi->second->get_R_Var(), 0.5) << '\t' << mi->second->get_F_Avg() << '\t'; | |
| 552 | |
| 553 if(mi->second->have_bg()) | |
| 554 out << mi->second->get_bg_Avg() << '\t'; | |
| 555 else | |
| 556 out << "N/A" << '\t'; | |
| 557 | |
| 558 if(SPEARMAN) | |
| 559 out << mi->second->get_spearman() << '\t'; | |
| 560 | |
| 561 if(POSITIONAL) | |
| 562 { | |
| 563 double min_pv = 1, min_pv_c; | |
| 564 unsigned int min_pos; | |
| 565 | |
| 566 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) | |
| 567 { | |
| 568 if(mi->second->get_W_pvalue(i) <= min_pv) //&& mi->second->W_o_or_u(i) == 1) | |
| 569 { | |
| 570 min_pv = mi->second->get_W_pvalue(i); | |
| 571 min_pos = i; | |
| 572 } | |
| 573 } | |
| 574 | |
| 575 min_pv_c = min_pv * (mi->second->bin_num() * Seq->size()); | |
| 576 | |
| 577 if(min_pv_c > 1) | |
| 578 min_pv_c = 1; | |
| 579 | |
| 580 out << '[' << (int)(((W_STEP * min_pos) + mi->second->length()/2) - RSIZE/2) << ',' << (int)(((W_STEP * min_pos) + W_SIZE) + mi->second->length()/2 - RSIZE/2) << ']' << '\t' << min_pv_c;// << '\t' << mi->second->get_W_R_T_pvalue(min_pos) << '\t' << min_pv_c; | |
| 581 } | |
| 582 | |
| 583 out << endl; | |
| 584 | |
| 585 mi++; | |
| 586 } | |
| 587 | |
| 588 if(POSITIONAL) | |
| 589 { | |
| 590 out << endl << endl << "POSITIONAL ANALYSIS: W_SIZE = " << W_SIZE << " W_STEP = " << W_STEP << endl << endl; | |
| 591 | |
| 592 out << "NAME\tID\t"; | |
| 593 | |
| 594 mi = S_map.begin(); | |
| 595 | |
| 596 for(int i = -(int)(RSIZE/2); i <= (int)((RSIZE/2) - W_SIZE); i += W_STEP) | |
| 597 out << i /*<< '|' << i + (int)(W_SIZE - 1)*/ << '\t'; | |
| 598 | |
| 599 out << endl; | |
| 600 | |
| 601 while(mi != S_map.end()) | |
| 602 { | |
| 603 /* bool gflag = false; | |
| 604 | |
| 605 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) | |
| 606 if(mi->second->get_W_pvalue(i) <= POS_VIS_THRESHOLD) | |
| 607 { | |
| 608 gflag = true; | |
| 609 break; | |
| 610 } | |
| 611 | |
| 612 if(!gflag) | |
| 613 { | |
| 614 mi++; | |
| 615 continue; | |
| 616 } | |
| 617 */ | |
| 618 out << mi->second->name() << '\t' << mi->second->id() << '\t'; | |
| 619 | |
| 620 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++) | |
| 621 { | |
| 622 // out << (-log(mi->second->get_W_pvalue(i))/log(10)) * mi->second->W_o_or_u(i) << '\t';//<< " [" << mi->second->W_o_or_u(i) << "]\t"; | |
| 623 // | |
| 624 double bpv = mi->second->get_W_pvalue(i) * (mi->second->bin_num() * Seq->size()); | |
| 625 | |
| 626 if(bpv > 1) | |
| 627 bpv = 1; | |
| 628 | |
| 629 out << bpv << '\t'; | |
| 630 } | |
| 631 | |
| 632 out << endl; | |
| 633 | |
| 634 mi++; | |
| 635 } | |
| 636 | |
| 637 if(!obstf.empty()) | |
| 638 distance_analysis(Mat, Seq, SIZE, &S_map); | |
| 639 } | |
| 640 | |
| 641 out.close(); | |
| 642 | |
| 643 return; | |
| 644 } | |
| 645 | |
| 646 void distance_analysis(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE, multimap <double, vector<matrix>::iterator> *S_map) | |
| 647 { | |
| 648 string dist_outfile = regfile, dist2_outfile = regfile, dist3_outfile = regfile; | |
| 649 | |
| 650 dist_outfile += ".dist"; | |
| 651 dist2_outfile += ".exp_dist"; | |
| 652 dist3_outfile += ".pv_dist"; | |
| 653 | |
| 654 ofstream out(dist_outfile.c_str()), out2, out3; | |
| 655 | |
| 656 if(!obstf.empty()) | |
| 657 { | |
| 658 out2.open(dist2_outfile.c_str()); | |
| 659 out3.open(dist3_outfile.c_str()); | |
| 660 | |
| 661 out2 << "TF/EXP_HITS\t"; | |
| 662 out3 << "TF/PVALUE\t"; | |
| 663 } | |
| 664 | |
| 665 out << "TF1|TF2/HITS\t"; | |
| 666 | |
| 667 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++) | |
| 668 out << p << '\t'; | |
| 669 | |
| 670 out << endl; | |
| 671 | |
| 672 if(!obstf.empty()) | |
| 673 { | |
| 674 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++) | |
| 675 { | |
| 676 out2 << p << '\t'; | |
| 677 out3 << p << '\t'; | |
| 678 } | |
| 679 | |
| 680 out2 << endl; | |
| 681 out3 << endl; | |
| 682 } | |
| 683 | |
| 684 multimap <double, vector<matrix>::iterator>::iterator MRI = S_map->begin(); | |
| 685 unsigned int LCOUNT = 0; | |
| 686 | |
| 687 while(MRI != S_map->end()) | |
| 688 { | |
| 689 if((MRI->second->get_pvalue_bg() > DIST_PV_THRESHOLD || MRI->second->o_or_u_bg() == "UNDER") && | |
| 690 (MRI->second->get_pvalue() > DIST_PV_THRESHOLD || MRI->second->o_or_u() == "UNDER") ) | |
| 691 { | |
| 692 MRI++; | |
| 693 continue; | |
| 694 } | |
| 695 | |
| 696 // if(!QUIET) cerr << endl << MRI->second->name() << endl; | |
| 697 | |
| 698 LCOUNT++; | |
| 699 | |
| 700 map<int, double> R_dist_map; | |
| 701 | |
| 702 if(!obstf.empty() && obstf == MRI->second->id()) | |
| 703 { | |
| 704 map<int, unsigned int> M_hit_distr; | |
| 705 | |
| 706 for(unsigned int a = 0; a < SIZE; a++) | |
| 707 { | |
| 708 map<int, unsigned int>::iterator mi; | |
| 709 | |
| 710 if(!MRI->second->get_strand_at(a)) | |
| 711 mi = M_hit_distr.find(MRI->second->get_pos_at(a)); | |
| 712 else | |
| 713 mi = M_hit_distr.find(RSIZE - MRI->second->get_pos_at(a)); | |
| 714 | |
| 715 if(mi == M_hit_distr.end()) | |
| 716 { | |
| 717 if(!MRI->second->get_strand_at(a)) | |
| 718 M_hit_distr[MRI->second->get_pos_at(a)] = 1; | |
| 719 else | |
| 720 M_hit_distr[RSIZE - MRI->second->get_pos_at(a)] = 1; | |
| 721 } | |
| 722 else | |
| 723 mi->second++; | |
| 724 } | |
| 725 | |
| 726 map<int, unsigned int>::iterator mi = M_hit_distr.begin(); | |
| 727 | |
| 728 while(mi != M_hit_distr.end()) | |
| 729 { | |
| 730 for(int b = 0; b < RSIZE; b++) | |
| 731 { | |
| 732 int dpos = mi->first - b; | |
| 733 | |
| 734 map<int, double>::iterator ri = R_dist_map.find(dpos); | |
| 735 | |
| 736 if(ri == R_dist_map.end()) | |
| 737 R_dist_map[dpos] = mi->second; | |
| 738 else | |
| 739 ri->second += mi->second; | |
| 740 | |
| 741 } | |
| 742 | |
| 743 mi++; | |
| 744 } | |
| 745 | |
| 746 map<int, double>::iterator ri = R_dist_map.begin(); | |
| 747 double risum = SIZE * RSIZE; | |
| 748 | |
| 749 out2 << MRI->second->name() << '\t'; | |
| 750 | |
| 751 while(ri != R_dist_map.end()) | |
| 752 { | |
| 753 ri->second /= risum; | |
| 754 out2 << ri->second*SIZE << '\t'; | |
| 755 ri++; | |
| 756 } | |
| 757 | |
| 758 out2 << endl; | |
| 759 } | |
| 760 | |
| 761 // multimap <double, vector<matrix>::iterator>::iterator MRI2 = MRI; | |
| 762 multimap <double, vector<matrix>::iterator>::iterator MRI2 = S_map->begin(); | |
| 763 | |
| 764 while(MRI2 != S_map->end()) | |
| 765 { | |
| 766 vector<unsigned int> hit_v((RSIZE*2) - 1, 0); | |
| 767 | |
| 768 if(MRI2 == MRI) | |
| 769 { | |
| 770 vector<unsigned int> SBPOS; | |
| 771 // vector<bool> SBSTRAND; | |
| 772 | |
| 773 if(SINGLE_STRAND) | |
| 774 { | |
| 775 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 776 { | |
| 777 bool s; | |
| 778 unsigned int P; | |
| 779 double score; | |
| 780 | |
| 781 score = MRI->second->scan_seq_ss_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, MRI->second->get_pos_at(i), Seq->at(i).strand()); | |
| 782 SBPOS.push_back(P); | |
| 783 // SBSTRAND.push_back(s); | |
| 784 } | |
| 785 | |
| 786 } | |
| 787 else | |
| 788 { | |
| 789 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 790 { | |
| 791 bool s; | |
| 792 unsigned int P; | |
| 793 double score; | |
| 794 | |
| 795 score = MRI->second->scan_seq_ds_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, (int)MRI->second->get_pos_at(i)); | |
| 796 | |
| 797 SBPOS.push_back(P + MRI->second->length()/2); | |
| 798 // SBSTRAND.push_back(s); | |
| 799 | |
| 800 } | |
| 801 } | |
| 802 | |
| 803 for(unsigned int k = 0; k < SIZE; k++) | |
| 804 { | |
| 805 int dist; | |
| 806 | |
| 807 if(!MRI->second->get_strand_at(k)) | |
| 808 dist = (int)SBPOS.at(k) - (int)MRI->second->get_pos_at(k); | |
| 809 else | |
| 810 dist = ((int)(RSIZE-1) - (int)SBPOS.at(k)) - ((int)(RSIZE - 1) - (int)MRI->second->get_pos_at(k)); | |
| 811 | |
| 812 hit_v.at((RSIZE-1) + dist)++; | |
| 813 } | |
| 814 | |
| 815 } | |
| 816 else | |
| 817 { | |
| 818 for(unsigned int k = 0; k < SIZE; k++) | |
| 819 { | |
| 820 int dist; | |
| 821 | |
| 822 if(!MRI->second->get_strand_at(k)) | |
| 823 dist = (int)MRI2->second->get_pos_at(k) - (int)MRI->second->get_pos_at(k); | |
| 824 else | |
| 825 dist = ((int)(RSIZE - 1 ) - (int)MRI2->second->get_pos_at(k)) - ((int)(RSIZE - 1 ) - (int)MRI->second->get_pos_at(k)); | |
| 826 | |
| 827 // if(!QUIET) cerr << endl << (int)((RSIZE-1) + dist) << endl; | |
| 828 | |
| 829 hit_v.at((RSIZE-1) + dist)++; | |
| 830 } | |
| 831 } | |
| 832 | |
| 833 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) | |
| 834 out << MRI->second->name() << "|" << MRI2->second->name() << '\t'; | |
| 835 | |
| 836 if(!obstf.empty() && obstf == MRI->second->id()) | |
| 837 out3 << MRI->second->name() << "|" << MRI2->second->name() << '\t'; | |
| 838 | |
| 839 for(unsigned int p = 0; p < (RSIZE*2) - 1; p++) | |
| 840 { | |
| 841 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) | |
| 842 out << hit_v[p] << '\t'; | |
| 843 | |
| 844 if(!obstf.empty() && obstf == MRI->second->id()) | |
| 845 { | |
| 846 map<int, double>::iterator ri = R_dist_map.find(p - (RSIZE - 1)); | |
| 847 | |
| 848 if(hit_v[p] >= DIST_HIT_THRESHOLD && hit_v[p]/(ri->second * SIZE) >= DIST_RATIO_THRESHOLD) | |
| 849 { | |
| 850 double pv = gsl_cdf_binomial_Q(hit_v[p], ri->second, SIZE); | |
| 851 | |
| 852 if(pv <= DIST_PV_THRESHOLD) | |
| 853 out3 << pv << '\t'; | |
| 854 else | |
| 855 out3 << "NA" << '\t'; | |
| 856 } | |
| 857 else | |
| 858 out3 << "NA" << '\t'; | |
| 859 } | |
| 860 } | |
| 861 | |
| 862 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty()) | |
| 863 out << endl; | |
| 864 | |
| 865 if(!obstf.empty() && obstf == MRI->second->id()) | |
| 866 out3 << endl; | |
| 867 | |
| 868 MRI2++; | |
| 869 } | |
| 870 | |
| 871 MRI++; | |
| 872 } | |
| 873 | |
| 874 out.close(); | |
| 875 | |
| 876 if(!obstf.empty()) | |
| 877 { | |
| 878 out2.close(); | |
| 879 out3.close(); | |
| 880 } | |
| 881 | |
| 882 if(!LCOUNT) | |
| 883 { | |
| 884 remove(dist_outfile.c_str()); | |
| 885 | |
| 886 if(!obstf.empty()) | |
| 887 { | |
| 888 remove(dist2_outfile.c_str()); | |
| 889 remove(dist3_outfile.c_str()); | |
| 890 } | |
| 891 } | |
| 892 | |
| 893 if(!QUIET) cerr << "\nDistance analysis written in file: " << dist_outfile << endl; | |
| 894 | |
| 895 return; | |
| 896 } | |
| 897 | |
| 898 void scanner(vector<matrix> *Mat, vector<sequence> *Seq) | |
| 899 { | |
| 900 for(unsigned int i = 0; i < Mat->size(); i++) | |
| 901 { | |
| 902 if(VERBOSE) | |
| 903 { | |
| 904 if(!QUIET) cerr << "\nScanning sequences for " | |
| 905 << Mat->at(i).name() << " (" << Mat->at(i).id() << ")"<< endl; | |
| 906 } | |
| 907 | |
| 908 Mat->at(i).scan(Seq); | |
| 909 Mat->at(i).calc_var((double)Seq->size()); | |
| 910 | |
| 911 if(POSITIONAL) | |
| 912 Mat->at(i).calc_W_param((double)Seq->size()); | |
| 913 | |
| 914 if(VERBOSE) | |
| 915 { | |
| 916 if(!QUIET) cerr << "\nRegions avg = " << Mat->at(i).R_Avg << '\t' << "Flanks avg = " << Mat->at(i).F_Avg << '\t' | |
| 917 << "Regions devstd = " << pow(Mat->at(i).R_Var,0.5) << '\t' << "Flanks devstd = " << pow(Mat->at(i).F_Var,0.5); | |
| 918 if(!QUIET) cerr << "\nDoing Welch's t test... "; | |
| 919 } | |
| 920 | |
| 921 Mat->at(i).welch_test((double)Seq->size()); | |
| 922 | |
| 923 if(Mat->at(i).have_bg()) | |
| 924 Mat->at(i).welch_test_bg((double)Seq->size()); | |
| 925 | |
| 926 if(POSITIONAL) | |
| 927 { | |
| 928 // Mat->at(i).welch_test_pos((double)Seq->size()); | |
| 929 Mat->at(i).z_test_pos((double)Seq->size()); | |
| 930 } | |
| 931 | |
| 932 if(VERBOSE) | |
| 933 { | |
| 934 if(!QUIET) cerr << "done. Pvalue = " << Mat->at(i).get_pvalue() << ' ' << Mat->at(i).o_or_u() << endl; | |
| 935 if(!QUIET) cerr << endl; | |
| 936 } | |
| 937 } | |
| 938 | |
| 939 return; | |
| 940 } | |
| 941 | |
| 942 void acquire_matrices(vector<matrix> *Mat, map<string,bg> *Bg, string *matfile) | |
| 943 { | |
| 944 ifstream in(matfile->c_str()); | |
| 945 string line; | |
| 946 | |
| 947 if(!in) | |
| 948 { | |
| 949 if(!QUIET) cerr << "Can't find " << *matfile << endl; | |
| 950 exit(1); | |
| 951 } | |
| 952 | |
| 953 while(getline(in,line)) | |
| 954 { | |
| 955 if(line.empty()) | |
| 956 continue; | |
| 957 | |
| 958 vector<string> mbuf; | |
| 959 mbuf.push_back(line); | |
| 960 | |
| 961 for(unsigned int i = 0; i < ALPHABET_SIZE; i++) | |
| 962 { | |
| 963 getline(in,line); | |
| 964 | |
| 965 if(!line.empty()) | |
| 966 mbuf.push_back(line); | |
| 967 } | |
| 968 | |
| 969 if(mbuf.size() == ALPHABET_SIZE + 1) | |
| 970 Mat->push_back(matrix(&mbuf,Bg)); | |
| 971 } | |
| 972 | |
| 973 in.close(); | |
| 974 | |
| 975 vector<matrix>::iterator vi = Mat->begin(); | |
| 976 | |
| 977 while(vi != Mat->end()) | |
| 978 { | |
| 979 if(!vi->is_good()) | |
| 980 { | |
| 981 vi = Mat->erase(vi); | |
| 982 continue; | |
| 983 } | |
| 984 | |
| 985 vi++; | |
| 986 } | |
| 987 | |
| 988 return; | |
| 989 } | |
| 990 | |
| 991 void acquire_background(map<string, bg> *Bg) | |
| 992 { | |
| 993 ifstream in(bgfile.c_str()); | |
| 994 | |
| 995 if(!in) | |
| 996 { | |
| 997 if(!QUIET) cerr << "\nWarning: background file " << bgfile << " not found..." << endl; | |
| 998 return; | |
| 999 } | |
| 1000 | |
| 1001 string line; | |
| 1002 | |
| 1003 getline(in,line); | |
| 1004 | |
| 1005 while(getline(in, line)) | |
| 1006 { | |
| 1007 istringstream str(line); | |
| 1008 string id, trash; | |
| 1009 bg nbg; | |
| 1010 | |
| 1011 str >> trash >> id >> nbg.Size >> trash >> trash >> trash >> trash >> nbg.Avg >> nbg.Var; | |
| 1012 | |
| 1013 nbg.Var = pow(nbg.Var,2); | |
| 1014 | |
| 1015 Bg->insert(make_pair(id,nbg)); | |
| 1016 } | |
| 1017 | |
| 1018 in.close(); | |
| 1019 return; | |
| 1020 } | |
| 1021 | |
| 1022 void acquire_regions(vector<sequence> *Seq) | |
| 1023 { | |
| 1024 ifstream in(regfile.c_str()); | |
| 1025 // map<string, vector<string> > Regs; | |
| 1026 map<string, map<unsigned int, string> > Regs; //PREVENT REG DUPLICATES | |
| 1027 map<string, map<unsigned int, unsigned int > > M_RANK; | |
| 1028 | |
| 1029 if(!in) | |
| 1030 { | |
| 1031 if(!QUIET) cerr << "Can't find " << regfile << endl; | |
| 1032 exit(1); | |
| 1033 } | |
| 1034 | |
| 1035 string line; | |
| 1036 | |
| 1037 unsigned int r_count = 0; | |
| 1038 | |
| 1039 while(getline(in,line)) | |
| 1040 { | |
| 1041 if(line.empty()) | |
| 1042 continue; | |
| 1043 | |
| 1044 if(line[0] == '#') | |
| 1045 continue; | |
| 1046 | |
| 1047 for(unsigned int i = 0; i < 3; i++) | |
| 1048 line[i] = tolower(line[i]); | |
| 1049 | |
| 1050 for(unsigned int i = 3; i < line.size(); i++) | |
| 1051 line[i] = toupper(line[i]); | |
| 1052 | |
| 1053 istringstream str(line); | |
| 1054 string chr; | |
| 1055 unsigned int start, end, mp; | |
| 1056 | |
| 1057 str >> chr >> start >> end; | |
| 1058 | |
| 1059 // Regs[chr].push_back(line); | |
| 1060 mp = start + ((end - start)/2); | |
| 1061 | |
| 1062 if(mp > (RSIZE + RSIZE/2) + 1) | |
| 1063 { | |
| 1064 Regs[chr][mp] = line; | |
| 1065 | |
| 1066 // if(!QUIET) cerr << endl << line; | |
| 1067 | |
| 1068 if(M_RANK.find(chr) != M_RANK.end()) //PREVENT REGIONS PRESENT MORE THAN ONCE TO DO TOO MUCH MESS WITH SPEARMAN CORRELATION | |
| 1069 if(M_RANK[chr].find(mp) != M_RANK[chr].end()) | |
| 1070 continue; | |
| 1071 | |
| 1072 M_RANK[chr][mp] = r_count; | |
| 1073 r_count++; | |
| 1074 } | |
| 1075 else | |
| 1076 if(!QUIET) cerr << "\nSkipping region: too close to chromosome start..." << endl << line << endl; | |
| 1077 } | |
| 1078 | |
| 1079 in.close(); | |
| 1080 | |
| 1081 map<string, map<unsigned int, string> >::iterator mi = Regs.begin(); | |
| 1082 | |
| 1083 if(!TWOBIT) | |
| 1084 { | |
| 1085 while(mi != Regs.end()) | |
| 1086 { | |
| 1087 string file = genome; | |
| 1088 file += "/"; | |
| 1089 file += mi->first; | |
| 1090 file += ".raw"; | |
| 1091 | |
| 1092 in.open(file.c_str()); | |
| 1093 | |
| 1094 if(!in) | |
| 1095 { | |
| 1096 if(!QUIET) cerr << "\nCan't find " << file << ". Skipping chromosome:\n" << mi->first << endl; | |
| 1097 | |
| 1098 if(SPEARMAN) | |
| 1099 { | |
| 1100 if(!QUIET) cerr << "\nSpearman correlation disabled..." << endl; | |
| 1101 SPEARMAN = false; | |
| 1102 } | |
| 1103 | |
| 1104 mi++; | |
| 1105 in.close(); | |
| 1106 continue; | |
| 1107 } | |
| 1108 | |
| 1109 map<unsigned int, string>::iterator si = mi->second.begin(); | |
| 1110 | |
| 1111 while(si != mi->second.end()) | |
| 1112 { | |
| 1113 Seq->push_back(sequence(&si->second, &in, M_RANK[mi->first][si->first])); | |
| 1114 si++; | |
| 1115 } | |
| 1116 | |
| 1117 in.close(); | |
| 1118 mi++; | |
| 1119 } | |
| 1120 } | |
| 1121 else | |
| 1122 { | |
| 1123 ostringstream str; | |
| 1124 ofstream outfile; | |
| 1125 str << ".pchip." << rand() << ".tmp"; | |
| 1126 string tmp1 = str.str(); | |
| 1127 outfile.open(tmp1.c_str(), ios::out); | |
| 1128 vector<string> k1; | |
| 1129 vector<unsigned int> k2; | |
| 1130 | |
| 1131 while(mi != Regs.end()) | |
| 1132 { | |
| 1133 map<unsigned int, string>::iterator si = mi->second.begin(); | |
| 1134 | |
| 1135 while(si != mi->second.end()) | |
| 1136 { | |
| 1137 unsigned int Start, End; | |
| 1138 | |
| 1139 Start = si->first - RSIZE; | |
| 1140 End = si->first + RSIZE + 1 + MAX_L; | |
| 1141 | |
| 1142 outfile << mi->first << '\t' << Start << '\t' << End << '\t' << "." << endl; | |
| 1143 | |
| 1144 k1.push_back(mi->first); | |
| 1145 k2.push_back(si->first); | |
| 1146 | |
| 1147 si++; | |
| 1148 } | |
| 1149 | |
| 1150 mi++; | |
| 1151 } | |
| 1152 | |
| 1153 string cline = "twoBitToFa -bedPos -bed="; | |
| 1154 cline += str.str(); | |
| 1155 str << ".fa"; | |
| 1156 cline += " "; | |
| 1157 cline += genome; | |
| 1158 /* cline += "/"; | |
| 1159 cline += genome; | |
| 1160 cline += ".2bit "; */ | |
| 1161 cline += " "; | |
| 1162 cline += str.str(); | |
| 1163 | |
| 1164 // if(!QUIET) cerr << endl << "TwoBit command line:\n" << cline << endl; | |
| 1165 | |
| 1166 if(system(cline.c_str()) != 0) | |
| 1167 { | |
| 1168 if(!QUIET) cerr << endl << "twoBitToFa returned non 0 error code... aborting pscan_chip run!\n" << endl; | |
| 1169 exit(1); | |
| 1170 } | |
| 1171 else | |
| 1172 { | |
| 1173 ifstream in(str.str().c_str()); | |
| 1174 string seq; | |
| 1175 seq.clear(); | |
| 1176 | |
| 1177 if(!in) | |
| 1178 { | |
| 1179 if(!QUIET) cerr << endl << "Can't access twoBitToFa output file! Aborting pscan_chip run!\n" << endl; | |
| 1180 exit(1); | |
| 1181 } | |
| 1182 | |
| 1183 unsigned int scount = 0; | |
| 1184 | |
| 1185 while(getline(in,line)) | |
| 1186 { | |
| 1187 if(line.empty()) | |
| 1188 continue; | |
| 1189 | |
| 1190 if(line[0] == '>') | |
| 1191 { | |
| 1192 if(seq.empty()) | |
| 1193 continue; | |
| 1194 else | |
| 1195 { | |
| 1196 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); | |
| 1197 seq.clear(); | |
| 1198 scount++; | |
| 1199 continue; | |
| 1200 } | |
| 1201 } | |
| 1202 else | |
| 1203 seq += line; | |
| 1204 } | |
| 1205 | |
| 1206 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); //LAST SEQUENCE! | |
| 1207 | |
| 1208 in.close(); | |
| 1209 remove(str.str().c_str()); | |
| 1210 remove(tmp1.c_str()); | |
| 1211 } | |
| 1212 | |
| 1213 // exit(1); | |
| 1214 } | |
| 1215 | |
| 1216 vector<sequence>::iterator vi = Seq->begin(); | |
| 1217 | |
| 1218 while(vi != Seq->end()) | |
| 1219 { | |
| 1220 if(!vi->is_good()) | |
| 1221 { | |
| 1222 vi = Seq->erase(vi); | |
| 1223 continue; | |
| 1224 } | |
| 1225 | |
| 1226 vi++; | |
| 1227 } | |
| 1228 | |
| 1229 return; | |
| 1230 } | |
| 1231 | |
| 1232 void command_line_parser(int argc, char **argv) | |
| 1233 { | |
| 1234 if(argc == 1) | |
| 1235 return; | |
| 1236 | |
| 1237 for(int i = 1; i < argc; i++) | |
| 1238 { | |
| 1239 string buf = argv[i]; | |
| 1240 | |
| 1241 if(buf == "-r") | |
| 1242 { | |
| 1243 if(i < argc - 1) | |
| 1244 regfile = argv[++i]; | |
| 1245 continue; | |
| 1246 } | |
| 1247 | |
| 1248 if(buf == "-g") | |
| 1249 { | |
| 1250 if(i < argc - 1) | |
| 1251 genome = argv[++i]; | |
| 1252 continue; | |
| 1253 } | |
| 1254 | |
| 1255 if(buf == "-s") | |
| 1256 { | |
| 1257 if(i < argc - 1) | |
| 1258 RSIZE = atoi(argv[++i]); | |
| 1259 continue; | |
| 1260 } | |
| 1261 if(buf == "-bgavg") | |
| 1262 { | |
| 1263 if(i < argc - 1) | |
| 1264 SINGLE_MATRIX_BG_AVG = atof(argv[++i]); | |
| 1265 continue; | |
| 1266 } | |
| 1267 if(buf == "-M") | |
| 1268 { | |
| 1269 if(i < argc - 1) | |
| 1270 matrixfile = argv[++i]; | |
| 1271 continue; | |
| 1272 } | |
| 1273 if(buf == "-MADD") | |
| 1274 { | |
| 1275 if(i < argc - 1) | |
| 1276 additional_matrixfile = argv[++i]; | |
| 1277 continue; | |
| 1278 } | |
| 1279 if(buf == "-m") | |
| 1280 { | |
| 1281 if(i < argc - 1) | |
| 1282 selected = argv[++i]; | |
| 1283 continue; | |
| 1284 } | |
| 1285 | |
| 1286 if(buf == "-bg") | |
| 1287 { | |
| 1288 if(i < argc - 1) | |
| 1289 bgfile = argv[++i]; | |
| 1290 continue; | |
| 1291 } | |
| 1292 if(buf == "-go") | |
| 1293 { | |
| 1294 if(i < argc - 1) | |
| 1295 GALAXY_OUTPUT = argv[++i]; | |
| 1296 continue; | |
| 1297 } | |
| 1298 if(buf == "-obs") | |
| 1299 { | |
| 1300 if(i < argc - 1) | |
| 1301 obstf = argv[++i]; | |
| 1302 | |
| 1303 continue; | |
| 1304 } | |
| 1305 if(buf == "-ss") | |
| 1306 { | |
| 1307 SINGLE_STRAND = true; | |
| 1308 continue; | |
| 1309 } | |
| 1310 if(buf == "-2bit") | |
| 1311 { | |
| 1312 TWOBIT = true; | |
| 1313 continue; | |
| 1314 } | |
| 1315 if(buf == "-quiet") | |
| 1316 { | |
| 1317 QUIET = true; | |
| 1318 continue; | |
| 1319 } | |
| 1320 if(buf == "-sbp") | |
| 1321 { | |
| 1322 SECOND_BEST_POS = true; | |
| 1323 continue; | |
| 1324 } | |
| 1325 if(buf == "-fasta") | |
| 1326 { | |
| 1327 GET_SEQUENCES = true; | |
| 1328 continue; | |
| 1329 } | |
| 1330 if(buf == "-pos") | |
| 1331 { | |
| 1332 POSITIONAL = true; | |
| 1333 continue; | |
| 1334 } | |
| 1335 if(buf == "-spear") | |
| 1336 { | |
| 1337 SPEARMAN = true; | |
| 1338 continue; | |
| 1339 } | |
| 1340 if(buf == "-wsize") | |
| 1341 { | |
| 1342 if(i < argc - 1) | |
| 1343 W_SIZE = atoi(argv[++i]); | |
| 1344 | |
| 1345 W_STEP = W_SIZE / 2; | |
| 1346 continue; | |
| 1347 } | |
| 1348 /* if(buf == "-wstep") | |
| 1349 { | |
| 1350 if(i < argc - 1) | |
| 1351 W_STEP = atoi(argv[++i]); | |
| 1352 continue; | |
| 1353 } | |
| 1354 | |
| 1355 if(buf == "-fast") | |
| 1356 { | |
| 1357 FAST_SCAN = true; | |
| 1358 continue; | |
| 1359 }*/ | |
| 1360 /* if(buf == "-naive") | |
| 1361 { | |
| 1362 NAIVE = true; | |
| 1363 continue; | |
| 1364 } | |
| 1365 */ | |
| 1366 if(buf == "-v") | |
| 1367 { | |
| 1368 VERBOSE = true; | |
| 1369 continue; | |
| 1370 } | |
| 1371 else | |
| 1372 { | |
| 1373 if(!QUIET) cerr << "\nUnknown option: " << buf << endl; | |
| 1374 exit(1); | |
| 1375 } | |
| 1376 | |
| 1377 /* if(buf == "-Q") | |
| 1378 { | |
| 1379 if(i < argc - 1) | |
| 1380 idfile = argv[++i]; | |
| 1381 continue; | |
| 1382 } | |
| 1383 if(buf == "-ql") | |
| 1384 { | |
| 1385 if(i < argc - 1) | |
| 1386 q_idlist = argv[++i]; | |
| 1387 continue; | |
| 1388 } | |
| 1389 if(buf == "-imgm") | |
| 1390 { | |
| 1391 if(i < argc - 1) | |
| 1392 bg_for_img = argv[++i]; | |
| 1393 continue; | |
| 1394 } | |
| 1395 if(buf == "-magic") | |
| 1396 { | |
| 1397 if(i < argc - 1) | |
| 1398 magic = argv[++i]; | |
| 1399 continue; | |
| 1400 }*/ | |
| 1401 } | |
| 1402 | |
| 1403 if(RSIZE > MAX_REG_SIZE) | |
| 1404 { | |
| 1405 if(!QUIET) cerr << "\nRegion size exceeds " << MAX_REG_SIZE << "..." << endl; | |
| 1406 exit(1); | |
| 1407 } | |
| 1408 | |
| 1409 if(RSIZE % W_SIZE != 0 || W_SIZE % 2 != 0 || RSIZE / W_SIZE < 5) | |
| 1410 { | |
| 1411 if(!QUIET) cerr << "\nWindow size must be even and both an exact divisor of region size and at least five times smaller of it: positional analysis disabled." << endl; | |
| 1412 POSITIONAL = false; | |
| 1413 } | |
| 1414 | |
| 1415 return; | |
| 1416 } | |
| 1417 | |
| 1418 //SEQUENCE CLASS BEGIN | |
| 1419 | |
| 1420 sequence::sequence(string *seq, unsigned int R, string chr, unsigned int mp) //CONSTRUCTOR FOR 2BIT INPUT | |
| 1421 { | |
| 1422 Chr = chr; | |
| 1423 Strand = '+'; | |
| 1424 AntiStrand = '-'; | |
| 1425 E_Rank = R; | |
| 1426 good = true; | |
| 1427 | |
| 1428 Start = mp - RSIZE/2; | |
| 1429 End = (mp + RSIZE/2) - 1; | |
| 1430 | |
| 1431 for(unsigned int i = 0; i < seq->size(); i++) | |
| 1432 seq->at(i) = toupper(seq->at(i)); | |
| 1433 | |
| 1434 Seq = new char[(RSIZE*2) + 1 + MAX_L]; | |
| 1435 | |
| 1436 // if(!QUIET) cerr << "\nseq.size = " << seq->size() << endl; | |
| 1437 // if(!QUIET) cerr << "\narray.size = " << (RSIZE*2) + 1 + MAX_L << endl; | |
| 1438 | |
| 1439 strcpy(Seq, seq->c_str()); | |
| 1440 | |
| 1441 if(seq->size() != RSIZE*2 + MAX_L + 1) | |
| 1442 { | |
| 1443 good = false; | |
| 1444 return; | |
| 1445 } | |
| 1446 | |
| 1447 if(GET_SEQUENCES) | |
| 1448 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << endl << seq->substr(RSIZE/2, RSIZE) << endl; | |
| 1449 | |
| 1450 return; | |
| 1451 } | |
| 1452 | |
| 1453 sequence::sequence(string *line, ifstream *in, unsigned int R) | |
| 1454 { | |
| 1455 istringstream str(*line); | |
| 1456 string trash; | |
| 1457 int buf_s, buf_e, mp; | |
| 1458 good = true; | |
| 1459 | |
| 1460 E_Rank = R; | |
| 1461 | |
| 1462 str >> Chr >> buf_s >> buf_e >> Name >> trash >> Strand; | |
| 1463 | |
| 1464 if(Strand == 'R') | |
| 1465 Strand = '-'; | |
| 1466 | |
| 1467 if(Strand != '+' && Strand != '-') | |
| 1468 Strand = '+'; | |
| 1469 | |
| 1470 if(Strand == '+') | |
| 1471 AntiStrand = '-'; | |
| 1472 else | |
| 1473 AntiStrand = '+'; | |
| 1474 | |
| 1475 mp = buf_s + ((buf_e - buf_s)/2); | |
| 1476 | |
| 1477 Start = mp - RSIZE/2; | |
| 1478 End = (mp + RSIZE/2) - 1; | |
| 1479 | |
| 1480 // Seq = new char[RSIZE + 1]; | |
| 1481 // UF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2]; | |
| 1482 // DF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2]; | |
| 1483 | |
| 1484 Seq = new char[(RSIZE*2) + 1 + MAX_L]; | |
| 1485 | |
| 1486 in->seekg((Start) - RSIZE/2); | |
| 1487 | |
| 1488 in->get(Seq, (RSIZE*2 + 1) + MAX_L); | |
| 1489 | |
| 1490 if(in->gcount() != RSIZE*2 + MAX_L) | |
| 1491 { | |
| 1492 good = false; | |
| 1493 return; | |
| 1494 } | |
| 1495 | |
| 1496 if(GET_SEQUENCES) | |
| 1497 { | |
| 1498 char *out_seq; | |
| 1499 out_seq = new char[RSIZE + 1]; | |
| 1500 | |
| 1501 in->seekg(Start); | |
| 1502 in->get(out_seq, (RSIZE + 1)); | |
| 1503 | |
| 1504 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << '\t' << "---" << '\t' << buf_s << '\t' << buf_e << '\t' << endl << out_seq << endl; | |
| 1505 | |
| 1506 delete[] out_seq; | |
| 1507 } | |
| 1508 | |
| 1509 /*in->seekg(((Start - 1) - RSIZE/2) - (MAX_L/2)); | |
| 1510 | |
| 1511 in->get(UF_Seq, RSIZE/2 + 1 + MAX_L /2); | |
| 1512 | |
| 1513 if(in->gcount() != RSIZE/2 + (MAX_L/2)) | |
| 1514 { | |
| 1515 good = false; | |
| 1516 return; | |
| 1517 } | |
| 1518 | |
| 1519 in->get(Seq, RSIZE + 1); | |
| 1520 | |
| 1521 if(in->gcount() != RSIZE) | |
| 1522 { | |
| 1523 good = false; | |
| 1524 return; | |
| 1525 } | |
| 1526 | |
| 1527 in->get(DF_Seq, RSIZE/2 + 1 + (MAX_L/2)); | |
| 1528 | |
| 1529 if(in->gcount() != RSIZE/2 + (MAX_L/2)) | |
| 1530 good = false; | |
| 1531 */ | |
| 1532 | |
| 1533 return; | |
| 1534 } | |
| 1535 | |
| 1536 unsigned int sequence::e_rank() | |
| 1537 { | |
| 1538 return E_Rank; | |
| 1539 } | |
| 1540 | |
| 1541 string sequence::chr() | |
| 1542 { | |
| 1543 return Chr; | |
| 1544 } | |
| 1545 | |
| 1546 /*string* sequence::seq() | |
| 1547 { | |
| 1548 return &Seq; | |
| 1549 }*/ | |
| 1550 | |
| 1551 char* sequence::seq() | |
| 1552 { | |
| 1553 return Seq; | |
| 1554 } | |
| 1555 | |
| 1556 char sequence::strand() | |
| 1557 { | |
| 1558 return Strand; | |
| 1559 } | |
| 1560 | |
| 1561 char sequence::antistrand() | |
| 1562 { | |
| 1563 return AntiStrand; | |
| 1564 } | |
| 1565 | |
| 1566 /*char* sequence::uf_seq() | |
| 1567 { | |
| 1568 return UF_Seq; | |
| 1569 } | |
| 1570 | |
| 1571 char* sequence::df_seq() | |
| 1572 { | |
| 1573 return DF_Seq; | |
| 1574 } | |
| 1575 */ | |
| 1576 string sequence::name() | |
| 1577 { | |
| 1578 return Name; | |
| 1579 } | |
| 1580 | |
| 1581 unsigned int sequence::start() | |
| 1582 { | |
| 1583 return Start; | |
| 1584 } | |
| 1585 | |
| 1586 unsigned int sequence::end() | |
| 1587 { | |
| 1588 return End; | |
| 1589 } | |
| 1590 | |
| 1591 bool sequence::is_good() | |
| 1592 { | |
| 1593 return good; | |
| 1594 } | |
| 1595 | |
| 1596 // SEQUENCE CLASS END | |
| 1597 | |
| 1598 // MATRIX CLASS BEGIN | |
| 1599 | |
| 1600 matrix::matrix(vector<string>* mbuf, map<string, bg> *Bg) | |
| 1601 { | |
| 1602 string buf; | |
| 1603 istringstream bstr(mbuf->at(1)), nstr(mbuf->at(0)); | |
| 1604 L = 0; | |
| 1605 R_Avg = 0; | |
| 1606 F_Avg = 0; | |
| 1607 R_Var = 0; | |
| 1608 F_Var = 0; | |
| 1609 bg_Avg = 0; | |
| 1610 bg_Var = 0; | |
| 1611 | |
| 1612 BIN_NUM = RSIZE / W_STEP; | |
| 1613 | |
| 1614 W_R_Avg.resize(BIN_NUM, 0); | |
| 1615 W_R_Var.resize(BIN_NUM, 0); | |
| 1616 W_R_pvalue.resize(BIN_NUM, 0); | |
| 1617 W_R_T_pvalue.resize(BIN_NUM,0); | |
| 1618 spearman = 42; | |
| 1619 | |
| 1620 O_U = false; | |
| 1621 O_U_bg = false; | |
| 1622 W_O_U.resize(BIN_NUM, 0); | |
| 1623 | |
| 1624 BG = false; | |
| 1625 good = true; | |
| 1626 norm_sec_step = false; | |
| 1627 | |
| 1628 while(bstr >> buf) | |
| 1629 L++; | |
| 1630 | |
| 1631 if(L > RSIZE/2) | |
| 1632 { | |
| 1633 good = false; | |
| 1634 return; | |
| 1635 } | |
| 1636 | |
| 1637 if(L > MAX_L) | |
| 1638 MAX_L = L; | |
| 1639 | |
| 1640 if(!mbuf->at(0).empty()) | |
| 1641 { | |
| 1642 if(mbuf->at(0).at(0) != '>') | |
| 1643 { | |
| 1644 good = false; | |
| 1645 return; | |
| 1646 } | |
| 1647 } | |
| 1648 else | |
| 1649 { | |
| 1650 good = false; | |
| 1651 return; | |
| 1652 } | |
| 1653 | |
| 1654 // Name = mbuf->at(0).substr(1); | |
| 1655 nstr >> Id >> Name; | |
| 1656 | |
| 1657 Id = Id.substr(1); | |
| 1658 | |
| 1659 if(!selected.empty()) | |
| 1660 { | |
| 1661 // cerr << endl << selected << '\t' << Id << '\t' << Name << endl; | |
| 1662 | |
| 1663 if(selected != Id && selected != Name) | |
| 1664 { | |
| 1665 good = false; | |
| 1666 return; | |
| 1667 } | |
| 1668 } | |
| 1669 | |
| 1670 if(!Bg->empty()) | |
| 1671 { | |
| 1672 map<string,bg>::iterator mi = Bg->find(Id); | |
| 1673 | |
| 1674 if(mi == Bg->end()) | |
| 1675 mi = Bg->find(Name); | |
| 1676 | |
| 1677 if(mi != Bg->end()) | |
| 1678 { | |
| 1679 bg_Avg = mi->second.Avg; | |
| 1680 bg_Var = mi->second.Var; | |
| 1681 bg_Size = mi->second.Size; | |
| 1682 BG = true; | |
| 1683 } | |
| 1684 else | |
| 1685 if(!QUIET) cerr << "\nWarning: no background for matrix: " << Id << endl; | |
| 1686 } | |
| 1687 | |
| 1688 M = new double*[L]; | |
| 1689 rc_M = new double*[L]; | |
| 1690 | |
| 1691 for(unsigned int i = 1; i < ALPHABET_SIZE + 1; i++) | |
| 1692 { | |
| 1693 istringstream str(mbuf->at(i)); | |
| 1694 | |
| 1695 unsigned int count = 0; | |
| 1696 | |
| 1697 while(str >> buf) | |
| 1698 { | |
| 1699 if(i == 1) | |
| 1700 { | |
| 1701 M[count] = new double[ALPHABET_SIZE + 1]; | |
| 1702 rc_M[count] = new double[ALPHABET_SIZE + 1]; | |
| 1703 } | |
| 1704 | |
| 1705 M[count][i-1] = atof(buf.c_str()); | |
| 1706 count++; | |
| 1707 | |
| 1708 if(count > L) | |
| 1709 { | |
| 1710 good = false; | |
| 1711 return; | |
| 1712 } | |
| 1713 } | |
| 1714 | |
| 1715 if(count != L) | |
| 1716 { | |
| 1717 good = false; | |
| 1718 return; | |
| 1719 } | |
| 1720 } | |
| 1721 | |
| 1722 normalizer(); | |
| 1723 to_log(); | |
| 1724 rev_c(); | |
| 1725 avg(); | |
| 1726 max_min(); | |
| 1727 // set_top_pos(); | |
| 1728 | |
| 1729 if(min_score == max_score) | |
| 1730 good = false; | |
| 1731 | |
| 1732 return; | |
| 1733 } | |
| 1734 | |
| 1735 void matrix::display() | |
| 1736 { | |
| 1737 if(!QUIET) cerr << "\n>" << Name << '\t' << good << endl; | |
| 1738 | |
| 1739 for(unsigned int i = 0; i < ALPHABET_SIZE; i++) | |
| 1740 { | |
| 1741 for(unsigned int j = 0; j < L; j++) | |
| 1742 if(!QUIET) cerr << M[j][i] << '\t'; | |
| 1743 | |
| 1744 if(!QUIET) cerr << endl; | |
| 1745 } | |
| 1746 | |
| 1747 if(!QUIET) cerr << endl; | |
| 1748 | |
| 1749 // for(unsigned int i = 0; i < TOP_P; i++) | |
| 1750 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << top_pos[i] + 1 << "-" << top_char[i] << '\t'; | |
| 1751 | |
| 1752 if(!QUIET) cerr << "\n\n>" << Name << '\t' << good << '\t' << "RC" << endl; | |
| 1753 | |
| 1754 for(unsigned int i = 0; i < ALPHABET_SIZE; i++) | |
| 1755 { | |
| 1756 for(unsigned int j = 0; j < L; j++) | |
| 1757 if(!QUIET) cerr << rc_M[j][i] << '\t'; | |
| 1758 | |
| 1759 if(!QUIET) cerr << endl; | |
| 1760 } | |
| 1761 | |
| 1762 if(!QUIET) cerr << endl; | |
| 1763 | |
| 1764 // for(unsigned int i = 0; i < TOP_P; i++) | |
| 1765 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << rc_top_pos[i] + 1 << "-" << rc_top_char[i] << '\t'; | |
| 1766 | |
| 1767 if(!QUIET) cerr << endl; | |
| 1768 | |
| 1769 return; | |
| 1770 } | |
| 1771 | |
| 1772 bool matrix::is_good() | |
| 1773 { | |
| 1774 return good; | |
| 1775 } | |
| 1776 | |
| 1777 | |
| 1778 void matrix::normalizer() | |
| 1779 { | |
| 1780 double *sums = new double[L]; | |
| 1781 | |
| 1782 for(int x = 0; x < L; x++) | |
| 1783 { | |
| 1784 sums[x] = 0; | |
| 1785 | |
| 1786 for(int y = 0; y < ALPHABET_SIZE; y++) | |
| 1787 sums[x] += M[x][y]; | |
| 1788 } | |
| 1789 | |
| 1790 for(int x = 0; x < L; x++) | |
| 1791 for(int y = 0; y < ALPHABET_SIZE; y++) | |
| 1792 M[x][y] /= sums[x]; | |
| 1793 | |
| 1794 delete[] sums; | |
| 1795 | |
| 1796 if(!norm_sec_step) | |
| 1797 { | |
| 1798 for(int x = 0; x < L; x++) | |
| 1799 for(int y = 0; y < ALPHABET_SIZE; y++) | |
| 1800 M[x][y] += MIN_ADD_FREQ; | |
| 1801 | |
| 1802 norm_sec_step = true; | |
| 1803 | |
| 1804 normalizer(); | |
| 1805 } | |
| 1806 | |
| 1807 return; | |
| 1808 | |
| 1809 } | |
| 1810 | |
| 1811 void matrix::to_log() | |
| 1812 { | |
| 1813 for(int x = 0; x < L; x++) | |
| 1814 for(int y = 0; y < ALPHABET_SIZE; y++) | |
| 1815 M[x][y] = log(M[x][y]); | |
| 1816 | |
| 1817 return; | |
| 1818 } | |
| 1819 | |
| 1820 void matrix::avg() | |
| 1821 { | |
| 1822 // mat_avg = new double[L]; | |
| 1823 | |
| 1824 for(int x = 0; x < L; x++) | |
| 1825 { | |
| 1826 double buf = 0; | |
| 1827 | |
| 1828 for(int y = 0; y < ALPHABET_SIZE; y++) | |
| 1829 buf += M[x][y]; | |
| 1830 | |
| 1831 // mat_avg[x] = buf/ALPHABET_SIZE; | |
| 1832 M[x][ALPHABET_SIZE] = buf/ALPHABET_SIZE; | |
| 1833 rc_M[(L-1) -x][ALPHABET_SIZE] = buf/ALPHABET_SIZE; | |
| 1834 } | |
| 1835 | |
| 1836 return; | |
| 1837 } | |
| 1838 | |
| 1839 void matrix::max_min() | |
| 1840 { | |
| 1841 double min, max; | |
| 1842 | |
| 1843 max_score = 0; | |
| 1844 min_score = 0; | |
| 1845 | |
| 1846 for(int x = 0; x < L; x++) | |
| 1847 { | |
| 1848 min = M[x][0]; | |
| 1849 max = M[x][0]; | |
| 1850 | |
| 1851 for(int y = 1; y < ALPHABET_SIZE; y++) | |
| 1852 { | |
| 1853 if(M[x][y] < min) | |
| 1854 min = M[x][y]; | |
| 1855 | |
| 1856 if(M[x][y] > max) | |
| 1857 max = M[x][y]; | |
| 1858 } | |
| 1859 | |
| 1860 min_score += min; | |
| 1861 max_score += max; | |
| 1862 } | |
| 1863 | |
| 1864 return; | |
| 1865 } | |
| 1866 | |
| 1867 void matrix::rev_c() | |
| 1868 { | |
| 1869 for(unsigned int x = 0; x < L; x++) | |
| 1870 for(unsigned int y = 0; y < ALPHABET_SIZE; y++) | |
| 1871 rc_M[(L - 1) - x][(ALPHABET_SIZE - 1) - y] = M[x][y]; | |
| 1872 | |
| 1873 return; | |
| 1874 } | |
| 1875 | |
| 1876 void matrix::scan(vector<sequence> *Seq) | |
| 1877 { | |
| 1878 v_score.reserve(Seq->size()); | |
| 1879 vF_score.reserve(Seq->size()); | |
| 1880 v_pos.reserve(Seq->size()); | |
| 1881 v_strand.reserve(Seq->size()); | |
| 1882 | |
| 1883 if(POSITIONAL) | |
| 1884 W_R_v_score.reserve(Seq->size()); | |
| 1885 | |
| 1886 unsigned int progress; | |
| 1887 | |
| 1888 if(!SINGLE_STRAND) | |
| 1889 { | |
| 1890 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 1891 { | |
| 1892 if(VERBOSE) | |
| 1893 { | |
| 1894 progress = (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100); | |
| 1895 | |
| 1896 if(!(progress%5)) | |
| 1897 if(!QUIET) cerr << progress << '%'; | |
| 1898 } | |
| 1899 | |
| 1900 double max_flank_u, max_flank_d; | |
| 1901 vector<double> W_buf(BIN_NUM, 0); | |
| 1902 | |
| 1903 { | |
| 1904 double r_score; | |
| 1905 | |
| 1906 max_flank_u = scan_seq_ds(Seq->at(i).seq() , true, RSIZE/2, &W_buf); | |
| 1907 r_score = scan_seq_ds(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, &W_buf); | |
| 1908 max_flank_d = scan_seq_ds(Seq->at(i).seq() + ((RSIZE/2)*3) , true, RSIZE/2, &W_buf); | |
| 1909 | |
| 1910 R_Avg += r_score; | |
| 1911 | |
| 1912 /* if(r_score < max_flank_u || r_score < max_flank_d) | |
| 1913 { | |
| 1914 if(max_flank_u > max_flank_d) | |
| 1915 G_Avg += max_flank_u; | |
| 1916 else | |
| 1917 G_Avg += max_flank_d; | |
| 1918 } | |
| 1919 else | |
| 1920 G_Avg += r_score;*/ | |
| 1921 | |
| 1922 if(POSITIONAL) | |
| 1923 { | |
| 1924 // for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 1925 // W_R_Avg[i] += W_buf[i]; | |
| 1926 | |
| 1927 W_R_v_score.push_back(W_buf); | |
| 1928 } | |
| 1929 | |
| 1930 } | |
| 1931 | |
| 1932 if(max_flank_u > max_flank_d) | |
| 1933 { | |
| 1934 F_Avg += max_flank_u; | |
| 1935 vF_score.push_back(max_flank_u); | |
| 1936 } | |
| 1937 else | |
| 1938 { | |
| 1939 F_Avg += max_flank_d; | |
| 1940 vF_score.push_back(max_flank_d); | |
| 1941 } | |
| 1942 | |
| 1943 | |
| 1944 if(VERBOSE) | |
| 1945 if(!(progress%5)) | |
| 1946 if(!QUIET) cerr << (char)13; | |
| 1947 } | |
| 1948 } | |
| 1949 else | |
| 1950 { | |
| 1951 for(unsigned int i = 0; i < Seq->size(); i++) | |
| 1952 { | |
| 1953 | |
| 1954 if(VERBOSE) | |
| 1955 if(!QUIET) cerr << (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100) << '%'; | |
| 1956 | |
| 1957 double max_flank_u, max_flank_d; | |
| 1958 vector<double> W_buf(BIN_NUM, 0); | |
| 1959 | |
| 1960 { | |
| 1961 | |
| 1962 double r_score; | |
| 1963 | |
| 1964 max_flank_u = scan_seq_ss(Seq->at(i).seq() , true, RSIZE/2, Seq->at(i).strand(), &W_buf); | |
| 1965 r_score += scan_seq_ss(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, Seq->at(i).strand(), &W_buf); | |
| 1966 max_flank_d = scan_seq_ss(Seq->at(i).seq() + ((RSIZE/2)*3), true, RSIZE/2, Seq->at(i).strand(), &W_buf); | |
| 1967 | |
| 1968 R_Avg += r_score; | |
| 1969 | |
| 1970 /* if(r_score < max_flank_u || r_score < max_flank_d) | |
| 1971 { | |
| 1972 if(max_flank_u > max_flank_d) | |
| 1973 G_Avg += max_flank_u; | |
| 1974 else | |
| 1975 G_Avg += max_flank_d; | |
| 1976 } | |
| 1977 else | |
| 1978 G_Avg += r_score;*/ | |
| 1979 | |
| 1980 if(POSITIONAL) | |
| 1981 { | |
| 1982 // for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 1983 // W_R_Avg[i] += W_buf[i]; | |
| 1984 | |
| 1985 W_R_v_score.push_back(W_buf); | |
| 1986 } | |
| 1987 } | |
| 1988 | |
| 1989 if(max_flank_u > max_flank_d) | |
| 1990 { | |
| 1991 F_Avg += max_flank_u; | |
| 1992 vF_score.push_back(max_flank_u); | |
| 1993 } | |
| 1994 else | |
| 1995 { | |
| 1996 F_Avg += max_flank_d; | |
| 1997 vF_score.push_back(max_flank_d); | |
| 1998 } | |
| 1999 | |
| 2000 if(VERBOSE) | |
| 2001 if(!QUIET) cerr << (char)13; | |
| 2002 } | |
| 2003 } | |
| 2004 | |
| 2005 R_Avg /= Seq->size(); | |
| 2006 F_Avg /= Seq->size(); | |
| 2007 // G_Avg /= Seq->size(); | |
| 2008 | |
| 2009 // for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 2010 // W_R_Avg[i] /= Seq->size(); | |
| 2011 | |
| 2012 if(SPEARMAN) | |
| 2013 { | |
| 2014 multimap<double, unsigned int> SRM; | |
| 2015 | |
| 2016 for(unsigned int i = 0; i < v_score.size(); i++) | |
| 2017 SRM.insert(make_pair(v_score[i], i)); | |
| 2018 | |
| 2019 multimap<double, unsigned int>::reverse_iterator mmi = SRM.rbegin(); | |
| 2020 int sr = 0; | |
| 2021 double d = 0; | |
| 2022 | |
| 2023 while(mmi != SRM.rend()) | |
| 2024 { | |
| 2025 // if(!QUIET) cerr << endl << mmi->first << '\t' << sr << '\t' << Seq->at(mmi->second).e_rank() << '\t' << this->name(); | |
| 2026 | |
| 2027 d += pow((sr - (int)Seq->at(mmi->second).e_rank()), 2); | |
| 2028 sr++; | |
| 2029 mmi++; | |
| 2030 } | |
| 2031 | |
| 2032 d *= 6; | |
| 2033 | |
| 2034 // if(!QUIET) cerr << endl << "d = " << d << "\tden = " << (Seq->size() * (pow(Seq->size(), 2) - 1)) << '\t' << "ssize = " << Seq->size(); | |
| 2035 | |
| 2036 d /= (Seq->size() * (pow(Seq->size(), 2) - 1)); | |
| 2037 | |
| 2038 spearman = 1 - d; | |
| 2039 } | |
| 2040 | |
| 2041 return; | |
| 2042 } | |
| 2043 | |
| 2044 void matrix::calc_var(double S) | |
| 2045 { | |
| 2046 for(unsigned int i = 0; i < S; i++) | |
| 2047 { | |
| 2048 R_Var += pow(v_score[i] - R_Avg,2); | |
| 2049 F_Var += pow(vF_score[i] - F_Avg,2); | |
| 2050 | |
| 2051 /* if(v_score[i] >= vF_score[i]) | |
| 2052 G_Var += pow(v_score[i] - G_Avg,2); | |
| 2053 else | |
| 2054 G_Var += pow(vF_score[i] - G_Avg,2);*/ | |
| 2055 } | |
| 2056 | |
| 2057 if(S > 1) | |
| 2058 { | |
| 2059 R_Var /= (S - 1); | |
| 2060 F_Var /= (S - 1); | |
| 2061 // G_Var /= (S - 1); | |
| 2062 } | |
| 2063 | |
| 2064 if(R_Var == 0) | |
| 2065 R_Var += EPSILON; | |
| 2066 | |
| 2067 if(F_Var == 0) | |
| 2068 F_Var += EPSILON; | |
| 2069 | |
| 2070 // if(G_Var == 0) | |
| 2071 // G_Var += EPSILON; | |
| 2072 | |
| 2073 return; | |
| 2074 } | |
| 2075 | |
| 2076 void matrix::calc_W_param(double S) | |
| 2077 { | |
| 2078 if(S > 1) | |
| 2079 S = S/2; //SOLO PER LA PRIMA META' DELLE SEQUENZE!!! | |
| 2080 | |
| 2081 W_Avg = 0; | |
| 2082 W_Var = 0; | |
| 2083 | |
| 2084 multimap<double, unsigned int> score_seq_map; | |
| 2085 | |
| 2086 for(unsigned int i = 0; i < v_score.size(); i++) | |
| 2087 score_seq_map.insert(make_pair(v_score.at(i), i)); | |
| 2088 | |
| 2089 multimap<double, unsigned int>::reverse_iterator mi = score_seq_map.rbegin(); | |
| 2090 | |
| 2091 for(unsigned int i = 0; i < S; i++) | |
| 2092 { | |
| 2093 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2094 W_R_Avg[j] += W_R_v_score[mi->second][j]; | |
| 2095 | |
| 2096 // if(!QUIET) cerr << Name << '\t' << mi->first << endl; | |
| 2097 | |
| 2098 mi++; | |
| 2099 } | |
| 2100 | |
| 2101 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2102 W_R_Avg[j] /= S; | |
| 2103 | |
| 2104 mi = score_seq_map.rbegin(); | |
| 2105 | |
| 2106 for(unsigned int i = 0; i < S; i++) | |
| 2107 { | |
| 2108 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2109 W_R_Var[j] += pow(W_R_v_score[mi->second][j] - W_R_Avg[j],2); | |
| 2110 | |
| 2111 mi++; | |
| 2112 } | |
| 2113 | |
| 2114 /* for(unsigned int i = 0; i < S; i++) | |
| 2115 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2116 W_R_Var[j] += pow(W_R_v_score[i][j] - W_R_Avg[j],2);*/ | |
| 2117 | |
| 2118 if(S > 1) | |
| 2119 { | |
| 2120 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2121 W_R_Var[j] /= (S - 1); | |
| 2122 } | |
| 2123 | |
| 2124 for(unsigned int j = 0; j < BIN_NUM; j++) | |
| 2125 if(W_R_Var[j] == 0) | |
| 2126 W_R_Var[j] += EPSILON; | |
| 2127 | |
| 2128 for(unsigned int j = 0; j < BIN_NUM - 1; j++) //THE LAST BIN IS WASTED SINCE IT IS AN HALF BIN | |
| 2129 W_Avg += W_R_Avg[j]; | |
| 2130 | |
| 2131 W_Avg /= (BIN_NUM - 1); | |
| 2132 | |
| 2133 for(unsigned int j = 0; j < BIN_NUM - 1; j++) | |
| 2134 W_Var += pow(W_R_Avg[j] - W_Avg, 2); | |
| 2135 | |
| 2136 W_Var /= (BIN_NUM - 2); | |
| 2137 | |
| 2138 return; | |
| 2139 } | |
| 2140 | |
| 2141 void matrix::welch_test(double S) | |
| 2142 { | |
| 2143 t = R_Avg - F_Avg; | |
| 2144 | |
| 2145 if(t == 0) | |
| 2146 t = EPSILON; | |
| 2147 | |
| 2148 t /= pow((R_Var/S) + (F_Var/S),0.5); | |
| 2149 | |
| 2150 dof = pow((R_Var/S) + (F_Var/S),2); | |
| 2151 | |
| 2152 dof /= (pow(R_Var,2)/((pow(S,2)) * (S-1))) + (pow(F_Var,2)/((pow(S,2)) * (S-1))); | |
| 2153 | |
| 2154 if(t > 0) | |
| 2155 pvalue = gsl_cdf_tdist_Q(t,dof); | |
| 2156 else | |
| 2157 { | |
| 2158 pvalue = gsl_cdf_tdist_P(t,dof); | |
| 2159 O_U = true; | |
| 2160 } | |
| 2161 | |
| 2162 return; | |
| 2163 } | |
| 2164 | |
| 2165 void matrix::z_test_pos(double S) | |
| 2166 { | |
| 2167 // if(!QUIET) cerr << endl << Name; | |
| 2168 | |
| 2169 for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 2170 { | |
| 2171 double se = sqrt(W_R_Var.at(i)), z; | |
| 2172 se /= sqrt(S/2); | |
| 2173 | |
| 2174 z = W_R_Avg.at(i) - W_Avg; | |
| 2175 z /=se; | |
| 2176 | |
| 2177 if(z >= 0) | |
| 2178 W_O_U.at(i) = true; | |
| 2179 | |
| 2180 W_R_pvalue.at(i) = gsl_cdf_ugaussian_Q(z); | |
| 2181 | |
| 2182 // if(!QUIET) cerr << endl << "W_R_Avg.at(i) " << W_R_Avg.at(i) << '\t' << W_Avg << '\t' << W_R_pvalue.at(i) << endl; | |
| 2183 | |
| 2184 } | |
| 2185 | |
| 2186 return; | |
| 2187 } | |
| 2188 | |
| 2189 void matrix::welch_test_pos(double S) | |
| 2190 { | |
| 2191 if(S > 1) | |
| 2192 S = S/2; //SOLO LA PRIMA META' DELLE SEQUENZE!!! | |
| 2193 | |
| 2194 // if(!QUIET) cerr << "\nRAVG = " << W_Avg << endl; | |
| 2195 for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 2196 { | |
| 2197 double T, DOF; | |
| 2198 | |
| 2199 T = W_R_Avg.at(i) - W_Avg; | |
| 2200 | |
| 2201 // if(!QUIET) cerr << W_R_Avg.at(i) << '\t'; | |
| 2202 | |
| 2203 // if(!QUIET) cerr << endl << Name << endl << "T = " << T << '\t' << W_R_Avg.at(i) << '\t' << W_Avg; | |
| 2204 | |
| 2205 if(T == 0) | |
| 2206 T = EPSILON; | |
| 2207 | |
| 2208 T /= pow((W_R_Var.at(i)/S) + (W_Var/S),0.5); | |
| 2209 | |
| 2210 DOF = pow((W_R_Var.at(i)/S) + (W_Var/S),2); | |
| 2211 | |
| 2212 DOF /= (pow(W_R_Var.at(i),2)/((pow(S,2)) * (S-1))) + (pow(W_Var,2)/((pow(S,2)) * (S-1))); | |
| 2213 | |
| 2214 if(T > 0) | |
| 2215 W_R_T_pvalue.at(i) = gsl_cdf_tdist_Q(T,DOF); | |
| 2216 else | |
| 2217 { | |
| 2218 W_R_T_pvalue.at(i) = gsl_cdf_tdist_P(T,DOF); | |
| 2219 W_O_U.at(i) = true; | |
| 2220 } | |
| 2221 | |
| 2222 // if(!QUIET) cerr << "\tPV= " << W_R_pvalue.at(i); | |
| 2223 | |
| 2224 } | |
| 2225 | |
| 2226 // if(!QUIET) cerr << endl; | |
| 2227 | |
| 2228 return; | |
| 2229 } | |
| 2230 | |
| 2231 void matrix::welch_test_bg(double S1) | |
| 2232 { | |
| 2233 double S2 = bg_Size; | |
| 2234 | |
| 2235 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl; | |
| 2236 | |
| 2237 t_bg = R_Avg - bg_Avg; | |
| 2238 | |
| 2239 if(t_bg == 0) | |
| 2240 t_bg = EPSILON; | |
| 2241 | |
| 2242 t_bg /= pow((R_Var/S1) + (bg_Var/S2),0.5); | |
| 2243 | |
| 2244 dof_bg = pow((R_Var/S1) + (bg_Var/S2),2); | |
| 2245 | |
| 2246 dof_bg /= (pow(R_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1))); | |
| 2247 | |
| 2248 if(t_bg > 0) | |
| 2249 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg); | |
| 2250 else | |
| 2251 { | |
| 2252 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg); | |
| 2253 O_U_bg = true; | |
| 2254 } | |
| 2255 | |
| 2256 return; | |
| 2257 } | |
| 2258 | |
| 2259 /*void matrix::welch_test_bg_G(double S1) | |
| 2260 { | |
| 2261 double S2 = bg_Size; | |
| 2262 | |
| 2263 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl; | |
| 2264 | |
| 2265 t_bg = G_Avg - bg_Avg; | |
| 2266 | |
| 2267 if(t_bg == 0) | |
| 2268 t_bg = EPSILON; | |
| 2269 | |
| 2270 t_bg /= pow((G_Var/S1) + (bg_Var/S2),0.5); | |
| 2271 | |
| 2272 dof_bg = pow((G_Var/S1) + (bg_Var/S2),2); | |
| 2273 | |
| 2274 dof_bg /= (pow(G_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1))); | |
| 2275 | |
| 2276 if(t_bg > 0) | |
| 2277 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg); | |
| 2278 else | |
| 2279 { | |
| 2280 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg); | |
| 2281 O_U_bg = true; | |
| 2282 } | |
| 2283 | |
| 2284 return; | |
| 2285 }*/ | |
| 2286 | |
| 2287 double matrix::scan_seq_ds(char *S, bool flank, unsigned int l, vector<double> *W_buf) | |
| 2288 { | |
| 2289 unsigned int pos = 0, max_pos = 0; | |
| 2290 double seq_score; | |
| 2291 bool strand = false; | |
| 2292 | |
| 2293 l -= L/2; | |
| 2294 | |
| 2295 while(pos < l) | |
| 2296 { | |
| 2297 | |
| 2298 double pos_score[2] = {0,0}; | |
| 2299 | |
| 2300 for(unsigned int j = 0; j < L; j++) | |
| 2301 { | |
| 2302 pos_score[0] += get_M_value(j, S[pos+j]); | |
| 2303 pos_score[1] += get_rc_M_value(j, S[pos+j]); | |
| 2304 } | |
| 2305 | |
| 2306 if(seq_score < pos_score[0] || !pos) | |
| 2307 { | |
| 2308 seq_score = pos_score[0]; | |
| 2309 max_pos = pos; | |
| 2310 strand = false; | |
| 2311 } | |
| 2312 if(seq_score < pos_score[1]) | |
| 2313 { | |
| 2314 seq_score = pos_score[1]; | |
| 2315 max_pos = pos; | |
| 2316 strand = true; | |
| 2317 } | |
| 2318 | |
| 2319 if(!flank && POSITIONAL) | |
| 2320 { | |
| 2321 unsigned int vpos = pos / W_STEP; | |
| 2322 | |
| 2323 if(W_buf->at(vpos) < pos_score[0] || !(pos % W_STEP)) | |
| 2324 W_buf->at(vpos) = pos_score[0]; | |
| 2325 | |
| 2326 if(W_buf->at(vpos) < pos_score[1]) | |
| 2327 W_buf->at(vpos) = pos_score[1]; | |
| 2328 | |
| 2329 if(vpos) | |
| 2330 { | |
| 2331 if(W_buf->at(vpos - 1) < pos_score[0]) | |
| 2332 W_buf->at(vpos - 1) = pos_score[0]; | |
| 2333 | |
| 2334 if(W_buf->at(vpos - 1) < pos_score[1]) | |
| 2335 W_buf->at(vpos - 1) = pos_score[1]; | |
| 2336 } | |
| 2337 } | |
| 2338 | |
| 2339 pos++; | |
| 2340 } | |
| 2341 | |
| 2342 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2343 | |
| 2344 if(!flank && POSITIONAL) | |
| 2345 { | |
| 2346 // if(!QUIET) cerr << endl << Name << endl; | |
| 2347 | |
| 2348 unsigned int binm = 0; | |
| 2349 | |
| 2350 for(unsigned int i = 0; i < BIN_NUM; i++) | |
| 2351 { | |
| 2352 if(W_buf->at(i) != 0) | |
| 2353 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score); | |
| 2354 else | |
| 2355 binm++; | |
| 2356 } | |
| 2357 | |
| 2358 BIN_NUM -= binm; | |
| 2359 } | |
| 2360 | |
| 2361 if(!flank) | |
| 2362 { | |
| 2363 v_score.push_back(seq_score); | |
| 2364 v_pos.push_back(max_pos + L/2); | |
| 2365 v_strand.push_back(strand); | |
| 2366 } | |
| 2367 | |
| 2368 return seq_score; | |
| 2369 } | |
| 2370 | |
| 2371 double matrix::scan_seq_ss_second_best(char *S, unsigned int l, unsigned int *P, bool *s, unsigned int POS, bool seq_strand) | |
| 2372 { | |
| 2373 unsigned int pos = 0, max_pos = 0; | |
| 2374 double seq_score = min_score; | |
| 2375 bool strand = false; | |
| 2376 | |
| 2377 while(pos < l) | |
| 2378 { | |
| 2379 if(pos == POS) | |
| 2380 { | |
| 2381 pos++; | |
| 2382 continue; | |
| 2383 } | |
| 2384 | |
| 2385 double pos_score[2] = {0,0}; | |
| 2386 | |
| 2387 if(seq_strand == '+') | |
| 2388 { | |
| 2389 for(unsigned int j = 0; j < L; j++) | |
| 2390 pos_score[0] += get_M_value(j, S[pos+j]); | |
| 2391 } | |
| 2392 else | |
| 2393 { | |
| 2394 for(unsigned int j = 0; j < L; j++) | |
| 2395 pos_score[0] += get_rc_M_value(j, S[pos+j]); | |
| 2396 } | |
| 2397 | |
| 2398 if(seq_score < pos_score[0]) | |
| 2399 { | |
| 2400 seq_score = pos_score[0]; | |
| 2401 max_pos = pos; | |
| 2402 strand = false; | |
| 2403 } | |
| 2404 | |
| 2405 | |
| 2406 pos++; | |
| 2407 } | |
| 2408 | |
| 2409 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2410 | |
| 2411 *P = max_pos; | |
| 2412 *s = strand; | |
| 2413 | |
| 2414 return seq_score; | |
| 2415 } | |
| 2416 | |
| 2417 double matrix::scan_seq_ds_second_best(char *S, unsigned int l, unsigned int *P, bool *s, int POS) | |
| 2418 { | |
| 2419 int pos = 0, max_pos = 0, count = 0; | |
| 2420 double seq_score; | |
| 2421 bool strand = false; | |
| 2422 | |
| 2423 POS -= L/2; | |
| 2424 l -= L/2; | |
| 2425 | |
| 2426 while(pos < l) | |
| 2427 { | |
| 2428 // if(pos == POS) | |
| 2429 if(pos >= (POS - L) && pos <= (POS + L)) | |
| 2430 { | |
| 2431 pos++; | |
| 2432 continue; | |
| 2433 } | |
| 2434 | |
| 2435 double pos_score[2] = {0,0}; | |
| 2436 | |
| 2437 for(unsigned int j = 0; j < L; j++) | |
| 2438 { | |
| 2439 pos_score[0] += get_M_value(j, S[pos+j]); | |
| 2440 pos_score[1] += get_rc_M_value(j, S[pos+j]); | |
| 2441 } | |
| 2442 | |
| 2443 if(seq_score < pos_score[0] || !count) | |
| 2444 { | |
| 2445 seq_score = pos_score[0]; | |
| 2446 max_pos = pos; | |
| 2447 strand = false; | |
| 2448 } | |
| 2449 if(seq_score < pos_score[1]) | |
| 2450 { | |
| 2451 seq_score = pos_score[1]; | |
| 2452 max_pos = pos; | |
| 2453 strand = true; | |
| 2454 } | |
| 2455 | |
| 2456 pos++; | |
| 2457 count++; | |
| 2458 } | |
| 2459 | |
| 2460 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2461 | |
| 2462 *P = max_pos ;//+ L/2; | |
| 2463 *s = strand; | |
| 2464 | |
| 2465 return seq_score; | |
| 2466 } | |
| 2467 | |
| 2468 | |
| 2469 | |
| 2470 /*double matrix::scan_seq_ds_quick(char *S, bool flank, unsigned int l) | |
| 2471 { | |
| 2472 unsigned int pos = 0, max_pos = 0; | |
| 2473 double seq_score = min_score; | |
| 2474 bool strand = false; | |
| 2475 | |
| 2476 while(pos < l) | |
| 2477 { | |
| 2478 double pos_score[2] = {0,0}; | |
| 2479 | |
| 2480 for(unsigned int j = 0; j < L; j++) | |
| 2481 { | |
| 2482 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]); | |
| 2483 | |
| 2484 if((pos_score[0] + p_max[j+1]) <= seq_score) | |
| 2485 { | |
| 2486 pos_score[0] = min_score; | |
| 2487 break; | |
| 2488 } | |
| 2489 | |
| 2490 } | |
| 2491 | |
| 2492 for(unsigned int j = 0; j < L; j++) | |
| 2493 { | |
| 2494 pos_score[1] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]); | |
| 2495 | |
| 2496 if((pos_score[1] + p_max[j+1]) <= seq_score) | |
| 2497 { | |
| 2498 pos_score[1] = min_score; | |
| 2499 break; | |
| 2500 } | |
| 2501 } | |
| 2502 | |
| 2503 if(seq_score < pos_score[0]) | |
| 2504 { | |
| 2505 seq_score = pos_score[0]; | |
| 2506 max_pos = pos; | |
| 2507 strand = false; | |
| 2508 } | |
| 2509 if(seq_score < pos_score[1]) | |
| 2510 { | |
| 2511 seq_score = pos_score[1]; | |
| 2512 max_pos = pos; | |
| 2513 strand = true; | |
| 2514 } | |
| 2515 | |
| 2516 pos++; | |
| 2517 } | |
| 2518 | |
| 2519 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2520 | |
| 2521 if(!flank) | |
| 2522 { | |
| 2523 v_score.push_back(seq_score); | |
| 2524 v_pos.push_back(max_pos); | |
| 2525 v_strand.push_back(strand); | |
| 2526 } | |
| 2527 | |
| 2528 | |
| 2529 return seq_score; | |
| 2530 } | |
| 2531 */ | |
| 2532 double matrix::scan_seq_ss(char *S, bool flank, unsigned int l, char seq_strand, vector<double> *W_buf) | |
| 2533 { | |
| 2534 unsigned int pos = 0, max_pos = 0; | |
| 2535 double seq_score = min_score; | |
| 2536 bool strand = false; | |
| 2537 | |
| 2538 while(pos < l) | |
| 2539 { | |
| 2540 double pos_score[2] = {0,0}; | |
| 2541 | |
| 2542 if(seq_strand == '+') | |
| 2543 { | |
| 2544 for(unsigned int j = 0; j < L; j++) | |
| 2545 pos_score[0] += get_M_value(j, S[pos+j]); | |
| 2546 } | |
| 2547 else | |
| 2548 { | |
| 2549 for(unsigned int j = 0; j < L; j++) | |
| 2550 pos_score[0] += get_rc_M_value(j, S[pos+j]); | |
| 2551 } | |
| 2552 | |
| 2553 if(seq_score < pos_score[0]) | |
| 2554 { | |
| 2555 seq_score = pos_score[0]; | |
| 2556 max_pos = pos; | |
| 2557 strand = false; | |
| 2558 } | |
| 2559 | |
| 2560 if(!flank && POSITIONAL) | |
| 2561 { | |
| 2562 unsigned int vpos = pos / W_STEP; | |
| 2563 | |
| 2564 if(W_buf->at(vpos) < pos_score[0]) | |
| 2565 W_buf->at(vpos) = pos_score[0]; | |
| 2566 | |
| 2567 if(vpos) | |
| 2568 { | |
| 2569 if(W_buf->at(vpos - 1) < pos_score[0]) | |
| 2570 W_buf->at(vpos - 1) = pos_score[0]; | |
| 2571 } | |
| 2572 } | |
| 2573 | |
| 2574 pos++; | |
| 2575 } | |
| 2576 | |
| 2577 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2578 | |
| 2579 if(POSITIONAL && !flank) | |
| 2580 for(unsigned int i = 0; i < W_buf->size(); i++) | |
| 2581 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score); | |
| 2582 | |
| 2583 if(!flank) | |
| 2584 { | |
| 2585 v_score.push_back(seq_score); | |
| 2586 v_pos.push_back(max_pos); | |
| 2587 v_strand.push_back(strand); | |
| 2588 } | |
| 2589 | |
| 2590 return seq_score; | |
| 2591 } | |
| 2592 | |
| 2593 /* double matrix::scan_seq_ss_quick(char *S, bool flank, unsigned int l, char seq_strand) | |
| 2594 { | |
| 2595 unsigned int pos = 0, max_pos = 0; | |
| 2596 double seq_score = min_score; | |
| 2597 bool strand = false; | |
| 2598 | |
| 2599 while(pos < l) | |
| 2600 { | |
| 2601 double pos_score[2] = {0,0}; | |
| 2602 | |
| 2603 if(seq_strand == '+') | |
| 2604 { | |
| 2605 for(unsigned int j = 0; j < L; j++) | |
| 2606 { | |
| 2607 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]); | |
| 2608 | |
| 2609 if((pos_score[0] + p_max[j+1]) <= seq_score) | |
| 2610 { | |
| 2611 pos_score[0] = min_score; | |
| 2612 break; | |
| 2613 } | |
| 2614 | |
| 2615 | |
| 2616 } | |
| 2617 } | |
| 2618 else | |
| 2619 { | |
| 2620 for(unsigned int j = 0; j < L; j++) | |
| 2621 { | |
| 2622 pos_score[0] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]); | |
| 2623 | |
| 2624 if((pos_score[0] + p_max[j+1]) <= seq_score) | |
| 2625 { | |
| 2626 pos_score[0] = min_score; | |
| 2627 break; | |
| 2628 } | |
| 2629 | |
| 2630 } | |
| 2631 } | |
| 2632 | |
| 2633 if(seq_score < pos_score[0]) | |
| 2634 { | |
| 2635 seq_score = pos_score[0]; | |
| 2636 max_pos = pos; | |
| 2637 strand = false; | |
| 2638 } | |
| 2639 | |
| 2640 pos++; | |
| 2641 } | |
| 2642 | |
| 2643 seq_score = 1 + (seq_score - max_score)/(max_score - min_score); | |
| 2644 | |
| 2645 if(!flank) | |
| 2646 { | |
| 2647 v_score.push_back(seq_score); | |
| 2648 v_pos.push_back(max_pos); | |
| 2649 v_strand.push_back(strand); | |
| 2650 } | |
| 2651 | |
| 2652 return seq_score; | |
| 2653 }*/ | |
| 2654 | |
| 2655 double matrix::get_M_value(int pos, char C) | |
| 2656 { | |
| 2657 return M[pos][VI[C]]; | |
| 2658 } | |
| 2659 | |
| 2660 double matrix::get_rc_M_value(int pos, char C) | |
| 2661 { | |
| 2662 return rc_M[pos][VI[C]]; | |
| 2663 } | |
| 2664 | |
| 2665 /* | |
| 2666 void matrix::set_top_pos() | |
| 2667 { | |
| 2668 multimap<double, mpoint> Tmap; | |
| 2669 vector<double> p_max_buf, rc_p_max_buf; | |
| 2670 | |
| 2671 for(unsigned int x = 0; x < L; x++) | |
| 2672 { | |
| 2673 for(unsigned int y = 0; y < ALPHABET_SIZE; y++) | |
| 2674 { | |
| 2675 mpoint p; | |
| 2676 | |
| 2677 p.x = x; | |
| 2678 p.y = y; | |
| 2679 | |
| 2680 Tmap.insert(make_pair(M[x][y],p)); | |
| 2681 | |
| 2682 if(!y) | |
| 2683 { | |
| 2684 p_max_buf.push_back(M[x][y]); | |
| 2685 rc_p_max_buf.push_back(rc_M[x][y]); | |
| 2686 } | |
| 2687 else | |
| 2688 { | |
| 2689 if(M[x][y] > p_max_buf.back()) | |
| 2690 p_max_buf[p_max_buf.size() - 1] = M[x][y]; | |
| 2691 | |
| 2692 if(rc_M[x][y] > rc_p_max_buf.back()) | |
| 2693 rc_p_max_buf[rc_p_max_buf.size() - 1] = rc_M[x][y]; | |
| 2694 } | |
| 2695 | |
| 2696 } | |
| 2697 | |
| 2698 // if(!QUIET) cerr << endl << p_max_buf.back(); | |
| 2699 } | |
| 2700 | |
| 2701 multimap<double, mpoint>::reverse_iterator ri = Tmap.rbegin(); | |
| 2702 | |
| 2703 for(unsigned int i = 0; i < TOP_P; i++) | |
| 2704 { | |
| 2705 top_pos[i] = ri->second.x; | |
| 2706 top_char[i] = ALPHABET[ri->second.y]; | |
| 2707 | |
| 2708 rc_top_pos[i] = (L - 1) - ri->second.x; | |
| 2709 rc_top_char[i] = RC_ALPHABET[ri->second.y]; | |
| 2710 | |
| 2711 ri++; | |
| 2712 } | |
| 2713 | |
| 2714 // if(!QUIET) cerr << endl << name() << '\t' << max_score << '\t' << min_score << endl; | |
| 2715 | |
| 2716 while(p_max.size() != p_max_buf.size()) | |
| 2717 { | |
| 2718 double buf = -INF; | |
| 2719 unsigned int pos; | |
| 2720 | |
| 2721 for(unsigned int i = 0; i < p_max_buf.size(); i++) | |
| 2722 if(p_max_buf.at(i) > buf) | |
| 2723 { | |
| 2724 pos = i; | |
| 2725 buf = p_max_buf.at(i); | |
| 2726 } | |
| 2727 | |
| 2728 p_pos.push_back(pos); | |
| 2729 rc_p_pos.push_back((L - 1) - pos); | |
| 2730 | |
| 2731 p_max.push_back(buf); | |
| 2732 | |
| 2733 p_max_buf[pos] = -INF; | |
| 2734 } | |
| 2735 | |
| 2736 for(unsigned int i = 0; i < L; i++) | |
| 2737 { | |
| 2738 //if(!QUIET) cerr << p_max[i] << "//"<< '\t'; | |
| 2739 | |
| 2740 for(unsigned int j = i + 1; j < L; j++) | |
| 2741 p_max[i] += p_max[j]; | |
| 2742 | |
| 2743 // for(unsigned int j = i + 1; j < L; j++) | |
| 2744 // rc_p_max[i] += rc_p_max[j]; | |
| 2745 | |
| 2746 // if(!QUIET) cerr << p_max[i] << "(" << p_pos[i] << ")" << rc_p_max[i] << '\t'; | |
| 2747 } | |
| 2748 | |
| 2749 p_max.push_back(0); //WE NEED A 0 AT THE END OF THE VECTOR! | |
| 2750 // rc_p_max.push_back(0); | |
| 2751 | |
| 2752 // if(!QUIET) cerr << endl; | |
| 2753 | |
| 2754 return; | |
| 2755 } | |
| 2756 */ | |
| 2757 double matrix::get_score_at(unsigned int a) | |
| 2758 { | |
| 2759 return v_score.at(a); | |
| 2760 } | |
| 2761 unsigned int matrix::get_pos_at(unsigned int a) | |
| 2762 { | |
| 2763 return v_pos.at(a); | |
| 2764 } | |
| 2765 bool matrix::get_strand_at(unsigned int a) | |
| 2766 { | |
| 2767 return v_strand.at(a); | |
| 2768 } | |
| 2769 | |
| 2770 double matrix::max() | |
| 2771 { | |
| 2772 return max_score; | |
| 2773 } | |
| 2774 | |
| 2775 double matrix::min() | |
| 2776 { | |
| 2777 return min_score; | |
| 2778 } | |
| 2779 | |
| 2780 string matrix::name() | |
| 2781 { | |
| 2782 if(!Name.empty()) | |
| 2783 return Name; | |
| 2784 else | |
| 2785 return Id; | |
| 2786 } | |
| 2787 | |
| 2788 string matrix::id() | |
| 2789 { | |
| 2790 return Id; | |
| 2791 } | |
| 2792 | |
| 2793 double matrix::get_pvalue() | |
| 2794 { | |
| 2795 return pvalue; | |
| 2796 } | |
| 2797 | |
| 2798 double matrix::get_pvalue_bg() | |
| 2799 { | |
| 2800 return pvalue_bg; | |
| 2801 } | |
| 2802 | |
| 2803 double matrix::get_W_pvalue(unsigned int pos) | |
| 2804 { | |
| 2805 return W_R_pvalue.at(pos); | |
| 2806 } | |
| 2807 | |
| 2808 double matrix::get_W_R_Avg_at_pos(unsigned int pos) | |
| 2809 { | |
| 2810 return W_R_Avg.at(pos); | |
| 2811 } | |
| 2812 | |
| 2813 string matrix::o_or_u() | |
| 2814 { | |
| 2815 if(O_U) | |
| 2816 return "UNDER"; | |
| 2817 else | |
| 2818 return "OVER"; | |
| 2819 } | |
| 2820 | |
| 2821 string matrix::o_or_u_bg() | |
| 2822 { | |
| 2823 if(O_U_bg) | |
| 2824 return "UNDER"; | |
| 2825 else | |
| 2826 return "OVER"; | |
| 2827 } | |
| 2828 | |
| 2829 int matrix::W_o_or_u(unsigned int pos) | |
| 2830 { | |
| 2831 if(W_O_U[pos]) | |
| 2832 return -1; | |
| 2833 else | |
| 2834 return 1; | |
| 2835 } | |
| 2836 | |
| 2837 double matrix::get_R_Avg() | |
| 2838 { | |
| 2839 return R_Avg; | |
| 2840 } | |
| 2841 double matrix::get_F_Avg() | |
| 2842 { | |
| 2843 return F_Avg; | |
| 2844 } | |
| 2845 double matrix::get_R_Var() | |
| 2846 { | |
| 2847 return R_Var; | |
| 2848 } | |
| 2849 double matrix::get_F_Var() | |
| 2850 { | |
| 2851 return F_Var; | |
| 2852 } | |
| 2853 double matrix::get_bg_Avg() | |
| 2854 { | |
| 2855 return bg_Avg; | |
| 2856 } | |
| 2857 | |
| 2858 double matrix::get_t() | |
| 2859 { | |
| 2860 return t; | |
| 2861 } | |
| 2862 | |
| 2863 double matrix::get_dof() | |
| 2864 { | |
| 2865 return dof; | |
| 2866 } | |
| 2867 | |
| 2868 bool matrix::have_bg() | |
| 2869 { | |
| 2870 return BG; | |
| 2871 } | |
| 2872 | |
| 2873 unsigned int matrix::bin_num() | |
| 2874 { | |
| 2875 return BIN_NUM; | |
| 2876 } | |
| 2877 | |
| 2878 unsigned int matrix::length() | |
| 2879 { | |
| 2880 return L; | |
| 2881 } | |
| 2882 | |
| 2883 double matrix::get_spearman() | |
| 2884 { | |
| 2885 double s = spearman; | |
| 2886 | |
| 2887 s *= 10000; | |
| 2888 | |
| 2889 if(s > 0) | |
| 2890 s = floor(s); | |
| 2891 else if(s < 0) | |
| 2892 s = ceil(s); | |
| 2893 | |
| 2894 s /= 10000; | |
| 2895 | |
| 2896 return s; | |
| 2897 } | |
| 2898 | |
| 2899 double matrix::get_W_Avg() | |
| 2900 { | |
| 2901 return W_Avg; | |
| 2902 } | |
| 2903 | |
| 2904 double matrix::get_W_R_T_pvalue(unsigned int i) | |
| 2905 { | |
| 2906 return W_R_T_pvalue.at(i); | |
| 2907 } | |
| 2908 // MATRIX CLASS END |
