comparison pscan_chip/pscan_chip_g.cpp @ 9:6a5cfb8a872f draft

Uploaded
author fz77
date Mon, 17 Nov 2014 09:40:44 -0500
parents
children
comparison
equal deleted inserted replaced
8:2aad8f91e6f2 9:6a5cfb8a872f
1 // Pscan-ChIP 1.1
2 // // Copyright (C) 2013 Federico Zambelli and Giulio Pavesi
3 // //
4 // // This program is free software: you can redistribute it and/or modify
5 // // it under the terms of the GNU General Public License as published by
6 // // the Free Software Foundation, either version 3 of the License, or
7 // // (at your option) any later version.
8 // //
9 // // This program is distributed in the hope that it will be useful,
10 // // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // // GNU General Public License for more details.
13 // //
14 // // You should have received a copy of the GNU General Public License
15 // // along with this program. If not, see <http://www.gnu.org/licenses/>.
16 // //
17 // // If you find Pscan-ChIP useful in your research please cite our paper:
18 // //
19 // // Zambelli F, Pesole G, Pavesi G. PscanChIP: Finding over-represented
20 // // transcription factor-binding site motifs and their correlations in sequences from
21 // // ChIP-Seq experiments.
22 // // Nucleic Acids Res. 2013 Jul;41(Web Server issue):W535-43.
23
24 #include <string>
25 #include <iostream>
26 #include <fstream>
27 #include <vector>
28 #include <sstream>
29 #include <math.h>
30 #include <map>
31 #include <cstdlib>
32 #include <gsl/gsl_randist.h>
33 #include <gsl/gsl_cdf.h>
34 #include <string.h>
35 #include <ctime>
36
37 using namespace std;
38
39 const double MIN_ADD_FREQ = 0.01, EPSILON = pow(10,-21), INF = pow(10,21), /*POS_VIS_THRESHOLD = pow(10,-5),*/ DIST_RATIO_THRESHOLD = 3, DIST_PV_THRESHOLD = pow(10,-5);
40 const unsigned int MAX_REG_SIZE = 3000, ALPHABET_SIZE = 4, TOP_P = 1, DIST_HIT_THRESHOLD = 10;
41 //const char ALPHABET[4] = {'A','C','G','T'}, RC_ALPHABET[4] = {'T','G','C','A'};
42 char VI['T' + 1];
43 string regfile, genome, matrixfile, additional_matrixfile, outfile, bgfile, selected, obstf, GALAXY_OUTPUT;
44 unsigned int RSIZE = 150, MAX_L = 0, W_SIZE = 10, W_STEP = 5;
45 bool SECOND_BEST_POS = false, SINGLE_STRAND = false, VERBOSE = false, GET_SEQUENCES = false, POSITIONAL = false, SINGLE_MATRIX = false, SPEARMAN = false, TWOBIT = false, QUIET = false; //NAIVE = false;
46 ofstream fasta_out;
47 double SINGLE_MATRIX_BG_AVG = 0;
48
49 struct mpoint
50 {
51 unsigned int x;
52 unsigned int y;
53 };
54
55 struct bg
56 {
57 unsigned int Size;
58 double Avg, Var;
59 };
60
61 class sequence
62 {
63 private:
64 string Chr, Name;
65 char *Seq;
66 unsigned int Start, End, E_Rank;
67 char Strand, AntiStrand;
68 bool good;
69 public:
70 sequence(string*, ifstream*, unsigned int);
71 sequence(string*, unsigned int, string, unsigned int);
72 char strand();
73 char antistrand();
74 string chr();
75 unsigned int start();
76 unsigned int end();
77 unsigned int e_rank();
78 string name();
79 char* seq();
80 bool is_good();
81 double score();
82 int mpos();
83 };
84
85 class matrix
86 {
87 friend void scanner(vector<matrix>*, vector<sequence>*);
88 private:
89 void normalizer();
90 void to_log();
91 void avg();
92 void max_min();
93 void rev_c();
94 double get_M_value(int, char);
95 double get_rc_M_value(int, char);
96 void calc_var(double);
97 void calc_W_param(double);
98 void welch_test(double);
99 void welch_test_bg(double);
100 // void welch_test_bg_G(double);
101 void welch_test_pos(double);
102 void z_test_pos(double);
103 unsigned int L, bg_Size, BIN_NUM;
104 double **M;
105 double **rc_M;
106 double max_score, min_score, t, dof, t_bg, dof_bg;
107 double spearman;
108 string Name, Id;
109 bool O_U, O_U_bg;
110 vector<bool> W_O_U;
111 bool good, BG;
112 bool norm_sec_step;
113 vector<double> v_score, vF_score;
114 vector<unsigned int> v_pos;
115 vector<bool> v_strand;
116 vector<vector<double> > W_R_v_score;
117 vector<double> W_R_Avg, W_R_Var, W_R_pvalue, W_R_T_pvalue;
118 double R_Avg, F_Avg, R_Var, F_Var, /*G_Avg, G_Var,*/ pvalue, W_Avg, W_Var;
119 double bg_Avg, bg_Var, pvalue_bg;
120 public:
121 matrix(vector<string>*, map<string,bg>*);
122 void display();
123 bool is_good();
124 string name();
125 string id();
126 string o_or_u();
127 string o_or_u_bg();
128 int W_o_or_u(unsigned int);
129 void scan(vector<sequence>*);
130 double scan_seq_ds(char*, bool, unsigned int, vector<double>*);
131 double scan_seq_ss(char*, bool, unsigned int, char, vector<double>*);
132 double scan_seq_ds_second_best(char*, unsigned int, unsigned int *, bool *, int);
133 double scan_seq_ss_second_best(char*, unsigned int, unsigned int *, bool *, unsigned int, bool);
134 double get_score_at(unsigned int);
135 unsigned int get_pos_at(unsigned int);
136 unsigned int length();
137 bool get_strand_at(unsigned int);
138 bool have_bg();
139 unsigned int bin_num();
140 double get_R_Avg();
141 double get_F_Avg();
142 double get_bg_Avg();
143 double get_R_Var();
144 double get_F_Var();
145 double max();
146 double min();
147 double get_pvalue();
148 double get_pvalue_bg();
149 double get_W_pvalue(unsigned int);
150 double get_W_R_Avg_at_pos(unsigned int);
151 double get_t();
152 double get_dof();
153 double get_spearman();
154 double get_W_Avg();
155 double get_W_R_T_pvalue(unsigned int);
156 };
157
158 void command_line_parser(int, char**);
159 void acquire_regions(vector<sequence>*);
160 void acquire_matrices(vector<matrix>*, map<string,bg>*, string*);
161 void acquire_background(map<string,bg>*);
162 void scanner(vector<matrix>*, vector<sequence>*);
163 void main_output(vector<matrix>*, vector<sequence>*, unsigned int);
164 void main_output_single_matrix(vector<matrix>*, vector<sequence>*);
165 void distance_analysis(vector<matrix>*, vector<sequence>*, unsigned int, multimap <double, vector<matrix>::iterator>*);
166 void display_help();
167
168 int main(int argc, char **argv)
169 {
170 srand(time(NULL));
171
172 vector<sequence> Seq;
173 vector<matrix> Mat;
174 map<string,bg> Bg;
175 VI['A'] = 0;
176 VI['C'] = 1;
177 VI['G'] = 2;
178 VI['T'] = 3;
179 VI['N'] = 4;
180
181 obstf.clear();
182
183 command_line_parser(argc, argv);
184
185 if(GET_SEQUENCES)
186 {
187 string outfile = regfile;
188 outfile += ".fasta";
189 fasta_out.open(outfile.c_str(), ios::out);
190 }
191
192 if(!regfile.empty() && !matrixfile.empty())
193 {
194 if(!bgfile.empty())
195 {
196 if(!QUIET) cerr << "\nAcquiring background file... ";
197 acquire_background(&Bg);
198 if(!QUIET) cerr << "done" << endl;
199 }
200
201 if(!QUIET) cerr << "\nReading binding profiles...";
202 acquire_matrices(&Mat, &Bg, &matrixfile);
203
204 if(!additional_matrixfile.empty())
205 acquire_matrices(&Mat, &Bg, &additional_matrixfile);
206
207 if(Mat.empty())
208 {
209 if(!QUIET) cerr << "\nZero valid matrices acquired. Quitting..." << endl;
210 exit(1);
211 }
212
213 if(!QUIET) cerr << "done: " << Mat.size() << " binding profiles acquired. Max matrix length = " << MAX_L << endl;
214
215 if(Mat.size() == 1)
216 {
217 if(!QUIET) cerr << "\nSwitching to single matrix mode..." << endl;
218 SINGLE_MATRIX = true;
219 }
220
221 if(!QUIET) cerr << "\nAcquiring regions...";
222 acquire_regions(&Seq);
223 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl;
224
225 if(Seq.size() <= 2)
226 POSITIONAL = false;
227
228 if(Seq.empty())
229 {
230 if(!QUIET) cerr << "\nZero regions to work on! Exiting..." << endl;
231 exit(1);
232 }
233
234 if(GET_SEQUENCES)
235 {
236 fasta_out.close();
237 exit(0);
238 }
239
240 // for(unsigned int i = 0; i < Mat.size(); i++)
241 // Mat[i].display();
242
243 if(!QUIET) cerr << "\nScanning...";
244 scanner(&Mat, &Seq);
245
246 if(!QUIET) cerr << "\nWriting output..." << endl;
247
248 if(!SINGLE_MATRIX)
249 main_output(&Mat, &Seq, Seq.size());
250 else
251 main_output_single_matrix(&Mat, &Seq);
252
253 if(!QUIET) cerr << "\nOutput written in file: " << outfile << endl;
254
255 }
256 else if(GET_SEQUENCES && !regfile.empty())
257 {
258 if(!QUIET) cerr << "\nAcquiring regions...";
259 acquire_regions(&Seq);
260 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl;
261
262 fasta_out.close();
263 exit(0);
264 }
265 else
266 display_help();
267
268
269 /* for(unsigned int i = 0; i < Seq.size(); i++)
270 {
271 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << Seq[i].start() << '\t' << Seq[i].end() << '\t'
272 << Mat[0].get_score_at(i) << '\t' << Mat[0].get_pos_at(i) << '\t' << Mat[0].get_strand_at(i) << '\t'
273 << "Max = " << Mat[0].max() << '\t' << "Min = " << Mat[0].min()
274 << endl << Seq[i].seq() << endl;
275 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "UPSTREAM" << endl << Seq[i].uf_seq() << endl;
276 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "DOWNSTREAM" << endl << Seq[i].df_seq() << endl;
277 }
278 */
279
280 return 0;
281 }
282
283 void display_help()
284 {
285 if(!QUIET) cerr << endl << "Syntax: pscan_chip -r region_file.bed -M matrixfile.wil -g genome [-pos] [-wsize window_size] [-wstep window_step] [-bg background_file] [-s region_size]" << endl << endl;
286 exit(0);
287 }
288
289 void main_output_single_matrix(vector<matrix> *Mat, vector<sequence> *Seq)
290 {
291 outfile = regfile;
292 outfile += ".";
293 outfile += Mat->at(0).id();
294
295 string outfile2 = outfile;
296
297 outfile += ".ris";
298 outfile2 += ".all.ris";
299
300 if(!GALAXY_OUTPUT.empty())
301 outfile = GALAXY_OUTPUT;
302
303 multimap <double, string> outmap;
304
305 ofstream out(outfile.c_str(), ios::out);
306 ofstream out2(outfile2.c_str(), ios::out);
307
308 if(!out)
309 {
310 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl;
311 exit(1);
312 }
313
314 out << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE";
315 out2 << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE";
316
317 // if(SPEARMAN)
318 // out << "\tE_RANK";
319
320 if(SECOND_BEST_POS)
321 {
322 out << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl;
323 out2 << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl;
324 }
325 else
326 {
327 out << endl;
328 out2 << endl;
329 }
330
331 matrix *M = &Mat->at(0);
332
333 /* if(SINGLE_MATRIX_BG_AVG == 0) //QUESTO SERVE PER QUANDO NON C'E' IL VALORE DI BACKGROUND!
334 {
335 double loc_avg = 0, loc_stdev = 0;
336
337 for(unsigned int i = 0; i < Seq->size(); i++)
338 loc_avg += M->get_score_at(i);
339
340 loc_avg /= Seq->size();
341
342 for(unsigned int i = 0; i < Seq->size(); i++)
343 loc_stdev += pow(M->get_score_at(i) - loc_avg, 2);
344
345 loc_stdev /= (Seq->size() - 1);
346
347 loc_stdev = sqrt(loc_stdev);
348
349 SINGLE_MATRIX_BG_AVG = loc_avg - (2*loc_stdev);
350
351 if(!QUIET) cerr << "\nSMBG = " << SINGLE_MATRIX_BG_AVG;
352 }
353 */
354 char Oligo[M->length() + 1];
355
356 Oligo[M->length()] = '\0';
357
358 for(unsigned int i = 0; i < Seq->size(); i++)
359 {
360 // if(!QUIET) cerr << endl << M->get_score_at(i) << '\t' << M->get_bg_Avg() << '\t';
361
362 // if(M->get_score_at(i) < SINGLE_MATRIX_BG_AVG) //ONLY REPORT OCCURRENCES WITH SCORE >= BG_AVG
363 // continue;
364
365 // if(!QUIET) cerr << "passed";
366
367 ostringstream outbuf;
368
369 sequence *S = &Seq->at(i);
370
371 strncpy(Oligo, S->seq() + RSIZE/2 + (M->get_pos_at(i) - M->length()/2), M->length());
372
373 outbuf << S->chr() << '\t' << S->start() << '\t' << S->end() << '\t' << S->strand() << '\t'
374 << S->start() + M->get_pos_at(i) - M->length()/2 << '\t' << S->start() + M->get_pos_at(i) + M->length()/2 << '\t'
375 << (int)((M->get_pos_at(i) - M->length()/2) - RSIZE/2) << '\t' << (int)((M->get_pos_at(i) + M->length()/2 - 1) - RSIZE/2)<< '\t';
376
377 if(S->strand() == '+')
378 {
379 if(!M->get_strand_at(i))
380 outbuf << S->strand();
381 else
382 outbuf << S->antistrand();
383 }
384 else
385 {
386 if(!M->get_strand_at(i))
387 outbuf << S->antistrand();
388 else
389 outbuf << S->strand();
390 }
391
392 outbuf << '\t' << M->get_score_at(i) << '\t' << Oligo;
393
394 // if(SPEARMAN)
395 // outbuf << '\t' << S->e_rank();
396
397 if(SECOND_BEST_POS)
398 {
399 if(SINGLE_STRAND)
400 {
401 bool s;
402 unsigned int P;
403 double score;
404
405 score = M->scan_seq_ss_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s,M->get_pos_at(i), S->strand());
406
407 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length());
408
409 outbuf << '\t' << P << '\t' << score << '\t' << Oligo;
410 }
411 else
412 {
413 bool s;
414 unsigned int P;
415 double score;
416
417 score = M->scan_seq_ds_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s, (int)M->get_pos_at(i));
418
419 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length());
420
421 outbuf << '\t' << (int)P - (int)RSIZE/2 << '\t' << score << '\t' << Oligo << '\t';
422
423 if(s)
424 outbuf << '-';
425 else
426 outbuf << '+';
427 }
428
429 outbuf << endl;
430 }
431 else
432 outbuf << endl;
433
434 outmap.insert(make_pair(M->get_score_at(i),outbuf.str()));
435 }
436
437 multimap<double,string>::reverse_iterator mi = outmap.rbegin();
438
439 // unsigned int count = 0;
440
441 while(mi != outmap.rend())
442 {
443 // if(count >= outmap.size()/2 && outmap.size() > 1) //SOLO LA MIGLIORE META' DELLE OCCORRENZE!!!
444 // break;
445 if(mi->first >= SINGLE_MATRIX_BG_AVG || mi == outmap.rbegin())
446 out << mi->second;
447
448 out2 << mi->second;
449
450 mi++;
451 // count++;
452 }
453
454 out.close();
455 out2.close();
456 return;
457 }
458
459 void main_output(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE)
460 {
461 multimap <double, vector<matrix>::iterator> S_map;
462 outfile = regfile;
463
464 outfile += ".res";
465
466 if(!GALAXY_OUTPUT.empty())
467 outfile = GALAXY_OUTPUT;
468
469 ofstream out(outfile.c_str(),ios::out);
470
471 if(!out)
472 {
473 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl;
474 exit(1);
475 }
476
477 out << "NAME\tID\tREG_N\tL_PVALUE O/U\tG_PVALUE O/U\tR_AVG\tR_STDEV\tF_AVG\tBG_AVG";
478
479 if(SPEARMAN)
480 out << "\tSP_COR";
481
482 if(POSITIONAL)
483 out << "\tPREF_POS\tPREF_POS_PV\n";
484 else
485 out << endl;
486
487 vector<matrix>::iterator vi = Mat->begin();
488
489 while(vi != Mat->end())
490 {
491 if(S_map.find(vi->get_pvalue()) != S_map.end()) //GFI WORKAROUND
492 {
493 if(S_map.find(vi->get_pvalue())->second->get_pvalue_bg() < vi->get_pvalue_bg())
494 {
495 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi));
496 vi++;
497 continue;
498 }
499 else if((S_map.find(vi->get_pvalue())->second->get_pvalue_bg() > vi->get_pvalue_bg()))
500 {
501 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi));
502 vi++;
503 continue;
504 }
505 else
506 {
507 if(S_map.find(vi->get_pvalue())->second->get_spearman() > vi->get_spearman())
508 {
509 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi));
510 vi++;
511 continue;
512 }
513 else if(S_map.find(vi->get_pvalue())->second->get_spearman() <= vi->get_spearman())
514 {
515 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi));
516 vi++;
517 continue;
518 }
519 }
520
521 }
522 else
523 S_map.insert(make_pair(vi->get_pvalue(), vi));
524
525 vi++;
526 }
527
528 multimap <double, vector<matrix>::iterator>::iterator mi = S_map.begin();
529
530 while(mi != S_map.end())
531 {
532 double pv1, pv2;
533
534 pv1 = mi->second->get_pvalue() * Mat->size();
535
536 if(pv1 > 1)
537 pv1 = 1;
538
539 pv2 = mi->second->get_pvalue_bg() * Mat->size();
540
541 if(pv2 > 1)
542 pv2 = 1;
543
544 out << mi->second->name() << '\t' << mi->second->id() << '\t' << SIZE << '\t' << pv1 << ' ' << mi->second->o_or_u() << '\t';
545
546 if(mi->second->have_bg())
547 out << pv2 << ' ' << mi->second->o_or_u_bg() << '\t';
548 else
549 out << "N/A N/A\t";
550
551 out << mi->second->get_R_Avg() << '\t' << pow(mi->second->get_R_Var(), 0.5) << '\t' << mi->second->get_F_Avg() << '\t';
552
553 if(mi->second->have_bg())
554 out << mi->second->get_bg_Avg() << '\t';
555 else
556 out << "N/A" << '\t';
557
558 if(SPEARMAN)
559 out << mi->second->get_spearman() << '\t';
560
561 if(POSITIONAL)
562 {
563 double min_pv = 1, min_pv_c;
564 unsigned int min_pos;
565
566 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
567 {
568 if(mi->second->get_W_pvalue(i) <= min_pv) //&& mi->second->W_o_or_u(i) == 1)
569 {
570 min_pv = mi->second->get_W_pvalue(i);
571 min_pos = i;
572 }
573 }
574
575 min_pv_c = min_pv * (mi->second->bin_num() * Seq->size());
576
577 if(min_pv_c > 1)
578 min_pv_c = 1;
579
580 out << '[' << (int)(((W_STEP * min_pos) + mi->second->length()/2) - RSIZE/2) << ',' << (int)(((W_STEP * min_pos) + W_SIZE) + mi->second->length()/2 - RSIZE/2) << ']' << '\t' << min_pv_c;// << '\t' << mi->second->get_W_R_T_pvalue(min_pos) << '\t' << min_pv_c;
581 }
582
583 out << endl;
584
585 mi++;
586 }
587
588 if(POSITIONAL)
589 {
590 out << endl << endl << "POSITIONAL ANALYSIS: W_SIZE = " << W_SIZE << " W_STEP = " << W_STEP << endl << endl;
591
592 out << "NAME\tID\t";
593
594 mi = S_map.begin();
595
596 for(int i = -(int)(RSIZE/2); i <= (int)((RSIZE/2) - W_SIZE); i += W_STEP)
597 out << i /*<< '|' << i + (int)(W_SIZE - 1)*/ << '\t';
598
599 out << endl;
600
601 while(mi != S_map.end())
602 {
603 /* bool gflag = false;
604
605 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
606 if(mi->second->get_W_pvalue(i) <= POS_VIS_THRESHOLD)
607 {
608 gflag = true;
609 break;
610 }
611
612 if(!gflag)
613 {
614 mi++;
615 continue;
616 }
617 */
618 out << mi->second->name() << '\t' << mi->second->id() << '\t';
619
620 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
621 {
622 // out << (-log(mi->second->get_W_pvalue(i))/log(10)) * mi->second->W_o_or_u(i) << '\t';//<< " [" << mi->second->W_o_or_u(i) << "]\t";
623 //
624 double bpv = mi->second->get_W_pvalue(i) * (mi->second->bin_num() * Seq->size());
625
626 if(bpv > 1)
627 bpv = 1;
628
629 out << bpv << '\t';
630 }
631
632 out << endl;
633
634 mi++;
635 }
636
637 if(!obstf.empty())
638 distance_analysis(Mat, Seq, SIZE, &S_map);
639 }
640
641 out.close();
642
643 return;
644 }
645
646 void distance_analysis(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE, multimap <double, vector<matrix>::iterator> *S_map)
647 {
648 string dist_outfile = regfile, dist2_outfile = regfile, dist3_outfile = regfile;
649
650 dist_outfile += ".dist";
651 dist2_outfile += ".exp_dist";
652 dist3_outfile += ".pv_dist";
653
654 ofstream out(dist_outfile.c_str()), out2, out3;
655
656 if(!obstf.empty())
657 {
658 out2.open(dist2_outfile.c_str());
659 out3.open(dist3_outfile.c_str());
660
661 out2 << "TF/EXP_HITS\t";
662 out3 << "TF/PVALUE\t";
663 }
664
665 out << "TF1|TF2/HITS\t";
666
667 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++)
668 out << p << '\t';
669
670 out << endl;
671
672 if(!obstf.empty())
673 {
674 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++)
675 {
676 out2 << p << '\t';
677 out3 << p << '\t';
678 }
679
680 out2 << endl;
681 out3 << endl;
682 }
683
684 multimap <double, vector<matrix>::iterator>::iterator MRI = S_map->begin();
685 unsigned int LCOUNT = 0;
686
687 while(MRI != S_map->end())
688 {
689 if((MRI->second->get_pvalue_bg() > DIST_PV_THRESHOLD || MRI->second->o_or_u_bg() == "UNDER") &&
690 (MRI->second->get_pvalue() > DIST_PV_THRESHOLD || MRI->second->o_or_u() == "UNDER") )
691 {
692 MRI++;
693 continue;
694 }
695
696 // if(!QUIET) cerr << endl << MRI->second->name() << endl;
697
698 LCOUNT++;
699
700 map<int, double> R_dist_map;
701
702 if(!obstf.empty() && obstf == MRI->second->id())
703 {
704 map<int, unsigned int> M_hit_distr;
705
706 for(unsigned int a = 0; a < SIZE; a++)
707 {
708 map<int, unsigned int>::iterator mi;
709
710 if(!MRI->second->get_strand_at(a))
711 mi = M_hit_distr.find(MRI->second->get_pos_at(a));
712 else
713 mi = M_hit_distr.find(RSIZE - MRI->second->get_pos_at(a));
714
715 if(mi == M_hit_distr.end())
716 {
717 if(!MRI->second->get_strand_at(a))
718 M_hit_distr[MRI->second->get_pos_at(a)] = 1;
719 else
720 M_hit_distr[RSIZE - MRI->second->get_pos_at(a)] = 1;
721 }
722 else
723 mi->second++;
724 }
725
726 map<int, unsigned int>::iterator mi = M_hit_distr.begin();
727
728 while(mi != M_hit_distr.end())
729 {
730 for(int b = 0; b < RSIZE; b++)
731 {
732 int dpos = mi->first - b;
733
734 map<int, double>::iterator ri = R_dist_map.find(dpos);
735
736 if(ri == R_dist_map.end())
737 R_dist_map[dpos] = mi->second;
738 else
739 ri->second += mi->second;
740
741 }
742
743 mi++;
744 }
745
746 map<int, double>::iterator ri = R_dist_map.begin();
747 double risum = SIZE * RSIZE;
748
749 out2 << MRI->second->name() << '\t';
750
751 while(ri != R_dist_map.end())
752 {
753 ri->second /= risum;
754 out2 << ri->second*SIZE << '\t';
755 ri++;
756 }
757
758 out2 << endl;
759 }
760
761 // multimap <double, vector<matrix>::iterator>::iterator MRI2 = MRI;
762 multimap <double, vector<matrix>::iterator>::iterator MRI2 = S_map->begin();
763
764 while(MRI2 != S_map->end())
765 {
766 vector<unsigned int> hit_v((RSIZE*2) - 1, 0);
767
768 if(MRI2 == MRI)
769 {
770 vector<unsigned int> SBPOS;
771 // vector<bool> SBSTRAND;
772
773 if(SINGLE_STRAND)
774 {
775 for(unsigned int i = 0; i < Seq->size(); i++)
776 {
777 bool s;
778 unsigned int P;
779 double score;
780
781 score = MRI->second->scan_seq_ss_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, MRI->second->get_pos_at(i), Seq->at(i).strand());
782 SBPOS.push_back(P);
783 // SBSTRAND.push_back(s);
784 }
785
786 }
787 else
788 {
789 for(unsigned int i = 0; i < Seq->size(); i++)
790 {
791 bool s;
792 unsigned int P;
793 double score;
794
795 score = MRI->second->scan_seq_ds_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, (int)MRI->second->get_pos_at(i));
796
797 SBPOS.push_back(P + MRI->second->length()/2);
798 // SBSTRAND.push_back(s);
799
800 }
801 }
802
803 for(unsigned int k = 0; k < SIZE; k++)
804 {
805 int dist;
806
807 if(!MRI->second->get_strand_at(k))
808 dist = (int)SBPOS.at(k) - (int)MRI->second->get_pos_at(k);
809 else
810 dist = ((int)(RSIZE-1) - (int)SBPOS.at(k)) - ((int)(RSIZE - 1) - (int)MRI->second->get_pos_at(k));
811
812 hit_v.at((RSIZE-1) + dist)++;
813 }
814
815 }
816 else
817 {
818 for(unsigned int k = 0; k < SIZE; k++)
819 {
820 int dist;
821
822 if(!MRI->second->get_strand_at(k))
823 dist = (int)MRI2->second->get_pos_at(k) - (int)MRI->second->get_pos_at(k);
824 else
825 dist = ((int)(RSIZE - 1 ) - (int)MRI2->second->get_pos_at(k)) - ((int)(RSIZE - 1 ) - (int)MRI->second->get_pos_at(k));
826
827 // if(!QUIET) cerr << endl << (int)((RSIZE-1) + dist) << endl;
828
829 hit_v.at((RSIZE-1) + dist)++;
830 }
831 }
832
833 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
834 out << MRI->second->name() << "|" << MRI2->second->name() << '\t';
835
836 if(!obstf.empty() && obstf == MRI->second->id())
837 out3 << MRI->second->name() << "|" << MRI2->second->name() << '\t';
838
839 for(unsigned int p = 0; p < (RSIZE*2) - 1; p++)
840 {
841 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
842 out << hit_v[p] << '\t';
843
844 if(!obstf.empty() && obstf == MRI->second->id())
845 {
846 map<int, double>::iterator ri = R_dist_map.find(p - (RSIZE - 1));
847
848 if(hit_v[p] >= DIST_HIT_THRESHOLD && hit_v[p]/(ri->second * SIZE) >= DIST_RATIO_THRESHOLD)
849 {
850 double pv = gsl_cdf_binomial_Q(hit_v[p], ri->second, SIZE);
851
852 if(pv <= DIST_PV_THRESHOLD)
853 out3 << pv << '\t';
854 else
855 out3 << "NA" << '\t';
856 }
857 else
858 out3 << "NA" << '\t';
859 }
860 }
861
862 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
863 out << endl;
864
865 if(!obstf.empty() && obstf == MRI->second->id())
866 out3 << endl;
867
868 MRI2++;
869 }
870
871 MRI++;
872 }
873
874 out.close();
875
876 if(!obstf.empty())
877 {
878 out2.close();
879 out3.close();
880 }
881
882 if(!LCOUNT)
883 {
884 remove(dist_outfile.c_str());
885
886 if(!obstf.empty())
887 {
888 remove(dist2_outfile.c_str());
889 remove(dist3_outfile.c_str());
890 }
891 }
892
893 if(!QUIET) cerr << "\nDistance analysis written in file: " << dist_outfile << endl;
894
895 return;
896 }
897
898 void scanner(vector<matrix> *Mat, vector<sequence> *Seq)
899 {
900 for(unsigned int i = 0; i < Mat->size(); i++)
901 {
902 if(VERBOSE)
903 {
904 if(!QUIET) cerr << "\nScanning sequences for "
905 << Mat->at(i).name() << " (" << Mat->at(i).id() << ")"<< endl;
906 }
907
908 Mat->at(i).scan(Seq);
909 Mat->at(i).calc_var((double)Seq->size());
910
911 if(POSITIONAL)
912 Mat->at(i).calc_W_param((double)Seq->size());
913
914 if(VERBOSE)
915 {
916 if(!QUIET) cerr << "\nRegions avg = " << Mat->at(i).R_Avg << '\t' << "Flanks avg = " << Mat->at(i).F_Avg << '\t'
917 << "Regions devstd = " << pow(Mat->at(i).R_Var,0.5) << '\t' << "Flanks devstd = " << pow(Mat->at(i).F_Var,0.5);
918 if(!QUIET) cerr << "\nDoing Welch's t test... ";
919 }
920
921 Mat->at(i).welch_test((double)Seq->size());
922
923 if(Mat->at(i).have_bg())
924 Mat->at(i).welch_test_bg((double)Seq->size());
925
926 if(POSITIONAL)
927 {
928 // Mat->at(i).welch_test_pos((double)Seq->size());
929 Mat->at(i).z_test_pos((double)Seq->size());
930 }
931
932 if(VERBOSE)
933 {
934 if(!QUIET) cerr << "done. Pvalue = " << Mat->at(i).get_pvalue() << ' ' << Mat->at(i).o_or_u() << endl;
935 if(!QUIET) cerr << endl;
936 }
937 }
938
939 return;
940 }
941
942 void acquire_matrices(vector<matrix> *Mat, map<string,bg> *Bg, string *matfile)
943 {
944 ifstream in(matfile->c_str());
945 string line;
946
947 if(!in)
948 {
949 if(!QUIET) cerr << "Can't find " << *matfile << endl;
950 exit(1);
951 }
952
953 while(getline(in,line))
954 {
955 if(line.empty())
956 continue;
957
958 vector<string> mbuf;
959 mbuf.push_back(line);
960
961 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
962 {
963 getline(in,line);
964
965 if(!line.empty())
966 mbuf.push_back(line);
967 }
968
969 if(mbuf.size() == ALPHABET_SIZE + 1)
970 Mat->push_back(matrix(&mbuf,Bg));
971 }
972
973 in.close();
974
975 vector<matrix>::iterator vi = Mat->begin();
976
977 while(vi != Mat->end())
978 {
979 if(!vi->is_good())
980 {
981 vi = Mat->erase(vi);
982 continue;
983 }
984
985 vi++;
986 }
987
988 return;
989 }
990
991 void acquire_background(map<string, bg> *Bg)
992 {
993 ifstream in(bgfile.c_str());
994
995 if(!in)
996 {
997 if(!QUIET) cerr << "\nWarning: background file " << bgfile << " not found..." << endl;
998 return;
999 }
1000
1001 string line;
1002
1003 getline(in,line);
1004
1005 while(getline(in, line))
1006 {
1007 istringstream str(line);
1008 string id, trash;
1009 bg nbg;
1010
1011 str >> trash >> id >> nbg.Size >> trash >> trash >> trash >> trash >> nbg.Avg >> nbg.Var;
1012
1013 nbg.Var = pow(nbg.Var,2);
1014
1015 Bg->insert(make_pair(id,nbg));
1016 }
1017
1018 in.close();
1019 return;
1020 }
1021
1022 void acquire_regions(vector<sequence> *Seq)
1023 {
1024 ifstream in(regfile.c_str());
1025 // map<string, vector<string> > Regs;
1026 map<string, map<unsigned int, string> > Regs; //PREVENT REG DUPLICATES
1027 map<string, map<unsigned int, unsigned int > > M_RANK;
1028
1029 if(!in)
1030 {
1031 if(!QUIET) cerr << "Can't find " << regfile << endl;
1032 exit(1);
1033 }
1034
1035 string line;
1036
1037 unsigned int r_count = 0;
1038
1039 while(getline(in,line))
1040 {
1041 if(line.empty())
1042 continue;
1043
1044 if(line[0] == '#')
1045 continue;
1046
1047 for(unsigned int i = 0; i < 3; i++)
1048 line[i] = tolower(line[i]);
1049
1050 for(unsigned int i = 3; i < line.size(); i++)
1051 line[i] = toupper(line[i]);
1052
1053 istringstream str(line);
1054 string chr;
1055 unsigned int start, end, mp;
1056
1057 str >> chr >> start >> end;
1058
1059 // Regs[chr].push_back(line);
1060 mp = start + ((end - start)/2);
1061
1062 if(mp > (RSIZE + RSIZE/2) + 1)
1063 {
1064 Regs[chr][mp] = line;
1065
1066 // if(!QUIET) cerr << endl << line;
1067
1068 if(M_RANK.find(chr) != M_RANK.end()) //PREVENT REGIONS PRESENT MORE THAN ONCE TO DO TOO MUCH MESS WITH SPEARMAN CORRELATION
1069 if(M_RANK[chr].find(mp) != M_RANK[chr].end())
1070 continue;
1071
1072 M_RANK[chr][mp] = r_count;
1073 r_count++;
1074 }
1075 else
1076 if(!QUIET) cerr << "\nSkipping region: too close to chromosome start..." << endl << line << endl;
1077 }
1078
1079 in.close();
1080
1081 map<string, map<unsigned int, string> >::iterator mi = Regs.begin();
1082
1083 if(!TWOBIT)
1084 {
1085 while(mi != Regs.end())
1086 {
1087 string file = genome;
1088 file += "/";
1089 file += mi->first;
1090 file += ".raw";
1091
1092 in.open(file.c_str());
1093
1094 if(!in)
1095 {
1096 if(!QUIET) cerr << "\nCan't find " << file << ". Skipping chromosome:\n" << mi->first << endl;
1097
1098 if(SPEARMAN)
1099 {
1100 if(!QUIET) cerr << "\nSpearman correlation disabled..." << endl;
1101 SPEARMAN = false;
1102 }
1103
1104 mi++;
1105 in.close();
1106 continue;
1107 }
1108
1109 map<unsigned int, string>::iterator si = mi->second.begin();
1110
1111 while(si != mi->second.end())
1112 {
1113 Seq->push_back(sequence(&si->second, &in, M_RANK[mi->first][si->first]));
1114 si++;
1115 }
1116
1117 in.close();
1118 mi++;
1119 }
1120 }
1121 else
1122 {
1123 ostringstream str;
1124 ofstream outfile;
1125 str << ".pchip." << rand() << ".tmp";
1126 string tmp1 = str.str();
1127 outfile.open(tmp1.c_str(), ios::out);
1128 vector<string> k1;
1129 vector<unsigned int> k2;
1130
1131 while(mi != Regs.end())
1132 {
1133 map<unsigned int, string>::iterator si = mi->second.begin();
1134
1135 while(si != mi->second.end())
1136 {
1137 unsigned int Start, End;
1138
1139 Start = si->first - RSIZE;
1140 End = si->first + RSIZE + 1 + MAX_L;
1141
1142 outfile << mi->first << '\t' << Start << '\t' << End << '\t' << "." << endl;
1143
1144 k1.push_back(mi->first);
1145 k2.push_back(si->first);
1146
1147 si++;
1148 }
1149
1150 mi++;
1151 }
1152
1153 string cline = "twoBitToFa -bedPos -bed=";
1154 cline += str.str();
1155 str << ".fa";
1156 cline += " ";
1157 cline += genome;
1158 /* cline += "/";
1159 cline += genome;
1160 cline += ".2bit "; */
1161 cline += " ";
1162 cline += str.str();
1163
1164 // if(!QUIET) cerr << endl << "TwoBit command line:\n" << cline << endl;
1165
1166 if(system(cline.c_str()) != 0)
1167 {
1168 if(!QUIET) cerr << endl << "twoBitToFa returned non 0 error code... aborting pscan_chip run!\n" << endl;
1169 exit(1);
1170 }
1171 else
1172 {
1173 ifstream in(str.str().c_str());
1174 string seq;
1175 seq.clear();
1176
1177 if(!in)
1178 {
1179 if(!QUIET) cerr << endl << "Can't access twoBitToFa output file! Aborting pscan_chip run!\n" << endl;
1180 exit(1);
1181 }
1182
1183 unsigned int scount = 0;
1184
1185 while(getline(in,line))
1186 {
1187 if(line.empty())
1188 continue;
1189
1190 if(line[0] == '>')
1191 {
1192 if(seq.empty())
1193 continue;
1194 else
1195 {
1196 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount)));
1197 seq.clear();
1198 scount++;
1199 continue;
1200 }
1201 }
1202 else
1203 seq += line;
1204 }
1205
1206 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); //LAST SEQUENCE!
1207
1208 in.close();
1209 remove(str.str().c_str());
1210 remove(tmp1.c_str());
1211 }
1212
1213 // exit(1);
1214 }
1215
1216 vector<sequence>::iterator vi = Seq->begin();
1217
1218 while(vi != Seq->end())
1219 {
1220 if(!vi->is_good())
1221 {
1222 vi = Seq->erase(vi);
1223 continue;
1224 }
1225
1226 vi++;
1227 }
1228
1229 return;
1230 }
1231
1232 void command_line_parser(int argc, char **argv)
1233 {
1234 if(argc == 1)
1235 return;
1236
1237 for(int i = 1; i < argc; i++)
1238 {
1239 string buf = argv[i];
1240
1241 if(buf == "-r")
1242 {
1243 if(i < argc - 1)
1244 regfile = argv[++i];
1245 continue;
1246 }
1247
1248 if(buf == "-g")
1249 {
1250 if(i < argc - 1)
1251 genome = argv[++i];
1252 continue;
1253 }
1254
1255 if(buf == "-s")
1256 {
1257 if(i < argc - 1)
1258 RSIZE = atoi(argv[++i]);
1259 continue;
1260 }
1261 if(buf == "-bgavg")
1262 {
1263 if(i < argc - 1)
1264 SINGLE_MATRIX_BG_AVG = atof(argv[++i]);
1265 continue;
1266 }
1267 if(buf == "-M")
1268 {
1269 if(i < argc - 1)
1270 matrixfile = argv[++i];
1271 continue;
1272 }
1273 if(buf == "-MADD")
1274 {
1275 if(i < argc - 1)
1276 additional_matrixfile = argv[++i];
1277 continue;
1278 }
1279 if(buf == "-m")
1280 {
1281 if(i < argc - 1)
1282 selected = argv[++i];
1283 continue;
1284 }
1285
1286 if(buf == "-bg")
1287 {
1288 if(i < argc - 1)
1289 bgfile = argv[++i];
1290 continue;
1291 }
1292 if(buf == "-go")
1293 {
1294 if(i < argc - 1)
1295 GALAXY_OUTPUT = argv[++i];
1296 continue;
1297 }
1298 if(buf == "-obs")
1299 {
1300 if(i < argc - 1)
1301 obstf = argv[++i];
1302
1303 continue;
1304 }
1305 if(buf == "-ss")
1306 {
1307 SINGLE_STRAND = true;
1308 continue;
1309 }
1310 if(buf == "-2bit")
1311 {
1312 TWOBIT = true;
1313 continue;
1314 }
1315 if(buf == "-quiet")
1316 {
1317 QUIET = true;
1318 continue;
1319 }
1320 if(buf == "-sbp")
1321 {
1322 SECOND_BEST_POS = true;
1323 continue;
1324 }
1325 if(buf == "-fasta")
1326 {
1327 GET_SEQUENCES = true;
1328 continue;
1329 }
1330 if(buf == "-pos")
1331 {
1332 POSITIONAL = true;
1333 continue;
1334 }
1335 if(buf == "-spear")
1336 {
1337 SPEARMAN = true;
1338 continue;
1339 }
1340 if(buf == "-wsize")
1341 {
1342 if(i < argc - 1)
1343 W_SIZE = atoi(argv[++i]);
1344
1345 W_STEP = W_SIZE / 2;
1346 continue;
1347 }
1348 /* if(buf == "-wstep")
1349 {
1350 if(i < argc - 1)
1351 W_STEP = atoi(argv[++i]);
1352 continue;
1353 }
1354
1355 if(buf == "-fast")
1356 {
1357 FAST_SCAN = true;
1358 continue;
1359 }*/
1360 /* if(buf == "-naive")
1361 {
1362 NAIVE = true;
1363 continue;
1364 }
1365 */
1366 if(buf == "-v")
1367 {
1368 VERBOSE = true;
1369 continue;
1370 }
1371 else
1372 {
1373 if(!QUIET) cerr << "\nUnknown option: " << buf << endl;
1374 exit(1);
1375 }
1376
1377 /* if(buf == "-Q")
1378 {
1379 if(i < argc - 1)
1380 idfile = argv[++i];
1381 continue;
1382 }
1383 if(buf == "-ql")
1384 {
1385 if(i < argc - 1)
1386 q_idlist = argv[++i];
1387 continue;
1388 }
1389 if(buf == "-imgm")
1390 {
1391 if(i < argc - 1)
1392 bg_for_img = argv[++i];
1393 continue;
1394 }
1395 if(buf == "-magic")
1396 {
1397 if(i < argc - 1)
1398 magic = argv[++i];
1399 continue;
1400 }*/
1401 }
1402
1403 if(RSIZE > MAX_REG_SIZE)
1404 {
1405 if(!QUIET) cerr << "\nRegion size exceeds " << MAX_REG_SIZE << "..." << endl;
1406 exit(1);
1407 }
1408
1409 if(RSIZE % W_SIZE != 0 || W_SIZE % 2 != 0 || RSIZE / W_SIZE < 5)
1410 {
1411 if(!QUIET) cerr << "\nWindow size must be even and both an exact divisor of region size and at least five times smaller of it: positional analysis disabled." << endl;
1412 POSITIONAL = false;
1413 }
1414
1415 return;
1416 }
1417
1418 //SEQUENCE CLASS BEGIN
1419
1420 sequence::sequence(string *seq, unsigned int R, string chr, unsigned int mp) //CONSTRUCTOR FOR 2BIT INPUT
1421 {
1422 Chr = chr;
1423 Strand = '+';
1424 AntiStrand = '-';
1425 E_Rank = R;
1426 good = true;
1427
1428 Start = mp - RSIZE/2;
1429 End = (mp + RSIZE/2) - 1;
1430
1431 for(unsigned int i = 0; i < seq->size(); i++)
1432 seq->at(i) = toupper(seq->at(i));
1433
1434 Seq = new char[(RSIZE*2) + 1 + MAX_L];
1435
1436 // if(!QUIET) cerr << "\nseq.size = " << seq->size() << endl;
1437 // if(!QUIET) cerr << "\narray.size = " << (RSIZE*2) + 1 + MAX_L << endl;
1438
1439 strcpy(Seq, seq->c_str());
1440
1441 if(seq->size() != RSIZE*2 + MAX_L + 1)
1442 {
1443 good = false;
1444 return;
1445 }
1446
1447 if(GET_SEQUENCES)
1448 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << endl << seq->substr(RSIZE/2, RSIZE) << endl;
1449
1450 return;
1451 }
1452
1453 sequence::sequence(string *line, ifstream *in, unsigned int R)
1454 {
1455 istringstream str(*line);
1456 string trash;
1457 int buf_s, buf_e, mp;
1458 good = true;
1459
1460 E_Rank = R;
1461
1462 str >> Chr >> buf_s >> buf_e >> Name >> trash >> Strand;
1463
1464 if(Strand == 'R')
1465 Strand = '-';
1466
1467 if(Strand != '+' && Strand != '-')
1468 Strand = '+';
1469
1470 if(Strand == '+')
1471 AntiStrand = '-';
1472 else
1473 AntiStrand = '+';
1474
1475 mp = buf_s + ((buf_e - buf_s)/2);
1476
1477 Start = mp - RSIZE/2;
1478 End = (mp + RSIZE/2) - 1;
1479
1480 // Seq = new char[RSIZE + 1];
1481 // UF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2];
1482 // DF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2];
1483
1484 Seq = new char[(RSIZE*2) + 1 + MAX_L];
1485
1486 in->seekg((Start) - RSIZE/2);
1487
1488 in->get(Seq, (RSIZE*2 + 1) + MAX_L);
1489
1490 if(in->gcount() != RSIZE*2 + MAX_L)
1491 {
1492 good = false;
1493 return;
1494 }
1495
1496 if(GET_SEQUENCES)
1497 {
1498 char *out_seq;
1499 out_seq = new char[RSIZE + 1];
1500
1501 in->seekg(Start);
1502 in->get(out_seq, (RSIZE + 1));
1503
1504 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << '\t' << "---" << '\t' << buf_s << '\t' << buf_e << '\t' << endl << out_seq << endl;
1505
1506 delete[] out_seq;
1507 }
1508
1509 /*in->seekg(((Start - 1) - RSIZE/2) - (MAX_L/2));
1510
1511 in->get(UF_Seq, RSIZE/2 + 1 + MAX_L /2);
1512
1513 if(in->gcount() != RSIZE/2 + (MAX_L/2))
1514 {
1515 good = false;
1516 return;
1517 }
1518
1519 in->get(Seq, RSIZE + 1);
1520
1521 if(in->gcount() != RSIZE)
1522 {
1523 good = false;
1524 return;
1525 }
1526
1527 in->get(DF_Seq, RSIZE/2 + 1 + (MAX_L/2));
1528
1529 if(in->gcount() != RSIZE/2 + (MAX_L/2))
1530 good = false;
1531 */
1532
1533 return;
1534 }
1535
1536 unsigned int sequence::e_rank()
1537 {
1538 return E_Rank;
1539 }
1540
1541 string sequence::chr()
1542 {
1543 return Chr;
1544 }
1545
1546 /*string* sequence::seq()
1547 {
1548 return &Seq;
1549 }*/
1550
1551 char* sequence::seq()
1552 {
1553 return Seq;
1554 }
1555
1556 char sequence::strand()
1557 {
1558 return Strand;
1559 }
1560
1561 char sequence::antistrand()
1562 {
1563 return AntiStrand;
1564 }
1565
1566 /*char* sequence::uf_seq()
1567 {
1568 return UF_Seq;
1569 }
1570
1571 char* sequence::df_seq()
1572 {
1573 return DF_Seq;
1574 }
1575 */
1576 string sequence::name()
1577 {
1578 return Name;
1579 }
1580
1581 unsigned int sequence::start()
1582 {
1583 return Start;
1584 }
1585
1586 unsigned int sequence::end()
1587 {
1588 return End;
1589 }
1590
1591 bool sequence::is_good()
1592 {
1593 return good;
1594 }
1595
1596 // SEQUENCE CLASS END
1597
1598 // MATRIX CLASS BEGIN
1599
1600 matrix::matrix(vector<string>* mbuf, map<string, bg> *Bg)
1601 {
1602 string buf;
1603 istringstream bstr(mbuf->at(1)), nstr(mbuf->at(0));
1604 L = 0;
1605 R_Avg = 0;
1606 F_Avg = 0;
1607 R_Var = 0;
1608 F_Var = 0;
1609 bg_Avg = 0;
1610 bg_Var = 0;
1611
1612 BIN_NUM = RSIZE / W_STEP;
1613
1614 W_R_Avg.resize(BIN_NUM, 0);
1615 W_R_Var.resize(BIN_NUM, 0);
1616 W_R_pvalue.resize(BIN_NUM, 0);
1617 W_R_T_pvalue.resize(BIN_NUM,0);
1618 spearman = 42;
1619
1620 O_U = false;
1621 O_U_bg = false;
1622 W_O_U.resize(BIN_NUM, 0);
1623
1624 BG = false;
1625 good = true;
1626 norm_sec_step = false;
1627
1628 while(bstr >> buf)
1629 L++;
1630
1631 if(L > RSIZE/2)
1632 {
1633 good = false;
1634 return;
1635 }
1636
1637 if(L > MAX_L)
1638 MAX_L = L;
1639
1640 if(!mbuf->at(0).empty())
1641 {
1642 if(mbuf->at(0).at(0) != '>')
1643 {
1644 good = false;
1645 return;
1646 }
1647 }
1648 else
1649 {
1650 good = false;
1651 return;
1652 }
1653
1654 // Name = mbuf->at(0).substr(1);
1655 nstr >> Id >> Name;
1656
1657 Id = Id.substr(1);
1658
1659 if(!selected.empty())
1660 {
1661 // cerr << endl << selected << '\t' << Id << '\t' << Name << endl;
1662
1663 if(selected != Id && selected != Name)
1664 {
1665 good = false;
1666 return;
1667 }
1668 }
1669
1670 if(!Bg->empty())
1671 {
1672 map<string,bg>::iterator mi = Bg->find(Id);
1673
1674 if(mi == Bg->end())
1675 mi = Bg->find(Name);
1676
1677 if(mi != Bg->end())
1678 {
1679 bg_Avg = mi->second.Avg;
1680 bg_Var = mi->second.Var;
1681 bg_Size = mi->second.Size;
1682 BG = true;
1683 }
1684 else
1685 if(!QUIET) cerr << "\nWarning: no background for matrix: " << Id << endl;
1686 }
1687
1688 M = new double*[L];
1689 rc_M = new double*[L];
1690
1691 for(unsigned int i = 1; i < ALPHABET_SIZE + 1; i++)
1692 {
1693 istringstream str(mbuf->at(i));
1694
1695 unsigned int count = 0;
1696
1697 while(str >> buf)
1698 {
1699 if(i == 1)
1700 {
1701 M[count] = new double[ALPHABET_SIZE + 1];
1702 rc_M[count] = new double[ALPHABET_SIZE + 1];
1703 }
1704
1705 M[count][i-1] = atof(buf.c_str());
1706 count++;
1707
1708 if(count > L)
1709 {
1710 good = false;
1711 return;
1712 }
1713 }
1714
1715 if(count != L)
1716 {
1717 good = false;
1718 return;
1719 }
1720 }
1721
1722 normalizer();
1723 to_log();
1724 rev_c();
1725 avg();
1726 max_min();
1727 // set_top_pos();
1728
1729 if(min_score == max_score)
1730 good = false;
1731
1732 return;
1733 }
1734
1735 void matrix::display()
1736 {
1737 if(!QUIET) cerr << "\n>" << Name << '\t' << good << endl;
1738
1739 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
1740 {
1741 for(unsigned int j = 0; j < L; j++)
1742 if(!QUIET) cerr << M[j][i] << '\t';
1743
1744 if(!QUIET) cerr << endl;
1745 }
1746
1747 if(!QUIET) cerr << endl;
1748
1749 // for(unsigned int i = 0; i < TOP_P; i++)
1750 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << top_pos[i] + 1 << "-" << top_char[i] << '\t';
1751
1752 if(!QUIET) cerr << "\n\n>" << Name << '\t' << good << '\t' << "RC" << endl;
1753
1754 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
1755 {
1756 for(unsigned int j = 0; j < L; j++)
1757 if(!QUIET) cerr << rc_M[j][i] << '\t';
1758
1759 if(!QUIET) cerr << endl;
1760 }
1761
1762 if(!QUIET) cerr << endl;
1763
1764 // for(unsigned int i = 0; i < TOP_P; i++)
1765 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << rc_top_pos[i] + 1 << "-" << rc_top_char[i] << '\t';
1766
1767 if(!QUIET) cerr << endl;
1768
1769 return;
1770 }
1771
1772 bool matrix::is_good()
1773 {
1774 return good;
1775 }
1776
1777
1778 void matrix::normalizer()
1779 {
1780 double *sums = new double[L];
1781
1782 for(int x = 0; x < L; x++)
1783 {
1784 sums[x] = 0;
1785
1786 for(int y = 0; y < ALPHABET_SIZE; y++)
1787 sums[x] += M[x][y];
1788 }
1789
1790 for(int x = 0; x < L; x++)
1791 for(int y = 0; y < ALPHABET_SIZE; y++)
1792 M[x][y] /= sums[x];
1793
1794 delete[] sums;
1795
1796 if(!norm_sec_step)
1797 {
1798 for(int x = 0; x < L; x++)
1799 for(int y = 0; y < ALPHABET_SIZE; y++)
1800 M[x][y] += MIN_ADD_FREQ;
1801
1802 norm_sec_step = true;
1803
1804 normalizer();
1805 }
1806
1807 return;
1808
1809 }
1810
1811 void matrix::to_log()
1812 {
1813 for(int x = 0; x < L; x++)
1814 for(int y = 0; y < ALPHABET_SIZE; y++)
1815 M[x][y] = log(M[x][y]);
1816
1817 return;
1818 }
1819
1820 void matrix::avg()
1821 {
1822 // mat_avg = new double[L];
1823
1824 for(int x = 0; x < L; x++)
1825 {
1826 double buf = 0;
1827
1828 for(int y = 0; y < ALPHABET_SIZE; y++)
1829 buf += M[x][y];
1830
1831 // mat_avg[x] = buf/ALPHABET_SIZE;
1832 M[x][ALPHABET_SIZE] = buf/ALPHABET_SIZE;
1833 rc_M[(L-1) -x][ALPHABET_SIZE] = buf/ALPHABET_SIZE;
1834 }
1835
1836 return;
1837 }
1838
1839 void matrix::max_min()
1840 {
1841 double min, max;
1842
1843 max_score = 0;
1844 min_score = 0;
1845
1846 for(int x = 0; x < L; x++)
1847 {
1848 min = M[x][0];
1849 max = M[x][0];
1850
1851 for(int y = 1; y < ALPHABET_SIZE; y++)
1852 {
1853 if(M[x][y] < min)
1854 min = M[x][y];
1855
1856 if(M[x][y] > max)
1857 max = M[x][y];
1858 }
1859
1860 min_score += min;
1861 max_score += max;
1862 }
1863
1864 return;
1865 }
1866
1867 void matrix::rev_c()
1868 {
1869 for(unsigned int x = 0; x < L; x++)
1870 for(unsigned int y = 0; y < ALPHABET_SIZE; y++)
1871 rc_M[(L - 1) - x][(ALPHABET_SIZE - 1) - y] = M[x][y];
1872
1873 return;
1874 }
1875
1876 void matrix::scan(vector<sequence> *Seq)
1877 {
1878 v_score.reserve(Seq->size());
1879 vF_score.reserve(Seq->size());
1880 v_pos.reserve(Seq->size());
1881 v_strand.reserve(Seq->size());
1882
1883 if(POSITIONAL)
1884 W_R_v_score.reserve(Seq->size());
1885
1886 unsigned int progress;
1887
1888 if(!SINGLE_STRAND)
1889 {
1890 for(unsigned int i = 0; i < Seq->size(); i++)
1891 {
1892 if(VERBOSE)
1893 {
1894 progress = (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100);
1895
1896 if(!(progress%5))
1897 if(!QUIET) cerr << progress << '%';
1898 }
1899
1900 double max_flank_u, max_flank_d;
1901 vector<double> W_buf(BIN_NUM, 0);
1902
1903 {
1904 double r_score;
1905
1906 max_flank_u = scan_seq_ds(Seq->at(i).seq() , true, RSIZE/2, &W_buf);
1907 r_score = scan_seq_ds(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, &W_buf);
1908 max_flank_d = scan_seq_ds(Seq->at(i).seq() + ((RSIZE/2)*3) , true, RSIZE/2, &W_buf);
1909
1910 R_Avg += r_score;
1911
1912 /* if(r_score < max_flank_u || r_score < max_flank_d)
1913 {
1914 if(max_flank_u > max_flank_d)
1915 G_Avg += max_flank_u;
1916 else
1917 G_Avg += max_flank_d;
1918 }
1919 else
1920 G_Avg += r_score;*/
1921
1922 if(POSITIONAL)
1923 {
1924 // for(unsigned int i = 0; i < BIN_NUM; i++)
1925 // W_R_Avg[i] += W_buf[i];
1926
1927 W_R_v_score.push_back(W_buf);
1928 }
1929
1930 }
1931
1932 if(max_flank_u > max_flank_d)
1933 {
1934 F_Avg += max_flank_u;
1935 vF_score.push_back(max_flank_u);
1936 }
1937 else
1938 {
1939 F_Avg += max_flank_d;
1940 vF_score.push_back(max_flank_d);
1941 }
1942
1943
1944 if(VERBOSE)
1945 if(!(progress%5))
1946 if(!QUIET) cerr << (char)13;
1947 }
1948 }
1949 else
1950 {
1951 for(unsigned int i = 0; i < Seq->size(); i++)
1952 {
1953
1954 if(VERBOSE)
1955 if(!QUIET) cerr << (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100) << '%';
1956
1957 double max_flank_u, max_flank_d;
1958 vector<double> W_buf(BIN_NUM, 0);
1959
1960 {
1961
1962 double r_score;
1963
1964 max_flank_u = scan_seq_ss(Seq->at(i).seq() , true, RSIZE/2, Seq->at(i).strand(), &W_buf);
1965 r_score += scan_seq_ss(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, Seq->at(i).strand(), &W_buf);
1966 max_flank_d = scan_seq_ss(Seq->at(i).seq() + ((RSIZE/2)*3), true, RSIZE/2, Seq->at(i).strand(), &W_buf);
1967
1968 R_Avg += r_score;
1969
1970 /* if(r_score < max_flank_u || r_score < max_flank_d)
1971 {
1972 if(max_flank_u > max_flank_d)
1973 G_Avg += max_flank_u;
1974 else
1975 G_Avg += max_flank_d;
1976 }
1977 else
1978 G_Avg += r_score;*/
1979
1980 if(POSITIONAL)
1981 {
1982 // for(unsigned int i = 0; i < BIN_NUM; i++)
1983 // W_R_Avg[i] += W_buf[i];
1984
1985 W_R_v_score.push_back(W_buf);
1986 }
1987 }
1988
1989 if(max_flank_u > max_flank_d)
1990 {
1991 F_Avg += max_flank_u;
1992 vF_score.push_back(max_flank_u);
1993 }
1994 else
1995 {
1996 F_Avg += max_flank_d;
1997 vF_score.push_back(max_flank_d);
1998 }
1999
2000 if(VERBOSE)
2001 if(!QUIET) cerr << (char)13;
2002 }
2003 }
2004
2005 R_Avg /= Seq->size();
2006 F_Avg /= Seq->size();
2007 // G_Avg /= Seq->size();
2008
2009 // for(unsigned int i = 0; i < BIN_NUM; i++)
2010 // W_R_Avg[i] /= Seq->size();
2011
2012 if(SPEARMAN)
2013 {
2014 multimap<double, unsigned int> SRM;
2015
2016 for(unsigned int i = 0; i < v_score.size(); i++)
2017 SRM.insert(make_pair(v_score[i], i));
2018
2019 multimap<double, unsigned int>::reverse_iterator mmi = SRM.rbegin();
2020 int sr = 0;
2021 double d = 0;
2022
2023 while(mmi != SRM.rend())
2024 {
2025 // if(!QUIET) cerr << endl << mmi->first << '\t' << sr << '\t' << Seq->at(mmi->second).e_rank() << '\t' << this->name();
2026
2027 d += pow((sr - (int)Seq->at(mmi->second).e_rank()), 2);
2028 sr++;
2029 mmi++;
2030 }
2031
2032 d *= 6;
2033
2034 // if(!QUIET) cerr << endl << "d = " << d << "\tden = " << (Seq->size() * (pow(Seq->size(), 2) - 1)) << '\t' << "ssize = " << Seq->size();
2035
2036 d /= (Seq->size() * (pow(Seq->size(), 2) - 1));
2037
2038 spearman = 1 - d;
2039 }
2040
2041 return;
2042 }
2043
2044 void matrix::calc_var(double S)
2045 {
2046 for(unsigned int i = 0; i < S; i++)
2047 {
2048 R_Var += pow(v_score[i] - R_Avg,2);
2049 F_Var += pow(vF_score[i] - F_Avg,2);
2050
2051 /* if(v_score[i] >= vF_score[i])
2052 G_Var += pow(v_score[i] - G_Avg,2);
2053 else
2054 G_Var += pow(vF_score[i] - G_Avg,2);*/
2055 }
2056
2057 if(S > 1)
2058 {
2059 R_Var /= (S - 1);
2060 F_Var /= (S - 1);
2061 // G_Var /= (S - 1);
2062 }
2063
2064 if(R_Var == 0)
2065 R_Var += EPSILON;
2066
2067 if(F_Var == 0)
2068 F_Var += EPSILON;
2069
2070 // if(G_Var == 0)
2071 // G_Var += EPSILON;
2072
2073 return;
2074 }
2075
2076 void matrix::calc_W_param(double S)
2077 {
2078 if(S > 1)
2079 S = S/2; //SOLO PER LA PRIMA META' DELLE SEQUENZE!!!
2080
2081 W_Avg = 0;
2082 W_Var = 0;
2083
2084 multimap<double, unsigned int> score_seq_map;
2085
2086 for(unsigned int i = 0; i < v_score.size(); i++)
2087 score_seq_map.insert(make_pair(v_score.at(i), i));
2088
2089 multimap<double, unsigned int>::reverse_iterator mi = score_seq_map.rbegin();
2090
2091 for(unsigned int i = 0; i < S; i++)
2092 {
2093 for(unsigned int j = 0; j < BIN_NUM; j++)
2094 W_R_Avg[j] += W_R_v_score[mi->second][j];
2095
2096 // if(!QUIET) cerr << Name << '\t' << mi->first << endl;
2097
2098 mi++;
2099 }
2100
2101 for(unsigned int j = 0; j < BIN_NUM; j++)
2102 W_R_Avg[j] /= S;
2103
2104 mi = score_seq_map.rbegin();
2105
2106 for(unsigned int i = 0; i < S; i++)
2107 {
2108 for(unsigned int j = 0; j < BIN_NUM; j++)
2109 W_R_Var[j] += pow(W_R_v_score[mi->second][j] - W_R_Avg[j],2);
2110
2111 mi++;
2112 }
2113
2114 /* for(unsigned int i = 0; i < S; i++)
2115 for(unsigned int j = 0; j < BIN_NUM; j++)
2116 W_R_Var[j] += pow(W_R_v_score[i][j] - W_R_Avg[j],2);*/
2117
2118 if(S > 1)
2119 {
2120 for(unsigned int j = 0; j < BIN_NUM; j++)
2121 W_R_Var[j] /= (S - 1);
2122 }
2123
2124 for(unsigned int j = 0; j < BIN_NUM; j++)
2125 if(W_R_Var[j] == 0)
2126 W_R_Var[j] += EPSILON;
2127
2128 for(unsigned int j = 0; j < BIN_NUM - 1; j++) //THE LAST BIN IS WASTED SINCE IT IS AN HALF BIN
2129 W_Avg += W_R_Avg[j];
2130
2131 W_Avg /= (BIN_NUM - 1);
2132
2133 for(unsigned int j = 0; j < BIN_NUM - 1; j++)
2134 W_Var += pow(W_R_Avg[j] - W_Avg, 2);
2135
2136 W_Var /= (BIN_NUM - 2);
2137
2138 return;
2139 }
2140
2141 void matrix::welch_test(double S)
2142 {
2143 t = R_Avg - F_Avg;
2144
2145 if(t == 0)
2146 t = EPSILON;
2147
2148 t /= pow((R_Var/S) + (F_Var/S),0.5);
2149
2150 dof = pow((R_Var/S) + (F_Var/S),2);
2151
2152 dof /= (pow(R_Var,2)/((pow(S,2)) * (S-1))) + (pow(F_Var,2)/((pow(S,2)) * (S-1)));
2153
2154 if(t > 0)
2155 pvalue = gsl_cdf_tdist_Q(t,dof);
2156 else
2157 {
2158 pvalue = gsl_cdf_tdist_P(t,dof);
2159 O_U = true;
2160 }
2161
2162 return;
2163 }
2164
2165 void matrix::z_test_pos(double S)
2166 {
2167 // if(!QUIET) cerr << endl << Name;
2168
2169 for(unsigned int i = 0; i < BIN_NUM; i++)
2170 {
2171 double se = sqrt(W_R_Var.at(i)), z;
2172 se /= sqrt(S/2);
2173
2174 z = W_R_Avg.at(i) - W_Avg;
2175 z /=se;
2176
2177 if(z >= 0)
2178 W_O_U.at(i) = true;
2179
2180 W_R_pvalue.at(i) = gsl_cdf_ugaussian_Q(z);
2181
2182 // if(!QUIET) cerr << endl << "W_R_Avg.at(i) " << W_R_Avg.at(i) << '\t' << W_Avg << '\t' << W_R_pvalue.at(i) << endl;
2183
2184 }
2185
2186 return;
2187 }
2188
2189 void matrix::welch_test_pos(double S)
2190 {
2191 if(S > 1)
2192 S = S/2; //SOLO LA PRIMA META' DELLE SEQUENZE!!!
2193
2194 // if(!QUIET) cerr << "\nRAVG = " << W_Avg << endl;
2195 for(unsigned int i = 0; i < BIN_NUM; i++)
2196 {
2197 double T, DOF;
2198
2199 T = W_R_Avg.at(i) - W_Avg;
2200
2201 // if(!QUIET) cerr << W_R_Avg.at(i) << '\t';
2202
2203 // if(!QUIET) cerr << endl << Name << endl << "T = " << T << '\t' << W_R_Avg.at(i) << '\t' << W_Avg;
2204
2205 if(T == 0)
2206 T = EPSILON;
2207
2208 T /= pow((W_R_Var.at(i)/S) + (W_Var/S),0.5);
2209
2210 DOF = pow((W_R_Var.at(i)/S) + (W_Var/S),2);
2211
2212 DOF /= (pow(W_R_Var.at(i),2)/((pow(S,2)) * (S-1))) + (pow(W_Var,2)/((pow(S,2)) * (S-1)));
2213
2214 if(T > 0)
2215 W_R_T_pvalue.at(i) = gsl_cdf_tdist_Q(T,DOF);
2216 else
2217 {
2218 W_R_T_pvalue.at(i) = gsl_cdf_tdist_P(T,DOF);
2219 W_O_U.at(i) = true;
2220 }
2221
2222 // if(!QUIET) cerr << "\tPV= " << W_R_pvalue.at(i);
2223
2224 }
2225
2226 // if(!QUIET) cerr << endl;
2227
2228 return;
2229 }
2230
2231 void matrix::welch_test_bg(double S1)
2232 {
2233 double S2 = bg_Size;
2234
2235 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl;
2236
2237 t_bg = R_Avg - bg_Avg;
2238
2239 if(t_bg == 0)
2240 t_bg = EPSILON;
2241
2242 t_bg /= pow((R_Var/S1) + (bg_Var/S2),0.5);
2243
2244 dof_bg = pow((R_Var/S1) + (bg_Var/S2),2);
2245
2246 dof_bg /= (pow(R_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1)));
2247
2248 if(t_bg > 0)
2249 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg);
2250 else
2251 {
2252 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg);
2253 O_U_bg = true;
2254 }
2255
2256 return;
2257 }
2258
2259 /*void matrix::welch_test_bg_G(double S1)
2260 {
2261 double S2 = bg_Size;
2262
2263 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl;
2264
2265 t_bg = G_Avg - bg_Avg;
2266
2267 if(t_bg == 0)
2268 t_bg = EPSILON;
2269
2270 t_bg /= pow((G_Var/S1) + (bg_Var/S2),0.5);
2271
2272 dof_bg = pow((G_Var/S1) + (bg_Var/S2),2);
2273
2274 dof_bg /= (pow(G_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1)));
2275
2276 if(t_bg > 0)
2277 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg);
2278 else
2279 {
2280 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg);
2281 O_U_bg = true;
2282 }
2283
2284 return;
2285 }*/
2286
2287 double matrix::scan_seq_ds(char *S, bool flank, unsigned int l, vector<double> *W_buf)
2288 {
2289 unsigned int pos = 0, max_pos = 0;
2290 double seq_score;
2291 bool strand = false;
2292
2293 l -= L/2;
2294
2295 while(pos < l)
2296 {
2297
2298 double pos_score[2] = {0,0};
2299
2300 for(unsigned int j = 0; j < L; j++)
2301 {
2302 pos_score[0] += get_M_value(j, S[pos+j]);
2303 pos_score[1] += get_rc_M_value(j, S[pos+j]);
2304 }
2305
2306 if(seq_score < pos_score[0] || !pos)
2307 {
2308 seq_score = pos_score[0];
2309 max_pos = pos;
2310 strand = false;
2311 }
2312 if(seq_score < pos_score[1])
2313 {
2314 seq_score = pos_score[1];
2315 max_pos = pos;
2316 strand = true;
2317 }
2318
2319 if(!flank && POSITIONAL)
2320 {
2321 unsigned int vpos = pos / W_STEP;
2322
2323 if(W_buf->at(vpos) < pos_score[0] || !(pos % W_STEP))
2324 W_buf->at(vpos) = pos_score[0];
2325
2326 if(W_buf->at(vpos) < pos_score[1])
2327 W_buf->at(vpos) = pos_score[1];
2328
2329 if(vpos)
2330 {
2331 if(W_buf->at(vpos - 1) < pos_score[0])
2332 W_buf->at(vpos - 1) = pos_score[0];
2333
2334 if(W_buf->at(vpos - 1) < pos_score[1])
2335 W_buf->at(vpos - 1) = pos_score[1];
2336 }
2337 }
2338
2339 pos++;
2340 }
2341
2342 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2343
2344 if(!flank && POSITIONAL)
2345 {
2346 // if(!QUIET) cerr << endl << Name << endl;
2347
2348 unsigned int binm = 0;
2349
2350 for(unsigned int i = 0; i < BIN_NUM; i++)
2351 {
2352 if(W_buf->at(i) != 0)
2353 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score);
2354 else
2355 binm++;
2356 }
2357
2358 BIN_NUM -= binm;
2359 }
2360
2361 if(!flank)
2362 {
2363 v_score.push_back(seq_score);
2364 v_pos.push_back(max_pos + L/2);
2365 v_strand.push_back(strand);
2366 }
2367
2368 return seq_score;
2369 }
2370
2371 double matrix::scan_seq_ss_second_best(char *S, unsigned int l, unsigned int *P, bool *s, unsigned int POS, bool seq_strand)
2372 {
2373 unsigned int pos = 0, max_pos = 0;
2374 double seq_score = min_score;
2375 bool strand = false;
2376
2377 while(pos < l)
2378 {
2379 if(pos == POS)
2380 {
2381 pos++;
2382 continue;
2383 }
2384
2385 double pos_score[2] = {0,0};
2386
2387 if(seq_strand == '+')
2388 {
2389 for(unsigned int j = 0; j < L; j++)
2390 pos_score[0] += get_M_value(j, S[pos+j]);
2391 }
2392 else
2393 {
2394 for(unsigned int j = 0; j < L; j++)
2395 pos_score[0] += get_rc_M_value(j, S[pos+j]);
2396 }
2397
2398 if(seq_score < pos_score[0])
2399 {
2400 seq_score = pos_score[0];
2401 max_pos = pos;
2402 strand = false;
2403 }
2404
2405
2406 pos++;
2407 }
2408
2409 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2410
2411 *P = max_pos;
2412 *s = strand;
2413
2414 return seq_score;
2415 }
2416
2417 double matrix::scan_seq_ds_second_best(char *S, unsigned int l, unsigned int *P, bool *s, int POS)
2418 {
2419 int pos = 0, max_pos = 0, count = 0;
2420 double seq_score;
2421 bool strand = false;
2422
2423 POS -= L/2;
2424 l -= L/2;
2425
2426 while(pos < l)
2427 {
2428 // if(pos == POS)
2429 if(pos >= (POS - L) && pos <= (POS + L))
2430 {
2431 pos++;
2432 continue;
2433 }
2434
2435 double pos_score[2] = {0,0};
2436
2437 for(unsigned int j = 0; j < L; j++)
2438 {
2439 pos_score[0] += get_M_value(j, S[pos+j]);
2440 pos_score[1] += get_rc_M_value(j, S[pos+j]);
2441 }
2442
2443 if(seq_score < pos_score[0] || !count)
2444 {
2445 seq_score = pos_score[0];
2446 max_pos = pos;
2447 strand = false;
2448 }
2449 if(seq_score < pos_score[1])
2450 {
2451 seq_score = pos_score[1];
2452 max_pos = pos;
2453 strand = true;
2454 }
2455
2456 pos++;
2457 count++;
2458 }
2459
2460 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2461
2462 *P = max_pos ;//+ L/2;
2463 *s = strand;
2464
2465 return seq_score;
2466 }
2467
2468
2469
2470 /*double matrix::scan_seq_ds_quick(char *S, bool flank, unsigned int l)
2471 {
2472 unsigned int pos = 0, max_pos = 0;
2473 double seq_score = min_score;
2474 bool strand = false;
2475
2476 while(pos < l)
2477 {
2478 double pos_score[2] = {0,0};
2479
2480 for(unsigned int j = 0; j < L; j++)
2481 {
2482 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]);
2483
2484 if((pos_score[0] + p_max[j+1]) <= seq_score)
2485 {
2486 pos_score[0] = min_score;
2487 break;
2488 }
2489
2490 }
2491
2492 for(unsigned int j = 0; j < L; j++)
2493 {
2494 pos_score[1] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]);
2495
2496 if((pos_score[1] + p_max[j+1]) <= seq_score)
2497 {
2498 pos_score[1] = min_score;
2499 break;
2500 }
2501 }
2502
2503 if(seq_score < pos_score[0])
2504 {
2505 seq_score = pos_score[0];
2506 max_pos = pos;
2507 strand = false;
2508 }
2509 if(seq_score < pos_score[1])
2510 {
2511 seq_score = pos_score[1];
2512 max_pos = pos;
2513 strand = true;
2514 }
2515
2516 pos++;
2517 }
2518
2519 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2520
2521 if(!flank)
2522 {
2523 v_score.push_back(seq_score);
2524 v_pos.push_back(max_pos);
2525 v_strand.push_back(strand);
2526 }
2527
2528
2529 return seq_score;
2530 }
2531 */
2532 double matrix::scan_seq_ss(char *S, bool flank, unsigned int l, char seq_strand, vector<double> *W_buf)
2533 {
2534 unsigned int pos = 0, max_pos = 0;
2535 double seq_score = min_score;
2536 bool strand = false;
2537
2538 while(pos < l)
2539 {
2540 double pos_score[2] = {0,0};
2541
2542 if(seq_strand == '+')
2543 {
2544 for(unsigned int j = 0; j < L; j++)
2545 pos_score[0] += get_M_value(j, S[pos+j]);
2546 }
2547 else
2548 {
2549 for(unsigned int j = 0; j < L; j++)
2550 pos_score[0] += get_rc_M_value(j, S[pos+j]);
2551 }
2552
2553 if(seq_score < pos_score[0])
2554 {
2555 seq_score = pos_score[0];
2556 max_pos = pos;
2557 strand = false;
2558 }
2559
2560 if(!flank && POSITIONAL)
2561 {
2562 unsigned int vpos = pos / W_STEP;
2563
2564 if(W_buf->at(vpos) < pos_score[0])
2565 W_buf->at(vpos) = pos_score[0];
2566
2567 if(vpos)
2568 {
2569 if(W_buf->at(vpos - 1) < pos_score[0])
2570 W_buf->at(vpos - 1) = pos_score[0];
2571 }
2572 }
2573
2574 pos++;
2575 }
2576
2577 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2578
2579 if(POSITIONAL && !flank)
2580 for(unsigned int i = 0; i < W_buf->size(); i++)
2581 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score);
2582
2583 if(!flank)
2584 {
2585 v_score.push_back(seq_score);
2586 v_pos.push_back(max_pos);
2587 v_strand.push_back(strand);
2588 }
2589
2590 return seq_score;
2591 }
2592
2593 /* double matrix::scan_seq_ss_quick(char *S, bool flank, unsigned int l, char seq_strand)
2594 {
2595 unsigned int pos = 0, max_pos = 0;
2596 double seq_score = min_score;
2597 bool strand = false;
2598
2599 while(pos < l)
2600 {
2601 double pos_score[2] = {0,0};
2602
2603 if(seq_strand == '+')
2604 {
2605 for(unsigned int j = 0; j < L; j++)
2606 {
2607 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]);
2608
2609 if((pos_score[0] + p_max[j+1]) <= seq_score)
2610 {
2611 pos_score[0] = min_score;
2612 break;
2613 }
2614
2615
2616 }
2617 }
2618 else
2619 {
2620 for(unsigned int j = 0; j < L; j++)
2621 {
2622 pos_score[0] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]);
2623
2624 if((pos_score[0] + p_max[j+1]) <= seq_score)
2625 {
2626 pos_score[0] = min_score;
2627 break;
2628 }
2629
2630 }
2631 }
2632
2633 if(seq_score < pos_score[0])
2634 {
2635 seq_score = pos_score[0];
2636 max_pos = pos;
2637 strand = false;
2638 }
2639
2640 pos++;
2641 }
2642
2643 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
2644
2645 if(!flank)
2646 {
2647 v_score.push_back(seq_score);
2648 v_pos.push_back(max_pos);
2649 v_strand.push_back(strand);
2650 }
2651
2652 return seq_score;
2653 }*/
2654
2655 double matrix::get_M_value(int pos, char C)
2656 {
2657 return M[pos][VI[C]];
2658 }
2659
2660 double matrix::get_rc_M_value(int pos, char C)
2661 {
2662 return rc_M[pos][VI[C]];
2663 }
2664
2665 /*
2666 void matrix::set_top_pos()
2667 {
2668 multimap<double, mpoint> Tmap;
2669 vector<double> p_max_buf, rc_p_max_buf;
2670
2671 for(unsigned int x = 0; x < L; x++)
2672 {
2673 for(unsigned int y = 0; y < ALPHABET_SIZE; y++)
2674 {
2675 mpoint p;
2676
2677 p.x = x;
2678 p.y = y;
2679
2680 Tmap.insert(make_pair(M[x][y],p));
2681
2682 if(!y)
2683 {
2684 p_max_buf.push_back(M[x][y]);
2685 rc_p_max_buf.push_back(rc_M[x][y]);
2686 }
2687 else
2688 {
2689 if(M[x][y] > p_max_buf.back())
2690 p_max_buf[p_max_buf.size() - 1] = M[x][y];
2691
2692 if(rc_M[x][y] > rc_p_max_buf.back())
2693 rc_p_max_buf[rc_p_max_buf.size() - 1] = rc_M[x][y];
2694 }
2695
2696 }
2697
2698 // if(!QUIET) cerr << endl << p_max_buf.back();
2699 }
2700
2701 multimap<double, mpoint>::reverse_iterator ri = Tmap.rbegin();
2702
2703 for(unsigned int i = 0; i < TOP_P; i++)
2704 {
2705 top_pos[i] = ri->second.x;
2706 top_char[i] = ALPHABET[ri->second.y];
2707
2708 rc_top_pos[i] = (L - 1) - ri->second.x;
2709 rc_top_char[i] = RC_ALPHABET[ri->second.y];
2710
2711 ri++;
2712 }
2713
2714 // if(!QUIET) cerr << endl << name() << '\t' << max_score << '\t' << min_score << endl;
2715
2716 while(p_max.size() != p_max_buf.size())
2717 {
2718 double buf = -INF;
2719 unsigned int pos;
2720
2721 for(unsigned int i = 0; i < p_max_buf.size(); i++)
2722 if(p_max_buf.at(i) > buf)
2723 {
2724 pos = i;
2725 buf = p_max_buf.at(i);
2726 }
2727
2728 p_pos.push_back(pos);
2729 rc_p_pos.push_back((L - 1) - pos);
2730
2731 p_max.push_back(buf);
2732
2733 p_max_buf[pos] = -INF;
2734 }
2735
2736 for(unsigned int i = 0; i < L; i++)
2737 {
2738 //if(!QUIET) cerr << p_max[i] << "//"<< '\t';
2739
2740 for(unsigned int j = i + 1; j < L; j++)
2741 p_max[i] += p_max[j];
2742
2743 // for(unsigned int j = i + 1; j < L; j++)
2744 // rc_p_max[i] += rc_p_max[j];
2745
2746 // if(!QUIET) cerr << p_max[i] << "(" << p_pos[i] << ")" << rc_p_max[i] << '\t';
2747 }
2748
2749 p_max.push_back(0); //WE NEED A 0 AT THE END OF THE VECTOR!
2750 // rc_p_max.push_back(0);
2751
2752 // if(!QUIET) cerr << endl;
2753
2754 return;
2755 }
2756 */
2757 double matrix::get_score_at(unsigned int a)
2758 {
2759 return v_score.at(a);
2760 }
2761 unsigned int matrix::get_pos_at(unsigned int a)
2762 {
2763 return v_pos.at(a);
2764 }
2765 bool matrix::get_strand_at(unsigned int a)
2766 {
2767 return v_strand.at(a);
2768 }
2769
2770 double matrix::max()
2771 {
2772 return max_score;
2773 }
2774
2775 double matrix::min()
2776 {
2777 return min_score;
2778 }
2779
2780 string matrix::name()
2781 {
2782 if(!Name.empty())
2783 return Name;
2784 else
2785 return Id;
2786 }
2787
2788 string matrix::id()
2789 {
2790 return Id;
2791 }
2792
2793 double matrix::get_pvalue()
2794 {
2795 return pvalue;
2796 }
2797
2798 double matrix::get_pvalue_bg()
2799 {
2800 return pvalue_bg;
2801 }
2802
2803 double matrix::get_W_pvalue(unsigned int pos)
2804 {
2805 return W_R_pvalue.at(pos);
2806 }
2807
2808 double matrix::get_W_R_Avg_at_pos(unsigned int pos)
2809 {
2810 return W_R_Avg.at(pos);
2811 }
2812
2813 string matrix::o_or_u()
2814 {
2815 if(O_U)
2816 return "UNDER";
2817 else
2818 return "OVER";
2819 }
2820
2821 string matrix::o_or_u_bg()
2822 {
2823 if(O_U_bg)
2824 return "UNDER";
2825 else
2826 return "OVER";
2827 }
2828
2829 int matrix::W_o_or_u(unsigned int pos)
2830 {
2831 if(W_O_U[pos])
2832 return -1;
2833 else
2834 return 1;
2835 }
2836
2837 double matrix::get_R_Avg()
2838 {
2839 return R_Avg;
2840 }
2841 double matrix::get_F_Avg()
2842 {
2843 return F_Avg;
2844 }
2845 double matrix::get_R_Var()
2846 {
2847 return R_Var;
2848 }
2849 double matrix::get_F_Var()
2850 {
2851 return F_Var;
2852 }
2853 double matrix::get_bg_Avg()
2854 {
2855 return bg_Avg;
2856 }
2857
2858 double matrix::get_t()
2859 {
2860 return t;
2861 }
2862
2863 double matrix::get_dof()
2864 {
2865 return dof;
2866 }
2867
2868 bool matrix::have_bg()
2869 {
2870 return BG;
2871 }
2872
2873 unsigned int matrix::bin_num()
2874 {
2875 return BIN_NUM;
2876 }
2877
2878 unsigned int matrix::length()
2879 {
2880 return L;
2881 }
2882
2883 double matrix::get_spearman()
2884 {
2885 double s = spearman;
2886
2887 s *= 10000;
2888
2889 if(s > 0)
2890 s = floor(s);
2891 else if(s < 0)
2892 s = ceil(s);
2893
2894 s /= 10000;
2895
2896 return s;
2897 }
2898
2899 double matrix::get_W_Avg()
2900 {
2901 return W_Avg;
2902 }
2903
2904 double matrix::get_W_R_T_pvalue(unsigned int i)
2905 {
2906 return W_R_T_pvalue.at(i);
2907 }
2908 // MATRIX CLASS END