|
9
|
1 // Pscan-ChIP 1.1
|
|
|
2 // // Copyright (C) 2013 Federico Zambelli and Giulio Pavesi
|
|
|
3 // //
|
|
|
4 // // This program is free software: you can redistribute it and/or modify
|
|
|
5 // // it under the terms of the GNU General Public License as published by
|
|
|
6 // // the Free Software Foundation, either version 3 of the License, or
|
|
|
7 // // (at your option) any later version.
|
|
|
8 // //
|
|
|
9 // // This program is distributed in the hope that it will be useful,
|
|
|
10 // // but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
11 // // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
12 // // GNU General Public License for more details.
|
|
|
13 // //
|
|
|
14 // // You should have received a copy of the GNU General Public License
|
|
|
15 // // along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
16 // //
|
|
|
17 // // If you find Pscan-ChIP useful in your research please cite our paper:
|
|
|
18 // //
|
|
|
19 // // Zambelli F, Pesole G, Pavesi G. PscanChIP: Finding over-represented
|
|
|
20 // // transcription factor-binding site motifs and their correlations in sequences from
|
|
|
21 // // ChIP-Seq experiments.
|
|
|
22 // // Nucleic Acids Res. 2013 Jul;41(Web Server issue):W535-43.
|
|
|
23
|
|
|
24 #include <string>
|
|
|
25 #include <iostream>
|
|
|
26 #include <fstream>
|
|
|
27 #include <vector>
|
|
|
28 #include <sstream>
|
|
|
29 #include <math.h>
|
|
|
30 #include <map>
|
|
|
31 #include <cstdlib>
|
|
|
32 #include <gsl/gsl_randist.h>
|
|
|
33 #include <gsl/gsl_cdf.h>
|
|
|
34 #include <string.h>
|
|
|
35 #include <ctime>
|
|
|
36
|
|
|
37 using namespace std;
|
|
|
38
|
|
|
39 const double MIN_ADD_FREQ = 0.01, EPSILON = pow(10,-21), INF = pow(10,21), /*POS_VIS_THRESHOLD = pow(10,-5),*/ DIST_RATIO_THRESHOLD = 3, DIST_PV_THRESHOLD = pow(10,-5);
|
|
|
40 const unsigned int MAX_REG_SIZE = 3000, ALPHABET_SIZE = 4, TOP_P = 1, DIST_HIT_THRESHOLD = 10;
|
|
|
41 //const char ALPHABET[4] = {'A','C','G','T'}, RC_ALPHABET[4] = {'T','G','C','A'};
|
|
|
42 char VI['T' + 1];
|
|
|
43 string regfile, genome, matrixfile, additional_matrixfile, outfile, bgfile, selected, obstf, GALAXY_OUTPUT;
|
|
|
44 unsigned int RSIZE = 150, MAX_L = 0, W_SIZE = 10, W_STEP = 5;
|
|
|
45 bool SECOND_BEST_POS = false, SINGLE_STRAND = false, VERBOSE = false, GET_SEQUENCES = false, POSITIONAL = false, SINGLE_MATRIX = false, SPEARMAN = false, TWOBIT = false, QUIET = false; //NAIVE = false;
|
|
|
46 ofstream fasta_out;
|
|
|
47 double SINGLE_MATRIX_BG_AVG = 0;
|
|
|
48
|
|
|
49 struct mpoint
|
|
|
50 {
|
|
|
51 unsigned int x;
|
|
|
52 unsigned int y;
|
|
|
53 };
|
|
|
54
|
|
|
55 struct bg
|
|
|
56 {
|
|
|
57 unsigned int Size;
|
|
|
58 double Avg, Var;
|
|
|
59 };
|
|
|
60
|
|
|
61 class sequence
|
|
|
62 {
|
|
|
63 private:
|
|
|
64 string Chr, Name;
|
|
|
65 char *Seq;
|
|
|
66 unsigned int Start, End, E_Rank;
|
|
|
67 char Strand, AntiStrand;
|
|
|
68 bool good;
|
|
|
69 public:
|
|
|
70 sequence(string*, ifstream*, unsigned int);
|
|
|
71 sequence(string*, unsigned int, string, unsigned int);
|
|
|
72 char strand();
|
|
|
73 char antistrand();
|
|
|
74 string chr();
|
|
|
75 unsigned int start();
|
|
|
76 unsigned int end();
|
|
|
77 unsigned int e_rank();
|
|
|
78 string name();
|
|
|
79 char* seq();
|
|
|
80 bool is_good();
|
|
|
81 double score();
|
|
|
82 int mpos();
|
|
|
83 };
|
|
|
84
|
|
|
85 class matrix
|
|
|
86 {
|
|
|
87 friend void scanner(vector<matrix>*, vector<sequence>*);
|
|
|
88 private:
|
|
|
89 void normalizer();
|
|
|
90 void to_log();
|
|
|
91 void avg();
|
|
|
92 void max_min();
|
|
|
93 void rev_c();
|
|
|
94 double get_M_value(int, char);
|
|
|
95 double get_rc_M_value(int, char);
|
|
|
96 void calc_var(double);
|
|
|
97 void calc_W_param(double);
|
|
|
98 void welch_test(double);
|
|
|
99 void welch_test_bg(double);
|
|
|
100 // void welch_test_bg_G(double);
|
|
|
101 void welch_test_pos(double);
|
|
|
102 void z_test_pos(double);
|
|
|
103 unsigned int L, bg_Size, BIN_NUM;
|
|
|
104 double **M;
|
|
|
105 double **rc_M;
|
|
|
106 double max_score, min_score, t, dof, t_bg, dof_bg;
|
|
|
107 double spearman;
|
|
|
108 string Name, Id;
|
|
|
109 bool O_U, O_U_bg;
|
|
|
110 vector<bool> W_O_U;
|
|
|
111 bool good, BG;
|
|
|
112 bool norm_sec_step;
|
|
|
113 vector<double> v_score, vF_score;
|
|
|
114 vector<unsigned int> v_pos;
|
|
|
115 vector<bool> v_strand;
|
|
|
116 vector<vector<double> > W_R_v_score;
|
|
|
117 vector<double> W_R_Avg, W_R_Var, W_R_pvalue, W_R_T_pvalue;
|
|
|
118 double R_Avg, F_Avg, R_Var, F_Var, /*G_Avg, G_Var,*/ pvalue, W_Avg, W_Var;
|
|
|
119 double bg_Avg, bg_Var, pvalue_bg;
|
|
|
120 public:
|
|
|
121 matrix(vector<string>*, map<string,bg>*);
|
|
|
122 void display();
|
|
|
123 bool is_good();
|
|
|
124 string name();
|
|
|
125 string id();
|
|
|
126 string o_or_u();
|
|
|
127 string o_or_u_bg();
|
|
|
128 int W_o_or_u(unsigned int);
|
|
|
129 void scan(vector<sequence>*);
|
|
|
130 double scan_seq_ds(char*, bool, unsigned int, vector<double>*);
|
|
|
131 double scan_seq_ss(char*, bool, unsigned int, char, vector<double>*);
|
|
|
132 double scan_seq_ds_second_best(char*, unsigned int, unsigned int *, bool *, int);
|
|
|
133 double scan_seq_ss_second_best(char*, unsigned int, unsigned int *, bool *, unsigned int, bool);
|
|
|
134 double get_score_at(unsigned int);
|
|
|
135 unsigned int get_pos_at(unsigned int);
|
|
|
136 unsigned int length();
|
|
|
137 bool get_strand_at(unsigned int);
|
|
|
138 bool have_bg();
|
|
|
139 unsigned int bin_num();
|
|
|
140 double get_R_Avg();
|
|
|
141 double get_F_Avg();
|
|
|
142 double get_bg_Avg();
|
|
|
143 double get_R_Var();
|
|
|
144 double get_F_Var();
|
|
|
145 double max();
|
|
|
146 double min();
|
|
|
147 double get_pvalue();
|
|
|
148 double get_pvalue_bg();
|
|
|
149 double get_W_pvalue(unsigned int);
|
|
|
150 double get_W_R_Avg_at_pos(unsigned int);
|
|
|
151 double get_t();
|
|
|
152 double get_dof();
|
|
|
153 double get_spearman();
|
|
|
154 double get_W_Avg();
|
|
|
155 double get_W_R_T_pvalue(unsigned int);
|
|
|
156 };
|
|
|
157
|
|
|
158 void command_line_parser(int, char**);
|
|
|
159 void acquire_regions(vector<sequence>*);
|
|
|
160 void acquire_matrices(vector<matrix>*, map<string,bg>*, string*);
|
|
|
161 void acquire_background(map<string,bg>*);
|
|
|
162 void scanner(vector<matrix>*, vector<sequence>*);
|
|
|
163 void main_output(vector<matrix>*, vector<sequence>*, unsigned int);
|
|
|
164 void main_output_single_matrix(vector<matrix>*, vector<sequence>*);
|
|
|
165 void distance_analysis(vector<matrix>*, vector<sequence>*, unsigned int, multimap <double, vector<matrix>::iterator>*);
|
|
|
166 void display_help();
|
|
|
167
|
|
|
168 int main(int argc, char **argv)
|
|
|
169 {
|
|
|
170 srand(time(NULL));
|
|
|
171
|
|
|
172 vector<sequence> Seq;
|
|
|
173 vector<matrix> Mat;
|
|
|
174 map<string,bg> Bg;
|
|
|
175 VI['A'] = 0;
|
|
|
176 VI['C'] = 1;
|
|
|
177 VI['G'] = 2;
|
|
|
178 VI['T'] = 3;
|
|
|
179 VI['N'] = 4;
|
|
|
180
|
|
|
181 obstf.clear();
|
|
|
182
|
|
|
183 command_line_parser(argc, argv);
|
|
|
184
|
|
|
185 if(GET_SEQUENCES)
|
|
|
186 {
|
|
|
187 string outfile = regfile;
|
|
|
188 outfile += ".fasta";
|
|
|
189 fasta_out.open(outfile.c_str(), ios::out);
|
|
|
190 }
|
|
|
191
|
|
|
192 if(!regfile.empty() && !matrixfile.empty())
|
|
|
193 {
|
|
|
194 if(!bgfile.empty())
|
|
|
195 {
|
|
|
196 if(!QUIET) cerr << "\nAcquiring background file... ";
|
|
|
197 acquire_background(&Bg);
|
|
|
198 if(!QUIET) cerr << "done" << endl;
|
|
|
199 }
|
|
|
200
|
|
|
201 if(!QUIET) cerr << "\nReading binding profiles...";
|
|
|
202 acquire_matrices(&Mat, &Bg, &matrixfile);
|
|
|
203
|
|
|
204 if(!additional_matrixfile.empty())
|
|
|
205 acquire_matrices(&Mat, &Bg, &additional_matrixfile);
|
|
|
206
|
|
|
207 if(Mat.empty())
|
|
|
208 {
|
|
|
209 if(!QUIET) cerr << "\nZero valid matrices acquired. Quitting..." << endl;
|
|
|
210 exit(1);
|
|
|
211 }
|
|
|
212
|
|
|
213 if(!QUIET) cerr << "done: " << Mat.size() << " binding profiles acquired. Max matrix length = " << MAX_L << endl;
|
|
|
214
|
|
|
215 if(Mat.size() == 1)
|
|
|
216 {
|
|
|
217 if(!QUIET) cerr << "\nSwitching to single matrix mode..." << endl;
|
|
|
218 SINGLE_MATRIX = true;
|
|
|
219 }
|
|
|
220
|
|
|
221 if(!QUIET) cerr << "\nAcquiring regions...";
|
|
|
222 acquire_regions(&Seq);
|
|
|
223 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl;
|
|
|
224
|
|
|
225 if(Seq.size() <= 2)
|
|
|
226 POSITIONAL = false;
|
|
|
227
|
|
|
228 if(Seq.empty())
|
|
|
229 {
|
|
|
230 if(!QUIET) cerr << "\nZero regions to work on! Exiting..." << endl;
|
|
|
231 exit(1);
|
|
|
232 }
|
|
|
233
|
|
|
234 if(GET_SEQUENCES)
|
|
|
235 {
|
|
|
236 fasta_out.close();
|
|
|
237 exit(0);
|
|
|
238 }
|
|
|
239
|
|
|
240 // for(unsigned int i = 0; i < Mat.size(); i++)
|
|
|
241 // Mat[i].display();
|
|
|
242
|
|
|
243 if(!QUIET) cerr << "\nScanning...";
|
|
|
244 scanner(&Mat, &Seq);
|
|
|
245
|
|
|
246 if(!QUIET) cerr << "\nWriting output..." << endl;
|
|
|
247
|
|
|
248 if(!SINGLE_MATRIX)
|
|
|
249 main_output(&Mat, &Seq, Seq.size());
|
|
|
250 else
|
|
|
251 main_output_single_matrix(&Mat, &Seq);
|
|
|
252
|
|
|
253 if(!QUIET) cerr << "\nOutput written in file: " << outfile << endl;
|
|
|
254
|
|
|
255 }
|
|
|
256 else if(GET_SEQUENCES && !regfile.empty())
|
|
|
257 {
|
|
|
258 if(!QUIET) cerr << "\nAcquiring regions...";
|
|
|
259 acquire_regions(&Seq);
|
|
|
260 if(!QUIET) cerr << "done: " << Seq.size() << " unique regions acquired." << endl;
|
|
|
261
|
|
|
262 fasta_out.close();
|
|
|
263 exit(0);
|
|
|
264 }
|
|
|
265 else
|
|
|
266 display_help();
|
|
|
267
|
|
|
268
|
|
|
269 /* for(unsigned int i = 0; i < Seq.size(); i++)
|
|
|
270 {
|
|
|
271 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << Seq[i].start() << '\t' << Seq[i].end() << '\t'
|
|
|
272 << Mat[0].get_score_at(i) << '\t' << Mat[0].get_pos_at(i) << '\t' << Mat[0].get_strand_at(i) << '\t'
|
|
|
273 << "Max = " << Mat[0].max() << '\t' << "Min = " << Mat[0].min()
|
|
|
274 << endl << Seq[i].seq() << endl;
|
|
|
275 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "UPSTREAM" << endl << Seq[i].uf_seq() << endl;
|
|
|
276 cout << ">" << i+1 << '\t' << Seq[i].name() << '\t' << "DOWNSTREAM" << endl << Seq[i].df_seq() << endl;
|
|
|
277 }
|
|
|
278 */
|
|
|
279
|
|
|
280 return 0;
|
|
|
281 }
|
|
|
282
|
|
|
283 void display_help()
|
|
|
284 {
|
|
|
285 if(!QUIET) cerr << endl << "Syntax: pscan_chip -r region_file.bed -M matrixfile.wil -g genome [-pos] [-wsize window_size] [-wstep window_step] [-bg background_file] [-s region_size]" << endl << endl;
|
|
|
286 exit(0);
|
|
|
287 }
|
|
|
288
|
|
|
289 void main_output_single_matrix(vector<matrix> *Mat, vector<sequence> *Seq)
|
|
|
290 {
|
|
|
291 outfile = regfile;
|
|
|
292 outfile += ".";
|
|
|
293 outfile += Mat->at(0).id();
|
|
|
294
|
|
|
295 string outfile2 = outfile;
|
|
|
296
|
|
|
297 outfile += ".ris";
|
|
|
298 outfile2 += ".all.ris";
|
|
|
299
|
|
|
300 if(!GALAXY_OUTPUT.empty())
|
|
|
301 outfile = GALAXY_OUTPUT;
|
|
|
302
|
|
|
303 multimap <double, string> outmap;
|
|
|
304
|
|
|
305 ofstream out(outfile.c_str(), ios::out);
|
|
|
306 ofstream out2(outfile2.c_str(), ios::out);
|
|
|
307
|
|
|
308 if(!out)
|
|
|
309 {
|
|
|
310 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl;
|
|
|
311 exit(1);
|
|
|
312 }
|
|
|
313
|
|
|
314 out << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE";
|
|
|
315 out2 << "CHR\tREG_START\tREG_END\tREG_STRAND\tABS_SITE_START\tABS_SITE_END\tREL_SITE_START\tREL_SITE_END\tSITE_STRAND\tSCORE\tSITE";
|
|
|
316
|
|
|
317 // if(SPEARMAN)
|
|
|
318 // out << "\tE_RANK";
|
|
|
319
|
|
|
320 if(SECOND_BEST_POS)
|
|
|
321 {
|
|
|
322 out << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl;
|
|
|
323 out2 << "\tSB_POS\tSB_SCORE\tSB_OLIGO\tSB_STRAND" << endl;
|
|
|
324 }
|
|
|
325 else
|
|
|
326 {
|
|
|
327 out << endl;
|
|
|
328 out2 << endl;
|
|
|
329 }
|
|
|
330
|
|
|
331 matrix *M = &Mat->at(0);
|
|
|
332
|
|
|
333 /* if(SINGLE_MATRIX_BG_AVG == 0) //QUESTO SERVE PER QUANDO NON C'E' IL VALORE DI BACKGROUND!
|
|
|
334 {
|
|
|
335 double loc_avg = 0, loc_stdev = 0;
|
|
|
336
|
|
|
337 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
338 loc_avg += M->get_score_at(i);
|
|
|
339
|
|
|
340 loc_avg /= Seq->size();
|
|
|
341
|
|
|
342 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
343 loc_stdev += pow(M->get_score_at(i) - loc_avg, 2);
|
|
|
344
|
|
|
345 loc_stdev /= (Seq->size() - 1);
|
|
|
346
|
|
|
347 loc_stdev = sqrt(loc_stdev);
|
|
|
348
|
|
|
349 SINGLE_MATRIX_BG_AVG = loc_avg - (2*loc_stdev);
|
|
|
350
|
|
|
351 if(!QUIET) cerr << "\nSMBG = " << SINGLE_MATRIX_BG_AVG;
|
|
|
352 }
|
|
|
353 */
|
|
|
354 char Oligo[M->length() + 1];
|
|
|
355
|
|
|
356 Oligo[M->length()] = '\0';
|
|
|
357
|
|
|
358 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
359 {
|
|
|
360 // if(!QUIET) cerr << endl << M->get_score_at(i) << '\t' << M->get_bg_Avg() << '\t';
|
|
|
361
|
|
|
362 // if(M->get_score_at(i) < SINGLE_MATRIX_BG_AVG) //ONLY REPORT OCCURRENCES WITH SCORE >= BG_AVG
|
|
|
363 // continue;
|
|
|
364
|
|
|
365 // if(!QUIET) cerr << "passed";
|
|
|
366
|
|
|
367 ostringstream outbuf;
|
|
|
368
|
|
|
369 sequence *S = &Seq->at(i);
|
|
|
370
|
|
|
371 strncpy(Oligo, S->seq() + RSIZE/2 + (M->get_pos_at(i) - M->length()/2), M->length());
|
|
|
372
|
|
|
373 outbuf << S->chr() << '\t' << S->start() << '\t' << S->end() << '\t' << S->strand() << '\t'
|
|
|
374 << S->start() + M->get_pos_at(i) - M->length()/2 << '\t' << S->start() + M->get_pos_at(i) + M->length()/2 << '\t'
|
|
|
375 << (int)((M->get_pos_at(i) - M->length()/2) - RSIZE/2) << '\t' << (int)((M->get_pos_at(i) + M->length()/2 - 1) - RSIZE/2)<< '\t';
|
|
|
376
|
|
|
377 if(S->strand() == '+')
|
|
|
378 {
|
|
|
379 if(!M->get_strand_at(i))
|
|
|
380 outbuf << S->strand();
|
|
|
381 else
|
|
|
382 outbuf << S->antistrand();
|
|
|
383 }
|
|
|
384 else
|
|
|
385 {
|
|
|
386 if(!M->get_strand_at(i))
|
|
|
387 outbuf << S->antistrand();
|
|
|
388 else
|
|
|
389 outbuf << S->strand();
|
|
|
390 }
|
|
|
391
|
|
|
392 outbuf << '\t' << M->get_score_at(i) << '\t' << Oligo;
|
|
|
393
|
|
|
394 // if(SPEARMAN)
|
|
|
395 // outbuf << '\t' << S->e_rank();
|
|
|
396
|
|
|
397 if(SECOND_BEST_POS)
|
|
|
398 {
|
|
|
399 if(SINGLE_STRAND)
|
|
|
400 {
|
|
|
401 bool s;
|
|
|
402 unsigned int P;
|
|
|
403 double score;
|
|
|
404
|
|
|
405 score = M->scan_seq_ss_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s,M->get_pos_at(i), S->strand());
|
|
|
406
|
|
|
407 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length());
|
|
|
408
|
|
|
409 outbuf << '\t' << P << '\t' << score << '\t' << Oligo;
|
|
|
410 }
|
|
|
411 else
|
|
|
412 {
|
|
|
413 bool s;
|
|
|
414 unsigned int P;
|
|
|
415 double score;
|
|
|
416
|
|
|
417 score = M->scan_seq_ds_second_best(S->seq() + (RSIZE/2), RSIZE, &P, &s, (int)M->get_pos_at(i));
|
|
|
418
|
|
|
419 strncpy(Oligo, S->seq() + RSIZE/2 + P, M->length());
|
|
|
420
|
|
|
421 outbuf << '\t' << (int)P - (int)RSIZE/2 << '\t' << score << '\t' << Oligo << '\t';
|
|
|
422
|
|
|
423 if(s)
|
|
|
424 outbuf << '-';
|
|
|
425 else
|
|
|
426 outbuf << '+';
|
|
|
427 }
|
|
|
428
|
|
|
429 outbuf << endl;
|
|
|
430 }
|
|
|
431 else
|
|
|
432 outbuf << endl;
|
|
|
433
|
|
|
434 outmap.insert(make_pair(M->get_score_at(i),outbuf.str()));
|
|
|
435 }
|
|
|
436
|
|
|
437 multimap<double,string>::reverse_iterator mi = outmap.rbegin();
|
|
|
438
|
|
|
439 // unsigned int count = 0;
|
|
|
440
|
|
|
441 while(mi != outmap.rend())
|
|
|
442 {
|
|
|
443 // if(count >= outmap.size()/2 && outmap.size() > 1) //SOLO LA MIGLIORE META' DELLE OCCORRENZE!!!
|
|
|
444 // break;
|
|
|
445 if(mi->first >= SINGLE_MATRIX_BG_AVG || mi == outmap.rbegin())
|
|
|
446 out << mi->second;
|
|
|
447
|
|
|
448 out2 << mi->second;
|
|
|
449
|
|
|
450 mi++;
|
|
|
451 // count++;
|
|
|
452 }
|
|
|
453
|
|
|
454 out.close();
|
|
|
455 out2.close();
|
|
|
456 return;
|
|
|
457 }
|
|
|
458
|
|
|
459 void main_output(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE)
|
|
|
460 {
|
|
|
461 multimap <double, vector<matrix>::iterator> S_map;
|
|
|
462 outfile = regfile;
|
|
|
463
|
|
|
464 outfile += ".res";
|
|
|
465
|
|
|
466 if(!GALAXY_OUTPUT.empty())
|
|
|
467 outfile = GALAXY_OUTPUT;
|
|
|
468
|
|
|
469 ofstream out(outfile.c_str(),ios::out);
|
|
|
470
|
|
|
471 if(!out)
|
|
|
472 {
|
|
|
473 if(!QUIET) cerr << "\nError! Can't open output file: " << outfile << " for writing..." << endl;
|
|
|
474 exit(1);
|
|
|
475 }
|
|
|
476
|
|
|
477 out << "NAME\tID\tREG_N\tL_PVALUE O/U\tG_PVALUE O/U\tR_AVG\tR_STDEV\tF_AVG\tBG_AVG";
|
|
|
478
|
|
|
479 if(SPEARMAN)
|
|
|
480 out << "\tSP_COR";
|
|
|
481
|
|
|
482 if(POSITIONAL)
|
|
|
483 out << "\tPREF_POS\tPREF_POS_PV\n";
|
|
|
484 else
|
|
|
485 out << endl;
|
|
|
486
|
|
|
487 vector<matrix>::iterator vi = Mat->begin();
|
|
|
488
|
|
|
489 while(vi != Mat->end())
|
|
|
490 {
|
|
|
491 if(S_map.find(vi->get_pvalue()) != S_map.end()) //GFI WORKAROUND
|
|
|
492 {
|
|
|
493 if(S_map.find(vi->get_pvalue())->second->get_pvalue_bg() < vi->get_pvalue_bg())
|
|
|
494 {
|
|
|
495 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi));
|
|
|
496 vi++;
|
|
|
497 continue;
|
|
|
498 }
|
|
|
499 else if((S_map.find(vi->get_pvalue())->second->get_pvalue_bg() > vi->get_pvalue_bg()))
|
|
|
500 {
|
|
|
501 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi));
|
|
|
502 vi++;
|
|
|
503 continue;
|
|
|
504 }
|
|
|
505 else
|
|
|
506 {
|
|
|
507 if(S_map.find(vi->get_pvalue())->second->get_spearman() > vi->get_spearman())
|
|
|
508 {
|
|
|
509 S_map.insert(make_pair(vi->get_pvalue() + pow(10,-240), vi));
|
|
|
510 vi++;
|
|
|
511 continue;
|
|
|
512 }
|
|
|
513 else if(S_map.find(vi->get_pvalue())->second->get_spearman() <= vi->get_spearman())
|
|
|
514 {
|
|
|
515 S_map.insert(make_pair(vi->get_pvalue() - pow(10,-240), vi));
|
|
|
516 vi++;
|
|
|
517 continue;
|
|
|
518 }
|
|
|
519 }
|
|
|
520
|
|
|
521 }
|
|
|
522 else
|
|
|
523 S_map.insert(make_pair(vi->get_pvalue(), vi));
|
|
|
524
|
|
|
525 vi++;
|
|
|
526 }
|
|
|
527
|
|
|
528 multimap <double, vector<matrix>::iterator>::iterator mi = S_map.begin();
|
|
|
529
|
|
|
530 while(mi != S_map.end())
|
|
|
531 {
|
|
|
532 double pv1, pv2;
|
|
|
533
|
|
|
534 pv1 = mi->second->get_pvalue() * Mat->size();
|
|
|
535
|
|
|
536 if(pv1 > 1)
|
|
|
537 pv1 = 1;
|
|
|
538
|
|
|
539 pv2 = mi->second->get_pvalue_bg() * Mat->size();
|
|
|
540
|
|
|
541 if(pv2 > 1)
|
|
|
542 pv2 = 1;
|
|
|
543
|
|
|
544 out << mi->second->name() << '\t' << mi->second->id() << '\t' << SIZE << '\t' << pv1 << ' ' << mi->second->o_or_u() << '\t';
|
|
|
545
|
|
|
546 if(mi->second->have_bg())
|
|
|
547 out << pv2 << ' ' << mi->second->o_or_u_bg() << '\t';
|
|
|
548 else
|
|
|
549 out << "N/A N/A\t";
|
|
|
550
|
|
|
551 out << mi->second->get_R_Avg() << '\t' << pow(mi->second->get_R_Var(), 0.5) << '\t' << mi->second->get_F_Avg() << '\t';
|
|
|
552
|
|
|
553 if(mi->second->have_bg())
|
|
|
554 out << mi->second->get_bg_Avg() << '\t';
|
|
|
555 else
|
|
|
556 out << "N/A" << '\t';
|
|
|
557
|
|
|
558 if(SPEARMAN)
|
|
|
559 out << mi->second->get_spearman() << '\t';
|
|
|
560
|
|
|
561 if(POSITIONAL)
|
|
|
562 {
|
|
|
563 double min_pv = 1, min_pv_c;
|
|
|
564 unsigned int min_pos;
|
|
|
565
|
|
|
566 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
|
|
|
567 {
|
|
|
568 if(mi->second->get_W_pvalue(i) <= min_pv) //&& mi->second->W_o_or_u(i) == 1)
|
|
|
569 {
|
|
|
570 min_pv = mi->second->get_W_pvalue(i);
|
|
|
571 min_pos = i;
|
|
|
572 }
|
|
|
573 }
|
|
|
574
|
|
|
575 min_pv_c = min_pv * (mi->second->bin_num() * Seq->size());
|
|
|
576
|
|
|
577 if(min_pv_c > 1)
|
|
|
578 min_pv_c = 1;
|
|
|
579
|
|
|
580 out << '[' << (int)(((W_STEP * min_pos) + mi->second->length()/2) - RSIZE/2) << ',' << (int)(((W_STEP * min_pos) + W_SIZE) + mi->second->length()/2 - RSIZE/2) << ']' << '\t' << min_pv_c;// << '\t' << mi->second->get_W_R_T_pvalue(min_pos) << '\t' << min_pv_c;
|
|
|
581 }
|
|
|
582
|
|
|
583 out << endl;
|
|
|
584
|
|
|
585 mi++;
|
|
|
586 }
|
|
|
587
|
|
|
588 if(POSITIONAL)
|
|
|
589 {
|
|
|
590 out << endl << endl << "POSITIONAL ANALYSIS: W_SIZE = " << W_SIZE << " W_STEP = " << W_STEP << endl << endl;
|
|
|
591
|
|
|
592 out << "NAME\tID\t";
|
|
|
593
|
|
|
594 mi = S_map.begin();
|
|
|
595
|
|
|
596 for(int i = -(int)(RSIZE/2); i <= (int)((RSIZE/2) - W_SIZE); i += W_STEP)
|
|
|
597 out << i /*<< '|' << i + (int)(W_SIZE - 1)*/ << '\t';
|
|
|
598
|
|
|
599 out << endl;
|
|
|
600
|
|
|
601 while(mi != S_map.end())
|
|
|
602 {
|
|
|
603 /* bool gflag = false;
|
|
|
604
|
|
|
605 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
|
|
|
606 if(mi->second->get_W_pvalue(i) <= POS_VIS_THRESHOLD)
|
|
|
607 {
|
|
|
608 gflag = true;
|
|
|
609 break;
|
|
|
610 }
|
|
|
611
|
|
|
612 if(!gflag)
|
|
|
613 {
|
|
|
614 mi++;
|
|
|
615 continue;
|
|
|
616 }
|
|
|
617 */
|
|
|
618 out << mi->second->name() << '\t' << mi->second->id() << '\t';
|
|
|
619
|
|
|
620 for(unsigned int i = 0; i < mi->second->bin_num() - 1; i++)
|
|
|
621 {
|
|
|
622 // out << (-log(mi->second->get_W_pvalue(i))/log(10)) * mi->second->W_o_or_u(i) << '\t';//<< " [" << mi->second->W_o_or_u(i) << "]\t";
|
|
|
623 //
|
|
|
624 double bpv = mi->second->get_W_pvalue(i) * (mi->second->bin_num() * Seq->size());
|
|
|
625
|
|
|
626 if(bpv > 1)
|
|
|
627 bpv = 1;
|
|
|
628
|
|
|
629 out << bpv << '\t';
|
|
|
630 }
|
|
|
631
|
|
|
632 out << endl;
|
|
|
633
|
|
|
634 mi++;
|
|
|
635 }
|
|
|
636
|
|
|
637 if(!obstf.empty())
|
|
|
638 distance_analysis(Mat, Seq, SIZE, &S_map);
|
|
|
639 }
|
|
|
640
|
|
|
641 out.close();
|
|
|
642
|
|
|
643 return;
|
|
|
644 }
|
|
|
645
|
|
|
646 void distance_analysis(vector<matrix> *Mat, vector<sequence> *Seq, unsigned int SIZE, multimap <double, vector<matrix>::iterator> *S_map)
|
|
|
647 {
|
|
|
648 string dist_outfile = regfile, dist2_outfile = regfile, dist3_outfile = regfile;
|
|
|
649
|
|
|
650 dist_outfile += ".dist";
|
|
|
651 dist2_outfile += ".exp_dist";
|
|
|
652 dist3_outfile += ".pv_dist";
|
|
|
653
|
|
|
654 ofstream out(dist_outfile.c_str()), out2, out3;
|
|
|
655
|
|
|
656 if(!obstf.empty())
|
|
|
657 {
|
|
|
658 out2.open(dist2_outfile.c_str());
|
|
|
659 out3.open(dist3_outfile.c_str());
|
|
|
660
|
|
|
661 out2 << "TF/EXP_HITS\t";
|
|
|
662 out3 << "TF/PVALUE\t";
|
|
|
663 }
|
|
|
664
|
|
|
665 out << "TF1|TF2/HITS\t";
|
|
|
666
|
|
|
667 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++)
|
|
|
668 out << p << '\t';
|
|
|
669
|
|
|
670 out << endl;
|
|
|
671
|
|
|
672 if(!obstf.empty())
|
|
|
673 {
|
|
|
674 for(int p = -(int)(RSIZE - 1); p < (int)RSIZE; p++)
|
|
|
675 {
|
|
|
676 out2 << p << '\t';
|
|
|
677 out3 << p << '\t';
|
|
|
678 }
|
|
|
679
|
|
|
680 out2 << endl;
|
|
|
681 out3 << endl;
|
|
|
682 }
|
|
|
683
|
|
|
684 multimap <double, vector<matrix>::iterator>::iterator MRI = S_map->begin();
|
|
|
685 unsigned int LCOUNT = 0;
|
|
|
686
|
|
|
687 while(MRI != S_map->end())
|
|
|
688 {
|
|
|
689 if((MRI->second->get_pvalue_bg() > DIST_PV_THRESHOLD || MRI->second->o_or_u_bg() == "UNDER") &&
|
|
|
690 (MRI->second->get_pvalue() > DIST_PV_THRESHOLD || MRI->second->o_or_u() == "UNDER") )
|
|
|
691 {
|
|
|
692 MRI++;
|
|
|
693 continue;
|
|
|
694 }
|
|
|
695
|
|
|
696 // if(!QUIET) cerr << endl << MRI->second->name() << endl;
|
|
|
697
|
|
|
698 LCOUNT++;
|
|
|
699
|
|
|
700 map<int, double> R_dist_map;
|
|
|
701
|
|
|
702 if(!obstf.empty() && obstf == MRI->second->id())
|
|
|
703 {
|
|
|
704 map<int, unsigned int> M_hit_distr;
|
|
|
705
|
|
|
706 for(unsigned int a = 0; a < SIZE; a++)
|
|
|
707 {
|
|
|
708 map<int, unsigned int>::iterator mi;
|
|
|
709
|
|
|
710 if(!MRI->second->get_strand_at(a))
|
|
|
711 mi = M_hit_distr.find(MRI->second->get_pos_at(a));
|
|
|
712 else
|
|
|
713 mi = M_hit_distr.find(RSIZE - MRI->second->get_pos_at(a));
|
|
|
714
|
|
|
715 if(mi == M_hit_distr.end())
|
|
|
716 {
|
|
|
717 if(!MRI->second->get_strand_at(a))
|
|
|
718 M_hit_distr[MRI->second->get_pos_at(a)] = 1;
|
|
|
719 else
|
|
|
720 M_hit_distr[RSIZE - MRI->second->get_pos_at(a)] = 1;
|
|
|
721 }
|
|
|
722 else
|
|
|
723 mi->second++;
|
|
|
724 }
|
|
|
725
|
|
|
726 map<int, unsigned int>::iterator mi = M_hit_distr.begin();
|
|
|
727
|
|
|
728 while(mi != M_hit_distr.end())
|
|
|
729 {
|
|
|
730 for(int b = 0; b < RSIZE; b++)
|
|
|
731 {
|
|
|
732 int dpos = mi->first - b;
|
|
|
733
|
|
|
734 map<int, double>::iterator ri = R_dist_map.find(dpos);
|
|
|
735
|
|
|
736 if(ri == R_dist_map.end())
|
|
|
737 R_dist_map[dpos] = mi->second;
|
|
|
738 else
|
|
|
739 ri->second += mi->second;
|
|
|
740
|
|
|
741 }
|
|
|
742
|
|
|
743 mi++;
|
|
|
744 }
|
|
|
745
|
|
|
746 map<int, double>::iterator ri = R_dist_map.begin();
|
|
|
747 double risum = SIZE * RSIZE;
|
|
|
748
|
|
|
749 out2 << MRI->second->name() << '\t';
|
|
|
750
|
|
|
751 while(ri != R_dist_map.end())
|
|
|
752 {
|
|
|
753 ri->second /= risum;
|
|
|
754 out2 << ri->second*SIZE << '\t';
|
|
|
755 ri++;
|
|
|
756 }
|
|
|
757
|
|
|
758 out2 << endl;
|
|
|
759 }
|
|
|
760
|
|
|
761 // multimap <double, vector<matrix>::iterator>::iterator MRI2 = MRI;
|
|
|
762 multimap <double, vector<matrix>::iterator>::iterator MRI2 = S_map->begin();
|
|
|
763
|
|
|
764 while(MRI2 != S_map->end())
|
|
|
765 {
|
|
|
766 vector<unsigned int> hit_v((RSIZE*2) - 1, 0);
|
|
|
767
|
|
|
768 if(MRI2 == MRI)
|
|
|
769 {
|
|
|
770 vector<unsigned int> SBPOS;
|
|
|
771 // vector<bool> SBSTRAND;
|
|
|
772
|
|
|
773 if(SINGLE_STRAND)
|
|
|
774 {
|
|
|
775 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
776 {
|
|
|
777 bool s;
|
|
|
778 unsigned int P;
|
|
|
779 double score;
|
|
|
780
|
|
|
781 score = MRI->second->scan_seq_ss_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, MRI->second->get_pos_at(i), Seq->at(i).strand());
|
|
|
782 SBPOS.push_back(P);
|
|
|
783 // SBSTRAND.push_back(s);
|
|
|
784 }
|
|
|
785
|
|
|
786 }
|
|
|
787 else
|
|
|
788 {
|
|
|
789 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
790 {
|
|
|
791 bool s;
|
|
|
792 unsigned int P;
|
|
|
793 double score;
|
|
|
794
|
|
|
795 score = MRI->second->scan_seq_ds_second_best(Seq->at(i).seq() + (RSIZE/2), RSIZE, &P, &s, (int)MRI->second->get_pos_at(i));
|
|
|
796
|
|
|
797 SBPOS.push_back(P + MRI->second->length()/2);
|
|
|
798 // SBSTRAND.push_back(s);
|
|
|
799
|
|
|
800 }
|
|
|
801 }
|
|
|
802
|
|
|
803 for(unsigned int k = 0; k < SIZE; k++)
|
|
|
804 {
|
|
|
805 int dist;
|
|
|
806
|
|
|
807 if(!MRI->second->get_strand_at(k))
|
|
|
808 dist = (int)SBPOS.at(k) - (int)MRI->second->get_pos_at(k);
|
|
|
809 else
|
|
|
810 dist = ((int)(RSIZE-1) - (int)SBPOS.at(k)) - ((int)(RSIZE - 1) - (int)MRI->second->get_pos_at(k));
|
|
|
811
|
|
|
812 hit_v.at((RSIZE-1) + dist)++;
|
|
|
813 }
|
|
|
814
|
|
|
815 }
|
|
|
816 else
|
|
|
817 {
|
|
|
818 for(unsigned int k = 0; k < SIZE; k++)
|
|
|
819 {
|
|
|
820 int dist;
|
|
|
821
|
|
|
822 if(!MRI->second->get_strand_at(k))
|
|
|
823 dist = (int)MRI2->second->get_pos_at(k) - (int)MRI->second->get_pos_at(k);
|
|
|
824 else
|
|
|
825 dist = ((int)(RSIZE - 1 ) - (int)MRI2->second->get_pos_at(k)) - ((int)(RSIZE - 1 ) - (int)MRI->second->get_pos_at(k));
|
|
|
826
|
|
|
827 // if(!QUIET) cerr << endl << (int)((RSIZE-1) + dist) << endl;
|
|
|
828
|
|
|
829 hit_v.at((RSIZE-1) + dist)++;
|
|
|
830 }
|
|
|
831 }
|
|
|
832
|
|
|
833 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
|
|
|
834 out << MRI->second->name() << "|" << MRI2->second->name() << '\t';
|
|
|
835
|
|
|
836 if(!obstf.empty() && obstf == MRI->second->id())
|
|
|
837 out3 << MRI->second->name() << "|" << MRI2->second->name() << '\t';
|
|
|
838
|
|
|
839 for(unsigned int p = 0; p < (RSIZE*2) - 1; p++)
|
|
|
840 {
|
|
|
841 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
|
|
|
842 out << hit_v[p] << '\t';
|
|
|
843
|
|
|
844 if(!obstf.empty() && obstf == MRI->second->id())
|
|
|
845 {
|
|
|
846 map<int, double>::iterator ri = R_dist_map.find(p - (RSIZE - 1));
|
|
|
847
|
|
|
848 if(hit_v[p] >= DIST_HIT_THRESHOLD && hit_v[p]/(ri->second * SIZE) >= DIST_RATIO_THRESHOLD)
|
|
|
849 {
|
|
|
850 double pv = gsl_cdf_binomial_Q(hit_v[p], ri->second, SIZE);
|
|
|
851
|
|
|
852 if(pv <= DIST_PV_THRESHOLD)
|
|
|
853 out3 << pv << '\t';
|
|
|
854 else
|
|
|
855 out3 << "NA" << '\t';
|
|
|
856 }
|
|
|
857 else
|
|
|
858 out3 << "NA" << '\t';
|
|
|
859 }
|
|
|
860 }
|
|
|
861
|
|
|
862 if((!obstf.empty() && obstf == MRI->second->id()) || obstf.empty())
|
|
|
863 out << endl;
|
|
|
864
|
|
|
865 if(!obstf.empty() && obstf == MRI->second->id())
|
|
|
866 out3 << endl;
|
|
|
867
|
|
|
868 MRI2++;
|
|
|
869 }
|
|
|
870
|
|
|
871 MRI++;
|
|
|
872 }
|
|
|
873
|
|
|
874 out.close();
|
|
|
875
|
|
|
876 if(!obstf.empty())
|
|
|
877 {
|
|
|
878 out2.close();
|
|
|
879 out3.close();
|
|
|
880 }
|
|
|
881
|
|
|
882 if(!LCOUNT)
|
|
|
883 {
|
|
|
884 remove(dist_outfile.c_str());
|
|
|
885
|
|
|
886 if(!obstf.empty())
|
|
|
887 {
|
|
|
888 remove(dist2_outfile.c_str());
|
|
|
889 remove(dist3_outfile.c_str());
|
|
|
890 }
|
|
|
891 }
|
|
|
892
|
|
|
893 if(!QUIET) cerr << "\nDistance analysis written in file: " << dist_outfile << endl;
|
|
|
894
|
|
|
895 return;
|
|
|
896 }
|
|
|
897
|
|
|
898 void scanner(vector<matrix> *Mat, vector<sequence> *Seq)
|
|
|
899 {
|
|
|
900 for(unsigned int i = 0; i < Mat->size(); i++)
|
|
|
901 {
|
|
|
902 if(VERBOSE)
|
|
|
903 {
|
|
|
904 if(!QUIET) cerr << "\nScanning sequences for "
|
|
|
905 << Mat->at(i).name() << " (" << Mat->at(i).id() << ")"<< endl;
|
|
|
906 }
|
|
|
907
|
|
|
908 Mat->at(i).scan(Seq);
|
|
|
909 Mat->at(i).calc_var((double)Seq->size());
|
|
|
910
|
|
|
911 if(POSITIONAL)
|
|
|
912 Mat->at(i).calc_W_param((double)Seq->size());
|
|
|
913
|
|
|
914 if(VERBOSE)
|
|
|
915 {
|
|
|
916 if(!QUIET) cerr << "\nRegions avg = " << Mat->at(i).R_Avg << '\t' << "Flanks avg = " << Mat->at(i).F_Avg << '\t'
|
|
|
917 << "Regions devstd = " << pow(Mat->at(i).R_Var,0.5) << '\t' << "Flanks devstd = " << pow(Mat->at(i).F_Var,0.5);
|
|
|
918 if(!QUIET) cerr << "\nDoing Welch's t test... ";
|
|
|
919 }
|
|
|
920
|
|
|
921 Mat->at(i).welch_test((double)Seq->size());
|
|
|
922
|
|
|
923 if(Mat->at(i).have_bg())
|
|
|
924 Mat->at(i).welch_test_bg((double)Seq->size());
|
|
|
925
|
|
|
926 if(POSITIONAL)
|
|
|
927 {
|
|
|
928 // Mat->at(i).welch_test_pos((double)Seq->size());
|
|
|
929 Mat->at(i).z_test_pos((double)Seq->size());
|
|
|
930 }
|
|
|
931
|
|
|
932 if(VERBOSE)
|
|
|
933 {
|
|
|
934 if(!QUIET) cerr << "done. Pvalue = " << Mat->at(i).get_pvalue() << ' ' << Mat->at(i).o_or_u() << endl;
|
|
|
935 if(!QUIET) cerr << endl;
|
|
|
936 }
|
|
|
937 }
|
|
|
938
|
|
|
939 return;
|
|
|
940 }
|
|
|
941
|
|
|
942 void acquire_matrices(vector<matrix> *Mat, map<string,bg> *Bg, string *matfile)
|
|
|
943 {
|
|
|
944 ifstream in(matfile->c_str());
|
|
|
945 string line;
|
|
|
946
|
|
|
947 if(!in)
|
|
|
948 {
|
|
|
949 if(!QUIET) cerr << "Can't find " << *matfile << endl;
|
|
|
950 exit(1);
|
|
|
951 }
|
|
|
952
|
|
|
953 while(getline(in,line))
|
|
|
954 {
|
|
|
955 if(line.empty())
|
|
|
956 continue;
|
|
|
957
|
|
|
958 vector<string> mbuf;
|
|
|
959 mbuf.push_back(line);
|
|
|
960
|
|
|
961 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
|
|
|
962 {
|
|
|
963 getline(in,line);
|
|
|
964
|
|
|
965 if(!line.empty())
|
|
|
966 mbuf.push_back(line);
|
|
|
967 }
|
|
|
968
|
|
|
969 if(mbuf.size() == ALPHABET_SIZE + 1)
|
|
|
970 Mat->push_back(matrix(&mbuf,Bg));
|
|
|
971 }
|
|
|
972
|
|
|
973 in.close();
|
|
|
974
|
|
|
975 vector<matrix>::iterator vi = Mat->begin();
|
|
|
976
|
|
|
977 while(vi != Mat->end())
|
|
|
978 {
|
|
|
979 if(!vi->is_good())
|
|
|
980 {
|
|
|
981 vi = Mat->erase(vi);
|
|
|
982 continue;
|
|
|
983 }
|
|
|
984
|
|
|
985 vi++;
|
|
|
986 }
|
|
|
987
|
|
|
988 return;
|
|
|
989 }
|
|
|
990
|
|
|
991 void acquire_background(map<string, bg> *Bg)
|
|
|
992 {
|
|
|
993 ifstream in(bgfile.c_str());
|
|
|
994
|
|
|
995 if(!in)
|
|
|
996 {
|
|
|
997 if(!QUIET) cerr << "\nWarning: background file " << bgfile << " not found..." << endl;
|
|
|
998 return;
|
|
|
999 }
|
|
|
1000
|
|
|
1001 string line;
|
|
|
1002
|
|
|
1003 getline(in,line);
|
|
|
1004
|
|
|
1005 while(getline(in, line))
|
|
|
1006 {
|
|
|
1007 istringstream str(line);
|
|
|
1008 string id, trash;
|
|
|
1009 bg nbg;
|
|
|
1010
|
|
|
1011 str >> trash >> id >> nbg.Size >> trash >> trash >> trash >> trash >> nbg.Avg >> nbg.Var;
|
|
|
1012
|
|
|
1013 nbg.Var = pow(nbg.Var,2);
|
|
|
1014
|
|
|
1015 Bg->insert(make_pair(id,nbg));
|
|
|
1016 }
|
|
|
1017
|
|
|
1018 in.close();
|
|
|
1019 return;
|
|
|
1020 }
|
|
|
1021
|
|
|
1022 void acquire_regions(vector<sequence> *Seq)
|
|
|
1023 {
|
|
|
1024 ifstream in(regfile.c_str());
|
|
|
1025 // map<string, vector<string> > Regs;
|
|
|
1026 map<string, map<unsigned int, string> > Regs; //PREVENT REG DUPLICATES
|
|
|
1027 map<string, map<unsigned int, unsigned int > > M_RANK;
|
|
|
1028
|
|
|
1029 if(!in)
|
|
|
1030 {
|
|
|
1031 if(!QUIET) cerr << "Can't find " << regfile << endl;
|
|
|
1032 exit(1);
|
|
|
1033 }
|
|
|
1034
|
|
|
1035 string line;
|
|
|
1036
|
|
|
1037 unsigned int r_count = 0;
|
|
|
1038
|
|
|
1039 while(getline(in,line))
|
|
|
1040 {
|
|
|
1041 if(line.empty())
|
|
|
1042 continue;
|
|
|
1043
|
|
|
1044 if(line[0] == '#')
|
|
|
1045 continue;
|
|
|
1046
|
|
|
1047 for(unsigned int i = 0; i < 3; i++)
|
|
|
1048 line[i] = tolower(line[i]);
|
|
|
1049
|
|
|
1050 for(unsigned int i = 3; i < line.size(); i++)
|
|
|
1051 line[i] = toupper(line[i]);
|
|
|
1052
|
|
|
1053 istringstream str(line);
|
|
|
1054 string chr;
|
|
|
1055 unsigned int start, end, mp;
|
|
|
1056
|
|
|
1057 str >> chr >> start >> end;
|
|
|
1058
|
|
|
1059 // Regs[chr].push_back(line);
|
|
|
1060 mp = start + ((end - start)/2);
|
|
|
1061
|
|
|
1062 if(mp > (RSIZE + RSIZE/2) + 1)
|
|
|
1063 {
|
|
|
1064 Regs[chr][mp] = line;
|
|
|
1065
|
|
|
1066 // if(!QUIET) cerr << endl << line;
|
|
|
1067
|
|
|
1068 if(M_RANK.find(chr) != M_RANK.end()) //PREVENT REGIONS PRESENT MORE THAN ONCE TO DO TOO MUCH MESS WITH SPEARMAN CORRELATION
|
|
|
1069 if(M_RANK[chr].find(mp) != M_RANK[chr].end())
|
|
|
1070 continue;
|
|
|
1071
|
|
|
1072 M_RANK[chr][mp] = r_count;
|
|
|
1073 r_count++;
|
|
|
1074 }
|
|
|
1075 else
|
|
|
1076 if(!QUIET) cerr << "\nSkipping region: too close to chromosome start..." << endl << line << endl;
|
|
|
1077 }
|
|
|
1078
|
|
|
1079 in.close();
|
|
|
1080
|
|
|
1081 map<string, map<unsigned int, string> >::iterator mi = Regs.begin();
|
|
|
1082
|
|
|
1083 if(!TWOBIT)
|
|
|
1084 {
|
|
|
1085 while(mi != Regs.end())
|
|
|
1086 {
|
|
|
1087 string file = genome;
|
|
|
1088 file += "/";
|
|
|
1089 file += mi->first;
|
|
|
1090 file += ".raw";
|
|
|
1091
|
|
|
1092 in.open(file.c_str());
|
|
|
1093
|
|
|
1094 if(!in)
|
|
|
1095 {
|
|
|
1096 if(!QUIET) cerr << "\nCan't find " << file << ". Skipping chromosome:\n" << mi->first << endl;
|
|
|
1097
|
|
|
1098 if(SPEARMAN)
|
|
|
1099 {
|
|
|
1100 if(!QUIET) cerr << "\nSpearman correlation disabled..." << endl;
|
|
|
1101 SPEARMAN = false;
|
|
|
1102 }
|
|
|
1103
|
|
|
1104 mi++;
|
|
|
1105 in.close();
|
|
|
1106 continue;
|
|
|
1107 }
|
|
|
1108
|
|
|
1109 map<unsigned int, string>::iterator si = mi->second.begin();
|
|
|
1110
|
|
|
1111 while(si != mi->second.end())
|
|
|
1112 {
|
|
|
1113 Seq->push_back(sequence(&si->second, &in, M_RANK[mi->first][si->first]));
|
|
|
1114 si++;
|
|
|
1115 }
|
|
|
1116
|
|
|
1117 in.close();
|
|
|
1118 mi++;
|
|
|
1119 }
|
|
|
1120 }
|
|
|
1121 else
|
|
|
1122 {
|
|
|
1123 ostringstream str;
|
|
|
1124 ofstream outfile;
|
|
|
1125 str << ".pchip." << rand() << ".tmp";
|
|
|
1126 string tmp1 = str.str();
|
|
|
1127 outfile.open(tmp1.c_str(), ios::out);
|
|
|
1128 vector<string> k1;
|
|
|
1129 vector<unsigned int> k2;
|
|
|
1130
|
|
|
1131 while(mi != Regs.end())
|
|
|
1132 {
|
|
|
1133 map<unsigned int, string>::iterator si = mi->second.begin();
|
|
|
1134
|
|
|
1135 while(si != mi->second.end())
|
|
|
1136 {
|
|
|
1137 unsigned int Start, End;
|
|
|
1138
|
|
|
1139 Start = si->first - RSIZE;
|
|
|
1140 End = si->first + RSIZE + 1 + MAX_L;
|
|
|
1141
|
|
|
1142 outfile << mi->first << '\t' << Start << '\t' << End << '\t' << "." << endl;
|
|
|
1143
|
|
|
1144 k1.push_back(mi->first);
|
|
|
1145 k2.push_back(si->first);
|
|
|
1146
|
|
|
1147 si++;
|
|
|
1148 }
|
|
|
1149
|
|
|
1150 mi++;
|
|
|
1151 }
|
|
|
1152
|
|
|
1153 string cline = "twoBitToFa -bedPos -bed=";
|
|
|
1154 cline += str.str();
|
|
|
1155 str << ".fa";
|
|
|
1156 cline += " ";
|
|
|
1157 cline += genome;
|
|
|
1158 /* cline += "/";
|
|
|
1159 cline += genome;
|
|
|
1160 cline += ".2bit "; */
|
|
|
1161 cline += " ";
|
|
|
1162 cline += str.str();
|
|
|
1163
|
|
|
1164 // if(!QUIET) cerr << endl << "TwoBit command line:\n" << cline << endl;
|
|
|
1165
|
|
|
1166 if(system(cline.c_str()) != 0)
|
|
|
1167 {
|
|
|
1168 if(!QUIET) cerr << endl << "twoBitToFa returned non 0 error code... aborting pscan_chip run!\n" << endl;
|
|
|
1169 exit(1);
|
|
|
1170 }
|
|
|
1171 else
|
|
|
1172 {
|
|
|
1173 ifstream in(str.str().c_str());
|
|
|
1174 string seq;
|
|
|
1175 seq.clear();
|
|
|
1176
|
|
|
1177 if(!in)
|
|
|
1178 {
|
|
|
1179 if(!QUIET) cerr << endl << "Can't access twoBitToFa output file! Aborting pscan_chip run!\n" << endl;
|
|
|
1180 exit(1);
|
|
|
1181 }
|
|
|
1182
|
|
|
1183 unsigned int scount = 0;
|
|
|
1184
|
|
|
1185 while(getline(in,line))
|
|
|
1186 {
|
|
|
1187 if(line.empty())
|
|
|
1188 continue;
|
|
|
1189
|
|
|
1190 if(line[0] == '>')
|
|
|
1191 {
|
|
|
1192 if(seq.empty())
|
|
|
1193 continue;
|
|
|
1194 else
|
|
|
1195 {
|
|
|
1196 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount)));
|
|
|
1197 seq.clear();
|
|
|
1198 scount++;
|
|
|
1199 continue;
|
|
|
1200 }
|
|
|
1201 }
|
|
|
1202 else
|
|
|
1203 seq += line;
|
|
|
1204 }
|
|
|
1205
|
|
|
1206 Seq->push_back(sequence(&seq, M_RANK[k1.at(scount)][k2.at(scount)], k1.at(scount), k2.at(scount))); //LAST SEQUENCE!
|
|
|
1207
|
|
|
1208 in.close();
|
|
|
1209 remove(str.str().c_str());
|
|
|
1210 remove(tmp1.c_str());
|
|
|
1211 }
|
|
|
1212
|
|
|
1213 // exit(1);
|
|
|
1214 }
|
|
|
1215
|
|
|
1216 vector<sequence>::iterator vi = Seq->begin();
|
|
|
1217
|
|
|
1218 while(vi != Seq->end())
|
|
|
1219 {
|
|
|
1220 if(!vi->is_good())
|
|
|
1221 {
|
|
|
1222 vi = Seq->erase(vi);
|
|
|
1223 continue;
|
|
|
1224 }
|
|
|
1225
|
|
|
1226 vi++;
|
|
|
1227 }
|
|
|
1228
|
|
|
1229 return;
|
|
|
1230 }
|
|
|
1231
|
|
|
1232 void command_line_parser(int argc, char **argv)
|
|
|
1233 {
|
|
|
1234 if(argc == 1)
|
|
|
1235 return;
|
|
|
1236
|
|
|
1237 for(int i = 1; i < argc; i++)
|
|
|
1238 {
|
|
|
1239 string buf = argv[i];
|
|
|
1240
|
|
|
1241 if(buf == "-r")
|
|
|
1242 {
|
|
|
1243 if(i < argc - 1)
|
|
|
1244 regfile = argv[++i];
|
|
|
1245 continue;
|
|
|
1246 }
|
|
|
1247
|
|
|
1248 if(buf == "-g")
|
|
|
1249 {
|
|
|
1250 if(i < argc - 1)
|
|
|
1251 genome = argv[++i];
|
|
|
1252 continue;
|
|
|
1253 }
|
|
|
1254
|
|
|
1255 if(buf == "-s")
|
|
|
1256 {
|
|
|
1257 if(i < argc - 1)
|
|
|
1258 RSIZE = atoi(argv[++i]);
|
|
|
1259 continue;
|
|
|
1260 }
|
|
|
1261 if(buf == "-bgavg")
|
|
|
1262 {
|
|
|
1263 if(i < argc - 1)
|
|
|
1264 SINGLE_MATRIX_BG_AVG = atof(argv[++i]);
|
|
|
1265 continue;
|
|
|
1266 }
|
|
|
1267 if(buf == "-M")
|
|
|
1268 {
|
|
|
1269 if(i < argc - 1)
|
|
|
1270 matrixfile = argv[++i];
|
|
|
1271 continue;
|
|
|
1272 }
|
|
|
1273 if(buf == "-MADD")
|
|
|
1274 {
|
|
|
1275 if(i < argc - 1)
|
|
|
1276 additional_matrixfile = argv[++i];
|
|
|
1277 continue;
|
|
|
1278 }
|
|
|
1279 if(buf == "-m")
|
|
|
1280 {
|
|
|
1281 if(i < argc - 1)
|
|
|
1282 selected = argv[++i];
|
|
|
1283 continue;
|
|
|
1284 }
|
|
|
1285
|
|
|
1286 if(buf == "-bg")
|
|
|
1287 {
|
|
|
1288 if(i < argc - 1)
|
|
|
1289 bgfile = argv[++i];
|
|
|
1290 continue;
|
|
|
1291 }
|
|
|
1292 if(buf == "-go")
|
|
|
1293 {
|
|
|
1294 if(i < argc - 1)
|
|
|
1295 GALAXY_OUTPUT = argv[++i];
|
|
|
1296 continue;
|
|
|
1297 }
|
|
|
1298 if(buf == "-obs")
|
|
|
1299 {
|
|
|
1300 if(i < argc - 1)
|
|
|
1301 obstf = argv[++i];
|
|
|
1302
|
|
|
1303 continue;
|
|
|
1304 }
|
|
|
1305 if(buf == "-ss")
|
|
|
1306 {
|
|
|
1307 SINGLE_STRAND = true;
|
|
|
1308 continue;
|
|
|
1309 }
|
|
|
1310 if(buf == "-2bit")
|
|
|
1311 {
|
|
|
1312 TWOBIT = true;
|
|
|
1313 continue;
|
|
|
1314 }
|
|
|
1315 if(buf == "-quiet")
|
|
|
1316 {
|
|
|
1317 QUIET = true;
|
|
|
1318 continue;
|
|
|
1319 }
|
|
|
1320 if(buf == "-sbp")
|
|
|
1321 {
|
|
|
1322 SECOND_BEST_POS = true;
|
|
|
1323 continue;
|
|
|
1324 }
|
|
|
1325 if(buf == "-fasta")
|
|
|
1326 {
|
|
|
1327 GET_SEQUENCES = true;
|
|
|
1328 continue;
|
|
|
1329 }
|
|
|
1330 if(buf == "-pos")
|
|
|
1331 {
|
|
|
1332 POSITIONAL = true;
|
|
|
1333 continue;
|
|
|
1334 }
|
|
|
1335 if(buf == "-spear")
|
|
|
1336 {
|
|
|
1337 SPEARMAN = true;
|
|
|
1338 continue;
|
|
|
1339 }
|
|
|
1340 if(buf == "-wsize")
|
|
|
1341 {
|
|
|
1342 if(i < argc - 1)
|
|
|
1343 W_SIZE = atoi(argv[++i]);
|
|
|
1344
|
|
|
1345 W_STEP = W_SIZE / 2;
|
|
|
1346 continue;
|
|
|
1347 }
|
|
|
1348 /* if(buf == "-wstep")
|
|
|
1349 {
|
|
|
1350 if(i < argc - 1)
|
|
|
1351 W_STEP = atoi(argv[++i]);
|
|
|
1352 continue;
|
|
|
1353 }
|
|
|
1354
|
|
|
1355 if(buf == "-fast")
|
|
|
1356 {
|
|
|
1357 FAST_SCAN = true;
|
|
|
1358 continue;
|
|
|
1359 }*/
|
|
|
1360 /* if(buf == "-naive")
|
|
|
1361 {
|
|
|
1362 NAIVE = true;
|
|
|
1363 continue;
|
|
|
1364 }
|
|
|
1365 */
|
|
|
1366 if(buf == "-v")
|
|
|
1367 {
|
|
|
1368 VERBOSE = true;
|
|
|
1369 continue;
|
|
|
1370 }
|
|
|
1371 else
|
|
|
1372 {
|
|
|
1373 if(!QUIET) cerr << "\nUnknown option: " << buf << endl;
|
|
|
1374 exit(1);
|
|
|
1375 }
|
|
|
1376
|
|
|
1377 /* if(buf == "-Q")
|
|
|
1378 {
|
|
|
1379 if(i < argc - 1)
|
|
|
1380 idfile = argv[++i];
|
|
|
1381 continue;
|
|
|
1382 }
|
|
|
1383 if(buf == "-ql")
|
|
|
1384 {
|
|
|
1385 if(i < argc - 1)
|
|
|
1386 q_idlist = argv[++i];
|
|
|
1387 continue;
|
|
|
1388 }
|
|
|
1389 if(buf == "-imgm")
|
|
|
1390 {
|
|
|
1391 if(i < argc - 1)
|
|
|
1392 bg_for_img = argv[++i];
|
|
|
1393 continue;
|
|
|
1394 }
|
|
|
1395 if(buf == "-magic")
|
|
|
1396 {
|
|
|
1397 if(i < argc - 1)
|
|
|
1398 magic = argv[++i];
|
|
|
1399 continue;
|
|
|
1400 }*/
|
|
|
1401 }
|
|
|
1402
|
|
|
1403 if(RSIZE > MAX_REG_SIZE)
|
|
|
1404 {
|
|
|
1405 if(!QUIET) cerr << "\nRegion size exceeds " << MAX_REG_SIZE << "..." << endl;
|
|
|
1406 exit(1);
|
|
|
1407 }
|
|
|
1408
|
|
|
1409 if(RSIZE % W_SIZE != 0 || W_SIZE % 2 != 0 || RSIZE / W_SIZE < 5)
|
|
|
1410 {
|
|
|
1411 if(!QUIET) cerr << "\nWindow size must be even and both an exact divisor of region size and at least five times smaller of it: positional analysis disabled." << endl;
|
|
|
1412 POSITIONAL = false;
|
|
|
1413 }
|
|
|
1414
|
|
|
1415 return;
|
|
|
1416 }
|
|
|
1417
|
|
|
1418 //SEQUENCE CLASS BEGIN
|
|
|
1419
|
|
|
1420 sequence::sequence(string *seq, unsigned int R, string chr, unsigned int mp) //CONSTRUCTOR FOR 2BIT INPUT
|
|
|
1421 {
|
|
|
1422 Chr = chr;
|
|
|
1423 Strand = '+';
|
|
|
1424 AntiStrand = '-';
|
|
|
1425 E_Rank = R;
|
|
|
1426 good = true;
|
|
|
1427
|
|
|
1428 Start = mp - RSIZE/2;
|
|
|
1429 End = (mp + RSIZE/2) - 1;
|
|
|
1430
|
|
|
1431 for(unsigned int i = 0; i < seq->size(); i++)
|
|
|
1432 seq->at(i) = toupper(seq->at(i));
|
|
|
1433
|
|
|
1434 Seq = new char[(RSIZE*2) + 1 + MAX_L];
|
|
|
1435
|
|
|
1436 // if(!QUIET) cerr << "\nseq.size = " << seq->size() << endl;
|
|
|
1437 // if(!QUIET) cerr << "\narray.size = " << (RSIZE*2) + 1 + MAX_L << endl;
|
|
|
1438
|
|
|
1439 strcpy(Seq, seq->c_str());
|
|
|
1440
|
|
|
1441 if(seq->size() != RSIZE*2 + MAX_L + 1)
|
|
|
1442 {
|
|
|
1443 good = false;
|
|
|
1444 return;
|
|
|
1445 }
|
|
|
1446
|
|
|
1447 if(GET_SEQUENCES)
|
|
|
1448 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << endl << seq->substr(RSIZE/2, RSIZE) << endl;
|
|
|
1449
|
|
|
1450 return;
|
|
|
1451 }
|
|
|
1452
|
|
|
1453 sequence::sequence(string *line, ifstream *in, unsigned int R)
|
|
|
1454 {
|
|
|
1455 istringstream str(*line);
|
|
|
1456 string trash;
|
|
|
1457 int buf_s, buf_e, mp;
|
|
|
1458 good = true;
|
|
|
1459
|
|
|
1460 E_Rank = R;
|
|
|
1461
|
|
|
1462 str >> Chr >> buf_s >> buf_e >> Name >> trash >> Strand;
|
|
|
1463
|
|
|
1464 if(Strand == 'R')
|
|
|
1465 Strand = '-';
|
|
|
1466
|
|
|
1467 if(Strand != '+' && Strand != '-')
|
|
|
1468 Strand = '+';
|
|
|
1469
|
|
|
1470 if(Strand == '+')
|
|
|
1471 AntiStrand = '-';
|
|
|
1472 else
|
|
|
1473 AntiStrand = '+';
|
|
|
1474
|
|
|
1475 mp = buf_s + ((buf_e - buf_s)/2);
|
|
|
1476
|
|
|
1477 Start = mp - RSIZE/2;
|
|
|
1478 End = (mp + RSIZE/2) - 1;
|
|
|
1479
|
|
|
1480 // Seq = new char[RSIZE + 1];
|
|
|
1481 // UF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2];
|
|
|
1482 // DF_Seq = new char[(RSIZE/2) + 1 + MAX_L/2];
|
|
|
1483
|
|
|
1484 Seq = new char[(RSIZE*2) + 1 + MAX_L];
|
|
|
1485
|
|
|
1486 in->seekg((Start) - RSIZE/2);
|
|
|
1487
|
|
|
1488 in->get(Seq, (RSIZE*2 + 1) + MAX_L);
|
|
|
1489
|
|
|
1490 if(in->gcount() != RSIZE*2 + MAX_L)
|
|
|
1491 {
|
|
|
1492 good = false;
|
|
|
1493 return;
|
|
|
1494 }
|
|
|
1495
|
|
|
1496 if(GET_SEQUENCES)
|
|
|
1497 {
|
|
|
1498 char *out_seq;
|
|
|
1499 out_seq = new char[RSIZE + 1];
|
|
|
1500
|
|
|
1501 in->seekg(Start);
|
|
|
1502 in->get(out_seq, (RSIZE + 1));
|
|
|
1503
|
|
|
1504 fasta_out << ">" << Chr << '\t' << Start + 1 << '\t' << End + 1 << '\t' << "---" << '\t' << buf_s << '\t' << buf_e << '\t' << endl << out_seq << endl;
|
|
|
1505
|
|
|
1506 delete[] out_seq;
|
|
|
1507 }
|
|
|
1508
|
|
|
1509 /*in->seekg(((Start - 1) - RSIZE/2) - (MAX_L/2));
|
|
|
1510
|
|
|
1511 in->get(UF_Seq, RSIZE/2 + 1 + MAX_L /2);
|
|
|
1512
|
|
|
1513 if(in->gcount() != RSIZE/2 + (MAX_L/2))
|
|
|
1514 {
|
|
|
1515 good = false;
|
|
|
1516 return;
|
|
|
1517 }
|
|
|
1518
|
|
|
1519 in->get(Seq, RSIZE + 1);
|
|
|
1520
|
|
|
1521 if(in->gcount() != RSIZE)
|
|
|
1522 {
|
|
|
1523 good = false;
|
|
|
1524 return;
|
|
|
1525 }
|
|
|
1526
|
|
|
1527 in->get(DF_Seq, RSIZE/2 + 1 + (MAX_L/2));
|
|
|
1528
|
|
|
1529 if(in->gcount() != RSIZE/2 + (MAX_L/2))
|
|
|
1530 good = false;
|
|
|
1531 */
|
|
|
1532
|
|
|
1533 return;
|
|
|
1534 }
|
|
|
1535
|
|
|
1536 unsigned int sequence::e_rank()
|
|
|
1537 {
|
|
|
1538 return E_Rank;
|
|
|
1539 }
|
|
|
1540
|
|
|
1541 string sequence::chr()
|
|
|
1542 {
|
|
|
1543 return Chr;
|
|
|
1544 }
|
|
|
1545
|
|
|
1546 /*string* sequence::seq()
|
|
|
1547 {
|
|
|
1548 return &Seq;
|
|
|
1549 }*/
|
|
|
1550
|
|
|
1551 char* sequence::seq()
|
|
|
1552 {
|
|
|
1553 return Seq;
|
|
|
1554 }
|
|
|
1555
|
|
|
1556 char sequence::strand()
|
|
|
1557 {
|
|
|
1558 return Strand;
|
|
|
1559 }
|
|
|
1560
|
|
|
1561 char sequence::antistrand()
|
|
|
1562 {
|
|
|
1563 return AntiStrand;
|
|
|
1564 }
|
|
|
1565
|
|
|
1566 /*char* sequence::uf_seq()
|
|
|
1567 {
|
|
|
1568 return UF_Seq;
|
|
|
1569 }
|
|
|
1570
|
|
|
1571 char* sequence::df_seq()
|
|
|
1572 {
|
|
|
1573 return DF_Seq;
|
|
|
1574 }
|
|
|
1575 */
|
|
|
1576 string sequence::name()
|
|
|
1577 {
|
|
|
1578 return Name;
|
|
|
1579 }
|
|
|
1580
|
|
|
1581 unsigned int sequence::start()
|
|
|
1582 {
|
|
|
1583 return Start;
|
|
|
1584 }
|
|
|
1585
|
|
|
1586 unsigned int sequence::end()
|
|
|
1587 {
|
|
|
1588 return End;
|
|
|
1589 }
|
|
|
1590
|
|
|
1591 bool sequence::is_good()
|
|
|
1592 {
|
|
|
1593 return good;
|
|
|
1594 }
|
|
|
1595
|
|
|
1596 // SEQUENCE CLASS END
|
|
|
1597
|
|
|
1598 // MATRIX CLASS BEGIN
|
|
|
1599
|
|
|
1600 matrix::matrix(vector<string>* mbuf, map<string, bg> *Bg)
|
|
|
1601 {
|
|
|
1602 string buf;
|
|
|
1603 istringstream bstr(mbuf->at(1)), nstr(mbuf->at(0));
|
|
|
1604 L = 0;
|
|
|
1605 R_Avg = 0;
|
|
|
1606 F_Avg = 0;
|
|
|
1607 R_Var = 0;
|
|
|
1608 F_Var = 0;
|
|
|
1609 bg_Avg = 0;
|
|
|
1610 bg_Var = 0;
|
|
|
1611
|
|
|
1612 BIN_NUM = RSIZE / W_STEP;
|
|
|
1613
|
|
|
1614 W_R_Avg.resize(BIN_NUM, 0);
|
|
|
1615 W_R_Var.resize(BIN_NUM, 0);
|
|
|
1616 W_R_pvalue.resize(BIN_NUM, 0);
|
|
|
1617 W_R_T_pvalue.resize(BIN_NUM,0);
|
|
|
1618 spearman = 42;
|
|
|
1619
|
|
|
1620 O_U = false;
|
|
|
1621 O_U_bg = false;
|
|
|
1622 W_O_U.resize(BIN_NUM, 0);
|
|
|
1623
|
|
|
1624 BG = false;
|
|
|
1625 good = true;
|
|
|
1626 norm_sec_step = false;
|
|
|
1627
|
|
|
1628 while(bstr >> buf)
|
|
|
1629 L++;
|
|
|
1630
|
|
|
1631 if(L > RSIZE/2)
|
|
|
1632 {
|
|
|
1633 good = false;
|
|
|
1634 return;
|
|
|
1635 }
|
|
|
1636
|
|
|
1637 if(L > MAX_L)
|
|
|
1638 MAX_L = L;
|
|
|
1639
|
|
|
1640 if(!mbuf->at(0).empty())
|
|
|
1641 {
|
|
|
1642 if(mbuf->at(0).at(0) != '>')
|
|
|
1643 {
|
|
|
1644 good = false;
|
|
|
1645 return;
|
|
|
1646 }
|
|
|
1647 }
|
|
|
1648 else
|
|
|
1649 {
|
|
|
1650 good = false;
|
|
|
1651 return;
|
|
|
1652 }
|
|
|
1653
|
|
|
1654 // Name = mbuf->at(0).substr(1);
|
|
|
1655 nstr >> Id >> Name;
|
|
|
1656
|
|
|
1657 Id = Id.substr(1);
|
|
|
1658
|
|
|
1659 if(!selected.empty())
|
|
|
1660 {
|
|
|
1661 // cerr << endl << selected << '\t' << Id << '\t' << Name << endl;
|
|
|
1662
|
|
|
1663 if(selected != Id && selected != Name)
|
|
|
1664 {
|
|
|
1665 good = false;
|
|
|
1666 return;
|
|
|
1667 }
|
|
|
1668 }
|
|
|
1669
|
|
|
1670 if(!Bg->empty())
|
|
|
1671 {
|
|
|
1672 map<string,bg>::iterator mi = Bg->find(Id);
|
|
|
1673
|
|
|
1674 if(mi == Bg->end())
|
|
|
1675 mi = Bg->find(Name);
|
|
|
1676
|
|
|
1677 if(mi != Bg->end())
|
|
|
1678 {
|
|
|
1679 bg_Avg = mi->second.Avg;
|
|
|
1680 bg_Var = mi->second.Var;
|
|
|
1681 bg_Size = mi->second.Size;
|
|
|
1682 BG = true;
|
|
|
1683 }
|
|
|
1684 else
|
|
|
1685 if(!QUIET) cerr << "\nWarning: no background for matrix: " << Id << endl;
|
|
|
1686 }
|
|
|
1687
|
|
|
1688 M = new double*[L];
|
|
|
1689 rc_M = new double*[L];
|
|
|
1690
|
|
|
1691 for(unsigned int i = 1; i < ALPHABET_SIZE + 1; i++)
|
|
|
1692 {
|
|
|
1693 istringstream str(mbuf->at(i));
|
|
|
1694
|
|
|
1695 unsigned int count = 0;
|
|
|
1696
|
|
|
1697 while(str >> buf)
|
|
|
1698 {
|
|
|
1699 if(i == 1)
|
|
|
1700 {
|
|
|
1701 M[count] = new double[ALPHABET_SIZE + 1];
|
|
|
1702 rc_M[count] = new double[ALPHABET_SIZE + 1];
|
|
|
1703 }
|
|
|
1704
|
|
|
1705 M[count][i-1] = atof(buf.c_str());
|
|
|
1706 count++;
|
|
|
1707
|
|
|
1708 if(count > L)
|
|
|
1709 {
|
|
|
1710 good = false;
|
|
|
1711 return;
|
|
|
1712 }
|
|
|
1713 }
|
|
|
1714
|
|
|
1715 if(count != L)
|
|
|
1716 {
|
|
|
1717 good = false;
|
|
|
1718 return;
|
|
|
1719 }
|
|
|
1720 }
|
|
|
1721
|
|
|
1722 normalizer();
|
|
|
1723 to_log();
|
|
|
1724 rev_c();
|
|
|
1725 avg();
|
|
|
1726 max_min();
|
|
|
1727 // set_top_pos();
|
|
|
1728
|
|
|
1729 if(min_score == max_score)
|
|
|
1730 good = false;
|
|
|
1731
|
|
|
1732 return;
|
|
|
1733 }
|
|
|
1734
|
|
|
1735 void matrix::display()
|
|
|
1736 {
|
|
|
1737 if(!QUIET) cerr << "\n>" << Name << '\t' << good << endl;
|
|
|
1738
|
|
|
1739 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
|
|
|
1740 {
|
|
|
1741 for(unsigned int j = 0; j < L; j++)
|
|
|
1742 if(!QUIET) cerr << M[j][i] << '\t';
|
|
|
1743
|
|
|
1744 if(!QUIET) cerr << endl;
|
|
|
1745 }
|
|
|
1746
|
|
|
1747 if(!QUIET) cerr << endl;
|
|
|
1748
|
|
|
1749 // for(unsigned int i = 0; i < TOP_P; i++)
|
|
|
1750 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << top_pos[i] + 1 << "-" << top_char[i] << '\t';
|
|
|
1751
|
|
|
1752 if(!QUIET) cerr << "\n\n>" << Name << '\t' << good << '\t' << "RC" << endl;
|
|
|
1753
|
|
|
1754 for(unsigned int i = 0; i < ALPHABET_SIZE; i++)
|
|
|
1755 {
|
|
|
1756 for(unsigned int j = 0; j < L; j++)
|
|
|
1757 if(!QUIET) cerr << rc_M[j][i] << '\t';
|
|
|
1758
|
|
|
1759 if(!QUIET) cerr << endl;
|
|
|
1760 }
|
|
|
1761
|
|
|
1762 if(!QUIET) cerr << endl;
|
|
|
1763
|
|
|
1764 // for(unsigned int i = 0; i < TOP_P; i++)
|
|
|
1765 // if(!QUIET) cerr << "TOP[" << i+1 << "] = " << rc_top_pos[i] + 1 << "-" << rc_top_char[i] << '\t';
|
|
|
1766
|
|
|
1767 if(!QUIET) cerr << endl;
|
|
|
1768
|
|
|
1769 return;
|
|
|
1770 }
|
|
|
1771
|
|
|
1772 bool matrix::is_good()
|
|
|
1773 {
|
|
|
1774 return good;
|
|
|
1775 }
|
|
|
1776
|
|
|
1777
|
|
|
1778 void matrix::normalizer()
|
|
|
1779 {
|
|
|
1780 double *sums = new double[L];
|
|
|
1781
|
|
|
1782 for(int x = 0; x < L; x++)
|
|
|
1783 {
|
|
|
1784 sums[x] = 0;
|
|
|
1785
|
|
|
1786 for(int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1787 sums[x] += M[x][y];
|
|
|
1788 }
|
|
|
1789
|
|
|
1790 for(int x = 0; x < L; x++)
|
|
|
1791 for(int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1792 M[x][y] /= sums[x];
|
|
|
1793
|
|
|
1794 delete[] sums;
|
|
|
1795
|
|
|
1796 if(!norm_sec_step)
|
|
|
1797 {
|
|
|
1798 for(int x = 0; x < L; x++)
|
|
|
1799 for(int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1800 M[x][y] += MIN_ADD_FREQ;
|
|
|
1801
|
|
|
1802 norm_sec_step = true;
|
|
|
1803
|
|
|
1804 normalizer();
|
|
|
1805 }
|
|
|
1806
|
|
|
1807 return;
|
|
|
1808
|
|
|
1809 }
|
|
|
1810
|
|
|
1811 void matrix::to_log()
|
|
|
1812 {
|
|
|
1813 for(int x = 0; x < L; x++)
|
|
|
1814 for(int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1815 M[x][y] = log(M[x][y]);
|
|
|
1816
|
|
|
1817 return;
|
|
|
1818 }
|
|
|
1819
|
|
|
1820 void matrix::avg()
|
|
|
1821 {
|
|
|
1822 // mat_avg = new double[L];
|
|
|
1823
|
|
|
1824 for(int x = 0; x < L; x++)
|
|
|
1825 {
|
|
|
1826 double buf = 0;
|
|
|
1827
|
|
|
1828 for(int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1829 buf += M[x][y];
|
|
|
1830
|
|
|
1831 // mat_avg[x] = buf/ALPHABET_SIZE;
|
|
|
1832 M[x][ALPHABET_SIZE] = buf/ALPHABET_SIZE;
|
|
|
1833 rc_M[(L-1) -x][ALPHABET_SIZE] = buf/ALPHABET_SIZE;
|
|
|
1834 }
|
|
|
1835
|
|
|
1836 return;
|
|
|
1837 }
|
|
|
1838
|
|
|
1839 void matrix::max_min()
|
|
|
1840 {
|
|
|
1841 double min, max;
|
|
|
1842
|
|
|
1843 max_score = 0;
|
|
|
1844 min_score = 0;
|
|
|
1845
|
|
|
1846 for(int x = 0; x < L; x++)
|
|
|
1847 {
|
|
|
1848 min = M[x][0];
|
|
|
1849 max = M[x][0];
|
|
|
1850
|
|
|
1851 for(int y = 1; y < ALPHABET_SIZE; y++)
|
|
|
1852 {
|
|
|
1853 if(M[x][y] < min)
|
|
|
1854 min = M[x][y];
|
|
|
1855
|
|
|
1856 if(M[x][y] > max)
|
|
|
1857 max = M[x][y];
|
|
|
1858 }
|
|
|
1859
|
|
|
1860 min_score += min;
|
|
|
1861 max_score += max;
|
|
|
1862 }
|
|
|
1863
|
|
|
1864 return;
|
|
|
1865 }
|
|
|
1866
|
|
|
1867 void matrix::rev_c()
|
|
|
1868 {
|
|
|
1869 for(unsigned int x = 0; x < L; x++)
|
|
|
1870 for(unsigned int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
1871 rc_M[(L - 1) - x][(ALPHABET_SIZE - 1) - y] = M[x][y];
|
|
|
1872
|
|
|
1873 return;
|
|
|
1874 }
|
|
|
1875
|
|
|
1876 void matrix::scan(vector<sequence> *Seq)
|
|
|
1877 {
|
|
|
1878 v_score.reserve(Seq->size());
|
|
|
1879 vF_score.reserve(Seq->size());
|
|
|
1880 v_pos.reserve(Seq->size());
|
|
|
1881 v_strand.reserve(Seq->size());
|
|
|
1882
|
|
|
1883 if(POSITIONAL)
|
|
|
1884 W_R_v_score.reserve(Seq->size());
|
|
|
1885
|
|
|
1886 unsigned int progress;
|
|
|
1887
|
|
|
1888 if(!SINGLE_STRAND)
|
|
|
1889 {
|
|
|
1890 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
1891 {
|
|
|
1892 if(VERBOSE)
|
|
|
1893 {
|
|
|
1894 progress = (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100);
|
|
|
1895
|
|
|
1896 if(!(progress%5))
|
|
|
1897 if(!QUIET) cerr << progress << '%';
|
|
|
1898 }
|
|
|
1899
|
|
|
1900 double max_flank_u, max_flank_d;
|
|
|
1901 vector<double> W_buf(BIN_NUM, 0);
|
|
|
1902
|
|
|
1903 {
|
|
|
1904 double r_score;
|
|
|
1905
|
|
|
1906 max_flank_u = scan_seq_ds(Seq->at(i).seq() , true, RSIZE/2, &W_buf);
|
|
|
1907 r_score = scan_seq_ds(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, &W_buf);
|
|
|
1908 max_flank_d = scan_seq_ds(Seq->at(i).seq() + ((RSIZE/2)*3) , true, RSIZE/2, &W_buf);
|
|
|
1909
|
|
|
1910 R_Avg += r_score;
|
|
|
1911
|
|
|
1912 /* if(r_score < max_flank_u || r_score < max_flank_d)
|
|
|
1913 {
|
|
|
1914 if(max_flank_u > max_flank_d)
|
|
|
1915 G_Avg += max_flank_u;
|
|
|
1916 else
|
|
|
1917 G_Avg += max_flank_d;
|
|
|
1918 }
|
|
|
1919 else
|
|
|
1920 G_Avg += r_score;*/
|
|
|
1921
|
|
|
1922 if(POSITIONAL)
|
|
|
1923 {
|
|
|
1924 // for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
1925 // W_R_Avg[i] += W_buf[i];
|
|
|
1926
|
|
|
1927 W_R_v_score.push_back(W_buf);
|
|
|
1928 }
|
|
|
1929
|
|
|
1930 }
|
|
|
1931
|
|
|
1932 if(max_flank_u > max_flank_d)
|
|
|
1933 {
|
|
|
1934 F_Avg += max_flank_u;
|
|
|
1935 vF_score.push_back(max_flank_u);
|
|
|
1936 }
|
|
|
1937 else
|
|
|
1938 {
|
|
|
1939 F_Avg += max_flank_d;
|
|
|
1940 vF_score.push_back(max_flank_d);
|
|
|
1941 }
|
|
|
1942
|
|
|
1943
|
|
|
1944 if(VERBOSE)
|
|
|
1945 if(!(progress%5))
|
|
|
1946 if(!QUIET) cerr << (char)13;
|
|
|
1947 }
|
|
|
1948 }
|
|
|
1949 else
|
|
|
1950 {
|
|
|
1951 for(unsigned int i = 0; i < Seq->size(); i++)
|
|
|
1952 {
|
|
|
1953
|
|
|
1954 if(VERBOSE)
|
|
|
1955 if(!QUIET) cerr << (unsigned int)(((double)(i+1)/(double)Seq->size()) * 100) << '%';
|
|
|
1956
|
|
|
1957 double max_flank_u, max_flank_d;
|
|
|
1958 vector<double> W_buf(BIN_NUM, 0);
|
|
|
1959
|
|
|
1960 {
|
|
|
1961
|
|
|
1962 double r_score;
|
|
|
1963
|
|
|
1964 max_flank_u = scan_seq_ss(Seq->at(i).seq() , true, RSIZE/2, Seq->at(i).strand(), &W_buf);
|
|
|
1965 r_score += scan_seq_ss(Seq->at(i).seq() + (RSIZE/2) , false, RSIZE, Seq->at(i).strand(), &W_buf);
|
|
|
1966 max_flank_d = scan_seq_ss(Seq->at(i).seq() + ((RSIZE/2)*3), true, RSIZE/2, Seq->at(i).strand(), &W_buf);
|
|
|
1967
|
|
|
1968 R_Avg += r_score;
|
|
|
1969
|
|
|
1970 /* if(r_score < max_flank_u || r_score < max_flank_d)
|
|
|
1971 {
|
|
|
1972 if(max_flank_u > max_flank_d)
|
|
|
1973 G_Avg += max_flank_u;
|
|
|
1974 else
|
|
|
1975 G_Avg += max_flank_d;
|
|
|
1976 }
|
|
|
1977 else
|
|
|
1978 G_Avg += r_score;*/
|
|
|
1979
|
|
|
1980 if(POSITIONAL)
|
|
|
1981 {
|
|
|
1982 // for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
1983 // W_R_Avg[i] += W_buf[i];
|
|
|
1984
|
|
|
1985 W_R_v_score.push_back(W_buf);
|
|
|
1986 }
|
|
|
1987 }
|
|
|
1988
|
|
|
1989 if(max_flank_u > max_flank_d)
|
|
|
1990 {
|
|
|
1991 F_Avg += max_flank_u;
|
|
|
1992 vF_score.push_back(max_flank_u);
|
|
|
1993 }
|
|
|
1994 else
|
|
|
1995 {
|
|
|
1996 F_Avg += max_flank_d;
|
|
|
1997 vF_score.push_back(max_flank_d);
|
|
|
1998 }
|
|
|
1999
|
|
|
2000 if(VERBOSE)
|
|
|
2001 if(!QUIET) cerr << (char)13;
|
|
|
2002 }
|
|
|
2003 }
|
|
|
2004
|
|
|
2005 R_Avg /= Seq->size();
|
|
|
2006 F_Avg /= Seq->size();
|
|
|
2007 // G_Avg /= Seq->size();
|
|
|
2008
|
|
|
2009 // for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
2010 // W_R_Avg[i] /= Seq->size();
|
|
|
2011
|
|
|
2012 if(SPEARMAN)
|
|
|
2013 {
|
|
|
2014 multimap<double, unsigned int> SRM;
|
|
|
2015
|
|
|
2016 for(unsigned int i = 0; i < v_score.size(); i++)
|
|
|
2017 SRM.insert(make_pair(v_score[i], i));
|
|
|
2018
|
|
|
2019 multimap<double, unsigned int>::reverse_iterator mmi = SRM.rbegin();
|
|
|
2020 int sr = 0;
|
|
|
2021 double d = 0;
|
|
|
2022
|
|
|
2023 while(mmi != SRM.rend())
|
|
|
2024 {
|
|
|
2025 // if(!QUIET) cerr << endl << mmi->first << '\t' << sr << '\t' << Seq->at(mmi->second).e_rank() << '\t' << this->name();
|
|
|
2026
|
|
|
2027 d += pow((sr - (int)Seq->at(mmi->second).e_rank()), 2);
|
|
|
2028 sr++;
|
|
|
2029 mmi++;
|
|
|
2030 }
|
|
|
2031
|
|
|
2032 d *= 6;
|
|
|
2033
|
|
|
2034 // if(!QUIET) cerr << endl << "d = " << d << "\tden = " << (Seq->size() * (pow(Seq->size(), 2) - 1)) << '\t' << "ssize = " << Seq->size();
|
|
|
2035
|
|
|
2036 d /= (Seq->size() * (pow(Seq->size(), 2) - 1));
|
|
|
2037
|
|
|
2038 spearman = 1 - d;
|
|
|
2039 }
|
|
|
2040
|
|
|
2041 return;
|
|
|
2042 }
|
|
|
2043
|
|
|
2044 void matrix::calc_var(double S)
|
|
|
2045 {
|
|
|
2046 for(unsigned int i = 0; i < S; i++)
|
|
|
2047 {
|
|
|
2048 R_Var += pow(v_score[i] - R_Avg,2);
|
|
|
2049 F_Var += pow(vF_score[i] - F_Avg,2);
|
|
|
2050
|
|
|
2051 /* if(v_score[i] >= vF_score[i])
|
|
|
2052 G_Var += pow(v_score[i] - G_Avg,2);
|
|
|
2053 else
|
|
|
2054 G_Var += pow(vF_score[i] - G_Avg,2);*/
|
|
|
2055 }
|
|
|
2056
|
|
|
2057 if(S > 1)
|
|
|
2058 {
|
|
|
2059 R_Var /= (S - 1);
|
|
|
2060 F_Var /= (S - 1);
|
|
|
2061 // G_Var /= (S - 1);
|
|
|
2062 }
|
|
|
2063
|
|
|
2064 if(R_Var == 0)
|
|
|
2065 R_Var += EPSILON;
|
|
|
2066
|
|
|
2067 if(F_Var == 0)
|
|
|
2068 F_Var += EPSILON;
|
|
|
2069
|
|
|
2070 // if(G_Var == 0)
|
|
|
2071 // G_Var += EPSILON;
|
|
|
2072
|
|
|
2073 return;
|
|
|
2074 }
|
|
|
2075
|
|
|
2076 void matrix::calc_W_param(double S)
|
|
|
2077 {
|
|
|
2078 if(S > 1)
|
|
|
2079 S = S/2; //SOLO PER LA PRIMA META' DELLE SEQUENZE!!!
|
|
|
2080
|
|
|
2081 W_Avg = 0;
|
|
|
2082 W_Var = 0;
|
|
|
2083
|
|
|
2084 multimap<double, unsigned int> score_seq_map;
|
|
|
2085
|
|
|
2086 for(unsigned int i = 0; i < v_score.size(); i++)
|
|
|
2087 score_seq_map.insert(make_pair(v_score.at(i), i));
|
|
|
2088
|
|
|
2089 multimap<double, unsigned int>::reverse_iterator mi = score_seq_map.rbegin();
|
|
|
2090
|
|
|
2091 for(unsigned int i = 0; i < S; i++)
|
|
|
2092 {
|
|
|
2093 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2094 W_R_Avg[j] += W_R_v_score[mi->second][j];
|
|
|
2095
|
|
|
2096 // if(!QUIET) cerr << Name << '\t' << mi->first << endl;
|
|
|
2097
|
|
|
2098 mi++;
|
|
|
2099 }
|
|
|
2100
|
|
|
2101 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2102 W_R_Avg[j] /= S;
|
|
|
2103
|
|
|
2104 mi = score_seq_map.rbegin();
|
|
|
2105
|
|
|
2106 for(unsigned int i = 0; i < S; i++)
|
|
|
2107 {
|
|
|
2108 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2109 W_R_Var[j] += pow(W_R_v_score[mi->second][j] - W_R_Avg[j],2);
|
|
|
2110
|
|
|
2111 mi++;
|
|
|
2112 }
|
|
|
2113
|
|
|
2114 /* for(unsigned int i = 0; i < S; i++)
|
|
|
2115 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2116 W_R_Var[j] += pow(W_R_v_score[i][j] - W_R_Avg[j],2);*/
|
|
|
2117
|
|
|
2118 if(S > 1)
|
|
|
2119 {
|
|
|
2120 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2121 W_R_Var[j] /= (S - 1);
|
|
|
2122 }
|
|
|
2123
|
|
|
2124 for(unsigned int j = 0; j < BIN_NUM; j++)
|
|
|
2125 if(W_R_Var[j] == 0)
|
|
|
2126 W_R_Var[j] += EPSILON;
|
|
|
2127
|
|
|
2128 for(unsigned int j = 0; j < BIN_NUM - 1; j++) //THE LAST BIN IS WASTED SINCE IT IS AN HALF BIN
|
|
|
2129 W_Avg += W_R_Avg[j];
|
|
|
2130
|
|
|
2131 W_Avg /= (BIN_NUM - 1);
|
|
|
2132
|
|
|
2133 for(unsigned int j = 0; j < BIN_NUM - 1; j++)
|
|
|
2134 W_Var += pow(W_R_Avg[j] - W_Avg, 2);
|
|
|
2135
|
|
|
2136 W_Var /= (BIN_NUM - 2);
|
|
|
2137
|
|
|
2138 return;
|
|
|
2139 }
|
|
|
2140
|
|
|
2141 void matrix::welch_test(double S)
|
|
|
2142 {
|
|
|
2143 t = R_Avg - F_Avg;
|
|
|
2144
|
|
|
2145 if(t == 0)
|
|
|
2146 t = EPSILON;
|
|
|
2147
|
|
|
2148 t /= pow((R_Var/S) + (F_Var/S),0.5);
|
|
|
2149
|
|
|
2150 dof = pow((R_Var/S) + (F_Var/S),2);
|
|
|
2151
|
|
|
2152 dof /= (pow(R_Var,2)/((pow(S,2)) * (S-1))) + (pow(F_Var,2)/((pow(S,2)) * (S-1)));
|
|
|
2153
|
|
|
2154 if(t > 0)
|
|
|
2155 pvalue = gsl_cdf_tdist_Q(t,dof);
|
|
|
2156 else
|
|
|
2157 {
|
|
|
2158 pvalue = gsl_cdf_tdist_P(t,dof);
|
|
|
2159 O_U = true;
|
|
|
2160 }
|
|
|
2161
|
|
|
2162 return;
|
|
|
2163 }
|
|
|
2164
|
|
|
2165 void matrix::z_test_pos(double S)
|
|
|
2166 {
|
|
|
2167 // if(!QUIET) cerr << endl << Name;
|
|
|
2168
|
|
|
2169 for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
2170 {
|
|
|
2171 double se = sqrt(W_R_Var.at(i)), z;
|
|
|
2172 se /= sqrt(S/2);
|
|
|
2173
|
|
|
2174 z = W_R_Avg.at(i) - W_Avg;
|
|
|
2175 z /=se;
|
|
|
2176
|
|
|
2177 if(z >= 0)
|
|
|
2178 W_O_U.at(i) = true;
|
|
|
2179
|
|
|
2180 W_R_pvalue.at(i) = gsl_cdf_ugaussian_Q(z);
|
|
|
2181
|
|
|
2182 // if(!QUIET) cerr << endl << "W_R_Avg.at(i) " << W_R_Avg.at(i) << '\t' << W_Avg << '\t' << W_R_pvalue.at(i) << endl;
|
|
|
2183
|
|
|
2184 }
|
|
|
2185
|
|
|
2186 return;
|
|
|
2187 }
|
|
|
2188
|
|
|
2189 void matrix::welch_test_pos(double S)
|
|
|
2190 {
|
|
|
2191 if(S > 1)
|
|
|
2192 S = S/2; //SOLO LA PRIMA META' DELLE SEQUENZE!!!
|
|
|
2193
|
|
|
2194 // if(!QUIET) cerr << "\nRAVG = " << W_Avg << endl;
|
|
|
2195 for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
2196 {
|
|
|
2197 double T, DOF;
|
|
|
2198
|
|
|
2199 T = W_R_Avg.at(i) - W_Avg;
|
|
|
2200
|
|
|
2201 // if(!QUIET) cerr << W_R_Avg.at(i) << '\t';
|
|
|
2202
|
|
|
2203 // if(!QUIET) cerr << endl << Name << endl << "T = " << T << '\t' << W_R_Avg.at(i) << '\t' << W_Avg;
|
|
|
2204
|
|
|
2205 if(T == 0)
|
|
|
2206 T = EPSILON;
|
|
|
2207
|
|
|
2208 T /= pow((W_R_Var.at(i)/S) + (W_Var/S),0.5);
|
|
|
2209
|
|
|
2210 DOF = pow((W_R_Var.at(i)/S) + (W_Var/S),2);
|
|
|
2211
|
|
|
2212 DOF /= (pow(W_R_Var.at(i),2)/((pow(S,2)) * (S-1))) + (pow(W_Var,2)/((pow(S,2)) * (S-1)));
|
|
|
2213
|
|
|
2214 if(T > 0)
|
|
|
2215 W_R_T_pvalue.at(i) = gsl_cdf_tdist_Q(T,DOF);
|
|
|
2216 else
|
|
|
2217 {
|
|
|
2218 W_R_T_pvalue.at(i) = gsl_cdf_tdist_P(T,DOF);
|
|
|
2219 W_O_U.at(i) = true;
|
|
|
2220 }
|
|
|
2221
|
|
|
2222 // if(!QUIET) cerr << "\tPV= " << W_R_pvalue.at(i);
|
|
|
2223
|
|
|
2224 }
|
|
|
2225
|
|
|
2226 // if(!QUIET) cerr << endl;
|
|
|
2227
|
|
|
2228 return;
|
|
|
2229 }
|
|
|
2230
|
|
|
2231 void matrix::welch_test_bg(double S1)
|
|
|
2232 {
|
|
|
2233 double S2 = bg_Size;
|
|
|
2234
|
|
|
2235 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl;
|
|
|
2236
|
|
|
2237 t_bg = R_Avg - bg_Avg;
|
|
|
2238
|
|
|
2239 if(t_bg == 0)
|
|
|
2240 t_bg = EPSILON;
|
|
|
2241
|
|
|
2242 t_bg /= pow((R_Var/S1) + (bg_Var/S2),0.5);
|
|
|
2243
|
|
|
2244 dof_bg = pow((R_Var/S1) + (bg_Var/S2),2);
|
|
|
2245
|
|
|
2246 dof_bg /= (pow(R_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1)));
|
|
|
2247
|
|
|
2248 if(t_bg > 0)
|
|
|
2249 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg);
|
|
|
2250 else
|
|
|
2251 {
|
|
|
2252 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg);
|
|
|
2253 O_U_bg = true;
|
|
|
2254 }
|
|
|
2255
|
|
|
2256 return;
|
|
|
2257 }
|
|
|
2258
|
|
|
2259 /*void matrix::welch_test_bg_G(double S1)
|
|
|
2260 {
|
|
|
2261 double S2 = bg_Size;
|
|
|
2262
|
|
|
2263 // if(!QUIET) cerr << Name << '\t' << R_Avg << '\t' << bg_Avg << '\t' << R_Var << '\t' << bg_Var << '\t' << S1 << '\t' << S2 << endl;
|
|
|
2264
|
|
|
2265 t_bg = G_Avg - bg_Avg;
|
|
|
2266
|
|
|
2267 if(t_bg == 0)
|
|
|
2268 t_bg = EPSILON;
|
|
|
2269
|
|
|
2270 t_bg /= pow((G_Var/S1) + (bg_Var/S2),0.5);
|
|
|
2271
|
|
|
2272 dof_bg = pow((G_Var/S1) + (bg_Var/S2),2);
|
|
|
2273
|
|
|
2274 dof_bg /= (pow(G_Var,2)/((pow(S1,2)) * (S1-1))) + (pow(bg_Var,2)/((pow(S2,2)) * (S2-1)));
|
|
|
2275
|
|
|
2276 if(t_bg > 0)
|
|
|
2277 pvalue_bg = gsl_cdf_tdist_Q(t_bg,dof_bg);
|
|
|
2278 else
|
|
|
2279 {
|
|
|
2280 pvalue_bg = gsl_cdf_tdist_P(t_bg,dof_bg);
|
|
|
2281 O_U_bg = true;
|
|
|
2282 }
|
|
|
2283
|
|
|
2284 return;
|
|
|
2285 }*/
|
|
|
2286
|
|
|
2287 double matrix::scan_seq_ds(char *S, bool flank, unsigned int l, vector<double> *W_buf)
|
|
|
2288 {
|
|
|
2289 unsigned int pos = 0, max_pos = 0;
|
|
|
2290 double seq_score;
|
|
|
2291 bool strand = false;
|
|
|
2292
|
|
|
2293 l -= L/2;
|
|
|
2294
|
|
|
2295 while(pos < l)
|
|
|
2296 {
|
|
|
2297
|
|
|
2298 double pos_score[2] = {0,0};
|
|
|
2299
|
|
|
2300 for(unsigned int j = 0; j < L; j++)
|
|
|
2301 {
|
|
|
2302 pos_score[0] += get_M_value(j, S[pos+j]);
|
|
|
2303 pos_score[1] += get_rc_M_value(j, S[pos+j]);
|
|
|
2304 }
|
|
|
2305
|
|
|
2306 if(seq_score < pos_score[0] || !pos)
|
|
|
2307 {
|
|
|
2308 seq_score = pos_score[0];
|
|
|
2309 max_pos = pos;
|
|
|
2310 strand = false;
|
|
|
2311 }
|
|
|
2312 if(seq_score < pos_score[1])
|
|
|
2313 {
|
|
|
2314 seq_score = pos_score[1];
|
|
|
2315 max_pos = pos;
|
|
|
2316 strand = true;
|
|
|
2317 }
|
|
|
2318
|
|
|
2319 if(!flank && POSITIONAL)
|
|
|
2320 {
|
|
|
2321 unsigned int vpos = pos / W_STEP;
|
|
|
2322
|
|
|
2323 if(W_buf->at(vpos) < pos_score[0] || !(pos % W_STEP))
|
|
|
2324 W_buf->at(vpos) = pos_score[0];
|
|
|
2325
|
|
|
2326 if(W_buf->at(vpos) < pos_score[1])
|
|
|
2327 W_buf->at(vpos) = pos_score[1];
|
|
|
2328
|
|
|
2329 if(vpos)
|
|
|
2330 {
|
|
|
2331 if(W_buf->at(vpos - 1) < pos_score[0])
|
|
|
2332 W_buf->at(vpos - 1) = pos_score[0];
|
|
|
2333
|
|
|
2334 if(W_buf->at(vpos - 1) < pos_score[1])
|
|
|
2335 W_buf->at(vpos - 1) = pos_score[1];
|
|
|
2336 }
|
|
|
2337 }
|
|
|
2338
|
|
|
2339 pos++;
|
|
|
2340 }
|
|
|
2341
|
|
|
2342 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2343
|
|
|
2344 if(!flank && POSITIONAL)
|
|
|
2345 {
|
|
|
2346 // if(!QUIET) cerr << endl << Name << endl;
|
|
|
2347
|
|
|
2348 unsigned int binm = 0;
|
|
|
2349
|
|
|
2350 for(unsigned int i = 0; i < BIN_NUM; i++)
|
|
|
2351 {
|
|
|
2352 if(W_buf->at(i) != 0)
|
|
|
2353 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score);
|
|
|
2354 else
|
|
|
2355 binm++;
|
|
|
2356 }
|
|
|
2357
|
|
|
2358 BIN_NUM -= binm;
|
|
|
2359 }
|
|
|
2360
|
|
|
2361 if(!flank)
|
|
|
2362 {
|
|
|
2363 v_score.push_back(seq_score);
|
|
|
2364 v_pos.push_back(max_pos + L/2);
|
|
|
2365 v_strand.push_back(strand);
|
|
|
2366 }
|
|
|
2367
|
|
|
2368 return seq_score;
|
|
|
2369 }
|
|
|
2370
|
|
|
2371 double matrix::scan_seq_ss_second_best(char *S, unsigned int l, unsigned int *P, bool *s, unsigned int POS, bool seq_strand)
|
|
|
2372 {
|
|
|
2373 unsigned int pos = 0, max_pos = 0;
|
|
|
2374 double seq_score = min_score;
|
|
|
2375 bool strand = false;
|
|
|
2376
|
|
|
2377 while(pos < l)
|
|
|
2378 {
|
|
|
2379 if(pos == POS)
|
|
|
2380 {
|
|
|
2381 pos++;
|
|
|
2382 continue;
|
|
|
2383 }
|
|
|
2384
|
|
|
2385 double pos_score[2] = {0,0};
|
|
|
2386
|
|
|
2387 if(seq_strand == '+')
|
|
|
2388 {
|
|
|
2389 for(unsigned int j = 0; j < L; j++)
|
|
|
2390 pos_score[0] += get_M_value(j, S[pos+j]);
|
|
|
2391 }
|
|
|
2392 else
|
|
|
2393 {
|
|
|
2394 for(unsigned int j = 0; j < L; j++)
|
|
|
2395 pos_score[0] += get_rc_M_value(j, S[pos+j]);
|
|
|
2396 }
|
|
|
2397
|
|
|
2398 if(seq_score < pos_score[0])
|
|
|
2399 {
|
|
|
2400 seq_score = pos_score[0];
|
|
|
2401 max_pos = pos;
|
|
|
2402 strand = false;
|
|
|
2403 }
|
|
|
2404
|
|
|
2405
|
|
|
2406 pos++;
|
|
|
2407 }
|
|
|
2408
|
|
|
2409 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2410
|
|
|
2411 *P = max_pos;
|
|
|
2412 *s = strand;
|
|
|
2413
|
|
|
2414 return seq_score;
|
|
|
2415 }
|
|
|
2416
|
|
|
2417 double matrix::scan_seq_ds_second_best(char *S, unsigned int l, unsigned int *P, bool *s, int POS)
|
|
|
2418 {
|
|
|
2419 int pos = 0, max_pos = 0, count = 0;
|
|
|
2420 double seq_score;
|
|
|
2421 bool strand = false;
|
|
|
2422
|
|
|
2423 POS -= L/2;
|
|
|
2424 l -= L/2;
|
|
|
2425
|
|
|
2426 while(pos < l)
|
|
|
2427 {
|
|
|
2428 // if(pos == POS)
|
|
|
2429 if(pos >= (POS - L) && pos <= (POS + L))
|
|
|
2430 {
|
|
|
2431 pos++;
|
|
|
2432 continue;
|
|
|
2433 }
|
|
|
2434
|
|
|
2435 double pos_score[2] = {0,0};
|
|
|
2436
|
|
|
2437 for(unsigned int j = 0; j < L; j++)
|
|
|
2438 {
|
|
|
2439 pos_score[0] += get_M_value(j, S[pos+j]);
|
|
|
2440 pos_score[1] += get_rc_M_value(j, S[pos+j]);
|
|
|
2441 }
|
|
|
2442
|
|
|
2443 if(seq_score < pos_score[0] || !count)
|
|
|
2444 {
|
|
|
2445 seq_score = pos_score[0];
|
|
|
2446 max_pos = pos;
|
|
|
2447 strand = false;
|
|
|
2448 }
|
|
|
2449 if(seq_score < pos_score[1])
|
|
|
2450 {
|
|
|
2451 seq_score = pos_score[1];
|
|
|
2452 max_pos = pos;
|
|
|
2453 strand = true;
|
|
|
2454 }
|
|
|
2455
|
|
|
2456 pos++;
|
|
|
2457 count++;
|
|
|
2458 }
|
|
|
2459
|
|
|
2460 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2461
|
|
|
2462 *P = max_pos ;//+ L/2;
|
|
|
2463 *s = strand;
|
|
|
2464
|
|
|
2465 return seq_score;
|
|
|
2466 }
|
|
|
2467
|
|
|
2468
|
|
|
2469
|
|
|
2470 /*double matrix::scan_seq_ds_quick(char *S, bool flank, unsigned int l)
|
|
|
2471 {
|
|
|
2472 unsigned int pos = 0, max_pos = 0;
|
|
|
2473 double seq_score = min_score;
|
|
|
2474 bool strand = false;
|
|
|
2475
|
|
|
2476 while(pos < l)
|
|
|
2477 {
|
|
|
2478 double pos_score[2] = {0,0};
|
|
|
2479
|
|
|
2480 for(unsigned int j = 0; j < L; j++)
|
|
|
2481 {
|
|
|
2482 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]);
|
|
|
2483
|
|
|
2484 if((pos_score[0] + p_max[j+1]) <= seq_score)
|
|
|
2485 {
|
|
|
2486 pos_score[0] = min_score;
|
|
|
2487 break;
|
|
|
2488 }
|
|
|
2489
|
|
|
2490 }
|
|
|
2491
|
|
|
2492 for(unsigned int j = 0; j < L; j++)
|
|
|
2493 {
|
|
|
2494 pos_score[1] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]);
|
|
|
2495
|
|
|
2496 if((pos_score[1] + p_max[j+1]) <= seq_score)
|
|
|
2497 {
|
|
|
2498 pos_score[1] = min_score;
|
|
|
2499 break;
|
|
|
2500 }
|
|
|
2501 }
|
|
|
2502
|
|
|
2503 if(seq_score < pos_score[0])
|
|
|
2504 {
|
|
|
2505 seq_score = pos_score[0];
|
|
|
2506 max_pos = pos;
|
|
|
2507 strand = false;
|
|
|
2508 }
|
|
|
2509 if(seq_score < pos_score[1])
|
|
|
2510 {
|
|
|
2511 seq_score = pos_score[1];
|
|
|
2512 max_pos = pos;
|
|
|
2513 strand = true;
|
|
|
2514 }
|
|
|
2515
|
|
|
2516 pos++;
|
|
|
2517 }
|
|
|
2518
|
|
|
2519 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2520
|
|
|
2521 if(!flank)
|
|
|
2522 {
|
|
|
2523 v_score.push_back(seq_score);
|
|
|
2524 v_pos.push_back(max_pos);
|
|
|
2525 v_strand.push_back(strand);
|
|
|
2526 }
|
|
|
2527
|
|
|
2528
|
|
|
2529 return seq_score;
|
|
|
2530 }
|
|
|
2531 */
|
|
|
2532 double matrix::scan_seq_ss(char *S, bool flank, unsigned int l, char seq_strand, vector<double> *W_buf)
|
|
|
2533 {
|
|
|
2534 unsigned int pos = 0, max_pos = 0;
|
|
|
2535 double seq_score = min_score;
|
|
|
2536 bool strand = false;
|
|
|
2537
|
|
|
2538 while(pos < l)
|
|
|
2539 {
|
|
|
2540 double pos_score[2] = {0,0};
|
|
|
2541
|
|
|
2542 if(seq_strand == '+')
|
|
|
2543 {
|
|
|
2544 for(unsigned int j = 0; j < L; j++)
|
|
|
2545 pos_score[0] += get_M_value(j, S[pos+j]);
|
|
|
2546 }
|
|
|
2547 else
|
|
|
2548 {
|
|
|
2549 for(unsigned int j = 0; j < L; j++)
|
|
|
2550 pos_score[0] += get_rc_M_value(j, S[pos+j]);
|
|
|
2551 }
|
|
|
2552
|
|
|
2553 if(seq_score < pos_score[0])
|
|
|
2554 {
|
|
|
2555 seq_score = pos_score[0];
|
|
|
2556 max_pos = pos;
|
|
|
2557 strand = false;
|
|
|
2558 }
|
|
|
2559
|
|
|
2560 if(!flank && POSITIONAL)
|
|
|
2561 {
|
|
|
2562 unsigned int vpos = pos / W_STEP;
|
|
|
2563
|
|
|
2564 if(W_buf->at(vpos) < pos_score[0])
|
|
|
2565 W_buf->at(vpos) = pos_score[0];
|
|
|
2566
|
|
|
2567 if(vpos)
|
|
|
2568 {
|
|
|
2569 if(W_buf->at(vpos - 1) < pos_score[0])
|
|
|
2570 W_buf->at(vpos - 1) = pos_score[0];
|
|
|
2571 }
|
|
|
2572 }
|
|
|
2573
|
|
|
2574 pos++;
|
|
|
2575 }
|
|
|
2576
|
|
|
2577 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2578
|
|
|
2579 if(POSITIONAL && !flank)
|
|
|
2580 for(unsigned int i = 0; i < W_buf->size(); i++)
|
|
|
2581 W_buf->at(i) = 1 + (W_buf->at(i) - max_score)/(max_score - min_score);
|
|
|
2582
|
|
|
2583 if(!flank)
|
|
|
2584 {
|
|
|
2585 v_score.push_back(seq_score);
|
|
|
2586 v_pos.push_back(max_pos);
|
|
|
2587 v_strand.push_back(strand);
|
|
|
2588 }
|
|
|
2589
|
|
|
2590 return seq_score;
|
|
|
2591 }
|
|
|
2592
|
|
|
2593 /* double matrix::scan_seq_ss_quick(char *S, bool flank, unsigned int l, char seq_strand)
|
|
|
2594 {
|
|
|
2595 unsigned int pos = 0, max_pos = 0;
|
|
|
2596 double seq_score = min_score;
|
|
|
2597 bool strand = false;
|
|
|
2598
|
|
|
2599 while(pos < l)
|
|
|
2600 {
|
|
|
2601 double pos_score[2] = {0,0};
|
|
|
2602
|
|
|
2603 if(seq_strand == '+')
|
|
|
2604 {
|
|
|
2605 for(unsigned int j = 0; j < L; j++)
|
|
|
2606 {
|
|
|
2607 pos_score[0] += get_M_value(p_pos[j], S[pos+p_pos[j]]);
|
|
|
2608
|
|
|
2609 if((pos_score[0] + p_max[j+1]) <= seq_score)
|
|
|
2610 {
|
|
|
2611 pos_score[0] = min_score;
|
|
|
2612 break;
|
|
|
2613 }
|
|
|
2614
|
|
|
2615
|
|
|
2616 }
|
|
|
2617 }
|
|
|
2618 else
|
|
|
2619 {
|
|
|
2620 for(unsigned int j = 0; j < L; j++)
|
|
|
2621 {
|
|
|
2622 pos_score[0] += get_rc_M_value(rc_p_pos[j], S[pos+rc_p_pos[j]]);
|
|
|
2623
|
|
|
2624 if((pos_score[0] + p_max[j+1]) <= seq_score)
|
|
|
2625 {
|
|
|
2626 pos_score[0] = min_score;
|
|
|
2627 break;
|
|
|
2628 }
|
|
|
2629
|
|
|
2630 }
|
|
|
2631 }
|
|
|
2632
|
|
|
2633 if(seq_score < pos_score[0])
|
|
|
2634 {
|
|
|
2635 seq_score = pos_score[0];
|
|
|
2636 max_pos = pos;
|
|
|
2637 strand = false;
|
|
|
2638 }
|
|
|
2639
|
|
|
2640 pos++;
|
|
|
2641 }
|
|
|
2642
|
|
|
2643 seq_score = 1 + (seq_score - max_score)/(max_score - min_score);
|
|
|
2644
|
|
|
2645 if(!flank)
|
|
|
2646 {
|
|
|
2647 v_score.push_back(seq_score);
|
|
|
2648 v_pos.push_back(max_pos);
|
|
|
2649 v_strand.push_back(strand);
|
|
|
2650 }
|
|
|
2651
|
|
|
2652 return seq_score;
|
|
|
2653 }*/
|
|
|
2654
|
|
|
2655 double matrix::get_M_value(int pos, char C)
|
|
|
2656 {
|
|
|
2657 return M[pos][VI[C]];
|
|
|
2658 }
|
|
|
2659
|
|
|
2660 double matrix::get_rc_M_value(int pos, char C)
|
|
|
2661 {
|
|
|
2662 return rc_M[pos][VI[C]];
|
|
|
2663 }
|
|
|
2664
|
|
|
2665 /*
|
|
|
2666 void matrix::set_top_pos()
|
|
|
2667 {
|
|
|
2668 multimap<double, mpoint> Tmap;
|
|
|
2669 vector<double> p_max_buf, rc_p_max_buf;
|
|
|
2670
|
|
|
2671 for(unsigned int x = 0; x < L; x++)
|
|
|
2672 {
|
|
|
2673 for(unsigned int y = 0; y < ALPHABET_SIZE; y++)
|
|
|
2674 {
|
|
|
2675 mpoint p;
|
|
|
2676
|
|
|
2677 p.x = x;
|
|
|
2678 p.y = y;
|
|
|
2679
|
|
|
2680 Tmap.insert(make_pair(M[x][y],p));
|
|
|
2681
|
|
|
2682 if(!y)
|
|
|
2683 {
|
|
|
2684 p_max_buf.push_back(M[x][y]);
|
|
|
2685 rc_p_max_buf.push_back(rc_M[x][y]);
|
|
|
2686 }
|
|
|
2687 else
|
|
|
2688 {
|
|
|
2689 if(M[x][y] > p_max_buf.back())
|
|
|
2690 p_max_buf[p_max_buf.size() - 1] = M[x][y];
|
|
|
2691
|
|
|
2692 if(rc_M[x][y] > rc_p_max_buf.back())
|
|
|
2693 rc_p_max_buf[rc_p_max_buf.size() - 1] = rc_M[x][y];
|
|
|
2694 }
|
|
|
2695
|
|
|
2696 }
|
|
|
2697
|
|
|
2698 // if(!QUIET) cerr << endl << p_max_buf.back();
|
|
|
2699 }
|
|
|
2700
|
|
|
2701 multimap<double, mpoint>::reverse_iterator ri = Tmap.rbegin();
|
|
|
2702
|
|
|
2703 for(unsigned int i = 0; i < TOP_P; i++)
|
|
|
2704 {
|
|
|
2705 top_pos[i] = ri->second.x;
|
|
|
2706 top_char[i] = ALPHABET[ri->second.y];
|
|
|
2707
|
|
|
2708 rc_top_pos[i] = (L - 1) - ri->second.x;
|
|
|
2709 rc_top_char[i] = RC_ALPHABET[ri->second.y];
|
|
|
2710
|
|
|
2711 ri++;
|
|
|
2712 }
|
|
|
2713
|
|
|
2714 // if(!QUIET) cerr << endl << name() << '\t' << max_score << '\t' << min_score << endl;
|
|
|
2715
|
|
|
2716 while(p_max.size() != p_max_buf.size())
|
|
|
2717 {
|
|
|
2718 double buf = -INF;
|
|
|
2719 unsigned int pos;
|
|
|
2720
|
|
|
2721 for(unsigned int i = 0; i < p_max_buf.size(); i++)
|
|
|
2722 if(p_max_buf.at(i) > buf)
|
|
|
2723 {
|
|
|
2724 pos = i;
|
|
|
2725 buf = p_max_buf.at(i);
|
|
|
2726 }
|
|
|
2727
|
|
|
2728 p_pos.push_back(pos);
|
|
|
2729 rc_p_pos.push_back((L - 1) - pos);
|
|
|
2730
|
|
|
2731 p_max.push_back(buf);
|
|
|
2732
|
|
|
2733 p_max_buf[pos] = -INF;
|
|
|
2734 }
|
|
|
2735
|
|
|
2736 for(unsigned int i = 0; i < L; i++)
|
|
|
2737 {
|
|
|
2738 //if(!QUIET) cerr << p_max[i] << "//"<< '\t';
|
|
|
2739
|
|
|
2740 for(unsigned int j = i + 1; j < L; j++)
|
|
|
2741 p_max[i] += p_max[j];
|
|
|
2742
|
|
|
2743 // for(unsigned int j = i + 1; j < L; j++)
|
|
|
2744 // rc_p_max[i] += rc_p_max[j];
|
|
|
2745
|
|
|
2746 // if(!QUIET) cerr << p_max[i] << "(" << p_pos[i] << ")" << rc_p_max[i] << '\t';
|
|
|
2747 }
|
|
|
2748
|
|
|
2749 p_max.push_back(0); //WE NEED A 0 AT THE END OF THE VECTOR!
|
|
|
2750 // rc_p_max.push_back(0);
|
|
|
2751
|
|
|
2752 // if(!QUIET) cerr << endl;
|
|
|
2753
|
|
|
2754 return;
|
|
|
2755 }
|
|
|
2756 */
|
|
|
2757 double matrix::get_score_at(unsigned int a)
|
|
|
2758 {
|
|
|
2759 return v_score.at(a);
|
|
|
2760 }
|
|
|
2761 unsigned int matrix::get_pos_at(unsigned int a)
|
|
|
2762 {
|
|
|
2763 return v_pos.at(a);
|
|
|
2764 }
|
|
|
2765 bool matrix::get_strand_at(unsigned int a)
|
|
|
2766 {
|
|
|
2767 return v_strand.at(a);
|
|
|
2768 }
|
|
|
2769
|
|
|
2770 double matrix::max()
|
|
|
2771 {
|
|
|
2772 return max_score;
|
|
|
2773 }
|
|
|
2774
|
|
|
2775 double matrix::min()
|
|
|
2776 {
|
|
|
2777 return min_score;
|
|
|
2778 }
|
|
|
2779
|
|
|
2780 string matrix::name()
|
|
|
2781 {
|
|
|
2782 if(!Name.empty())
|
|
|
2783 return Name;
|
|
|
2784 else
|
|
|
2785 return Id;
|
|
|
2786 }
|
|
|
2787
|
|
|
2788 string matrix::id()
|
|
|
2789 {
|
|
|
2790 return Id;
|
|
|
2791 }
|
|
|
2792
|
|
|
2793 double matrix::get_pvalue()
|
|
|
2794 {
|
|
|
2795 return pvalue;
|
|
|
2796 }
|
|
|
2797
|
|
|
2798 double matrix::get_pvalue_bg()
|
|
|
2799 {
|
|
|
2800 return pvalue_bg;
|
|
|
2801 }
|
|
|
2802
|
|
|
2803 double matrix::get_W_pvalue(unsigned int pos)
|
|
|
2804 {
|
|
|
2805 return W_R_pvalue.at(pos);
|
|
|
2806 }
|
|
|
2807
|
|
|
2808 double matrix::get_W_R_Avg_at_pos(unsigned int pos)
|
|
|
2809 {
|
|
|
2810 return W_R_Avg.at(pos);
|
|
|
2811 }
|
|
|
2812
|
|
|
2813 string matrix::o_or_u()
|
|
|
2814 {
|
|
|
2815 if(O_U)
|
|
|
2816 return "UNDER";
|
|
|
2817 else
|
|
|
2818 return "OVER";
|
|
|
2819 }
|
|
|
2820
|
|
|
2821 string matrix::o_or_u_bg()
|
|
|
2822 {
|
|
|
2823 if(O_U_bg)
|
|
|
2824 return "UNDER";
|
|
|
2825 else
|
|
|
2826 return "OVER";
|
|
|
2827 }
|
|
|
2828
|
|
|
2829 int matrix::W_o_or_u(unsigned int pos)
|
|
|
2830 {
|
|
|
2831 if(W_O_U[pos])
|
|
|
2832 return -1;
|
|
|
2833 else
|
|
|
2834 return 1;
|
|
|
2835 }
|
|
|
2836
|
|
|
2837 double matrix::get_R_Avg()
|
|
|
2838 {
|
|
|
2839 return R_Avg;
|
|
|
2840 }
|
|
|
2841 double matrix::get_F_Avg()
|
|
|
2842 {
|
|
|
2843 return F_Avg;
|
|
|
2844 }
|
|
|
2845 double matrix::get_R_Var()
|
|
|
2846 {
|
|
|
2847 return R_Var;
|
|
|
2848 }
|
|
|
2849 double matrix::get_F_Var()
|
|
|
2850 {
|
|
|
2851 return F_Var;
|
|
|
2852 }
|
|
|
2853 double matrix::get_bg_Avg()
|
|
|
2854 {
|
|
|
2855 return bg_Avg;
|
|
|
2856 }
|
|
|
2857
|
|
|
2858 double matrix::get_t()
|
|
|
2859 {
|
|
|
2860 return t;
|
|
|
2861 }
|
|
|
2862
|
|
|
2863 double matrix::get_dof()
|
|
|
2864 {
|
|
|
2865 return dof;
|
|
|
2866 }
|
|
|
2867
|
|
|
2868 bool matrix::have_bg()
|
|
|
2869 {
|
|
|
2870 return BG;
|
|
|
2871 }
|
|
|
2872
|
|
|
2873 unsigned int matrix::bin_num()
|
|
|
2874 {
|
|
|
2875 return BIN_NUM;
|
|
|
2876 }
|
|
|
2877
|
|
|
2878 unsigned int matrix::length()
|
|
|
2879 {
|
|
|
2880 return L;
|
|
|
2881 }
|
|
|
2882
|
|
|
2883 double matrix::get_spearman()
|
|
|
2884 {
|
|
|
2885 double s = spearman;
|
|
|
2886
|
|
|
2887 s *= 10000;
|
|
|
2888
|
|
|
2889 if(s > 0)
|
|
|
2890 s = floor(s);
|
|
|
2891 else if(s < 0)
|
|
|
2892 s = ceil(s);
|
|
|
2893
|
|
|
2894 s /= 10000;
|
|
|
2895
|
|
|
2896 return s;
|
|
|
2897 }
|
|
|
2898
|
|
|
2899 double matrix::get_W_Avg()
|
|
|
2900 {
|
|
|
2901 return W_Avg;
|
|
|
2902 }
|
|
|
2903
|
|
|
2904 double matrix::get_W_R_T_pvalue(unsigned int i)
|
|
|
2905 {
|
|
|
2906 return W_R_T_pvalue.at(i);
|
|
|
2907 }
|
|
|
2908 // MATRIX CLASS END
|