comparison mafft/core/pair2hat3s.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #include "mltaln.h"
2
3 #define DEBUG 0
4 #define IODEBUG 0
5 #define SCOREOUT 1
6 #define TSUYOSAFACTOR 100
7
8
9 static char *pairfile;
10 static int nhomologs;
11
12 void strip( char *s )
13 {
14 char *pt = s;
15 while( *++pt )
16 if( *pt == '\n' ) *pt = 0;
17 }
18
19 int searchused( char *q, char **keys, int n )
20 {
21 int i;
22 for( i=0; i<n; i++ )
23 {
24 // fprintf( stderr, "%s ? %s\n", q, names[i] );
25 if( !strcmp( q, keys[i] ) ) return( i );
26 }
27 return( -1 );
28 }
29
30 void arguments( int argc, char *argv[] )
31 {
32 int c;
33
34 nhomologs = 2;
35 inputfile = NULL;
36 pairfile = NULL;
37 fftkeika = 0;
38 pslocal = -1000.0;
39 constraint = 0;
40 nblosum = 62;
41 fmodel = 0;
42 calledByXced = 0;
43 devide = 0;
44 use_fft = 0;
45 fftscore = 1;
46 fftRepeatStop = 0;
47 fftNoAnchStop = 0;
48 weight = 3;
49 utree = 1;
50 tbutree = 1;
51 refine = 0;
52 check = 1;
53 cut = 0.0;
54 disp = 0;
55 outgap = 1;
56 alg = 'A';
57 mix = 0;
58 tbitr = 0;
59 scmtd = 5;
60 tbweight = 0;
61 tbrweight = 3;
62 checkC = 0;
63 treemethod = 'x';
64 contin = 0;
65 scoremtx = 1;
66 kobetsubunkatsu = 0;
67 divpairscore = 0;
68 dorp = NOTSPECIFIED;
69 ppenalty = NOTSPECIFIED;
70 ppenalty_OP = NOTSPECIFIED;
71 ppenalty_ex = NOTSPECIFIED;
72 ppenalty_EX = NOTSPECIFIED;
73 poffset = NOTSPECIFIED;
74 kimuraR = NOTSPECIFIED;
75 pamN = NOTSPECIFIED;
76 geta2 = GETA2;
77 fftWinSize = NOTSPECIFIED;
78 fftThreshold = NOTSPECIFIED;
79
80 while( --argc > 0 && (*++argv)[0] == '-' )
81 {
82 while ( ( c = *++argv[0] ) )
83 {
84 switch( c )
85 {
86 case 'i':
87 inputfile = *++argv;
88 fprintf( stderr, "inputfile = %s\n", inputfile );
89 --argc;
90 goto nextoption;
91 case 'p':
92 pairfile = *++argv;
93 fprintf( stderr, "pairfile = %s\n", pairfile );
94 --argc;
95 goto nextoption;
96 case 't':
97 nhomologs = myatoi( *++argv );
98 fprintf( stderr, "nhomologs = %d\n", nhomologs );
99 --argc;
100 goto nextoption;
101 case 'D':
102 dorp = 'd';
103 break;
104 case 'P':
105 dorp = 'p';
106 break;
107 default:
108 fprintf( stderr, "illegal option %c\n", c );
109 argc = 0;
110 break;
111 }
112 }
113 nextoption:
114 ;
115 }
116 if( argc == 1 )
117 {
118 cut = atof( (*argv) );
119 argc--;
120 }
121 if( argc != 0 )
122 {
123 fprintf( stderr, "options: Check source file !\n" );
124 exit( 1 );
125 }
126 if( tbitr == 1 && outgap == 0 )
127 {
128 fprintf( stderr, "conflicting options : o, m or u\n" );
129 exit( 1 );
130 }
131 if( alg == 'C' && outgap == 0 )
132 {
133 fprintf( stderr, "conflicting options : C, o\n" );
134 exit( 1 );
135 }
136 }
137
138 int countamino( char *s, int end )
139 {
140 int val = 0;
141 while( end-- )
142 if( *s++ != '-' ) val++;
143 return( val );
144 }
145
146 static void pairalign( char **name, int nlen[M], char **seq, double *effarr, int alloclen )
147 {
148 FILE *tmpfp;
149 static char dumm1[B], dumm0[B];
150 int i, j;
151 char *res;
152 FILE *hat3p;
153 static double *effarr1 = NULL;
154 static double *effarr2 = NULL;
155 static char **pseq;
156 LocalHom **localhomtable, *tmpptr;
157 float pscore = 0.0; // by D.Mathog, aguess
158 char *aseq = NULL; // by D.Mathog
159 char **usedseqs = NULL; // by D.Mathog
160 char **usednames = NULL; // by D.Mathog
161 int nused;
162 double tsuyosa;
163
164 tsuyosa = (double)nhomologs * (nhomologs-1) / njob * TSUYOSAFACTOR;
165 fprintf( stderr, "tsuyosa = %f\n", tsuyosa );
166 localhomtable = (LocalHom **)calloc( njob, sizeof( LocalHom *) );
167 for( i=0; i<njob; i++)
168 {
169 localhomtable[i] = (LocalHom *)calloc( njob, sizeof( LocalHom ) );
170 for( j=0; j<njob; j++)
171 {
172 localhomtable[i][j].start1 = -1;
173 localhomtable[i][j].end1 = -1;
174 localhomtable[i][j].start2 = -1;
175 localhomtable[i][j].end2 = -1;
176 localhomtable[i][j].opt = -1.0;
177 localhomtable[i][j].next = NULL;
178 }
179 }
180
181 if( effarr1 == NULL )
182 {
183 effarr1 = AllocateDoubleVec( njob );
184 effarr2 = AllocateDoubleVec( njob );
185 pseq = AllocateCharMtx( 2, nlenmax*9+1 );
186 aseq = AllocateCharVec( nlenmax*9+1 );
187 usedseqs = AllocateCharMtx( njob, nlenmax*9+1 );
188 usednames = AllocateCharMtx( njob, B );
189 #if 0
190 #else
191 #endif
192 }
193
194 #if 0
195 fprintf( stderr, "##### fftwinsize = %d, fftthreshold = %d\n", fftWinSize, fftThreshold );
196 #endif
197
198 #if 0
199 for( i=0; i<njob; i++ )
200 fprintf( stderr, "TBFAST effarr[%d] = %f\n", i, effarr[i] );
201 #endif
202
203
204 // writePre( njob, name, nlen, aseq, 0 );
205
206 fprintf( stderr, "opening %s\n", pairfile );
207 tmpfp = fopen( pairfile, "r" );
208 if( !tmpfp )
209 {
210 fprintf( stderr, "Cannot open %s\n", pairfile );
211 exit( 1 );
212 }
213 searchKUorWA( tmpfp );
214 hat3p = fopen( "hat3", "w" );
215 if( !hat3p ) ErrorExit( "Cannot open hat3." );
216 nused = 0;
217 while( 1 )
218 {
219 res = fgets( dumm0, B-1, tmpfp );
220 strip( dumm0 );
221 if( res == NULL )
222 {
223 break;
224 }
225 load1SeqWithoutName_new( tmpfp, pseq[0] );
226 gappick0( aseq, pseq[0] );
227 i = searchused( aseq, usedseqs, nused );
228 if( i == -1 )
229 {
230 strcpy( usednames[nused], dumm0+1 );
231 strcpy( usedseqs[nused], aseq );
232 i = nused;
233 nused++;
234 }
235 fprintf( stderr, "i = %d\n", i );
236
237 res = fgets( dumm1, B-1, tmpfp );
238 strip( dumm1 );
239 if( res == NULL )
240 {
241 fprintf( stderr, "ERROR: The number of sequences in %s must be even.\n", pairfile );
242 exit( 1 );
243 }
244 load1SeqWithoutName_new( tmpfp, pseq[1] );
245 gappick0( aseq, pseq[1] );
246 j = searchused( aseq, usedseqs, nused );
247 if( j == -1 )
248 {
249 strcpy( usednames[nused], dumm1+1 );
250 strcpy( usedseqs[nused], aseq );
251 j = nused;
252 nused++;
253 }
254 fprintf( stderr, "j = %d\n", j );
255
256 if( strlen( pseq[0] ) != strlen( pseq[1] ) )
257 {
258 fprintf( stderr, "Not aligned, %s - %s\n", dumm0, dumm1 );
259 exit( 1 );
260 }
261
262
263 fprintf( stderr, "adding %d-%d\n", i, j );
264 putlocalhom2( pseq[0], pseq[1], localhomtable[i]+j, 0, 0, (int)pscore, strlen( pseq[0] ) );
265 for( tmpptr=localhomtable[i]+j; tmpptr; tmpptr=tmpptr->next )
266 {
267 if( tmpptr->opt == -1.0 ) continue;
268 fprintf( hat3p, "%d %d %d %6.3f %d %d %d %d %p\n", i, j, tmpptr->overlapaa, tmpptr->opt * tsuyosa, tmpptr->start1, tmpptr->end1, tmpptr->start2, tmpptr->end2, (void *)tmpptr->next );
269 }
270 }
271 fclose( tmpfp );
272 fclose( hat3p );
273
274 for( i=0; i<nused; i++ )
275 fprintf( stdout, ">%s\n%s\n", usednames[i], usedseqs[i] );
276
277
278 #if 0
279 fprintf( stderr, "##### writing hat3\n" );
280 hat3p = fopen( "hat3", "w" );
281 if( !hat3p ) ErrorExit( "Cannot open hat3." );
282 ilim = njob-1;
283 for( i=0; i<ilim; i++ )
284 {
285 for( j=i+1; j<njob; j++ )
286 {
287 for( tmpptr=localhomtable[i]+j; tmpptr; tmpptr=tmpptr->next )
288 {
289 if( tmpptr->opt == -1.0 ) continue;
290 fprintf( hat3p, "%d %d %d %6.3f %d %d %d %d %p\n", i, j, tmpptr->overlapaa, tmpptr->opt * tsuyosa, tmpptr->start1, tmpptr->end1, tmpptr->start2, tmpptr->end2, tmpptr->next );
291 }
292 }
293 }
294 fclose( hat3p );
295 #endif
296 #if DEBUG
297 fprintf( stderr, "calling FreeLocalHomTable\n" );
298 #endif
299 FreeLocalHomTable( localhomtable, njob );
300 #if DEBUG
301 fprintf( stderr, "done. FreeLocalHomTable\n" );
302 #endif
303 }
304
305 static void WriteOptions( FILE *fp )
306 {
307
308 if( dorp == 'd' ) fprintf( fp, "DNA\n" );
309 else
310 {
311 if ( scoremtx == 0 ) fprintf( fp, "JTT %dPAM\n", pamN );
312 else if( scoremtx == 1 ) fprintf( fp, "BLOSUM %d\n", nblosum );
313 else if( scoremtx == 2 ) fprintf( fp, "M-Y\n" );
314 }
315 fprintf( stderr, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
316 if( use_fft ) fprintf( fp, "FFT on\n" );
317
318 fprintf( fp, "tree-base method\n" );
319 if( tbrweight == 0 ) fprintf( fp, "unweighted\n" );
320 else if( tbrweight == 3 ) fprintf( fp, "clustalw-like weighting\n" );
321 if( tbitr || tbweight )
322 {
323 fprintf( fp, "iterate at each step\n" );
324 if( tbitr && tbrweight == 0 ) fprintf( fp, " unweighted\n" );
325 if( tbitr && tbrweight == 3 ) fprintf( fp, " reversely weighted\n" );
326 if( tbweight ) fprintf( fp, " weighted\n" );
327 fprintf( fp, "\n" );
328 }
329
330 fprintf( fp, "Gap Penalty = %+5.2f, %+5.2f, %+5.2f\n", (double)ppenalty/1000, (double)ppenalty_ex/1000, (double)poffset/1000 );
331
332 if( alg == 'a' )
333 fprintf( fp, "Algorithm A\n" );
334 else if( alg == 'A' )
335 fprintf( fp, "Algorithm A+\n" );
336 else if( alg == 'S' )
337 fprintf( fp, "Apgorithm S\n" );
338 else if( alg == 'C' )
339 fprintf( fp, "Apgorithm A+/C\n" );
340 else
341 fprintf( fp, "Unknown algorithm\n" );
342
343 if( treemethod == 'x' )
344 fprintf( fp, "Tree = UPGMA (3).\n" );
345 else if( treemethod == 's' )
346 fprintf( fp, "Tree = UPGMA (2).\n" );
347 else if( treemethod == 'p' )
348 fprintf( fp, "Tree = UPGMA (1).\n" );
349 else
350 fprintf( fp, "Unknown tree.\n" );
351
352 if( use_fft )
353 {
354 fprintf( fp, "FFT on\n" );
355 if( dorp == 'd' )
356 fprintf( fp, "Basis : 4 nucleotides\n" );
357 else
358 {
359 if( fftscore )
360 fprintf( fp, "Basis : Polarity and Volume\n" );
361 else
362 fprintf( fp, "Basis : 20 amino acids\n" );
363 }
364 fprintf( fp, "Threshold of anchors = %d%%\n", fftThreshold );
365 fprintf( fp, "window size of anchors = %dsites\n", fftWinSize );
366 }
367 else
368 fprintf( fp, "FFT off\n" );
369 fflush( fp );
370 }
371
372
373 int main( int argc, char *argv[] )
374 {
375 static int nlen[M];
376 static char **name, **seq;
377 static char **bseq;
378 static double *eff;
379 int i;
380 char c;
381 int alloclen;
382 FILE *infp;
383
384 arguments( argc, argv );
385
386 if( inputfile )
387 {
388 infp = fopen( inputfile, "r" );
389 if( !infp )
390 {
391 fprintf( stderr, "Cannot open %s\n", inputfile );
392 exit( 1 );
393 }
394 }
395 else
396 infp = stdin;
397
398 if( !pairfile )
399 {
400 fprintf( stderr, "Usage: %s -p pairfile -i inputfile \n", argv[0] );
401 exit( 1 );
402 }
403
404 getnumlen( infp );
405 rewind( infp );
406
407 if( njob < 2 )
408 {
409 fprintf( stderr, "At least 2 sequences should be input!\n"
410 "Only %d sequence found.\n", njob );
411 exit( 1 );
412 }
413
414 name = AllocateCharMtx( njob, B+1 );
415 seq = AllocateCharMtx( njob, nlenmax*9+1 );
416 bseq = AllocateCharMtx( njob, nlenmax*9+1 );
417 alloclen = nlenmax*9;
418
419 eff = AllocateDoubleVec( njob );
420
421 #if 0
422 Read( name, nlen, seq );
423 #else
424 readData_pointer( infp, name, nlen, seq );
425 #endif
426 fclose( infp );
427
428 constants( njob, seq );
429
430 #if 0
431 fprintf( stderr, "params = %d, %d, %d\n", penalty, penalty_ex, offset );
432 #endif
433
434 initSignalSM();
435
436 initFiles();
437
438 WriteOptions( trap_g );
439
440 c = seqcheck( seq );
441 if( c )
442 {
443 fprintf( stderr, "Illeagal character %c\n", c );
444 exit( 1 );
445 }
446
447 // writePre( njob, name, nlen, seq, 0 );
448
449 for( i=0; i<njob; i++ ) eff[i] = 1.0;
450
451
452 for( i=0; i<njob; i++ ) gappick0( bseq[i], seq[i] );
453
454
455 pairalign( name, nlen, bseq, eff, alloclen );
456
457 fprintf( trap_g, "done.\n" );
458 #if DEBUG
459 fprintf( stderr, "closing trap_g\n" );
460 #endif
461 fclose( trap_g );
462
463 #if IODEBUG
464 fprintf( stderr, "OSHIMAI\n" );
465 #endif
466 SHOWVERSION;
467 return( 0 );
468 }