comparison mafft/core/mafft-distance.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
comparison
equal deleted inserted replaced
17:836fa4fe9494 18:e4d75f9efb90
1 #include "mltaln.h"
2 #include "mtxutl.h"
3
4 #define DEBUG 0
5 #define TEST 0
6
7 #define END_OF_VEC -1
8
9 static int maxl;
10 static int tsize;
11 static char outputformat;
12 static float lenfaca, lenfacb, lenfacc, lenfacd;
13 static int nadd;
14 #define PLENFACA 0.01
15 #define PLENFACB 10000
16 #define PLENFACC 10000
17 #define PLENFACD 0.1
18 #define DLENFACA 0.01
19 #define DLENFACB 2500
20 #define DLENFACC 2500
21 #define DLENFACD 0.1
22
23 void arguments( int argc, char *argv[] )
24 {
25 int c;
26
27 inputfile = NULL;
28 outputformat = 's';
29 scoremtx = 1;
30 nblosum = 62;
31 dorp = NOTSPECIFIED;
32 nadd = 0;
33 alg = 'X';
34
35 while( --argc > 0 && (*++argv)[0] == '-' )
36 {
37 while ( (c = *++argv[0]) )
38 {
39 switch( c )
40 {
41 case 'i':
42 inputfile = *++argv;
43 fprintf( stderr, "inputfile = %s\n", inputfile );
44 --argc;
45 goto nextoption;
46 case 'I':
47 nadd = myatoi(*++argv);
48 if( nadd == 0 )
49 {
50 fprintf( stderr, "nadd = %d?\n", nadd );
51 exit( 1 );
52 }
53 --argc;
54 goto nextoption;
55 case 'p':
56 outputformat = 'p';
57 break;
58 case 'D':
59 dorp = 'd';
60 break;
61 case 'P':
62 dorp = 'p';
63 break;
64 default:
65 fprintf( stderr, "illegal option %c\n", c );
66 argc = 0;
67 break;
68 }
69 }
70 nextoption:
71 ;
72 }
73 if( inputfile == NULL )
74 {
75 argc--;
76 inputfile = *argv;
77 fprintf( stderr, "inputfile = %s\n", inputfile );
78 }
79 if( argc != 0 )
80 {
81 fprintf( stderr, "Usage: mafft-distance [-PD] [-i inputfile] inputfile > outputfile\n" );
82 exit( 1 );
83 }
84 }
85
86 void seq_grp_nuc( int *grp, char *seq )
87 {
88 int tmp;
89 int *grpbk = grp;
90 while( *seq )
91 {
92 tmp = amino_grp[(int)*seq++];
93 if( tmp < 4 )
94 *grp++ = tmp;
95 else
96 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
97 }
98 *grp = END_OF_VEC;
99 if( grp - grpbk < 6 )
100 {
101 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
102 // exit( 1 );
103 *grpbk = -1;
104 }
105 }
106
107 void seq_grp( int *grp, char *seq )
108 {
109 int tmp;
110 int *grpbk = grp;
111 while( *seq )
112 {
113 tmp = amino_grp[(int)*seq++];
114 if( tmp < 6 )
115 *grp++ = tmp;
116 else
117 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
118 }
119 *grp = END_OF_VEC;
120 if( grp - grpbk < 6 )
121 {
122 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
123 // exit( 1 );
124 *grpbk = -1;
125 }
126 }
127
128 void makecompositiontable_p( short *table, int *pointt )
129 {
130 int point;
131
132 while( ( point = *pointt++ ) != END_OF_VEC )
133 table[point]++;
134 }
135
136 int commonsextet_p( short *table, int *pointt )
137 {
138 int value = 0;
139 short tmp;
140 int point;
141 static short *memo = NULL;
142 static int *ct = NULL;
143 static int *cp;
144
145 if( *pointt == -1 )
146 return( 0 );
147
148 if( !memo )
149 {
150 memo = (short *)calloc( tsize, sizeof( short ) );
151 if( !memo ) ErrorExit( "Cannot allocate memo\n" );
152 ct = (int *)calloc( MIN( maxl, tsize)+1, sizeof( int ) );
153 if( !ct ) ErrorExit( "Cannot allocate memo\n" );
154 }
155
156 cp = ct;
157 while( ( point = *pointt++ ) != END_OF_VEC )
158 {
159 tmp = memo[point]++;
160 if( tmp < table[point] )
161 value++;
162 if( tmp == 0 ) *cp++ = point;
163 // fprintf( stderr, "cp - ct = %d (tsize = %d)\n", cp - ct, tsize );
164 }
165 *cp = END_OF_VEC;
166
167 cp = ct;
168 while( *cp != END_OF_VEC )
169 memo[*cp++] = 0;
170
171 return( value );
172 }
173
174 void makepointtable_nuc( int *pointt, int *n )
175 {
176 int point;
177 register int *p;
178
179 if( *n == -1 )
180 {
181 *pointt = -1;
182 return;
183 }
184
185 p = n;
186 point = *n++ * 1024;
187 point += *n++ * 256;
188 point += *n++ * 64;
189 point += *n++ * 16;
190 point += *n++ * 4;
191 point += *n++;
192 *pointt++ = point;
193
194 while( *n != END_OF_VEC )
195 {
196 point -= *p++ * 1024;
197 point *= 4;
198 point += *n++;
199 *pointt++ = point;
200 }
201 *pointt = END_OF_VEC;
202 }
203
204 void makepointtable( int *pointt, int *n )
205 {
206 int point;
207 register int *p;
208
209 if( *n == -1 )
210 {
211 *pointt = -1;
212 return;
213 }
214
215 p = n;
216 point = *n++ * 7776;
217 point += *n++ * 1296;
218 point += *n++ * 216;
219 point += *n++ * 36;
220 point += *n++ * 6;
221 point += *n++;
222 *pointt++ = point;
223
224 while( *n != END_OF_VEC )
225 {
226 point -= *p++ * 7776;
227 point *= 6;
228 point += *n++;
229 *pointt++ = point;
230 }
231 *pointt = END_OF_VEC;
232 }
233
234 int main( int argc, char **argv )
235 {
236 int i, j, initj;
237 FILE *infp;
238 char **seq;
239 int *grpseq;
240 char *tmpseq;
241 int **pointt;
242 static char **name;
243 static int *nlen;
244 double *mtxself;
245 float score;
246 static short *table1;
247 float longer, shorter;
248 float lenfac;
249 float bunbo;
250 int norg;
251
252 arguments( argc, argv );
253
254
255 if( inputfile )
256 {
257 infp = fopen( inputfile, "r" );
258 if( !infp )
259 {
260 fprintf( stderr, "Cannot open %s\n", inputfile );
261 exit( 1 );
262 }
263 }
264 else
265 infp = stdin;
266
267 #if 0
268 PreRead( stdin, &njob, &nlenmax );
269 #else
270 getnumlen( infp );
271 #endif
272 rewind( infp );
273 if( njob < 2 )
274 {
275 fprintf( stderr, "At least 2 sequences should be input!\n"
276 "Only %d sequence found.\n", njob );
277 exit( 1 );
278 }
279
280 tmpseq = AllocateCharVec( nlenmax+1 );
281 seq = AllocateCharMtx( njob, nlenmax+1 );
282 grpseq = AllocateIntVec( nlenmax+1 );
283 pointt = AllocateIntMtx( njob, nlenmax+1 );
284 mtxself = AllocateDoubleVec( njob );
285 pamN = NOTSPECIFIED;
286 name = AllocateCharMtx( njob, B );
287 nlen = AllocateIntVec( njob );
288
289 #if 0
290 FRead( infp, name, nlen, seq );
291 #else
292 readData_pointer( infp, name, nlen, seq );
293 #endif
294
295 fclose( infp );
296
297 constants( njob, seq );
298
299
300 if( nadd ) outputformat = 's';
301 norg = njob - nadd;
302
303 if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
304 else tsize = (int)pow( 6, 6 );
305
306 if( dorp == 'd' )
307 {
308 lenfaca = DLENFACA;
309 lenfacb = DLENFACB;
310 lenfacc = DLENFACC;
311 lenfacd = DLENFACD;
312 }
313 else
314 {
315 lenfaca = PLENFACA;
316 lenfacb = PLENFACB;
317 lenfacc = PLENFACC;
318 lenfacd = PLENFACD;
319 }
320
321 maxl = 0;
322 for( i=0; i<njob; i++ )
323 {
324 gappick0( tmpseq, seq[i] );
325 nlen[i] = strlen( tmpseq );
326 // if( nlen[i] < 6 )
327 // {
328 // fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
329 // exit( 1 );
330 // }
331 if( nlen[i] > maxl ) maxl = nlen[i];
332 if( dorp == 'd' ) /* nuc */
333 {
334 seq_grp_nuc( grpseq, tmpseq );
335 makepointtable_nuc( pointt[i], grpseq );
336 }
337 else /* amino */
338 {
339 seq_grp( grpseq, tmpseq );
340 makepointtable( pointt[i], grpseq );
341 }
342 }
343 fprintf( stderr, "\nCalculating i-i scores ... " );
344 for( i=0; i<njob; i++ )
345 {
346 table1 = (short *)calloc( tsize, sizeof( short ) );
347 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
348 makecompositiontable_p( table1, pointt[i] );
349
350 score = commonsextet_p( table1, pointt[i] );
351 mtxself[i] = score;
352 free( table1 );
353 }
354
355 fprintf( stderr, "done.\n" );
356 fprintf( stderr, "\nCalculating i-j scores ... \n" );
357 if( outputformat == 'p' ) fprintf( stdout, "%-5d", njob );
358 for( i=0; i<norg; i++ )
359 {
360 if( outputformat == 'p' ) fprintf( stdout, "\n%-9d ", i+1 );
361 table1 = (short *)calloc( tsize, sizeof( short ) );
362 if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
363 if( i % 10 == 0 )
364 {
365 fprintf( stderr, "%4d / %4d\r", i+1, njob );
366 }
367 makecompositiontable_p( table1, pointt[i] );
368
369
370 if( nadd == 0 )
371 {
372 if( outputformat == 'p' ) initj = 0;
373 else initj = i+1;
374 }
375 else
376 {
377 initj = norg;
378 }
379 for( j=initj; j<njob; j++ )
380 {
381 if( nlen[i] > nlen[j] )
382 {
383 longer=(float)nlen[i];
384 shorter=(float)nlen[j];
385 }
386 else
387 {
388 longer=(float)nlen[j];
389 shorter=(float)nlen[i];
390 }
391 // lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD );
392 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
393 // lenfac = 1.0;
394 // fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter );
395 score = commonsextet_p( table1, pointt[j] );
396 bunbo = MIN( mtxself[i], mtxself[j] );
397 if( outputformat == 'p' )
398 {
399 if( bunbo == 0.0 )
400 fprintf( stdout, " %8.6f", 1.0 );
401 else
402 fprintf( stdout, " %8.6f", ( 1.0 - score / bunbo ) * lenfac );
403 if( j % 7 == 6 ) fprintf( stdout, "\n" );
404 }
405 else
406 {
407 if( bunbo == 0.0 )
408 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, 1.0, nlen[i], nlen[j] );
409 else
410 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / bunbo ) * lenfac, nlen[i], nlen[j] );
411 }
412 // fprintf( stderr, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo );
413 // score = (double)commonsextet_p( table1, pointt[j] );
414 // fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / MIN( mtxself[i], mtxself[j] ) ) * 3, nlen[i], nlen[j] );
415
416
417 }
418 free( table1 );
419 }
420
421 fprintf( stderr, "\n" );
422 if( outputformat == 'p' ) fprintf( stdout, "\n" );
423 SHOWVERSION;
424 exit( 0 );
425 }