Mercurial > repos > nick > duplex
comparison mafft/core/mafft-distance.c @ 18:e4d75f9efb90 draft
planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author | nick |
---|---|
date | Thu, 02 Feb 2017 18:44:31 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:836fa4fe9494 | 18:e4d75f9efb90 |
---|---|
1 #include "mltaln.h" | |
2 #include "mtxutl.h" | |
3 | |
4 #define DEBUG 0 | |
5 #define TEST 0 | |
6 | |
7 #define END_OF_VEC -1 | |
8 | |
9 static int maxl; | |
10 static int tsize; | |
11 static char outputformat; | |
12 static float lenfaca, lenfacb, lenfacc, lenfacd; | |
13 static int nadd; | |
14 #define PLENFACA 0.01 | |
15 #define PLENFACB 10000 | |
16 #define PLENFACC 10000 | |
17 #define PLENFACD 0.1 | |
18 #define DLENFACA 0.01 | |
19 #define DLENFACB 2500 | |
20 #define DLENFACC 2500 | |
21 #define DLENFACD 0.1 | |
22 | |
23 void arguments( int argc, char *argv[] ) | |
24 { | |
25 int c; | |
26 | |
27 inputfile = NULL; | |
28 outputformat = 's'; | |
29 scoremtx = 1; | |
30 nblosum = 62; | |
31 dorp = NOTSPECIFIED; | |
32 nadd = 0; | |
33 alg = 'X'; | |
34 | |
35 while( --argc > 0 && (*++argv)[0] == '-' ) | |
36 { | |
37 while ( (c = *++argv[0]) ) | |
38 { | |
39 switch( c ) | |
40 { | |
41 case 'i': | |
42 inputfile = *++argv; | |
43 fprintf( stderr, "inputfile = %s\n", inputfile ); | |
44 --argc; | |
45 goto nextoption; | |
46 case 'I': | |
47 nadd = myatoi(*++argv); | |
48 if( nadd == 0 ) | |
49 { | |
50 fprintf( stderr, "nadd = %d?\n", nadd ); | |
51 exit( 1 ); | |
52 } | |
53 --argc; | |
54 goto nextoption; | |
55 case 'p': | |
56 outputformat = 'p'; | |
57 break; | |
58 case 'D': | |
59 dorp = 'd'; | |
60 break; | |
61 case 'P': | |
62 dorp = 'p'; | |
63 break; | |
64 default: | |
65 fprintf( stderr, "illegal option %c\n", c ); | |
66 argc = 0; | |
67 break; | |
68 } | |
69 } | |
70 nextoption: | |
71 ; | |
72 } | |
73 if( inputfile == NULL ) | |
74 { | |
75 argc--; | |
76 inputfile = *argv; | |
77 fprintf( stderr, "inputfile = %s\n", inputfile ); | |
78 } | |
79 if( argc != 0 ) | |
80 { | |
81 fprintf( stderr, "Usage: mafft-distance [-PD] [-i inputfile] inputfile > outputfile\n" ); | |
82 exit( 1 ); | |
83 } | |
84 } | |
85 | |
86 void seq_grp_nuc( int *grp, char *seq ) | |
87 { | |
88 int tmp; | |
89 int *grpbk = grp; | |
90 while( *seq ) | |
91 { | |
92 tmp = amino_grp[(int)*seq++]; | |
93 if( tmp < 4 ) | |
94 *grp++ = tmp; | |
95 else | |
96 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) ); | |
97 } | |
98 *grp = END_OF_VEC; | |
99 if( grp - grpbk < 6 ) | |
100 { | |
101 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" ); | |
102 // exit( 1 ); | |
103 *grpbk = -1; | |
104 } | |
105 } | |
106 | |
107 void seq_grp( int *grp, char *seq ) | |
108 { | |
109 int tmp; | |
110 int *grpbk = grp; | |
111 while( *seq ) | |
112 { | |
113 tmp = amino_grp[(int)*seq++]; | |
114 if( tmp < 6 ) | |
115 *grp++ = tmp; | |
116 else | |
117 fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) ); | |
118 } | |
119 *grp = END_OF_VEC; | |
120 if( grp - grpbk < 6 ) | |
121 { | |
122 fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" ); | |
123 // exit( 1 ); | |
124 *grpbk = -1; | |
125 } | |
126 } | |
127 | |
128 void makecompositiontable_p( short *table, int *pointt ) | |
129 { | |
130 int point; | |
131 | |
132 while( ( point = *pointt++ ) != END_OF_VEC ) | |
133 table[point]++; | |
134 } | |
135 | |
136 int commonsextet_p( short *table, int *pointt ) | |
137 { | |
138 int value = 0; | |
139 short tmp; | |
140 int point; | |
141 static short *memo = NULL; | |
142 static int *ct = NULL; | |
143 static int *cp; | |
144 | |
145 if( *pointt == -1 ) | |
146 return( 0 ); | |
147 | |
148 if( !memo ) | |
149 { | |
150 memo = (short *)calloc( tsize, sizeof( short ) ); | |
151 if( !memo ) ErrorExit( "Cannot allocate memo\n" ); | |
152 ct = (int *)calloc( MIN( maxl, tsize)+1, sizeof( int ) ); | |
153 if( !ct ) ErrorExit( "Cannot allocate memo\n" ); | |
154 } | |
155 | |
156 cp = ct; | |
157 while( ( point = *pointt++ ) != END_OF_VEC ) | |
158 { | |
159 tmp = memo[point]++; | |
160 if( tmp < table[point] ) | |
161 value++; | |
162 if( tmp == 0 ) *cp++ = point; | |
163 // fprintf( stderr, "cp - ct = %d (tsize = %d)\n", cp - ct, tsize ); | |
164 } | |
165 *cp = END_OF_VEC; | |
166 | |
167 cp = ct; | |
168 while( *cp != END_OF_VEC ) | |
169 memo[*cp++] = 0; | |
170 | |
171 return( value ); | |
172 } | |
173 | |
174 void makepointtable_nuc( int *pointt, int *n ) | |
175 { | |
176 int point; | |
177 register int *p; | |
178 | |
179 if( *n == -1 ) | |
180 { | |
181 *pointt = -1; | |
182 return; | |
183 } | |
184 | |
185 p = n; | |
186 point = *n++ * 1024; | |
187 point += *n++ * 256; | |
188 point += *n++ * 64; | |
189 point += *n++ * 16; | |
190 point += *n++ * 4; | |
191 point += *n++; | |
192 *pointt++ = point; | |
193 | |
194 while( *n != END_OF_VEC ) | |
195 { | |
196 point -= *p++ * 1024; | |
197 point *= 4; | |
198 point += *n++; | |
199 *pointt++ = point; | |
200 } | |
201 *pointt = END_OF_VEC; | |
202 } | |
203 | |
204 void makepointtable( int *pointt, int *n ) | |
205 { | |
206 int point; | |
207 register int *p; | |
208 | |
209 if( *n == -1 ) | |
210 { | |
211 *pointt = -1; | |
212 return; | |
213 } | |
214 | |
215 p = n; | |
216 point = *n++ * 7776; | |
217 point += *n++ * 1296; | |
218 point += *n++ * 216; | |
219 point += *n++ * 36; | |
220 point += *n++ * 6; | |
221 point += *n++; | |
222 *pointt++ = point; | |
223 | |
224 while( *n != END_OF_VEC ) | |
225 { | |
226 point -= *p++ * 7776; | |
227 point *= 6; | |
228 point += *n++; | |
229 *pointt++ = point; | |
230 } | |
231 *pointt = END_OF_VEC; | |
232 } | |
233 | |
234 int main( int argc, char **argv ) | |
235 { | |
236 int i, j, initj; | |
237 FILE *infp; | |
238 char **seq; | |
239 int *grpseq; | |
240 char *tmpseq; | |
241 int **pointt; | |
242 static char **name; | |
243 static int *nlen; | |
244 double *mtxself; | |
245 float score; | |
246 static short *table1; | |
247 float longer, shorter; | |
248 float lenfac; | |
249 float bunbo; | |
250 int norg; | |
251 | |
252 arguments( argc, argv ); | |
253 | |
254 | |
255 if( inputfile ) | |
256 { | |
257 infp = fopen( inputfile, "r" ); | |
258 if( !infp ) | |
259 { | |
260 fprintf( stderr, "Cannot open %s\n", inputfile ); | |
261 exit( 1 ); | |
262 } | |
263 } | |
264 else | |
265 infp = stdin; | |
266 | |
267 #if 0 | |
268 PreRead( stdin, &njob, &nlenmax ); | |
269 #else | |
270 getnumlen( infp ); | |
271 #endif | |
272 rewind( infp ); | |
273 if( njob < 2 ) | |
274 { | |
275 fprintf( stderr, "At least 2 sequences should be input!\n" | |
276 "Only %d sequence found.\n", njob ); | |
277 exit( 1 ); | |
278 } | |
279 | |
280 tmpseq = AllocateCharVec( nlenmax+1 ); | |
281 seq = AllocateCharMtx( njob, nlenmax+1 ); | |
282 grpseq = AllocateIntVec( nlenmax+1 ); | |
283 pointt = AllocateIntMtx( njob, nlenmax+1 ); | |
284 mtxself = AllocateDoubleVec( njob ); | |
285 pamN = NOTSPECIFIED; | |
286 name = AllocateCharMtx( njob, B ); | |
287 nlen = AllocateIntVec( njob ); | |
288 | |
289 #if 0 | |
290 FRead( infp, name, nlen, seq ); | |
291 #else | |
292 readData_pointer( infp, name, nlen, seq ); | |
293 #endif | |
294 | |
295 fclose( infp ); | |
296 | |
297 constants( njob, seq ); | |
298 | |
299 | |
300 if( nadd ) outputformat = 's'; | |
301 norg = njob - nadd; | |
302 | |
303 if( dorp == 'd' ) tsize = (int)pow( 4, 6 ); | |
304 else tsize = (int)pow( 6, 6 ); | |
305 | |
306 if( dorp == 'd' ) | |
307 { | |
308 lenfaca = DLENFACA; | |
309 lenfacb = DLENFACB; | |
310 lenfacc = DLENFACC; | |
311 lenfacd = DLENFACD; | |
312 } | |
313 else | |
314 { | |
315 lenfaca = PLENFACA; | |
316 lenfacb = PLENFACB; | |
317 lenfacc = PLENFACC; | |
318 lenfacd = PLENFACD; | |
319 } | |
320 | |
321 maxl = 0; | |
322 for( i=0; i<njob; i++ ) | |
323 { | |
324 gappick0( tmpseq, seq[i] ); | |
325 nlen[i] = strlen( tmpseq ); | |
326 // if( nlen[i] < 6 ) | |
327 // { | |
328 // fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] ); | |
329 // exit( 1 ); | |
330 // } | |
331 if( nlen[i] > maxl ) maxl = nlen[i]; | |
332 if( dorp == 'd' ) /* nuc */ | |
333 { | |
334 seq_grp_nuc( grpseq, tmpseq ); | |
335 makepointtable_nuc( pointt[i], grpseq ); | |
336 } | |
337 else /* amino */ | |
338 { | |
339 seq_grp( grpseq, tmpseq ); | |
340 makepointtable( pointt[i], grpseq ); | |
341 } | |
342 } | |
343 fprintf( stderr, "\nCalculating i-i scores ... " ); | |
344 for( i=0; i<njob; i++ ) | |
345 { | |
346 table1 = (short *)calloc( tsize, sizeof( short ) ); | |
347 if( !table1 ) ErrorExit( "Cannot allocate table1\n" ); | |
348 makecompositiontable_p( table1, pointt[i] ); | |
349 | |
350 score = commonsextet_p( table1, pointt[i] ); | |
351 mtxself[i] = score; | |
352 free( table1 ); | |
353 } | |
354 | |
355 fprintf( stderr, "done.\n" ); | |
356 fprintf( stderr, "\nCalculating i-j scores ... \n" ); | |
357 if( outputformat == 'p' ) fprintf( stdout, "%-5d", njob ); | |
358 for( i=0; i<norg; i++ ) | |
359 { | |
360 if( outputformat == 'p' ) fprintf( stdout, "\n%-9d ", i+1 ); | |
361 table1 = (short *)calloc( tsize, sizeof( short ) ); | |
362 if( !table1 ) ErrorExit( "Cannot allocate table1\n" ); | |
363 if( i % 10 == 0 ) | |
364 { | |
365 fprintf( stderr, "%4d / %4d\r", i+1, njob ); | |
366 } | |
367 makecompositiontable_p( table1, pointt[i] ); | |
368 | |
369 | |
370 if( nadd == 0 ) | |
371 { | |
372 if( outputformat == 'p' ) initj = 0; | |
373 else initj = i+1; | |
374 } | |
375 else | |
376 { | |
377 initj = norg; | |
378 } | |
379 for( j=initj; j<njob; j++ ) | |
380 { | |
381 if( nlen[i] > nlen[j] ) | |
382 { | |
383 longer=(float)nlen[i]; | |
384 shorter=(float)nlen[j]; | |
385 } | |
386 else | |
387 { | |
388 longer=(float)nlen[j]; | |
389 shorter=(float)nlen[i]; | |
390 } | |
391 // lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD ); | |
392 lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca ); | |
393 // lenfac = 1.0; | |
394 // fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter ); | |
395 score = commonsextet_p( table1, pointt[j] ); | |
396 bunbo = MIN( mtxself[i], mtxself[j] ); | |
397 if( outputformat == 'p' ) | |
398 { | |
399 if( bunbo == 0.0 ) | |
400 fprintf( stdout, " %8.6f", 1.0 ); | |
401 else | |
402 fprintf( stdout, " %8.6f", ( 1.0 - score / bunbo ) * lenfac ); | |
403 if( j % 7 == 6 ) fprintf( stdout, "\n" ); | |
404 } | |
405 else | |
406 { | |
407 if( bunbo == 0.0 ) | |
408 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, 1.0, nlen[i], nlen[j] ); | |
409 else | |
410 fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / bunbo ) * lenfac, nlen[i], nlen[j] ); | |
411 } | |
412 // fprintf( stderr, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo ); | |
413 // score = (double)commonsextet_p( table1, pointt[j] ); | |
414 // fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / MIN( mtxself[i], mtxself[j] ) ) * 3, nlen[i], nlen[j] ); | |
415 | |
416 | |
417 } | |
418 free( table1 ); | |
419 } | |
420 | |
421 fprintf( stderr, "\n" ); | |
422 if( outputformat == 'p' ) fprintf( stdout, "\n" ); | |
423 SHOWVERSION; | |
424 exit( 0 ); | |
425 } |