diff mafft/core/mafft-distance.c @ 18:e4d75f9efb90 draft

planemo upload commit b'4303231da9e48b2719b4429a29b72421d24310f4\n'-dirty
author nick
date Thu, 02 Feb 2017 18:44:31 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mafft/core/mafft-distance.c	Thu Feb 02 18:44:31 2017 -0500
@@ -0,0 +1,425 @@
+#include "mltaln.h"
+#include "mtxutl.h"
+
+#define DEBUG 0
+#define TEST  0
+
+#define END_OF_VEC -1
+
+static int maxl;
+static int tsize;
+static char outputformat;
+static float lenfaca, lenfacb, lenfacc, lenfacd;
+static int nadd;
+#define PLENFACA 0.01
+#define PLENFACB 10000
+#define PLENFACC 10000
+#define PLENFACD 0.1
+#define DLENFACA 0.01
+#define DLENFACB 2500
+#define DLENFACC 2500
+#define DLENFACD 0.1
+
+void arguments( int argc, char *argv[] )
+{
+	int c;
+
+	inputfile = NULL;
+	outputformat = 's';
+	scoremtx = 1;
+	nblosum = 62;
+	dorp = NOTSPECIFIED;
+	nadd = 0;
+	alg = 'X';
+
+    while( --argc > 0 && (*++argv)[0] == '-' )
+	{
+        while ( (c = *++argv[0]) )
+		{
+            switch( c )
+            {
+				case 'i':
+					inputfile = *++argv;
+					fprintf( stderr, "inputfile = %s\n", inputfile );
+					--argc;
+					goto nextoption;
+				case 'I':
+					nadd = myatoi(*++argv);
+					if( nadd == 0 )
+					{
+						fprintf( stderr, "nadd = %d?\n", nadd );
+						exit( 1 );
+					}
+					--argc;
+					goto nextoption;
+				case 'p':
+					outputformat = 'p';
+					break;
+				case 'D':
+					dorp = 'd';
+					break;
+				case 'P':
+					dorp = 'p';
+					break;
+                default:
+                    fprintf( stderr, "illegal option %c\n", c );
+                    argc = 0;
+                    break;
+            }
+		}
+		nextoption:
+			;
+	}
+	if( inputfile == NULL )
+	{
+		argc--;
+		inputfile = *argv;
+		fprintf( stderr, "inputfile = %s\n", inputfile );
+	}
+    if( argc != 0 )
+    {
+        fprintf( stderr, "Usage: mafft-distance [-PD] [-i inputfile] inputfile > outputfile\n" );
+        exit( 1 );
+    }
+}
+
+void seq_grp_nuc( int *grp, char *seq )
+{
+	int tmp;
+	int *grpbk = grp;
+	while( *seq )
+	{
+		tmp = amino_grp[(int)*seq++];
+		if( tmp < 4 )
+			*grp++ = tmp;
+		else
+			fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
+	}
+	*grp = END_OF_VEC;
+	if( grp - grpbk < 6 )
+	{
+		fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
+//		exit( 1 );
+		*grpbk = -1;
+	}
+}
+
+void seq_grp( int *grp, char *seq )
+{
+	int tmp;
+	int *grpbk = grp;
+	while( *seq )
+	{
+		tmp = amino_grp[(int)*seq++];
+		if( tmp < 6 )
+			*grp++ = tmp;
+		else
+			fprintf( stderr, "WARNING : Unknown character %c\r", *(seq-1) );
+	}
+	*grp = END_OF_VEC;
+	if( grp - grpbk < 6 )
+	{
+		fprintf( stderr, "\n\nWARNING: Too short.\nPlease also consider use mafft-ginsi, mafft-linsi or mafft-ginsi.\n\n\n" );
+//		exit( 1 );
+		*grpbk = -1;
+	}
+}
+
+void makecompositiontable_p( short *table, int *pointt )
+{
+	int point;
+
+	while( ( point = *pointt++ ) != END_OF_VEC )
+		table[point]++;
+}
+
+int commonsextet_p( short *table, int *pointt )
+{
+	int value = 0;
+	short tmp;
+	int point;
+	static short *memo = NULL;
+	static int *ct = NULL;
+	static int *cp;
+
+	if( *pointt == -1 )
+		return( 0 );
+
+	if( !memo )
+	{
+		memo = (short *)calloc( tsize, sizeof( short ) );
+		if( !memo ) ErrorExit( "Cannot allocate memo\n" );
+		ct = (int *)calloc( MIN( maxl, tsize)+1, sizeof( int ) );
+		if( !ct ) ErrorExit( "Cannot allocate memo\n" );
+	}
+
+	cp = ct;
+	while( ( point = *pointt++ ) != END_OF_VEC )
+	{
+		tmp = memo[point]++;
+		if( tmp < table[point] )
+			value++;
+		if( tmp == 0 ) *cp++ = point;
+//		fprintf( stderr, "cp - ct = %d (tsize = %d)\n", cp - ct, tsize );
+	}
+	*cp = END_OF_VEC;
+	
+	cp =  ct;
+	while( *cp != END_OF_VEC )
+		memo[*cp++] = 0;
+
+	return( value );
+}
+
+void makepointtable_nuc( int *pointt, int *n )
+{
+	int point;
+	register int *p;
+
+	if( *n == -1 )
+	{
+		*pointt = -1;
+		return;
+	}
+
+	p = n;
+	point  = *n++ *  1024;
+	point += *n++ *   256;
+	point += *n++ *    64;
+	point += *n++ *    16;
+	point += *n++ *     4;
+	point += *n++;
+	*pointt++ = point;
+
+	while( *n != END_OF_VEC )
+	{
+		point -= *p++ * 1024;
+		point *= 4;
+		point += *n++;
+		*pointt++ = point;
+	}
+	*pointt = END_OF_VEC;
+}
+
+void makepointtable( int *pointt, int *n )
+{
+	int point;
+	register int *p;
+
+	if( *n == -1 )
+	{
+		*pointt = -1;
+		return;
+	}
+
+	p = n;
+	point  = *n++ *  7776;
+	point += *n++ *  1296;
+	point += *n++ *   216;
+	point += *n++ *    36;
+	point += *n++ *     6;
+	point += *n++;
+	*pointt++ = point;
+
+	while( *n != END_OF_VEC )
+	{
+		point -= *p++ * 7776;
+		point *= 6;
+		point += *n++;
+		*pointt++ = point;
+	}
+	*pointt = END_OF_VEC;
+}
+
+int main( int argc, char **argv )
+{
+	int i, j, initj;
+	FILE *infp;
+	char **seq;
+	int *grpseq;
+	char *tmpseq;
+	int  **pointt;
+	static char **name;
+	static int *nlen;
+	double *mtxself;
+	float score;
+	static short *table1;
+	float longer, shorter;
+	float lenfac;
+	float bunbo;
+	int norg;
+
+	arguments( argc, argv );
+
+
+	if( inputfile )
+	{
+		infp = fopen( inputfile, "r" );
+		if( !infp )
+		{
+			fprintf( stderr, "Cannot open %s\n", inputfile );
+			exit( 1 );
+		}
+	}
+	else
+		infp = stdin;
+
+#if 0
+	PreRead( stdin, &njob, &nlenmax );
+#else
+	getnumlen( infp );
+#endif
+	rewind( infp );
+	if( njob < 2 )
+	{
+		fprintf( stderr, "At least 2 sequences should be input!\n"
+						 "Only %d sequence found.\n", njob );
+		exit( 1 );
+	}
+
+	tmpseq = AllocateCharVec( nlenmax+1 );
+	seq = AllocateCharMtx( njob, nlenmax+1 );
+	grpseq = AllocateIntVec( nlenmax+1 );
+	pointt = AllocateIntMtx( njob, nlenmax+1 );
+	mtxself = AllocateDoubleVec( njob );
+	pamN = NOTSPECIFIED;
+	name = AllocateCharMtx( njob, B );
+	nlen = AllocateIntVec( njob );
+
+#if 0
+	FRead( infp, name, nlen, seq );
+#else
+	readData_pointer( infp, name, nlen, seq );
+#endif
+
+	fclose( infp );
+
+	constants( njob, seq );
+
+
+	if( nadd ) outputformat = 's';
+	norg = njob - nadd;
+
+	if( dorp == 'd' ) tsize = (int)pow( 4, 6 );
+	else              tsize = (int)pow( 6, 6 );
+
+	if( dorp == 'd' )
+	{
+		lenfaca = DLENFACA;
+		lenfacb = DLENFACB;
+		lenfacc = DLENFACC;
+		lenfacd = DLENFACD;
+	}
+	else    
+	{
+		lenfaca = PLENFACA;
+		lenfacb = PLENFACB;
+		lenfacc = PLENFACC;
+		lenfacd = PLENFACD;
+	}
+
+	maxl = 0;
+	for( i=0; i<njob; i++ ) 
+	{
+		gappick0( tmpseq, seq[i] );
+		nlen[i] = strlen( tmpseq );
+//		if( nlen[i] < 6 )
+//		{
+//			fprintf( stderr, "Seq %d, too short, %d characters\n", i+1, nlen[i] );
+//			exit( 1 );
+//		}
+		if( nlen[i] > maxl ) maxl = nlen[i];
+		if( dorp == 'd' ) /* nuc */
+		{
+			seq_grp_nuc( grpseq, tmpseq );
+			makepointtable_nuc( pointt[i], grpseq );
+		}
+		else                 /* amino */
+		{
+			seq_grp( grpseq, tmpseq );
+			makepointtable( pointt[i], grpseq );
+		}
+	}
+	fprintf( stderr, "\nCalculating i-i scores ... " );
+	for( i=0; i<njob; i++ )
+	{
+		table1 = (short *)calloc( tsize, sizeof( short ) );
+		if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
+		makecompositiontable_p( table1, pointt[i] );
+
+		score = commonsextet_p( table1, pointt[i] );
+		mtxself[i] = score;
+		free( table1 );
+	}
+
+	fprintf( stderr, "done.\n" );
+	fprintf( stderr, "\nCalculating i-j scores ... \n" );
+	if( outputformat == 'p' ) fprintf( stdout, "%-5d", njob );
+	for( i=0; i<norg; i++ )
+	{
+		if( outputformat == 'p' ) fprintf( stdout, "\n%-9d ", i+1 );
+		table1 = (short *)calloc( tsize, sizeof( short ) );
+		if( !table1 ) ErrorExit( "Cannot allocate table1\n" );
+		if( i % 10 == 0 )
+		{
+			fprintf( stderr, "%4d / %4d\r", i+1, njob );
+		}
+		makecompositiontable_p( table1, pointt[i] );
+
+
+		if( nadd == 0 )
+		{
+			if( outputformat == 'p' ) initj = 0;
+			else initj = i+1;
+		}
+		else 
+		{
+			initj = norg;
+		}
+		for( j=initj; j<njob; j++ ) 
+		{
+			if( nlen[i] > nlen[j] )
+			{
+				longer=(float)nlen[i];
+				shorter=(float)nlen[j];
+			}
+			else
+			{
+				longer=(float)nlen[j];
+				shorter=(float)nlen[i];
+			}
+//			lenfac = 3.0 / ( LENFACA + LENFACB / ( longer + LENFACC ) + shorter / longer * LENFACD );
+			lenfac = 1.0 / ( shorter / longer * lenfacd + lenfacb / ( longer + lenfacc ) + lenfaca );
+//			lenfac = 1.0;
+//			fprintf( stderr, "lenfac = %f (%.0f,%.0f)\n", lenfac, longer, shorter );
+			score = commonsextet_p( table1, pointt[j] );
+			bunbo = MIN( mtxself[i], mtxself[j] );
+			if( outputformat == 'p' )
+			{
+				if( bunbo == 0.0 )
+					fprintf( stdout, " %8.6f", 1.0 );
+				else
+					fprintf( stdout, " %8.6f", ( 1.0 - score / bunbo ) * lenfac );
+				if( j % 7 == 6 ) fprintf( stdout, "\n" );
+			}
+			else
+			{
+				if( bunbo == 0.0 )
+					fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, 1.0, nlen[i], nlen[j] );
+				else
+					fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / bunbo ) * lenfac, nlen[i], nlen[j] );
+			}
+//			fprintf( stderr, "##### mtx = %f, mtx[i][0]=%f, mtx[j][0]=%f, bunbo=%f\n", mtx[i][j-i], mtx[i][0], mtx[j][0], bunbo );
+//          score = (double)commonsextet_p( table1, pointt[j] );
+//			fprintf( stdout, "%d-%d d=%4.2f l=%d,%d\n", i+1, j+1, ( 1.0 - score / MIN( mtxself[i], mtxself[j] ) ) * 3, nlen[i], nlen[j] );
+
+
+		} 
+		free( table1 );
+	}
+		
+	fprintf( stderr, "\n" );
+	if( outputformat == 'p' ) fprintf( stdout, "\n" );
+	SHOWVERSION;
+	exit( 0 );
+}