view clustalomega/clustal-omega-0.2.0/src/hhalign/hhutil-C.h @ 0:ea6d0e588642 default tip

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 16:13:02 -0400
parents
children
line wrap: on
line source

/* -*- mode: c; tab-width: 4; c-basic-offset: 4; indent-tabs-mode: nil -*- */

/*********************************************************************
 * Clustal Omega - Multiple sequence alignment
 *
 * Copyright (C) 2010 University College Dublin
 *
 * Clustal-Omega is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This file is part of Clustal-Omega.
 *
 ********************************************************************/

/*
 * RCS $Id: hhutil-C.h 229 2011-04-07 17:09:07Z fabian $
 */

//////////////////////////////////////////////////////////////////////////////
// Transform a character to lower case and '.' to '-' and vice versa
//////////////////////////////////////////////////////////////////////////////


inline char 
MatchChr(char c)  {return ((c>='a' && c<='z')? c-'a'+'A' : (c=='.'? '-':c) );}

inline char 
InsertChr(char c) {return ((c>='A' && c<='Z')? c+'a'-'A' : ((c>='0' && c<='9') || c=='-')? '.':c );}

inline int  
WordChr(char c) {return (int)((c>='A' && c<='Z') || (c>='a' && c<='z'));}


//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms the one-letter amino acid code into an integer between 0 and 22
 */
inline char 
aa2i(char c)
{
  //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V
  if (c>='a' && c<='z') c+='A'-'a';
  switch (c)
    {
    case 'A': return 0;
    case 'R': return 1;
    case 'N': return 2;
    case 'D': return 3;
    case 'C': return 4;
    case 'Q': return 5;
    case 'E': return 6;
    case 'G': return 7;
    case 'H': return 8;
    case 'I': return 9;
    case 'L': return 10;
    case 'K': return 11;
    case 'M': return 12;
    case 'F': return 13;
    case 'P': return 14;
    case 'S': return 15;
    case 'T': return 16;
    case 'W': return 17;
    case 'Y': return 18;
    case 'V': return 19;
    case 'X': return ANY;
    case 'J': return ANY;
    case 'O': return ANY;
    case 'U': return 4;  //Selenocystein -> Cystein
    case 'B': return 3;  //D (or N)
    case 'Z': return 6;  //E (or Q)
    case '-': return GAP;
    case '.': return GAP;
    case '_': return GAP;
    }
  if (c>=0 && c<=32) return -1; // white space and control characters
  return -2;
}

///////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms integers between 0 and 22 into the one-letter amino acid code
 */
inline char 
i2aa(char c)
{
  //A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V
  switch (c)
    {
    case 0: return 'A';
    case 1: return 'R';
    case 2: return 'N';
    case 3: return 'D';
    case 4: return 'C';
    case 5: return 'Q';
    case 6: return 'E';
    case 7: return 'G';
    case 8: return 'H';
    case 9: return 'I';
    case 10: return 'L';
    case 11: return 'K';
    case 12: return 'M';
    case 13: return 'F';
    case 14: return 'P';
    case 15: return 'S';
    case 16: return 'T';
    case 17: return 'W';
    case 18: return 'Y';
    case 19: return 'V';
    case ANY: return 'X';
    case GAP: return '-';
    case ENDGAP: return '-';
    }
  return '?';
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms the dssp/psipred secondary structure code into an integer number
 */
inline char 
ss2i(char c)
{
  //- H E C S T G B
  if (c>='a' && c<='z') c+='A'-'a';
  switch (c)
    {
    case '.': return 0;
    case '-': return 0;
    case 'X': return 0;
    case 'H': return 1;
    case 'E': return 2;
    case 'C': return 3;
    case '~': return 3;
    case 'S': return 4;
    case 'T': return 5;
    case 'G': return 6;
    case 'B': return 7;
    case 'I': return 3;
    case ' ': return -1;
    case '\t': return -1;
    case '\n': return -1;
    }
  return -2;
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms integers between 0 and 8 into the dssp/psipred secondary structure code
 */
inline char 
i2ss(int c)
{
  //- H E C S T G B
  switch (c)
    {
    case 0: return '-';
    case 1: return 'H';
    case 2: return 'E';
    case 3: return 'C';
    case 4: return 'S'; 
    case 5: return 'T';
    case 6: return 'G';
    case 7: return 'B';
    case 8: return 'I';
    }
  return '?';
}


//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms the solvend accessiblity code into an integer number
 */
inline char 
sa2i(char c)
{
  //- A B C D E 
  if (c>='a' && c<='z') c+='A'-'a';
  switch (c)
    {
    case '.': return 0;
    case '-': return 0;
    case 'A': return 1;
    case 'B': return 2;
    case 'C': return 3;
    case 'D': return 4;
    case 'E': return 5;
    case 'F': return 6;
    case ' ': return -1;
    case '\t': return -1;
    case '\n': return -1;
    }
  return -2;
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms integers between 0 and 5 into the solvent accessibility code
 */
inline char i2sa(int c)
{
  //- H E C S T G B
  switch (c)
    {
    case 0: return '-';
    case 1: return 'A';
    case 2: return 'B';
    case 3: return 'C';
    case 4: return 'D'; 
    case 5: return 'E';
    case 6: return 'F';
    }
  return '?';
}


//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms alternative secondary structure symbols into symbols 
 */
inline char 
ss2ss(char c)
{
  //- H E C S T G B
  switch (c)
    {
    case '~': return 'C';
    case 'I': return 'C';
    case 'i': return 'c';      
    case 'H': 
    case 'E': 
    case 'C': 
    case 'S': 
    case 'T': 
    case 'G': 
    case 'B': 
    case 'h': 
    case 'e': 
    case 'c': 
    case 's': 
    case 't': 
    case 'g': 
    case 'b': 
    case '.': 
      return c;
    }
  return '-';
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms confidence values of psipred into internal code
 */
inline char 
cf2i(char c)
{
  switch (c)
    {
    case '-': return 0;
    case '.': return 0;
    case '0': return 1;
    case '1': return 2;
    case '2': return 3;
    case '3': return 4;
    case '4': return 5;
    case '5': return 6;
    case '6': return 7;
    case '7': return 8;
    case '8': return 9;
    case '9': return 10;
    }
  return 0;
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Transforms internal representation of psipred confidence values into printable chars
 */
inline char 
i2cf(char c)
{
  switch (c)
    {
    case 0: return '-';
    case 1: return '0';
    case 2: return '1';
    case 3: return '2';
    case 4: return '3';
    case 5: return '4';
    case 6: return '5';
    case 7: return '6';
    case 8: return '7';
    case 9: return '8';
    case 10: return '9';
    }
  return '-';
}


//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Fast lookup of log2(1+2^(-x)) for x>=0 (precision < 0.35%)
 */
inline float 
fast_addscore(float x) 
{
  static float val[2001];         // val[i]=log2(1+2^(-x))
  static char initialized;
  if (x>20) return 0.0;
  if (x<0) 
    {
      fprintf(stderr,"Error in function fast_addscore: argument %g is negative\n",x);
      exit(7);
    }
  if (!initialized)   //First fill in the log2-vector
    {
      for (int i=0; i<=2000; i++) val[i]=log2(1.0+pow(2,-0.01*(i+0.5)));
      initialized=1;
    }  
  return val[(int)(100.0*x)];
}



//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Little utilities for output
 */
inline void 
fout(FILE* outf, int d)
{
  if (d>=99999) fprintf(outf,"*\t"); else fprintf(outf,"%i\t",d); 
  return;
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Errors
 */
int 
FormatError(const char infile[], const char details[]="")
{
  cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<": wrong format while reading file \'"<<infile<<". "<<details<<"\n"; 
  exit(1);
}

int 
OpenFileError(const char outfile[])
{
  cerr<<endl<<"Error in "<</*par.argv[0],FS*/__FILE__<<": could not open file \'"<<outfile<<"\'\n"; 
  exit(2);
}

int 
MemoryError(const char arrayname[])
{
  cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<": Memory overflow while creating \'"<<arrayname<<"\'. Please report this bug to developers\n"; 
  exit(3);
}

int 
InternalError(const char errstr[])
{
  cerr<<"Error in "<</*par.argv[0],FS*/__FILE__<<":  "<<errstr<<". Please report this bug to developers\n"; 
  exit(6);
}


//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Takes family code (eg. a.1.2.3) and returns strings 'a', 'a.1', and 'a.1.2'
 */
inline void  
ScopID(char cl[], char fold[], char sfam[], const char fam[])
{
  char* ptr;

  //get scop class ID 
  strcpy(cl,fam); 
  ptr = strchr(cl,'.');               //return adress of next '.' in name
  if(ptr) ptr[0]='\0';  

  //get scop fold ID
  strcpy(fold,fam); 
  ptr = strchr(fold,'.');             //return adress of next '.' in name
  if(ptr) ptr = strchr(ptr+1,'.');    //return adress of next '.' in name
  if(ptr) ptr[0]='\0';

  //get scop superfamily ID
  strcpy(sfam,fam); 
  ptr = strchr(sfam,'.');            //return adress of next '.' in name
  if(ptr) ptr = strchr(ptr+1,'.');   //return adress of next '.' in name
  if(ptr) ptr = strchr(ptr+1,'.');   //return adress of next '.' in name
  if(ptr) ptr[0]='\0';
  return;
}

//////////////////////////////////////////////////////////////////////////////
/**
 * @brief Read up to n lines of outfile and write to screen (STDERR)
 */
void 
WriteToScreen(char* outfile, int n)
{
  char line[LINELEN]="";
  ifstream outf;
  outf.open(outfile, ios::in);
  if (!outf) {OpenFileError(outfile);}
  cout<<"\n";
  for(; n>0 && outf.getline(line,LINELEN); n--) cout<<line<<"\n";
  outf.close();
  cout<<"\n";
}
  
inline void 
WriteToScreen(char* outfile) {WriteToScreen(outfile,INT_MAX);}



/////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Read .hhdefaults file into array argv_conf (beginning at argv_conf[1])
 */
void 
ReadDefaultsFile(int& argc_conf, char** argv_conf)
{
  char line[LINELEN]="";
  char filename[NAMELEN];
  char* c_first;   //pointer to first character of argument string
  char* c;         //pointer to scan line read in for end of argument
  //  ifstream configf;
  FILE* configf=NULL;
  argc_conf=1;     //counts number of arguments read in 

  // Open config file
  strcpy(filename,"./.hhdefaults");
  configf = fopen(filename,"r");
  if (!configf && getenv("HOME"))  
    {
      strcpy(filename,getenv("HOME"));
      strcat(filename,"/.hhdefaults");
      configf = fopen(filename,"r");
      if (!configf)
	{
	  if (v>=3) cerr<<"Warning: could not find ./.hhdefaults or "<<filename<<"\n";
	  return;
	}
    }
  else if (!configf) return; // only webserver has no home directory => need no warning

  // Scan file until line 'program_nameANYTHING'
  while (fgets(line,LINELEN,configf))
    if (!strncmp(line,program_name,6)) break;
  // Found line 'program_nameANYTHING'?
  if (!strncmp(line,program_name,6))
    {
      // Read in options until end-of-file or empty line
	while (fgets(line,LINELEN,configf) && strcmp(line,"\n"))
	{
	  // Analyze line
	  c=line;
	  do
	    {
	      // Find next word
	      while (*c==' ' || *c=='\t') c++; //Advance until next non-white space
	      if (*c=='\0' || *c=='\n' || *c=='#') break;  //Is next word empty string?
	      c_first=c;
	      while (*c!=' ' && *c!='\t'  && *c!='#' && *c!='\0' && *c!='\n' ) c++; //Advance until next white space or '#'
	      if (*c=='\0' || *c=='\n' || *c=='#')         //Is end of line reached?
		{
		  *c='\0'; 
		  argv_conf[argc_conf]=new(char[strlen(c_first)+1]);
		  strcpy(argv_conf[argc_conf++],c_first); 
		  break;
		}
	      *c='\0'; 
	      argv_conf[argc_conf]=new(char[strlen(c_first)+1]);
	      strcpy(argv_conf[argc_conf++],c_first);
	      printf("Argument: %s\n",c_first);
	      c++;
	    } while (1);
	} //end read line
     if (v>=3) 
	{
	  cout<<"Arguments read in from .hhdefaults:";
	  for (int argc=1; argc<argc_conf; argc++) cout<<(argv_conf[argc][0]=='-'? " ":"")<<argv_conf[argc]<<" ";
	  cout<<"\n";
	}
     else if (v>=3) cout<<"Read in "<<argc_conf<<" default arguments for "<<program_name<<" from "<<filename<<"\n";
     }
  else //found no line 'program_name   anything"
    {
      if (v>=3) cerr<<endl<<"Warning: no default options for \'"<<program_name<<"\' found in "<<filename<<"\n";
      return; //no line 'program_name   anything' found
    }
//   configf.close();
  fclose(configf);
}


/////////////////////////////////////////////////////////////////////////////////////
/**
 * @brief Set default parameter values
 */
void 
SetDefaults()
{
    
  par.append=0;                // overwrite output file
  par.outformat=0;             // 0: hhr  1: FASTA  2:A2M   3:A3M 
  par.p=20.0f;                 // minimum threshold for inclusion in hit list and alignment listing
  par.E=1e6f;                  // maximum threshold for inclusion in hit list and alignment listing
  par.b=10;                    // min number of alignments
  par.B=500;                   // max number of alignments
  par.z=10;                    // min number of lines in hit list
  par.Z=500;                   // max number of lines in hit list
  par.e=1e-3f;                 // maximum E-value for inclusion in output alignment, output HMM, and PSI-BLAST checkpoint model
  par.showcons=1;              // show consensus sequence
  par.showdssp=1;              // show predicted secondary structure
  par.showpred=1;              // show dssp secondary structure
  par.cons=0;                  // show first non-SS sequence as main representative sequence (not consensus)
  par.nseqdis=1;               // maximum number of query sequences for output alignment
  par.mark=0;                  // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
  par.aliwidth=80;             // number of characters per line in output alignments for HMM search
  par.max_seqid=90;            // default for maximum sequence identity threshold
  par.qid=0;                   // default for minimum sequence identity with query
  par.qsc=-20.0f;              // default for minimum score per column with query
  par.coverage=0;              // default for minimum coverage threshold
  par.Ndiff=100;               // pick Ndiff most different sequences from alignment
  par.coverage_core=80;        // Minimum coverage for sequences in core alignment
  par.qsc_core=0.3f;           // Minimum score per column of core sequence with query
  par.coresc=-20.0f;           // Minimum score per column with core alignment (HMM)

  par.M=1;                     // match state assignment is by A2M/A3M
  par.Mgaps=50;                // Above this percentage of gaps, columns are assigned to insert states (for par.M=2)
  par.calibrate=0;             // default: no calibration
  par.calm=0;                  // derive P-values from: 0:query calibration  1:template calibration  2:both
  par.mode=0;                  // 
  
  par.wg=0;                    // 0: use local sequence weights   1: use local ones

  par.matrix=0;                // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62 
  par.pcm=2;                   // pseudocount mode: default=divergence-dependent (but not column-specific)
#if 1 /* Nelder-Meade on Baliscore */
  par.pca=1.712190f;                // default values for substitution matrix pseudocounts 
  par.pcb=1.039640f;                // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
  par.pcc=0.878067f;                // pcs are reduced prop. to 1/Neff^pcc
  par.pcw=0.0f;                // wc>0 weighs columns according to their intra-clomun similarity

  par.gapb=1.405220;                // default values for transition pseudocounts
  par.gapd=1.316760;               // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
  par.gape=1.793780;                // gap extension penalty pseudocount
  par.gapf=1.034710;                // factor for increasing gap open penalty for deletes
  par.gapg=0.894772;                // factor for increasing gap open penalty for inserts
  par.gaph=0.544072;                // factor for increasing gap extension penalty for deletes
  par.gapi=0.862559;                // factor for increasing gap extension penalty for inserts
#else  /* Soeding's default*/
  par.pca=1.0f;                // default values for substitution matrix pseudocounts 
  par.pcb=1.5f;                // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
  par.pcc=1.0f;                // pcs are reduced prop. to 1/Neff^pcc
  par.pcw=0.0f;                // wc>0 weighs columns according to their intra-clomun similarity

  par.gapb=1.0;                // default values for transition pseudocounts
  par.gapd=0.15;               // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
  par.gape=1.0;                // gap extension penalty pseudocount
  par.gapf=0.6;                // factor for increasing gap open penalty for deletes
  par.gapg=0.6;                // factor for increasing gap open penalty for inserts
  par.gaph=0.6;                // factor for increasing gap extension penalty for deletes
  par.gapi=0.6;                // factor for increasing gap extension penalty for inserts
#endif
  /* Viterbi parameters optimised on Sabre (R228), FS, r228 -> r229 */
#if 1
  par.pcaV=1.245150f;                // default values for substitution matrix pseudocounts 
  par.pcbV=1.682110f;                // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
  par.pccV=1.483840f;                // pcs are reduced prop. to 1/Neff^pcc
  par.pcwV=0.0f;                // wc>0 weighs columns according to their intra-clomun similarity

  par.gapbV=0.818625;                // default values for transition pseudocounts
  par.gapdV=0.666110;               // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
  par.gapeV=1.028050;                // gap extension penalty pseudocount
  par.gapfV=0.710760;                // factor for increasing gap open penalty for deletes
  par.gapgV=1.649800;                // factor for increasing gap open penalty for inserts
  par.gaphV=0.470604;                // factor for increasing gap extension penalty for deletes
  par.gapiV=0.829479;                // factor for increasing gap extension penalty for inserts
#else /* Soeding default*/ 
  par.pcaV=1.0f;                // default values for substitution matrix pseudocounts 
  par.pcbV=1.5f;                // significant reduction of pcs by Neff_M starts around Neff_M-1=pcb
  par.pccV=1.0f;                // pcs are reduced prop. to 1/Neff^pcc
  par.pcwV=0.0f;                // wc>0 weighs columns according to their intra-clomun similarity

  par.gapbV=1.0;                // default values for transition pseudocounts
  par.gapdV=0.15;               // gap open penalty pseudocount; 0.25 corresponds to 7.1*gapf bits
  par.gapeV=1.0;                // gap extension penalty pseudocount
  par.gapfV=0.6;                // factor for increasing gap open penalty for deletes
  par.gapgV=0.6;                // factor for increasing gap open penalty for inserts
  par.gaphV=0.6;                // factor for increasing gap extension penalty for deletes
  par.gapiV=0.6;                // factor for increasing gap extension penalty for inserts
#endif

  par.ssm=2;                   // ss scoring mode: 0:no ss score  1:score after alignment  2:score during alignment
  par.ssw=0.11f;               // weight of ss scoring
  par.ssa=1.0f;                // weight of ss evolution matrix
  par.shift=-0.01f;            // Shift match score up 
  par.mact=0.3001f;            // Score threshold for MAC alignment in local mode (set to 0.5001 to track user modification)
  par.corr=0.1f;               // Weight of correlations of scores for |i-j|<=4
  par.wstruc=1.0f;             // Weight of structure scores

  par.egq=0.0f;                // no charge for end gaps as default
  par.egt=0.0f;                // no charge for end gaps as default

  par.trans=0;                 // no transitive scoring as default
  par.Emax_trans=100.0f;       // use intermediate HMMs with E-values up to 100 between query and database HMM
  par.Emax_trans=100.0f;       // use intermediate HMMs with E-values up to 100 between query and database HMM
  par.wtrans=1.0f;             // Ztot[k] = Zq[k] + wtrans * (Zforward[k]+Zreverse[k])
  par.ssgap=0;                 // 1: add secondary structure-dependent gap penalties  0:off
  par.ssgapd=1.0f;             // secondary structure-dependent gap-opening penalty (per residue)
  par.ssgape=0.0f;             // secondary structure-dependent gap-extension penalty (per residue)
  par.ssgapi=4;                // max. number of inside-integer(ii); gap-open-penalty= -ii*ssgapd

  par.loc=1;                   // local vs. global alignment as default
  par.altali=2;                // find up to two (possibly overlapping) subalignments 
  par.forward=0;               // 0: Viterbi algorithm; 1: Viterbi+stochastic sampling; 3:Maximum Accuracy (MAC) algorithm
  par.realign=1;               // realign with MAC algorithm

  par.repmode=0;               // repeats score independently of one another
  par.columnscore=1;           // Default column score is 1: null model pnul = 1/2 * (q_av(a)+p_av(a))
  par.min_overlap=0;           // automatic minimum overlap used
  par.opt=0;                   // Default = optimization mode off
  par.readdefaultsfile=0;      // Default = do not read a defaults file ./.hhdefaults or HOME/.hhdefaults
  par.maxdbstrlen=200;         // maximum length of database string to be printed in 'Command' line of hhr file
  par.mode=0;
  par.idummy=par.jdummy=0;     //

  par.notags=1;                // neutralize His-tags, FLAG-tags, C-myc-tags

  // Initialize strings
  strcpy(par.infile,"stdin");
  strcpy(par.outfile,"");
  strcpy(par. pairwisealisfile,"");
  strcpy(par.buffer,"buffer.txt"); 
  strcpy(par.scorefile,""); 
  strcpy(par.wfile,""); 
  strcpy(par.alnfile,""); 
  strcpy(par.hhmfile,""); 
  strcpy(par.psifile,""); 
  par.exclstr=NULL; 

#if 0  /* read parameter file from home-dir */
#include "hhutil-C-help.h"
#endif /* read parameter file from home-dir */

   return;
} /** this is the end of SetDefaults() **/

/*
 * EOF hhutil-C.h
 */