view clustalomega/clustal-omega-0.2.0/src/squid/msa.c @ 0:ea6d0e588642 default tip

Migrated tool version 0.2 from old tool shed archive to new tool shed repository
author clustalomega
date Tue, 07 Jun 2011 16:13:02 -0400
parents
children
line wrap: on
line source

/*****************************************************************
 * SQUID - a library of functions for biological sequence analysis
 * Copyright (C) 1992-2002 Washington University School of Medicine
 * 
 *     This source code is freely distributed under the terms of the
 *     GNU General Public License. See the files COPYRIGHT and LICENSE
 *     for details.
 *****************************************************************/

/* msa.c
 * SRE, Mon May 17 10:48:47 1999
 * 
 * SQUID's interface for multiple sequence alignment
 * manipulation: access to the MSA object.
 * 
 * RCS $Id: msa.c 217 2011-03-19 10:27:10Z andreas $ (Original squid RCS Id: msa.c,v 1.18 2002/10/12 04:40:35 eddy Exp)
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "squid.h"
#include "msa.h"	/* multiple sequence alignment object support */
#include "gki.h"	/* string indexing hashtable code  */
#include "ssi.h"	/* SSI sequence file indexing code */

/* Function: MSAAlloc()
 * Date:     SRE, Tue May 18 10:45:47 1999 [St. Louis]
 *
 * Purpose:  Allocate an MSA structure, return a pointer
 *           to it.
 *           
 *           Designed to be used in three ways:
 *           1) We know exactly the dimensions of the alignment:
 *              both nseq and alen.
 *                    msa = MSAAlloc(nseq, alen);
 *                    
 *           2) We know the number of sequences but not alen.
 *              (We add sequences later.) 
 *                    msa = MSAAlloc(nseq, 0);
 *              
 *           3) We even don't know the number of sequences, so
 *              we'll have to dynamically expand allocations.
 *              We provide a blocksize for the allocation expansion,
 *              and expand when needed.
 *                    msa = MSAAlloc(10, 0);
 *                    if (msa->nseq == msa->nseqalloc) MSAExpand(msa);   
 *
 * Args:     nseq - number of sequences, or nseq allocation blocksize
 *           alen - length of alignment in columns, or 0      
 *
 * Returns:  pointer to new MSA object, w/ all values initialized.
 *           Note that msa->nseq is initialized to 0, though space
 *           is allocated.
 *           
 * Diagnostics: "always works". Die()'s on memory allocation failure.
 *             
 */
MSA *
MSAAlloc(int nseq, int alen)
{
  MSA *msa;
  int  i;

  msa         = MallocOrDie(sizeof(MSA));
  msa->aseq   = MallocOrDie(sizeof(char *) * nseq);
  msa->sqname = MallocOrDie(sizeof(char *) * nseq);
  msa->sqlen  = MallocOrDie(sizeof(int)    * nseq);
  msa->wgt    = MallocOrDie(sizeof(float)  * nseq);

  for (i = 0; i < nseq; i++)
    {
      msa->sqname[i] = NULL;
      msa->sqlen[i]  = 0;
      msa->wgt[i]    = -1.0;

      if (alen != 0) msa->aseq[i] = MallocOrDie(sizeof(char) * (alen+1));
      else           msa->aseq[i] = NULL;
    }      

  msa->alen      = alen;
  msa->nseq      = 0;
  msa->nseqalloc = nseq;
  msa->nseqlump  = nseq;

  msa->flags   = 0;
  msa->type    = kOtherSeq;
  msa->name    = NULL;
  msa->desc    = NULL;
  msa->acc     = NULL;
  msa->au      = NULL;
  msa->ss_cons = NULL;
  msa->sa_cons = NULL;
  msa->rf      = NULL;
  msa->sqacc   = NULL;
  msa->sqdesc  = NULL;
  msa->ss      = NULL;
  msa->sslen   = NULL;
  msa->sa      = NULL;
  msa->salen   = NULL;
  msa->index   = GKIInit();
  msa->lastidx = 0;

  for (i = 0; i < MSA_MAXCUTOFFS; i++) {
    msa->cutoff[i]        = 0.;
    msa->cutoff_is_set[i] = FALSE;
  }

  /* Initialize unparsed optional markup
   */
  msa->comment        = NULL;
  msa->ncomment       = 0;
  msa->alloc_ncomment = 0;

  msa->gf_tag         = NULL;
  msa->gf             = NULL;
  msa->ngf            = 0;

  msa->gs_tag         = NULL;
  msa->gs             = NULL;
  msa->gs_idx         = NULL;
  msa->ngs            = 0;

  msa->gc_tag         = NULL;
  msa->gc             = NULL;
  msa->gc_idx         = NULL;
  msa->ngc            = 0;

  msa->gr_tag         = NULL;
  msa->gr             = NULL;
  msa->gr_idx         = NULL;
  msa->ngr            = 0;

  /* Done. Return the alloced, initialized structure
   */ 
  return msa;
}

/* Function: MSAExpand()
 * Date:     SRE, Tue May 18 11:06:53 1999 [St. Louis]
 *
 * Purpose:  Increase the sequence allocation in an MSA
 *           by msa->nseqlump. (Typically used when we're reading
 *           in an alignment sequentially from a file,
 *           so we don't know nseq until we're done.)
 *
 * Args:     msa - the MSA object
 *
 * Returns:  (void)
 *           
 */
void
MSAExpand(MSA *msa)
{
  int i,j;

  msa->nseqalloc += msa->nseqlump;

  msa->aseq   = ReallocOrDie(msa->aseq,   sizeof(char *) * msa->nseqalloc);
  msa->sqname = ReallocOrDie(msa->sqname, sizeof(char *) * msa->nseqalloc);
  msa->sqlen  = ReallocOrDie(msa->sqlen,  sizeof(char *) * msa->nseqalloc);
  msa->wgt    = ReallocOrDie(msa->wgt,    sizeof(float)  * msa->nseqalloc);

  if (msa->ss != NULL) {
    msa->ss    = ReallocOrDie(msa->ss,    sizeof(char *) * msa->nseqalloc);
    msa->sslen = ReallocOrDie(msa->sslen, sizeof(int)    * msa->nseqalloc);
  }
  if (msa->sa != NULL) {
    msa->sa    = ReallocOrDie(msa->sa,    sizeof(char *) * msa->nseqalloc);
    msa->salen = ReallocOrDie(msa->salen, sizeof(int)    * msa->nseqalloc);
  }
  if (msa->sqacc != NULL)
    msa->sqacc = ReallocOrDie(msa->sqacc, sizeof(char *) * msa->nseqalloc);
  if (msa->sqdesc != NULL)
    msa->sqdesc =ReallocOrDie(msa->sqdesc,sizeof(char *) * msa->nseqalloc);

  for (i = msa->nseqalloc-msa->nseqlump; i < msa->nseqalloc; i++)
    {
      msa->sqname[i] = NULL;
      msa->wgt[i]    = -1.0;

      if (msa->sqacc  != NULL) msa->sqacc[i]  = NULL;
      if (msa->sqdesc != NULL) msa->sqdesc[i] = NULL;

      if (msa->alen != 0) 
	msa->aseq[i] = ReallocOrDie(msa->aseq[i], sizeof(char) * (msa->alen+1));
      else msa->aseq[i] = NULL;
      msa->sqlen[i] = 0;

      if (msa->ss != NULL) {
	if (msa->alen != 0) 
	  msa->ss[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
	else msa->ss[i] = NULL;
	msa->sslen[i] = 0;
      }
      if (msa->sa != NULL) { 
	if (msa->alen != 0) 
	  msa->sa[i] = ReallocOrDie(msa->ss[i], sizeof(char) * (msa->alen+1));
	else 
	  msa->sa[i] = NULL;
	msa->salen[i] = 0;
      }
    }

  /* Reallocate and re-init for unparsed #=GS tags, if we have some.
   * gs is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
   * set of pointers.
   */
  if (msa->gs != NULL)
    for (i = 0; i < msa->ngs; i++)
      {
	if (msa->gs[i] != NULL)
	  {
	    msa->gs[i] = ReallocOrDie(msa->gs[i], sizeof(char *) * msa->nseqalloc);
	    for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
	      msa->gs[i][j] = NULL;
	  }
      }

  /* Reallocate and re-init for unparsed #=GR tags, if we have some.
   * gr is [0..ngs-1][0..nseq-1][], so we're reallocing the middle
   * set of pointers.
   */
  if (msa->gr != NULL)
    for (i = 0; i < msa->ngr; i++)
      {
	if (msa->gr[i] != NULL)
	  {
	    msa->gr[i] = ReallocOrDie(msa->gr[i], sizeof(char *) * msa->nseqalloc);
	    for (j = msa->nseqalloc-msa->nseqlump; j < msa->nseqalloc; j++)
	      msa->gr[i][j] = NULL;
	  }
      }

  return;
}

/* Function: MSAFree()
 * Date:     SRE, Tue May 18 11:20:16 1999 [St. Louis]
 *
 * Purpose:  Free a multiple sequence alignment structure.
 *
 * Args:     msa - the alignment
 *
 * Returns:  (void)
 */
void
MSAFree(MSA *msa)
{
  Free2DArray((void **) msa->aseq,   msa->nseq);
  Free2DArray((void **) msa->sqname, msa->nseq);
  Free2DArray((void **) msa->sqacc,  msa->nseq);
  Free2DArray((void **) msa->sqdesc, msa->nseq);
  Free2DArray((void **) msa->ss,     msa->nseq);
  Free2DArray((void **) msa->sa,     msa->nseq);

  if (msa->sqlen   != NULL) free(msa->sqlen);
  if (msa->wgt     != NULL) free(msa->wgt);

  if (msa->name    != NULL) free(msa->name);
  if (msa->desc    != NULL) free(msa->desc);
  if (msa->acc     != NULL) free(msa->acc);
  if (msa->au      != NULL) free(msa->au);
  if (msa->ss_cons != NULL) free(msa->ss_cons);
  if (msa->sa_cons != NULL) free(msa->sa_cons);
  if (msa->rf      != NULL) free(msa->rf);
  if (msa->sslen   != NULL) free(msa->sslen);
  if (msa->salen   != NULL) free(msa->salen);
  
  Free2DArray((void **) msa->comment, msa->ncomment);
  Free2DArray((void **) msa->gf_tag,  msa->ngf);
  Free2DArray((void **) msa->gf,      msa->ngf);
  Free2DArray((void **) msa->gs_tag,  msa->ngs);
  Free3DArray((void ***)msa->gs,      msa->ngs, msa->nseq);
  Free2DArray((void **) msa->gc_tag,  msa->ngc);
  Free2DArray((void **) msa->gc,      msa->ngc);
  Free2DArray((void **) msa->gr_tag,  msa->ngr);
  Free3DArray((void ***)msa->gr,      msa->ngr, msa->nseq);

  GKIFree(msa->index);
  GKIFree(msa->gs_idx);
  GKIFree(msa->gc_idx);
  GKIFree(msa->gr_idx);

  free(msa);
}


/* Function: MSASetSeqAccession()
 * Date:     SRE, Mon Jun 21 04:13:33 1999 [Sanger Centre]
 *
 * Purpose:  Set a sequence accession in an MSA structure.
 *           Handles some necessary allocation/initialization.
 *
 * Args:     msa      - multiple alignment to add accession to
 *           seqidx   - index of sequence to attach accession to
 *           acc      - accession 
 *
 * Returns:  void
 */
void
MSASetSeqAccession(MSA *msa, int seqidx, char *acc)
{
  int x;

  if (msa->sqacc == NULL) {
    msa->sqacc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
    for (x = 0; x < msa->nseqalloc; x++)
      msa->sqacc[x] = NULL;
  }
  msa->sqacc[seqidx] = sre_strdup(acc, -1);
}

/* Function: MSASetSeqDescription()
 * Date:     SRE, Mon Jun 21 04:21:09 1999 [Sanger Centre]
 *
 * Purpose:  Set a sequence description in an MSA structure.
 *           Handles some necessary allocation/initialization.
 *
 * Args:     msa      - multiple alignment to add accession to
 *           seqidx   - index of sequence to attach accession to
 *           desc     - description
 *
 * Returns:  void
 */
void
MSASetSeqDescription(MSA *msa, int seqidx, char *desc)
{
  int x;

  if (msa->sqdesc == NULL) {
    msa->sqdesc = MallocOrDie(sizeof(char *) * msa->nseqalloc);
    for (x = 0; x < msa->nseqalloc; x++)
      msa->sqdesc[x] = NULL;
  }
  msa->sqdesc[seqidx] = sre_strdup(desc, -1);
}


/* Function: MSAAddComment()
 * Date:     SRE, Tue Jun  1 17:37:21 1999 [St. Louis]
 *
 * Purpose:  Add an (unparsed) comment line to the MSA structure,
 *           allocating as necessary.
 *
 * Args:     msa - a multiple alignment
 *           s   - comment line to add
 *
 * Returns:  (void)
 */
void
MSAAddComment(MSA *msa, char *s)
{
  /* If this is our first recorded comment, we need to malloc();
   * and if we've filled available space, we need to realloc().
   * Note the arbitrary lumpsize of 10 lines per allocation...
   */
  if (msa->comment == NULL) {
    msa->comment        = MallocOrDie (sizeof(char *) * 10);
    msa->alloc_ncomment = 10;
  }
  if (msa->ncomment == msa->alloc_ncomment) {
    msa->alloc_ncomment += 10;
    msa->comment = ReallocOrDie(msa->comment, sizeof(char *) * msa->alloc_ncomment);
  }

  msa->comment[msa->ncomment] = sre_strdup(s, -1);
  msa->ncomment++;
  return;
}

/* Function: MSAAddGF()
 * Date:     SRE, Wed Jun  2 06:53:54 1999 [bus to Madison]
 *
 * Purpose:  Add an unparsed #=GF markup line to the MSA
 *           structure, allocating as necessary. 
 *
 * Args:     msa   - a multiple alignment
 *           tag   - markup tag (e.g. "AU")       
 *           value - free text markup (e.g. "Alex Bateman")
 *
 * Returns:  (void)
 */
void
MSAAddGF(MSA *msa, char *tag, char *value)
{
  /* If this is our first recorded unparsed #=GF line, we need to malloc();
   * if we've filled availabl space If we already have a hash index, and the GF 
   * Note the arbitrary lumpsize of 10 lines per allocation...
   */
  if (msa->gf_tag == NULL) {
    msa->gf_tag    = MallocOrDie (sizeof(char *) * 10);
    msa->gf        = MallocOrDie (sizeof(char *) * 10);
    msa->alloc_ngf = 10;
  }
  if (msa->ngf == msa->alloc_ngf) {
    msa->alloc_ngf += 10;
    msa->gf_tag     = ReallocOrDie(msa->gf_tag, sizeof(char *) * msa->alloc_ngf);
    msa->gf         = ReallocOrDie(msa->gf, sizeof(char *) * msa->alloc_ngf);
  }

  msa->gf_tag[msa->ngf] = sre_strdup(tag, -1);
  msa->gf[msa->ngf]     = sre_strdup(value, -1);
  msa->ngf++;

  return;
}


/* Function: MSAAddGS()
 * Date:     SRE, Wed Jun  2 06:57:03 1999 [St. Louis]
 *
 * Purpose:  Add an unparsed #=GS markup line to the MSA
 *           structure, allocating as necessary.
 *           
 *           It's possible that we could get more than one
 *           of the same type of GS tag per sequence; for
 *           example, "DR PDB;" structure links in Pfam.
 *           Hack: handle these by appending to the string,
 *           in a \n separated fashion. 
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "AC")
 *           sqidx  - index of sequence to assoc markup with (0..nseq-1)
 *           value  - markup (e.g. "P00666")
 *
 * Returns:  0 on success
 */
void
MSAAddGS(MSA *msa, char *tag, int sqidx, char *value)
{
  int tagidx;
  int i;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gs_tag == NULL)	/* first tag? init w/ malloc  */
    {
      msa->gs_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gs_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gs_tag = MallocOrDie(sizeof(char *));
      msa->gs     = MallocOrDie(sizeof(char **));
      msa->gs[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      for (i = 0; i < msa->nseqalloc; i++)
	msa->gs[0][i] = NULL;
    }
  else 
    {
				/* new tag? */
      tagidx  = GKIKeyIndex(msa->gs_idx, tag); 
      if (tagidx < 0) {		/* it's a new tag name; realloc */
	tagidx = GKIStoreKey(msa->gs_idx, tag);
				/* since we alloc in blocks of 1,
				   we always realloc upon seeing 
				   a new tag. */
	SQD_DASSERT1((tagidx == msa->ngs));
	msa->gs_tag =       ReallocOrDie(msa->gs_tag, (msa->ngs+1) * sizeof(char *));
	msa->gs     =       ReallocOrDie(msa->gs, (msa->ngs+1) * sizeof(char **));
	msa->gs[msa->ngs] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	for (i = 0; i < msa->nseqalloc; i++) 
	  msa->gs[msa->ngs][i] = NULL;
      }
    }

  if (tagidx == msa->ngs) {
    msa->gs_tag[tagidx] = sre_strdup(tag, -1);
    msa->ngs++;
  }
  
  if (msa->gs[tagidx][sqidx] == NULL) /* first annotation of this seq with this tag? */
    msa->gs[tagidx][sqidx] = sre_strdup(value, -1);
  else {			
				/* >1 annotation of this seq with this tag; append */
    int len;
    if ((len = sre_strcat(&(msa->gs[tagidx][sqidx]), -1, "\n", 1)) < 0)
      Die("failed to sre_strcat()");
    if (sre_strcat(&(msa->gs[tagidx][sqidx]), len, value, -1) < 0)
      Die("failed to sre_strcat()");
  }
  return;
} 

/* Function: MSAAppendGC()
 * Date:     SRE, Thu Jun  3 06:25:14 1999 [Madison]
 *
 * Purpose:  Add an unparsed #=GC markup line to the MSA
 *           structure, allocating as necessary. 
 *           
 *           When called multiple times for the same tag,
 *           appends value strings together -- used when
 *           parsing multiblock alignment files, for
 *           example.
 *
 * Args:     msa   - multiple alignment structure
 *           tag   - markup tag (e.g. "CS")
 *           value - markup, one char per aligned column      
 *
 * Returns:  (void)
 */
void
MSAAppendGC(MSA *msa, char *tag, char *value)
{
  int tagidx;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gc_tag == NULL)	/* first tag? init w/ malloc  */
    {
      msa->gc_tag = MallocOrDie(sizeof(char *));
      msa->gc     = MallocOrDie(sizeof(char *));
      msa->gc_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gc_idx, tag);
      SQD_DASSERT1((tagidx == 0));
      msa->gc[0]  = NULL;
    }
  else
    {			/* new tag? */
      tagidx  = GKIKeyIndex(msa->gc_idx, tag); 
      if (tagidx < 0) {		/* it's a new tag name; realloc */
	tagidx = GKIStoreKey(msa->gc_idx, tag);
				/* since we alloc in blocks of 1,
				   we always realloc upon seeing 
				   a new tag. */
	SQD_DASSERT1((tagidx == msa->ngc));
	msa->gc_tag = ReallocOrDie(msa->gc_tag, (msa->ngc+1) * sizeof(char **));
	msa->gc     = ReallocOrDie(msa->gc, (msa->ngc+1) * sizeof(char **));
	msa->gc[tagidx] = NULL;
      }
    }

  if (tagidx == msa->ngc) {
    msa->gc_tag[tagidx] = sre_strdup(tag, -1);
    msa->ngc++;
  }
  sre_strcat(&(msa->gc[tagidx]), -1, value, -1);
  return;
}

/* Function: MSAGetGC()
 * Date:     SRE, Fri Aug 13 13:25:57 1999 [St. Louis]
 *
 * Purpose:  Given a tagname for a miscellaneous #=GC column
 *           annotation, return a pointer to the annotation
 *           string. 
 *
 * Args:     msa  - alignment and its annotation
 *           tag  - name of the annotation       
 *
 * Returns:  ptr to the annotation string. Caller does *not*
 *           free; is managed by msa object still.
 */
char *
MSAGetGC(MSA *msa, char *tag)
{
  int tagidx;

  if (msa->gc_idx == NULL) return NULL;
  if ((tagidx = GKIKeyIndex(msa->gc_idx, tag)) < 0) return NULL;
  return msa->gc[tagidx];
}


/* Function: MSAAppendGR()
 * Date:     SRE, Thu Jun  3 06:34:38 1999 [Madison]
 *
 * Purpose:  Add an unparsed #=GR markup line to the
 *           MSA structure, allocating as necessary.
 *           
 *           When called multiple times for the same tag,
 *           appends value strings together -- used when
 *           parsing multiblock alignment files, for
 *           example.
 *
 * Args:     msa    - multiple alignment structure
 *           tag    - markup tag (e.g. "SS")
 *           sqidx  - index of seq to assoc markup with (0..nseq-1)
 *           value  - markup, one char per aligned column      
 *
 * Returns:  (void)
 */
void
MSAAppendGR(MSA *msa, char *tag, int sqidx, char *value)
{
  int tagidx;
  int i;

  /* Is this an unparsed tag name that we recognize?
   * If not, handle adding it to index, and reallocating
   * as needed.
   */
  if (msa->gr_tag == NULL)	/* first tag? init w/ malloc  */
    {
      msa->gr_tag = MallocOrDie(sizeof(char *));
      msa->gr     = MallocOrDie(sizeof(char **));
      msa->gr[0]  = MallocOrDie(sizeof(char *) * msa->nseqalloc);
      for (i = 0; i < msa->nseqalloc; i++) 
	msa->gr[0][i] = NULL;
      msa->gr_idx = GKIInit();
      tagidx      = GKIStoreKey(msa->gr_idx, tag);
      SQD_DASSERT1((tagidx == 0));
    }
  else 
    {
				/* new tag? */
      tagidx  = GKIKeyIndex(msa->gr_idx, tag); 
      if (tagidx < 0) {		/* it's a new tag name; realloc */
	tagidx = GKIStoreKey(msa->gr_idx, tag);
				/* since we alloc in blocks of 1,
				   we always realloc upon seeing 
				   a new tag. */
	SQD_DASSERT1((tagidx == msa->ngr));
	msa->gr_tag       = ReallocOrDie(msa->gr_tag, (msa->ngr+1) * sizeof(char *));
	msa->gr           = ReallocOrDie(msa->gr, (msa->ngr+1) * sizeof(char **));
	msa->gr[msa->ngr] = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	for (i = 0; i < msa->nseqalloc; i++) 
	  msa->gr[msa->ngr][i] = NULL;
      }
    }
  
  if (tagidx == msa->ngr) {
    msa->gr_tag[tagidx] = sre_strdup(tag, -1);
    msa->ngr++;
  }
  sre_strcat(&(msa->gr[tagidx][sqidx]), -1, value, -1);
  return;
}


/* Function: MSAVerifyParse()
 * Date:     SRE, Sat Jun  5 14:24:24 1999 [Madison, 1999 worm mtg]
 *
 * Purpose:  Last function called after a multiple alignment is
 *           parsed. Checks that parse was successful; makes sure
 *           required information is present; makes sure required
 *           information is consistent. Some fields that are
 *           only use during parsing may be freed (sqlen, for
 *           example).
 *           
 *           Some fields in msa may be modified (msa->alen is set,
 *           for example).
 *
 * Args:     msa - the multiple alignment
 *                 sqname, aseq must be set
 *                 nseq must be correct
 *                 alen need not be set; will be set here.
 *                 wgt will be set here if not already set
 *
 * Returns:  (void)
 *           Will Die() here with diagnostics on error.
 *
 * Example:  
 */
void
MSAVerifyParse(MSA *msa)
{
  int idx;

  if (msa->nseq == 0) Die("Parse error: no sequences were found for alignment %s",
			  msa->name != NULL ? msa->name : "");

  msa->alen = msa->sqlen[0];

  /* We can rely on msa->sqname[] being valid for any index,
   * because of the way the line parsers always store any name
   * they add to the index.
   */
  for (idx = 0; idx < msa->nseq; idx++)
    {
				/* aseq is required. */
      if (msa->aseq[idx] == NULL) 
	Die("Parse error: No sequence for %s in alignment %s", msa->sqname[idx],
	    msa->name != NULL ? msa->name : "");
				/* either all weights must be set, or none of them */
      if ((msa->flags & MSA_SET_WGT) && msa->wgt[idx] == -1.0)
	Die("Parse error: some weights are set, but %s doesn't have one in alignment %s", 
	    msa->sqname[idx],
	    msa->name != NULL ? msa->name : "");
				/* all aseq must be same length. */
      if (msa->sqlen[idx] != msa->alen)
	Die("Parse error: sequence %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->sqlen[idx], msa->alen,
	    msa->name != NULL ? msa->name : "");
				/* if SS is present, must have length right */
      if (msa->ss != NULL && msa->ss[idx] != NULL && msa->sslen[idx] != msa->alen) 
	Die("Parse error: #=GR SS annotation for %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->sslen[idx], msa->alen,
	    msa->name != NULL ? msa->name : "");
				/* if SA is present, must have length right */
      if (msa->sa != NULL && msa->sa[idx] != NULL && msa->salen[idx] != msa->alen) 
	Die("Parse error: #=GR SA annotation for %s: length %d, expected %d in alignment %s",
	    msa->sqname[idx], msa->salen[idx], msa->alen,
	    msa->name != NULL ? msa->name : "");
    }

			/* if cons SS is present, must have length right */
  if (msa->ss_cons != NULL && strlen(msa->ss_cons) != msa->alen) 
    Die("Parse error: #=GC SS_cons annotation: length %d, expected %d in alignment %s",
	strlen(msa->ss_cons), msa->alen,
	msa->name != NULL ? msa->name : "");

			/* if cons SA is present, must have length right */
  if (msa->sa_cons != NULL && strlen(msa->sa_cons) != msa->alen) 
    Die("Parse error: #=GC SA_cons annotation: length %d, expected %d in alignment %s",
	strlen(msa->sa_cons), msa->alen,
	msa->name != NULL ? msa->name : "");

				/* if RF is present, must have length right */
  if (msa->rf != NULL && strlen(msa->rf) != msa->alen) 
    Die("Parse error: #=GC RF annotation: length %d, expected %d in alignment %s",
	strlen(msa->rf), msa->alen,
	msa->name != NULL ? msa->name : "");

				/* Check that all or no weights are set */
  if (!(msa->flags & MSA_SET_WGT))
    FSet(msa->wgt, msa->nseq, 1.0); /* default weights */

				/* Clean up a little from the parser */
  if (msa->sqlen != NULL) { free(msa->sqlen); msa->sqlen = NULL; }
  if (msa->sslen != NULL) { free(msa->sslen); msa->sslen = NULL; }
  if (msa->salen != NULL) { free(msa->salen); msa->salen = NULL; }

  return;
}




/* Function: MSAFileOpen()
 * Date:     SRE, Tue May 18 13:22:01 1999 [St. Louis]
 *
 * Purpose:  Open an alignment database file and prepare
 *           for reading one alignment, or sequentially
 *           in the (rare) case of multiple MSA databases
 *           (e.g. Stockholm format).
 *           
 * Args:     filename - name of file to open
 *                      if "-", read stdin
 *                      if it ends in ".gz", read from pipe to gunzip -dc
 *           format   - format of file (e.g. MSAFILE_STOCKHOLM)
 *           env      - environment variable for path (e.g. BLASTDB)
 *
 * Returns:  opened MSAFILE * on success.
 *           NULL on failure: 
 *             usually, because the file doesn't exist;
 *             for gzip'ed files, may also mean that gzip isn't in the path.
 */
MSAFILE *
MSAFileOpen(char *filename, int format, char *env)
{
  MSAFILE *afp;
  
  afp        = MallocOrDie(sizeof(MSAFILE));
  if (strcmp(filename, "-") == 0)
    {
      afp->f         = stdin;
      afp->do_stdin  = TRUE; 
      afp->do_gzip   = FALSE;
      afp->fname     = sre_strdup("[STDIN]", -1);
      afp->ssi       = NULL;	/* can't index stdin because we can't seek*/
    }
#ifndef SRE_STRICT_ANSI		
  /* popen(), pclose() aren't portable to non-POSIX systems; disable */
  else if (Strparse("^.*\\.gz$", filename, 0))
    {
      char cmd[256];

      /* Note that popen() will return "successfully"
       * if file doesn't exist, because gzip works fine
       * and prints an error! So we have to check for
       * existence of file ourself.
       */
      if (! FileExists(filename))
	Die("%s: file does not exist", filename);
      if (strlen(filename) + strlen("gzip -dc ") >= 256)
	Die("filename > 255 char in MSAFileOpen()"); 
      sprintf(cmd, "gzip -dc %s", filename);
      if ((afp->f = popen(cmd, "r")) == NULL)
	return NULL;

      afp->do_stdin = FALSE;
      afp->do_gzip  = TRUE;
      afp->fname    = sre_strdup(filename, -1);
      /* we can't index a .gz file, because we can't seek in a pipe afaik */
      afp->ssi      = NULL;	
    }
#endif /*SRE_STRICT_ANSI*/
  else
    {
      char *ssifile;
      char *dir;

      /* When we open a file, it may be either in the current
       * directory, or in the directory indicated by the env
       * argument - and we have to construct the SSI filename accordingly.
       */
      if ((afp->f = fopen(filename, "r")) != NULL)
	{
	  ssifile = MallocOrDie(sizeof(char) * (strlen(filename) + 5));
	  sprintf(ssifile, "%s.ssi", filename);
	}
      else if ((afp->f = EnvFileOpen(filename, env, &dir)) != NULL)
	{
	  char *full;
	  full = FileConcat(dir, filename);
	  ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(filename)  + 5));
	  sprintf(ssifile, "%s.ssi", full);
	  free(dir);
	}
      else return NULL;

      afp->do_stdin = FALSE;
      afp->do_gzip  = FALSE;
      afp->fname    = sre_strdup(filename, -1);
      afp->ssi      = NULL;

      /* Open the SSI index file. If it doesn't exist, or
       * it's corrupt, or some error happens, afp->ssi stays NULL.
       */
      SSIOpen(ssifile, &(afp->ssi));
      free(ssifile);
    }

  /* Invoke autodetection if we haven't already been told what
   * to expect.
   */
  if (format == MSAFILE_UNKNOWN)
    {
      if (afp->do_stdin == TRUE || afp->do_gzip)
	Die("Can't autodetect alignment file format from a stdin or gzip pipe");
      format = MSAFileFormat(afp);
      if (format == MSAFILE_UNKNOWN)
	Die("Can't determine format of multiple alignment file %s", afp->fname);
    }

  afp->format     = format;
  afp->linenumber = 0;
  afp->buf        = NULL;
  afp->buflen     = 0;

  return afp;
}


/* Function: MSAFilePositionByKey()
 *           MSAFilePositionByIndex()
 *           MSAFileRewind()
 * 
 * Date:     SRE, Tue Nov  9 19:02:54 1999 [St. Louis]
 *
 * Purpose:  Family of functions for repositioning in
 *           open MSA files; analogous to a similarly
 *           named function series in HMMER's hmmio.c.
 *
 * Args:     afp    - open alignment file
 *           offset - disk offset in bytes
 *           key    - key to look up in SSI indices 
 *           idx    - index of alignment.
 *
 * Returns:  0 on failure.
 *           1 on success.
 *           If called on a non-fseek()'able file (e.g. a gzip'ed
 *           or pipe'd alignment), returns 0 as a failure flag.
 */
int 
MSAFileRewind(MSAFILE *afp)
{
  if (afp->do_gzip || afp->do_stdin) return 0;
  rewind(afp->f);
  return 1;
}
int 
MSAFilePositionByKey(MSAFILE *afp, char *key)
{
  int       fh;			/* filehandle is ignored       */
  SSIOFFSET offset;		/* offset of the key alignment */

  if (afp->ssi == NULL) return 0;
  if (SSIGetOffsetByName(afp->ssi, key, &fh, &offset) != 0) return 0;
  if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
  return 1;
}
int
MSAFilePositionByIndex(MSAFILE *afp, int idx)
{
  int       fh;			/* filehandled is passed but ignored */
  SSIOFFSET offset;		/* disk offset of desired alignment  */

  if (afp->ssi == NULL) return 0;
  if (SSIGetOffsetByNumber(afp->ssi, idx, &fh, &offset) != 0) return 0;
  if (SSISetFilePosition(afp->f, &offset) != 0) return 0;
  return 1;
}


/* Function: MSAFileRead()
 * Date:     SRE, Fri May 28 16:01:43 1999 [St. Louis]
 *
 * Purpose:  Read the next msa from an open alignment file.
 *           This is a wrapper around format-specific calls.
 *
 * Args:     afp     - open alignment file
 *
 * Returns:  next alignment, or NULL if out of alignments 
 */
MSA *
MSAFileRead(MSAFILE *afp)
{
  MSA *msa = NULL;

  switch (afp->format) {
  case MSAFILE_STOCKHOLM: msa = ReadStockholm(afp); break;
  case MSAFILE_MSF:       msa = ReadMSF(afp);       break;
  case MSAFILE_A2M:       msa = ReadA2M(afp);       break;
  case MSAFILE_CLUSTAL:   msa = ReadClustal(afp);   break;
  case MSAFILE_SELEX:     msa = ReadSELEX(afp);     break;
  case MSAFILE_PHYLIP:    msa = ReadPhylip(afp);    break;
  default:
    Die("MSAFILE corrupted: bad format index");
  }
  return msa;
}

/* Function: MSAFileClose()
 * Date:     SRE, Tue May 18 14:05:28 1999 [St. Louis]
 *
 * Purpose:  Close an open MSAFILE.
 *
 * Args:     afp  - ptr to an open MSAFILE.
 *
 * Returns:  void
 */
void
MSAFileClose(MSAFILE *afp)
{
#ifndef SRE_STRICT_ANSI	 /* gzip functionality only on POSIX systems */
  if (afp->do_gzip)    pclose(afp->f);
#endif
  if (! afp->do_stdin) fclose(afp->f);
  if (afp->buf  != NULL) free(afp->buf);
  if (afp->ssi  != NULL) SSIClose(afp->ssi);
  if (afp->fname != NULL) free(afp->fname);
  free(afp);
}

char *
MSAFileGetLine(MSAFILE *afp)
{
  char *s;
  if ((s = sre_fgets(&(afp->buf), &(afp->buflen), afp->f)) == NULL)
    return NULL;
  afp->linenumber++;
  return afp->buf;
}

void 
MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline)
{
  switch (outfmt) {
#ifdef CLUSTALO
  case MSAFILE_A2M:       WriteA2M(stdout, msa, 0);     break;
  case MSAFILE_VIENNA:    WriteA2M(stdout, msa, 1);     break;
#else
  case MSAFILE_A2M:       WriteA2M(stdout, msa);     break;
#endif
  case MSAFILE_CLUSTAL:   WriteClustal(stdout, msa); break;
  case MSAFILE_MSF:       WriteMSF(stdout, msa);     break;
  case MSAFILE_PHYLIP:    WritePhylip(stdout, msa);  break;
  case MSAFILE_SELEX:     WriteSELEX(stdout, msa);   break;
  case MSAFILE_STOCKHOLM:
    if (do_oneline) WriteStockholmOneBlock(stdout, msa);
    else            WriteStockholm(stdout, msa);
    break;
  default:
    Die("can't write. no such alignment format %d\n", outfmt);
  }
}

/* Function: MSAGetSeqidx()
 * Date:     SRE, Wed May 19 15:08:25 1999 [St. Louis]
 *
 * Purpose:  From a sequence name, return seqidx appropriate
 *           for an MSA structure.
 *           
 *           1) try to guess the index. (pass -1 if you can't guess)
 *           2) Look up name in msa's hashtable.
 *           3) If it's a new name, store in msa's hashtable;
 *                                  expand allocs as needed;
 *                                  save sqname.
 *
 * Args:     msa   - alignment object
 *           name  - a sequence name
 *           guess - a guess at the right index, or -1 if no guess.
 *
 * Returns:  seqidx
 */
int
MSAGetSeqidx(MSA *msa, char *name, int guess)
{
  int seqidx;
				/* can we guess? */
  if (guess >= 0 && guess < msa->nseq && strcmp(name, msa->sqname[guess]) == 0) 
    return guess;
				/* else, a lookup in the index */
  if ((seqidx = GKIKeyIndex(msa->index, name)) >= 0)
    return seqidx;
				/* else, it's a new name */
  seqidx = GKIStoreKey(msa->index, name);
  if (seqidx >= msa->nseqalloc)  MSAExpand(msa);

  msa->sqname[seqidx] = sre_strdup(name, -1);
  msa->nseq++;
  return seqidx;
}


/* Function: MSAFromAINFO()
 * Date:     SRE, Mon Jun 14 11:22:24 1999 [St. Louis]
 *
 * Purpose:  Convert the old aseq/ainfo alignment structure
 *           to new MSA structure. Enables more rapid conversion
 *           of codebase to the new world order.
 *
 * Args:     aseq  - [0..nseq-1][0..alen-1] alignment
 *           ainfo - old-style optional info
 *
 * Returns:  MSA *
 */
MSA *
MSAFromAINFO(char **aseq, AINFO *ainfo)
{
  MSA *msa;
  int  i, j;

  msa = MSAAlloc(ainfo->nseq, ainfo->alen);
  for (i = 0; i < ainfo->nseq; i++)
    {
      strcpy(msa->aseq[i], aseq[i]);
      msa->wgt[i]    = ainfo->wgt[i];
      msa->sqname[i] = sre_strdup(ainfo->sqinfo[i].name, -1);
      msa->sqlen[i]  = msa->alen;
      GKIStoreKey(msa->index, msa->sqname[i]);

      if (ainfo->sqinfo[i].flags & SQINFO_ACC) 
	MSASetSeqAccession(msa, i, ainfo->sqinfo[i].acc);

      if (ainfo->sqinfo[i].flags & SQINFO_DESC) 
	MSASetSeqDescription(msa, i, ainfo->sqinfo[i].desc);

      if (ainfo->sqinfo[i].flags & SQINFO_SS) {
	if (msa->ss == NULL) {
	  msa->ss    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	  msa->sslen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
	  for (j = 0; j < msa->nseqalloc; j++) {
	    msa->ss[j]    = NULL;
	    msa->sslen[j] = 0;
	  }
	}
	MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].ss, &(msa->ss[i]));
	msa->sslen[i] = msa->alen;
      }

      if (ainfo->sqinfo[i].flags & SQINFO_SA) {
	if (msa->sa == NULL) {
	  msa->sa    = MallocOrDie(sizeof(char *) * msa->nseqalloc);
	  msa->salen = MallocOrDie(sizeof(int)    * msa->nseqalloc);
	  for (j = 0; j < msa->nseqalloc; j++) {
	    msa->sa[j]    = NULL;
	    msa->salen[j] = 0;
	  }
	}
	MakeAlignedString(msa->aseq[i], msa->alen, ainfo->sqinfo[i].sa, &(msa->sa[i]));
	msa->salen[i] = msa->alen;
      }
    }
			/* note that sre_strdup() returns NULL when passed NULL */
  msa->name    = sre_strdup(ainfo->name, -1);
  msa->desc    = sre_strdup(ainfo->desc, -1);
  msa->acc     = sre_strdup(ainfo->acc,  -1);
  msa->au      = sre_strdup(ainfo->au,   -1);
  msa->ss_cons = sre_strdup(ainfo->cs,   -1);
  msa->rf      = sre_strdup(ainfo->rf,   -1);
  if (ainfo->flags & AINFO_TC) {
    msa->cutoff[MSA_CUTOFF_TC1] = ainfo->tc1; msa->cutoff_is_set[MSA_CUTOFF_TC1] = TRUE;
    msa->cutoff[MSA_CUTOFF_TC2] = ainfo->tc2; msa->cutoff_is_set[MSA_CUTOFF_TC2] = TRUE;
  }
  if (ainfo->flags & AINFO_NC) {
    msa->cutoff[MSA_CUTOFF_NC1] = ainfo->nc1; msa->cutoff_is_set[MSA_CUTOFF_NC1] = TRUE;
    msa->cutoff[MSA_CUTOFF_NC2] = ainfo->nc2; msa->cutoff_is_set[MSA_CUTOFF_NC2] = TRUE;
  }
  if (ainfo->flags & AINFO_GA) {
    msa->cutoff[MSA_CUTOFF_GA1] = ainfo->ga1; msa->cutoff_is_set[MSA_CUTOFF_GA1] = TRUE;
    msa->cutoff[MSA_CUTOFF_GA2] = ainfo->ga2; msa->cutoff_is_set[MSA_CUTOFF_GA2] = TRUE;
  }
  msa->nseq = ainfo->nseq;
  msa->alen = ainfo->alen;
  return msa;
}




/* Function: MSAFileFormat()
 * Date:     SRE, Fri Jun 18 14:26:49 1999 [Sanger Centre]
 *
 * Purpose:  (Attempt to) determine the format of an alignment file.
 *           Since it rewinds the file pointer when it's done,
 *           cannot be used on a pipe or gzip'ed file. Works by
 *           calling SeqfileFormat() from sqio.c, then making sure
 *           that the format is indeed an alignment. If the format
 *           comes back as FASTA, it assumes that the format as A2M 
 *           (e.g. aligned FASTA).
 *
 * Args:     fname   - file to evaluate
 *
 * Returns:  format code; e.g. MSAFILE_STOCKHOLM
 */
int
MSAFileFormat(MSAFILE *afp)
{
  int fmt;

  fmt = SeqfileFormat(afp->f);

  if (fmt == SQFILE_FASTA) fmt = MSAFILE_A2M;

  if (fmt != MSAFILE_UNKNOWN && ! IsAlignmentFormat(fmt)) 
    Die("File %s does not appear to be an alignment file;\n\
rather, it appears to be an unaligned file in %s format.\n\
I'm expecting an alignment file in this context.\n",
	afp->fname,
	SeqfileFormat2String(fmt));
  return fmt;
}


/* Function: MSAMingap()
 * Date:     SRE, Mon Jun 28 18:57:54 1999 [on jury duty, St. Louis Civil Court]
 *
 * Purpose:  Remove all-gap columns from a multiple sequence alignment
 *           and its associated per-residue data.
 *
 * Args:     msa - the alignment
 *
 * Returns:  (void)
 */
void
MSAMingap(MSA *msa)
{
  int *useme;			/* array of TRUE/FALSE flags for which columns to keep */
  int apos;			/* position in original alignment */
  int idx;			/* sequence index */

  useme = MallocOrDie(sizeof(int) * msa->alen);
  for (apos = 0; apos < msa->alen; apos++)
    {
      for (idx = 0; idx < msa->nseq; idx++)
	if (! isgap(msa->aseq[idx][apos]))
	  break;
      if (idx == msa->nseq) useme[apos] = FALSE; else useme[apos] = TRUE;
    }
  MSAShorterAlignment(msa, useme);
  free(useme);
  return;
}

/* Function: MSANogap()
 * Date:     SRE, Wed Nov 17 09:59:51 1999 [St. Louis]
 *
 * Purpose:  Remove all columns from a multiple sequence alignment that
 *           contain any gaps -- used for filtering before phylogenetic
 *           analysis.
 *
 * Args:     msa - the alignment
 *
 * Returns:  (void). The alignment is modified, so if you want to keep
 *           the original for something, make a copy.
 */
void
MSANogap(MSA *msa)
{
  int *useme;			/* array of TRUE/FALSE flags for which columns to keep */
  int apos;			/* position in original alignment */
  int idx;			/* sequence index */

  useme = MallocOrDie(sizeof(int) * msa->alen);
  for (apos = 0; apos < msa->alen; apos++)
    {
      for (idx = 0; idx < msa->nseq; idx++)
	if (isgap(msa->aseq[idx][apos]))
	  break;
      if (idx == msa->nseq) useme[apos] = TRUE; else useme[apos] = FALSE;
    }
  MSAShorterAlignment(msa, useme);
  free(useme);
  return;
}


/* Function: MSAShorterAlignment()
 * Date:     SRE, Wed Nov 17 09:49:32 1999 [St. Louis]
 *
 * Purpose:  Given an array "useme" (0..alen-1) of TRUE/FALSE flags,
 *           where TRUE means "keep this column in the new alignment":
 *           Remove all columns annotated as "FALSE" in the useme
 *           array.
 *
 * Args:     msa   - the alignment. The alignment is changed, so
 *                   if you don't want the original screwed up, make
 *                   a copy of it first.
 *           useme - TRUE/FALSE flags for columns to keep: 0..alen-1
 *
 * Returns:  (void)
 */
void
MSAShorterAlignment(MSA *msa, int *useme)
{
  int apos;			/* position in original alignment */
  int mpos;			/* position in new alignment      */
  int idx;			/* sequence index */
  int i;			/* markup index */

  /* Since we're minimizing, we can overwrite, using already allocated
   * memory.
   */
  for (apos = 0, mpos = 0; apos < msa->alen; apos++)
    {
      if (useme[apos] == FALSE) continue;

			/* shift alignment and associated per-column+per-residue markup */
      if (mpos != apos)
	{
	  for (idx = 0; idx < msa->nseq; idx++)
	    {
	      msa->aseq[idx][mpos] = msa->aseq[idx][apos];
	      if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = msa->ss[idx][apos];
	      if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = msa->sa[idx][apos];
	      
	      for (i = 0; i < msa->ngr; i++)
		if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = msa->gr[i][idx][apos];
	    }
	  
	  if (msa->ss_cons != NULL) msa->ss_cons[mpos] = msa->ss_cons[apos];
	  if (msa->sa_cons != NULL) msa->sa_cons[mpos] = msa->sa_cons[apos];
	  if (msa->rf      != NULL) msa->rf[mpos]      = msa->rf[apos];

	  for (i = 0; i < msa->ngc; i++)
	    msa->gc[i][mpos] = msa->gc[i][apos];
	}
      mpos++;
    }
		
  msa->alen = mpos;		/* set new length */
				/* null terminate everything */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      msa->aseq[idx][mpos] = '\0';
      if (msa->ss != NULL && msa->ss[idx] != NULL) msa->ss[idx][mpos] = '\0';
      if (msa->sa != NULL && msa->sa[idx] != NULL) msa->sa[idx][mpos] = '\0';
	      
      for (i = 0; i < msa->ngr; i++)
	if (msa->gr[i][idx] != NULL) msa->gr[i][idx][mpos] = '\0';
    }

  if (msa->ss_cons != NULL) msa->ss_cons[mpos] = '\0';
  if (msa->sa_cons != NULL) msa->sa_cons[mpos] = '\0';
  if (msa->rf != NULL)      msa->rf[mpos] = '\0';

  for (i = 0; i < msa->ngc; i++)
    msa->gc[i][mpos] = '\0';

  return;
}


/* Function: MSASmallerAlignment()
 * Date:     SRE, Wed Jun 30 09:56:08 1999 [St. Louis]
 *
 * Purpose:  Given an array "useme" of TRUE/FALSE flags for
 *           each sequence in an alignment, construct
 *           and return a new alignment containing only 
 *           those sequences that are flagged useme=TRUE.
 *           
 *           Used by routines such as MSAFilterAlignment()
 *           and MSASampleAlignment().
 *           
 * Limitations:
 *           Does not copy unparsed Stockholm markup.
 *
 *           Does not make assumptions about meaning of wgt;
 *           if you want the new wgt vector renormalized, do
 *           it yourself with FNorm(new->wgt, new->nseq). 
 *
 * Args:     msa     -- the original (larger) alignment
 *           useme   -- [0..nseq-1] array of TRUE/FALSE flags; TRUE means include 
 *                      this seq in new alignment
 *           ret_new -- RETURN: new alignment          
 *
 * Returns:  void
 *           ret_new is allocated here; free with MSAFree() 
 */
void
MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new)
{
  MSA *new;                     /* RETURN: new alignment */
  int nnew;			/* number of seqs in new msa (e.g. # of TRUEs) */
  int oidx, nidx;		/* old, new indices */
  int i;

  nnew = 0;
  for (oidx = 0; oidx < msa->nseq; oidx++)
    if (useme[oidx]) nnew++;
  if (nnew == 0) { *ret_new = NULL; return; }
  
  new  = MSAAlloc(nnew, 0);
  nidx = 0;
  for (oidx = 0; oidx < msa->nseq; oidx++)
    if (useme[oidx])
      {
	new->aseq[nidx]   = sre_strdup(msa->aseq[oidx],   msa->alen);
	new->sqname[nidx] = sre_strdup(msa->sqname[oidx], msa->alen);
	GKIStoreKey(new->index, msa->sqname[oidx]);
	new->wgt[nidx]    = msa->wgt[oidx];
	if (msa->sqacc != NULL)
	  MSASetSeqAccession(new, nidx, msa->sqacc[oidx]);
	if (msa->sqdesc != NULL)
	  MSASetSeqDescription(new, nidx, msa->sqdesc[oidx]);
	if (msa->ss != NULL && msa->ss[oidx] != NULL)
	  {
	    if (new->ss == NULL) new->ss = MallocOrDie(sizeof(char *) * new->nseq);
	    new->ss[nidx] = sre_strdup(msa->ss[oidx], -1);
	  }
	if (msa->sa != NULL && msa->sa[oidx] != NULL)
	  {
	    if (new->sa == NULL) new->sa = MallocOrDie(sizeof(char *) * new->nseq);
	    new->sa[nidx] = sre_strdup(msa->sa[oidx], -1);
	  }
	nidx++;
      }

  new->nseq    = nnew;
  new->alen    = msa->alen; 
  new->flags   = msa->flags;
  new->type    = msa->type;
  new->name    = sre_strdup(msa->name, -1);
  new->desc    = sre_strdup(msa->desc, -1);
  new->acc     = sre_strdup(msa->acc, -1);
  new->au      = sre_strdup(msa->au, -1);
  new->ss_cons = sre_strdup(msa->ss_cons, -1);
  new->sa_cons = sre_strdup(msa->sa_cons, -1);
  new->rf      = sre_strdup(msa->rf, -1);
  for (i = 0; i < MSA_MAXCUTOFFS; i++) {
    new->cutoff[i]        = msa->cutoff[i];
    new->cutoff_is_set[i] = msa->cutoff_is_set[i];
  }
  free(new->sqlen);

  MSAMingap(new);
  *ret_new = new;
  return;
}


/*****************************************************************
 * Retrieval routines
 * 
 * Access to MSA structure data is possible through these routines.
 * I'm not doing this because of object oriented design, though
 * it might work in my favor someday.
 * I'm doing this because lots of MSA data is optional, and
 * checking through the chain of possible NULLs is a pain.
 *****************************************************************/

char *
MSAGetSeqAccession(MSA *msa, int idx)
{
  if (msa->sqacc != NULL && msa->sqacc[idx] != NULL)
    return msa->sqacc[idx];
  else
    return NULL;
}
char *
MSAGetSeqDescription(MSA *msa, int idx)
{
  if (msa->sqdesc != NULL && msa->sqdesc[idx] != NULL)
    return msa->sqdesc[idx];
  else
    return NULL;
}
char *
MSAGetSeqSS(MSA *msa, int idx)
{
  if (msa->ss != NULL && msa->ss[idx] != NULL)
    return msa->ss[idx];
  else
    return NULL;
}
char *
MSAGetSeqSA(MSA *msa, int idx)
{
  if (msa->sa != NULL && msa->sa[idx] != NULL)
    return msa->sa[idx];
  else
    return NULL;
}


/*****************************************************************
 * Information routines
 * 
 * Access information about the MSA.
 *****************************************************************/

/* Function: MSAAverageSequenceLength()
 * Date:     SRE, Sat Apr  6 09:41:34 2002 [St. Louis]
 *
 * Purpose:  Return the average length of the (unaligned) sequences
 *           in the MSA.
 *
 * Args:     msa  - the alignment
 *
 * Returns:  average length
 */
float
MSAAverageSequenceLength(MSA *msa)
{
  int   i;
  float avg;
  
  avg = 0.;
  for (i = 0; i < msa->nseq; i++) 
    avg += (float) DealignedLength(msa->aseq[i]);

  if (msa->nseq == 0) return 0.;
  else                return (avg / msa->nseq);
}