view mayachemtools/docs/modules/html/code/SequenceFileUtil.html @ 9:ab29fa5c8c1f draft default tip

Uploaded
author deepakjadmin
date Thu, 15 Dec 2016 14:18:03 -0500
parents 73ae111cf86f
children
line wrap: on
line source

<html>
<head>
<title>MayaChemTools:Code:SequenceFileUtil.pm</title>
<meta http-equiv="content-type" content="text/html;charset=utf-8">
<link rel="stylesheet" type="text/css" href="../../../css/MayaChemToolsCode.css">
</head>
<body leftmargin="20" rightmargin="20" topmargin="10" bottommargin="10">
<br/>
<center>
<a href="http://www.mayachemtools.org" title="MayaChemTools Home"><img src="../../../images/MayaChemToolsLogo.gif" border="0" alt="MayaChemTools"></a>
</center>
<br/>
<pre>
<a name="package-SequenceFileUtil-"></a>   1 <span class="k">package </span><span class="i">SequenceFileUtil</span><span class="sc">;</span>
   2 <span class="c">#</span>
   3 <span class="c"># $RCSfile: SequenceFileUtil.pm,v $</span>
   4 <span class="c"># $Date: 2015/02/28 20:47:18 $</span>
   5 <span class="c"># $Revision: 1.33 $</span>
   6 <span class="c">#</span>
   7 <span class="c"># Author: Manish Sud &lt;msud@san.rr.com&gt;</span>
   8 <span class="c">#</span>
   9 <span class="c"># Copyright (C) 2015 Manish Sud. All rights reserved.</span>
  10 <span class="c">#</span>
  11 <span class="c"># This file is part of MayaChemTools.</span>
  12 <span class="c">#</span>
  13 <span class="c"># MayaChemTools is free software; you can redistribute it and/or modify it under</span>
  14 <span class="c"># the terms of the GNU Lesser General Public License as published by the Free</span>
  15 <span class="c"># Software Foundation; either version 3 of the License, or (at your option) any</span>
  16 <span class="c"># later version.</span>
  17 <span class="c">#</span>
  18 <span class="c"># MayaChemTools is distributed in the hope that it will be useful, but without</span>
  19 <span class="c"># any warranty; without even the implied warranty of merchantability or fitness</span>
  20 <span class="c"># for a particular purpose.  See the GNU Lesser General Public License for more</span>
  21 <span class="c"># details.</span>
  22 <span class="c">#</span>
  23 <span class="c"># You should have received a copy of the GNU Lesser General Public License</span>
  24 <span class="c"># along with MayaChemTools; if not, see &lt;http://www.gnu.org/licenses/&gt; or</span>
  25 <span class="c"># write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,</span>
  26 <span class="c"># Boston, MA, 02111-1307, USA.</span>
  27 <span class="c">#</span>
  28 
  29 <span class="k">use</span> <span class="w">strict</span><span class="sc">;</span>
  30 <span class="k">use</span> <span class="w">Exporter</span><span class="sc">;</span>
  31 <span class="k">use</span> <span class="w">Text::ParseWords</span><span class="sc">;</span>
  32 <span class="k">use</span> <span class="w">TextUtil</span><span class="sc">;</span>
  33 <span class="k">use</span> <span class="w">FileUtil</span><span class="sc">;</span>
  34 
  35 <span class="k">use</span> <span class="w">vars</span> <span class="q">qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS)</span><span class="sc">;</span>
  36 
  37 <span class="i">@ISA</span> = <span class="q">qw(Exporter)</span><span class="sc">;</span>
  38 <span class="i">@EXPORT</span> = <span class="q">qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile)</span><span class="sc">;</span>
  39 <span class="i">@EXPORT_OK</span> = <span class="q">qw()</span><span class="sc">;</span>
  40 
  41 <span class="i">%EXPORT_TAGS</span> = <span class="s">(</span><span class="w">all</span>  <span class="cm">=&gt;</span> <span class="s">[</span><span class="i">@EXPORT</span><span class="cm">,</span> <span class="i">@EXPORT_OK</span><span class="s">]</span><span class="s">)</span><span class="sc">;</span>
  42 
  43 <span class="c"># Compare lengths of all sequences...</span>
<a name="AreSequenceLengthsIdentical-"></a>  44 <span class="k">sub </span><span class="m">AreSequenceLengthsIdentical</span> <span class="s">{</span>
  45   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
  46   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$FirstDifferentLenID</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="s">)</span><span class="sc">;</span>
  47 
  48   <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
  49   <span class="i">$FirstID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
  50   <span class="i">$FirstDifferentLenID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
  51 
  52   <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
  53     <span class="k">if</span> <span class="s">(</span>!<span class="i">$FirstID</span><span class="s">)</span> <span class="s">{</span>
  54       <span class="i">$FirstID</span> = <span class="i">$ID</span><span class="sc">;</span>
  55       <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span><span class="sc">;</span>
  56       <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
  57     <span class="s">}</span>
  58     <span class="i">$SeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span><span class="sc">;</span>
  59     <span class="k">if</span> <span class="s">(</span><span class="i">$SeqLen</span> != <span class="i">$FirstSeqLen</span><span class="s">)</span> <span class="s">{</span>
  60       <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
  61       <span class="i">$FirstDifferentLenID</span> = <span class="i">$ID</span><span class="sc">;</span>
  62       <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
  63     <span class="s">}</span>
  64   <span class="s">}</span>
  65   <span class="k">return</span> <span class="s">(</span><span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
  66 <span class="s">}</span>
  67 
  68 <span class="c"># Calculate percent identity between two sequences. By default, gaps are ignored.</span>
<a name="CalcuatePercentSequenceIdentity-"></a>  69 <span class="k">sub </span><span class="m">CalcuatePercentSequenceIdentity</span> <span class="s">{</span>
  70   <span class="k">my</span><span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$PercentIdentity</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span><span class="sc">;</span>
  71 
  72   <span class="i">$PercentIdentity</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
  73   <span class="i">$Precision</span> = <span class="n">1</span><span class="sc">;</span>
  74   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
  75   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">4</span><span class="s">)</span> <span class="s">{</span>
  76     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
  77   <span class="s">}</span>
  78   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
  79     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
  80   <span class="s">}</span>
  81   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
  82     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
  83   <span class="s">}</span>
  84   <span class="k">else</span> <span class="s">{</span>
  85     <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
  86   <span class="s">}</span>
  87   <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="i">IsNotEmpty</span><span class="s">(</span><span class="i">$Sequence1</span><span class="s">)</span> &amp;&amp; <span class="i">IsNotEmpty</span><span class="s">(</span><span class="i">$Sequence2</span><span class="s">)</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
  88     <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
  89   <span class="s">}</span>
  90   <span class="k">my</span><span class="s">(</span><span class="i">$Index</span><span class="cm">,</span> <span class="i">$Identity</span><span class="cm">,</span> <span class="i">$Sequence1Len</span><span class="cm">,</span> <span class="i">$Sequence2Len</span><span class="cm">,</span> <span class="i">$Residue1</span><span class="cm">,</span> <span class="i">$Residue2</span><span class="cm">,</span> <span class="i">$ResMatchCount</span><span class="cm">,</span> <span class="i">$ResCount</span><span class="s">)</span><span class="sc">;</span>
  91 
  92   <span class="i">$Sequence1Len</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence1</span><span class="s">)</span><span class="sc">;</span>
  93   <span class="i">$Sequence2Len</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence2</span><span class="s">)</span><span class="sc">;</span>
  94 
  95   <span class="i">$ResMatchCount</span> = <span class="n">0</span><span class="sc">;</span>
  96   <span class="i">$ResCount</span> = <span class="n">0</span><span class="sc">;</span>
  97   <span class="j">RESIDUE:</span> <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$Sequence1Len</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
  98     <span class="i">$Residue1</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
  99     <span class="i">$Residue2</span> = <span class="s">(</span><span class="i">$Index</span> &lt; <span class="i">$Sequence2Len</span><span class="s">)</span> ? <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span> <span class="co">:</span> <span class="q">&#39;&#39;</span><span class="sc">;</span>
 100     <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreGaps</span><span class="s">)</span> <span class="s">{</span>
 101       <span class="k">if</span> <span class="s">(</span><span class="i">$Residue1</span> !~ <span class="q">/[A-Z]/i</span> || <span class="i">$Residue2</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
 102         <span class="k">next</span> <span class="j">RESIDUE</span><span class="sc">;</span>
 103       <span class="s">}</span>
 104     <span class="s">}</span>
 105     <span class="k">if</span> <span class="s">(</span><span class="i">$Residue1</span> <span class="k">eq</span> <span class="i">$Residue2</span><span class="s">)</span> <span class="s">{</span>
 106       <span class="i">$ResMatchCount</span>++<span class="sc">;</span>
 107     <span class="s">}</span>
 108     <span class="i">$ResCount</span>++<span class="sc">;</span>
 109   <span class="s">}</span>
 110   <span class="i">$Identity</span> = <span class="i">$ResCount</span> ? <span class="s">(</span><span class="i">$ResMatchCount</span>/<span class="i">$ResCount</span><span class="s">)</span> <span class="co">:</span> <span class="n">0.0</span><span class="sc">;</span>
 111   <span class="i">$PercentIdentity</span> = <span class="k">sprintf</span><span class="s">(</span><span class="q">&quot;%.${Precision}f&quot;</span><span class="cm">,</span> <span class="s">(</span><span class="i">$Identity</span> * <span class="n">100</span><span class="s">)</span><span class="s">)</span><span class="sc">;</span>
 112 
 113   <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
 114 <span class="s">}</span>
 115 
 116 <span class="c"># Calculate pairwise identity matrix for all the sequences and return a reference</span>
 117 <span class="c"># to a hash with the following keys:</span>
 118 <span class="c">#</span>
 119 <span class="c"># {IDs} - Sequence IDs</span>
 120 <span class="c"># {Count} - Number of IDs</span>
 121 <span class="c"># {PercentIdentity}{$RowID}{$ColID} - Percent identity for a pair of sequences</span>
 122 <span class="c">#</span>
<a name="CalculatePercentSequenceIdentityMatrix-"></a> 123 <span class="k">sub </span><span class="m">CalculatePercentSequenceIdentityMatrix</span> <span class="s">{</span>
 124   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="cm">,</span> <span class="i">$Precision</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$RowID</span><span class="cm">,</span> <span class="i">$ColID</span><span class="cm">,</span> <span class="i">$RowIDSeq</span><span class="cm">,</span> <span class="i">$ColIDSeq</span><span class="cm">,</span> <span class="i">$PercentIdentity</span><span class="cm">,</span> <span class="i">%IdentityMatrixData</span><span class="s">)</span><span class="sc">;</span>
 125 
 126   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
 127   <span class="i">$Precision</span> = <span class="n">1</span><span class="sc">;</span>
 128   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
 129     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 130   <span class="s">}</span>
 131   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
 132     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 133   <span class="s">}</span>
 134   <span class="k">else</span> <span class="s">{</span>
 135     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 136   <span class="s">}</span>
 137 
 138   <span class="i">%IdentityMatrixData</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 139   <span class="i">@</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">IDs</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 140   <span class="i">%</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 141   <span class="i">$IdentityMatrixData</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 142 
 143   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 144     <span class="k">push</span> <span class="i">@</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 145     <span class="i">$IdentityMatrixData</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 146   <span class="s">}</span>
 147   <span class="c"># Initialize and calculate percent identity data values...</span>
 148   <span class="k">for</span> <span class="i">$RowID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 149     <span class="i">%</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}{<span class="i">$RowID</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 150     <span class="i">$RowIDSeq</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$RowID</span>}<span class="sc">;</span>
 151     <span class="k">for</span> <span class="i">$ColID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 152       <span class="i">$IdentityMatrixData</span>{<span class="i">$RowID</span>}{<span class="i">$ColID</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 153       <span class="i">$ColIDSeq</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ColID</span>}<span class="sc">;</span>
 154       <span class="i">$PercentIdentity</span> = <span class="i">CalcuatePercentSequenceIdentity</span><span class="s">(</span><span class="i">$RowIDSeq</span><span class="cm">,</span> <span class="i">$ColIDSeq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span><span class="sc">;</span>
 155       <span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}{<span class="i">$RowID</span>}{<span class="i">$ColID</span>} = <span class="i">$PercentIdentity</span><span class="sc">;</span>
 156     <span class="s">}</span>
 157   <span class="s">}</span>
 158   <span class="k">return</span> \<span class="i">%IdentityMatrixData</span><span class="sc">;</span>
 159 <span class="s">}</span>
 160 
 161 <span class="c"># Retrieve information about shortest sequence...</span>
<a name="GetShortestSequence-"></a> 162 <span class="k">sub </span><span class="m">GetShortestSequence</span> <span class="s">{</span>
 163   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
 164 
 165   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
 166   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
 167     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 168   <span class="s">}</span>
 169   <span class="k">else</span> <span class="s">{</span>
 170     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 171   <span class="s">}</span>
 172 
 173   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> =  <span class="i">_GetShortestOrLongestSequence</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="q">&#39;Shortest&#39;</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
 174   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
 175 <span class="s">}</span>
 176 
 177 <span class="c"># Retrieve information about longest sequence..</span>
<a name="GetLongestSequence-"></a> 178 <span class="k">sub </span><span class="m">GetLongestSequence</span> <span class="s">{</span>
 179   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
 180 
 181   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
 182   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
 183     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 184   <span class="s">}</span>
 185   <span class="k">else</span> <span class="s">{</span>
 186     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 187   <span class="s">}</span>
 188 
 189   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> =  <span class="i">_GetShortestOrLongestSequence</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="q">&#39;Longest&#39;</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
 190   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
 191 <span class="s">}</span>
 192 
 193 <span class="c"># Get sequence length...</span>
<a name="GetSequenceLength-"></a> 194 <span class="k">sub </span><span class="m">GetSequenceLength</span> <span class="s">{</span>
 195   <span class="k">my</span><span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
 196 
 197   <span class="i">$SeqLen</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span> <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
 198   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
 199     <span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 200   <span class="s">}</span>
 201   <span class="k">else</span> <span class="s">{</span>
 202     <span class="s">(</span><span class="i">$Seq</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 203   <span class="s">}</span>
 204   <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreGaps</span><span class="s">)</span> <span class="s">{</span>
 205     <span class="k">my</span><span class="s">(</span><span class="i">$Index</span><span class="cm">,</span> <span class="i">$Residue</span><span class="s">)</span><span class="sc">;</span>
 206     <span class="i">$SeqLen</span> = <span class="n">0</span><span class="sc">;</span>
 207     <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="k">length</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 208       <span class="i">$Residue</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 209       <span class="k">if</span> <span class="s">(</span><span class="i">$Residue</span> =~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
 210         <span class="i">$SeqLen</span>++<span class="sc">;</span>
 211       <span class="s">}</span>
 212     <span class="s">}</span>
 213   <span class="s">}</span>
 214   <span class="k">else</span> <span class="s">{</span>
 215     <span class="i">$SeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span><span class="sc">;</span>
 216   <span class="s">}</span>
 217 
 218   <span class="k">return</span> <span class="i">$SeqLen</span><span class="sc">;</span>
 219 <span class="s">}</span>
 220 
 221 <span class="c"># Is it a gap residue...</span>
<a name="IsGapResidue-"></a> 222 <span class="k">sub </span><span class="m">IsGapResidue</span> <span class="s">{</span>
 223   <span class="k">my</span><span class="s">(</span><span class="i">$Residue</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 224   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
 225 
 226   <span class="i">$Status</span> = <span class="s">(</span><span class="i">$Residue</span> !~ <span class="q">/[A-Z]/i</span> <span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
 227 
 228   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
 229 <span class="s">}</span>
 230 
 231 <span class="c"># Is it a supported sequence file?</span>
 232 <span class="c">#</span>
 233 <span class="c"># Supported sequence formats are:</span>
 234 <span class="c">#</span>
 235 <span class="c"># ALN/ClustalW   .aln</span>
 236 <span class="c"># GCG/MSF         .msf</span>
 237 <span class="c"># PILEUP/MSF     .msf</span>
 238 <span class="c"># Fasta(Pearson) .fasta, .fta</span>
 239 <span class="c"># NBRF/PIR         .pir</span>
 240 <span class="c">#</span>
<a name="IsSupportedSequenceFile-"></a> 241 <span class="k">sub </span><span class="m">IsSupportedSequenceFile</span> <span class="s">{</span>
 242   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 243   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$SequenceFormat</span><span class="s">)</span><span class="sc">;</span>
 244   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;NotSupported&#39;</span><span class="sc">;</span>
 245 
 246   <span class="j">SEQFORMAT:</span> <span class="s">{</span>
 247       <span class="k">if</span> <span class="s">(</span><span class="i">IsClustalWSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;ClustalW&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
 248       <span class="k">if</span> <span class="s">(</span><span class="i">IsPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;Pearson&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
 249       <span class="k">if</span> <span class="s">(</span><span class="i">IsPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;PIR&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
 250       <span class="k">if</span> <span class="s">(</span><span class="i">IsMSFSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;MSF&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
 251       <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;NotSupported&#39;</span><span class="sc">;</span>
 252   <span class="s">}</span>
 253   <span class="k">return</span> <span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$SequenceFormat</span><span class="s">)</span><span class="sc">;</span>
 254 <span class="s">}</span>
 255 
 256 <span class="c"># Return 1 if the first line read via GetTextLine matches ClustalW, Clustal W or Clustal (case-insensitive), otherwise 0; dies if the file cannot be opened.</span>
<a name="IsClustalWSequenceFile-"></a> 257 <span class="k">sub </span><span class="m">IsClustalWSequenceFile</span> <span class="s">{</span>
 258   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 259   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 260 
 261   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
 262 
 263   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;$SequenceFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $SequenceFile: $!\n&quot;</span><span class="sc">;</span>
 264   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*SEQUENCEFILE</span><span class="s">)</span><span class="sc">;</span>
 265   <span class="i">$Status</span> = <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/(ClustalW|Clustal W|Clustal)/i</span> <span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
 266   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
 267 
 268   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
 269 <span class="s">}</span>
 270 
 271 <span class="c"># Return 1 when the first line read from the file starts with &gt; and its fourth character is not a semicolon (Pearson FASTA), otherwise 0.</span>
 272 <span class="c"># Dies if the file cannot be opened.</span>
<a name="IsPearsonFastaSequenceFile-"></a> 273 <span class="k">sub </span><span class="m">IsPearsonFastaSequenceFile</span> <span class="s">{</span>
 274   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFile</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
 275 
 276   <span class="s">(</span><span class="i">$FastaFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 277   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
 278 
 279   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFile: $!\n&quot;</span><span class="sc">;</span>
 280   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
 281 
 282   <span class="c"># First line starts with &gt; and the fourth character is not &#39;;&#39;; otherwise, it&#39;s</span>
 283   <span class="c"># PIR FASTA format.</span>
 284   <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^&gt;/</span><span class="s">)</span> <span class="s">{</span>
 285     <span class="k">my</span><span class="s">(</span><span class="i">$FourthChar</span><span class="s">)</span><span class="sc">;</span>
 286     <span class="i">$FourthChar</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="n">3</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 287     <span class="i">$Status</span> = <span class="s">(</span><span class="i">$FourthChar</span> !~ <span class="q">/\;/</span><span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
 288   <span class="s">}</span>
 289   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
 290 
 291   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
 292 <span class="s">}</span>
 293 
 294 <span class="c"># Return 1 when the first line read from the file starts with &gt; and its fourth character is a semicolon (NBRF/PIR FASTA), otherwise 0.</span>
 295 <span class="c"># Dies if the file cannot be opened.</span>
<a name="IsPIRFastaSequenceFile-"></a> 296 <span class="k">sub </span><span class="m">IsPIRFastaSequenceFile</span> <span class="s">{</span>
 297   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFile</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
 298 
 299   <span class="s">(</span><span class="i">$FastaFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 300   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
 301 
 302   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFile: $!\n&quot;</span><span class="sc">;</span>
 303   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
 304 
 305   <span class="c"># First line starts with &gt; and the fourth character is &#39;;&#39;; otherwise, it&#39;s</span>
 306   <span class="c"># a Pearson FASTA format.</span>
 307   <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^&gt;/</span><span class="s">)</span> <span class="s">{</span>
 308     <span class="k">my</span><span class="s">(</span><span class="i">$FourthChar</span><span class="s">)</span><span class="sc">;</span>
 309     <span class="i">$FourthChar</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="n">3</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 310     <span class="i">$Status</span> = <span class="s">(</span><span class="i">$FourthChar</span> =~ <span class="q">/\;/</span><span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
 311   <span class="s">}</span>
 312   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
 313 
 314   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
 315 <span class="s">}</span>
 316 
 317 <span class="c"># Return 1 when some line of the file identifies it as MSF (see the two tests in the loop), otherwise 0.</span>
 318 <span class="c"># Lines are read until the first match or until GetTextLine returns a false value; dies if the file cannot be opened.</span>
<a name="IsMSFSequenceFile-"></a> 319 <span class="k">sub </span><span class="m">IsMSFSequenceFile</span> <span class="s">{</span>
 320   <span class="k">my</span><span class="s">(</span><span class="i">$MSFFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 321 
 322   <span class="k">open</span> <span class="w">MSFFILE</span><span class="cm">,</span> <span class="q">&quot;$MSFFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $MSFFile: $!\n&quot;</span><span class="sc">;</span>
 323 
 324   <span class="k">my</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
 325 
 326   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
 327   <span class="c"># Find a line that contains the MSF: keyword (any case) and ends with &#39;..&#39;, optionally followed by spaces</span>
 328   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 329     <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 330     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/MSF:/i</span> &amp;&amp; <span class="i">$Line</span> =~ <span class="q">/\.\.[ ]*$/</span><span class="s">)</span> <span class="s">{</span>
 331       <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
 332       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
 333     <span class="s">}</span>
 334     <span class="k">elsif</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i</span><span class="s">)</span> <span class="s">{</span>
 335       <span class="c"># Pileup MSF: a multiple-alignment or PILEUP marker anywhere in the line is accepted as well</span>
 336       <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
 337       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
 338     <span class="s">}</span>
 339   <span class="s">}</span>
 340   <span class="k">close</span> <span class="w">MSFFILE</span><span class="sc">;</span>
 341 
 342   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
 343 <span class="s">}</span>
 344 
 345 <span class="c"># Read a sequence or alignment file: formats are tested in the order Pearson FASTA, PIR FASTA, MSF, ClustalW and the matching reader result is returned; returns undef when no test matches.</span>
<a name="ReadSequenceFile-"></a> 346 <span class="k">sub </span><span class="m">ReadSequenceFile</span> <span class="s">{</span>
 347   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 348 
 349   <span class="k">if</span> <span class="s">(</span><span class="i">IsPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 350     <span class="k">return</span> <span class="i">ReadPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 351   <span class="s">}</span>
 352   <span class="k">elsif</span> <span class="s">(</span><span class="i">IsPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 353     <span class="k">return</span> <span class="i">ReadPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 354   <span class="s">}</span>
 355   <span class="k">elsif</span> <span class="s">(</span><span class="i">IsMSFSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 356     <span class="k">return</span> <span class="i">ReadMSFSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 357   <span class="s">}</span>
 358   <span class="k">elsif</span> <span class="s">(</span><span class="i">IsClustalWSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 359     <span class="k">return</span> <span class="i">ReadClustalWSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 360   <span class="s">}</span>
 361   <span class="k">else</span> <span class="s">{</span>
 362     <span class="k">return</span> <span class="k">undef</span><span class="sc">;</span>
 363   <span class="s">}</span>
 364 <span class="s">}</span>
 365 
 366 <span class="c"># Read a ClustalW file: returns whatever _ReadFileAndSetupSequencesData returns for the ClustalW format.</span>
<a name="ReadClustalWSequenceFile-"></a> 367 <span class="k">sub </span><span class="m">ReadClustalWSequenceFile</span> <span class="s">{</span>
 368   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 369 
 370   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="q">&#39;ClustalW&#39;</span><span class="s">)</span><span class="sc">;</span>
 371 <span class="s">}</span>
 372 
 373 <span class="c"># Read a Pearson FASTA file: returns whatever _ReadFileAndSetupSequencesData returns for the Pearson format.</span>
<a name="ReadPearsonFastaSequenceFile-"></a> 374 <span class="k">sub </span><span class="m">ReadPearsonFastaSequenceFile</span> <span class="s">{</span>
 375   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 376 
 377   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="q">&#39;Pearson&#39;</span><span class="s">)</span><span class="sc">;</span>
 378 <span class="s">}</span>
 379 
 380 <span class="c"># Read a PIR FASTA file: returns whatever _ReadFileAndSetupSequencesData returns for the PIR format.</span>
<a name="ReadPIRFastaSequenceFile-"></a> 381 <span class="k">sub </span><span class="m">ReadPIRFastaSequenceFile</span> <span class="s">{</span>
 382   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 383 
 384   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="q">&#39;PIR&#39;</span><span class="s">)</span><span class="sc">;</span>
 385 <span class="s">}</span>
 386 
 387 
 388 <span class="c"># Read an MSF file: returns whatever _ReadFileAndSetupSequencesData returns for the MSF format.</span>
<a name="ReadMSFSequenceFile-"></a> 389 <span class="k">sub </span><span class="m">ReadMSFSequenceFile</span> <span class="s">{</span>
 390   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 391 
 392   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="q">&#39;MSF&#39;</span><span class="s">)</span><span class="sc">;</span>
 393 <span class="s">}</span>
 394 
 395 <span class="c"># Write out a Pearson FASTA file. Arguments: file name, reference to sequence data with IDs, Description and Sequence keys, and an optional wrap length (80 when only two arguments are passed). Dies if the file cannot be opened.</span>
<a name="WritePearsonFastaSequenceFile-"></a> 396 <span class="k">sub </span><span class="m">WritePearsonFastaSequenceFile</span> <span class="s">{</span>
 397   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$WrappedSequence</span><span class="s">)</span><span class="sc">;</span>
 398 
 399   <span class="i">$MaxLength</span> = <span class="n">80</span><span class="sc">;</span>
 400   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
 401     <span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 402   <span class="s">}</span>
 403   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
 404     <span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 405   <span class="s">}</span>
 406   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;&gt;$SequenceFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Can&#39;t open $SequenceFileName: $!\n&quot;</span><span class="sc">;</span>
 407   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequenceDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 408     <span class="i">$Description</span> = <span class="i">$SequenceDataRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
 409     <span class="i">$Sequence</span> = <span class="i">$SequenceDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
 410     <span class="i">$WrappedSequence</span> = <span class="i">WrapText</span><span class="s">(</span><span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="cm">,</span> <span class="q">&quot;\n&quot;</span><span class="s">)</span><span class="sc">;</span>
 411 
 412     <span class="c"># Description also contains ID, so only the description goes on the header line...</span>
 413     <span class="k">print</span> <span class="i">SEQUENCEFILE</span> <span class="q">&quot;&gt;$Description\n&quot;</span><span class="sc">;</span>
 414     <span class="k">print</span> <span class="i">SEQUENCEFILE</span> <span class="q">&quot;$WrappedSequence\n&quot;</span><span class="sc">;</span>
 415   <span class="s">}</span>
 416   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
 417 <span class="s">}</span>
 418 
 419 <span class="c"># Return (ID, Seq, SeqLen, Description) for the shortest or longest sequence, as selected by $SequenceType (matched against Shortest or Longest, any case); gaps are removed before measuring when $IgnoreGaps is true. On equal lengths the earlier sequence is kept.</span>
<a name="_GetShortestOrLongestSequence-"></a> 420 <span class="k">sub </span><span class="m">_GetShortestOrLongestSequence</span> <span class="s">{</span>
 421   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$SequenceType</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 422   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
 423 
 424   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="s">)</span> = <span class="s">(</span><span class="q">&#39;&#39;</span><span class="cm">,</span> <span class="q">&#39;&#39;</span><span class="cm">,</span> <span class="q">&#39;&#39;</span><span class="s">)</span><span class="sc">;</span>
 425   <span class="i">$FirstID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 426 
 427   <span class="j">ID:</span> <span class="k">for</span> <span class="i">$CurrentID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 428     <span class="i">$CurrentSeq</span> = <span class="i">$IgnoreGaps</span> ? <span class="i">RemoveSequenceGaps</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$CurrentID</span>}<span class="s">)</span> <span class="co">:</span> <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$CurrentID</span>}<span class="sc">;</span>
 429     <span class="i">$CurrentSeqLen</span> = <span class="i">GetSequenceLength</span><span class="s">(</span><span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
 430     <span class="i">$CurrentDescription</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$CurrentID</span>}<span class="sc">;</span>
 431     <span class="k">if</span> <span class="s">(</span>!<span class="i">$FirstID</span><span class="s">)</span> <span class="s">{</span>
 432       <span class="i">$FirstID</span> = <span class="i">$CurrentID</span><span class="sc">;</span> <span class="i">$FirstSeqLen</span> = <span class="i">$CurrentSeqLen</span><span class="sc">;</span> <span class="c"># Seed from the current ID; seeding from the still-empty $ID made the second sequence replace the first without a comparison</span>
 433       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
 434       <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
 435     <span class="s">}</span>
 436     <span class="k">if</span> <span class="s">(</span><span class="i">$CurrentSeqLen</span> != <span class="i">$SeqLen</span><span class="s">)</span> <span class="s">{</span>
 437       <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/Shortest/i</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$CurrentSeqLen</span> &lt; <span class="i">$SeqLen</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 438         <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
 439       <span class="s">}</span>
 440       <span class="k">elsif</span> <span class="s">(</span><span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/Longest/i</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$CurrentSeqLen</span> &gt; <span class="i">$SeqLen</span><span class="s">)</span> <span class="s">)</span> <span class="s">{</span>
 441         <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
 442       <span class="s">}</span>
 443     <span class="s">}</span>
 444   <span class="s">}</span>
 445   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
 446 <span class="s">}</span>
 447 
 448 <span class="c"># Return a new sequence that keeps only the letters A-Z (any case) of the input; every other character is treated as a gap and dropped.</span>
<a name="RemoveSequenceGaps-"></a> 449 <span class="k">sub </span><span class="m">RemoveSequenceGaps</span> <span class="s">{</span>
 450   <span class="k">my</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 451   <span class="k">my</span><span class="s">(</span><span class="i">$SeqWithoutGaps</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="i">$Residue</span><span class="s">)</span><span class="sc">;</span>
 452 
 453   <span class="i">$SeqWithoutGaps</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 454   <span class="i">$SeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span><span class="sc">;</span>
 455   <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$SeqLen</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 456     <span class="i">$Residue</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 457     <span class="k">if</span> <span class="s">(</span><span class="i">$Residue</span> =~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
 458       <span class="i">$SeqWithoutGaps</span> .= <span class="i">$Residue</span><span class="sc">;</span>
 459     <span class="s">}</span>
 460   <span class="s">}</span>
 461 
 462   <span class="k">return</span> <span class="i">$SeqWithoutGaps</span><span class="sc">;</span>
 463 <span class="s">}</span>
 464 
 465 <span class="c"># Using the input alignment data map ref containing the following keys, generate</span>
 466 <span class="c"># a new hash with the same set of keys after residue columns containing only</span>
 467 <span class="c"># gaps have been removed, and return a reference to it:</span>
 468 <span class="c">#</span>
 469 <span class="c"># {IDs} : Array of IDs in order as they appear in file</span>
 470 <span class="c"># {Count}: ID count...</span>
 471 <span class="c"># {Description}{$ID} : Description data...</span>
 472 <span class="c"># {Sequence}{$ID} : Sequence data...</span>
 473 <span class="c">#</span>
<a name="RemoveSequenceAlignmentGapColumns-"></a> 474 <span class="k">sub </span><span class="m">RemoveSequenceAlignmentGapColumns</span> <span class="s">{</span>
 475   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$AlignmentDataMapRef</span><span class="cm">,</span> <span class="i">%NewAlignmentDataMap</span><span class="s">)</span><span class="sc">;</span>
 476 
 477   <span class="s">(</span><span class="i">$AlignmentDataMapRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 478 
 479   <span class="i">%NewAlignmentDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 480   <span class="i">@</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 481   <span class="i">%</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 482   <span class="i">%</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 483   <span class="i">$NewAlignmentDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 484 
 485   <span class="c"># Transfer ID, description and count information; new sequences start out empty...</span>
 486   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 487     <span class="k">push</span> <span class="i">@</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 488     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
 489     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 490     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 491   <span class="s">}</span>
 492 
 493   <span class="c"># Go over residue columns (the column count is the length of the first sequence) and transfer the data...</span>
 494   <span class="k">my</span><span class="s">(</span><span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeq</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="i">$Res</span><span class="cm">,</span> <span class="i">$GapColumn</span><span class="s">)</span><span class="sc">;</span>
 495 
 496   <span class="i">$FirstID</span> = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}[<span class="n">0</span>]<span class="sc">;</span>
 497   <span class="i">$FirstSeq</span> = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$FirstID</span>}<span class="sc">;</span>
 498   <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$FirstSeq</span><span class="s">)</span><span class="sc">;</span>
 499 
 500   <span class="j">RES:</span> <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$FirstSeqLen</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 501     <span class="c"># Is this a gap column? It is unless some sequence has a letter A-Z (any case) at this position.</span>
 502     <span class="i">$GapColumn</span> = <span class="n">1</span><span class="sc">;</span>
 503     <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 504       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 505       <span class="k">if</span> <span class="s">(</span><span class="i">$Res</span> =~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
 506         <span class="i">$GapColumn</span> = <span class="n">0</span><span class="sc">;</span>
 507         <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
 508       <span class="s">}</span>
 509     <span class="s">}</span>
 510     <span class="k">if</span> <span class="s">(</span><span class="i">$GapColumn</span><span class="s">)</span> <span class="s">{</span>
 511       <span class="k">next</span> <span class="j">RES</span><span class="sc">;</span>
 512     <span class="s">}</span>
 513     <span class="c"># Transfer this column: append the character at this position to every new sequence...</span>
 514     <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 515       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 516       <span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Res</span><span class="sc">;</span>
 517     <span class="s">}</span>
 518   <span class="s">}</span>
 519 
 520   <span class="k">return</span> <span class="s">(</span>\<span class="i">%NewAlignmentDataMap</span><span class="s">)</span><span class="sc">;</span>
 521 <span class="s">}</span>
 522 
 523 <span class="c">#</span>
 524 <span class="c"># Read sequences file and return a reference to hash with the following keys:</span>
 525 <span class="c">#</span>
 526 <span class="c"># {IDs} - Array of sequence IDs</span>
 527 <span class="c"># {Count} - Number of sequences</span>
 528 <span class="c"># {Description}{$ID} - Sequence description</span>
 529 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
 530 <span class="c"># {InputFileType} - Sequence file format</span>
 531 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
 532 <span class="c">#</span>
 533 <span class="c"># Note:</span>
 534 <span class="c">#   . Conserved residue annotation either exists in the input sequence alignment file or is set</span>
 535 <span class="c">#     for a file containing the same number of residues for all the sequences using the following</span>
 536 <span class="c">#     notation: * - Residue conserved; &#39; &#39; - Residue not conserved.</span>
 537 <span class="c">#</span>
<a name="_ReadFileAndSetupSequencesData-"></a> 538 <span class="k">sub </span><span class="m">_ReadFileAndSetupSequencesData</span> <span class="s">{</span>
 539   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="i">$SequenceType</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 540   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
 541 
 542   <span class="i">$SequenceDataMapRef</span> = <span class="k">undef</span><span class="sc">;</span>
 543 
 544   <span class="c"># Read sequence file...</span>
 545   <span class="i">$SequenceDataMapRef</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 546   <span class="k">if</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^ClustalW$/i</span><span class="s">)</span> <span class="s">{</span>
 547     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadClustalWFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 548   <span class="s">}</span>
 549   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^Pearson$/i</span><span class="s">)</span> <span class="s">{</span>
 550     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadPearsonFastaFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 551   <span class="s">}</span>
 552   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^PIR$/i</span><span class="s">)</span> <span class="s">{</span>
 553     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadPIRFastaFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 554   <span class="s">}</span>
 555   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^MSF$/i</span><span class="s">)</span> <span class="s">{</span>
 556     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadMSFFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
 557   <span class="s">}</span>
 558   <span class="k">else</span> <span class="s">{</span>
 559     <span class="k">return</span> <span class="i">$SequenceDataMapRef</span><span class="sc">;</span>
 560   <span class="s">}</span>
 561 
 562   <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>}<span class="s">)</span> <span class="s">{</span>
 563     <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
 564   <span class="s">}</span>
 565   <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Count</span>} &gt; <span class="n">1</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">AreSequenceLengthsIdentical</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="s">)</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 566     <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
 567   <span class="s">}</span>
 568 
 569   <span class="c"># Use the first sequence to setup an empty ConservedAnnotation key...</span>
 570   <span class="c"># And mark fully conserved residues...</span>
 571   <span class="c">#</span>
 572   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$FirstSequence</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$Res</span><span class="cm">,</span> <span class="i">$FirstRes</span><span class="cm">,</span> <span class="i">$ResConserved</span><span class="cm">,</span> <span class="i">$Index</span><span class="s">)</span><span class="sc">;</span>
 573   <span class="i">$ID</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">IDs</span>}[<span class="n">0</span>]<span class="sc">;</span>
 574   <span class="i">$FirstSequence</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
 575   <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$FirstSequence</span><span class="s">)</span><span class="sc">;</span>
 576   <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 577   <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$FirstSeqLen</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 578     <span class="i">$FirstRes</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 579     <span class="i">$ResConserved</span> = <span class="n">1</span><span class="sc">;</span>
 580     <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
 581       <span class="i">$Sequence</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
 582       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
 583       <span class="k">if</span> <span class="s">(</span>!<span class="i">$FirstRes</span><span class="s">)</span> <span class="s">{</span>
 584         <span class="i">$FirstRes</span> = <span class="i">$Res</span><span class="sc">;</span>
 585         <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
 586       <span class="s">}</span>
 587       <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$Res</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span> || <span class="s">(</span><span class="i">$Res</span> <span class="k">ne</span> <span class="i">$FirstRes</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 588         <span class="i">$ResConserved</span> = <span class="n">0</span><span class="sc">;</span>
 589         <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
 590       <span class="s">}</span>
 591     <span class="s">}</span>
 592     <span class="k">if</span> <span class="s">(</span><span class="i">$ResConserved</span><span class="s">)</span> <span class="s">{</span>
 593       <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} .= <span class="q">&#39;*&#39;</span><span class="sc">;</span>
 594     <span class="s">}</span>
 595     <span class="k">else</span> <span class="s">{</span>
 596       <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} .= <span class="q">&#39; &#39;</span><span class="sc">;</span>
 597     <span class="s">}</span>
 598   <span class="s">}</span>
 599 
 600   <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
 601 <span class="s">}</span>
 602 
 603 <span class="c"># Read sequence data in ClustalW multiple sequence alignment file and</span>
 604 <span class="c"># return a reference to hash with these keys and values:</span>
 605 <span class="c">#</span>
 606 <span class="c"># {IDs} - Array of sequence IDs</span>
 607 <span class="c"># {Count} - Number of sequences</span>
 608 <span class="c"># {Description}{$ID} - Sequence description</span>
 609 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
 610 <span class="c"># {InputFileType} - Sequence file format</span>
 611 <span class="c"># {ConservedAnnotation} - Conserved residue annotations: space, *, : , .</span>
 612 <span class="c">#</span>
 613 <span class="c">#</span>
 614 <span class="c">#</span>
 615 <span class="c"># And based on the ClustalW/X manual, here is what the ConservedAnnotation symbols mean:</span>
 616 <span class="c">#</span>
 617 <span class="c"># &#39;*&#39; indicates positions which have a single, fully conserved residue</span>
 618 <span class="c">#</span>
 619 <span class="c"># &#39;:&#39; indicates that one of the following &#39;strong&#39; groups is fully conserved: STA</span>
 620 <span class="c">#    NEQK NHQK NDEQ QHRK MILV MILF HY FYW</span>
 621 
 622 <span class="c"># &#39;.&#39; indicates that one of the following &#39;weaker&#39; groups is fully conserved:</span>
 623 <span class="c">#     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY</span>
 624 <span class="c">#</span>
 625 <span class="c"># These are all the positively scoring groups that occur in the Gonnet Pam250</span>
 626 <span class="c"># matrix. The strong and weak groups are defined as strong score &gt;0.5 and weak</span>
 627 <span class="c"># score =&lt;0.5 respectively.</span>
 628 <span class="c">#</span>
<a name="_ReadClustalWFile-"></a> 629 <span class="k">sub </span><span class="m">_ReadClustalWFile</span> <span class="s">{</span>
 630   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 631   <span class="k">my</span><span class="s">(</span><span class="i">%SequencesDataMap</span><span class="s">)</span><span class="sc">;</span>
 632 
 633   <span class="c"># Initialize data...</span>
 634   <span class="i">%SequencesDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 635   <span class="i">@</span>{<span class="i">$SequencesDataMap</span>{<span class="w">IDs</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 636   <span class="i">%</span>{<span class="i">$SequencesDataMap</span>{<span class="w">Description</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 637   <span class="i">%</span>{<span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 638   <span class="i">$SequencesDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 639   <span class="i">$SequencesDataMap</span>{<span class="w">ConservedAnnotation</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 640   <span class="i">$SequencesDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;ClustalW&#39;</span><span class="sc">;</span>
 641 
 642   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;$SequenceFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $SequenceFile: $!\n&quot;</span><span class="sc">;</span>
 643 
 644   <span class="k">my</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$LineLength</span><span class="cm">,</span> <span class="i">$AnnotationStart</span><span class="cm">,</span> <span class="i">$AnnotationLength</span><span class="cm">,</span> <span class="i">$Annotation</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SequenceLength</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$IDIndex</span><span class="s">)</span><span class="sc">;</span>
 645 
 646   <span class="c"># Ignore the header line...</span>
 647   <span class="i">$Line</span> = <span class="q">&lt;SEQUENCEFILE&gt;</span><span class="sc">;</span>
 648 
 649   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*SEQUENCEFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 650     <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^[ \*\:\.]/</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$Line</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 651       <span class="c"># Annotation for sequences: fully conserved, weaker or stronger group conserved.</span>
 652       <span class="c"># Extract it and save...</span>
 653       <span class="i">$LineLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 654       <span class="i">$AnnotationStart</span> = <span class="i">$LineLength</span> - <span class="i">$SequenceLength</span><span class="sc">;</span>
 655       <span class="i">$AnnotationLength</span> = <span class="i">$SequenceLength</span><span class="sc">;</span>
 656       <span class="i">$Annotation</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$AnnotationStart</span><span class="cm">,</span> <span class="i">$AnnotationLength</span><span class="s">)</span><span class="sc">;</span>
 657       <span class="i">$SequencesDataMap</span>{<span class="w">ConservedAnnotation</span>} .= <span class="i">$Annotation</span><span class="sc">;</span>
 658     <span class="s">}</span>
 659     <span class="k">else</span> <span class="s">{</span>
 660       <span class="c"># Extract ID and sequences...</span>
 661       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="s">)</span>= <span class="i">$Line</span> =~ <span class="q">/^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/</span><span class="sc">;</span>
 662       <span class="i">$Sequence</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
 663       <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="i">$ID</span> &amp;&amp; <span class="i">$Sequence</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 664         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 665       <span class="s">}</span>
 666 
 667       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 668         <span class="c"># Append to existing alignment value...</span>
 669         <span class="i">$SequenceLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence</span><span class="s">)</span><span class="sc">;</span>
 670         <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Sequence</span><span class="sc">;</span>
 671       <span class="s">}</span>
 672       <span class="k">else</span> <span class="s">{</span>
 673         <span class="c"># New alignment data...</span>
 674         <span class="i">$SequencesDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 675         <span class="k">push</span> <span class="i">@</span>{<span class="i">$SequencesDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 676         <span class="i">$SequencesDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
 677         <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Sequence</span><span class="sc">;</span>
 678         <span class="i">$SequenceLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence</span><span class="s">)</span><span class="sc">;</span>
 679       <span class="s">}</span>
 680     <span class="s">}</span>
 681   <span class="s">}</span>
 682   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
 683   <span class="k">return</span> <span class="s">(</span>\<span class="i">%SequencesDataMap</span><span class="s">)</span><span class="sc">;</span>
 684 <span class="s">}</span>
 685 
 686 <span class="c"># Read Pearson fasta file and return a reference to hash with these keys:</span>
 687 <span class="c">#</span>
 688 <span class="c"># {IDs} - Array of sequence IDs</span>
 689 <span class="c"># {Count} - Number of sequences</span>
 690 <span class="c"># {Description}{$ID} - Sequence description</span>
 691 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
 692 <span class="c"># {InputFileType} - Sequence file format</span>
 693 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
 694 <span class="c">#</span>
<a name="_ReadPearsonFastaFile-"></a> 695 <span class="k">sub </span><span class="m">_ReadPearsonFastaFile</span> <span class="s">{</span>
 696   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFileName</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$IgnoreID</span><span class="cm">,</span> <span class="i">@LineWords</span><span class="cm">,</span> <span class="i">%FastaDataMap</span><span class="s">)</span><span class="sc">;</span>
 697 
 698   <span class="s">(</span><span class="i">$FastaFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 699 
 700   <span class="i">%FastaDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 701   <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 702   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 703   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 704   <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 705   <span class="i">$FastaDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;Pearson&#39;</span><span class="sc">;</span>
 706 
 707   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFileName: $!\n&quot;</span><span class="sc">;</span>
 708   <span class="i">$ID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 709   <span class="i">$IgnoreID</span> = <span class="n">0</span><span class="sc">;</span>
 710   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 711     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^\&gt;/</span><span class="s">)</span> <span class="s">{</span>
 712       <span class="c"># Start of a new ID...</span>
 713       <span class="i">$Line</span> =~ <span class="q">s/^\&gt;//</span><span class="sc">;</span>
 714       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 715       <span class="i">@LineWords</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 716       <span class="i">@LineWords</span> = <span class="k">split</span> <span class="q">/ /</span><span class="cm">,</span> <span class="i">$Line</span><span class="sc">;</span>
 717 
 718       <span class="i">$ID</span> = <span class="i">$LineWords</span>[<span class="n">0</span>]<span class="sc">;</span>
 719       <span class="i">$ID</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
 720       <span class="i">$Description</span> = <span class="i">$Line</span><span class="sc">;</span>
 721 
 722       <span class="i">$IgnoreID</span> = <span class="n">0</span><span class="sc">;</span>
 723       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 724         <span class="i">$IgnoreID</span> = <span class="n">1</span><span class="sc">;</span>
 725         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
 726         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 727       <span class="s">}</span>
 728       <span class="k">push</span> <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 729       <span class="i">$FastaDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$Description</span><span class="sc">;</span>
 730       <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 731       <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 732     <span class="s">}</span>
 733     <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreID</span><span class="s">)</span> <span class="s">{</span> <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span> <span class="s">}</span>
 734 
 735     <span class="c"># Remove any spaces in the sequence...</span>
 736     <span class="i">$Line</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
 737     <span class="c"># Sequence data for active ID...</span>
 738     <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 739       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Line</span><span class="sc">;</span>
 740     <span class="s">}</span>
 741     <span class="k">else</span> <span class="s">{</span>
 742       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Line</span><span class="sc">;</span>
 743     <span class="s">}</span>
 744   <span class="s">}</span>
 745   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
 746   <span class="k">return</span> \<span class="i">%FastaDataMap</span><span class="sc">;</span>
 747 <span class="s">}</span>
 748 
 749 <span class="c"># Read PIR fasta file and return a reference to hash with these keys:</span>
 750 <span class="c">#</span>
 751 <span class="c"># {IDs} - Array of sequence IDs</span>
 752 <span class="c"># {Count} - Number of sequences</span>
 753 <span class="c"># {Description}{$ID} - Sequence description</span>
 754 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
 755 <span class="c"># {InputFileType} - Sequence file format</span>
 756 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
 757 <span class="c">#</span>
 758 <span class="c"># Format:</span>
 759 <span class="c"># A sequence in PIR format consists of:</span>
 760 <span class="c"># One line starting with</span>
 761 <span class="c">#   a &quot;&gt;&quot; (greater-than) sign, followed by</span>
 762 <span class="c">#   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by</span>
 763 <span class="c">#   a semicolon, followed by</span>
 764 <span class="c">#   the sequence identification code (the database ID-code).</span>
 765 <span class="c"># One line containing a textual description of the sequence.</span>
 766 <span class="c"># One or more lines containing the sequence itself. The end of the</span>
 767 <span class="c"># sequence is marked by a &quot;*&quot; (asterisk) character.</span>
 768 <span class="c">#</span>
 769 <span class="c"># A file in PIR format may comprise more than one sequence.</span>
 770 <span class="c">#</span>
 771 <span class="c"># The PIR format is also often referred to as the NBRF format.</span>
 772 <span class="c">#</span>
 773 <span class="c"># Code SequenceType</span>
 774 <span class="c"># P1    Protein (complete)</span>
 775 <span class="c"># F1    Protein (fragment)</span>
 776 <span class="c"># DL    DNA (linear)</span>
 777 <span class="c"># DC    DNA (circular)</span>
 778 <span class="c"># RL    RNA (linear)</span>
 779 <span class="c"># RC   RNA (circular)</span>
 780 <span class="c"># N3    tRNA</span>
 781 <span class="c"># N1    Other functional RNA</span>
 782 <span class="c">#</span>
 783 
<a name="_ReadPIRFastaFile-"></a> 784 <span class="k">sub </span><span class="m">_ReadPIRFastaFile</span> <span class="s">{</span>
 785   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFileName</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$SequenceTypeCode</span><span class="cm">,</span> <span class="i">$ReadingSequenceData</span><span class="cm">,</span> <span class="i">%FastaDataMap</span><span class="s">)</span><span class="sc">;</span>
 786 
 787   <span class="s">(</span><span class="i">$FastaFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 788 
 789   <span class="i">%FastaDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 790   <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 791   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 792   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 793   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">SequenceTypeCode</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 794   <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 795   <span class="i">$FastaDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;PIR&#39;</span><span class="sc">;</span>
 796 
 797   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFileName: $!\n&quot;</span><span class="sc">;</span>
 798   <span class="i">$ID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
 799   <span class="i">$ReadingSequenceData</span> = <span class="n">0</span><span class="sc">;</span>
 800   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 801     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^\&gt;/</span><span class="s">)</span> <span class="s">{</span>
 802       <span class="c"># Start of a new ID...</span>
 803       <span class="i">$Line</span> =~ <span class="q">s/^\&gt;//</span><span class="sc">;</span>
 804       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 805       <span class="s">(</span><span class="i">$SequenceTypeCode</span><span class="cm">,</span> <span class="i">$ID</span><span class="s">)</span> = <span class="q">/^\&gt;(.*?)\;(.*?)$/</span><span class="sc">;</span>
 806 
 807       <span class="c"># Use next line to retrieve sequence description...</span>
 808       <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
 809       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
 810       <span class="i">$Description</span> = <span class="i">$Line</span><span class="sc">;</span>
 811 
 812       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 813         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
 814         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 815       <span class="s">}</span>
 816       <span class="i">$ReadingSequenceData</span> = <span class="n">1</span><span class="sc">;</span>
 817       <span class="k">push</span> <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 818       <span class="i">$FastaDataMap</span>{<span class="w">SequenceTypeCode</span>}{<span class="i">$ID</span>} = <span class="i">$SequenceTypeCode</span><span class="sc">;</span>
 819       <span class="i">$FastaDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$Description</span><span class="sc">;</span>
 820       <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 821       <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 822     <span class="s">}</span>
 823     <span class="k">if</span> <span class="s">(</span>!<span class="i">$ReadingSequenceData</span><span class="s">)</span> <span class="s">{</span> <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span> <span class="s">}</span>
 824 
 825     <span class="c"># Remove any spaces in the sequence...</span>
 826     <span class="i">$Line</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
 827     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/[\*]$/</span><span class="s">)</span> <span class="s">{</span>
 828       <span class="c"># End of sequence...</span>
 829       <span class="i">$ReadingSequenceData</span> = <span class="n">0</span><span class="sc">;</span>
 830       <span class="i">$Line</span> =~ <span class="q">s/[\*]$//</span><span class="sc">;</span>
 831     <span class="s">}</span>
 832     <span class="c"># Sequence data for active ID...</span>
 833     <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 834       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Line</span><span class="sc">;</span>
 835     <span class="s">}</span>
 836     <span class="k">else</span> <span class="s">{</span>
 837       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Line</span><span class="sc">;</span>
 838     <span class="s">}</span>
 839   <span class="s">}</span>
 840   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
 841   <span class="k">return</span> \<span class="i">%FastaDataMap</span><span class="sc">;</span>
 842 <span class="s">}</span>
 843 
 844 <span class="c"># Read MSF file and return a reference to hash with these keys:</span>
 845 <span class="c">#</span>
 846 <span class="c"># {IDs} : Array of IDs in order as they appear in file</span>
 847 <span class="c"># {Count}: ID count; {InputFileType}: always &#39;MSF&#39;...</span>
 848 <span class="c"># {Description}{$ID} : Description data (set to the ID itself)...</span>
 849 <span class="c"># {Sequence}{$ID} : Sequence data with spaces removed and gap characters kept...</span>
 850 <span class="c"># Dies if the file can&#39;t be opened; duplicate or malformed Name: lines are skipped.</span>
<a name="_ReadMSFFile-"></a> 851 <span class="k">sub </span><span class="m">_ReadMSFFile</span> <span class="s">{</span>
 852   <span class="k">my</span><span class="s">(</span><span class="i">$MSFFileName</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">%MSFDataMap</span><span class="s">)</span><span class="sc">;</span>
 853 
 854   <span class="s">(</span><span class="i">$MSFFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
 855 
 856   <span class="i">%MSFDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 857   <span class="i">@</span>{<span class="i">$MSFDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 858   <span class="i">%</span>{<span class="i">$MSFDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 859   <span class="i">%</span>{<span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 860   <span class="i">$MSFDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
 861   <span class="i">$MSFDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;MSF&#39;</span><span class="sc">;</span>
 862 
 863   <span class="k">open</span> <span class="w">MSFFILE</span><span class="cm">,</span> <span class="q">&quot;$MSFFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $MSFFileName: $!\n&quot;</span><span class="sc">;</span>
 864 
 865   <span class="c"># Collect IDs from the Name: fields in the header...</span>
 866   <span class="c">#</span>
 867   <span class="c"># &#39;//&#39; after the name fields indicates end of header list and start of sequence data.</span>
 868   <span class="c">#</span>
 869   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Len</span><span class="cm">,</span> <span class="i">$Check</span><span class="cm">,</span> <span class="i">$Weight</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$NameFieldsFound</span><span class="cm">,</span> <span class="i">%MSFIDsMap</span><span class="s">)</span><span class="sc">;</span>
 870   <span class="i">%MSFIDsMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
 871   <span class="i">$NameFieldsFound</span> = <span class="n">0</span><span class="sc">;</span>
 872   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 873     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/Name:/</span><span class="s">)</span> <span class="s">{</span>
 874       <span class="i">$NameFieldsFound</span>++<span class="sc">;</span>
 875       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Len</span><span class="cm">,</span> <span class="i">$Check</span><span class="cm">,</span> <span class="i">$Weight</span><span class="s">)</span> = <span class="i">$Line</span> =~ <span class="q">/^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/</span><span class="sc">;</span>
 876       <span class="k">if</span> <span class="s">(</span>!<span class="k">defined</span> <span class="i">$ID</span><span class="s">)</span> <span class="s">{</span>
 877         <span class="k">warn</span> <span class="q">&quot;Warning: Ignoring malformed Name: line in MSF file: $Line\n&quot;</span><span class="sc">;</span>
 878         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 879       <span class="s">}</span>
 880       <span class="k">if</span> <span class="s">(</span><span class="i">$ID</span> =~ <span class="q">/ /</span><span class="s">)</span> <span class="s">{</span>
 881         <span class="s">(</span><span class="i">$ID</span><span class="s">)</span> = <span class="i">$ID</span> =~ <span class="q">/^(.*?)[ ]+/</span><span class="sc">;</span>
 882       <span class="s">}</span>
 883       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$MSFIDsMap</span>{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
 884         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
 885         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
 886       <span class="s">}</span>
 887       <span class="i">$MSFIDsMap</span>{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
 888       <span class="k">push</span> <span class="i">@</span>{<span class="i">$MSFDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
 889       <span class="i">$MSFDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
 890       <span class="i">$MSFDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
 891     <span class="s">}</span>
 892     <span class="k">elsif</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/\/\//</span> &amp;&amp; <span class="i">$NameFieldsFound</span><span class="s">)</span> <span class="s">{</span>
 893       <span class="c"># End of header list; test $Line explicitly instead of relying on $_...</span>
 894       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
 895     <span class="s">}</span>
 896   <span class="s">}</span>
 897   <span class="c"># Collect all sequences: first field is the ID and the rest is sequence data...</span>
 898   <span class="c">#</span>
 899   <span class="k">my</span><span class="s">(</span><span class="i">$FirstField</span><span class="cm">,</span> <span class="i">$SecondField</span><span class="s">)</span><span class="sc">;</span>
 900   <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
 901     <span class="s">(</span><span class="i">$FirstField</span><span class="cm">,</span> <span class="i">$SecondField</span><span class="s">)</span> = <span class="i">$Line</span> =~ <span class="q">/^[ ]*(.*?)[ ]+(.*?)$/</span><span class="sc">;</span>
 902     <span class="k">if</span> <span class="s">(</span><span class="k">defined</span> <span class="i">$FirstField</span> &amp;&amp; <span class="k">exists</span> <span class="i">$MSFIDsMap</span>{<span class="i">$FirstField</span>}<span class="s">)</span> <span class="s">{</span>
 903       <span class="c"># It&#39;s ID and sequence data...</span>
 904       <span class="i">$ID</span> = <span class="i">$FirstField</span><span class="sc">;</span>
 905       <span class="i">$Sequence</span> = <span class="i">$SecondField</span><span class="sc">;</span>
 906       <span class="c"># Take out spaces and leave the gap characters...</span>
 907       <span class="i">$Sequence</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
 908       <span class="c"># Append, as lines for the same ID may occur more than once...</span>
 909       <span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Sequence</span><span class="sc">;</span>
 910     <span class="s">}</span>
 911   <span class="s">}</span>
 912 
 913   <span class="k">close</span> <span class="w">MSFFILE</span><span class="sc">;</span>
 914   <span class="k">return</span> \<span class="i">%MSFDataMap</span><span class="sc">;</span>
 915 <span class="s">}</span>
 916 
 917 
<a name="EOF-"></a></pre>
<p>&nbsp;</p>
<br />
<center>
<img src="../../../images/h2o2.png">
</center>
</body>
</html>