diff mayachemtools/docs/modules/html/code/SequenceFileUtil.html @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mayachemtools/docs/modules/html/code/SequenceFileUtil.html	Wed Jan 20 11:55:01 2016 -0500
@@ -0,0 +1,938 @@
+<html>
+<head>
+<title>MayaChemTools:Code:SequenceFileUtil.pm</title>
+<meta http-equiv="content-type" content="text/html;charset=utf-8">
+<link rel="stylesheet" type="text/css" href="../../../css/MayaChemToolsCode.css">
+</head>
+<body leftmargin="20" rightmargin="20" topmargin="10" bottommargin="10">
+<br/>
+<center>
+<a href="http://www.mayachemtools.org" title="MayaChemTools Home"><img src="../../../images/MayaChemToolsLogo.gif" border="0" alt="MayaChemTools"></a>
+</center>
+<br/>
+<pre>
+<a name="package-SequenceFileUtil-"></a>   1 <span class="k">package </span><span class="i">SequenceFileUtil</span><span class="sc">;</span>
+   2 <span class="c">#</span>
+   3 <span class="c"># $RCSfile: SequenceFileUtil.pm,v $</span>
+   4 <span class="c"># $Date: 2015/02/28 20:47:18 $</span>
+   5 <span class="c"># $Revision: 1.33 $</span>
+   6 <span class="c">#</span>
+   7 <span class="c"># Author: Manish Sud &lt;msud@san.rr.com&gt;</span>
+   8 <span class="c">#</span>
+   9 <span class="c"># Copyright (C) 2015 Manish Sud. All rights reserved.</span>
+  10 <span class="c">#</span>
+  11 <span class="c"># This file is part of MayaChemTools.</span>
+  12 <span class="c">#</span>
+  13 <span class="c"># MayaChemTools is free software; you can redistribute it and/or modify it under</span>
+  14 <span class="c"># the terms of the GNU Lesser General Public License as published by the Free</span>
+  15 <span class="c"># Software Foundation; either version 3 of the License, or (at your option) any</span>
+  16 <span class="c"># later version.</span>
+  17 <span class="c">#</span>
+  18 <span class="c"># MayaChemTools is distributed in the hope that it will be useful, but without</span>
+  19 <span class="c"># any warranty; without even the implied warranty of merchantability or fitness</span>
+  20 <span class="c"># for a particular purpose.  See the GNU Lesser General Public License for more</span>
+  21 <span class="c"># details.</span>
+  22 <span class="c">#</span>
+  23 <span class="c"># You should have received a copy of the GNU Lesser General Public License</span>
+  24 <span class="c"># along with MayaChemTools; if not, see &lt;http://www.gnu.org/licenses/&gt; or</span>
+  25 <span class="c"># write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,</span>
+  26 <span class="c"># Boston, MA, 02111-1307, USA.</span>
+  27 <span class="c">#</span>
+  28 
+  29 <span class="k">use</span> <span class="w">strict</span><span class="sc">;</span>
+  30 <span class="k">use</span> <span class="w">Exporter</span><span class="sc">;</span>
+  31 <span class="k">use</span> <span class="w">Text::ParseWords</span><span class="sc">;</span>
+  32 <span class="k">use</span> <span class="w">TextUtil</span><span class="sc">;</span>
+  33 <span class="k">use</span> <span class="w">FileUtil</span><span class="sc">;</span>
+  34 
+  35 <span class="k">use</span> <span class="w">vars</span> <span class="q">qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS)</span><span class="sc">;</span>
+  36 
+  37 <span class="i">@ISA</span> = <span class="q">qw(Exporter)</span><span class="sc">;</span>
+  38 <span class="i">@EXPORT</span> = <span class="q">qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile)</span><span class="sc">;</span>
+  39 <span class="i">@EXPORT_OK</span> = <span class="q">qw()</span><span class="sc">;</span>
+  40 
+  41 <span class="i">%EXPORT_TAGS</span> = <span class="s">(</span><span class="w">all</span>  <span class="cm">=&gt;</span> <span class="s">[</span><span class="i">@EXPORT</span><span class="cm">,</span> <span class="i">@EXPORT_OK</span><span class="s">]</span><span class="s">)</span><span class="sc">;</span>
+  42 
+  43 <span class="c"># Compare lengths of all sequences...</span>
+<a name="AreSequenceLengthsIdentical-"></a>  44 <span class="k">sub </span><span class="m">AreSequenceLengthsIdentical</span> <span class="s">{</span>
+  45   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+  46   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$FirstDifferentLenID</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="s">)</span><span class="sc">;</span>
+  47 
+  48   <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
+  49   <span class="i">$FirstID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+  50   <span class="i">$FirstDifferentLenID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+  51 
+  52   <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+  53     <span class="k">if</span> <span class="s">(</span>!<span class="i">$FirstID</span><span class="s">)</span> <span class="s">{</span>
+  54       <span class="i">$FirstID</span> = <span class="i">$ID</span><span class="sc">;</span>
+  55       <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span><span class="sc">;</span>
+  56       <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
+  57     <span class="s">}</span>
+  58     <span class="i">$SeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span><span class="sc">;</span>
+  59     <span class="k">if</span> <span class="s">(</span><span class="i">$SeqLen</span> != <span class="i">$FirstSeqLen</span><span class="s">)</span> <span class="s">{</span>
+  60       <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
+  61       <span class="i">$FirstDifferentLenID</span> = <span class="i">$ID</span><span class="sc">;</span>
+  62       <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
+  63     <span class="s">}</span>
+  64   <span class="s">}</span>
+  65   <span class="k">return</span> <span class="s">(</span><span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
+  66 <span class="s">}</span>
+  67 
+  68 <span class="c"># Calculate percent identity between two sequences. By default, gaps are ignored.</span>
+<a name="CalcuatePercentSequenceIdentity-"></a>  69 <span class="k">sub </span><span class="m">CalcuatePercentSequenceIdentity</span> <span class="s">{</span>
+  70   <span class="k">my</span><span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$PercentIdentity</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span><span class="sc">;</span>
+  71 
+  72   <span class="i">$PercentIdentity</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+  73   <span class="i">$Precision</span> = <span class="n">1</span><span class="sc">;</span>
+  74   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
+  75   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">4</span><span class="s">)</span> <span class="s">{</span>
+  76     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+  77   <span class="s">}</span>
+  78   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
+  79     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+  80   <span class="s">}</span>
+  81   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+  82     <span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Sequence2</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+  83   <span class="s">}</span>
+  84   <span class="k">else</span> <span class="s">{</span>
+  85     <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
+  86   <span class="s">}</span>
+  87   <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="i">IsNotEmpty</span><span class="s">(</span><span class="i">$Sequence1</span><span class="s">)</span> &amp;&amp; <span class="i">IsNotEmpty</span><span class="s">(</span><span class="i">$Sequence2</span><span class="s">)</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+  88     <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
+  89   <span class="s">}</span>
+  90   <span class="k">my</span><span class="s">(</span><span class="i">$Index</span><span class="cm">,</span> <span class="i">$Identity</span><span class="cm">,</span> <span class="i">$Sequence1Len</span><span class="cm">,</span> <span class="i">$Sequence2Len</span><span class="cm">,</span> <span class="i">$Residue1</span><span class="cm">,</span> <span class="i">$Residue2</span><span class="cm">,</span> <span class="i">$ResMatchCount</span><span class="cm">,</span> <span class="i">$ResCount</span><span class="s">)</span><span class="sc">;</span>
+  91 
+  92   <span class="i">$Sequence1Len</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence1</span><span class="s">)</span><span class="sc">;</span>
+  93   <span class="i">$Sequence2Len</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence2</span><span class="s">)</span><span class="sc">;</span>
+  94 
+  95   <span class="i">$ResMatchCount</span> = <span class="n">0</span><span class="sc">;</span>
+  96   <span class="i">$ResCount</span> = <span class="n">0</span><span class="sc">;</span>
+  97   <span class="j">RESIDUE:</span> <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$Sequence1Len</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+  98     <span class="i">$Residue1</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence1</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+  99     <span class="i">$Residue2</span> = <span class="s">(</span><span class="i">$Index</span> &lt; <span class="i">$Sequence2Len</span><span class="s">)</span> ? <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence2</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span> <span class="co">:</span> <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 100     <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreGaps</span><span class="s">)</span> <span class="s">{</span>
+ 101       <span class="k">if</span> <span class="s">(</span><span class="i">$Residue1</span> !~ <span class="q">/[A-Z]/i</span> || <span class="i">$Residue2</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
+ 102         <span class="k">next</span> <span class="j">RESIDUE</span><span class="sc">;</span>
+ 103       <span class="s">}</span>
+ 104     <span class="s">}</span>
+ 105     <span class="k">if</span> <span class="s">(</span><span class="i">$Residue1</span> <span class="k">eq</span> <span class="i">$Residue2</span><span class="s">)</span> <span class="s">{</span>
+ 106       <span class="i">$ResMatchCount</span>++<span class="sc">;</span>
+ 107     <span class="s">}</span>
+ 108     <span class="i">$ResCount</span>++<span class="sc">;</span>
+ 109   <span class="s">}</span>
+ 110   <span class="i">$Identity</span> = <span class="i">$ResCount</span> ? <span class="s">(</span><span class="i">$ResMatchCount</span>/<span class="i">$ResCount</span><span class="s">)</span> <span class="co">:</span> <span class="n">0.0</span><span class="sc">;</span>
+ 111   <span class="i">$PercentIdentity</span> = <span class="k">sprintf</span><span class="s">(</span><span class="q">&quot;%.${Precision}f&quot;</span><span class="cm">,</span> <span class="s">(</span><span class="i">$Identity</span> * <span class="n">100</span><span class="s">)</span><span class="s">)</span><span class="sc">;</span>
+ 112 
+ 113   <span class="k">return</span> <span class="i">$PercentIdentity</span><span class="sc">;</span>
+ 114 <span class="s">}</span>
+ 115 
+ 116 <span class="c"># Calculate pairwise identity matrix for all the sequences and return a reference</span>
+ 117 <span class="c"># to a hash with the following keys:</span>
+ 118 <span class="c">#</span>
+ 119 <span class="c"># {IDs} - Sequence IDs</span>
+ 120 <span class="c"># {Count} - Number of IDs</span>
+ 121 <span class="c"># {PercentIdentity}{$RowID}{$ColID} - Percent identity for a pair of sequences</span>
+ 122 <span class="c">#</span>
+<a name="CalculatePercentSequenceIdentityMatrix-"></a> 123 <span class="k">sub </span><span class="m">CalculatePercentSequenceIdentityMatrix</span> <span class="s">{</span>
+ 124   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="cm">,</span> <span class="i">$Precision</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$RowID</span><span class="cm">,</span> <span class="i">$ColID</span><span class="cm">,</span> <span class="i">$RowIDSeq</span><span class="cm">,</span> <span class="i">$ColIDSeq</span><span class="cm">,</span> <span class="i">$PercentIdentity</span><span class="cm">,</span> <span class="i">%IdentityMatrixData</span><span class="s">)</span><span class="sc">;</span>
+ 125 
+ 126   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
+ 127   <span class="i">$Precision</span> = <span class="n">1</span><span class="sc">;</span>
+ 128   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
+ 129     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 130   <span class="s">}</span>
+ 131   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+ 132     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 133   <span class="s">}</span>
+ 134   <span class="k">else</span> <span class="s">{</span>
+ 135     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 136   <span class="s">}</span>
+ 137 
+ 138   <span class="i">%IdentityMatrixData</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 139   <span class="i">@</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">IDs</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 140   <span class="i">%</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 141   <span class="i">$IdentityMatrixData</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 142 
+ 143   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 144     <span class="k">push</span> <span class="i">@</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 145     <span class="i">$IdentityMatrixData</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 146   <span class="s">}</span>
+ 147   <span class="c"># Initialize and calculate percent identity data values...</span>
+ 148   <span class="k">for</span> <span class="i">$RowID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 149     <span class="i">%</span>{<span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}{<span class="i">$RowID</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 150     <span class="i">$RowIDSeq</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$RowID</span>}<span class="sc">;</span>
+ 151     <span class="k">for</span> <span class="i">$ColID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 152       <span class="i">$IdentityMatrixData</span>{<span class="i">$RowID</span>}{<span class="i">$ColID</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 153       <span class="i">$ColIDSeq</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ColID</span>}<span class="sc">;</span>
+ 154       <span class="i">$PercentIdentity</span> = <span class="i">CalcuatePercentSequenceIdentity</span><span class="s">(</span><span class="i">$RowIDSeq</span><span class="cm">,</span> <span class="i">$ColIDSeq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$Precision</span><span class="s">)</span><span class="sc">;</span>
+ 155       <span class="i">$IdentityMatrixData</span>{<span class="w">PercentIdentity</span>}{<span class="i">$RowID</span>}{<span class="i">$ColID</span>} = <span class="i">$PercentIdentity</span><span class="sc">;</span>
+ 156     <span class="s">}</span>
+ 157   <span class="s">}</span>
+ 158   <span class="k">return</span> \<span class="i">%IdentityMatrixData</span><span class="sc">;</span>
+ 159 <span class="s">}</span>
+ 160 
+ 161 <span class="c"># Retrieve information about shortest sequence...</span>
+<a name="GetShortestSequence-"></a> 162 <span class="k">sub </span><span class="m">GetShortestSequence</span> <span class="s">{</span>
+ 163   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
+ 164 
+ 165   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
+ 166   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+ 167     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 168   <span class="s">}</span>
+ 169   <span class="k">else</span> <span class="s">{</span>
+ 170     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 171   <span class="s">}</span>
+ 172 
+ 173   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> =  <span class="i">_GetShortestOrLongestSequence</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="q">&#39;Shortest&#39;</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
+ 174   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
+ 175 <span class="s">}</span>
+ 176 
+ 177 <span class="c"># Retrieve information about longest sequence...</span>
+<a name="GetLongestSequence-"></a> 178 <span class="k">sub </span><span class="m">GetLongestSequence</span> <span class="s">{</span>
+ 179   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
+ 180 
+ 181   <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
+ 182   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+ 183     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 184   <span class="s">}</span>
+ 185   <span class="k">else</span> <span class="s">{</span>
+ 186     <span class="s">(</span><span class="i">$SequencesDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 187   <span class="s">}</span>
+ 188 
+ 189   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> =  <span class="i">_GetShortestOrLongestSequence</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="q">&#39;Longest&#39;</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
+ 190   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
+ 191 <span class="s">}</span>
+ 192 
+ 193 <span class="c"># Get sequence length...</span>
+<a name="GetSequenceLength-"></a> 194 <span class="k">sub </span><span class="m">GetSequenceLength</span> <span class="s">{</span>
+ 195   <span class="k">my</span><span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
+ 196 
+ 197   <span class="i">$SeqLen</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span> <span class="i">$IgnoreGaps</span> = <span class="n">1</span><span class="sc">;</span>
+ 198   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+ 199     <span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 200   <span class="s">}</span>
+ 201   <span class="k">else</span> <span class="s">{</span>
+ 202     <span class="s">(</span><span class="i">$Seq</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 203   <span class="s">}</span>
+ 204   <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreGaps</span><span class="s">)</span> <span class="s">{</span>
+ 205     <span class="k">my</span><span class="s">(</span><span class="i">$Index</span><span class="cm">,</span> <span class="i">$Residue</span><span class="s">)</span><span class="sc">;</span>
+ 206     <span class="i">$SeqLen</span> = <span class="n">0</span><span class="sc">;</span>
+ 207     <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="k">length</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 208       <span class="i">$Residue</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Seq</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 209       <span class="k">if</span> <span class="s">(</span><span class="i">$Residue</span> =~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
+ 210         <span class="i">$SeqLen</span>++<span class="sc">;</span>
+ 211       <span class="s">}</span>
+ 212     <span class="s">}</span>
+ 213   <span class="s">}</span>
+ 214   <span class="k">else</span> <span class="s">{</span>
+ 215     <span class="i">$SeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Seq</span><span class="s">)</span><span class="sc">;</span>
+ 216   <span class="s">}</span>
+ 217 
+ 218   <span class="k">return</span> <span class="i">$SeqLen</span><span class="sc">;</span>
+ 219 <span class="s">}</span>
+ 220 
+ 221 <span class="c"># Is it a gap residue...</span>
+<a name="IsGapResidue-"></a> 222 <span class="k">sub </span><span class="m">IsGapResidue</span> <span class="s">{</span>
+ 223   <span class="k">my</span><span class="s">(</span><span class="i">$Residue</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 224   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
+ 225 
+ 226   <span class="i">$Status</span> = <span class="s">(</span><span class="i">$Residue</span> !~ <span class="q">/[A-Z]/i</span> <span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
+ 227 
+ 228   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
+ 229 <span class="s">}</span>
+ 230 
+ 231 <span class="c"># Is it a supported sequence file?</span>
+ 232 <span class="c">#</span>
+ 233 <span class="c"># Supported sequence formats are:</span>
+ 234 <span class="c">#</span>
+ 235 <span class="c"># ALN/ClustalW   .aln</span>
+ 236 <span class="c"># GCG/MSF         .msf</span>
+ 237 <span class="c"># PILEUP/MSF     .msf</span>
+ 238 <span class="c"># Fasta(Pearson) .fasta, .fta</span>
+ 239 <span class="c"># NBRF/PIR         .pir</span>
+ 240 <span class="c">#</span>
+<a name="IsSupportedSequenceFile-"></a> 241 <span class="k">sub </span><span class="m">IsSupportedSequenceFile</span> <span class="s">{</span>
+ 242   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 243   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$SequenceFormat</span><span class="s">)</span><span class="sc">;</span>
+ 244   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;NotSupported&#39;</span><span class="sc">;</span>
+ 245 
+ 246   <span class="j">SEQFORMAT:</span> <span class="s">{</span>
+ 247       <span class="k">if</span> <span class="s">(</span><span class="i">IsClustalWSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;ClustalW&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
+ 248       <span class="k">if</span> <span class="s">(</span><span class="i">IsPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;Pearson&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
+ 249       <span class="k">if</span> <span class="s">(</span><span class="i">IsPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;PIR&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
+ 250       <span class="k">if</span> <span class="s">(</span><span class="i">IsMSFSequenceFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span><span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;MSF&#39;</span><span class="sc">;</span> <span class="k">last</span> <span class="j">SEQFORMAT</span><span class="s">}</span>
+ 251       <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span> <span class="i">$SequenceFormat</span> = <span class="q">&#39;NotSupported&#39;</span><span class="sc">;</span>
+ 252   <span class="s">}</span>
+ 253   <span class="k">return</span> <span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$SequenceFormat</span><span class="s">)</span><span class="sc">;</span>
+ 254 <span class="s">}</span>
+ 255 
+ 256 <span class="c"># Return 1 when the first line of the file mentions Clustal (ClustalW multiple sequence alignment file), 0 otherwise; dies if the file cannot be opened.</span>
+<a name="IsClustalWSequenceFile-"></a> 257 <span class="k">sub </span><span class="m">IsClustalWSequenceFile</span> <span class="s">{</span>
+ 258   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 259   <span class="k">my</span><span class="s">(</span><span class="i">$Status</span><span class="cm">,</span> <span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 260 
+ 261   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
+ 262   <span class="c"># Three-argument open keeps the read mode apart from the file name, so the name is never parsed for mode or pipe characters. The name stays quoted so an undefined name still fails the open.</span>
+ 263   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;&lt;&quot;</span><span class="cm">,</span> <span class="q">&quot;$SequenceFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $SequenceFile: $!\n&quot;</span><span class="sc">;</span>
+ 264   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*SEQUENCEFILE</span><span class="s">)</span><span class="sc">;</span>
+ 265   <span class="i">$Status</span> = <span class="s">(</span><span class="k">defined</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span> &amp;&amp; <span class="i">$Line</span> =~ <span class="q">/(ClustalW|Clustal W|Clustal)/i</span> <span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span> <span class="c"># nothing to match when no line was returned</span>
+ 266   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
+ 267 
+ 268   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
+ 269 <span class="s">}</span>
+ 270 
+ 271 <span class="c"># Return 1 when the file looks like Pearson FASTA, 0 otherwise; dies if the file cannot be opened.</span>
+ 272 <span class="c"># Only the first line of the file is examined.</span>
+<a name="IsPearsonFastaSequenceFile-"></a> 273 <span class="k">sub </span><span class="m">IsPearsonFastaSequenceFile</span> <span class="s">{</span>
+ 274   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFile</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
+ 275 
+ 276   <span class="s">(</span><span class="i">$FastaFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 277   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
+ 278   <span class="c"># Three-argument open keeps the read mode apart from the file name, so the name is never parsed for mode or pipe characters. The name stays quoted so an undefined name still fails the open.</span>
+ 279   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;&lt;&quot;</span><span class="cm">,</span> <span class="q">&quot;$FastaFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFile: $!\n&quot;</span><span class="sc">;</span>
+ 280   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
+ 281 
+ 282   <span class="c"># First line starts with &gt; and the fourth character is not &#39;;&#39;; otherwise, it&#39;s</span>
+ 283   <span class="c"># PIR FASTA format.</span>
+ 284   <span class="k">if</span> <span class="s">(</span><span class="k">defined</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span> &amp;&amp; <span class="i">$Line</span> =~ <span class="q">/^&gt;/</span><span class="s">)</span> <span class="s">{</span>
+ 285     <span class="k">my</span><span class="s">(</span><span class="i">$FourthChar</span><span class="s">)</span><span class="sc">;</span>
+ 286     <span class="i">$FourthChar</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="n">3</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 287     <span class="i">$Status</span> = <span class="s">(</span><span class="i">$FourthChar</span> !~ <span class="q">/\;/</span><span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
+ 288   <span class="s">}</span>
+ 289   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
+ 290 
+ 291   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
+ 292 <span class="s">}</span>
+ 293 
+ 294 <span class="c"># Return 1 when the file looks like NBRF/PIR FASTA, 0 otherwise; dies if the file cannot be opened.</span>
+ 295 <span class="c"># Only the first line of the file is examined.</span>
+<a name="IsPIRFastaSequenceFile-"></a> 296 <span class="k">sub </span><span class="m">IsPIRFastaSequenceFile</span> <span class="s">{</span>
+ 297   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFile</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
+ 298 
+ 299   <span class="s">(</span><span class="i">$FastaFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 300   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
+ 301   <span class="c"># Three-argument open keeps the read mode apart from the file name, so the name is never parsed for mode or pipe characters. The name stays quoted so an undefined name still fails the open.</span>
+ 302   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;&lt;&quot;</span><span class="cm">,</span> <span class="q">&quot;$FastaFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFile: $!\n&quot;</span><span class="sc">;</span>
+ 303   <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
+ 304 
+ 305   <span class="c"># First line starts with &gt; and the fourth character is &#39;;&#39;; otherwise, it&#39;s</span>
+ 306   <span class="c"># a Pearson FASTA format.</span>
+ 307   <span class="k">if</span> <span class="s">(</span><span class="k">defined</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span> &amp;&amp; <span class="i">$Line</span> =~ <span class="q">/^&gt;/</span><span class="s">)</span> <span class="s">{</span>
+ 308     <span class="k">my</span><span class="s">(</span><span class="i">$FourthChar</span><span class="s">)</span><span class="sc">;</span>
+ 309     <span class="i">$FourthChar</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="n">3</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 310     <span class="i">$Status</span> = <span class="s">(</span><span class="i">$FourthChar</span> =~ <span class="q">/\;/</span><span class="s">)</span> ? <span class="n">1</span> <span class="co">:</span> <span class="n">0</span><span class="sc">;</span>
+ 311   <span class="s">}</span>
+ 312   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
+ 313 
+ 314   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
+ 315 <span class="s">}</span>
+ 316 
+ 317 <span class="c"># Return 1 when the file looks like an MSF sequence or alignment file, 0 otherwise; dies if the file cannot be opened.</span>
+ 318 <span class="c"># Lines are scanned in order until one carries an MSF: or Pileup marker.</span>
+<a name="IsMSFSequenceFile-"></a> 319 <span class="k">sub </span><span class="m">IsMSFSequenceFile</span> <span class="s">{</span>
+ 320   <span class="k">my</span><span class="s">(</span><span class="i">$MSFFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 321   <span class="c"># Three-argument open keeps the read mode apart from the file name, so the name is never parsed for mode or pipe characters. The name stays quoted so an undefined name still fails the open.</span>
+ 322   <span class="k">open</span> <span class="w">MSFFILE</span><span class="cm">,</span> <span class="q">&quot;&lt;&quot;</span><span class="cm">,</span> <span class="q">&quot;$MSFFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $MSFFile: $!\n&quot;</span><span class="sc">;</span>
+ 323 
+ 324   <span class="k">my</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$Status</span><span class="s">)</span><span class="sc">;</span>
+ 325   <span class="c"># NOTE(review): the loop below ends on the first false value from GetTextLine; if that helper strips line endings, a blank line would stop the scan early - confirm.</span>
+ 326   <span class="i">$Status</span> = <span class="n">0</span><span class="sc">;</span>
+ 327   <span class="c"># Find a line that contains MSF: keyword and ends with &#39;..&#39;</span>
+ 328   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 329     <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 330     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/MSF:/i</span> &amp;&amp; <span class="i">$Line</span> =~ <span class="q">/\.\.[ ]*$/</span><span class="s">)</span> <span class="s">{</span>
+ 331       <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
+ 332       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
+ 333     <span class="s">}</span>
+ 334     <span class="k">elsif</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i</span><span class="s">)</span> <span class="s">{</span>
+ 335       <span class="c"># Pileup MSF...</span>
+ 336       <span class="i">$Status</span> = <span class="n">1</span><span class="sc">;</span>
+ 337       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
+ 338     <span class="s">}</span>
+ 339   <span class="s">}</span>
+ 340   <span class="k">close</span> <span class="w">MSFFILE</span><span class="sc">;</span>
+ 341 
+ 342   <span class="k">return</span> <span class="i">$Status</span><span class="sc">;</span>
+ 343 <span class="s">}</span>
+ 344 
+ 345 <span class="c"># Read a sequence or sequence alignment file with the reader that matches its detected format.</span>
+<a name="ReadSequenceFile-"></a> 346 <span class="k">sub </span><span class="m">ReadSequenceFile</span> <span class="s">{</span>
+ 347   <span class="k">my</span> <span class="i">$File</span> = <span class="k">shift</span><span class="sc">;</span>
+ 348 
+ 349   <span class="c"># Detectors run in a fixed order: Pearson FASTA, PIR FASTA, MSF, ClustalW.</span>
+ 350   <span class="c"># The first one that accepts the file decides which reader is called, and</span>
+ 351   <span class="c"># the result of that reader is returned unchanged.</span>
+ 352 
+ 353   <span class="k">return</span> <span class="i">ReadPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span>
+ 354     <span class="k">if</span> <span class="i">IsPearsonFastaSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span><span class="sc">;</span>
+ 355   <span class="k">return</span> <span class="i">ReadPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span>
+ 356     <span class="k">if</span> <span class="i">IsPIRFastaSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span><span class="sc">;</span>
+ 357   <span class="k">return</span> <span class="i">ReadMSFSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span>
+ 358     <span class="k">if</span> <span class="i">IsMSFSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span><span class="sc">;</span>
+ 359   <span class="k">return</span> <span class="i">ReadClustalWSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span>
+ 360     <span class="k">if</span> <span class="i">IsClustalWSequenceFile</span><span class="s">(</span><span class="i">$File</span><span class="s">)</span><span class="sc">;</span>
+ 361 
+ 362   <span class="c"># No detector recognized the file.</span>
+ 363   <span class="k">return</span> <span class="k">undef</span><span class="sc">;</span>
+ 364 <span class="s">}</span>
+ 365 
+ 366 <span class="c"># Read a ClustalW file by handing it to _ReadFileAndSetupSequencesData with format &#39;ClustalW&#39;.</span>
+<a name="ReadClustalWSequenceFile-"></a> 367 <span class="k">sub </span><span class="m">ReadClustalWSequenceFile</span> <span class="s">{</span>
+ 368   <span class="k">my</span> <span class="i">$File</span> = <span class="k">shift</span><span class="sc">;</span>
+ 369   <span class="c"># The shared reader does the work; its return value is passed straight through.</span>
+ 370   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$File</span><span class="cm">,</span> <span class="q">&#39;ClustalW&#39;</span><span class="s">)</span><span class="sc">;</span>
+ 371 <span class="s">}</span>
+ 372 
+ 373 <span class="c"># Read a Pearson FASTA file by handing it to _ReadFileAndSetupSequencesData with format &#39;Pearson&#39;.</span>
+<a name="ReadPearsonFastaSequenceFile-"></a> 374 <span class="k">sub </span><span class="m">ReadPearsonFastaSequenceFile</span> <span class="s">{</span>
+ 375   <span class="k">my</span> <span class="i">$File</span> = <span class="k">shift</span><span class="sc">;</span>
+ 376   <span class="c"># The shared reader does the work; its return value is passed straight through.</span>
+ 377   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$File</span><span class="cm">,</span> <span class="q">&#39;Pearson&#39;</span><span class="s">)</span><span class="sc">;</span>
+ 378 <span class="s">}</span>
+ 379 
+ 380 <span class="c"># Read a PIR FASTA file by handing it to _ReadFileAndSetupSequencesData with format &#39;PIR&#39;.</span>
+<a name="ReadPIRFastaSequenceFile-"></a> 381 <span class="k">sub </span><span class="m">ReadPIRFastaSequenceFile</span> <span class="s">{</span>
+ 382   <span class="k">my</span> <span class="i">$File</span> = <span class="k">shift</span><span class="sc">;</span>
+ 383   <span class="c"># The shared reader does the work; its return value is passed straight through.</span>
+ 384   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$File</span><span class="cm">,</span> <span class="q">&#39;PIR&#39;</span><span class="s">)</span><span class="sc">;</span>
+ 385 <span class="s">}</span>
+ 386 
+ 387 
+ 388 <span class="c"># Read an MSF file by handing it to _ReadFileAndSetupSequencesData with format &#39;MSF&#39;.</span>
+<a name="ReadMSFSequenceFile-"></a> 389 <span class="k">sub </span><span class="m">ReadMSFSequenceFile</span> <span class="s">{</span>
+ 390   <span class="k">my</span> <span class="i">$File</span> = <span class="k">shift</span><span class="sc">;</span>
+ 391   <span class="c"># The shared reader does the work; its return value is passed straight through.</span>
+ 392   <span class="k">return</span> <span class="i">_ReadFileAndSetupSequencesData</span><span class="s">(</span><span class="i">$File</span><span class="cm">,</span> <span class="q">&#39;MSF&#39;</span><span class="s">)</span><span class="sc">;</span>
+ 393 <span class="s">}</span>
+ 394 
+ 395 <span class="c"># Write sequences to a Pearson FASTA file. Arguments: file name, sequence data ref and an optional maximum line length handed to WrapText. Dies if the file cannot be opened.</span>
+<a name="WritePearsonFastaSequenceFile-"></a> 396 <span class="k">sub </span><span class="m">WritePearsonFastaSequenceFile</span> <span class="s">{</span>
+ 397   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$WrappedSequence</span><span class="s">)</span><span class="sc">;</span>
+ 398   <span class="c"># The length stays at 80 unless exactly three arguments are passed; with any count other than two or three no argument is read at all.</span>
+ 399   <span class="i">$MaxLength</span> = <span class="n">80</span><span class="sc">;</span>
+ 400   <span class="k">if</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">3</span><span class="s">)</span> <span class="s">{</span>
+ 401     <span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 402   <span class="s">}</span>
+ 403   <span class="k">elsif</span> <span class="s">(</span><span class="i">@_</span> == <span class="n">2</span><span class="s">)</span> <span class="s">{</span>
+ 404     <span class="s">(</span><span class="i">$SequenceFileName</span><span class="cm">,</span> <span class="i">$SequenceDataRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 405   <span class="s">}</span>
+ 406   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;&gt;&quot;</span><span class="cm">,</span> <span class="q">&quot;$SequenceFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Can&#39;t open $SequenceFileName: $!\n&quot;</span><span class="sc">;</span> <span class="c"># mode passed separately so the name is never parsed for mode characters; the name stays quoted so an undefined name still fails the open</span>
+ 407   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequenceDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 408     <span class="i">$Description</span> = <span class="i">$SequenceDataRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
+ 409     <span class="i">$Sequence</span> = <span class="i">$SequenceDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
+ 410     <span class="i">$WrappedSequence</span> = <span class="i">WrapText</span><span class="s">(</span><span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$MaxLength</span><span class="cm">,</span> <span class="q">&quot;\n&quot;</span><span class="s">)</span><span class="sc">;</span>
+ 411 
+ 412     <span class="c"># Description also contains ID...</span>
+ 413     <span class="k">print</span> <span class="i">SEQUENCEFILE</span> <span class="q">&quot;&gt;$Description\n&quot;</span><span class="sc">;</span>
+ 414     <span class="k">print</span> <span class="i">SEQUENCEFILE</span> <span class="q">&quot;$WrappedSequence\n&quot;</span><span class="sc">;</span>
+ 415   <span class="s">}</span>
+ 416   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
+ 417 <span class="s">}</span>
+ 418 
+ 419 <span class="c"># Return (ID, Sequence, Length, Description) of the shortest or longest sequence listed in {IDs}; on equal lengths the earliest one wins.</span>
+<a name="_GetShortestOrLongestSequence-"></a> 420 <span class="k">sub </span><span class="m">_GetShortestOrLongestSequence</span> <span class="s">{</span>
+ 421   <span class="k">my</span><span class="s">(</span><span class="i">$SequencesDataRef</span><span class="cm">,</span> <span class="i">$SequenceType</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 422   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
+ 423   <span class="c"># $SequenceType is matched against Shortest or Longest, ignoring case; a true $IgnoreGaps strips gaps via RemoveSequenceGaps before measuring.</span>
+ 424   <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="s">)</span> = <span class="s">(</span><span class="q">&#39;&#39;</span><span class="cm">,</span> <span class="q">&#39;&#39;</span><span class="cm">,</span> <span class="q">&#39;&#39;</span><span class="s">)</span><span class="sc">;</span>
+ 425   <span class="i">$FirstID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 426   <span class="c"># With no IDs the loop never runs and (&#39;&#39;, &#39;&#39;, &#39;&#39;, undef) is returned.</span>
+ 427   <span class="j">ID:</span> <span class="k">for</span> <span class="i">$CurrentID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequencesDataRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 428     <span class="i">$CurrentSeq</span> = <span class="i">$IgnoreGaps</span> ? <span class="i">RemoveSequenceGaps</span><span class="s">(</span><span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$CurrentID</span>}<span class="s">)</span> <span class="co">:</span> <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$CurrentID</span>}<span class="sc">;</span>
+ 429     <span class="i">$CurrentSeqLen</span> = <span class="i">GetSequenceLength</span><span class="s">(</span><span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$IgnoreGaps</span><span class="s">)</span><span class="sc">;</span>
+ 430     <span class="i">$CurrentDescription</span> = <span class="i">$SequencesDataRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$CurrentID</span>}<span class="sc">;</span>
+ 431     <span class="k">if</span> <span class="s">(</span>!<span class="k">length</span><span class="s">(</span><span class="i">$FirstID</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span> <span class="c"># length test so an ID of 0 still counts as already seeded</span>
+ 432       <span class="i">$FirstID</span> = <span class="i">$CurrentID</span><span class="sc">;</span> <span class="i">$FirstSeqLen</span> = <span class="i">$CurrentSeqLen</span><span class="sc">;</span> <span class="c"># seed from the first entry itself so this branch runs once and the first sequence takes part in the comparison</span>
+ 433       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
+ 434       <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
+ 435     <span class="s">}</span>
+ 436     <span class="k">if</span> <span class="s">(</span><span class="i">$CurrentSeqLen</span> != <span class="i">$SeqLen</span><span class="s">)</span> <span class="s">{</span>
+ 437       <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/Shortest/i</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$CurrentSeqLen</span> &lt; <span class="i">$SeqLen</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 438         <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
+ 439       <span class="s">}</span>
+ 440       <span class="k">elsif</span> <span class="s">(</span><span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/Longest/i</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$CurrentSeqLen</span> &gt; <span class="i">$SeqLen</span><span class="s">)</span> <span class="s">)</span> <span class="s">{</span>
+ 441         <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span> = <span class="s">(</span><span class="i">$CurrentID</span><span class="cm">,</span> <span class="i">$CurrentSeq</span><span class="cm">,</span> <span class="i">$CurrentSeqLen</span><span class="cm">,</span> <span class="i">$CurrentDescription</span><span class="s">)</span><span class="sc">;</span>
+ 442       <span class="s">}</span>
+ 443     <span class="s">}</span>
+ 444   <span class="s">}</span>
+ 445   <span class="k">return</span> <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Seq</span><span class="cm">,</span> <span class="i">$SeqLen</span><span class="cm">,</span> <span class="i">$Description</span><span class="s">)</span><span class="sc">;</span>
+ 446 <span class="s">}</span>
+ 447 
+ 448 <span class="c"># Return a copy of the sequence that keeps only the characters matching /[A-Z]/i, in their original order.</span>
+<a name="RemoveSequenceGaps-"></a> 449 <span class="k">sub </span><span class="m">RemoveSequenceGaps</span> <span class="s">{</span>
+ 450   <span class="k">my</span> <span class="i">$Seq</span> = <span class="k">shift</span><span class="sc">;</span>
+ 451 
+ 452   <span class="c"># Walk the sequence one character at a time; gap symbols and anything</span>
+ 453   <span class="c"># else that fails the letter test are left out.</span>
+ 454   <span class="k">my</span> <span class="i">@Residues</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 455   <span class="k">foreach</span> <span class="k">my</span> <span class="i">$Char</span> <span class="s">(</span><span class="k">split</span><span class="s">(</span><span class="q">//</span><span class="cm">,</span> <span class="i">$Seq</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 456     <span class="k">next</span> <span class="k">unless</span> <span class="i">$Char</span> =~ <span class="q">/[A-Z]/i</span><span class="sc">;</span>
+ 457     <span class="k">push</span> <span class="i">@Residues</span><span class="cm">,</span> <span class="i">$Char</span><span class="sc">;</span>
+ 458   <span class="s">}</span>
+ 459 
+ 460   <span class="c"># Joining an empty list gives &#39;&#39;, so a sequence with no letters</span>
+ 461   <span class="c"># comes back as the empty string.</span>
+ 462   <span class="k">return</span> <span class="k">join</span><span class="s">(</span><span class="q">&#39;&#39;</span><span class="cm">,</span> <span class="i">@Residues</span><span class="s">)</span><span class="sc">;</span>
+ 463 <span class="s">}</span>
+ 464 
+ 465 <span class="c"># Using input alignment data map ref containing following keys, generate</span>
+ 466 <span class="c"># a new hash with same set of keys after residue columns containing only</span>
+ 467 <span class="c"># gaps have been removed:</span>
+ 468 <span class="c">#</span>
+ 469 <span class="c"># {IDs} : Array of IDs in order as they appear in file</span>
+ 470 <span class="c"># {Count}: ID count...</span>
+ 471 <span class="c"># {Description}{$ID} : Description data...</span>
+ 472 <span class="c"># {Sequence}{$ID} : Sequence data...</span>
+ 473 <span class="c">#</span>
+<a name="RemoveSequenceAlignmentGapColumns-"></a> 474 <span class="k">sub </span><span class="m">RemoveSequenceAlignmentGapColumns</span> <span class="s">{</span>
+ 475   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$AlignmentDataMapRef</span><span class="cm">,</span> <span class="i">%NewAlignmentDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 476 
+ 477   <span class="s">(</span><span class="i">$AlignmentDataMapRef</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 478 
+ 479   <span class="i">%NewAlignmentDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 480   <span class="i">@</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 481   <span class="i">%</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 482   <span class="i">%</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 483   <span class="i">$NewAlignmentDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 484 
+ 485   <span class="c"># Transfer ID and count information...</span>
+ 486   <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 487     <span class="k">push</span> <span class="i">@</span>{<span class="i">$NewAlignmentDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 488     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Description</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
+ 489     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 490     <span class="i">$NewAlignmentDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 491   <span class="s">}</span>
+ 492 
+ 493   <span class="c"># Go over residue columns and transfer the data...</span>
+ 494   <span class="k">my</span><span class="s">(</span><span class="i">$FirstID</span><span class="cm">,</span> <span class="i">$FirstSeq</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="i">$Res</span><span class="cm">,</span> <span class="i">$GapColumn</span><span class="s">)</span><span class="sc">;</span>
+ 495 
+ 496   <span class="i">$FirstID</span> = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}[<span class="n">0</span>]<span class="sc">;</span>
+ 497   <span class="i">$FirstSeq</span> = <span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$FirstID</span>}<span class="sc">;</span>
+ 498   <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$FirstSeq</span><span class="s">)</span><span class="sc">;</span>
+ 499 
+ 500   <span class="j">RES:</span> <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$FirstSeqLen</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 501     <span class="c"># Is this a gap column?</span>
+ 502     <span class="i">$GapColumn</span> = <span class="n">1</span><span class="sc">;</span>
+ 503     <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 504       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 505       <span class="k">if</span> <span class="s">(</span><span class="i">$Res</span> =~ <span class="q">/[A-Z]/i</span><span class="s">)</span> <span class="s">{</span>
+ 506         <span class="i">$GapColumn</span> = <span class="n">0</span><span class="sc">;</span>
+ 507         <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
+ 508       <span class="s">}</span>
+ 509     <span class="s">}</span>
+ 510     <span class="k">if</span> <span class="s">(</span><span class="i">$GapColumn</span><span class="s">)</span> <span class="s">{</span>
+ 511       <span class="k">next</span> <span class="j">RES</span><span class="sc">;</span>
+ 512     <span class="s">}</span>
+ 513     <span class="c"># Transfer this residue...</span>
+ 514     <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 515       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$AlignmentDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 516       <span class="i">$NewAlignmentDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Res</span><span class="sc">;</span>
+ 517     <span class="s">}</span>
+ 518   <span class="s">}</span>
+ 519 
+ 520   <span class="k">return</span> <span class="s">(</span>\<span class="i">%NewAlignmentDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 521 <span class="s">}</span>
+ 522 
+ 523 <span class="c">#</span>
+ 524 <span class="c"># Read sequences file and return a reference to hash with the following keys:</span>
+ 525 <span class="c">#</span>
+ 526 <span class="c"># {IDs} - Array of sequence IDs</span>
+ 527 <span class="c"># {Count} - Number of sequences</span>
+ 528 <span class="c"># {Description}{$ID} - Sequence description</span>
+ 529 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
+ 530 <span class="c"># {InputFileType} - Sequence file format</span>
+ 531 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
+ 532 <span class="c">#</span>
+ 533 <span class="c"># Note:</span>
+ 534 <span class="c">#   . Conserved residue annotation either exists in the input sequence alignment file or is set,</span>
+ 535 <span class="c">#     for a file containing the same number of residues for all the sequences, using the following</span>
+ 536 <span class="c">#     notation: * - Residue conserved; &#39; &#39; - Residue not conserved.</span>
+ 537 <span class="c">#</span>
+<a name="_ReadFileAndSetupSequencesData-"></a> 538 <span class="k">sub </span><span class="m">_ReadFileAndSetupSequencesData</span> <span class="s">{</span>
+ 539   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="cm">,</span> <span class="i">$SequenceType</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 540   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
+ 541 
+ 542   <span class="i">$SequenceDataMapRef</span> = <span class="k">undef</span><span class="sc">;</span>
+ 543 
+ 544   <span class="c"># Read sequence file...</span>
+ 545   <span class="i">$SequenceDataMapRef</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 546   <span class="k">if</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^ClustalW$/i</span><span class="s">)</span> <span class="s">{</span>
+ 547     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadClustalWFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
+ 548   <span class="s">}</span>
+ 549   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^Pearson$/i</span><span class="s">)</span> <span class="s">{</span>
+ 550     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadPearsonFastaFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
+ 551   <span class="s">}</span>
+ 552   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^PIR$/i</span><span class="s">)</span> <span class="s">{</span>
+ 553     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadPIRFastaFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
+ 554   <span class="s">}</span>
+ 555   <span class="k">elsif</span> <span class="s">(</span><span class="i">$SequenceType</span> =~ <span class="q">/^MSF$/i</span><span class="s">)</span> <span class="s">{</span>
+ 556     <span class="i">$SequenceDataMapRef</span> = <span class="i">_ReadMSFFile</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span><span class="sc">;</span>
+ 557   <span class="s">}</span>
+ 558   <span class="k">else</span> <span class="s">{</span>
+ 559     <span class="k">return</span> <span class="i">$SequenceDataMapRef</span><span class="sc">;</span>
+ 560   <span class="s">}</span>
+ 561 
+ 562   <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>}<span class="s">)</span> <span class="s">{</span>
+ 563     <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
+ 564   <span class="s">}</span>
+ 565   <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Count</span>} &gt; <span class="n">1</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">AreSequenceLengthsIdentical</span><span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="s">)</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 566     <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
+ 567   <span class="s">}</span>
+ 568 
+ 569   <span class="c"># Use the first sequence to setup an empty ConservedAnnotation key...</span>
+ 570   <span class="c"># And mark fully conserved residues...</span>
+ 571   <span class="c">#</span>
+ 572   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$FirstSequence</span><span class="cm">,</span> <span class="i">$FirstSeqLen</span><span class="cm">,</span> <span class="i">$Res</span><span class="cm">,</span> <span class="i">$FirstRes</span><span class="cm">,</span> <span class="i">$ResConserved</span><span class="cm">,</span> <span class="i">$Index</span><span class="s">)</span><span class="sc">;</span>
+ 573   <span class="i">$ID</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">IDs</span>}[<span class="n">0</span>]<span class="sc">;</span>
+ 574   <span class="i">$FirstSequence</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
+ 575   <span class="i">$FirstSeqLen</span> = <span class="k">length</span><span class="s">(</span><span class="i">$FirstSequence</span><span class="s">)</span><span class="sc">;</span>
+ 576   <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 577   <span class="k">for</span> <span class="i">$Index</span> <span class="s">(</span><span class="n">0</span> .. <span class="s">(</span><span class="i">$FirstSeqLen</span> - <span class="n">1</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 578     <span class="i">$FirstRes</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 579     <span class="i">$ResConserved</span> = <span class="n">1</span><span class="sc">;</span>
+ 580     <span class="j">ID:</span> <span class="k">for</span> <span class="i">$ID</span> <span class="s">(</span><span class="i">@</span>{<span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">IDs</span>}}<span class="s">)</span> <span class="s">{</span>
+ 581       <span class="i">$Sequence</span> = <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="sc">;</span>
+ 582       <span class="i">$Res</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$Index</span><span class="cm">,</span> <span class="n">1</span><span class="s">)</span><span class="sc">;</span>
+ 583       <span class="k">if</span> <span class="s">(</span>!<span class="i">$FirstRes</span><span class="s">)</span> <span class="s">{</span>
+ 584         <span class="i">$FirstRes</span> = <span class="i">$Res</span><span class="sc">;</span>
+ 585         <span class="k">next</span> <span class="j">ID</span><span class="sc">;</span>
+ 586       <span class="s">}</span>
+ 587       <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$Res</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span> || <span class="s">(</span><span class="i">$Res</span> <span class="k">ne</span> <span class="i">$FirstRes</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 588         <span class="i">$ResConserved</span> = <span class="n">0</span><span class="sc">;</span>
+ 589         <span class="k">last</span> <span class="j">ID</span><span class="sc">;</span>
+ 590       <span class="s">}</span>
+ 591     <span class="s">}</span>
+ 592     <span class="k">if</span> <span class="s">(</span><span class="i">$ResConserved</span><span class="s">)</span> <span class="s">{</span>
+ 593       <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} .= <span class="q">&#39;*&#39;</span><span class="sc">;</span>
+ 594     <span class="s">}</span>
+ 595     <span class="k">else</span> <span class="s">{</span>
+ 596       <span class="i">$SequenceDataMapRef</span>-&gt;{<span class="w">ConservedAnnotation</span>} .= <span class="q">&#39; &#39;</span><span class="sc">;</span>
+ 597     <span class="s">}</span>
+ 598   <span class="s">}</span>
+ 599 
+ 600   <span class="k">return</span> <span class="s">(</span><span class="i">$SequenceDataMapRef</span><span class="s">)</span><span class="sc">;</span>
+ 601 <span class="s">}</span>
+ 602 
+ 603 <span class="c"># Read sequence data in ClustalW multiple sequence alignment file and</span>
+ 604 <span class="c"># return a reference to hash with these keys and values:</span>
+ 605 <span class="c">#</span>
+ 606 <span class="c"># {IDs} - Array of sequence IDs</span>
+ 607 <span class="c"># {Count} - Number of sequences</span>
+ 608 <span class="c"># {Description}{$ID} - Sequence description</span>
+ 609 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
+ 610 <span class="c"># {InputFileType} - Sequence file format</span>
+ 611 <span class="c"># {ConservedAnnotation} - Conserved residue annotations: space, *, : , .</span>
+ 612 <span class="c">#</span>
+ 613 <span class="c">#</span>
+ 614 <span class="c">#</span>
+ 615 <span class="c"># And based on the ClustalW/X manual, here is what the ConservedAnnotation characters mean:</span>
+ 616 <span class="c">#</span>
+ 617 <span class="c"># &#39;*&#39; indicates positions which have a single, fully conserved residue</span>
+ 618 <span class="c">#</span>
+ 619 <span class="c"># &#39;:&#39; indicates that one of the following &#39;strong&#39; groups is fully conserved: STA</span>
+ 620 <span class="c">#    NEQK NHQK NDEQ QHRK MILV MILF HY FYW</span>
+ 621 
+ 622 <span class="c"># &#39;.&#39; indicates that one of the following &#39;weaker&#39; groups is fully conserved:</span>
+ 623 <span class="c">#     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY</span>
+ 624 <span class="c">#</span>
+ 625 <span class="c"># These are all the positively scoring groups that occur in the Gonnet Pam250</span>
+ 626 <span class="c"># matrix. The strong and weak groups are defined as strong score &gt;0.5 and weak</span>
+ 627 <span class="c"># score =&lt;0.5 respectively.</span>
+ 628 <span class="c">#</span>
+<a name="_ReadClustalWFile-"></a> 629 <span class="k">sub </span><span class="m">_ReadClustalWFile</span> <span class="s">{</span>
+ 630   <span class="k">my</span><span class="s">(</span><span class="i">$SequenceFile</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 631   <span class="k">my</span><span class="s">(</span><span class="i">%SequencesDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 632 
+ 633   <span class="c"># Initialize data...</span>
+ 634   <span class="i">%SequencesDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 635   <span class="i">@</span>{<span class="i">$SequencesDataMap</span>{<span class="w">IDs</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 636   <span class="i">%</span>{<span class="i">$SequencesDataMap</span>{<span class="w">Description</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 637   <span class="i">%</span>{<span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}} = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 638   <span class="i">$SequencesDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 639   <span class="i">$SequencesDataMap</span>{<span class="w">ConservedAnnotation</span>} = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 640   <span class="i">$SequencesDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;ClustalW&#39;</span><span class="sc">;</span>
+ 641 
+ 642   <span class="k">open</span> <span class="w">SEQUENCEFILE</span><span class="cm">,</span> <span class="q">&quot;$SequenceFile&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $SequenceFile: $!\n&quot;</span><span class="sc">;</span>
+ 643 
+ 644   <span class="k">my</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$LineLength</span><span class="cm">,</span> <span class="i">$AnnotationStart</span><span class="cm">,</span> <span class="i">$AnnotationLength</span><span class="cm">,</span> <span class="i">$Annotation</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$SequenceLength</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$IDIndex</span><span class="s">)</span><span class="sc">;</span>
+ 645 
+ 646   <span class="c"># Ignore the header line...</span>
+ 647   <span class="i">$Line</span> = <span class="q">&lt;SEQUENCEFILE&gt;</span><span class="sc">;</span>
+ 648 
+ 649   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*SEQUENCEFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 650     <span class="k">if</span> <span class="s">(</span><span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^[ \*\:\.]/</span><span class="s">)</span> &amp;&amp; <span class="s">(</span><span class="i">$Line</span> !~ <span class="q">/[A-Z]/i</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 651       <span class="c"># Annotation for sequences: fully conserved, weaker or stronger group conserved.</span>
+ 652       <span class="c"># Extract it and save...</span>
+ 653       <span class="i">$LineLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 654       <span class="i">$AnnotationStart</span> = <span class="i">$LineLength</span> - <span class="i">$SequenceLength</span><span class="sc">;</span>
+ 655       <span class="i">$AnnotationLength</span> = <span class="i">$SequenceLength</span><span class="sc">;</span>
+ 656       <span class="i">$Annotation</span> = <span class="k">substr</span><span class="s">(</span><span class="i">$Line</span><span class="cm">,</span> <span class="i">$AnnotationStart</span><span class="cm">,</span> <span class="i">$AnnotationLength</span><span class="s">)</span><span class="sc">;</span>
+ 657       <span class="i">$SequencesDataMap</span>{<span class="w">ConservedAnnotation</span>} .= <span class="i">$Annotation</span><span class="sc">;</span>
+ 658     <span class="s">}</span>
+ 659     <span class="k">else</span> <span class="s">{</span>
+ 660       <span class="c"># Extract ID and sequences...</span>
+ 661       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="s">)</span>= <span class="i">$Line</span> =~ <span class="q">/^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/</span><span class="sc">;</span>
+ 662       <span class="i">$Sequence</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
+ 663       <span class="k">if</span> <span class="s">(</span>!<span class="s">(</span><span class="i">$ID</span> &amp;&amp; <span class="i">$Sequence</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 664         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 665       <span class="s">}</span>
+ 666 
+ 667       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 668         <span class="c"># Append to existing alignment value...</span>
+ 669         <span class="i">$SequenceLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence</span><span class="s">)</span><span class="sc">;</span>
+ 670         <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Sequence</span><span class="sc">;</span>
+ 671       <span class="s">}</span>
+ 672       <span class="k">else</span> <span class="s">{</span>
+ 673         <span class="c"># New alignment data...</span>
+ 674         <span class="i">$SequencesDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 675         <span class="k">push</span> <span class="i">@</span>{<span class="i">$SequencesDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 676         <span class="i">$SequencesDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
+ 677         <span class="i">$SequencesDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Sequence</span><span class="sc">;</span>
+ 678         <span class="i">$SequenceLength</span> = <span class="k">length</span><span class="s">(</span><span class="i">$Sequence</span><span class="s">)</span><span class="sc">;</span>
+ 679       <span class="s">}</span>
+ 680     <span class="s">}</span>
+ 681   <span class="s">}</span>
+ 682   <span class="k">close</span> <span class="w">SEQUENCEFILE</span><span class="sc">;</span>
+ 683   <span class="k">return</span> <span class="s">(</span>\<span class="i">%SequencesDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 684 <span class="s">}</span>
+ 685 
+ 686 <span class="c"># Read Pearson fasta file and return a reference to hash with these keys:</span>
+ 687 <span class="c">#</span>
+ 688 <span class="c"># {IDs} - Array of sequence IDs</span>
+ 689 <span class="c"># {Count} - Number of sequences</span>
+ 690 <span class="c"># {Description}{$ID} - Sequence description</span>
+ 691 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
+ 692 <span class="c"># {InputFileType} - Sequence file format</span>
+ 693 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
+ 694 <span class="c">#</span>
+<a name="_ReadPearsonFastaFile-"></a> 695 <span class="k">sub </span><span class="m">_ReadPearsonFastaFile</span> <span class="s">{</span>
+ 696   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFileName</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$IgnoreID</span><span class="cm">,</span> <span class="i">@LineWords</span><span class="cm">,</span> <span class="i">%FastaDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 697 
+ 698   <span class="s">(</span><span class="i">$FastaFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 699 
+ 700   <span class="i">%FastaDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 701   <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 702   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 703   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 704   <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 705   <span class="i">$FastaDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;Pearson&#39;</span><span class="sc">;</span>
+ 706 
+ 707   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFileName: $!\n&quot;</span><span class="sc">;</span>
+ 708   <span class="i">$ID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 709   <span class="i">$IgnoreID</span> = <span class="n">0</span><span class="sc">;</span>
+ 710   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 711     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^\&gt;/</span><span class="s">)</span> <span class="s">{</span>
+ 712       <span class="c"># Start of a new ID...</span>
+ 713       <span class="i">$Line</span> =~ <span class="q">s/^\&gt;//</span><span class="sc">;</span>
+ 714       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 715       <span class="i">@LineWords</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 716       <span class="i">@LineWords</span> = <span class="k">split</span> <span class="q">/ /</span><span class="cm">,</span> <span class="i">$Line</span><span class="sc">;</span>
+ 717 
+ 718       <span class="i">$ID</span> = <span class="i">$LineWords</span>[<span class="n">0</span>]<span class="sc">;</span>
+ 719       <span class="i">$ID</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
+ 720       <span class="i">$Description</span> = <span class="i">$Line</span><span class="sc">;</span>
+ 721 
+ 722       <span class="i">$IgnoreID</span> = <span class="n">0</span><span class="sc">;</span>
+ 723       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 724         <span class="i">$IgnoreID</span> = <span class="n">1</span><span class="sc">;</span>
+ 725         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
+ 726         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 727       <span class="s">}</span>
+ 728       <span class="k">push</span> <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 729       <span class="i">$FastaDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$Description</span><span class="sc">;</span>
+ 730       <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 731       <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 732     <span class="s">}</span>
+ 733     <span class="k">if</span> <span class="s">(</span><span class="i">$IgnoreID</span><span class="s">)</span> <span class="s">{</span> <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span> <span class="s">}</span>
+ 734 
+ 735     <span class="c"># Remove any spaces in the sequence...</span>
+ 736     <span class="i">$Line</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
+ 737     <span class="c"># Sequence data for active ID...</span>
+ 738     <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 739       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Line</span><span class="sc">;</span>
+ 740     <span class="s">}</span>
+ 741     <span class="k">else</span> <span class="s">{</span>
+ 742       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Line</span><span class="sc">;</span>
+ 743     <span class="s">}</span>
+ 744   <span class="s">}</span>
+ 745   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
+ 746   <span class="k">return</span> \<span class="i">%FastaDataMap</span><span class="sc">;</span>
+ 747 <span class="s">}</span>
+ 748 
+ 749 <span class="c"># Read PIR fasta file and return a reference to hash with these keys:</span>
+ 750 <span class="c">#</span>
+ 751 <span class="c"># {IDs} - Array of sequence IDs</span>
+ 752 <span class="c"># {Count} - Number of sequences</span>
+ 753 <span class="c"># {Description}{$ID} - Sequence description</span>
+ 754 <span class="c"># {Sequence}{$ID} - Sequence for a specific ID</span>
+ 755 <span class="c"># {InputFileType} - Sequence file format</span>
+ 756 <span class="c"># {ConservedAnnotation} - Conserved residue annotation</span>
+ 757 <span class="c">#</span>
+ 758 <span class="c"># Format:</span>
+ 759 <span class="c"># A sequence in PIR format consists of:</span>
+ 760 <span class="c"># One line starting with</span>
+ 761 <span class="c">#   a &quot;&gt;&quot; (greater-than) sign, followed by</span>
+ 762 <span class="c">#   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by</span>
+ 763 <span class="c">#   a semicolon, followed by</span>
+ 764 <span class="c">#   the sequence identification code (the database ID-code).</span>
+ 765 <span class="c"># One line containing a textual description of the sequence.</span>
+ 766 <span class="c"># One or more lines containing the sequence itself. The end of the</span>
+ 767 <span class="c"># sequence is marked by a &quot;*&quot; (asterisk) character.</span>
+ 768 <span class="c">#</span>
+ 769 <span class="c"># A file in PIR format may comprise more than one sequence.</span>
+ 770 <span class="c">#</span>
+ 771 <span class="c"># The PIR format is also often referred to as the NBRF format.</span>
+ 772 <span class="c">#</span>
+ 773 <span class="c"># Code SequenceType</span>
+ 774 <span class="c"># P1    Protein (complete)</span>
+ 775 <span class="c"># F1    Protein (fragment)</span>
+ 776 <span class="c"># DL    DNA (linear)</span>
+ 777 <span class="c"># DC    DNA (circular)</span>
+ 778 <span class="c"># RL    RNA (linear)</span>
+ 779 <span class="c"># RC    RNA (circular)</span>
+ 780 <span class="c"># N3    tRNA</span>
+ 781 <span class="c"># N1    Other functional RNA</span>
+ 782 <span class="c">#</span>
+ 783 
+<a name="_ReadPIRFastaFile-"></a> 784 <span class="k">sub </span><span class="m">_ReadPIRFastaFile</span> <span class="s">{</span>
+ 785   <span class="k">my</span><span class="s">(</span><span class="i">$FastaFileName</span><span class="cm">,</span> <span class="i">$ID</span><span class="cm">,</span> <span class="i">$Description</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">$SequenceTypeCode</span><span class="cm">,</span> <span class="i">$ReadingSequenceData</span><span class="cm">,</span> <span class="i">%FastaDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 786   <span class="c"># The file name is the only argument; a reference to %FastaDataMap is returned.</span>
+ 787   <span class="s">(</span><span class="i">$FastaFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 788 
+ 789   <span class="i">%FastaDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 790   <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 791   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 792   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 793   <span class="i">%</span>{<span class="i">$FastaDataMap</span>{<span class="w">SequenceTypeCode</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 794   <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 795   <span class="i">$FastaDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;PIR&#39;</span><span class="sc">;</span>
+ 796 
+ 797   <span class="k">open</span> <span class="w">FASTAFILE</span><span class="cm">,</span> <span class="q">&quot;$FastaFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $FastaFileName: $!\n&quot;</span><span class="sc">;</span>
+ 798   <span class="i">$ID</span> = <span class="q">&#39;&#39;</span><span class="sc">;</span>
+ 799   <span class="i">$ReadingSequenceData</span> = <span class="n">0</span><span class="sc">;</span>
+ 800   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 801     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/^\&gt;/</span><span class="s">)</span> <span class="s">{</span>
+ 802       <span class="c"># Start of a new ID: strip &#39;&gt;&#39;, then split the rest of $Line into type code and ID at &#39;;&#39;...</span>
+ 803       <span class="i">$Line</span> =~ <span class="q">s/^\&gt;//</span><span class="sc">;</span>
+ 804       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 805       <span class="s">(</span><span class="i">$SequenceTypeCode</span><span class="cm">,</span> <span class="i">$ID</span><span class="s">)</span> = <span class="i">$Line</span> =~ <span class="q">/^(.*?)\;(.*?)$/</span><span class="sc">;</span>
+ 806 
+ 807       <span class="c"># Use next line to retrieve sequence description...</span>
+ 808       <span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*FASTAFILE</span><span class="s">)</span><span class="sc">;</span>
+ 809       <span class="i">$Line</span> = <span class="i">RemoveLeadingWhiteSpaces</span><span class="s">(</span><span class="i">$Line</span><span class="s">)</span><span class="sc">;</span>
+ 810       <span class="i">$Description</span> = <span class="i">$Line</span><span class="sc">;</span>
+ 811       <span class="c"># On a duplicate ID, stop reading so its sequence lines are not appended to the first record...</span>
+ 812       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 813         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
+ 814         <span class="i">$ReadingSequenceData</span> = <span class="n">0</span><span class="sc">;</span> <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 815       <span class="s">}</span>
+ 816       <span class="i">$ReadingSequenceData</span> = <span class="n">1</span><span class="sc">;</span>
+ 817       <span class="k">push</span> <span class="i">@</span>{<span class="i">$FastaDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 818       <span class="i">$FastaDataMap</span>{<span class="w">SequenceTypeCode</span>}{<span class="i">$ID</span>} = <span class="i">$SequenceTypeCode</span><span class="sc">;</span>
+ 819       <span class="i">$FastaDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$Description</span><span class="sc">;</span>
+ 820       <span class="i">$FastaDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 821       <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 822     <span class="s">}</span>
+ 823     <span class="k">if</span> <span class="s">(</span>!<span class="i">$ReadingSequenceData</span><span class="s">)</span> <span class="s">{</span> <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span> <span class="s">}</span>
+ 824 
+ 825     <span class="c"># Remove any spaces in the sequence...</span>
+ 826     <span class="i">$Line</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
+ 827     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/[\*]$/</span><span class="s">)</span> <span class="s">{</span>
+ 828       <span class="c"># End of sequence...</span>
+ 829       <span class="i">$ReadingSequenceData</span> = <span class="n">0</span><span class="sc">;</span>
+ 830       <span class="i">$Line</span> =~ <span class="q">s/[\*]$//</span><span class="sc">;</span>
+ 831     <span class="s">}</span>
+ 832     <span class="c"># Sequence data for active ID...</span>
+ 833     <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 834       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Line</span><span class="sc">;</span>
+ 835     <span class="s">}</span>
+ 836     <span class="k">else</span> <span class="s">{</span>
+ 837       <span class="i">$FastaDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Line</span><span class="sc">;</span>
+ 838     <span class="s">}</span>
+ 839   <span class="s">}</span>
+ 840   <span class="k">close</span> <span class="w">FASTAFILE</span><span class="sc">;</span>
+ 841   <span class="k">return</span> \<span class="i">%FastaDataMap</span><span class="sc">;</span>
+ 842 <span class="s">}</span>
+ 843 
+ 844 <span class="c"># Read MSF file and return a reference to hash with these keys:</span>
+ 845 <span class="c">#</span>
+ 846 <span class="c"># {IDs} : Array of IDs in order as they appear in file</span>
+ 847 <span class="c"># {Count}: ID count...</span>
+ 848 <span class="c"># {Description}{$ID} : Description data...</span>
+ 849 <span class="c"># {Sequence}{$ID} : Sequence data...</span>
+ 850 <span class="c">#</span>
+<a name="_ReadMSFFile-"></a> 851 <span class="k">sub </span><span class="m">_ReadMSFFile</span> <span class="s">{</span>
+ 852   <span class="k">my</span><span class="s">(</span><span class="i">$MSFFileName</span><span class="cm">,</span> <span class="i">$Line</span><span class="cm">,</span> <span class="i">%MSFDataMap</span><span class="s">)</span><span class="sc">;</span>
+ 853   <span class="c"># The file name is the only argument; a reference to %MSFDataMap is returned.</span>
+ 854   <span class="s">(</span><span class="i">$MSFFileName</span><span class="s">)</span> = <span class="i">@_</span><span class="sc">;</span>
+ 855 
+ 856   <span class="i">%MSFDataMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 857   <span class="i">@</span>{<span class="i">$MSFDataMap</span>{<span class="w">IDs</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 858   <span class="i">%</span>{<span class="i">$MSFDataMap</span>{<span class="w">Description</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 859   <span class="i">%</span>{<span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}} =<span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 860   <span class="i">$MSFDataMap</span>{<span class="w">Count</span>} = <span class="n">0</span><span class="sc">;</span>
+ 861   <span class="i">$MSFDataMap</span>{<span class="w">InputFileType</span>} = <span class="q">&#39;MSF&#39;</span><span class="sc">;</span>
+ 862 
+ 863   <span class="k">open</span> <span class="w">MSFFILE</span><span class="cm">,</span> <span class="q">&quot;$MSFFileName&quot;</span> <span class="k">or</span> <span class="k">die</span> <span class="q">&quot;Couldn&#39;t open $MSFFileName: $!\n&quot;</span><span class="sc">;</span>
+ 864 
+ 865   <span class="c"># Collect sequences and IDs...</span>
+ 866   <span class="c">#</span>
+ 867   <span class="c"># &#39;//&#39; after the name fields indicates end of header list and start of sequence data.</span>
+ 868   <span class="c">#</span>
+ 869   <span class="k">my</span><span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Len</span><span class="cm">,</span> <span class="i">$Check</span><span class="cm">,</span> <span class="i">$Weight</span><span class="cm">,</span> <span class="i">$Sequence</span><span class="cm">,</span> <span class="i">$NameFieldsFound</span><span class="cm">,</span> <span class="i">%MSFIDsMap</span><span class="s">)</span><span class="sc">;</span>
+ 870   <span class="i">%MSFIDsMap</span> = <span class="s">(</span><span class="s">)</span><span class="sc">;</span>
+ 871   <span class="i">$NameFieldsFound</span> = <span class="n">0</span><span class="sc">;</span>
+ 872   <span class="j">LINE:</span> <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 873     <span class="k">if</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/Name:/</span><span class="s">)</span> <span class="s">{</span>
+ 874       <span class="i">$NameFieldsFound</span>++<span class="sc">;</span>
+ 875       <span class="s">(</span><span class="i">$ID</span><span class="cm">,</span> <span class="i">$Len</span><span class="cm">,</span> <span class="i">$Check</span><span class="cm">,</span> <span class="i">$Weight</span><span class="s">)</span> = <span class="i">$Line</span> =~ <span class="q">/^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/</span><span class="sc">;</span>
+ 876       <span class="k">if</span> <span class="s">(</span><span class="i">$ID</span> =~ <span class="q">/ /</span><span class="s">)</span> <span class="s">{</span>
+ 877         <span class="s">(</span><span class="i">$ID</span><span class="s">)</span> = <span class="i">$ID</span> =~ <span class="q">/^(.*?)[ ]+/</span><span class="sc">;</span>
+ 878       <span class="s">}</span>
+ 879       <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$MSFIDsMap</span>{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 880         <span class="k">warn</span> <span class="q">&quot;Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n&quot;</span><span class="sc">;</span>
+ 881         <span class="k">next</span> <span class="j">LINE</span><span class="sc">;</span>
+ 882       <span class="s">}</span>
+ 883       <span class="i">$MSFIDsMap</span>{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
+ 884       <span class="k">push</span> <span class="i">@</span>{<span class="i">$MSFDataMap</span>{<span class="w">IDs</span>}}<span class="cm">,</span> <span class="i">$ID</span><span class="sc">;</span>
+ 885       <span class="i">$MSFDataMap</span>{<span class="w">Description</span>}{<span class="i">$ID</span>} = <span class="i">$ID</span><span class="sc">;</span>
+ 886       <span class="i">$MSFDataMap</span>{<span class="w">Count</span>} += <span class="n">1</span><span class="sc">;</span>
+ 887     <span class="s">}</span>
+ 888     <span class="k">elsif</span> <span class="s">(</span><span class="i">$Line</span> =~ <span class="q">/\/\//</span> &amp;&amp; <span class="i">$NameFieldsFound</span><span class="s">)</span> <span class="s">{</span>
+ 889       <span class="c"># End of header list; test $Line explicitly because this loop never assigns $_...</span>
+ 890       <span class="k">last</span> <span class="j">LINE</span><span class="sc">;</span>
+ 891     <span class="s">}</span>
+ 892   <span class="s">}</span>
+ 893   <span class="c"># Collect all sequences...</span>
+ 894   <span class="c">#</span>
+ 895   <span class="k">my</span><span class="s">(</span><span class="i">$FirstField</span><span class="cm">,</span> <span class="i">$SecondField</span><span class="s">)</span><span class="sc">;</span>
+ 896   <span class="k">while</span> <span class="s">(</span><span class="i">$Line</span> = <span class="i">GetTextLine</span><span class="s">(</span>\<span class="i">*MSFFILE</span><span class="s">)</span><span class="s">)</span> <span class="s">{</span>
+ 897     <span class="s">(</span><span class="i">$FirstField</span><span class="cm">,</span> <span class="i">$SecondField</span><span class="s">)</span> = <span class="i">$Line</span> =~ <span class="q">/^[ ]*(.*?)[ ]+(.*?)$/</span><span class="sc">;</span>
+ 898     <span class="k">if</span> <span class="s">(</span><span class="k">exists</span> <span class="i">$MSFIDsMap</span>{<span class="i">$FirstField</span>}<span class="s">)</span> <span class="s">{</span>
+ 899       <span class="c"># It&#39;s ID and sequence data...</span>
+ 900       <span class="i">$ID</span> = <span class="i">$FirstField</span><span class="sc">;</span>
+ 901       <span class="i">$Sequence</span> = <span class="i">$SecondField</span><span class="sc">;</span>
+ 902       <span class="c"># Take out spaces and leave the gap characters...</span>
+ 903       <span class="i">$Sequence</span> =~ <span class="q">s/ //g</span><span class="sc">;</span>
+ 904       <span class="k">if</span> <span class="s">(</span><span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>}<span class="s">)</span> <span class="s">{</span>
+ 905         <span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} .= <span class="i">$Sequence</span><span class="sc">;</span>
+ 906       <span class="s">}</span>
+ 907       <span class="k">else</span> <span class="s">{</span>
+ 908         <span class="i">$MSFDataMap</span>{<span class="w">Sequence</span>}{<span class="i">$ID</span>} = <span class="i">$Sequence</span><span class="sc">;</span>
+ 909       <span class="s">}</span>
+ 910     <span class="s">}</span>
+ 911   <span class="s">}</span>
+ 912 
+ 913   <span class="k">close</span> <span class="w">MSFFILE</span><span class="sc">;</span>
+ 914   <span class="k">return</span> \<span class="i">%MSFDataMap</span><span class="sc">;</span>
+ 915 <span class="s">}</span>
+ 916 
+ 917 
+<a name="EOF-"></a></pre>
+<p>&nbsp;</p>
+<br />
+<center>
+<img src="../../../images/h2o2.png" alt="">
+</center>
+</body>
+</html>