MayaChemTools

   1 package SequenceFileUtil;
   2 #
   3 # $RCSfile: SequenceFileUtil.pm,v $
   4 # $Date: 2015/02/28 20:47:18 $
   5 # $Revision: 1.33 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Exporter;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use FileUtil;
  34 
  35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
  39 @EXPORT_OK = qw();
  40 
  41 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  42 
  43 # Compare lengths of all sequences...
  44 sub AreSequenceLengthsIdentical {
  45   my($SequencesDataRef) = @_;
  46   my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
  47 
  48   $Status = 1;
  49   $FirstID = '';
  50   $FirstDifferentLenID = '';
  51 
  52   ID: for $ID (@{$SequencesDataRef->{IDs}}) {
  53     if (!$FirstID) {
  54       $FirstID = $ID;
  55       $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
  56       next ID;
  57     }
  58     $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
  59     if ($SeqLen != $FirstSeqLen) {
  60       $Status = 0;
  61       $FirstDifferentLenID = $ID;
  62       last ID;
  63     }
  64   }
  65   return ($Status);
  66 }
  67 
  68 # Calculate percent identity between two sequences. By default, gaps are ignored.
  69 sub CalcuatePercentSequenceIdentity {
  70   my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
  71 
  72   $PercentIdentity = '';
  73   $Precision = 1;
  74   $IgnoreGaps = 1;
  75   if (@_ == 4) {
  76     ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
  77   }
  78   elsif (@_ == 3) {
  79     ($Sequence1, $Sequence2, $IgnoreGaps) = @_;
  80   }
  81   elsif (@_ == 2) {
  82     ($Sequence1, $Sequence2) = @_;
  83   }
  84   else {
  85     return $PercentIdentity;
  86   }
  87   if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
  88     return $PercentIdentity;
  89   }
  90   my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
  91 
  92   $Sequence1Len = length($Sequence1);
  93   $Sequence2Len = length($Sequence2);
  94 
  95   $ResMatchCount = 0;
  96   $ResCount = 0;
  97   RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
  98     $Residue1 = substr($Sequence1, $Index, 1);
  99     $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
 100     if ($IgnoreGaps) {
 101       if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
 102         next RESIDUE;
 103       }
 104     }
 105     if ($Residue1 eq $Residue2) {
 106       $ResMatchCount++;
 107     }
 108     $ResCount++;
 109   }
 110   $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
 111   $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
 112 
 113   return $PercentIdentity;
 114 }
 115 
 116 # Calculate pairwise identify matrix for all the sequences and return a reference
 117 # to a hash with the following keys:
 118 #
 119 # {IDs} - Sequence IDs
 120 # {Count} - Number of IDs
 121 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
 122 #
 123 sub CalculatePercentSequenceIdentityMatrix {
 124   my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
 125 
 126   $IgnoreGaps = 1;
 127   $Precision = 1;
 128   if (@_ == 3) {
 129     ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
 130   }
 131   elsif (@_ == 2) {
 132     ($SequencesDataRef, $IgnoreGaps) = @_;
 133   }
 134   else {
 135     ($SequencesDataRef) = @_;
 136   }
 137 
 138   %IdentityMatrixData = ();
 139   @{$IdentityMatrixData{IDs}} = ();
 140   %{$IdentityMatrixData{PercentIdentity}} = ();
 141   $IdentityMatrixData{Count} = 0;
 142 
 143   for $ID (@{$SequencesDataRef->{IDs}}) {
 144     push @{$IdentityMatrixData{IDs}}, $ID;
 145     $IdentityMatrixData{Count} += 1;
 146   }
 147   # Initialize and calculate percent identity data values...
 148   for $RowID (@{$SequencesDataRef->{IDs}}) {
 149     %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
 150     $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
 151     for $ColID (@{$SequencesDataRef->{IDs}}) {
 152       $IdentityMatrixData{$RowID}{$ColID} = '';
 153       $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
 154       $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
 155       $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
 156     }
 157   }
 158   return \%IdentityMatrixData;
 159 }
 160 
 161 # Retrieve information about shortest sequence...
 162 sub GetShortestSequence {
 163   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 164 
 165   $IgnoreGaps = 1;
 166   if (@_ == 2) {
 167     ($SequencesDataRef, $IgnoreGaps) = @_;
 168   }
 169   else {
 170     ($SequencesDataRef) = @_;
 171   }
 172 
 173   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
 174   return ($ID, $Sequence, $SeqLen, $Description);
 175 }
 176 
 177 # Retrieve information about longest sequence..
 178 sub GetLongestSequence {
 179   my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
 180 
 181   $IgnoreGaps = 1;
 182   if (@_ == 2) {
 183     ($SequencesDataRef, $IgnoreGaps) = @_;
 184   }
 185   else {
 186     ($SequencesDataRef) = @_;
 187   }
 188 
 189   ($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
 190   return ($ID, $Sequence, $SeqLen, $Description);
 191 }
 192 
 193 # Get sequence length...
 194 sub GetSequenceLength {
 195   my($Seq, $SeqLen, $IgnoreGaps);
 196 
 197   $SeqLen = ''; $IgnoreGaps = 1;
 198   if (@_ == 2) {
 199     ($Seq, $IgnoreGaps) = @_;
 200   }
 201   else {
 202     ($Seq) = @_;
 203   }
 204   if ($IgnoreGaps) {
 205     my($Index, $Residue);
 206     $SeqLen = 0;
 207     for $Index (0 .. (length($Seq) - 1)) {
 208       $Residue = substr($Seq, $Index, 1);
 209       if ($Residue =~ /[A-Z]/i) {
 210         $SeqLen++;
 211       }
 212     }
 213   }
 214   else {
 215     $SeqLen = length($Seq);
 216   }
 217 
 218   return $SeqLen;
 219 }
 220 
 221 # Is it a gap residue...
 222 sub IsGapResidue {
 223   my($Residue) = @_;
 224   my($Status);
 225 
 226   $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
 227 
 228   return $Status;
 229 }
 230 
 231 # Is it a supported sequence file?
 232 #
 233 # Supported seqence formats are:
 234 #
 235 # ALN/ClustalW   .aln
 236 # GCG/MSF         .msf
 237 # PILEUP/MSF     .msf
 238 # Fasts(Pearson) .fasta, .fta
 239 # NBRF/PIR         .pir
 240 #
 241 sub IsSupportedSequenceFile {
 242   my($SequenceFile) = @_;
 243   my($Status, $SequenceFormat);
 244   $Status = 0; $SequenceFormat = 'NotSupported';
 245 
 246   SEQFORMAT: {
 247       if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
 248       if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
 249       if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
 250       if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
 251       $Status = 0; $SequenceFormat = 'NotSupported';
 252   }
 253   return ($Status, $SequenceFormat);
 254 }
 255 
 256 # Is it a ClustalW multiple sequence sequence file...
 257 sub IsClustalWSequenceFile {
 258   my($SequenceFile) = @_;
 259   my($Status, $Line);
 260 
 261   $Status = 0;
 262 
 263   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 264   $Line = GetTextLine(\*SEQUENCEFILE);
 265   $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
 266   close SEQUENCEFILE;
 267 
 268   return $Status;
 269 }
 270 
 271 # Is it a valid Pearson fasta sequence or alignment file?
 272 #
 273 sub IsPearsonFastaSequenceFile {
 274   my($FastaFile, $Line, $Status);
 275 
 276   ($FastaFile) = @_;
 277   $Status = 0;
 278 
 279   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 280   $Line = GetTextLine(\*FASTAFILE);
 281 
 282   # First line starts with > and the fourth character is not ';'; otherwise, it's
 283   # PIR FASTA format.
 284   if ($Line =~ /^>/) {
 285     my($FourthChar);
 286     $FourthChar = substr($Line, 3, 1);
 287     $Status = ($FourthChar !~ /\;/) ? 1 : 0;
 288   }
 289   close FASTAFILE;
 290 
 291   return $Status;
 292 }
 293 
 294 # Is it a valid NBRF/PIR fasta sequence or alignment file?
 295 #
 296 sub IsPIRFastaSequenceFile {
 297   my($FastaFile, $Line, $Status);
 298 
 299   ($FastaFile) = @_;
 300   $Status = 0;
 301 
 302   open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
 303   $Line = GetTextLine(\*FASTAFILE);
 304 
 305   # First line starts with > and the fourth character is ';'; otherwise, it's
 306   # a Pearson FASTA format.
 307   if ($Line =~ /^>/) {
 308     my($FourthChar);
 309     $FourthChar = substr($Line, 3, 1);
 310     $Status = ($FourthChar =~ /\;/) ? 1 : 0;
 311   }
 312   close FASTAFILE;
 313 
 314   return $Status;
 315 }
 316 
 317 # Is it a valid MSF sequence or alignment file?
 318 #
 319 sub IsMSFSequenceFile {
 320   my($MSFFile) = @_;
 321 
 322   open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
 323 
 324   my($Line, $Status);
 325 
 326   $Status = 0;
 327   # Find a line that contains MSF: keyword and ends with '..'
 328   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 329     $Line = RemoveLeadingWhiteSpaces($Line);
 330     if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
 331       $Status = 1;
 332       last LINE;
 333     }
 334     elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
 335       # Pileup MSF...
 336       $Status = 1;
 337       last LINE;
 338     }
 339   }
 340   close MSFFILE;
 341 
 342   return $Status;
 343 }
 344 
 345 # Read sequence or sequence alignment file...
 346 sub ReadSequenceFile {
 347   my($SequenceFile) = @_;
 348 
 349   if (IsPearsonFastaSequenceFile($SequenceFile)) {
 350     return ReadPearsonFastaSequenceFile($SequenceFile);
 351   }
 352   elsif (IsPIRFastaSequenceFile($SequenceFile)) {
 353     return ReadPIRFastaSequenceFile($SequenceFile);
 354   }
 355   elsif (IsMSFSequenceFile($SequenceFile)) {
 356     return ReadMSFSequenceFile($SequenceFile);
 357   }
 358   elsif (IsClustalWSequenceFile($SequenceFile)) {
 359     return ReadClustalWSequenceFile($SequenceFile);
 360   }
 361   else {
 362     return undef;
 363   }
 364 }
 365 
 366 # Read file and setup alignment data...
 367 sub ReadClustalWSequenceFile {
 368   my($SequenceFile) = @_;
 369 
 370   return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
 371 }
 372 
 373 # Read file and setup alignment data...
 374 sub ReadPearsonFastaSequenceFile {
 375   my($SequenceFile) = @_;
 376 
 377   return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
 378 }
 379 
 380 # Read file and setup alignment data...
 381 sub ReadPIRFastaSequenceFile {
 382   my($SequenceFile) = @_;
 383 
 384   return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
 385 }
 386 
 387 
 388 # Read file and setup sequence data...
 389 sub ReadMSFSequenceFile {
 390   my($SequenceFile) = @_;
 391 
 392   return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
 393 }
 394 
 395 # Write out a Pearson FASTA file...
 396 sub WritePearsonFastaSequenceFile {
 397   my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
 398 
 399   $MaxLength = 80;
 400   if (@_ == 3) {
 401     ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
 402   }
 403   elsif (@_ == 2) {
 404     ($SequenceFileName, $SequenceDataRef) = @_;
 405   }
 406   open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
 407   for $ID (@{$SequenceDataRef->{IDs}}) {
 408     $Description = $SequenceDataRef->{Description}{$ID};
 409     $Sequence = $SequenceDataRef->{Sequence}{$ID};
 410     $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
 411 
 412     # Description also contains ID...
 413     print SEQUENCEFILE ">$Description\n";
 414     print SEQUENCEFILE "$WrappedSequence\n";
 415   }
 416   close SEQUENCEFILE;
 417 }
 418 
 419 # Get ID, Sequence and Length for smallest or longest sequence
 420 sub _GetShortestOrLongestSequence {
 421   my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
 422   my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 423 
 424   ($ID, $Seq, $SeqLen) = ('', '', '');
 425   $FirstID = '';
 426 
 427   ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
 428     $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
 429     $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
 430     $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
 431     if (!$FirstID) {
 432       $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
 433       ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 434       next ID;
 435     }
 436     if ($CurrentSeqLen != $SeqLen) {
 437       if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
 438         ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 439       }
 440       elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
 441         ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
 442       }
 443     }
 444   }
 445   return ($ID, $Seq, $SeqLen, $Description);
 446 }
 447 
 448 # Remove gaps in the sequence and return new sequence...
 449 sub RemoveSequenceGaps {
 450   my($Seq) = @_;
 451   my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
 452 
 453   $SeqWithoutGaps = '';
 454   $SeqLen = length($Seq);
 455   for $Index (0 .. ($SeqLen - 1)) {
 456     $Residue = substr($Seq, $Index, 1);
 457     if ($Residue =~ /[A-Z]/i) {
 458       $SeqWithoutGaps .= $Residue;
 459     }
 460   }
 461 
 462   return $SeqWithoutGaps;
 463 }
 464 
 465 # Using input alignment data map ref containing following keys, generate
 466 # a new hash with same set of keys after residue columns containg only
 467 # gaps have been removed:
 468 #
 469 # {IDs} : Array of IDs in order as they appear in file
 470 # {Count}: ID count...
 471 # {Description}{$ID} : Description data...
 472 # {Sequence}{$ID} : Sequence data...
 473 #
 474 sub RemoveSequenceAlignmentGapColumns {
 475   my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
 476 
 477   ($AlignmentDataMapRef) = @_;
 478 
 479   %NewAlignmentDataMap = ();
 480   @{$NewAlignmentDataMap{IDs}} =();
 481   %{$NewAlignmentDataMap{Description}} =();
 482   %{$NewAlignmentDataMap{Sequence}} =();
 483   $NewAlignmentDataMap{Count} = 0;
 484 
 485   # Transfer ID and count information...
 486   for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 487     push @{$NewAlignmentDataMap{IDs}}, $ID;
 488     $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
 489     $NewAlignmentDataMap{Sequence}{$ID} = '';
 490     $NewAlignmentDataMap{Count} += 1;
 491   }
 492 
 493   # Go over residue columns and transfer the data...
 494   my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
 495 
 496   $FirstID = $AlignmentDataMapRef->{IDs}[0];
 497   $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
 498   $FirstSeqLen = length($FirstSeq);
 499 
 500   RES: for $Index (0 .. ($FirstSeqLen - 1)) {
 501     # Is this a gap column?
 502     $GapColumn = 1;
 503     ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 504       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 505       if ($Res =~ /[A-Z]/i) {
 506         $GapColumn = 0;
 507         last ID;
 508       }
 509     }
 510     if ($GapColumn) {
 511       next RES;
 512     }
 513     # Transfer this residue...
 514     for $ID (@{$AlignmentDataMapRef->{IDs}}) {
 515       $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
 516       $NewAlignmentDataMap{Sequence}{$ID} .= $Res;
 517     }
 518   }
 519 
 520   return (\%NewAlignmentDataMap);
 521 }
 522 
 523 #
 524 # Read sequences file and return a reference to hash with the following keys:
 525 #
 526 # {IDs} - Array of sequence IDs
 527 # {Count} - Number of sequences
 528 # {Description}{$ID} - Sequence description
 529 # {Sequence}{$ID} - Sequence for a specific ID
 530 # {InputFileType} - Sequence file format
 531 # {ConservedAnnotation} - Conserved residue annonation
 532 #
 533 # Note:
 534 #   . Conserved residue annotation either exist in the input sequence alignment file or set
 535 #     for a file containing same number of residues for all the sequence using the following
 536 #     notation: * - Residue conserved; ' ' - Residue not conserved.
 537 #
 538 sub _ReadFileAndSetupSequencesData {
 539   my($SequenceFile, $SequenceType) = @_;
 540   my($SequenceDataMapRef);
 541 
 542   $SequenceDataMapRef = undef;
 543 
 544   # Read sequence file...
 545   $SequenceDataMapRef = '';
 546   if ($SequenceType =~ /^ClustalW$/i) {
 547     $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
 548   }
 549   elsif ($SequenceType =~ /^Pearson$/i) {
 550     $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
 551   }
 552   elsif ($SequenceType =~ /^PIR$/i) {
 553     $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
 554   }
 555   elsif ($SequenceType =~ /^MSF$/i) {
 556     $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
 557   }
 558   else {
 559     return $SequenceDataMapRef;
 560   }
 561 
 562   if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
 563     return ($SequenceDataMapRef);
 564   }
 565   if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
 566     return ($SequenceDataMapRef);
 567   }
 568 
 569   # Use the first sequence to setup an empty ConservedAnnotation key...
 570   # And mark fully conserved residues...
 571   #
 572   my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
 573   $ID = $SequenceDataMapRef->{IDs}[0];
 574   $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
 575   $FirstSeqLen = length($FirstSequence);
 576   $SequenceDataMapRef->{ConservedAnnotation} = '';
 577   for $Index (0 .. ($FirstSeqLen - 1)) {
 578     $FirstRes = '';
 579     $ResConserved = 1;
 580     ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
 581       $Sequence = $SequenceDataMapRef->{Sequence}{$ID};
 582       $Res = substr($Sequence, $Index, 1);
 583       if (!$FirstRes) {
 584         $FirstRes = $Res;
 585         next ID;
 586       }
 587       if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
 588         $ResConserved = 0;
 589         last ID;
 590       }
 591     }
 592     if ($ResConserved) {
 593       $SequenceDataMapRef->{ConservedAnnotation} .= '*';
 594     }
 595     else {
 596       $SequenceDataMapRef->{ConservedAnnotation} .= ' ';
 597     }
 598   }
 599 
 600   return ($SequenceDataMapRef);
 601 }
 602 
 603 # Read sequence data in ClustalW multiple sequence alignment file and
 604 # return a reference to hash with these keys and values:
 605 #
 606 # {IDs} - Array of sequence IDs
 607 # {Count} - Number of sequences
 608 # {Description}{$ID} - Sequence description
 609 # {Sequence}{$ID} - Sequence for a specific ID
 610 # {InputFileType} - Sequence file format
 611 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
 612 #
 613 #
 614 #
 615 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
 616 #
 617 # '*' indicates positions which have a single, fully conserved residue
 618 #
 619 # ':' indicates that one of the following 'strong' groups is fully conserved: STA
 620 #    NEQK NHQK NDEQ QHRK MILV MILF HY FYW
 621 
 622 # '.' indicates that one of the following 'weaker' groups is fully conserved:
 623 #     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
 624 #
 625 # These are all the positively scoring groups that occur in the Gonnet Pam250
 626 # matrix. The strong and weak groups are defined as strong score >0.5 and weak
 627 # score =<0.5 respectively.
 628 #
 629 sub _ReadClustalWFile {
 630   my($SequenceFile) = @_;
 631   my(%SequencesDataMap);
 632 
 633   # Initialize data...
 634   %SequencesDataMap = ();
 635   @{$SequencesDataMap{IDs}} = ();
 636   %{$SequencesDataMap{Description}} = ();
 637   %{$SequencesDataMap{Sequence}} = ();
 638   $SequencesDataMap{Count} = 0;
 639   $SequencesDataMap{ConservedAnnotation} = '';
 640   $SequencesDataMap{InputFileType} = 'ClustalW';
 641 
 642   open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
 643 
 644   my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
 645 
 646   # Ignore the header line...
 647   $Line = <SEQUENCEFILE>;
 648 
 649   LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
 650     if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
 651       # Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
 652       # Extract it and save...
 653       $LineLength = length($Line);
 654       $AnnotationStart = $LineLength - $SequenceLength;
 655       $AnnotationLength = $SequenceLength;
 656       $Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
 657       $SequencesDataMap{ConservedAnnotation} .= $Annotation;
 658     }
 659     else {
 660       # Extract ID and sequences...
 661       ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
 662       $Sequence =~ s/ //g;
 663       if (!($ID && $Sequence)) {
 664         next LINE;
 665       }
 666 
 667       if (exists $SequencesDataMap{Sequence}{$ID}) {
 668         # Append to existing alignment value...
 669         $SequenceLength = length($Sequence);
 670         $SequencesDataMap{Sequence}{$ID} .= $Sequence;
 671       }
 672       else {
 673         # New alignment data...
 674         $SequencesDataMap{Count} += 1;
 675         push @{$SequencesDataMap{IDs}}, $ID;
 676         $SequencesDataMap{Description}{$ID} = $ID;
 677         $SequencesDataMap{Sequence}{$ID} = $Sequence;
 678         $SequenceLength = length($Sequence);
 679       }
 680     }
 681   }
 682   close SEQUENCEFILE;
 683   return (\%SequencesDataMap);
 684 }
 685 
 686 # Read Pearson fasta file and return a reference to hash with these keys:
 687 #
 688 # {IDs} - Array of sequence IDs
 689 # {Count} - Number of sequences
 690 # {Description}{$ID} - Sequence description
 691 # {Sequence}{$ID} - Sequence for a specific ID
 692 # {InputFileType} - Sequence file format
 693 # {ConservedAnnotation} - Conserved residue annonation
 694 #
 695 sub _ReadPearsonFastaFile {
 696   my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
 697 
 698   ($FastaFileName) = @_;
 699 
 700   %FastaDataMap = ();
 701   @{$FastaDataMap{IDs}} =();
 702   %{$FastaDataMap{Description}} =();
 703   %{$FastaDataMap{Sequence}} =();
 704   $FastaDataMap{Count} = 0;
 705   $FastaDataMap{InputFileType} = 'Pearson';
 706 
 707   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 708   $ID = '';
 709   $IgnoreID = 0;
 710   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 711     if ($Line =~ /^\>/) {
 712       # Start of a new ID...
 713       $Line =~ s/^\>//;
 714       $Line = RemoveLeadingWhiteSpaces($Line);
 715       @LineWords = ();
 716       @LineWords = split / /, $Line;
 717 
 718       $ID = $LineWords[0];
 719       $ID =~ s/ //g;
 720       $Description = $Line;
 721 
 722       $IgnoreID = 0;
 723       if (exists $FastaDataMap{Sequence}{$ID}) {
 724         $IgnoreID = 1;
 725         warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 726         next LINE;
 727       }
 728       push @{$FastaDataMap{IDs}}, $ID;
 729       $FastaDataMap{Description}{$ID} = $Description;
 730       $FastaDataMap{Count} += 1;
 731       next LINE;
 732     }
 733     if ($IgnoreID) { next LINE; }
 734 
 735     # Remove any spaces in the sequence...
 736     $Line =~ s/ //g;
 737     # Sequence data for active ID...
 738     if (exists $FastaDataMap{Sequence}{$ID}) {
 739       $FastaDataMap{Sequence}{$ID} .= $Line;
 740     }
 741     else {
 742       $FastaDataMap{Sequence}{$ID} = $Line;
 743     }
 744   }
 745   close FASTAFILE;
 746   return \%FastaDataMap;
 747 }
 748 
 749 # Read PIR fasta file and return a reference to hash with these keys:
 750 #
 751 # {IDs} - Array of sequence IDs
 752 # {Count} - Number of sequences
 753 # {Description}{$ID} - Sequence description
 754 # {Sequence}{$ID} - Sequence for a specific ID
 755 # {InputFileType} - Sequence file format
 756 # {ConservedAnnotation} - Conserved residue annonation
 757 #
 758 # Format:
 759 # A sequence in PIR format consists of:
 760 # One line starting with
 761 #   a ">" (greater-than) sign, followed by
 762 #   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
 763 #   a semicolon, followed by
 764 #   the sequence identification code (the database ID-code).
 765 # One line containing a textual description of the sequence.
 766 # One or more lines containing the sequence itself. The end of the
 767 # sequence is marked by a "*" (asterisk) character.
 768 #
 769 # A file in PIR format may comprise more than one sequence.
 770 #
 771 # The PIR format is also often referred to as the NBRF format.
 772 #
 773 # Code SequenceType
 774 # P1    Protein (complete)
 775 # F1    Protein (fragment)
 776 # DL    DNA (linear)
 777 # DC    DNA (circular)
 778 # RL    RNA (linear)
 779 # RC   RNA (circular)
 780 # N3    tRNA
 781 # N1    Other functional RNA
 782 #
 783 
 784 sub _ReadPIRFastaFile {
 785   my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
 786 
 787   ($FastaFileName) = @_;
 788 
 789   %FastaDataMap = ();
 790   @{$FastaDataMap{IDs}} =();
 791   %{$FastaDataMap{Description}} =();
 792   %{$FastaDataMap{Sequence}} =();
 793   %{$FastaDataMap{SequenceTypeCode}} =();
 794   $FastaDataMap{Count} = 0;
 795   $FastaDataMap{InputFileType} = 'PIR';
 796 
 797   open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
 798   $ID = '';
 799   $ReadingSequenceData = 0;
 800   LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
 801     if ($Line =~ /^\>/) {
 802       # Start of a new ID...
 803       $Line =~ s/^\>//;
 804       $Line = RemoveLeadingWhiteSpaces($Line);
 805       ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
 806 
 807       # Use next line to retrieve sequence description...
 808       $Line = GetTextLine(\*FASTAFILE);
 809       $Line = RemoveLeadingWhiteSpaces($Line);
 810       $Description = $Line;
 811 
 812       if (exists $FastaDataMap{Sequence}{$ID}) {
 813         warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
 814         next LINE;
 815       }
 816       $ReadingSequenceData = 1;
 817       push @{$FastaDataMap{IDs}}, $ID;
 818       $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
 819       $FastaDataMap{Description}{$ID} = $Description;
 820       $FastaDataMap{Count} += 1;
 821       next LINE;
 822     }
 823     if (!$ReadingSequenceData) { next LINE; }
 824 
 825     # Remove any spaces in the sequence...
 826     $Line =~ s/ //g;
 827     if ($Line =~ /[\*]$/) {
 828       # End of sequence...
 829       $ReadingSequenceData = 0;
 830       $Line =~ s/[\*]$//;
 831     }
 832     # Sequence data for active ID...
 833     if (exists $FastaDataMap{Sequence}{$ID}) {
 834       $FastaDataMap{Sequence}{$ID} .= $Line;
 835     }
 836     else {
 837       $FastaDataMap{Sequence}{$ID} = $Line;
 838     }
 839   }
 840   close FASTAFILE;
 841   return \%FastaDataMap;
 842 }
 843 
 844 # Read MSF file and return a reference to hash with these keys:
 845 #
 846 # {IDs} : Array of IDs in order as they appear in file
 847 # {Count}: ID count...
 848 # {Description}{$ID} : Description data...
 849 # {Sequence}{$ID} : Sequence data...
 850 #
 851 sub _ReadMSFFile {
 852   my($MSFFileName, $Line, @LineWords, %MSFDataMap);
 853 
 854   ($MSFFileName) = @_;
 855 
 856   %MSFDataMap = ();
 857   @{$MSFDataMap{IDs}} =();
 858   %{$MSFDataMap{Description}} =();
 859   %{$MSFDataMap{Sequence}} =();
 860   $MSFDataMap{Count} = 0;
 861   $MSFDataMap{InputFileType} = 'MSF';
 862 
 863   open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
 864 
 865   # Collect sequences and IDs...
 866   #
 867   # '//' after the name fields indicates end of header list and start of sequence data.
 868   #
 869   my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
 870   %MSFIDsMap = ();
 871   $NameFieldsFound = 0;
 872   LINE: while ($Line = GetTextLine(\*MSFFILE)) {
 873     if ($Line =~ /Name:/) {
 874       $NameFieldsFound++;
 875       ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
 876       if ($ID =~ / /) {
 877         ($ID) = $ID =~ /^(.*?)[ ]+/
 878       }
 879       if (exists $MSFIDsMap{$ID}) {
 880         warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
 881         next LINE;
 882       }
 883       $MSFIDsMap{$ID} = $ID;
 884       push @{$MSFDataMap{IDs}}, $ID;
 885       $MSFDataMap{Description}{$ID} = $ID;
 886       $MSFDataMap{Count} += 1;
 887     }
 888     elsif ( /\/\// && $NameFieldsFound) {
 889       # End of header list...
 890       last LINE;
 891     }
 892   }
 893   # Collect all sequences...
 894   #
 895   my($FirstField, $SecondField);
 896   while ($Line = GetTextLine(\*MSFFILE)) {
 897     ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
 898     if (exists $MSFIDsMap{$FirstField}) {
 899       # It's ID and sequence data...
 900       $ID = $FirstField;
 901       $Sequence = $SecondField;
 902       # Take out spaces and leave the gap characters...
 903       $Sequence =~ s/ //g;
 904       if ($MSFDataMap{Sequence}{$ID}) {
 905         $MSFDataMap{Sequence}{$ID} .= $Sequence;
 906       }
 907       else {
 908         $MSFDataMap{Sequence}{$ID} = $Sequence;
 909       }
 910     }
 911   }
 912 
 913   close MSFFILE;
 914   return \%MSFDataMap;
 915 }
 916 
 917