1 package SequenceFileUtil; 2 # 3 # $RCSfile: SequenceFileUtil.pm,v $ 4 # $Date: 2015/02/28 20:47:18 $ 5 # $Revision: 1.33 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Exporter; 31 use Text::ParseWords; 32 use TextUtil; 33 use FileUtil; 34 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 @ISA = qw(Exporter); 38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); 39 @EXPORT_OK = qw(); 40 41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 42 43 # Compare lengths of all sequences... 
#
# Parameters:
#   $SequencesDataRef - Reference to a hash containing {IDs} (array of sequence
#                       IDs) and {Sequence}{$ID} (sequence string for each ID).
#
# Returns 1 when every sequence has the same length as the first one (this
# includes the cases of zero or one sequence); otherwise, returns 0.
#
sub AreSequenceLengthsIdentical {
  my($SequencesDataRef) = @_;
  my($ID, $SeqLen, $FirstSeqLen, $FirstSeqSeen);

  # Track the first sequence using an explicit flag instead of the truth value
  # of its ID; otherwise, an ID such as '0' would never be recognized as first.
  $FirstSeqSeen = 0;

  for $ID (@{$SequencesDataRef->{IDs}}) {
    $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
    if (!$FirstSeqSeen) {
      $FirstSeqLen = $SeqLen;
      $FirstSeqSeen = 1;
      next;
    }
    if ($SeqLen != $FirstSeqLen) {
      # No need to look any further...
      return 0;
    }
  }
  return 1;
}

# Calculate percent identity between two sequences. By default, gaps are ignored.
#
# Parameters:
#   $Sequence1, $Sequence2 - Sequence strings to compare position by position.
#   $IgnoreGaps - Optional; default 1. When true, positions where either
#                 residue is not a letter are left out of the calculation.
#   $Precision  - Optional; default 1. Number of digits after the decimal point.
#
# Returns the percent identity formatted using the specified precision. An empty
# string is returned for an unsupported number of arguments or when either
# sequence is empty.
#
# Note: Only positions covered by the first sequence are compared; positions
# past the end of the second sequence are treated as an empty residue.
#
sub CalcuatePercentSequenceIdentity {
  my($Sequence1, $Sequence2, $IgnoreGaps, $Precision);

  if (@_ < 2 || @_ > 4) {
    return '';
  }
  ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;

  # Defaults apply only to the arguments which were not passed at all...
  if (@_ < 3) { $IgnoreGaps = 1; }
  if (@_ < 4) { $Precision = 1; }

  if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
    return '';
  }

  my($Index, $Identity, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);

  $Sequence2Len = length($Sequence2);

  $ResMatchCount = 0;
  $ResCount = 0;
  RESIDUE: for $Index (0 .. (length($Sequence1) - 1)) {
    $Residue1 = substr($Sequence1, $Index, 1);
    $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
    if ($IgnoreGaps && ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i)) {
      next RESIDUE;
    }
    if ($Residue1 eq $Residue2) {
      $ResMatchCount++;
    }
    $ResCount++;
  }

  # Avoid division by zero when no position qualified for comparison...
  $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;

  return sprintf("%.${Precision}f", ($Identity * 100));
}

# Calculate pairwise identity matrix for all the sequences and return a reference
# to a hash with the following keys:
#
# {IDs} - Sequence IDs
# {Count} - Number of IDs
# {PercentIdentity}{$RowID}{$ColID} - Percent identity for a pair of sequences
#
# Parameters:
#   $SequencesDataRef - Reference to a hash containing {IDs} and {Sequence}{$ID}.
#   $IgnoreGaps - Optional; default 1. Passed to CalcuatePercentSequenceIdentity.
#   $Precision  - Optional; default 1. Passed to CalcuatePercentSequenceIdentity.
#
sub CalculatePercentSequenceIdentityMatrix {
  my($SequencesDataRef, $IgnoreGaps, $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, %IdentityMatrixData);

  $IgnoreGaps = 1;
  $Precision = 1;
  if (@_ == 3) {
    ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
  }
  elsif (@_ == 2) {
    ($SequencesDataRef, $IgnoreGaps) = @_;
  }
  else {
    ($SequencesDataRef) = @_;
  }

  %IdentityMatrixData = ();
  @{$IdentityMatrixData{IDs}} = ();
  %{$IdentityMatrixData{PercentIdentity}} = ();
  $IdentityMatrixData{Count} = 0;

  for $ID (@{$SequencesDataRef->{IDs}}) {
    push @{$IdentityMatrixData{IDs}}, $ID;
    $IdentityMatrixData{Count} += 1;
  }

  # Calculate percent identity data values. All values are stored under the
  # PercentIdentity key: writing directly to $IdentityMatrixData{$RowID} would
  # add stray top-level keys and could overwrite IDs, Count or PercentIdentity
  # for sequences using those names as IDs.
  for $RowID (@{$SequencesDataRef->{IDs}}) {
    %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
    $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
    for $ColID (@{$SequencesDataRef->{IDs}}) {
      $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
      $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
    }
  }
  return \%IdentityMatrixData;
}

# Retrieve information about shortest sequence...
#
# Parameters:
#   $SequencesDataRef - Reference to sequences data hash.
#   $IgnoreGaps - Optional; used only when exactly two arguments are passed.
#                 Default: 1.
#
# Returns a list: (ID, Sequence, SequenceLength, Description).
#
sub GetShortestSequence {
  my($SequencesDataRef) = @_;
  my($IgnoreGaps, $ShortestID, $ShortestSeq, $ShortestSeqLen, $ShortestDescription);

  $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  ($ShortestID, $ShortestSeq, $ShortestSeqLen, $ShortestDescription) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);

  return ($ShortestID, $ShortestSeq, $ShortestSeqLen, $ShortestDescription);
}

# Retrieve information about longest sequence...
#
# Parameters:
#   $SequencesDataRef - Reference to sequences data hash.
#   $IgnoreGaps - Optional; used only when exactly two arguments are passed.
#                 Default: 1.
#
# Returns a list: (ID, Sequence, SequenceLength, Description).
#
sub GetLongestSequence {
  my($SequencesDataRef) = @_;
  my($IgnoreGaps, $LongestID, $LongestSeq, $LongestSeqLen, $LongestDescription);

  $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  ($LongestID, $LongestSeq, $LongestSeqLen, $LongestDescription) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);

  return ($LongestID, $LongestSeq, $LongestSeqLen, $LongestDescription);
}

# Get sequence length...
#
# Parameters:
#   $Seq - Sequence string.
#   $IgnoreGaps - Optional; used only when exactly two arguments are passed.
#                 Default: 1. When true, only letters are counted.
#
# Returns number of letters in the sequence or, with gaps included, its
# string length.
#
sub GetSequenceLength {
  my($Seq) = @_;
  my($IgnoreGaps, $ResidueCount);

  $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  if (!$IgnoreGaps) {
    return length($Seq);
  }

  # Count each letter; anything else is considered to be a gap...
  $ResidueCount = 0;
  while ($Seq =~ /[A-Z]/ig) {
    $ResidueCount++;
  }

  return $ResidueCount;
}

# Is it a gap residue...
#
# Returns 1 for anything that doesn't contain a letter; otherwise, returns 0.
#
sub IsGapResidue {
  my($Residue) = @_;

  return ($Residue =~ /[A-Z]/i) ? 0 : 1;
}

# Is it a supported sequence file?
#
# Supported sequence formats are:
#
# ALN/ClustalW   .aln
# GCG/MSF        .msf
# PILEUP/MSF     .msf
# Fasta(Pearson) .fasta, .fta
# NBRF/PIR       .pir
#
# The format is detected from the file contents using the Is<Format>SequenceFile
# functions; the file extension is not examined here.
#
# Returns a list: ($Status, $SequenceFormat). $Status is 1 along with a
# $SequenceFormat value of ClustalW, Pearson, PIR or MSF for supported files;
# otherwise, the list (0, 'NotSupported') is returned.
#
sub IsSupportedSequenceFile {
  my($SequenceFile) = @_;

  # The order of checks matters: the first matching format wins...
  if (IsClustalWSequenceFile($SequenceFile)) {
    return (1, 'ClustalW');
  }
  if (IsPearsonFastaSequenceFile($SequenceFile)) {
    return (1, 'Pearson');
  }
  if (IsPIRFastaSequenceFile($SequenceFile)) {
    return (1, 'PIR');
  }
  if (IsMSFSequenceFile($SequenceFile)) {
    return (1, 'MSF');
  }
  return (0, 'NotSupported');
}

# Is it a ClustalW multiple sequence alignment file...
#
# Returns 1 when the first line retrieved by GetTextLine contains ClustalW,
# Clustal W or Clustal (case insensitive); otherwise, returns 0. Dies when the
# file can't be opened.
#
sub IsClustalWSequenceFile {
  my($SequenceFile) = @_;
  my($Line, $SequenceFileHandle);

  # Use three-argument open and a lexical file handle: mode or pipe characters
  # in the file name are never interpreted and no global handle is shared. The
  # name is stringified to make an undefined name fail like a missing file
  # instead of opening an anonymous temporary file.
  open $SequenceFileHandle, '<', "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
  $Line = GetTextLine($SequenceFileHandle);
  close $SequenceFileHandle;

  # No line is available for an empty file...
  if (!defined($Line)) {
    return 0;
  }
  return ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
}

# Is it a valid Pearson fasta sequence or alignment file?
#
# Returns 1 when the first line retrieved by GetTextLine starts with > and
# its fourth character is not ';'; otherwise, returns 0. Dies when the file
# can't be opened.
#
sub IsPearsonFastaSequenceFile {
  my($FastaFile) = @_;
  my($Line, $Status, $FastaFileHandle);

  $Status = 0;

  # See IsClustalWSequenceFile for notes about the open call...
  open $FastaFileHandle, '<', "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
  $Line = GetTextLine($FastaFileHandle);
  close $FastaFileHandle;

  # First line starts with > and the fourth character is not ';'; otherwise, it's
  # PIR FASTA format. A header line with less than four characters can't be PIR.
  if (defined($Line) && $Line =~ /^>/) {
    $Status = (length($Line) > 3 && substr($Line, 3, 1) eq ';') ? 0 : 1;
  }

  return $Status;
}

# Is it a valid NBRF/PIR fasta sequence or alignment file?
#
# Returns 1 when the first line retrieved by GetTextLine starts with > and
# its fourth character is ';'; otherwise, returns 0. Dies when the file can't
# be opened.
#
sub IsPIRFastaSequenceFile {
  my($FastaFile) = @_;
  my($Line, $Status, $FastaFileHandle);

  $Status = 0;

  # Use three-argument open and a lexical file handle: mode or pipe characters
  # in the file name are never interpreted and no global handle is shared. The
  # name is stringified to make an undefined name fail like a missing file
  # instead of opening an anonymous temporary file.
  open $FastaFileHandle, '<', "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
  $Line = GetTextLine($FastaFileHandle);
  close $FastaFileHandle;

  # First line starts with > and the fourth character is ';'; otherwise, it's
  # a Pearson FASTA format.
  if (defined($Line) && $Line =~ /^>/) {
    $Status = (length($Line) > 3 && substr($Line, 3, 1) eq ';') ? 1 : 0;
  }

  return $Status;
}

# Is it a valid MSF sequence or alignment file?
#
# Returns 1 as soon as a line identifying MSF or PILEUP format is found;
# otherwise, returns 0. Dies when the file can't be opened.
#
sub IsMSFSequenceFile {
  my($MSFFile) = @_;
  my($Line, $Status, $MSFFileHandle);

  # See IsPIRFastaSequenceFile for notes about the open call...
  open $MSFFileHandle, '<', "$MSFFile" or die "Couldn't open $MSFFile: $!\n";

  $Status = 0;
  LINE: while ($Line = GetTextLine($MSFFileHandle)) {
    $Line = RemoveLeadingWhiteSpaces($Line);
    if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
      # A line containing MSF: keyword and ending with '..'
      $Status = 1;
      last LINE;
    }
    if ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
      # Pileup MSF...
      $Status = 1;
      last LINE;
    }
  }
  close $MSFFileHandle;

  return $Status;
}

# Read sequence or sequence alignment file...
#
# The file format is detected by trying Pearson FASTA, PIR FASTA, MSF and
# ClustalW in that order; the file is then read by the matching reader.
#
# Returns whatever the matching reader returns or undef for a file in none of
# the supported formats.
#
sub ReadSequenceFile {
  my($SequenceFile) = @_;

  if (IsPearsonFastaSequenceFile($SequenceFile)) {
    return ReadPearsonFastaSequenceFile($SequenceFile);
  }
  if (IsPIRFastaSequenceFile($SequenceFile)) {
    return ReadPIRFastaSequenceFile($SequenceFile);
  }
  if (IsMSFSequenceFile($SequenceFile)) {
    return ReadMSFSequenceFile($SequenceFile);
  }
  if (IsClustalWSequenceFile($SequenceFile)) {
    return ReadClustalWSequenceFile($SequenceFile);
  }

  # An explicit undef is kept on purpose: callers in list context have always
  # received a one element list for unsupported files.
  return undef;
}

# Read file and setup alignment data...
#
# Returns sequences data for a ClustalW file as set up by
# _ReadFileAndSetupSequencesData.
#
sub ReadClustalWSequenceFile {
  my($SequenceFile) = @_;

  return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
}

# Read file and setup alignment data...
#
# Returns sequences data for a Pearson FASTA file as set up by
# _ReadFileAndSetupSequencesData.
#
sub ReadPearsonFastaSequenceFile {
  my($SequenceFile) = @_;

  return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
}

# Read file and setup alignment data...
#
# Returns sequences data for a PIR FASTA file as set up by
# _ReadFileAndSetupSequencesData.
#
sub ReadPIRFastaSequenceFile {
  my($SequenceFile) = @_;

  return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
}


# Read file and setup sequence data...
#
# Returns sequences data for a MSF file as set up by
# _ReadFileAndSetupSequencesData.
#
sub ReadMSFSequenceFile {
  my($SequenceFile) = @_;

  return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
}

# Write out a Pearson FASTA file...
#
# Parameters:
#   $SequenceFileName - Name of the file to write; any existing file is overwritten.
#   $SequenceDataRef  - Reference to a hash containing {IDs}, {Description}{$ID}
#                       and {Sequence}{$ID}.
#   $MaxLength        - Optional; default 80. Passed to WrapText as the line
#                       length for sequence data.
#
# Dies when the file can't be opened, written to or closed.
#
sub WritePearsonFastaSequenceFile {
  my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence, $SequenceFileHandle);

  $MaxLength = 80;
  if (@_ == 3) {
    ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
  }
  elsif (@_ == 2) {
    ($SequenceFileName, $SequenceDataRef) = @_;
  }

  # Use three-argument open and a lexical file handle: mode or pipe characters
  # in the file name are never interpreted and no global handle is shared. The
  # name is stringified to make an undefined name fail here instead of opening
  # an anonymous temporary file.
  open $SequenceFileHandle, '>', "$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
  for $ID (@{$SequenceDataRef->{IDs}}) {
    $Description = $SequenceDataRef->{Description}{$ID};
    $Sequence = $SequenceDataRef->{Sequence}{$ID};
    $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");

    # Description also contains ID...
    print {$SequenceFileHandle} ">$Description\n" or die "Couldn't write to $SequenceFileName: $!\n";
    print {$SequenceFileHandle} "$WrappedSequence\n" or die "Couldn't write to $SequenceFileName: $!\n";
  }
  # Errors for buffered output may not show up until the file is closed...
  close $SequenceFileHandle or die "Couldn't close $SequenceFileName: $!\n";
}

# Get ID, Sequence, Length and Description for shortest or longest sequence
#
# Parameters:
#   $SequencesDataRef - Reference to a hash containing {IDs}, {Sequence}{$ID}
#                       and {Description}{$ID}.
#   $SequenceType     - 'Shortest' or 'Longest' (case insensitive match).
#   $IgnoreGaps       - When true, gaps are removed before measuring sequences
#                       and the returned sequence doesn't contain any gaps.
#
# Returns a list: ($ID, $Seq, $SeqLen, $Description). The first sequence wins
# a tie. For empty data, the list is ('', '', '', undef).
#
sub _GetShortestOrLongestSequence {
  my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
  my($ID, $Seq, $SeqLen, $Description, $FirstSeqSeen, $UseCurrentSeq, $CurrentID, $CurrentSeq, $CurrentSeqLen);

  ($ID, $Seq, $SeqLen) = ('', '', '');

  # An explicit flag marks the first sequence. The flag used to be an ID
  # variable copied from a value which hadn't been set yet: as a result, the
  # second sequence replaced the first one without any length comparison.
  $FirstSeqSeen = 0;

  for $CurrentID (@{$SequencesDataRef->{IDs}}) {
    $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};

    # Gaps have already been taken out when these are to be ignored: the string
    # length is the sequence length in both cases.
    $CurrentSeqLen = length($CurrentSeq);

    $UseCurrentSeq = 0;
    if (!$FirstSeqSeen) {
      $FirstSeqSeen = 1;
      $UseCurrentSeq = 1;
    }
    elsif (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
      $UseCurrentSeq = 1;
    }
    elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen)) {
      $UseCurrentSeq = 1;
    }

    if ($UseCurrentSeq) {
      ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $SequencesDataRef->{Description}{$CurrentID});
    }
  }
  return ($ID, $Seq, $SeqLen, $Description);
}

# Remove gaps in the sequence and return new sequence...
#
# Any character other than a letter is considered to be a gap. An empty string
# is returned for an empty or undefined sequence.
#
sub RemoveSequenceGaps {
  my($Seq) = @_;

  if (!defined($Seq)) {
    return '';
  }
  # Collect all the letters in their original order...
  return join('', $Seq =~ /[A-Z]/ig);
}

# Using input alignment data map ref containing following keys, generate
# a new hash with same set of keys after residue columns containing only
# gaps have been removed:
#
# {IDs} : Array of IDs in order as they appear in file
# {Count}: ID count...
# {Description}{$ID} : Description data...
# {Sequence}{$ID} : Sequence data...
#
# The input data is not modified. Only the columns covered by the first
# sequence are processed. Returns a reference to the new hash.
#
sub RemoveSequenceAlignmentGapColumns {
  my($AlignmentDataMapRef) = @_;
  my($ID, %NewAlignmentDataMap);

  %NewAlignmentDataMap = ();
  @{$NewAlignmentDataMap{IDs}} =();
  %{$NewAlignmentDataMap{Description}} =();
  %{$NewAlignmentDataMap{Sequence}} =();
  $NewAlignmentDataMap{Count} = 0;

  # Transfer ID and count information...
  for $ID (@{$AlignmentDataMapRef->{IDs}}) {
    push @{$NewAlignmentDataMap{IDs}}, $ID;
    $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
    $NewAlignmentDataMap{Sequence}{$ID} = '';
    $NewAlignmentDataMap{Count} += 1;
  }

  # Go over residue columns and transfer the data...
  my($FirstID, $FirstSeqLen, $Index, $GapColumn);

  $FirstID = $AlignmentDataMapRef->{IDs}[0];
  $FirstSeqLen = length($AlignmentDataMapRef->{Sequence}{$FirstID});

  RES: for $Index (0 .. ($FirstSeqLen - 1)) {
    # Is this a gap column? A letter in any sequence makes it a residue column.
    $GapColumn = 1;
    ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
      if (substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1) =~ /[A-Z]/i) {
        $GapColumn = 0;
        last ID;
      }
    }
    if ($GapColumn) {
      next RES;
    }
    # Transfer this column: gaps in individual sequences are kept...
    for $ID (@{$AlignmentDataMapRef->{IDs}}) {
      $NewAlignmentDataMap{Sequence}{$ID} .= substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
    }
  }

  return (\%NewAlignmentDataMap);
}

#
# Read sequences file and return a reference to hash with the following keys:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format
# {ConservedAnnotation} - Conserved residue annotation
#
# Note:
#   . Conserved residue annotation either exist in the input sequence alignment file or set
#     for a file containing same number of residues for all the sequence using the following
#     notation: * - Residue conserved; ' ' - Residue not conserved.
#   . An empty string is returned for an unknown sequence type.
#
sub _ReadFileAndSetupSequencesData {
  my($SequenceFile, $SequenceType) = @_;
  my($SequenceDataMapRef);

  # Read sequence file...
  if ($SequenceType =~ /^ClustalW$/i) {
    $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^Pearson$/i) {
    $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^PIR$/i) {
    $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^MSF$/i) {
    $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
  }
  else {
    return '';
  }

  # Annotation supplied by the reader is used as it is...
  if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
    return ($SequenceDataMapRef);
  }
  # Annotation makes sense only for multiple sequences of the same length...
  if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
    return ($SequenceDataMapRef);
  }

  # Use the first sequence to setup ConservedAnnotation key and mark fully
  # conserved residues. The reference residue for a column always comes from
  # the first sequence: it used to be picked by truth value which allowed a
  # residue of '0' to be skipped over.
  #
  my($FirstID, $FirstSequence, $ID, $FirstRes, $ResConserved, $Index, @OtherIDs);

  ($FirstID, @OtherIDs) = @{$SequenceDataMapRef->{IDs}};
  $FirstSequence = $SequenceDataMapRef->{Sequence}{$FirstID};

  $SequenceDataMapRef->{ConservedAnnotation} = '';
  for $Index (0 .. (length($FirstSequence) - 1)) {
    $FirstRes = substr($FirstSequence, $Index, 1);

    # Columns containing gaps are never conserved...
    $ResConserved = ($FirstRes =~ /[A-Z]/i) ? 1 : 0;
    if ($ResConserved) {
      ID: for $ID (@OtherIDs) {
        if (substr($SequenceDataMapRef->{Sequence}{$ID}, $Index, 1) ne $FirstRes) {
          $ResConserved = 0;
          last ID;
        }
      }
    }
    $SequenceDataMapRef->{ConservedAnnotation} .= $ResConserved ? '*' : ' ';
  }

  return ($SequenceDataMapRef);
}

# Read sequence data in ClustalW multiple sequence alignment file and
# return a reference to hash with these keys and values:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format
# {ConservedAnnotation} - Conserved residue annotations: space, *, : , .
#
#
#
# And based on ClustalW/X manual, here is what the ConservedAnnotations mean:
#
# '*' indicates positions which have a single, fully conserved residue
#
# ':' indicates that one of the following 'strong' groups is fully conserved: STA
#     NEQK NHQK NDEQ QHRK MILV MILF HY FYW
#
# '.' indicates that one of the following 'weaker' groups is fully conserved:
#     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
#
# These are all the positively scoring groups that occur in the Gonnet Pam250
# matrix.
# The strong and weak groups are defined as strong score >0.5 and weak
# score =<0.5 respectively.
#
# Reads a ClustalW multiple sequence alignment file and returns a reference to
# a hash containing IDs, Count, Description, Sequence, InputFileType and
# ConservedAnnotation keys. The first line of the file is skipped as header.
# Dies when the file can't be opened.
#
sub _ReadClustalWFile {
  my($SequenceFile) = @_;
  my(%SequencesDataMap);

  # Initialize data...
  %SequencesDataMap = ();
  @{$SequencesDataMap{IDs}} = ();
  %{$SequencesDataMap{Description}} = ();
  %{$SequencesDataMap{Sequence}} = ();
  $SequencesDataMap{Count} = 0;
  $SequencesDataMap{ConservedAnnotation} = '';
  $SequencesDataMap{InputFileType} = 'ClustalW';

  # Use three-argument open and a lexical file handle: mode or pipe characters
  # in the file name are never interpreted and no global handle is shared. The
  # name is stringified to make an undefined name fail like a missing file
  # instead of opening an anonymous temporary file.
  my($SequenceFileHandle);
  open $SequenceFileHandle, '<', "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";

  my($Line, $Sequence, $SequenceLength, $ID);

  # Ignore the header line...
  $Line = <$SequenceFileHandle>;

  LINE: while ($Line = GetTextLine($SequenceFileHandle)) {
    if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
      # Annotation for sequences: fully conserved, weaker or stronger group conserved.
      # Extract it and save: the annotation is taken to be the last
      # $SequenceLength characters of the line where $SequenceLength corresponds
      # to the sequence line read just before it.
      # NOTE(review): this assumes trailing spaces of the annotation line are
      # still present in the line returned by GetTextLine - confirm.
      $SequencesDataMap{ConservedAnnotation} .= substr($Line, length($Line) - $SequenceLength, $SequenceLength);
      next LINE;
    }

    # Extract ID and sequences: trailing digits correspond to optional residue count...
    ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
    if (defined($Sequence)) {
      $Sequence =~ s/ //g;
    }
    if (!($ID && $Sequence)) {
      next LINE;
    }

    # Length of the current sequence piece is used for the annotation line...
    $SequenceLength = length($Sequence);

    if (exists $SequencesDataMap{Sequence}{$ID}) {
      # Append to existing alignment value...
      $SequencesDataMap{Sequence}{$ID} .= $Sequence;
    }
    else {
      # New alignment data...
      $SequencesDataMap{Count} += 1;
      push @{$SequencesDataMap{IDs}}, $ID;
      $SequencesDataMap{Description}{$ID} = $ID;
      $SequencesDataMap{Sequence}{$ID} = $Sequence;
    }
  }
  close $SequenceFileHandle;

  return (\%SequencesDataMap);
}

# Read Pearson fasta file and return a reference to hash with these keys:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format
#
# The ID is the first word of the header line after >; the description is the
# complete header line without >. Data for an ID already read along with its
# sequence is ignored with a warning. Dies when the file can't be opened.
#
sub _ReadPearsonFastaFile {
  my($FastaFileName) = @_;
  my($ID, $Line, $IgnoreID, $FastaFileHandle, @LineWords, %FastaDataMap);

  %FastaDataMap = ();
  @{$FastaDataMap{IDs}} =();
  %{$FastaDataMap{Description}} =();
  %{$FastaDataMap{Sequence}} =();
  $FastaDataMap{Count} = 0;
  $FastaDataMap{InputFileType} = 'Pearson';

  # See _ReadClustalWFile for notes about the open call...
  open $FastaFileHandle, '<', "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";

  $ID = '';
  $IgnoreID = 0;
  LINE: while ($Line = GetTextLine($FastaFileHandle)) {
    if ($Line =~ /^\>/) {
      # Start of a new ID...
      $Line =~ s/^\>//;
      $Line = RemoveLeadingWhiteSpaces($Line);
      @LineWords = split / /, $Line;

      $ID = $LineWords[0];
      $ID =~ s/ //g;

      $IgnoreID = 0;
      if (exists $FastaDataMap{Sequence}{$ID}) {
        $IgnoreID = 1;
        warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      push @{$FastaDataMap{IDs}}, $ID;
      $FastaDataMap{Description}{$ID} = $Line;
      $FastaDataMap{Count} += 1;
      next LINE;
    }
    if ($IgnoreID) { next LINE; }

    # Remove any spaces in the sequence...
    $Line =~ s/ //g;

    # Sequence data for active ID: the entry is created by the first append...
    $FastaDataMap{Sequence}{$ID} .= $Line;
  }
  close $FastaFileHandle;

  return \%FastaDataMap;
}

# Read PIR fasta file and return a reference to hash with these keys:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format
# {ConservedAnnotation} - Conserved residue annotation
#
# Format:
# A sequence in PIR format consists of:
# One line starting with
# a ">" (greater-than) sign, followed by
# a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
# a semicolon, followed by
# the sequence identification code (the database ID-code).
# One line containing a textual description of the sequence.
# One or more lines containing the sequence itself. The end of the
# sequence is marked by a "*" (asterisk) character.
#
# A file in PIR format may comprise more than one sequence.
#
# The PIR format is also often referred to as the NBRF format.
#
# Code SequenceType
# P1 Protein (complete)
# F1 Protein (fragment)
# DL DNA (linear)
# DC DNA (circular)
# RL RNA (linear)
# RC RNA (circular)
# N3 tRNA
# N1 Other functional RNA
#
# Reads a PIR fasta file and returns a reference to a hash containing IDs,
# Count, Description, Sequence, SequenceTypeCode and InputFileType keys. Records
# without an ID or with an ID already read along with its sequence are ignored
# with a warning. Dies when the file can't be opened.
#

sub _ReadPIRFastaFile {
  my($FastaFileName) = @_;
  my($ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, $FastaFileHandle, %FastaDataMap);

  %FastaDataMap = ();
  @{$FastaDataMap{IDs}} =();
  %{$FastaDataMap{Description}} =();
  %{$FastaDataMap{Sequence}} =();
  %{$FastaDataMap{SequenceTypeCode}} =();
  $FastaDataMap{Count} = 0;
  $FastaDataMap{InputFileType} = 'PIR';

  # Use three-argument open and a lexical file handle: mode or pipe characters
  # in the file name are never interpreted and no global handle is shared. The
  # name is stringified to make an undefined name fail like a missing file
  # instead of opening an anonymous temporary file.
  open $FastaFileHandle, '<', "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";

  $ID = '';
  $ReadingSequenceData = 0;
  LINE: while ($Line = GetTextLine($FastaFileHandle)) {
    if ($Line =~ /^\>/) {
      # Start of a new ID...
      $Line =~ s/^\>//;
      $Line = RemoveLeadingWhiteSpaces($Line);

      # Header line is now TypeCode;ID. The match must be bound to $Line: it
      # used to be applied to $_ and, in addition, to expect the > which has
      # already been taken out. As a result, ID and type code were never set.
      ($SequenceTypeCode, $ID) = $Line =~ /^(.*?)\;(.*?)$/;

      # Use next line to retrieve sequence description...
      $Line = GetTextLine($FastaFileHandle);
      $Line = RemoveLeadingWhiteSpaces($Line);
      $Description = $Line;

      # Sequence lines are skipped until a valid new record is set up below;
      # otherwise, data for an ignored record could end up in another sequence.
      $ReadingSequenceData = 0;

      if (!(defined($ID) && length($ID))) {
        warn "Warning: Couldn't find ID in PIR Fasta file header line. Ignoring sequence data...\n";
        $ID = '';
        next LINE;
      }
      if (exists $FastaDataMap{Sequence}{$ID}) {
        warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      $ReadingSequenceData = 1;
      push @{$FastaDataMap{IDs}}, $ID;
      $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
      $FastaDataMap{Description}{$ID} = $Description;
      $FastaDataMap{Count} += 1;
      next LINE;
    }
    if (!$ReadingSequenceData) { next LINE; }

    # Remove any spaces in the sequence...
    $Line =~ s/ //g;
    if ($Line =~ /[\*]$/) {
      # End of sequence...
      $ReadingSequenceData = 0;
      $Line =~ s/[\*]$//;
    }

    # Sequence data for active ID: the entry is created by the first append...
    $FastaDataMap{Sequence}{$ID} .= $Line;
  }
  close $FastaFileHandle;

  return \%FastaDataMap;
}

# Read MSF file and return a reference to hash with these keys:
#
# {IDs} : Array of IDs in order as they appear in file
# {Count}: ID count...
# {Description}{$ID} : Description data...
# {Sequence}{$ID} : Sequence data...
# {InputFileType} : Sequence file format
#
# The ID is the first word of the Name: field and is also used as the
# description. Dies when the file can't be opened.
#
sub _ReadMSFFile {
  my($MSFFileName) = @_;
  my($Line, $MSFFileHandle, %MSFDataMap);

  %MSFDataMap = ();
  @{$MSFDataMap{IDs}} =();
  %{$MSFDataMap{Description}} =();
  %{$MSFDataMap{Sequence}} =();
  $MSFDataMap{Count} = 0;
  $MSFDataMap{InputFileType} = 'MSF';

  # See _ReadPIRFastaFile for notes about the open call...
  open $MSFFileHandle, '<', "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";

  # Collect sequences and IDs...
  #
  # '//' after the name fields indicates end of header list and start of sequence data.
  #
  my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
  %MSFIDsMap = ();
  $NameFieldsFound = 0;
  LINE: while ($Line = GetTextLine($MSFFileHandle)) {
    if ($Line =~ /Name:/) {
      $NameFieldsFound++;
      ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
      if (!defined($ID)) {
        # Not a complete name field line: nothing to set up...
        next LINE;
      }
      if ($ID =~ / /) {
        # Use only the first word as ID...
        ($ID) = $ID =~ /^(.*?)[ ]+/
      }
      if (exists $MSFIDsMap{$ID}) {
        warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      $MSFIDsMap{$ID} = $ID;
      push @{$MSFDataMap{IDs}}, $ID;
      $MSFDataMap{Description}{$ID} = $ID;
      $MSFDataMap{Count} += 1;
    }
    elsif ($Line =~ /\/\// && $NameFieldsFound) {
      # End of header list. The match must be bound to $Line: it used to be
      # applied to $_ which made this loop go through the whole file and left
      # no sequence data for the loop below.
      last LINE;
    }
  }

  # Collect all sequences...
  #
  my($FirstField, $SecondField);
  while ($Line = GetTextLine($MSFFileHandle)) {
    ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
    if (!(defined($FirstField) && exists $MSFIDsMap{$FirstField})) {
      # Lines which don't start with a known ID are not sequence data...
      next;
    }
    # It's ID and sequence data...
    $ID = $FirstField;
    $Sequence = $SecondField;

    # Take out spaces and leave the gap characters...
    $Sequence =~ s/ //g;

    # The entry is created by the first append...
    $MSFDataMap{Sequence}{$ID} .= $Sequence;
  }

  close $MSFFileHandle;

  return \%MSFDataMap;
}
