Mercurial > repos > deepakjadmin > mayatool3_test2
comparison lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 package SequenceFileUtil; | |
| 2 # | |
| 3 # $RCSfile: SequenceFileUtil.pm,v $ | |
| 4 # $Date: 2015/02/28 20:47:18 $ | |
| 5 # $Revision: 1.33 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use Exporter; | |
| 31 use Text::ParseWords; | |
| 32 use TextUtil; | |
| 33 use FileUtil; | |
| 34 | |
| 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |
| 36 | |
| 37 @ISA = qw(Exporter); | |
| 38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); | |
| 39 @EXPORT_OK = qw(); | |
| 40 | |
| 41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); | |
| 42 | |
| 43 # Compare lengths of all sequences... | |
| 44 sub AreSequenceLengthsIdentical { | |
| 45 my($SequencesDataRef) = @_; | |
| 46 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen); | |
| 47 | |
| 48 $Status = 1; | |
| 49 $FirstID = ''; | |
| 50 $FirstDifferentLenID = ''; | |
| 51 | |
| 52 ID: for $ID (@{$SequencesDataRef->{IDs}}) { | |
| 53 if (!$FirstID) { | |
| 54 $FirstID = $ID; | |
| 55 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID}); | |
| 56 next ID; | |
| 57 } | |
| 58 $SeqLen = length($SequencesDataRef->{Sequence}{$ID}); | |
| 59 if ($SeqLen != $FirstSeqLen) { | |
| 60 $Status = 0; | |
| 61 $FirstDifferentLenID = $ID; | |
| 62 last ID; | |
| 63 } | |
| 64 } | |
| 65 return ($Status); | |
| 66 } | |
| 67 | |
| 68 # Calculate percent identity between two sequences. By default, gaps are ignored. | |
| 69 sub CalcuatePercentSequenceIdentity { | |
| 70 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision); | |
| 71 | |
| 72 $PercentIdentity = ''; | |
| 73 $Precision = 1; | |
| 74 $IgnoreGaps = 1; | |
| 75 if (@_ == 4) { | |
| 76 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_; | |
| 77 } | |
| 78 elsif (@_ == 3) { | |
| 79 ($Sequence1, $Sequence2, $IgnoreGaps) = @_; | |
| 80 } | |
| 81 elsif (@_ == 2) { | |
| 82 ($Sequence1, $Sequence2) = @_; | |
| 83 } | |
| 84 else { | |
| 85 return $PercentIdentity; | |
| 86 } | |
| 87 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) { | |
| 88 return $PercentIdentity; | |
| 89 } | |
| 90 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount); | |
| 91 | |
| 92 $Sequence1Len = length($Sequence1); | |
| 93 $Sequence2Len = length($Sequence2); | |
| 94 | |
| 95 $ResMatchCount = 0; | |
| 96 $ResCount = 0; | |
| 97 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) { | |
| 98 $Residue1 = substr($Sequence1, $Index, 1); | |
| 99 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : ''; | |
| 100 if ($IgnoreGaps) { | |
| 101 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) { | |
| 102 next RESIDUE; | |
| 103 } | |
| 104 } | |
| 105 if ($Residue1 eq $Residue2) { | |
| 106 $ResMatchCount++; | |
| 107 } | |
| 108 $ResCount++; | |
| 109 } | |
| 110 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0; | |
| 111 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100)); | |
| 112 | |
| 113 return $PercentIdentity; | |
| 114 } | |
| 115 | |
| 116 # Calculate pairwise identify matrix for all the sequences and return a reference | |
| 117 # to a hash with the following keys: | |
| 118 # | |
| 119 # {IDs} - Sequence IDs | |
| 120 # {Count} - Number of IDs | |
| 121 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences | |
| 122 # | |
| 123 sub CalculatePercentSequenceIdentityMatrix { | |
| 124 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData); | |
| 125 | |
| 126 $IgnoreGaps = 1; | |
| 127 $Precision = 1; | |
| 128 if (@_ == 3) { | |
| 129 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_; | |
| 130 } | |
| 131 elsif (@_ == 2) { | |
| 132 ($SequencesDataRef, $IgnoreGaps) = @_; | |
| 133 } | |
| 134 else { | |
| 135 ($SequencesDataRef) = @_; | |
| 136 } | |
| 137 | |
| 138 %IdentityMatrixData = (); | |
| 139 @{$IdentityMatrixData{IDs}} = (); | |
| 140 %{$IdentityMatrixData{PercentIdentity}} = (); | |
| 141 $IdentityMatrixData{Count} = 0; | |
| 142 | |
| 143 for $ID (@{$SequencesDataRef->{IDs}}) { | |
| 144 push @{$IdentityMatrixData{IDs}}, $ID; | |
| 145 $IdentityMatrixData{Count} += 1; | |
| 146 } | |
| 147 # Initialize and calculate percent identity data values... | |
| 148 for $RowID (@{$SequencesDataRef->{IDs}}) { | |
| 149 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = (); | |
| 150 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID}; | |
| 151 for $ColID (@{$SequencesDataRef->{IDs}}) { | |
| 152 $IdentityMatrixData{$RowID}{$ColID} = ''; | |
| 153 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID}; | |
| 154 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision); | |
| 155 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity; | |
| 156 } | |
| 157 } | |
| 158 return \%IdentityMatrixData; | |
| 159 } | |
| 160 | |
| 161 # Retrieve information about shortest sequence... | |
| 162 sub GetShortestSequence { | |
| 163 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); | |
| 164 | |
| 165 $IgnoreGaps = 1; | |
| 166 if (@_ == 2) { | |
| 167 ($SequencesDataRef, $IgnoreGaps) = @_; | |
| 168 } | |
| 169 else { | |
| 170 ($SequencesDataRef) = @_; | |
| 171 } | |
| 172 | |
| 173 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps); | |
| 174 return ($ID, $Sequence, $SeqLen, $Description); | |
| 175 } | |
| 176 | |
| 177 # Retrieve information about longest sequence.. | |
| 178 sub GetLongestSequence { | |
| 179 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); | |
| 180 | |
| 181 $IgnoreGaps = 1; | |
| 182 if (@_ == 2) { | |
| 183 ($SequencesDataRef, $IgnoreGaps) = @_; | |
| 184 } | |
| 185 else { | |
| 186 ($SequencesDataRef) = @_; | |
| 187 } | |
| 188 | |
| 189 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps); | |
| 190 return ($ID, $Sequence, $SeqLen, $Description); | |
| 191 } | |
| 192 | |
| 193 # Get sequence length... | |
| 194 sub GetSequenceLength { | |
| 195 my($Seq, $SeqLen, $IgnoreGaps); | |
| 196 | |
| 197 $SeqLen = ''; $IgnoreGaps = 1; | |
| 198 if (@_ == 2) { | |
| 199 ($Seq, $IgnoreGaps) = @_; | |
| 200 } | |
| 201 else { | |
| 202 ($Seq) = @_; | |
| 203 } | |
| 204 if ($IgnoreGaps) { | |
| 205 my($Index, $Residue); | |
| 206 $SeqLen = 0; | |
| 207 for $Index (0 .. (length($Seq) - 1)) { | |
| 208 $Residue = substr($Seq, $Index, 1); | |
| 209 if ($Residue =~ /[A-Z]/i) { | |
| 210 $SeqLen++; | |
| 211 } | |
| 212 } | |
| 213 } | |
| 214 else { | |
| 215 $SeqLen = length($Seq); | |
| 216 } | |
| 217 | |
| 218 return $SeqLen; | |
| 219 } | |
| 220 | |
| 221 # Is it a gap residue... | |
| 222 sub IsGapResidue { | |
| 223 my($Residue) = @_; | |
| 224 my($Status); | |
| 225 | |
| 226 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0; | |
| 227 | |
| 228 return $Status; | |
| 229 } | |
| 230 | |
| 231 # Is it a supported sequence file? | |
| 232 # | |
| 233 # Supported seqence formats are: | |
| 234 # | |
| 235 # ALN/ClustalW .aln | |
| 236 # GCG/MSF .msf | |
| 237 # PILEUP/MSF .msf | |
| 238 # Fasts(Pearson) .fasta, .fta | |
| 239 # NBRF/PIR .pir | |
| 240 # | |
| 241 sub IsSupportedSequenceFile { | |
| 242 my($SequenceFile) = @_; | |
| 243 my($Status, $SequenceFormat); | |
| 244 $Status = 0; $SequenceFormat = 'NotSupported'; | |
| 245 | |
| 246 SEQFORMAT: { | |
| 247 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT} | |
| 248 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT} | |
| 249 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT} | |
| 250 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT} | |
| 251 $Status = 0; $SequenceFormat = 'NotSupported'; | |
| 252 } | |
| 253 return ($Status, $SequenceFormat); | |
| 254 } | |
| 255 | |
| 256 # Is it a ClustalW multiple sequence sequence file... | |
| 257 sub IsClustalWSequenceFile { | |
| 258 my($SequenceFile) = @_; | |
| 259 my($Status, $Line); | |
| 260 | |
| 261 $Status = 0; | |
| 262 | |
| 263 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; | |
| 264 $Line = GetTextLine(\*SEQUENCEFILE); | |
| 265 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0; | |
| 266 close SEQUENCEFILE; | |
| 267 | |
| 268 return $Status; | |
| 269 } | |
| 270 | |
| 271 # Is it a valid Pearson fasta sequence or alignment file? | |
| 272 # | |
| 273 sub IsPearsonFastaSequenceFile { | |
| 274 my($FastaFile, $Line, $Status); | |
| 275 | |
| 276 ($FastaFile) = @_; | |
| 277 $Status = 0; | |
| 278 | |
| 279 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; | |
| 280 $Line = GetTextLine(\*FASTAFILE); | |
| 281 | |
| 282 # First line starts with > and the fourth character is not ';'; otherwise, it's | |
| 283 # PIR FASTA format. | |
| 284 if ($Line =~ /^>/) { | |
| 285 my($FourthChar); | |
| 286 $FourthChar = substr($Line, 3, 1); | |
| 287 $Status = ($FourthChar !~ /\;/) ? 1 : 0; | |
| 288 } | |
| 289 close FASTAFILE; | |
| 290 | |
| 291 return $Status; | |
| 292 } | |
| 293 | |
| 294 # Is it a valid NBRF/PIR fasta sequence or alignment file? | |
| 295 # | |
| 296 sub IsPIRFastaSequenceFile { | |
| 297 my($FastaFile, $Line, $Status); | |
| 298 | |
| 299 ($FastaFile) = @_; | |
| 300 $Status = 0; | |
| 301 | |
| 302 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; | |
| 303 $Line = GetTextLine(\*FASTAFILE); | |
| 304 | |
| 305 # First line starts with > and the fourth character is ';'; otherwise, it's | |
| 306 # a Pearson FASTA format. | |
| 307 if ($Line =~ /^>/) { | |
| 308 my($FourthChar); | |
| 309 $FourthChar = substr($Line, 3, 1); | |
| 310 $Status = ($FourthChar =~ /\;/) ? 1 : 0; | |
| 311 } | |
| 312 close FASTAFILE; | |
| 313 | |
| 314 return $Status; | |
| 315 } | |
| 316 | |
| 317 # Is it a valid MSF sequence or alignment file? | |
| 318 # | |
| 319 sub IsMSFSequenceFile { | |
| 320 my($MSFFile) = @_; | |
| 321 | |
| 322 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n"; | |
| 323 | |
| 324 my($Line, $Status); | |
| 325 | |
| 326 $Status = 0; | |
| 327 # Find a line that contains MSF: keyword and ends with '..' | |
| 328 LINE: while ($Line = GetTextLine(\*MSFFILE)) { | |
| 329 $Line = RemoveLeadingWhiteSpaces($Line); | |
| 330 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) { | |
| 331 $Status = 1; | |
| 332 last LINE; | |
| 333 } | |
| 334 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) { | |
| 335 # Pileup MSF... | |
| 336 $Status = 1; | |
| 337 last LINE; | |
| 338 } | |
| 339 } | |
| 340 close MSFFILE; | |
| 341 | |
| 342 return $Status; | |
| 343 } | |
| 344 | |
| 345 # Read sequence or sequence alignment file... | |
| 346 sub ReadSequenceFile { | |
| 347 my($SequenceFile) = @_; | |
| 348 | |
| 349 if (IsPearsonFastaSequenceFile($SequenceFile)) { | |
| 350 return ReadPearsonFastaSequenceFile($SequenceFile); | |
| 351 } | |
| 352 elsif (IsPIRFastaSequenceFile($SequenceFile)) { | |
| 353 return ReadPIRFastaSequenceFile($SequenceFile); | |
| 354 } | |
| 355 elsif (IsMSFSequenceFile($SequenceFile)) { | |
| 356 return ReadMSFSequenceFile($SequenceFile); | |
| 357 } | |
| 358 elsif (IsClustalWSequenceFile($SequenceFile)) { | |
| 359 return ReadClustalWSequenceFile($SequenceFile); | |
| 360 } | |
| 361 else { | |
| 362 return undef; | |
| 363 } | |
| 364 } | |
| 365 | |
| 366 # Read file and setup alignment data... | |
| 367 sub ReadClustalWSequenceFile { | |
| 368 my($SequenceFile) = @_; | |
| 369 | |
| 370 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW'); | |
| 371 } | |
| 372 | |
| 373 # Read file and setup alignment data... | |
| 374 sub ReadPearsonFastaSequenceFile { | |
| 375 my($SequenceFile) = @_; | |
| 376 | |
| 377 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson'); | |
| 378 } | |
| 379 | |
| 380 # Read file and setup alignment data... | |
| 381 sub ReadPIRFastaSequenceFile { | |
| 382 my($SequenceFile) = @_; | |
| 383 | |
| 384 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR'); | |
| 385 } | |
| 386 | |
| 387 | |
| 388 # Read file and setup sequence data... | |
| 389 sub ReadMSFSequenceFile { | |
| 390 my($SequenceFile) = @_; | |
| 391 | |
| 392 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF'); | |
| 393 } | |
| 394 | |
| 395 # Write out a Pearson FASTA file... | |
| 396 sub WritePearsonFastaSequenceFile { | |
| 397 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence); | |
| 398 | |
| 399 $MaxLength = 80; | |
| 400 if (@_ == 3) { | |
| 401 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_; | |
| 402 } | |
| 403 elsif (@_ == 2) { | |
| 404 ($SequenceFileName, $SequenceDataRef) = @_; | |
| 405 } | |
| 406 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n"; | |
| 407 for $ID (@{$SequenceDataRef->{IDs}}) { | |
| 408 $Description = $SequenceDataRef->{Description}{$ID}; | |
| 409 $Sequence = $SequenceDataRef->{Sequence}{$ID}; | |
| 410 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n"); | |
| 411 | |
| 412 # Description also contains ID... | |
| 413 print SEQUENCEFILE ">$Description\n"; | |
| 414 print SEQUENCEFILE "$WrappedSequence\n"; | |
| 415 } | |
| 416 close SEQUENCEFILE; | |
| 417 } | |
| 418 | |
| 419 # Get ID, Sequence and Length for smallest or longest sequence | |
| 420 sub _GetShortestOrLongestSequence { | |
| 421 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_; | |
| 422 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
| 423 | |
| 424 ($ID, $Seq, $SeqLen) = ('', '', ''); | |
| 425 $FirstID = ''; | |
| 426 | |
| 427 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) { | |
| 428 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID}; | |
| 429 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps); | |
| 430 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID}; | |
| 431 if (!$FirstID) { | |
| 432 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen; | |
| 433 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
| 434 next ID; | |
| 435 } | |
| 436 if ($CurrentSeqLen != $SeqLen) { | |
| 437 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) { | |
| 438 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
| 439 } | |
| 440 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) { | |
| 441 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
| 442 } | |
| 443 } | |
| 444 } | |
| 445 return ($ID, $Seq, $SeqLen, $Description); | |
| 446 } | |
| 447 | |
| 448 # Remove gaps in the sequence and return new sequence... | |
| 449 sub RemoveSequenceGaps { | |
| 450 my($Seq) = @_; | |
| 451 my($SeqWithoutGaps, $SeqLen, $Index, $Residue); | |
| 452 | |
| 453 $SeqWithoutGaps = ''; | |
| 454 $SeqLen = length($Seq); | |
| 455 for $Index (0 .. ($SeqLen - 1)) { | |
| 456 $Residue = substr($Seq, $Index, 1); | |
| 457 if ($Residue =~ /[A-Z]/i) { | |
| 458 $SeqWithoutGaps .= $Residue; | |
| 459 } | |
| 460 } | |
| 461 | |
| 462 return $SeqWithoutGaps; | |
| 463 } | |
| 464 | |
| 465 # Using input alignment data map ref containing following keys, generate | |
| 466 # a new hash with same set of keys after residue columns containg only | |
| 467 # gaps have been removed: | |
| 468 # | |
| 469 # {IDs} : Array of IDs in order as they appear in file | |
| 470 # {Count}: ID count... | |
| 471 # {Description}{$ID} : Description data... | |
| 472 # {Sequence}{$ID} : Sequence data... | |
| 473 # | |
| 474 sub RemoveSequenceAlignmentGapColumns { | |
| 475 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap); | |
| 476 | |
| 477 ($AlignmentDataMapRef) = @_; | |
| 478 | |
| 479 %NewAlignmentDataMap = (); | |
| 480 @{$NewAlignmentDataMap{IDs}} =(); | |
| 481 %{$NewAlignmentDataMap{Description}} =(); | |
| 482 %{$NewAlignmentDataMap{Sequence}} =(); | |
| 483 $NewAlignmentDataMap{Count} = 0; | |
| 484 | |
| 485 # Transfer ID and count information... | |
| 486 for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
| 487 push @{$NewAlignmentDataMap{IDs}}, $ID; | |
| 488 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID}; | |
| 489 $NewAlignmentDataMap{Sequence}{$ID} = ''; | |
| 490 $NewAlignmentDataMap{Count} += 1; | |
| 491 } | |
| 492 | |
| 493 # Go over residue columns and transfer the data... | |
| 494 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn); | |
| 495 | |
| 496 $FirstID = $AlignmentDataMapRef->{IDs}[0]; | |
| 497 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID}; | |
| 498 $FirstSeqLen = length($FirstSeq); | |
| 499 | |
| 500 RES: for $Index (0 .. ($FirstSeqLen - 1)) { | |
| 501 # Is this a gap column? | |
| 502 $GapColumn = 1; | |
| 503 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
| 504 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); | |
| 505 if ($Res =~ /[A-Z]/i) { | |
| 506 $GapColumn = 0; | |
| 507 last ID; | |
| 508 } | |
| 509 } | |
| 510 if ($GapColumn) { | |
| 511 next RES; | |
| 512 } | |
| 513 # Transfer this residue... | |
| 514 for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
| 515 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); | |
| 516 $NewAlignmentDataMap{Sequence}{$ID} .= $Res; | |
| 517 } | |
| 518 } | |
| 519 | |
| 520 return (\%NewAlignmentDataMap); | |
| 521 } | |
| 522 | |
| 523 # | |
| 524 # Read sequences file and return a reference to hash with the following keys: | |
| 525 # | |
| 526 # {IDs} - Array of sequence IDs | |
| 527 # {Count} - Number of sequences | |
| 528 # {Description}{$ID} - Sequence description | |
| 529 # {Sequence}{$ID} - Sequence for a specific ID | |
| 530 # {InputFileType} - Sequence file format | |
| 531 # {ConservedAnnotation} - Conserved residue annonation | |
| 532 # | |
| 533 # Note: | |
| 534 # . Conserved residue annotation either exist in the input sequence alignment file or set | |
| 535 # for a file containing same number of residues for all the sequence using the following | |
| 536 # notation: * - Residue conserved; ' ' - Residue not conserved. | |
| 537 # | |
| 538 sub _ReadFileAndSetupSequencesData { | |
| 539 my($SequenceFile, $SequenceType) = @_; | |
| 540 my($SequenceDataMapRef); | |
| 541 | |
| 542 $SequenceDataMapRef = undef; | |
| 543 | |
| 544 # Read sequence file... | |
| 545 $SequenceDataMapRef = ''; | |
| 546 if ($SequenceType =~ /^ClustalW$/i) { | |
| 547 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile); | |
| 548 } | |
| 549 elsif ($SequenceType =~ /^Pearson$/i) { | |
| 550 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile); | |
| 551 } | |
| 552 elsif ($SequenceType =~ /^PIR$/i) { | |
| 553 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile); | |
| 554 } | |
| 555 elsif ($SequenceType =~ /^MSF$/i) { | |
| 556 $SequenceDataMapRef = _ReadMSFFile($SequenceFile); | |
| 557 } | |
| 558 else { | |
| 559 return $SequenceDataMapRef; | |
| 560 } | |
| 561 | |
| 562 if (exists $SequenceDataMapRef->{ConservedAnnotation}) { | |
| 563 return ($SequenceDataMapRef); | |
| 564 } | |
| 565 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) { | |
| 566 return ($SequenceDataMapRef); | |
| 567 } | |
| 568 | |
| 569 # Use the first sequence to setup an empty ConservedAnnotation key... | |
| 570 # And mark fully conserved residues... | |
| 571 # | |
| 572 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index); | |
| 573 $ID = $SequenceDataMapRef->{IDs}[0]; | |
| 574 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID}; | |
| 575 $FirstSeqLen = length($FirstSequence); | |
| 576 $SequenceDataMapRef->{ConservedAnnotation} = ''; | |
| 577 for $Index (0 .. ($FirstSeqLen - 1)) { | |
| 578 $FirstRes = ''; | |
| 579 $ResConserved = 1; | |
| 580 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) { | |
| 581 $Sequence = $SequenceDataMapRef->{Sequence}{$ID}; | |
| 582 $Res = substr($Sequence, $Index, 1); | |
| 583 if (!$FirstRes) { | |
| 584 $FirstRes = $Res; | |
| 585 next ID; | |
| 586 } | |
| 587 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) { | |
| 588 $ResConserved = 0; | |
| 589 last ID; | |
| 590 } | |
| 591 } | |
| 592 if ($ResConserved) { | |
| 593 $SequenceDataMapRef->{ConservedAnnotation} .= '*'; | |
| 594 } | |
| 595 else { | |
| 596 $SequenceDataMapRef->{ConservedAnnotation} .= ' '; | |
| 597 } | |
| 598 } | |
| 599 | |
| 600 return ($SequenceDataMapRef); | |
| 601 } | |
| 602 | |
| 603 # Read sequence data in ClustalW multiple sequence alignment file and | |
| 604 # return a reference to hash with these keys and values: | |
| 605 # | |
| 606 # {IDs} - Array of sequence IDs | |
| 607 # {Count} - Number of sequences | |
| 608 # {Description}{$ID} - Sequence description | |
| 609 # {Sequence}{$ID} - Sequence for a specific ID | |
| 610 # {InputFileType} - Sequence file format | |
| 611 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , . | |
| 612 # | |
| 613 # | |
| 614 # | |
| 615 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean: | |
| 616 # | |
| 617 # '*' indicates positions which have a single, fully conserved residue | |
| 618 # | |
| 619 # ':' indicates that one of the following 'strong' groups is fully conserved: STA | |
| 620 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW | |
| 621 | |
| 622 # '.' indicates that one of the following 'weaker' groups is fully conserved: | |
| 623 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY | |
| 624 # | |
| 625 # These are all the positively scoring groups that occur in the Gonnet Pam250 | |
| 626 # matrix. The strong and weak groups are defined as strong score >0.5 and weak | |
| 627 # score =<0.5 respectively. | |
| 628 # | |
| 629 sub _ReadClustalWFile { | |
| 630 my($SequenceFile) = @_; | |
| 631 my(%SequencesDataMap); | |
| 632 | |
| 633 # Initialize data... | |
| 634 %SequencesDataMap = (); | |
| 635 @{$SequencesDataMap{IDs}} = (); | |
| 636 %{$SequencesDataMap{Description}} = (); | |
| 637 %{$SequencesDataMap{Sequence}} = (); | |
| 638 $SequencesDataMap{Count} = 0; | |
| 639 $SequencesDataMap{ConservedAnnotation} = ''; | |
| 640 $SequencesDataMap{InputFileType} = 'ClustalW'; | |
| 641 | |
| 642 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; | |
| 643 | |
| 644 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex); | |
| 645 | |
| 646 # Ignore the header line... | |
| 647 $Line = <SEQUENCEFILE>; | |
| 648 | |
| 649 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) { | |
| 650 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) { | |
| 651 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd. | |
| 652 # Extract it and save... | |
| 653 $LineLength = length($Line); | |
| 654 $AnnotationStart = $LineLength - $SequenceLength; | |
| 655 $AnnotationLength = $SequenceLength; | |
| 656 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength); | |
| 657 $SequencesDataMap{ConservedAnnotation} .= $Annotation; | |
| 658 } | |
| 659 else { | |
| 660 # Extract ID and sequences... | |
| 661 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/; | |
| 662 $Sequence =~ s/ //g; | |
| 663 if (!($ID && $Sequence)) { | |
| 664 next LINE; | |
| 665 } | |
| 666 | |
| 667 if (exists $SequencesDataMap{Sequence}{$ID}) { | |
| 668 # Append to existing alignment value... | |
| 669 $SequenceLength = length($Sequence); | |
| 670 $SequencesDataMap{Sequence}{$ID} .= $Sequence; | |
| 671 } | |
| 672 else { | |
| 673 # New alignment data... | |
| 674 $SequencesDataMap{Count} += 1; | |
| 675 push @{$SequencesDataMap{IDs}}, $ID; | |
| 676 $SequencesDataMap{Description}{$ID} = $ID; | |
| 677 $SequencesDataMap{Sequence}{$ID} = $Sequence; | |
| 678 $SequenceLength = length($Sequence); | |
| 679 } | |
| 680 } | |
| 681 } | |
| 682 close SEQUENCEFILE; | |
| 683 return (\%SequencesDataMap); | |
| 684 } | |
| 685 | |
| 686 # Read Pearson fasta file and return a reference to hash with these keys: | |
| 687 # | |
| 688 # {IDs} - Array of sequence IDs | |
| 689 # {Count} - Number of sequences | |
| 690 # {Description}{$ID} - Sequence description | |
| 691 # {Sequence}{$ID} - Sequence for a specific ID | |
| 692 # {InputFileType} - Sequence file format | |
| 693 # {ConservedAnnotation} - Conserved residue annonation | |
| 694 # | |
| 695 sub _ReadPearsonFastaFile { | |
| 696 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap); | |
| 697 | |
| 698 ($FastaFileName) = @_; | |
| 699 | |
| 700 %FastaDataMap = (); | |
| 701 @{$FastaDataMap{IDs}} =(); | |
| 702 %{$FastaDataMap{Description}} =(); | |
| 703 %{$FastaDataMap{Sequence}} =(); | |
| 704 $FastaDataMap{Count} = 0; | |
| 705 $FastaDataMap{InputFileType} = 'Pearson'; | |
| 706 | |
| 707 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; | |
| 708 $ID = ''; | |
| 709 $IgnoreID = 0; | |
| 710 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { | |
| 711 if ($Line =~ /^\>/) { | |
| 712 # Start of a new ID... | |
| 713 $Line =~ s/^\>//; | |
| 714 $Line = RemoveLeadingWhiteSpaces($Line); | |
| 715 @LineWords = (); | |
| 716 @LineWords = split / /, $Line; | |
| 717 | |
| 718 $ID = $LineWords[0]; | |
| 719 $ID =~ s/ //g; | |
| 720 $Description = $Line; | |
| 721 | |
| 722 $IgnoreID = 0; | |
| 723 if (exists $FastaDataMap{Sequence}{$ID}) { | |
| 724 $IgnoreID = 1; | |
| 725 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; | |
| 726 next LINE; | |
| 727 } | |
| 728 push @{$FastaDataMap{IDs}}, $ID; | |
| 729 $FastaDataMap{Description}{$ID} = $Description; | |
| 730 $FastaDataMap{Count} += 1; | |
| 731 next LINE; | |
| 732 } | |
| 733 if ($IgnoreID) { next LINE; } | |
| 734 | |
| 735 # Remove any spaces in the sequence... | |
| 736 $Line =~ s/ //g; | |
| 737 # Sequence data for active ID... | |
| 738 if (exists $FastaDataMap{Sequence}{$ID}) { | |
| 739 $FastaDataMap{Sequence}{$ID} .= $Line; | |
| 740 } | |
| 741 else { | |
| 742 $FastaDataMap{Sequence}{$ID} = $Line; | |
| 743 } | |
| 744 } | |
| 745 close FASTAFILE; | |
| 746 return \%FastaDataMap; | |
| 747 } | |
| 748 | |
| 749 # Read PIR fasta file and return a reference to hash with these keys: | |
| 750 # | |
| 751 # {IDs} - Array of sequence IDs | |
| 752 # {Count} - Number of sequences | |
| 753 # {Description}{$ID} - Sequence description | |
| 754 # {Sequence}{$ID} - Sequence for a specific ID | |
| 755 # {InputFileType} - Sequence file format | |
| 756 # {ConservedAnnotation} - Conserved residue annonation | |
| 757 # | |
| 758 # Format: | |
| 759 # A sequence in PIR format consists of: | |
| 760 # One line starting with | |
| 761 # a ">" (greater-than) sign, followed by | |
| 762 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by | |
| 763 # a semicolon, followed by | |
| 764 # the sequence identification code (the database ID-code). | |
| 765 # One line containing a textual description of the sequence. | |
| 766 # One or more lines containing the sequence itself. The end of the | |
| 767 # sequence is marked by a "*" (asterisk) character. | |
| 768 # | |
| 769 # A file in PIR format may comprise more than one sequence. | |
| 770 # | |
| 771 # The PIR format is also often referred to as the NBRF format. | |
| 772 # | |
| 773 # Code SequenceType | |
| 774 # P1 Protein (complete) | |
| 775 # F1 Protein (fragment) | |
| 776 # DL DNA (linear) | |
| 777 # DC DNA (circular) | |
| 778 # RL RNA (linear) | |
| 779 # RC RNA (circular) | |
| 780 # N3 tRNA | |
| 781 # N1 Other functional RNA | |
| 782 # | |
| 783 | |
| 784 sub _ReadPIRFastaFile { | |
| 785 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap); | |
| 786 | |
| 787 ($FastaFileName) = @_; | |
| 788 | |
| 789 %FastaDataMap = (); | |
| 790 @{$FastaDataMap{IDs}} =(); | |
| 791 %{$FastaDataMap{Description}} =(); | |
| 792 %{$FastaDataMap{Sequence}} =(); | |
| 793 %{$FastaDataMap{SequenceTypeCode}} =(); | |
| 794 $FastaDataMap{Count} = 0; | |
| 795 $FastaDataMap{InputFileType} = 'PIR'; | |
| 796 | |
| 797 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; | |
| 798 $ID = ''; | |
| 799 $ReadingSequenceData = 0; | |
| 800 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { | |
| 801 if ($Line =~ /^\>/) { | |
| 802 # Start of a new ID... | |
| 803 $Line =~ s/^\>//; | |
| 804 $Line = RemoveLeadingWhiteSpaces($Line); | |
| 805 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/; | |
| 806 | |
| 807 # Use next line to retrieve sequence description... | |
| 808 $Line = GetTextLine(\*FASTAFILE); | |
| 809 $Line = RemoveLeadingWhiteSpaces($Line); | |
| 810 $Description = $Line; | |
| 811 | |
| 812 if (exists $FastaDataMap{Sequence}{$ID}) { | |
| 813 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; | |
| 814 next LINE; | |
| 815 } | |
| 816 $ReadingSequenceData = 1; | |
| 817 push @{$FastaDataMap{IDs}}, $ID; | |
| 818 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode; | |
| 819 $FastaDataMap{Description}{$ID} = $Description; | |
| 820 $FastaDataMap{Count} += 1; | |
| 821 next LINE; | |
| 822 } | |
| 823 if (!$ReadingSequenceData) { next LINE; } | |
| 824 | |
| 825 # Remove any spaces in the sequence... | |
| 826 $Line =~ s/ //g; | |
| 827 if ($Line =~ /[\*]$/) { | |
| 828 # End of sequence... | |
| 829 $ReadingSequenceData = 0; | |
| 830 $Line =~ s/[\*]$//; | |
| 831 } | |
| 832 # Sequence data for active ID... | |
| 833 if (exists $FastaDataMap{Sequence}{$ID}) { | |
| 834 $FastaDataMap{Sequence}{$ID} .= $Line; | |
| 835 } | |
| 836 else { | |
| 837 $FastaDataMap{Sequence}{$ID} = $Line; | |
| 838 } | |
| 839 } | |
| 840 close FASTAFILE; | |
| 841 return \%FastaDataMap; | |
| 842 } | |
| 843 | |
| 844 # Read MSF file and return a reference to hash with these keys: | |
| 845 # | |
| 846 # {IDs} : Array of IDs in order as they appear in file | |
| 847 # {Count}: ID count... | |
| 848 # {Description}{$ID} : Description data... | |
| 849 # {Sequence}{$ID} : Sequence data... | |
| 850 # | |
| 851 sub _ReadMSFFile { | |
| 852 my($MSFFileName, $Line, @LineWords, %MSFDataMap); | |
| 853 | |
| 854 ($MSFFileName) = @_; | |
| 855 | |
| 856 %MSFDataMap = (); | |
| 857 @{$MSFDataMap{IDs}} =(); | |
| 858 %{$MSFDataMap{Description}} =(); | |
| 859 %{$MSFDataMap{Sequence}} =(); | |
| 860 $MSFDataMap{Count} = 0; | |
| 861 $MSFDataMap{InputFileType} = 'MSF'; | |
| 862 | |
| 863 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n"; | |
| 864 | |
| 865 # Collect sequences and IDs... | |
| 866 # | |
| 867 # '//' after the name fields indicates end of header list and start of sequence data. | |
| 868 # | |
| 869 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap); | |
| 870 %MSFIDsMap = (); | |
| 871 $NameFieldsFound = 0; | |
| 872 LINE: while ($Line = GetTextLine(\*MSFFILE)) { | |
| 873 if ($Line =~ /Name:/) { | |
| 874 $NameFieldsFound++; | |
| 875 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/; | |
| 876 if ($ID =~ / /) { | |
| 877 ($ID) = $ID =~ /^(.*?)[ ]+/ | |
| 878 } | |
| 879 if (exists $MSFIDsMap{$ID}) { | |
| 880 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n"; | |
| 881 next LINE; | |
| 882 } | |
| 883 $MSFIDsMap{$ID} = $ID; | |
| 884 push @{$MSFDataMap{IDs}}, $ID; | |
| 885 $MSFDataMap{Description}{$ID} = $ID; | |
| 886 $MSFDataMap{Count} += 1; | |
| 887 } | |
| 888 elsif ( /\/\// && $NameFieldsFound) { | |
| 889 # End of header list... | |
| 890 last LINE; | |
| 891 } | |
| 892 } | |
| 893 # Collect all sequences... | |
| 894 # | |
| 895 my($FirstField, $SecondField); | |
| 896 while ($Line = GetTextLine(\*MSFFILE)) { | |
| 897 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/; | |
| 898 if (exists $MSFIDsMap{$FirstField}) { | |
| 899 # It's ID and sequence data... | |
| 900 $ID = $FirstField; | |
| 901 $Sequence = $SecondField; | |
| 902 # Take out spaces and leave the gap characters... | |
| 903 $Sequence =~ s/ //g; | |
| 904 if ($MSFDataMap{Sequence}{$ID}) { | |
| 905 $MSFDataMap{Sequence}{$ID} .= $Sequence; | |
| 906 } | |
| 907 else { | |
| 908 $MSFDataMap{Sequence}{$ID} = $Sequence; | |
| 909 } | |
| 910 } | |
| 911 } | |
| 912 | |
| 913 close MSFFILE; | |
| 914 return \%MSFDataMap; | |
| 915 } | |
| 916 | |
| 917 | |
| 918 1; | |
| 919 | |
| 920 __END__ | |
| 921 | |
| 922 =head1 NAME | |
| 923 | |
| 924 SequenceFileUtil | |
| 925 | |
| 926 =head1 SYNOPSIS | |
| 927 | |
| 928 use SequenceFileUtil ; | |
| 929 | |
| 930 use SequenceFileUtil qw(:all); | |
| 931 | |
| 932 =head1 DESCRIPTION | |
| 933 | |
| 934 B<SequenceFileUtil> module provides the following functions: | |
| 935 | |
| 936 AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity, | |
| 937 CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength, | |
| 938 GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile, | |
| 939 IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile, | |
| 940 ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile, | |
| 941 ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns, | |
| 942 RemoveSequenceGaps, WritePearsonFastaSequenceFile | |
| 943 SequenceFileUtil module provides various methods to process sequence | |
| 944 files and retreive appropriate information. | |
| 945 | |
| 946 =head1 FUNCTIONS | |
| 947 | |
| 948 =over 4 | |
| 949 | |
| 950 =item B<AreSequenceLengthsIdentical> | |
| 951 | |
| 952 $Status = AreSequenceLengthsIdentical($SequencesDataRef); | |
| 953 | |
| 954 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1 | |
| 955 or 0 based whether lengths of all the sequence is same. | |
| 956 | |
| 957 =item B<CalcuatePercentSequenceIdentity> | |
| 958 | |
| 959 $PercentIdentity = | |
| 960 AreSequenceLengthsIdenticalAreSequenceLengthsIdentical( | |
| 961 $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]); | |
| 962 | |
| 963 Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments | |
| 964 I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the | |
| 965 returned value. By default, gaps are ignored and precision is set up to 1 decimal. | |
| 966 | |
| 967 =item B<CalculatePercentSequenceIdentityMatrix> | |
| 968 | |
| 969 $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix( | |
| 970 $SequencesDataRef, [$IgnoreGaps, | |
| 971 $Precision]); | |
| 972 | |
| 973 Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef> | |
| 974 and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and | |
| 975 I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps | |
| 976 are ignored and precision is set up to 1 decimal. | |
| 977 | |
| 978 =item B<GetSequenceLength> | |
| 979 | |
| 980 $SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]); | |
| 981 | |
| 982 Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling | |
| 983 of gaps. By default, gaps are ignored. | |
| 984 | |
| 985 =item B<GetShortestSequence> | |
| 986 | |
| 987 ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence( | |
| 988 $SequencesDataRef, [$IgnoreGaps]); | |
| 989 | |
| 990 Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID, | |
| 991 $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps | |
| 992 controls handling of gaps in sequences. By default, gaps are ignored. | |
| 993 | |
| 994 =item B<GetLongestSequence> | |
| 995 | |
| 996 ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence( | |
| 997 $SequencesDataRef, [$IgnoreGaps]); | |
| 998 | |
| 999 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>, | |
| 1000 B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument | |
| 1001 $I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored. | |
| 1002 | |
| 1003 =item B<IsGapResidue> | |
| 1004 | |
| 1005 $Status = AreSequenceLengthsIdentical($Residue); | |
| 1006 | |
| 1007 Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is | |
| 1008 considered a gap residue. | |
| 1009 | |
| 1010 =item B<IsSupportedSequenceFile> | |
| 1011 | |
| 1012 $Status = IsSupportedSequenceFile($SequenceFile); | |
| 1013 | |
| 1014 Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence | |
| 1015 format. | |
| 1016 | |
| 1017 =item B<IsClustalWSequenceFile> | |
| 1018 | |
| 1019 $Status = IsClustalWSequenceFile($SequenceFile); | |
| 1020 | |
| 1021 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment | |
| 1022 format. | |
| 1023 | |
| 1024 =item B<IsPearsonFastaSequenceFile> | |
| 1025 | |
| 1026 $Status = IsPearsonFastaSequenceFile($SequenceFile); | |
| 1027 | |
| 1028 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence | |
| 1029 format. | |
| 1030 | |
| 1031 =item B<IsPIRFastaSequenceFile> | |
| 1032 | |
| 1033 $Status = IsPIRFastaSequenceFile($SequenceFile); | |
| 1034 | |
| 1035 Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence | |
| 1036 format. | |
| 1037 | |
| 1038 =item B<IsMSFSequenceFile> | |
| 1039 | |
| 1040 $Status = IsClustalWSequenceFile($SequenceFile); | |
| 1041 | |
| 1042 Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment | |
| 1043 format. | |
| 1044 | |
| 1045 =item B<ReadSequenceFile> | |
| 1046 | |
| 1047 $SequenceDataMapRef = ReadSequenceFile($SequenceFile); | |
| 1048 | |
| 1049 Reads I<SequenceFile> and returns reference to a hash containing following key/value | |
| 1050 pairs: | |
| 1051 | |
| 1052 $SequenceDataMapRef->{IDs} - Array of sequence IDs | |
| 1053 $SequenceDataMapRef->{Count} - Number of sequences | |
| 1054 $SequenceDataMapRef->{Description}{$ID} - Sequence description | |
| 1055 $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID | |
| 1056 $SequenceDataMapRef->{Sequence}{InputFileType} - File format | |
| 1057 | |
| 1058 =item B<ReadClustalWSequenceFile> | |
| 1059 | |
| 1060 $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile); | |
| 1061 | |
| 1062 Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value | |
| 1063 pairs as describes in B<ReadSequenceFile> method. | |
| 1064 | |
| 1065 =item B<ReadMSFSequenceFile> | |
| 1066 | |
| 1067 $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile); | |
| 1068 | |
| 1069 Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value | |
| 1070 pairs as describes in B<ReadSequenceFile> method. | |
| 1071 | |
| 1072 =item B<ReadPIRFastaSequenceFile> | |
| 1073 | |
| 1074 $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile); | |
| 1075 | |
| 1076 Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value | |
| 1077 pairs as describes in B<ReadSequenceFile> method. | |
| 1078 | |
| 1079 =item B<ReadPearsonFastaSequenceFile> | |
| 1080 | |
| 1081 $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile); | |
| 1082 | |
| 1083 Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value | |
| 1084 pairs as describes in B<ReadSequenceFile> method. | |
| 1085 | |
| 1086 =item B<RemoveSequenceGaps> | |
| 1087 | |
| 1088 $SeqWithoutGaps = RemoveSequenceGaps($Sequence); | |
| 1089 | |
| 1090 Removes gaps from I<Sequence> and return a sequence without any gaps. | |
| 1091 | |
| 1092 =item B<RemoveSequenceAlignmentGapColumns> | |
| 1093 | |
| 1094 $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns( | |
| 1095 $AlignmentDataMapRef); | |
| 1096 | |
| 1097 Using input alignment data map ref containing following keys, generate a new hash with | |
| 1098 same set of keys after residue columns containg only gaps have been removed: | |
| 1099 | |
| 1100 {IDs} : Array of IDs in order as they appear in file | |
| 1101 {Count}: ID count | |
| 1102 {Description}{$ID} : Description data | |
| 1103 {Sequence}{$ID} : Sequence data | |
| 1104 | |
| 1105 =item B<WritePearsonFastaSequenceFile> | |
| 1106 | |
| 1107 WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef, | |
| 1108 [$MaxLength]); | |
| 1109 | |
| 1110 Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence | |
| 1111 file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is | |
| 1112 80. | |
| 1113 | |
| 1114 =back | |
| 1115 | |
| 1116 =head1 AUTHOR | |
| 1117 | |
| 1118 Manish Sud <msud@san.rr.com> | |
| 1119 | |
| 1120 =head1 SEE ALSO | |
| 1121 | |
| 1122 PDBFileUtil.pm | |
| 1123 | |
| 1124 =head1 COPYRIGHT | |
| 1125 | |
| 1126 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 1127 | |
| 1128 This file is part of MayaChemTools. | |
| 1129 | |
| 1130 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 1131 the terms of the GNU Lesser General Public License as published by the Free | |
| 1132 Software Foundation; either version 3 of the License, or (at your option) | |
| 1133 any later version. | |
| 1134 | |
| 1135 =cut |
