Mercurial > repos > deepakjadmin > mayatool3_test2
view lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
line wrap: on
line source
package SequenceFileUtil; # # $RCSfile: SequenceFileUtil.pm,v $ # $Date: 2015/02/28 20:47:18 $ # $Revision: 1.33 $ # # Author: Manish Sud <msud@san.rr.com> # # Copyright (C) 2015 Manish Sud. All rights reserved. # # This file is part of MayaChemTools. # # MayaChemTools is free software; you can redistribute it and/or modify it under # the terms of the GNU Lesser General Public License as published by the Free # Software Foundation; either version 3 of the License, or (at your option) any # later version. # # MayaChemTools is distributed in the hope that it will be useful, but without # any warranty; without even the implied warranty of merchantability of fitness # for a particular purpose. See the GNU Lesser General Public License for more # details. # # You should have received a copy of the GNU Lesser General Public License # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, # Boston, MA, 02111-1307, USA. # use strict; use Exporter; use Text::ParseWords; use TextUtil; use FileUtil; use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); @ISA = qw(Exporter); @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); @EXPORT_OK = qw(); %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); # Compare lengths of all sequences... sub AreSequenceLengthsIdentical { my($SequencesDataRef) = @_; my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen); $Status = 1; $FirstID = ''; $FirstDifferentLenID = ''; ID: for $ID (@{$SequencesDataRef->{IDs}}) { if (!$FirstID) { $FirstID = $ID; $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID}); next ID; } $SeqLen = length($SequencesDataRef->{Sequence}{$ID}); if ($SeqLen != $FirstSeqLen) { $Status = 0; $FirstDifferentLenID = $ID; last ID; } } return ($Status); } # Calculate percent identity between two sequences. By default, gaps are ignored. sub CalcuatePercentSequenceIdentity { my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision); $PercentIdentity = ''; $Precision = 1; $IgnoreGaps = 1; if (@_ == 4) { ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_; } elsif (@_ == 3) { ($Sequence1, $Sequence2, $IgnoreGaps) = @_; } elsif (@_ == 2) { ($Sequence1, $Sequence2) = @_; } else { return $PercentIdentity; } if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) { return $PercentIdentity; } my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount); $Sequence1Len = length($Sequence1); $Sequence2Len = length($Sequence2); $ResMatchCount = 0; $ResCount = 0; RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) { $Residue1 = substr($Sequence1, $Index, 1); $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : ''; if ($IgnoreGaps) { if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) { next RESIDUE; } } if ($Residue1 eq $Residue2) { $ResMatchCount++; } $ResCount++; } $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0; $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100)); return $PercentIdentity; } # Calculate pairwise identify matrix for all the sequences and return a reference # to a hash with the following keys: # # {IDs} - Sequence IDs # {Count} - Number of IDs # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences # sub CalculatePercentSequenceIdentityMatrix { my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData); $IgnoreGaps = 1; $Precision = 1; if (@_ == 3) { ($SequencesDataRef, $IgnoreGaps, $Precision) = @_; } elsif (@_ == 2) { ($SequencesDataRef, $IgnoreGaps) = @_; } else { ($SequencesDataRef) = @_; } %IdentityMatrixData = (); @{$IdentityMatrixData{IDs}} = (); %{$IdentityMatrixData{PercentIdentity}} = (); $IdentityMatrixData{Count} = 0; for $ID (@{$SequencesDataRef->{IDs}}) { push @{$IdentityMatrixData{IDs}}, $ID; $IdentityMatrixData{Count} += 1; } # Initialize and calculate percent identity data values... for $RowID (@{$SequencesDataRef->{IDs}}) { %{$IdentityMatrixData{PercentIdentity}{$RowID}} = (); $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID}; for $ColID (@{$SequencesDataRef->{IDs}}) { $IdentityMatrixData{$RowID}{$ColID} = ''; $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID}; $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision); $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity; } } return \%IdentityMatrixData; } # Retrieve information about shortest sequence... sub GetShortestSequence { my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); $IgnoreGaps = 1; if (@_ == 2) { ($SequencesDataRef, $IgnoreGaps) = @_; } else { ($SequencesDataRef) = @_; } ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps); return ($ID, $Sequence, $SeqLen, $Description); } # Retrieve information about longest sequence.. sub GetLongestSequence { my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); $IgnoreGaps = 1; if (@_ == 2) { ($SequencesDataRef, $IgnoreGaps) = @_; } else { ($SequencesDataRef) = @_; } ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps); return ($ID, $Sequence, $SeqLen, $Description); } # Get sequence length... sub GetSequenceLength { my($Seq, $SeqLen, $IgnoreGaps); $SeqLen = ''; $IgnoreGaps = 1; if (@_ == 2) { ($Seq, $IgnoreGaps) = @_; } else { ($Seq) = @_; } if ($IgnoreGaps) { my($Index, $Residue); $SeqLen = 0; for $Index (0 .. (length($Seq) - 1)) { $Residue = substr($Seq, $Index, 1); if ($Residue =~ /[A-Z]/i) { $SeqLen++; } } } else { $SeqLen = length($Seq); } return $SeqLen; } # Is it a gap residue... sub IsGapResidue { my($Residue) = @_; my($Status); $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0; return $Status; } # Is it a supported sequence file? # # Supported seqence formats are: # # ALN/ClustalW .aln # GCG/MSF .msf # PILEUP/MSF .msf # Fasts(Pearson) .fasta, .fta # NBRF/PIR .pir # sub IsSupportedSequenceFile { my($SequenceFile) = @_; my($Status, $SequenceFormat); $Status = 0; $SequenceFormat = 'NotSupported'; SEQFORMAT: { if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT} if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT} if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT} if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT} $Status = 0; $SequenceFormat = 'NotSupported'; } return ($Status, $SequenceFormat); } # Is it a ClustalW multiple sequence sequence file... sub IsClustalWSequenceFile { my($SequenceFile) = @_; my($Status, $Line); $Status = 0; open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; $Line = GetTextLine(\*SEQUENCEFILE); $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0; close SEQUENCEFILE; return $Status; } # Is it a valid Pearson fasta sequence or alignment file? # sub IsPearsonFastaSequenceFile { my($FastaFile, $Line, $Status); ($FastaFile) = @_; $Status = 0; open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; $Line = GetTextLine(\*FASTAFILE); # First line starts with > and the fourth character is not ';'; otherwise, it's # PIR FASTA format. if ($Line =~ /^>/) { my($FourthChar); $FourthChar = substr($Line, 3, 1); $Status = ($FourthChar !~ /\;/) ? 1 : 0; } close FASTAFILE; return $Status; } # Is it a valid NBRF/PIR fasta sequence or alignment file? # sub IsPIRFastaSequenceFile { my($FastaFile, $Line, $Status); ($FastaFile) = @_; $Status = 0; open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; $Line = GetTextLine(\*FASTAFILE); # First line starts with > and the fourth character is ';'; otherwise, it's # a Pearson FASTA format. if ($Line =~ /^>/) { my($FourthChar); $FourthChar = substr($Line, 3, 1); $Status = ($FourthChar =~ /\;/) ? 1 : 0; } close FASTAFILE; return $Status; } # Is it a valid MSF sequence or alignment file? # sub IsMSFSequenceFile { my($MSFFile) = @_; open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n"; my($Line, $Status); $Status = 0; # Find a line that contains MSF: keyword and ends with '..' LINE: while ($Line = GetTextLine(\*MSFFILE)) { $Line = RemoveLeadingWhiteSpaces($Line); if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) { $Status = 1; last LINE; } elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) { # Pileup MSF... $Status = 1; last LINE; } } close MSFFILE; return $Status; } # Read sequence or sequence alignment file... sub ReadSequenceFile { my($SequenceFile) = @_; if (IsPearsonFastaSequenceFile($SequenceFile)) { return ReadPearsonFastaSequenceFile($SequenceFile); } elsif (IsPIRFastaSequenceFile($SequenceFile)) { return ReadPIRFastaSequenceFile($SequenceFile); } elsif (IsMSFSequenceFile($SequenceFile)) { return ReadMSFSequenceFile($SequenceFile); } elsif (IsClustalWSequenceFile($SequenceFile)) { return ReadClustalWSequenceFile($SequenceFile); } else { return undef; } } # Read file and setup alignment data... sub ReadClustalWSequenceFile { my($SequenceFile) = @_; return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW'); } # Read file and setup alignment data... sub ReadPearsonFastaSequenceFile { my($SequenceFile) = @_; return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson'); } # Read file and setup alignment data... sub ReadPIRFastaSequenceFile { my($SequenceFile) = @_; return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR'); } # Read file and setup sequence data... sub ReadMSFSequenceFile { my($SequenceFile) = @_; return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF'); } # Write out a Pearson FASTA file... sub WritePearsonFastaSequenceFile { my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence); $MaxLength = 80; if (@_ == 3) { ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_; } elsif (@_ == 2) { ($SequenceFileName, $SequenceDataRef) = @_; } open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n"; for $ID (@{$SequenceDataRef->{IDs}}) { $Description = $SequenceDataRef->{Description}{$ID}; $Sequence = $SequenceDataRef->{Sequence}{$ID}; $WrappedSequence = WrapText($Sequence, $MaxLength, "\n"); # Description also contains ID... print SEQUENCEFILE ">$Description\n"; print SEQUENCEFILE "$WrappedSequence\n"; } close SEQUENCEFILE; } # Get ID, Sequence and Length for smallest or longest sequence sub _GetShortestOrLongestSequence { my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_; my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); ($ID, $Seq, $SeqLen) = ('', '', ''); $FirstID = ''; ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) { $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID}; $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps); $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID}; if (!$FirstID) { $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen; ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); next ID; } if ($CurrentSeqLen != $SeqLen) { if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) { ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); } elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) { ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); } } } return ($ID, $Seq, $SeqLen, $Description); } # Remove gaps in the sequence and return new sequence... sub RemoveSequenceGaps { my($Seq) = @_; my($SeqWithoutGaps, $SeqLen, $Index, $Residue); $SeqWithoutGaps = ''; $SeqLen = length($Seq); for $Index (0 .. ($SeqLen - 1)) { $Residue = substr($Seq, $Index, 1); if ($Residue =~ /[A-Z]/i) { $SeqWithoutGaps .= $Residue; } } return $SeqWithoutGaps; } # Using input alignment data map ref containing following keys, generate # a new hash with same set of keys after residue columns containg only # gaps have been removed: # # {IDs} : Array of IDs in order as they appear in file # {Count}: ID count... # {Description}{$ID} : Description data... # {Sequence}{$ID} : Sequence data... # sub RemoveSequenceAlignmentGapColumns { my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap); ($AlignmentDataMapRef) = @_; %NewAlignmentDataMap = (); @{$NewAlignmentDataMap{IDs}} =(); %{$NewAlignmentDataMap{Description}} =(); %{$NewAlignmentDataMap{Sequence}} =(); $NewAlignmentDataMap{Count} = 0; # Transfer ID and count information... for $ID (@{$AlignmentDataMapRef->{IDs}}) { push @{$NewAlignmentDataMap{IDs}}, $ID; $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID}; $NewAlignmentDataMap{Sequence}{$ID} = ''; $NewAlignmentDataMap{Count} += 1; } # Go over residue columns and transfer the data... my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn); $FirstID = $AlignmentDataMapRef->{IDs}[0]; $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID}; $FirstSeqLen = length($FirstSeq); RES: for $Index (0 .. ($FirstSeqLen - 1)) { # Is this a gap column? $GapColumn = 1; ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) { $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); if ($Res =~ /[A-Z]/i) { $GapColumn = 0; last ID; } } if ($GapColumn) { next RES; } # Transfer this residue... for $ID (@{$AlignmentDataMapRef->{IDs}}) { $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); $NewAlignmentDataMap{Sequence}{$ID} .= $Res; } } return (\%NewAlignmentDataMap); } # # Read sequences file and return a reference to hash with the following keys: # # {IDs} - Array of sequence IDs # {Count} - Number of sequences # {Description}{$ID} - Sequence description # {Sequence}{$ID} - Sequence for a specific ID # {InputFileType} - Sequence file format # {ConservedAnnotation} - Conserved residue annonation # # Note: # . Conserved residue annotation either exist in the input sequence alignment file or set # for a file containing same number of residues for all the sequence using the following # notation: * - Residue conserved; ' ' - Residue not conserved. # sub _ReadFileAndSetupSequencesData { my($SequenceFile, $SequenceType) = @_; my($SequenceDataMapRef); $SequenceDataMapRef = undef; # Read sequence file... $SequenceDataMapRef = ''; if ($SequenceType =~ /^ClustalW$/i) { $SequenceDataMapRef = _ReadClustalWFile($SequenceFile); } elsif ($SequenceType =~ /^Pearson$/i) { $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile); } elsif ($SequenceType =~ /^PIR$/i) { $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile); } elsif ($SequenceType =~ /^MSF$/i) { $SequenceDataMapRef = _ReadMSFFile($SequenceFile); } else { return $SequenceDataMapRef; } if (exists $SequenceDataMapRef->{ConservedAnnotation}) { return ($SequenceDataMapRef); } if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) { return ($SequenceDataMapRef); } # Use the first sequence to setup an empty ConservedAnnotation key... # And mark fully conserved residues... # my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index); $ID = $SequenceDataMapRef->{IDs}[0]; $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID}; $FirstSeqLen = length($FirstSequence); $SequenceDataMapRef->{ConservedAnnotation} = ''; for $Index (0 .. ($FirstSeqLen - 1)) { $FirstRes = ''; $ResConserved = 1; ID: for $ID (@{$SequenceDataMapRef->{IDs}}) { $Sequence = $SequenceDataMapRef->{Sequence}{$ID}; $Res = substr($Sequence, $Index, 1); if (!$FirstRes) { $FirstRes = $Res; next ID; } if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) { $ResConserved = 0; last ID; } } if ($ResConserved) { $SequenceDataMapRef->{ConservedAnnotation} .= '*'; } else { $SequenceDataMapRef->{ConservedAnnotation} .= ' '; } } return ($SequenceDataMapRef); } # Read sequence data in ClustalW multiple sequence alignment file and # return a reference to hash with these keys and values: # # {IDs} - Array of sequence IDs # {Count} - Number of sequences # {Description}{$ID} - Sequence description # {Sequence}{$ID} - Sequence for a specific ID # {InputFileType} - Sequence file format # {ConservedAnnotation} - Conserved residue annonations: space, *, : , . # # # # And based on ClustalW/X manual, here is what the ConservedAnnonations mean: # # '*' indicates positions which have a single, fully conserved residue # # ':' indicates that one of the following 'strong' groups is fully conserved: STA # NEQK NHQK NDEQ QHRK MILV MILF HY FYW # '.' indicates that one of the following 'weaker' groups is fully conserved: # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY # # These are all the positively scoring groups that occur in the Gonnet Pam250 # matrix. The strong and weak groups are defined as strong score >0.5 and weak # score =<0.5 respectively. # sub _ReadClustalWFile { my($SequenceFile) = @_; my(%SequencesDataMap); # Initialize data... %SequencesDataMap = (); @{$SequencesDataMap{IDs}} = (); %{$SequencesDataMap{Description}} = (); %{$SequencesDataMap{Sequence}} = (); $SequencesDataMap{Count} = 0; $SequencesDataMap{ConservedAnnotation} = ''; $SequencesDataMap{InputFileType} = 'ClustalW'; open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex); # Ignore the header line... $Line = <SEQUENCEFILE>; LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) { if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) { # Annotation for sequences: fully conserverd, weaker or stronger group conserverd. # Extract it and save... $LineLength = length($Line); $AnnotationStart = $LineLength - $SequenceLength; $AnnotationLength = $SequenceLength; $Annotation = substr($Line, $AnnotationStart, $AnnotationLength); $SequencesDataMap{ConservedAnnotation} .= $Annotation; } else { # Extract ID and sequences... ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/; $Sequence =~ s/ //g; if (!($ID && $Sequence)) { next LINE; } if (exists $SequencesDataMap{Sequence}{$ID}) { # Append to existing alignment value... $SequenceLength = length($Sequence); $SequencesDataMap{Sequence}{$ID} .= $Sequence; } else { # New alignment data... $SequencesDataMap{Count} += 1; push @{$SequencesDataMap{IDs}}, $ID; $SequencesDataMap{Description}{$ID} = $ID; $SequencesDataMap{Sequence}{$ID} = $Sequence; $SequenceLength = length($Sequence); } } } close SEQUENCEFILE; return (\%SequencesDataMap); } # Read Pearson fasta file and return a reference to hash with these keys: # # {IDs} - Array of sequence IDs # {Count} - Number of sequences # {Description}{$ID} - Sequence description # {Sequence}{$ID} - Sequence for a specific ID # {InputFileType} - Sequence file format # {ConservedAnnotation} - Conserved residue annonation # sub _ReadPearsonFastaFile { my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap); ($FastaFileName) = @_; %FastaDataMap = (); @{$FastaDataMap{IDs}} =(); %{$FastaDataMap{Description}} =(); %{$FastaDataMap{Sequence}} =(); $FastaDataMap{Count} = 0; $FastaDataMap{InputFileType} = 'Pearson'; open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; $ID = ''; $IgnoreID = 0; LINE: while ($Line = GetTextLine(\*FASTAFILE)) { if ($Line =~ /^\>/) { # Start of a new ID... $Line =~ s/^\>//; $Line = RemoveLeadingWhiteSpaces($Line); @LineWords = (); @LineWords = split / /, $Line; $ID = $LineWords[0]; $ID =~ s/ //g; $Description = $Line; $IgnoreID = 0; if (exists $FastaDataMap{Sequence}{$ID}) { $IgnoreID = 1; warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; next LINE; } push @{$FastaDataMap{IDs}}, $ID; $FastaDataMap{Description}{$ID} = $Description; $FastaDataMap{Count} += 1; next LINE; } if ($IgnoreID) { next LINE; } # Remove any spaces in the sequence... $Line =~ s/ //g; # Sequence data for active ID... if (exists $FastaDataMap{Sequence}{$ID}) { $FastaDataMap{Sequence}{$ID} .= $Line; } else { $FastaDataMap{Sequence}{$ID} = $Line; } } close FASTAFILE; return \%FastaDataMap; } # Read PIR fasta file and return a reference to hash with these keys: # # {IDs} - Array of sequence IDs # {Count} - Number of sequences # {Description}{$ID} - Sequence description # {Sequence}{$ID} - Sequence for a specific ID # {InputFileType} - Sequence file format # {ConservedAnnotation} - Conserved residue annonation # # Format: # A sequence in PIR format consists of: # One line starting with # a ">" (greater-than) sign, followed by # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by # a semicolon, followed by # the sequence identification code (the database ID-code). # One line containing a textual description of the sequence. # One or more lines containing the sequence itself. The end of the # sequence is marked by a "*" (asterisk) character. # # A file in PIR format may comprise more than one sequence. # # The PIR format is also often referred to as the NBRF format. # # Code SequenceType # P1 Protein (complete) # F1 Protein (fragment) # DL DNA (linear) # DC DNA (circular) # RL RNA (linear) # RC RNA (circular) # N3 tRNA # N1 Other functional RNA # sub _ReadPIRFastaFile { my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap); ($FastaFileName) = @_; %FastaDataMap = (); @{$FastaDataMap{IDs}} =(); %{$FastaDataMap{Description}} =(); %{$FastaDataMap{Sequence}} =(); %{$FastaDataMap{SequenceTypeCode}} =(); $FastaDataMap{Count} = 0; $FastaDataMap{InputFileType} = 'PIR'; open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; $ID = ''; $ReadingSequenceData = 0; LINE: while ($Line = GetTextLine(\*FASTAFILE)) { if ($Line =~ /^\>/) { # Start of a new ID... $Line =~ s/^\>//; $Line = RemoveLeadingWhiteSpaces($Line); ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/; # Use next line to retrieve sequence description... $Line = GetTextLine(\*FASTAFILE); $Line = RemoveLeadingWhiteSpaces($Line); $Description = $Line; if (exists $FastaDataMap{Sequence}{$ID}) { warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; next LINE; } $ReadingSequenceData = 1; push @{$FastaDataMap{IDs}}, $ID; $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode; $FastaDataMap{Description}{$ID} = $Description; $FastaDataMap{Count} += 1; next LINE; } if (!$ReadingSequenceData) { next LINE; } # Remove any spaces in the sequence... $Line =~ s/ //g; if ($Line =~ /[\*]$/) { # End of sequence... $ReadingSequenceData = 0; $Line =~ s/[\*]$//; } # Sequence data for active ID... if (exists $FastaDataMap{Sequence}{$ID}) { $FastaDataMap{Sequence}{$ID} .= $Line; } else { $FastaDataMap{Sequence}{$ID} = $Line; } } close FASTAFILE; return \%FastaDataMap; } # Read MSF file and return a reference to hash with these keys: # # {IDs} : Array of IDs in order as they appear in file # {Count}: ID count... # {Description}{$ID} : Description data... # {Sequence}{$ID} : Sequence data... # sub _ReadMSFFile { my($MSFFileName, $Line, @LineWords, %MSFDataMap); ($MSFFileName) = @_; %MSFDataMap = (); @{$MSFDataMap{IDs}} =(); %{$MSFDataMap{Description}} =(); %{$MSFDataMap{Sequence}} =(); $MSFDataMap{Count} = 0; $MSFDataMap{InputFileType} = 'MSF'; open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n"; # Collect sequences and IDs... # # '//' after the name fields indicates end of header list and start of sequence data. # my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap); %MSFIDsMap = (); $NameFieldsFound = 0; LINE: while ($Line = GetTextLine(\*MSFFILE)) { if ($Line =~ /Name:/) { $NameFieldsFound++; ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/; if ($ID =~ / /) { ($ID) = $ID =~ /^(.*?)[ ]+/ } if (exists $MSFIDsMap{$ID}) { warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n"; next LINE; } $MSFIDsMap{$ID} = $ID; push @{$MSFDataMap{IDs}}, $ID; $MSFDataMap{Description}{$ID} = $ID; $MSFDataMap{Count} += 1; } elsif ( /\/\// && $NameFieldsFound) { # End of header list... last LINE; } } # Collect all sequences... # my($FirstField, $SecondField); while ($Line = GetTextLine(\*MSFFILE)) { ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/; if (exists $MSFIDsMap{$FirstField}) { # It's ID and sequence data... $ID = $FirstField; $Sequence = $SecondField; # Take out spaces and leave the gap characters... $Sequence =~ s/ //g; if ($MSFDataMap{Sequence}{$ID}) { $MSFDataMap{Sequence}{$ID} .= $Sequence; } else { $MSFDataMap{Sequence}{$ID} = $Sequence; } } } close MSFFILE; return \%MSFDataMap; } 1; __END__ =head1 NAME SequenceFileUtil =head1 SYNOPSIS use SequenceFileUtil ; use SequenceFileUtil qw(:all); =head1 DESCRIPTION B<SequenceFileUtil> module provides the following functions: AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity, CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength, GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile, IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile, ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile, ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns, RemoveSequenceGaps, WritePearsonFastaSequenceFile SequenceFileUtil module provides various methods to process sequence files and retreive appropriate information. =head1 FUNCTIONS =over 4 =item B<AreSequenceLengthsIdentical> $Status = AreSequenceLengthsIdentical($SequencesDataRef); Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1 or 0 based whether lengths of all the sequence is same. =item B<CalcuatePercentSequenceIdentity> $PercentIdentity = AreSequenceLengthsIdenticalAreSequenceLengthsIdentical( $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]); Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps are ignored and precision is set up to 1 decimal. =item B<CalculatePercentSequenceIdentityMatrix> $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix( $SequencesDataRef, [$IgnoreGaps, $Precision]); Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef> and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps are ignored and precision is set up to 1 decimal. =item B<GetSequenceLength> $SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]); Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling of gaps. By default, gaps are ignored. =item B<GetShortestSequence> ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence( $SequencesDataRef, [$IgnoreGaps]); Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID, $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps controls handling of gaps in sequences. By default, gaps are ignored. =item B<GetLongestSequence> ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence( $SequencesDataRef, [$IgnoreGaps]); Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>, B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument $I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored. =item B<IsGapResidue> $Status = AreSequenceLengthsIdentical($Residue); Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is considered a gap residue. =item B<IsSupportedSequenceFile> $Status = IsSupportedSequenceFile($SequenceFile); Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence format. =item B<IsClustalWSequenceFile> $Status = IsClustalWSequenceFile($SequenceFile); Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment format. =item B<IsPearsonFastaSequenceFile> $Status = IsPearsonFastaSequenceFile($SequenceFile); Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence format. =item B<IsPIRFastaSequenceFile> $Status = IsPIRFastaSequenceFile($SequenceFile); Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence format. =item B<IsMSFSequenceFile> $Status = IsClustalWSequenceFile($SequenceFile); Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment format. =item B<ReadSequenceFile> $SequenceDataMapRef = ReadSequenceFile($SequenceFile); Reads I<SequenceFile> and returns reference to a hash containing following key/value pairs: $SequenceDataMapRef->{IDs} - Array of sequence IDs $SequenceDataMapRef->{Count} - Number of sequences $SequenceDataMapRef->{Description}{$ID} - Sequence description $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID $SequenceDataMapRef->{Sequence}{InputFileType} - File format =item B<ReadClustalWSequenceFile> $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile); Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value pairs as describes in B<ReadSequenceFile> method. =item B<ReadMSFSequenceFile> $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile); Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value pairs as describes in B<ReadSequenceFile> method. =item B<ReadPIRFastaSequenceFile> $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile); Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value pairs as describes in B<ReadSequenceFile> method. =item B<ReadPearsonFastaSequenceFile> $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile); Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value pairs as describes in B<ReadSequenceFile> method. =item B<RemoveSequenceGaps> $SeqWithoutGaps = RemoveSequenceGaps($Sequence); Removes gaps from I<Sequence> and return a sequence without any gaps. =item B<RemoveSequenceAlignmentGapColumns> $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns( $AlignmentDataMapRef); Using input alignment data map ref containing following keys, generate a new hash with same set of keys after residue columns containg only gaps have been removed: {IDs} : Array of IDs in order as they appear in file {Count}: ID count {Description}{$ID} : Description data {Sequence}{$ID} : Sequence data =item B<WritePearsonFastaSequenceFile> WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef, [$MaxLength]); Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is 80. =back =head1 AUTHOR Manish Sud <msud@san.rr.com> =head1 SEE ALSO PDBFileUtil.pm =head1 COPYRIGHT Copyright (C) 2015 Manish Sud. All rights reserved. This file is part of MayaChemTools. MayaChemTools is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. =cut