mayatool3_test2: lib/SequenceFileUtil.pm comparison

comparison lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip

Uploaded

author	deepakjadmin
date	Wed, 20 Jan 2016 09:23:18 -0500
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:4816e4a8ae95
+package SequenceFileUtil;
+#
+# $RCSfile: SequenceFileUtil.pm,v $
+# $Date: 2015/02/28 20:47:18 $
+# $Revision: 1.33 $
+#
+# Author: Manish Sud <msud@san.rr.com>
+#
+# Copyright (C) 2015 Manish Sud. All rights reserved.
+#
+# This file is part of MayaChemTools.
+#
+# MayaChemTools is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option) any
+# later version.
+#
+# MayaChemTools is distributed in the hope that it will be useful, but without
+# any warranty; without even the implied warranty of merchantability of fitness
+# for a particular purpose.  See the GNU Lesser General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
+# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
+# Boston, MA, 02111-1307, USA.
+#
+use strict;
+use Exporter;
+use Text::ParseWords;
+use TextUtil;
+use FileUtil;
+use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
+@ISA = qw(Exporter);
+@EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
+@EXPORT_OK = qw();
+%EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
+# Compare lengths of all sequences...
+sub AreSequenceLengthsIdentical {
+my($SequencesDataRef) = @_;
+my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
+$Status = 1;
+$FirstID = '';
+$FirstDifferentLenID = '';
+ID: for $ID (@{$SequencesDataRef->{IDs}}) {
+if (!$FirstID) {
+$FirstID = $ID;
+$FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
+next ID;
+}
+$SeqLen = length($SequencesDataRef->{Sequence}{$ID});
+if ($SeqLen != $FirstSeqLen) {
+$Status = 0;
+$FirstDifferentLenID = $ID;
+last ID;
+}
+}
+return ($Status);
+}
+# Calculate percent identity between two sequences. By default, gaps are ignored.
+sub CalcuatePercentSequenceIdentity {
+my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
+$PercentIdentity = '';
+$Precision = 1;
+$IgnoreGaps = 1;
+if (@_ == 4) {
+($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
+}
+elsif (@_ == 3) {
+($Sequence1, $Sequence2, $IgnoreGaps) = @_;
+}
+elsif (@_ == 2) {
+($Sequence1, $Sequence2) = @_;
+}
+else {
+return $PercentIdentity;
+}
+if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
+return $PercentIdentity;
+}
+my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
+$Sequence1Len = length($Sequence1);
+$Sequence2Len = length($Sequence2);
+$ResMatchCount = 0;
+$ResCount = 0;
+RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
+$Residue1 = substr($Sequence1, $Index, 1);
+$Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
+if ($IgnoreGaps) {
+if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
+	next RESIDUE;
+}
+}
+if ($Residue1 eq $Residue2) {
+$ResMatchCount++;
+}
+$ResCount++;
+}
+$Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
+$PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
+return $PercentIdentity;
+}
+# Calculate pairwise identify matrix for all the sequences and return a reference
+# to a hash with the following keys:
+#
+# {IDs} - Sequence IDs
+# {Count} - Number of IDs
+# {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
+#
+sub CalculatePercentSequenceIdentityMatrix {
+my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
+$IgnoreGaps = 1;
+$Precision = 1;
+if (@_ == 3) {
+($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
+}
+elsif (@_ == 2) {
+($SequencesDataRef, $IgnoreGaps) = @_;
+}
+else {
+($SequencesDataRef) = @_;
+}
+%IdentityMatrixData = ();
+@{$IdentityMatrixData{IDs}} = ();
+%{$IdentityMatrixData{PercentIdentity}} = ();
+$IdentityMatrixData{Count} = 0;
+for $ID (@{$SequencesDataRef->{IDs}}) {
+push @{$IdentityMatrixData{IDs}}, $ID;
+$IdentityMatrixData{Count} += 1;
+}
+# Initialize and calculate percent identity data values...
+for $RowID (@{$SequencesDataRef->{IDs}}) {
+%{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
+$RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
+for $ColID (@{$SequencesDataRef->{IDs}}) {
+$IdentityMatrixData{$RowID}{$ColID} = '';
+$ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
+$PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
+$IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
+}
+}
+return \%IdentityMatrixData;
+}
+# Retrieve information about shortest sequence...
+sub GetShortestSequence {
+my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
+$IgnoreGaps = 1;
+if (@_ == 2) {
+($SequencesDataRef, $IgnoreGaps) = @_;
+}
+else {
+($SequencesDataRef) = @_;
+}
+($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
+return ($ID, $Sequence, $SeqLen, $Description);
+}
+# Retrieve information about longest sequence..
+sub GetLongestSequence {
+my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
+$IgnoreGaps = 1;
+if (@_ == 2) {
+($SequencesDataRef, $IgnoreGaps) = @_;
+}
+else {
+($SequencesDataRef) = @_;
+}
+($ID, $Sequence, $SeqLen, $Description) =  _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
+return ($ID, $Sequence, $SeqLen, $Description);
+}
+# Get sequence length...
+sub GetSequenceLength {
+my($Seq, $SeqLen, $IgnoreGaps);
+$SeqLen = ''; $IgnoreGaps = 1;
+if (@_ == 2) {
+($Seq, $IgnoreGaps) = @_;
+}
+else {
+($Seq) = @_;
+}
+if ($IgnoreGaps) {
+my($Index, $Residue);
+$SeqLen = 0;
+for $Index (0 .. (length($Seq) - 1)) {
+$Residue = substr($Seq, $Index, 1);
+if ($Residue =~ /[A-Z]/i) {
+	$SeqLen++;
+}
+}
+}
+else {
+$SeqLen = length($Seq);
+}
+return $SeqLen;
+}
+# Is it a gap residue...
+sub IsGapResidue {
+my($Residue) = @_;
+my($Status);
+$Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
+return $Status;
+}
+# Is it a supported sequence file?
+#
+# Supported seqence formats are:
+#
+# ALN/ClustalW   .aln
+# GCG/MSF         .msf
+# PILEUP/MSF     .msf
+# Fasts(Pearson) .fasta, .fta
+# NBRF/PIR         .pir
+#
+sub IsSupportedSequenceFile {
+my($SequenceFile) = @_;
+my($Status, $SequenceFormat);
+$Status = 0; $SequenceFormat = 'NotSupported';
+SEQFORMAT: {
+if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
+if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
+if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
+if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
+$Status = 0; $SequenceFormat = 'NotSupported';
+}
+return ($Status, $SequenceFormat);
+}
+# Is it a ClustalW multiple sequence sequence file...
+sub IsClustalWSequenceFile {
+my($SequenceFile) = @_;
+my($Status, $Line);
+$Status = 0;
+open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
+$Line = GetTextLine(\*SEQUENCEFILE);
+$Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
+close SEQUENCEFILE;
+return $Status;
+}
+# Is it a valid Pearson fasta sequence or alignment file?
+#
+sub IsPearsonFastaSequenceFile {
+my($FastaFile, $Line, $Status);
+($FastaFile) = @_;
+$Status = 0;
+open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
+$Line = GetTextLine(\*FASTAFILE);
+# First line starts with > and the fourth character is not ';'; otherwise, it's
+# PIR FASTA format.
+if ($Line =~ /^>/) {
+my($FourthChar);
+$FourthChar = substr($Line, 3, 1);
+$Status = ($FourthChar !~ /\;/) ? 1 : 0;
+}
+close FASTAFILE;
+return $Status;
+}
+# Is it a valid NBRF/PIR fasta sequence or alignment file?
+#
+sub IsPIRFastaSequenceFile {
+my($FastaFile, $Line, $Status);
+($FastaFile) = @_;
+$Status = 0;
+open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
+$Line = GetTextLine(\*FASTAFILE);
+# First line starts with > and the fourth character is ';'; otherwise, it's
+# a Pearson FASTA format.
+if ($Line =~ /^>/) {
+my($FourthChar);
+$FourthChar = substr($Line, 3, 1);
+$Status = ($FourthChar =~ /\;/) ? 1 : 0;
+}
+close FASTAFILE;
+return $Status;
+}
+# Is it a valid MSF sequence or alignment file?
+#
+sub IsMSFSequenceFile {
+my($MSFFile) = @_;
+open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
+my($Line, $Status);
+$Status = 0;
+# Find a line that contains MSF: keyword and ends with '..'
+LINE: while ($Line = GetTextLine(\*MSFFILE)) {
+$Line = RemoveLeadingWhiteSpaces($Line);
+if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
+$Status = 1;
+last LINE;
+}
+elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
+# Pileup MSF...
+$Status = 1;
+last LINE;
+}
+}
+close MSFFILE;
+return $Status;
+}
+# Read sequence or sequence alignment file...
+sub ReadSequenceFile {
+my($SequenceFile) = @_;
+if (IsPearsonFastaSequenceFile($SequenceFile)) {
+return ReadPearsonFastaSequenceFile($SequenceFile);
+}
+elsif (IsPIRFastaSequenceFile($SequenceFile)) {
+return ReadPIRFastaSequenceFile($SequenceFile);
+}
+elsif (IsMSFSequenceFile($SequenceFile)) {
+return ReadMSFSequenceFile($SequenceFile);
+}
+elsif (IsClustalWSequenceFile($SequenceFile)) {
+return ReadClustalWSequenceFile($SequenceFile);
+}
+else {
+return undef;
+}
+}
+# Read file and setup alignment data...
+sub ReadClustalWSequenceFile {
+my($SequenceFile) = @_;
+return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
+}
+# Read file and setup alignment data...
+sub ReadPearsonFastaSequenceFile {
+my($SequenceFile) = @_;
+return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
+}
+# Read file and setup alignment data...
+sub ReadPIRFastaSequenceFile {
+my($SequenceFile) = @_;
+return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
+}
+# Read file and setup sequence data...
+sub ReadMSFSequenceFile {
+my($SequenceFile) = @_;
+return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
+}
+# Write out a Pearson FASTA file...
+sub WritePearsonFastaSequenceFile {
+my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
+$MaxLength = 80;
+if (@_ == 3) {
+($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
+}
+elsif (@_ == 2) {
+($SequenceFileName, $SequenceDataRef) = @_;
+}
+open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
+for $ID (@{$SequenceDataRef->{IDs}}) {
+$Description = $SequenceDataRef->{Description}{$ID};
+$Sequence = $SequenceDataRef->{Sequence}{$ID};
+$WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
+# Description also contains ID...
+print SEQUENCEFILE ">$Description\n";
+print SEQUENCEFILE "$WrappedSequence\n";
+}
+close SEQUENCEFILE;
+}
+# Get ID, Sequence and Length for smallest or longest sequence
+sub _GetShortestOrLongestSequence {
+my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
+my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
+($ID, $Seq, $SeqLen) = ('', '', '');
+$FirstID = '';
+ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
+$CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
+$CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
+$CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
+if (!$FirstID) {
+$FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
+($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
+next ID;
+}
+if ($CurrentSeqLen != $SeqLen) {
+if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
+	($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
+}
+elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
+	($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
+}
+}
+}
+return ($ID, $Seq, $SeqLen, $Description);
+}
+# Remove gaps in the sequence and return new sequence...
+sub RemoveSequenceGaps {
+my($Seq) = @_;
+my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
+$SeqWithoutGaps = '';
+$SeqLen = length($Seq);
+for $Index (0 .. ($SeqLen - 1)) {
+$Residue = substr($Seq, $Index, 1);
+if ($Residue =~ /[A-Z]/i) {
+$SeqWithoutGaps .= $Residue;
+}
+}
+return $SeqWithoutGaps;
+}
+# Using input alignment data map ref containing following keys, generate
+# a new hash with same set of keys after residue columns containg only
+# gaps have been removed:
+#
+# {IDs} : Array of IDs in order as they appear in file
+# {Count}: ID count...
+# {Description}{$ID} : Description data...
+# {Sequence}{$ID} : Sequence data...
+#
+sub RemoveSequenceAlignmentGapColumns {
+my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
+($AlignmentDataMapRef) = @_;
+%NewAlignmentDataMap = ();
+@{$NewAlignmentDataMap{IDs}} =();
+%{$NewAlignmentDataMap{Description}} =();
+%{$NewAlignmentDataMap{Sequence}} =();
+$NewAlignmentDataMap{Count} = 0;
+# Transfer ID and count information...
+for $ID (@{$AlignmentDataMapRef->{IDs}}) {
+push @{$NewAlignmentDataMap{IDs}}, $ID;
+$NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
+$NewAlignmentDataMap{Sequence}{$ID} = '';
+$NewAlignmentDataMap{Count} += 1;
+}
+# Go over residue columns and transfer the data...
+my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
+$FirstID = $AlignmentDataMapRef->{IDs}[0];
+$FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
+$FirstSeqLen = length($FirstSeq);
+RES: for $Index (0 .. ($FirstSeqLen - 1)) {
+# Is this a gap column?
+$GapColumn = 1;
+ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
+$Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
+if ($Res =~ /[A-Z]/i) {
+	$GapColumn = 0;
+	last ID;
+}
+}
+if ($GapColumn) {
+next RES;
+}
+# Transfer this residue...
+for $ID (@{$AlignmentDataMapRef->{IDs}}) {
+$Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
+$NewAlignmentDataMap{Sequence}{$ID} .= $Res;
+}
+}
+return (\%NewAlignmentDataMap);
+}
+#
+# Read sequences file and return a reference to hash with the following keys:
+#
+# {IDs} - Array of sequence IDs
+# {Count} - Number of sequences
+# {Description}{$ID} - Sequence description
+# {Sequence}{$ID} - Sequence for a specific ID
+# {InputFileType} - Sequence file format
+# {ConservedAnnotation} - Conserved residue annonation
+#
+# Note:
+#   . Conserved residue annotation either exist in the input sequence alignment file or set
+#     for a file containing same number of residues for all the sequence using the following
+#     notation: * - Residue conserved; ' ' - Residue not conserved.
+#
+sub _ReadFileAndSetupSequencesData {
+my($SequenceFile, $SequenceType) = @_;
+my($SequenceDataMapRef);
+$SequenceDataMapRef = undef;
+# Read sequence file...
+$SequenceDataMapRef = '';
+if ($SequenceType =~ /^ClustalW$/i) {
+$SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
+}
+elsif ($SequenceType =~ /^Pearson$/i) {
+$SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
+}
+elsif ($SequenceType =~ /^PIR$/i) {
+$SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
+}
+elsif ($SequenceType =~ /^MSF$/i) {
+$SequenceDataMapRef = _ReadMSFFile($SequenceFile);
+}
+else {
+return $SequenceDataMapRef;
+}
+if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
+return ($SequenceDataMapRef);
+}
+if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
+return ($SequenceDataMapRef);
+}
+# Use the first sequence to setup an empty ConservedAnnotation key...
+# And mark fully conserved residues...
+#
+my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
+$ID = $SequenceDataMapRef->{IDs}[0];
+$FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
+$FirstSeqLen = length($FirstSequence);
+$SequenceDataMapRef->{ConservedAnnotation} = '';
+for $Index (0 .. ($FirstSeqLen - 1)) {
+$FirstRes = '';
+$ResConserved = 1;
+ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
+$Sequence = $SequenceDataMapRef->{Sequence}{$ID};
+$Res = substr($Sequence, $Index, 1);
+if (!$FirstRes) {
+	$FirstRes = $Res;
+	next ID;
+}
+if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
+	$ResConserved = 0;
+	last ID;
+}
+}
+if ($ResConserved) {
+$SequenceDataMapRef->{ConservedAnnotation} .= '*';
+}
+else {
+$SequenceDataMapRef->{ConservedAnnotation} .= ' ';
+}
+}
+return ($SequenceDataMapRef);
+}
+# Read sequence data in ClustalW multiple sequence alignment file and
+# return a reference to hash with these keys and values:
+#
+# {IDs} - Array of sequence IDs
+# {Count} - Number of sequences
+# {Description}{$ID} - Sequence description
+# {Sequence}{$ID} - Sequence for a specific ID
+# {InputFileType} - Sequence file format
+# {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
+#
+#
+#
+# And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
+#
+# '*' indicates positions which have a single, fully conserved residue
+#
+# ':' indicates that one of the following 'strong' groups is fully conserved: STA
+#    NEQK NHQK NDEQ QHRK MILV MILF HY FYW
+# '.' indicates that one of the following 'weaker' groups is fully conserved:
+#     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
+#
+# These are all the positively scoring groups that occur in the Gonnet Pam250
+# matrix. The strong and weak groups are defined as strong score >0.5 and weak
+# score =<0.5 respectively.
+#
+sub _ReadClustalWFile {
+my($SequenceFile) = @_;
+my(%SequencesDataMap);
+# Initialize data...
+%SequencesDataMap = ();
+@{$SequencesDataMap{IDs}} = ();
+%{$SequencesDataMap{Description}} = ();
+%{$SequencesDataMap{Sequence}} = ();
+$SequencesDataMap{Count} = 0;
+$SequencesDataMap{ConservedAnnotation} = '';
+$SequencesDataMap{InputFileType} = 'ClustalW';
+open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
+my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
+# Ignore the header line...
+$Line = <SEQUENCEFILE>;
+LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
+if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
+# Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
+# Extract it and save...
+$LineLength = length($Line);
+$AnnotationStart = $LineLength - $SequenceLength;
+$AnnotationLength = $SequenceLength;
+$Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
+$SequencesDataMap{ConservedAnnotation} .= $Annotation;
+}
+else {
+# Extract ID and sequences...
+($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
+$Sequence =~ s/ //g;
+if (!($ID && $Sequence)) {
+	next LINE;
+}
+if (exists $SequencesDataMap{Sequence}{$ID}) {
+	# Append to existing alignment value...
+	$SequenceLength = length($Sequence);
+	$SequencesDataMap{Sequence}{$ID} .= $Sequence;
+}
+else {
+	# New alignment data...
+	$SequencesDataMap{Count} += 1;
+	push @{$SequencesDataMap{IDs}}, $ID;
+	$SequencesDataMap{Description}{$ID} = $ID;
+	$SequencesDataMap{Sequence}{$ID} = $Sequence;
+	$SequenceLength = length($Sequence);
+}
+}
+}
+close SEQUENCEFILE;
+return (\%SequencesDataMap);
+}
+# Read Pearson fasta file and return a reference to hash with these keys:
+#
+# {IDs} - Array of sequence IDs
+# {Count} - Number of sequences
+# {Description}{$ID} - Sequence description
+# {Sequence}{$ID} - Sequence for a specific ID
+# {InputFileType} - Sequence file format
+# {ConservedAnnotation} - Conserved residue annonation
+#
+sub _ReadPearsonFastaFile {
+my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
+($FastaFileName) = @_;
+%FastaDataMap = ();
+@{$FastaDataMap{IDs}} =();
+%{$FastaDataMap{Description}} =();
+%{$FastaDataMap{Sequence}} =();
+$FastaDataMap{Count} = 0;
+$FastaDataMap{InputFileType} = 'Pearson';
+open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
+$ID = '';
+$IgnoreID = 0;
+LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
+if ($Line =~ /^\>/) {
+# Start of a new ID...
+$Line =~ s/^\>//;
+$Line = RemoveLeadingWhiteSpaces($Line);
+@LineWords = ();
+@LineWords = split / /, $Line;
+$ID = $LineWords[0];
+$ID =~ s/ //g;
+$Description = $Line;
+$IgnoreID = 0;
+if (exists $FastaDataMap{Sequence}{$ID}) {
+	$IgnoreID = 1;
+	warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
+	next LINE;
+}
+push @{$FastaDataMap{IDs}}, $ID;
+$FastaDataMap{Description}{$ID} = $Description;
+$FastaDataMap{Count} += 1;
+next LINE;
+}
+if ($IgnoreID) { next LINE; }
+# Remove any spaces in the sequence...
+$Line =~ s/ //g;
+# Sequence data for active ID...
+if (exists $FastaDataMap{Sequence}{$ID}) {
+$FastaDataMap{Sequence}{$ID} .= $Line;
+}
+else {
+$FastaDataMap{Sequence}{$ID} = $Line;
+}
+}
+close FASTAFILE;
+return \%FastaDataMap;
+}
+# Read PIR fasta file and return a reference to hash with these keys:
+#
+# {IDs} - Array of sequence IDs
+# {Count} - Number of sequences
+# {Description}{$ID} - Sequence description
+# {Sequence}{$ID} - Sequence for a specific ID
+# {InputFileType} - Sequence file format
+# {ConservedAnnotation} - Conserved residue annonation
+#
+# Format:
+# A sequence in PIR format consists of:
+# One line starting with
+#   a ">" (greater-than) sign, followed by
+#   a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
+#   a semicolon, followed by
+#   the sequence identification code (the database ID-code).
+# One line containing a textual description of the sequence.
+# One or more lines containing the sequence itself. The end of the
+# sequence is marked by a "*" (asterisk) character.
+#
+# A file in PIR format may comprise more than one sequence.
+#
+# The PIR format is also often referred to as the NBRF format.
+#
+# Code SequenceType
+# P1    Protein (complete)
+# F1    Protein (fragment)
+# DL    DNA (linear)
+# DC    DNA (circular)
+# RL    RNA (linear)
+# RC   RNA (circular)
+# N3    tRNA
+# N1    Other functional RNA
+#
+sub _ReadPIRFastaFile {
+my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
+($FastaFileName) = @_;
+%FastaDataMap = ();
+@{$FastaDataMap{IDs}} =();
+%{$FastaDataMap{Description}} =();
+%{$FastaDataMap{Sequence}} =();
+%{$FastaDataMap{SequenceTypeCode}} =();
+$FastaDataMap{Count} = 0;
+$FastaDataMap{InputFileType} = 'PIR';
+open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
+$ID = '';
+$ReadingSequenceData = 0;
+LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
+if ($Line =~ /^\>/) {
+# Start of a new ID...
+$Line =~ s/^\>//;
+$Line = RemoveLeadingWhiteSpaces($Line);
+($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
+# Use next line to retrieve sequence description...
+$Line = GetTextLine(\*FASTAFILE);
+$Line = RemoveLeadingWhiteSpaces($Line);
+$Description = $Line;
+if (exists $FastaDataMap{Sequence}{$ID}) {
+	warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
+	next LINE;
+}
+$ReadingSequenceData = 1;
+push @{$FastaDataMap{IDs}}, $ID;
+$FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
+$FastaDataMap{Description}{$ID} = $Description;
+$FastaDataMap{Count} += 1;
+next LINE;
+}
+if (!$ReadingSequenceData) { next LINE; }
+# Remove any spaces in the sequence...
+$Line =~ s/ //g;
+if ($Line =~ /[\*]$/) {
+# End of sequence...
+$ReadingSequenceData = 0;
+$Line =~ s/[\*]$//;
+}
+# Sequence data for active ID...
+if (exists $FastaDataMap{Sequence}{$ID}) {
+$FastaDataMap{Sequence}{$ID} .= $Line;
+}
+else {
+$FastaDataMap{Sequence}{$ID} = $Line;
+}
+}
+close FASTAFILE;
+return \%FastaDataMap;
+}
+# Read MSF file and return a reference to hash with these keys:
+#
+# {IDs} : Array of IDs in order as they appear in file
+# {Count}: ID count...
+# {Description}{$ID} : Description data...
+# {Sequence}{$ID} : Sequence data...
+#
+sub _ReadMSFFile {
+my($MSFFileName, $Line, @LineWords, %MSFDataMap);
+($MSFFileName) = @_;
+%MSFDataMap = ();
+@{$MSFDataMap{IDs}} =();
+%{$MSFDataMap{Description}} =();
+%{$MSFDataMap{Sequence}} =();
+$MSFDataMap{Count} = 0;
+$MSFDataMap{InputFileType} = 'MSF';
+open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
+# Collect sequences and IDs...
+#
+# '//' after the name fields indicates end of header list and start of sequence data.
+#
+my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
+%MSFIDsMap = ();
+$NameFieldsFound = 0;
+LINE: while ($Line = GetTextLine(\*MSFFILE)) {
+if ($Line =~ /Name:/) {
+$NameFieldsFound++;
+($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
+if ($ID =~ / /) {
+	($ID) = $ID =~ /^(.*?)[ ]+/
+}
+if (exists $MSFIDsMap{$ID}) {
+	warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
+	next LINE;
+}
+$MSFIDsMap{$ID} = $ID;
+push @{$MSFDataMap{IDs}}, $ID;
+$MSFDataMap{Description}{$ID} = $ID;
+$MSFDataMap{Count} += 1;
+}
+elsif ( /\/\// && $NameFieldsFound) {
+# End of header list...
+last LINE;
+}
+}
+# Collect all sequences...
+#
+my($FirstField, $SecondField);
+while ($Line = GetTextLine(\*MSFFILE)) {
+($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
+if (exists $MSFIDsMap{$FirstField}) {
+# It's ID and sequence data...
+$ID = $FirstField;
+$Sequence = $SecondField;
+# Take out spaces and leave the gap characters...
+$Sequence =~ s/ //g;
+if ($MSFDataMap{Sequence}{$ID}) {
+	$MSFDataMap{Sequence}{$ID} .= $Sequence;
+}
+else {
+	$MSFDataMap{Sequence}{$ID} = $Sequence;
+}
+}
+}
+close MSFFILE;
+return \%MSFDataMap;
+}
+1;
+__END__
+=head1 NAME
+SequenceFileUtil
+=head1 SYNOPSIS
+use SequenceFileUtil ;
+use SequenceFileUtil qw(:all);
+=head1 DESCRIPTION
+B<SequenceFileUtil> module provides the following functions:
+AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity,
+CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength,
+GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile,
+IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile,
+ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile,
+ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns,
+RemoveSequenceGaps, WritePearsonFastaSequenceFile
+SequenceFileUtil module provides various methods to process sequence
+files and retreive appropriate information.
+=head1 FUNCTIONS
+=over 4
+=item B<AreSequenceLengthsIdentical>
+$Status = AreSequenceLengthsIdentical($SequencesDataRef);
+Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1
+or 0 based whether lengths of all the sequence is same.
+=item B<CalcuatePercentSequenceIdentity>
+$PercentIdentity =
+AreSequenceLengthsIdenticalAreSequenceLengthsIdentical(
+$Sequence1, $Sequence2, [$IgnoreGaps, $Precision]);
+Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments
+I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the
+returned value. By default, gaps are ignored and precision is set up to 1 decimal.
+=item B<CalculatePercentSequenceIdentityMatrix>
+$IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix(
+$SequencesDataRef, [$IgnoreGaps,
+$Precision]);
+Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef>
+and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and
+I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps
+are ignored and precision is set up to 1 decimal.
+=item B<GetSequenceLength>
+$SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]);
+Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling
+of gaps. By default, gaps are ignored.
+=item B<GetShortestSequence>
+($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence(
+$SequencesDataRef, [$IgnoreGaps]);
+Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID,
+$Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps
+controls handling of gaps in sequences. By default, gaps are ignored.
+=item B<GetLongestSequence>
+($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence(
+$SequencesDataRef, [$IgnoreGaps]);
+Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>,
+B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument
+$I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored.
+=item B<IsGapResidue>
+$Status = AreSequenceLengthsIdentical($Residue);
+Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is
+considered a gap residue.
+=item B<IsSupportedSequenceFile>
+$Status = IsSupportedSequenceFile($SequenceFile);
+Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence
+format.
+=item B<IsClustalWSequenceFile>
+$Status = IsClustalWSequenceFile($SequenceFile);
+Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment
+format.
+=item B<IsPearsonFastaSequenceFile>
+$Status = IsPearsonFastaSequenceFile($SequenceFile);
+Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence
+format.
+=item B<IsPIRFastaSequenceFile>
+$Status = IsPIRFastaSequenceFile($SequenceFile);
+Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence
+format.
+=item B<IsMSFSequenceFile>
+$Status = IsClustalWSequenceFile($SequenceFile);
+Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment
+format.
+=item B<ReadSequenceFile>
+$SequenceDataMapRef = ReadSequenceFile($SequenceFile);
+Reads I<SequenceFile> and returns reference to a hash containing following key/value
+pairs:
+$SequenceDataMapRef->{IDs} - Array of sequence IDs
+$SequenceDataMapRef->{Count} - Number of sequences
+$SequenceDataMapRef->{Description}{$ID} - Sequence description
+$SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID
+$SequenceDataMapRef->{Sequence}{InputFileType} - File format
+=item B<ReadClustalWSequenceFile>
+$SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile);
+Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value
+pairs as describes in B<ReadSequenceFile> method.
+=item B<ReadMSFSequenceFile>
+$SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile);
+Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value
+pairs as describes in B<ReadSequenceFile> method.
+=item B<ReadPIRFastaSequenceFile>
+$SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile);
+Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value
+pairs as describes in B<ReadSequenceFile> method.
+=item B<ReadPearsonFastaSequenceFile>
+$SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile);
+Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value
+pairs as describes in B<ReadSequenceFile> method.
+=item B<RemoveSequenceGaps>
+$SeqWithoutGaps = RemoveSequenceGaps($Sequence);
+Removes gaps from I<Sequence> and return a sequence without any gaps.
+=item B<RemoveSequenceAlignmentGapColumns>
+$NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns(
+$AlignmentDataMapRef);
+Using input alignment data map ref containing following keys, generate a new hash with
+same set of keys after residue columns containg only gaps have been removed:
+{IDs} : Array of IDs in order as they appear in file
+{Count}: ID count
+{Description}{$ID} : Description data
+{Sequence}{$ID} : Sequence data
+=item B<WritePearsonFastaSequenceFile>
+WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef,
+[$MaxLength]);
+Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence
+file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is
+80.
+=back
+=head1 AUTHOR
+Manish Sud <msud@san.rr.com>
+=head1 SEE ALSO
+PDBFileUtil.pm
+=head1 COPYRIGHT
+Copyright (C) 2015 Manish Sud. All rights reserved.
+This file is part of MayaChemTools.
+MayaChemTools is free software; you can redistribute it and/or modify it under
+the terms of the GNU Lesser General Public License as published by the Free
+Software Foundation; either version 3 of the License, or (at your option)
+any later version.
+=cut

Mercurial > repos > deepakjadmin > mayatool3_test2

comparison lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip