annotate lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1 package SequenceFileUtil;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
2 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: SequenceFileUtil.pm,v $
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:47:18 $
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.33 $
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
6 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
8 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
10 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
12 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
17 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability of fitness
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
22 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
27 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
28
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
30 use Exporter;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
31 use Text::ParseWords;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
32 use TextUtil;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
33 use FileUtil;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
34
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
36
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
37 @ISA = qw(Exporter);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
39 @EXPORT_OK = qw();
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
40
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
42
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Compare lengths of all sequences.
#
# Parameters:
#   $SequencesDataRef - reference to a hash providing {IDs} (array reference of
#                       sequence IDs) and {Sequence}{$ID} (sequence string per ID).
#
# Returns 1 when every sequence has the same string length as the first one (this
# includes the cases of zero or one sequence) and 0 as soon as a sequence with a
# different length is found. Lengths are raw string lengths; gap characters count.
#
sub AreSequenceLengthsIdentical {
  my($SequencesDataRef) = @_;
  my($SeenFirst, $FirstSeqLen);

  $SeenFirst = 0;
  for my $ID (@{$SequencesDataRef->{IDs}}) {
    my $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
    if (!$SeenFirst) {
      # Use an explicit flag instead of testing the truth of the first ID: an ID
      # such as '0' is false in Perl and would otherwise never be remembered as the
      # reference sequence.
      $SeenFirst = 1;
      $FirstSeqLen = $SeqLen;
      next;
    }
    if ($SeqLen != $FirstSeqLen) {
      return (0);
    }
  }
  return (1);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
67
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Calculate percent identity between two sequences. By default, gaps are ignored.
#
# Call forms:
#   CalcuatePercentSequenceIdentity($Sequence1, $Sequence2)
#   CalcuatePercentSequenceIdentity($Sequence1, $Sequence2, $IgnoreGaps)
#   CalcuatePercentSequenceIdentity($Sequence1, $Sequence2, $IgnoreGaps, $Precision)
#
# $IgnoreGaps defaults to 1 and $Precision (number of decimal places) defaults to 1.
#
# Returns the percent identity formatted with $Precision decimal places. An empty
# string is returned for any other argument count, or when IsNotEmpty rejects either
# sequence.
#
# The comparison walks the positions of the first sequence; positions past the end of
# the second sequence are compared against an empty residue. With $IgnoreGaps set,
# positions where either residue is not a letter are left out of the calculation.
#
sub CalcuatePercentSequenceIdentity {
  my $ArgCount = scalar(@_);

  # Only two, three, or four arguments are accepted.
  return '' if ($ArgCount < 2 || $ArgCount > 4);

  my($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
  $IgnoreGaps = 1 if ($ArgCount < 3);
  $Precision = 1 if ($ArgCount < 4);

  return '' if (!IsNotEmpty($Sequence1) || !IsNotEmpty($Sequence2));

  my @FirstResidues = split(//, $Sequence1);
  my @SecondResidues = split(//, $Sequence2);

  my($Compared, $Matched) = (0, 0);
  for my $Position (0 .. $#FirstResidues) {
    my $First = $FirstResidues[$Position];
    my $Second = ($Position <= $#SecondResidues) ? $SecondResidues[$Position] : '';

    # Skip the position unless both residues are letters when gaps are ignored.
    next if ($IgnoreGaps && !($First =~ /[A-Z]/i && $Second =~ /[A-Z]/i));

    $Compared++;
    $Matched++ if ($First eq $Second);
  }

  my $Fraction = $Compared ? ($Matched/$Compared) : 0.0;

  return sprintf("%.${Precision}f", ($Fraction * 100));
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
115
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Calculate pairwise identity matrix for all the sequences and return a reference
# to a hash with the following keys:
#
# {IDs} - Sequence IDs
# {Count} - Number of IDs
# {PercentIdentity}{$RowID}{$ColID} - Percent identity for a pair of sequences
#
# Call forms:
#   CalculatePercentSequenceIdentityMatrix($SequencesDataRef)
#   CalculatePercentSequenceIdentityMatrix($SequencesDataRef, $IgnoreGaps)
#   CalculatePercentSequenceIdentityMatrix($SequencesDataRef, $IgnoreGaps, $Precision)
#
# $SequencesDataRef is a reference to a hash providing {IDs} (array reference of
# sequence IDs) and {Sequence}{$ID}. $IgnoreGaps and $Precision both default to 1 and
# are handed unchanged to CalcuatePercentSequenceIdentity for every pair.
#
sub CalculatePercentSequenceIdentityMatrix {
  my($SequencesDataRef, $IgnoreGaps, $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);

  $IgnoreGaps = 1;
  $Precision = 1;
  if (@_ == 3) {
    ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
  }
  elsif (@_ == 2) {
    ($SequencesDataRef, $IgnoreGaps) = @_;
  }
  else {
    ($SequencesDataRef) = @_;
  }

  %IdentityMatrixData = ();
  @{$IdentityMatrixData{IDs}} = ();
  %{$IdentityMatrixData{PercentIdentity}} = ();
  $IdentityMatrixData{Count} = 0;

  for $ID (@{$SequencesDataRef->{IDs}}) {
    push @{$IdentityMatrixData{IDs}}, $ID;
    $IdentityMatrixData{Count} += 1;
  }

  # Initialize and calculate percent identity data values. All values live under
  # {PercentIdentity}; nothing is stored at the top level under a sequence ID, so
  # an ID that happens to be named 'IDs' or 'Count' cannot collide with the
  # bookkeeping keys.
  for $RowID (@{$SequencesDataRef->{IDs}}) {
    %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
    $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
    for $ColID (@{$SequencesDataRef->{IDs}}) {
      $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
      $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
      $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
    }
  }
  return \%IdentityMatrixData;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
160
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about the shortest sequence.
#
# Call forms:
#   GetShortestSequence($SequencesDataRef)
#   GetShortestSequence($SequencesDataRef, $IgnoreGaps)
#
# $IgnoreGaps defaults to 1; it is taken from the caller only when exactly two
# arguments are supplied.
#
# Returns the four-element list ($ID, $Sequence, $SeqLen, $Description) obtained from
# _GetShortestOrLongestSequence called in 'Shortest' mode.
#
sub GetShortestSequence {
  my $SequencesDataRef = $_[0];
  my $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  my($SeqID, $SeqString, $Length, $SeqDescription) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);

  return ($SeqID, $SeqString, $Length, $SeqDescription);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
176
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about the longest sequence.
#
# Call forms:
#   GetLongestSequence($SequencesDataRef)
#   GetLongestSequence($SequencesDataRef, $IgnoreGaps)
#
# $IgnoreGaps defaults to 1; it is taken from the caller only when exactly two
# arguments are supplied.
#
# Returns the four-element list ($ID, $Sequence, $SeqLen, $Description) obtained from
# _GetShortestOrLongestSequence called in 'Longest' mode.
#
sub GetLongestSequence {
  my $SequencesDataRef = $_[0];
  my $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  my($SeqID, $SeqString, $Length, $SeqDescription) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);

  return ($SeqID, $SeqString, $Length, $SeqDescription);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
192
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Get sequence length.
#
# Call forms:
#   GetSequenceLength($Seq)
#   GetSequenceLength($Seq, $IgnoreGaps)
#
# $IgnoreGaps defaults to 1; it is taken from the caller only when exactly two
# arguments are supplied.
#
# With $IgnoreGaps set, returns the number of characters in $Seq that are letters
# (A-Z, either case). Otherwise returns the plain string length of $Seq.
#
sub GetSequenceLength {
  my $Seq = $_[0];
  my $IgnoreGaps = (@_ == 2) ? $_[1] : 1;

  # Gaps count towards the length: report the raw string length.
  return length($Seq) if (!$IgnoreGaps);

  # Count only the letter characters.
  my $ResidueCount = scalar(grep { /[A-Z]/i } split(//, $Seq));

  return $ResidueCount;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
220
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a gap residue?
#
# Returns 0 when $Residue contains a letter (A-Z, either case) and 1 otherwise, so
# any non-letter character, as well as an empty string, is reported as a gap.
#
sub IsGapResidue {
  my $Residue = shift;

  return 0 if ($Residue =~ /[A-Z]/i);

  return 1;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
230
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a supported sequence file?
#
# Supported sequence formats are:
#
# ALN/ClustalW .aln
# GCG/MSF .msf
# PILEUP/MSF .msf
# Fasta (Pearson) .fasta, .fta
# NBRF/PIR .pir
#
# The format is determined by the individual Is...SequenceFile checks, tried in the
# order ClustalW, Pearson, PIR, MSF; the first check that succeeds wins.
#
# Returns the list ($Status, $SequenceFormat): (1, <format name>) for a recognized
# file and (0, 'NotSupported') otherwise.
#
sub IsSupportedSequenceFile {
  my $SequenceFile = shift;

  return (1, 'ClustalW') if IsClustalWSequenceFile($SequenceFile);
  return (1, 'Pearson') if IsPearsonFastaSequenceFile($SequenceFile);
  return (1, 'PIR') if IsPIRFastaSequenceFile($SequenceFile);
  return (1, 'MSF') if IsMSFSequenceFile($SequenceFile);

  return (0, 'NotSupported');
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
255
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a ClustalW multiple sequence alignment file?
#
# Reads the first line of $SequenceFile through GetTextLine and returns 1 when that
# line contains the word "Clustal" (case-insensitive), 0 otherwise. Dies when the
# file cannot be opened.
#
sub IsClustalWSequenceFile {
  my($SequenceFile) = @_;
  my($Status, $Line);

  # Three-argument open with a lexical handle: the file name is never interpreted as
  # an open mode or a command, and no package-global handle is shared between calls.
  # The lexical handle is a glob reference, just like the \*HANDLE form.
  open(my $SequenceFH, '<', $SequenceFile) or die "Couldn't open $SequenceFile: $!\n";
  $Line = GetTextLine($SequenceFH);
  close($SequenceFH);

  # An empty file yields no first line at all.
  $Status = (defined($Line) && $Line =~ /(ClustalW|Clustal W|Clustal)/i) ? 1 : 0;

  return $Status;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
270
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a valid Pearson fasta sequence or alignment file?
#
# Reads the first line of $FastaFile through GetTextLine. Returns 1 when that line
# starts with '>' and its fourth character is not ';' (a ';' in that position marks
# the PIR fasta format); returns 0 otherwise. Dies when the file cannot be opened.
#
sub IsPearsonFastaSequenceFile {
  my($FastaFile) = @_;
  my($Line, $Status);

  $Status = 0;

  # Three-argument open with a lexical handle: the file name is never interpreted as
  # an open mode or a command, and no package-global handle is shared between calls.
  # The lexical handle is a glob reference, just like the \*HANDLE form.
  open(my $FastaFH, '<', $FastaFile) or die "Couldn't open $FastaFile: $!\n";
  $Line = GetTextLine($FastaFH);
  close($FastaFH);

  # First line starts with > and the fourth character is not ';'; otherwise, it's
  # PIR FASTA format. An empty file yields no first line at all.
  if (defined($Line) && $Line =~ /^>/) {
    # A header shorter than four characters has no fourth character to inspect.
    my $FourthChar = (length($Line) > 3) ? substr($Line, 3, 1) : '';
    $Status = ($FourthChar !~ /\;/) ? 1 : 0;
  }

  return $Status;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
293
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a valid NBRF/PIR fasta sequence or alignment file?
#
# Reads the first line of $FastaFile through GetTextLine. Returns 1 when that line
# starts with '>' and its fourth character is ';'; returns 0 otherwise (a '>' header
# without the ';' is the Pearson fasta format). Dies when the file cannot be opened.
#
sub IsPIRFastaSequenceFile {
  my($FastaFile) = @_;
  my($Line, $Status);

  $Status = 0;

  # Three-argument open with a lexical handle: the file name is never interpreted as
  # an open mode or a command, and no package-global handle is shared between calls.
  # The lexical handle is a glob reference, just like the \*HANDLE form.
  open(my $FastaFH, '<', $FastaFile) or die "Couldn't open $FastaFile: $!\n";
  $Line = GetTextLine($FastaFH);
  close($FastaFH);

  # First line starts with > and the fourth character is ';'; otherwise, it's
  # a Pearson FASTA format. An empty file yields no first line at all.
  if (defined($Line) && $Line =~ /^>/) {
    # A header shorter than four characters has no fourth character to inspect.
    my $FourthChar = (length($Line) > 3) ? substr($Line, 3, 1) : '';
    $Status = ($FourthChar =~ /\;/) ? 1 : 0;
  }

  return $Status;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
316
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Is it a valid MSF sequence or alignment file?
#
# Scans $MSFFile line by line through GetTextLine and returns 1 as soon as a line
# either contains the "MSF:" keyword and ends with '..' (optionally followed by
# spaces), or contains one of the PILEUP/multiple alignment markers. Returns 0 when
# no such line is found. Dies when the file cannot be opened.
#
sub IsMSFSequenceFile {
  my($MSFFile) = @_;
  my($Line, $Status);

  # Three-argument open with a lexical handle: the file name is never interpreted as
  # an open mode or a command, and no package-global handle is shared between calls.
  # The lexical handle is a glob reference, just like the \*HANDLE form.
  open(my $MSFFH, '<', $MSFFile) or die "Couldn't open $MSFFile: $!\n";

  $Status = 0;
  # Find a line that contains MSF: keyword and ends with '..'
  #
  # NOTE(review): the loop stops at the first false value returned by GetTextLine.
  # That is presumably end of file, but it would also be an empty line if
  # GetTextLine strips line endings - confirm before relying on headers that
  # follow blank lines.
  LINE: while ($Line = GetTextLine($MSFFH)) {
    $Line = RemoveLeadingWhiteSpaces($Line);
    if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
      $Status = 1;
      last LINE;
    }
    elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
      # Pileup MSF...
      $Status = 1;
      last LINE;
    }
  }
  close($MSFFH);

  return $Status;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
344
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read sequence or sequence alignment file.
#
# Determines the format of $SequenceFile by trying the format checks in the order
# Pearson fasta, PIR fasta, MSF, ClustalW, and returns whatever the matching
# Read...SequenceFile function returns. Returns undef when no check recognizes the
# file.
#
sub ReadSequenceFile {
  my $SequenceFile = shift;

  return ReadPearsonFastaSequenceFile($SequenceFile) if IsPearsonFastaSequenceFile($SequenceFile);
  return ReadPIRFastaSequenceFile($SequenceFile) if IsPIRFastaSequenceFile($SequenceFile);
  return ReadMSFSequenceFile($SequenceFile) if IsMSFSequenceFile($SequenceFile);
  return ReadClustalWSequenceFile($SequenceFile) if IsClustalWSequenceFile($SequenceFile);

  # Unrecognized format.
  return undef;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
365
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read a ClustalW multiple sequence alignment file and return the value
# generated by _ReadFileAndSetupSequencesData for the 'ClustalW' format...
sub ReadClustalWSequenceFile {
  my $SequenceFile = shift;
  my $SequenceType = 'ClustalW';

  return _ReadFileAndSetupSequencesData($SequenceFile, $SequenceType);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
372
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read a Pearson FASTA sequence file and return the value generated by
# _ReadFileAndSetupSequencesData for the 'Pearson' format...
sub ReadPearsonFastaSequenceFile {
  my $SequenceFile = shift;
  my $SequenceType = 'Pearson';

  return _ReadFileAndSetupSequencesData($SequenceFile, $SequenceType);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
379
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read a PIR FASTA sequence file and return the value generated by
# _ReadFileAndSetupSequencesData for the 'PIR' format...
sub ReadPIRFastaSequenceFile {
  my $SequenceFile = shift;
  my $SequenceType = 'PIR';

  return _ReadFileAndSetupSequencesData($SequenceFile, $SequenceType);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
386
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
387
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read a MSF sequence file and return the value generated by
# _ReadFileAndSetupSequencesData for the 'MSF' format...
sub ReadMSFSequenceFile {
  my $SequenceFile = shift;
  my $SequenceType = 'MSF';

  return _ReadFileAndSetupSequencesData($SequenceFile, $SequenceType);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
394
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Write out a Pearson FASTA file.
#
# Arguments:
#
#   $SequenceFileName - Name of the file to write; it is opened in '>' mode and
#                       any existing contents are replaced.
#   $SequenceDataRef  - Reference to a hash containing these keys: {IDs},
#                       {Description}{$ID} and {Sequence}{$ID}.
#   $MaxLength        - Optional third argument passed to WrapText to wrap the
#                       sequence data; 80 is used when it is not specified.
#
# For each ID, in the order listed in {IDs}, a ">Description" line is written
# followed by the wrapped sequence. The description is expected to contain the ID.
#
# Dies when the number of arguments is not 2 or 3, or the file can't be opened
# or closed.
#
sub WritePearsonFastaSequenceFile {
  my($SequenceFileName, $SequenceDataRef, $MaxLength);

  $MaxLength = 80;
  if (@_ == 3) {
    ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
  }
  elsif (@_ == 2) {
    ($SequenceFileName, $SequenceDataRef) = @_;
  }

  # Fail early with a clear message when no file name is available instead of
  # relying on a failed open of an empty name...
  if (!defined $SequenceFileName) {
    die "Can't open sequence file: File name is not specified\n";
  }

  # Three argument open with a lexical file handle: characters in the file name
  # are never interpreted as an open mode and no global file handle is used...
  open my $SequenceFileHandle, '>', $SequenceFileName or die "Can't open $SequenceFileName: $!\n";

  for my $ID (@{$SequenceDataRef->{IDs}}) {
    my $Description = $SequenceDataRef->{Description}{$ID};
    my $Sequence = $SequenceDataRef->{Sequence}{$ID};
    my $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");

    # Description also contains ID...
    print {$SequenceFileHandle} ">$Description\n";
    print {$SequenceFileHandle} "$WrappedSequence\n";
  }

  # Errors for buffered writes may only show up during close...
  close $SequenceFileHandle or die "Can't close $SequenceFileName: $!\n";
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
418
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Get ID, sequence, length and description for the shortest or longest sequence.
#
# Arguments:
#
#   $SequencesDataRef - Reference to a hash containing these keys: {IDs},
#                       {Sequence}{$ID} and {Description}{$ID}.
#   $SequenceType     - Selection mode matched case insensitively against
#                       'Shortest' or 'Longest'.
#   $IgnoreGaps       - When true, RemoveSequenceGaps is applied to each sequence
#                       before its length is retrieved; the flag is also passed
#                       to GetSequenceLength.
#
# Returns a list: ($ID, $Seq, $SeqLen, $Description). The first sequence
# encountered is kept for ties. For an empty {IDs} list, the first three values
# are empty strings and the description is undefined.
#
sub _GetShortestOrLongestSequence {
  my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
  my($ID, $Seq, $SeqLen, $Description, $FirstSequenceProcessed);

  ($ID, $Seq, $SeqLen) = ('', '', '');

  # Track the first sequence using an explicit flag. Relying on the value of an
  # ID to detect it caused the second sequence to replace the first one without
  # any length comparison...
  $FirstSequenceProcessed = 0;

  ID: for my $CurrentID (@{$SequencesDataRef->{IDs}}) {
    my $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
    my $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
    my $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};

    if (!$FirstSequenceProcessed) {
      # Start with the first sequence as the current selection...
      $FirstSequenceProcessed = 1;
      ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
      next ID;
    }

    # Strict comparisons keep the earlier sequence for identical lengths...
    if ((($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) || (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen))) {
      ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
    }
  }
  return ($ID, $Seq, $SeqLen, $Description);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
447
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Remove gaps in the sequence and return new sequence...
#
# Any character other than a letter (A to Z, upper or lower case) is treated as
# a gap and dropped; the remaining residues are returned in their original order
# and case. An empty string is returned when the sequence contains no residues.
#
sub RemoveSequenceGaps {
  my($Seq) = @_;

  # A global match in list context returns all the matched residues...
  my @Residues = $Seq =~ /[A-Z]/gi;

  return join('', @Residues);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
464
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Using input alignment data map ref containing following keys, generate
# a new hash with same set of keys after residue columns containing only
# gaps have been removed:
#
# {IDs} : Array of IDs in order as they appear in file
# {Count}: ID count...
# {Description}{$ID} : Description data...
# {Sequence}{$ID} : Sequence data...
#
# A column is treated as a gap column when none of the sequences contains a
# letter at that position. The number of columns is taken from the sequence
# for the first ID. The input data map is not modified; a reference to the new
# hash is returned.
#
sub RemoveSequenceAlignmentGapColumns {
  my($AlignmentDataMapRef) = @_;

  my %NewAlignmentDataMap = (
    IDs => [],
    Description => {},
    Sequence => {},
    Count => 0,
  );

  # Transfer ID and count information; sequences start out empty...
  for my $SeqID (@{$AlignmentDataMapRef->{IDs}}) {
    push @{$NewAlignmentDataMap{IDs}}, $SeqID;
    $NewAlignmentDataMap{Description}{$SeqID} = $AlignmentDataMapRef->{Description}{$SeqID};
    $NewAlignmentDataMap{Sequence}{$SeqID} = '';
    $NewAlignmentDataMap{Count}++;
  }

  # Number of residue columns comes from the first sequence...
  my $ReferenceID = $AlignmentDataMapRef->{IDs}[0];
  my $ColumnCount = length($AlignmentDataMapRef->{Sequence}{$ReferenceID});

  COLUMN: for (my $Column = 0; $Column < $ColumnCount; $Column++) {
    # Keep the column as soon as a residue is found in any sequence...
    my $ResidueFound = 0;
    SEQUENCE: for my $SeqID (@{$AlignmentDataMapRef->{IDs}}) {
      my $Residue = substr($AlignmentDataMapRef->{Sequence}{$SeqID}, $Column, 1);
      if ($Residue =~ /[A-Z]/i) {
        $ResidueFound = 1;
        last SEQUENCE;
      }
    }
    next COLUMN unless $ResidueFound;

    # Copy this column, including any gap characters, to all the sequences...
    for my $SeqID (@{$AlignmentDataMapRef->{IDs}}) {
      my $Residue = substr($AlignmentDataMapRef->{Sequence}{$SeqID}, $Column, 1);
      $NewAlignmentDataMap{Sequence}{$SeqID} .= $Residue;
    }
  }

  return (\%NewAlignmentDataMap);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
522
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
#
# Read sequences file and return a reference to hash with the following keys:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format
# {ConservedAnnotation} - Conserved residue annotation
#
# Arguments:
#
# $SequenceFile - Name of the sequence file
# $SequenceType - File format matched case insensitively against: ClustalW,
#   Pearson, PIR or MSF. An empty string is returned for any other value.
#
# Note:
# . Conserved residue annotation either exists in the input sequence alignment file or is
#   set here for a file containing more than one sequence with the same number of residues
#   for all the sequences using the following notation: * - Residue conserved;
#   ' ' - Residue not conserved.
# . A residue is marked as conserved only when all the sequences contain the same
#   letter, in the same case, at that position.
#
sub _ReadFileAndSetupSequencesData {
  my($SequenceFile, $SequenceType) = @_;
  my($SequenceDataMapRef);

  # Read sequence file...
  if ($SequenceType =~ /^ClustalW$/i) {
    $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^Pearson$/i) {
    $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^PIR$/i) {
    $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
  }
  elsif ($SequenceType =~ /^MSF$/i) {
    $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
  }
  else {
    # Unknown file format: no data available...
    return '';
  }

  # Nothing to do for annotation already set up by the file reader...
  if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
    return ($SequenceDataMapRef);
  }

  # Annotation is only set up for multiple sequences of identical lengths...
  if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
    return ($SequenceDataMapRef);
  }

  # Use the first sequence to setup an empty ConservedAnnotation key...
  # And mark fully conserved residues...
  #
  my @IDs = @{$SequenceDataMapRef->{IDs}};
  my $FirstSequence = $SequenceDataMapRef->{Sequence}{$IDs[0]};
  my $FirstSeqLen = length($FirstSequence);

  $SequenceDataMapRef->{ConservedAnnotation} = '';
  for my $Index (0 .. ($FirstSeqLen - 1)) {
    # Take the reference residue directly from the first sequence. Detecting it
    # through a truth test on its value treated a '0' character as not set and
    # marked such columns as conserved...
    my $FirstRes = substr($FirstSequence, $Index, 1);
    my $ResConserved = 1;

    ID: for my $ID (@IDs[1 .. $#IDs]) {
      my $Res = substr($SequenceDataMapRef->{Sequence}{$ID}, $Index, 1);
      if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
        $ResConserved = 0;
        last ID;
      }
    }
    $SequenceDataMapRef->{ConservedAnnotation} .= $ResConserved ? '*' : ' ';
  }

  return ($SequenceDataMapRef);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
602
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
# Read sequence data in ClustalW multiple sequence alignment file and
# return a reference to hash with these keys and values:
#
# {IDs} - Array of sequence IDs
# {Count} - Number of sequences
# {Description}{$ID} - Sequence description; it is set to the ID
# {Sequence}{$ID} - Sequence for a specific ID
# {InputFileType} - Sequence file format: ClustalW
# {ConservedAnnotation} - Conserved residue annotations: space, *, : , .
#
# And based on ClustalW/X manual, here is what the ConservedAnnotations mean:
#
# '*' indicates positions which have a single, fully conserved residue
#
# ':' indicates that one of the following 'strong' groups is fully conserved: STA
#     NEQK NHQK NDEQ QHRK MILV MILF HY FYW
#
# '.' indicates that one of the following 'weaker' groups is fully conserved:
#     CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
#
# These are all the positively scoring groups that occur in the Gonnet Pam250
# matrix. The strong and weak groups are defined as strong score >0.5 and weak
# score =<0.5 respectively.
#
# The first line in the file is skipped as the header line. Sequence data spread
# over multiple alignment blocks is concatenated for each ID. Dies when the file
# can't be opened.
#
sub _ReadClustalWFile {
  my($SequenceFile) = @_;
  my(%SequencesDataMap);

  # Initialize data...
  %SequencesDataMap = ();
  @{$SequencesDataMap{IDs}} = ();
  %{$SequencesDataMap{Description}} = ();
  %{$SequencesDataMap{Sequence}} = ();
  $SequencesDataMap{Count} = 0;
  $SequencesDataMap{ConservedAnnotation} = '';
  $SequencesDataMap{InputFileType} = 'ClustalW';

  # Three argument open with a lexical file handle: characters in the file name
  # are never interpreted as an open mode or a command to run...
  open my $SequenceFileHandle, '<', $SequenceFile or die "Couldn't open $SequenceFile: $!\n";

  my($Line, $SequenceLength);

  # Ignore the header line...
  $Line = <$SequenceFileHandle>;

  # Length of the sequence data on the last sequence line processed; used to
  # locate annotation data at the end of annotation lines...
  $SequenceLength = 0;

  LINE: while ($Line = GetTextLine($SequenceFileHandle)) {
    if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
      # Annotation for sequences: fully conserved, weaker or stronger group conserved.
      # Extract it from the end of the line and save...
      my $AnnotationStart = length($Line) - $SequenceLength;
      $SequencesDataMap{ConservedAnnotation} .= substr($Line, $AnnotationStart, $SequenceLength);
      next LINE;
    }

    # Extract ID and sequences; any trailing spaces and digits are dropped...
    my($ID, $Sequence) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
    if (!(defined($ID) && defined($Sequence))) {
      # Line doesn't contain ID and sequence data...
      next LINE;
    }
    $Sequence =~ s/ //g;
    if (!($ID && $Sequence)) {
      next LINE;
    }

    $SequenceLength = length($Sequence);
    if (exists $SequencesDataMap{Sequence}{$ID}) {
      # Append to existing alignment value...
      $SequencesDataMap{Sequence}{$ID} .= $Sequence;
    }
    else {
      # New alignment data...
      $SequencesDataMap{Count} += 1;
      push @{$SequencesDataMap{IDs}}, $ID;
      $SequencesDataMap{Description}{$ID} = $ID;
      $SequencesDataMap{Sequence}{$ID} = $Sequence;
    }
  }
  close $SequenceFileHandle;

  return (\%SequencesDataMap);
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
685
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
686 # Read Pearson fasta file and return a reference to hash with these keys:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
687 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
688 # {IDs} - Array of sequence IDs
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
689 # {Count} - Number of sequences
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
690 # {Description}{$ID} - Sequence description
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
691 # {Sequence}{$ID} - Sequence for a specific ID
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
692 # {InputFileType} - Sequence file format
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
693 # {ConservedAnnotation} - Conserved residue annotation
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
694 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
sub _ReadPearsonFastaFile {
  # Read a Pearson FASTA file and return a reference to a hash with keys
  # {IDs}, {Count}, {Description}{$ID}, {Sequence}{$ID} and {InputFileType}
  # (always 'Pearson').
  #
  # Parameter: name of the FASTA file to read.
  # Dies when the file cannot be opened. Emits a warning for each repeated ID
  # and ignores the repeated record.
  my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);

  ($FastaFileName) = @_;

  %FastaDataMap = ();
  @{$FastaDataMap{IDs}} =();
  %{$FastaDataMap{Description}} =();
  %{$FastaDataMap{Sequence}} =();
  $FastaDataMap{Count} = 0;
  $FastaDataMap{InputFileType} = 'Pearson';

  # Three-argument open so that the file name is never interpreted as an open
  # mode or a pipe specification.
  open FASTAFILE, '<', $FastaFileName or die "Couldn't open $FastaFileName: $!\n";
  $ID = '';
  # Ignore any data appearing before the first header line: it has no ID to
  # be stored under.
  $IgnoreID = 1;
  LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
    if ($Line =~ /^\>/) {
      # Start of a new ID...
      $Line =~ s/^\>//;
      $Line = RemoveLeadingWhiteSpaces($Line);
      @LineWords = split / /, $Line;

      # First word is the ID; the complete header line is the description.
      # A bare ">" header yields no words, so fall back to an empty ID instead
      # of an undefined one.
      $ID = @LineWords ? $LineWords[0] : '';
      $Description = $Line;

      # Description is stored for every accepted header, so it identifies a
      # repeated ID even when the earlier record had no sequence lines.
      $IgnoreID = 0;
      if (exists $FastaDataMap{Description}{$ID}) {
        $IgnoreID = 1;
        warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      push @{$FastaDataMap{IDs}}, $ID;
      $FastaDataMap{Description}{$ID} = $Description;
      $FastaDataMap{Count} += 1;
      next LINE;
    }
    if ($IgnoreID) { next LINE; }

    # Remove any spaces in the sequence...
    $Line =~ s/ //g;

    # Sequence data for active ID; the first line creates the entry and the
    # following lines are appended to it.
    $FastaDataMap{Sequence}{$ID} .= $Line;
  }
  close FASTAFILE;
  return \%FastaDataMap;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
748
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
749 # Read PIR fasta file and return a reference to hash with these keys:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
750 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
751 # {IDs} - Array of sequence IDs
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
752 # {Count} - Number of sequences
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
753 # {Description}{$ID} - Sequence description
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
754 # {Sequence}{$ID} - Sequence for a specific ID
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
755 # {InputFileType} - Sequence file format
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
756 # {ConservedAnnotation} - Conserved residue annotation (not populated by this function)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
757 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
758 # Format:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
759 # A sequence in PIR format consists of:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
760 # One line starting with
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
761 # a ">" (greater-than) sign, followed by
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
762 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
763 # a semicolon, followed by
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
764 # the sequence identification code (the database ID-code).
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
765 # One line containing a textual description of the sequence.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
766 # One or more lines containing the sequence itself. The end of the
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
767 # sequence is marked by a "*" (asterisk) character.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
768 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
769 # A file in PIR format may comprise more than one sequence.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
770 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
771 # The PIR format is also often referred to as the NBRF format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
772 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
773 # Code SequenceType
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
774 # P1 Protein (complete)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
775 # F1 Protein (fragment)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
776 # DL DNA (linear)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
777 # DC DNA (circular)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
778 # RL RNA (linear)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
779 # RC RNA (circular)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
780 # N3 tRNA
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
781 # N1 Other functional RNA
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
782 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
783
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
sub _ReadPIRFastaFile {
  # Read a PIR (NBRF) FASTA file and return a reference to a hash with keys
  # {IDs}, {Count}, {Description}{$ID}, {Sequence}{$ID},
  # {SequenceTypeCode}{$ID} and {InputFileType} (always 'PIR').
  #
  # Each record is expected to be a ">TypeCode;ID" line, one description line
  # and then sequence lines, the last of which ends with "*".
  #
  # Parameter: name of the PIR file to read.
  # Dies when the file cannot be opened. Emits a warning for malformed header
  # lines and for repeated IDs; those records are ignored.
  my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);

  ($FastaFileName) = @_;

  %FastaDataMap = ();
  @{$FastaDataMap{IDs}} =();
  %{$FastaDataMap{Description}} =();
  %{$FastaDataMap{Sequence}} =();
  %{$FastaDataMap{SequenceTypeCode}} =();
  $FastaDataMap{Count} = 0;
  $FastaDataMap{InputFileType} = 'PIR';

  # Three-argument open so that the file name is never interpreted as an open
  # mode or a pipe specification.
  open FASTAFILE, '<', $FastaFileName or die "Couldn't open $FastaFileName: $!\n";
  $ID = '';
  $ReadingSequenceData = 0;
  LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
    if ($Line =~ /^\>/) {
      # Start of a new ID...
      #
      # A header ends whatever sequence was being read, even without a closing
      # "*", so data of a rejected record is never appended to an earlier one.
      $ReadingSequenceData = 0;

      $Line =~ s/^\>//;
      $Line = RemoveLeadingWhiteSpaces($Line);

      # Match against $Line explicitly; the leading ">" has already been removed.
      # Spaces around the ID are dropped.
      ($SequenceTypeCode, $ID) = $Line =~ /^(.*?)\;[ ]*(.*?)[ ]*$/;
      if (!defined $ID) {
        warn "Warning: Ignoring malformed PIR header line, >$Line, in Fasta file...\n";
        $ID = '';
        next LINE;
      }

      # Use next line to retrieve sequence description; it is missing when the
      # header is the last line of the file.
      $Line = GetTextLine(\*FASTAFILE);
      $Description = defined $Line ? RemoveLeadingWhiteSpaces($Line) : '';

      # Description is stored for every accepted header, so it identifies a
      # repeated ID even when the earlier record had no sequence lines.
      if (exists $FastaDataMap{Description}{$ID}) {
        warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      $ReadingSequenceData = 1;
      push @{$FastaDataMap{IDs}}, $ID;
      $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
      $FastaDataMap{Description}{$ID} = $Description;
      $FastaDataMap{Count} += 1;
      next LINE;
    }
    if (!$ReadingSequenceData) { next LINE; }

    # Remove any spaces in the sequence...
    $Line =~ s/ //g;
    if ($Line =~ /[\*]$/) {
      # End of sequence...
      $ReadingSequenceData = 0;
      $Line =~ s/[\*]$//;
    }

    # Sequence data for active ID; the first line creates the entry and the
    # following lines are appended to it.
    $FastaDataMap{Sequence}{$ID} .= $Line;
  }
  close FASTAFILE;
  return \%FastaDataMap;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
843
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
844 # Read MSF file and return a reference to hash with these keys:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
845 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
846 # {IDs} : Array of IDs in order as they appear in file
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
847 # {Count}: ID count...
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
848 # {Description}{$ID} : Description data...
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
849 # {Sequence}{$ID} : Sequence data...
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
850 #
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
sub _ReadMSFFile {
  # Read a MSF alignment file and return a reference to a hash with keys
  # {IDs}, {Count}, {Description}{$ID}, {Sequence}{$ID} and {InputFileType}
  # (always 'MSF'). The description of each sequence is its ID.
  #
  # Parameter: name of the MSF file to read.
  # Dies when the file cannot be opened. Emits a warning for unparsable
  # "Name:" lines and for repeated IDs.
  my($MSFFileName, $Line, %MSFDataMap);

  ($MSFFileName) = @_;

  %MSFDataMap = ();
  @{$MSFDataMap{IDs}} =();
  %{$MSFDataMap{Description}} =();
  %{$MSFDataMap{Sequence}} =();
  $MSFDataMap{Count} = 0;
  $MSFDataMap{InputFileType} = 'MSF';

  # Three-argument open so that the file name is never interpreted as an open
  # mode or a pipe specification.
  open MSFFILE, '<', $MSFFileName or die "Couldn't open $MSFFileName: $!\n";

  # Collect sequences and IDs...
  #
  # '//' after the name fields indicates end of header list and start of sequence data.
  #
  my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
  %MSFIDsMap = ();
  $NameFieldsFound = 0;
  LINE: while ($Line = GetTextLine(\*MSFFILE)) {
    if ($Line =~ /Name:/) {
      $NameFieldsFound++;
      ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
      if (!defined $ID) {
        # The line mentions "Name:" but lacks the Len/Check/Weight layout.
        warn "Warning: Ignoring malformed Name line in MSF file: $Line\n";
        next LINE;
      }
      if ($ID =~ / /) {
        # Keep only the first word of the name field as ID.
        ($ID) = $ID =~ /^(.*?)[ ]+/;
      }
      if (exists $MSFIDsMap{$ID}) {
        warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
        next LINE;
      }
      $MSFIDsMap{$ID} = $ID;
      push @{$MSFDataMap{IDs}}, $ID;
      $MSFDataMap{Description}{$ID} = $ID;
      $MSFDataMap{Count} += 1;
    }
    elsif ($Line =~ /\/\// && $NameFieldsFound) {
      # End of header list; the separator has to be looked for in the line
      # just read and not in $_.
      last LINE;
    }
  }

  # Collect all sequences...
  #
  my($FirstField, $SecondField);
  while ($Line = GetTextLine(\*MSFFILE)) {
    ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;

    # Lines without a space separated first field can't be ID and sequence data.
    if (defined $FirstField && exists $MSFIDsMap{$FirstField}) {
      # It's ID and sequence data...
      $ID = $FirstField;
      $Sequence = $SecondField;
      # Take out spaces and leave the gap characters...
      $Sequence =~ s/ //g;
      # The first block creates the entry and later blocks are appended to it.
      $MSFDataMap{Sequence}{$ID} .= $Sequence;
    }
  }

  close MSFFILE;
  return \%MSFDataMap;
}
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
916
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
917
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
918 1;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
919
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
920 __END__
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
921
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
922 =head1 NAME
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
923
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
924 SequenceFileUtil
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
925
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
926 =head1 SYNOPSIS
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
927
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
928 use SequenceFileUtil ;
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
929
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
930 use SequenceFileUtil qw(:all);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
931
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
932 =head1 DESCRIPTION
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
933
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
934 B<SequenceFileUtil> module provides the following functions:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
935
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
936 AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
937 CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
938 GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
939 IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
940 ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
941 ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
942 RemoveSequenceGaps, WritePearsonFastaSequenceFile
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
943 SequenceFileUtil module provides various methods to process sequence
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
944 files and retrieve appropriate information.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
945
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
946 =head1 FUNCTIONS
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
947
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
948 =over 4
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
949
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
950 =item B<AreSequenceLengthsIdentical>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
951
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
952 $Status = AreSequenceLengthsIdentical($SequencesDataRef);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
953
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
954 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
955 or 0 based on whether the lengths of all the sequences are the same.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
956
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
957 =item B<CalcuatePercentSequenceIdentity>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
958
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
959 $PercentIdentity =
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
960 CalcuatePercentSequenceIdentity(
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
961 $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
962
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
963 Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
964 I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
965 returned value. By default, gaps are ignored and precision is set up to 1 decimal.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
966
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
967 =item B<CalculatePercentSequenceIdentityMatrix>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
968
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
969 $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix(
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
970 $SequencesDataRef, [$IgnoreGaps,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
971 $Precision]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
972
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
973 Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
974 and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
975 I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
976 are ignored and precision is set up to 1 decimal.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
977
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
978 =item B<GetSequenceLength>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
979
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
980 $SequenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
981
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
982 Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
983 of gaps. By default, gaps are ignored.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
984
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
985 =item B<GetShortestSequence>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
986
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
987 ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence(
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
988 $SequencesDataRef, [$IgnoreGaps]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
989
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
990 Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
991 $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional argument $IgnoreGaps
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
992 controls handling of gaps in sequences. By default, gaps are ignored.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
993
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
994 =item B<GetLongestSequence>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
995
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
996 ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence(
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
997 $SequencesDataRef, [$IgnoreGaps]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
998
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
999 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1000 B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1001 I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1002
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1003 =item B<IsGapResidue>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1004
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1005 $Status = IsGapResidue($Residue);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1006
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1007 Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1008 considered a gap residue.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1009
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1010 =item B<IsSupportedSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1011
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1012 $Status = IsSupportedSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1013
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1014 Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1015 format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1016
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1017 =item B<IsClustalWSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1018
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1019 $Status = IsClustalWSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1020
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1021 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1022 format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1023
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1024 =item B<IsPearsonFastaSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1025
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1026 $Status = IsPearsonFastaSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1027
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1028 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1029 format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1030
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1031 =item B<IsPIRFastaSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1032
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1033 $Status = IsPIRFastaSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1034
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1035 Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1036 format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1037
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1038 =item B<IsMSFSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1039
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1040 $Status = IsMSFSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1041
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1042 Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1043 format.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1044
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1045 =item B<ReadSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1046
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1047 $SequenceDataMapRef = ReadSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1048
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1049 Reads I<SequenceFile> and returns reference to a hash containing following key/value
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1050 pairs:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1051
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1052 $SequenceDataMapRef->{IDs} - Array of sequence IDs
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1053 $SequenceDataMapRef->{Count} - Number of sequences
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1054 $SequenceDataMapRef->{Description}{$ID} - Sequence description
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1055 $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1056 $SequenceDataMapRef->{InputFileType} - File format
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1057
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1058 =item B<ReadClustalWSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1059
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1060 $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1061
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1062 Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1063 pairs as described in B<ReadSequenceFile> method.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1064
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1065 =item B<ReadMSFSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1066
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1067 $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1068
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1069 Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1070 pairs as described in B<ReadSequenceFile> method.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1071
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1072 =item B<ReadPIRFastaSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1073
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1074 $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1075
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1076 Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1077 pairs as described in B<ReadSequenceFile> method.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1078
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1079 =item B<ReadPearsonFastaSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1080
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1081 $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1082
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1083 Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1084 pairs as described in B<ReadSequenceFile> method.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1085
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1086 =item B<RemoveSequenceGaps>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1087
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1088 $SeqWithoutGaps = RemoveSequenceGaps($Sequence);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1089
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1090 Removes gaps from I<Sequence> and returns a sequence without any gaps.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1091
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1092 =item B<RemoveSequenceAlignmentGapColumns>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1093
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1094 $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns(
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1095 $AlignmentDataMapRef);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1096
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1097 Using input alignment data map ref containing following keys, generate a new hash with
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1098 same set of keys after residue columns containing only gaps have been removed:
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1099
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1100 {IDs} : Array of IDs in order as they appear in file
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1101 {Count}: ID count
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1102 {Description}{$ID} : Description data
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1103 {Sequence}{$ID} : Sequence data
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1104
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1105 =item B<WritePearsonFastaSequenceFile>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1106
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1107 WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef,
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1108 [$MaxLength]);
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1109
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1110 Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1111 file. Optional argument I<MaxLength> controls the maximum sequence length on each line; default is
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1112 80.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1113
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1114 =back
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1115
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1116 =head1 AUTHOR
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1117
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1118 Manish Sud <msud@san.rr.com>
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1119
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1120 =head1 SEE ALSO
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1121
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1122 PDBFileUtil.pm
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1123
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1124 =head1 COPYRIGHT
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1125
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1126 Copyright (C) 2015 Manish Sud. All rights reserved.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1127
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1128 This file is part of MayaChemTools.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1129
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1130 MayaChemTools is free software; you can redistribute it and/or modify it under
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1131 the terms of the GNU Lesser General Public License as published by the Free
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1132 Software Foundation; either version 3 of the License, or (at your option)
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1133 any later version.
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1134
4816e4a8ae95 Uploaded
deepakjadmin
parents:
diff changeset
1135 =cut