0
|
1 package SequenceFileUtil;
|
|
2 #
|
|
3 # $RCSfile: SequenceFileUtil.pm,v $
|
|
4 # $Date: 2015/02/28 20:47:18 $
|
|
5 # $Revision: 1.33 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use Exporter;
|
|
31 use Text::ParseWords;
|
|
32 use TextUtil;
|
|
33 use FileUtil;
|
|
34
|
|
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
36
|
|
37 @ISA = qw(Exporter);
|
|
38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
|
|
39 @EXPORT_OK = qw();
|
|
40
|
|
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
|
|
42
|
|
43 # Compare lengths of all sequences...
|
|
44 sub AreSequenceLengthsIdentical {
|
|
45 my($SequencesDataRef) = @_;
|
|
46 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
|
|
47
|
|
48 $Status = 1;
|
|
49 $FirstID = '';
|
|
50 $FirstDifferentLenID = '';
|
|
51
|
|
52 ID: for $ID (@{$SequencesDataRef->{IDs}}) {
|
|
53 if (!$FirstID) {
|
|
54 $FirstID = $ID;
|
|
55 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
|
|
56 next ID;
|
|
57 }
|
|
58 $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
|
|
59 if ($SeqLen != $FirstSeqLen) {
|
|
60 $Status = 0;
|
|
61 $FirstDifferentLenID = $ID;
|
|
62 last ID;
|
|
63 }
|
|
64 }
|
|
65 return ($Status);
|
|
66 }
|
|
67
|
|
68 # Calculate percent identity between two sequences. By default, gaps are ignored.
|
|
69 sub CalcuatePercentSequenceIdentity {
|
|
70 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
|
|
71
|
|
72 $PercentIdentity = '';
|
|
73 $Precision = 1;
|
|
74 $IgnoreGaps = 1;
|
|
75 if (@_ == 4) {
|
|
76 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
|
|
77 }
|
|
78 elsif (@_ == 3) {
|
|
79 ($Sequence1, $Sequence2, $IgnoreGaps) = @_;
|
|
80 }
|
|
81 elsif (@_ == 2) {
|
|
82 ($Sequence1, $Sequence2) = @_;
|
|
83 }
|
|
84 else {
|
|
85 return $PercentIdentity;
|
|
86 }
|
|
87 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
|
|
88 return $PercentIdentity;
|
|
89 }
|
|
90 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
|
|
91
|
|
92 $Sequence1Len = length($Sequence1);
|
|
93 $Sequence2Len = length($Sequence2);
|
|
94
|
|
95 $ResMatchCount = 0;
|
|
96 $ResCount = 0;
|
|
97 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
|
|
98 $Residue1 = substr($Sequence1, $Index, 1);
|
|
99 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
|
|
100 if ($IgnoreGaps) {
|
|
101 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
|
|
102 next RESIDUE;
|
|
103 }
|
|
104 }
|
|
105 if ($Residue1 eq $Residue2) {
|
|
106 $ResMatchCount++;
|
|
107 }
|
|
108 $ResCount++;
|
|
109 }
|
|
110 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
|
|
111 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
|
|
112
|
|
113 return $PercentIdentity;
|
|
114 }
|
|
115
|
|
116 # Calculate pairwise identify matrix for all the sequences and return a reference
|
|
117 # to a hash with the following keys:
|
|
118 #
|
|
119 # {IDs} - Sequence IDs
|
|
120 # {Count} - Number of IDs
|
|
121 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
|
|
122 #
|
|
123 sub CalculatePercentSequenceIdentityMatrix {
|
|
124 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
|
|
125
|
|
126 $IgnoreGaps = 1;
|
|
127 $Precision = 1;
|
|
128 if (@_ == 3) {
|
|
129 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
|
|
130 }
|
|
131 elsif (@_ == 2) {
|
|
132 ($SequencesDataRef, $IgnoreGaps) = @_;
|
|
133 }
|
|
134 else {
|
|
135 ($SequencesDataRef) = @_;
|
|
136 }
|
|
137
|
|
138 %IdentityMatrixData = ();
|
|
139 @{$IdentityMatrixData{IDs}} = ();
|
|
140 %{$IdentityMatrixData{PercentIdentity}} = ();
|
|
141 $IdentityMatrixData{Count} = 0;
|
|
142
|
|
143 for $ID (@{$SequencesDataRef->{IDs}}) {
|
|
144 push @{$IdentityMatrixData{IDs}}, $ID;
|
|
145 $IdentityMatrixData{Count} += 1;
|
|
146 }
|
|
147 # Initialize and calculate percent identity data values...
|
|
148 for $RowID (@{$SequencesDataRef->{IDs}}) {
|
|
149 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
|
|
150 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
|
|
151 for $ColID (@{$SequencesDataRef->{IDs}}) {
|
|
152 $IdentityMatrixData{$RowID}{$ColID} = '';
|
|
153 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
|
|
154 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
|
|
155 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
|
|
156 }
|
|
157 }
|
|
158 return \%IdentityMatrixData;
|
|
159 }
|
|
160
|
|
161 # Retrieve information about shortest sequence...
|
|
162 sub GetShortestSequence {
|
|
163 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
|
|
164
|
|
165 $IgnoreGaps = 1;
|
|
166 if (@_ == 2) {
|
|
167 ($SequencesDataRef, $IgnoreGaps) = @_;
|
|
168 }
|
|
169 else {
|
|
170 ($SequencesDataRef) = @_;
|
|
171 }
|
|
172
|
|
173 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
|
|
174 return ($ID, $Sequence, $SeqLen, $Description);
|
|
175 }
|
|
176
|
|
177 # Retrieve information about longest sequence..
|
|
178 sub GetLongestSequence {
|
|
179 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
|
|
180
|
|
181 $IgnoreGaps = 1;
|
|
182 if (@_ == 2) {
|
|
183 ($SequencesDataRef, $IgnoreGaps) = @_;
|
|
184 }
|
|
185 else {
|
|
186 ($SequencesDataRef) = @_;
|
|
187 }
|
|
188
|
|
189 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
|
|
190 return ($ID, $Sequence, $SeqLen, $Description);
|
|
191 }
|
|
192
|
|
193 # Get sequence length...
|
|
194 sub GetSequenceLength {
|
|
195 my($Seq, $SeqLen, $IgnoreGaps);
|
|
196
|
|
197 $SeqLen = ''; $IgnoreGaps = 1;
|
|
198 if (@_ == 2) {
|
|
199 ($Seq, $IgnoreGaps) = @_;
|
|
200 }
|
|
201 else {
|
|
202 ($Seq) = @_;
|
|
203 }
|
|
204 if ($IgnoreGaps) {
|
|
205 my($Index, $Residue);
|
|
206 $SeqLen = 0;
|
|
207 for $Index (0 .. (length($Seq) - 1)) {
|
|
208 $Residue = substr($Seq, $Index, 1);
|
|
209 if ($Residue =~ /[A-Z]/i) {
|
|
210 $SeqLen++;
|
|
211 }
|
|
212 }
|
|
213 }
|
|
214 else {
|
|
215 $SeqLen = length($Seq);
|
|
216 }
|
|
217
|
|
218 return $SeqLen;
|
|
219 }
|
|
220
|
|
221 # Is it a gap residue...
|
|
222 sub IsGapResidue {
|
|
223 my($Residue) = @_;
|
|
224 my($Status);
|
|
225
|
|
226 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
|
|
227
|
|
228 return $Status;
|
|
229 }
|
|
230
|
|
231 # Is it a supported sequence file?
|
|
232 #
|
|
233 # Supported seqence formats are:
|
|
234 #
|
|
235 # ALN/ClustalW .aln
|
|
236 # GCG/MSF .msf
|
|
237 # PILEUP/MSF .msf
|
|
238 # Fasts(Pearson) .fasta, .fta
|
|
239 # NBRF/PIR .pir
|
|
240 #
|
|
241 sub IsSupportedSequenceFile {
|
|
242 my($SequenceFile) = @_;
|
|
243 my($Status, $SequenceFormat);
|
|
244 $Status = 0; $SequenceFormat = 'NotSupported';
|
|
245
|
|
246 SEQFORMAT: {
|
|
247 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
|
|
248 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
|
|
249 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
|
|
250 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
|
|
251 $Status = 0; $SequenceFormat = 'NotSupported';
|
|
252 }
|
|
253 return ($Status, $SequenceFormat);
|
|
254 }
|
|
255
|
|
256 # Is it a ClustalW multiple sequence sequence file...
|
|
257 sub IsClustalWSequenceFile {
|
|
258 my($SequenceFile) = @_;
|
|
259 my($Status, $Line);
|
|
260
|
|
261 $Status = 0;
|
|
262
|
|
263 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
|
|
264 $Line = GetTextLine(\*SEQUENCEFILE);
|
|
265 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
|
|
266 close SEQUENCEFILE;
|
|
267
|
|
268 return $Status;
|
|
269 }
|
|
270
|
|
271 # Is it a valid Pearson fasta sequence or alignment file?
|
|
272 #
|
|
273 sub IsPearsonFastaSequenceFile {
|
|
274 my($FastaFile, $Line, $Status);
|
|
275
|
|
276 ($FastaFile) = @_;
|
|
277 $Status = 0;
|
|
278
|
|
279 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
|
|
280 $Line = GetTextLine(\*FASTAFILE);
|
|
281
|
|
282 # First line starts with > and the fourth character is not ';'; otherwise, it's
|
|
283 # PIR FASTA format.
|
|
284 if ($Line =~ /^>/) {
|
|
285 my($FourthChar);
|
|
286 $FourthChar = substr($Line, 3, 1);
|
|
287 $Status = ($FourthChar !~ /\;/) ? 1 : 0;
|
|
288 }
|
|
289 close FASTAFILE;
|
|
290
|
|
291 return $Status;
|
|
292 }
|
|
293
|
|
294 # Is it a valid NBRF/PIR fasta sequence or alignment file?
|
|
295 #
|
|
296 sub IsPIRFastaSequenceFile {
|
|
297 my($FastaFile, $Line, $Status);
|
|
298
|
|
299 ($FastaFile) = @_;
|
|
300 $Status = 0;
|
|
301
|
|
302 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
|
|
303 $Line = GetTextLine(\*FASTAFILE);
|
|
304
|
|
305 # First line starts with > and the fourth character is ';'; otherwise, it's
|
|
306 # a Pearson FASTA format.
|
|
307 if ($Line =~ /^>/) {
|
|
308 my($FourthChar);
|
|
309 $FourthChar = substr($Line, 3, 1);
|
|
310 $Status = ($FourthChar =~ /\;/) ? 1 : 0;
|
|
311 }
|
|
312 close FASTAFILE;
|
|
313
|
|
314 return $Status;
|
|
315 }
|
|
316
|
|
317 # Is it a valid MSF sequence or alignment file?
|
|
318 #
|
|
319 sub IsMSFSequenceFile {
|
|
320 my($MSFFile) = @_;
|
|
321
|
|
322 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
|
|
323
|
|
324 my($Line, $Status);
|
|
325
|
|
326 $Status = 0;
|
|
327 # Find a line that contains MSF: keyword and ends with '..'
|
|
328 LINE: while ($Line = GetTextLine(\*MSFFILE)) {
|
|
329 $Line = RemoveLeadingWhiteSpaces($Line);
|
|
330 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
|
|
331 $Status = 1;
|
|
332 last LINE;
|
|
333 }
|
|
334 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
|
|
335 # Pileup MSF...
|
|
336 $Status = 1;
|
|
337 last LINE;
|
|
338 }
|
|
339 }
|
|
340 close MSFFILE;
|
|
341
|
|
342 return $Status;
|
|
343 }
|
|
344
|
|
345 # Read sequence or sequence alignment file...
|
|
346 sub ReadSequenceFile {
|
|
347 my($SequenceFile) = @_;
|
|
348
|
|
349 if (IsPearsonFastaSequenceFile($SequenceFile)) {
|
|
350 return ReadPearsonFastaSequenceFile($SequenceFile);
|
|
351 }
|
|
352 elsif (IsPIRFastaSequenceFile($SequenceFile)) {
|
|
353 return ReadPIRFastaSequenceFile($SequenceFile);
|
|
354 }
|
|
355 elsif (IsMSFSequenceFile($SequenceFile)) {
|
|
356 return ReadMSFSequenceFile($SequenceFile);
|
|
357 }
|
|
358 elsif (IsClustalWSequenceFile($SequenceFile)) {
|
|
359 return ReadClustalWSequenceFile($SequenceFile);
|
|
360 }
|
|
361 else {
|
|
362 return undef;
|
|
363 }
|
|
364 }
|
|
365
|
|
366 # Read file and setup alignment data...
|
|
367 sub ReadClustalWSequenceFile {
|
|
368 my($SequenceFile) = @_;
|
|
369
|
|
370 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
|
|
371 }
|
|
372
|
|
373 # Read file and setup alignment data...
|
|
374 sub ReadPearsonFastaSequenceFile {
|
|
375 my($SequenceFile) = @_;
|
|
376
|
|
377 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
|
|
378 }
|
|
379
|
|
380 # Read file and setup alignment data...
|
|
381 sub ReadPIRFastaSequenceFile {
|
|
382 my($SequenceFile) = @_;
|
|
383
|
|
384 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
|
|
385 }
|
|
386
|
|
387
|
|
388 # Read file and setup sequence data...
|
|
389 sub ReadMSFSequenceFile {
|
|
390 my($SequenceFile) = @_;
|
|
391
|
|
392 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
|
|
393 }
|
|
394
|
|
395 # Write out a Pearson FASTA file...
|
|
396 sub WritePearsonFastaSequenceFile {
|
|
397 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
|
|
398
|
|
399 $MaxLength = 80;
|
|
400 if (@_ == 3) {
|
|
401 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
|
|
402 }
|
|
403 elsif (@_ == 2) {
|
|
404 ($SequenceFileName, $SequenceDataRef) = @_;
|
|
405 }
|
|
406 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
|
|
407 for $ID (@{$SequenceDataRef->{IDs}}) {
|
|
408 $Description = $SequenceDataRef->{Description}{$ID};
|
|
409 $Sequence = $SequenceDataRef->{Sequence}{$ID};
|
|
410 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
|
|
411
|
|
412 # Description also contains ID...
|
|
413 print SEQUENCEFILE ">$Description\n";
|
|
414 print SEQUENCEFILE "$WrappedSequence\n";
|
|
415 }
|
|
416 close SEQUENCEFILE;
|
|
417 }
|
|
418
|
|
419 # Get ID, Sequence and Length for smallest or longest sequence
|
|
420 sub _GetShortestOrLongestSequence {
|
|
421 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
|
|
422 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
|
|
423
|
|
424 ($ID, $Seq, $SeqLen) = ('', '', '');
|
|
425 $FirstID = '';
|
|
426
|
|
427 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
|
|
428 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
|
|
429 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
|
|
430 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
|
|
431 if (!$FirstID) {
|
|
432 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
|
|
433 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
|
|
434 next ID;
|
|
435 }
|
|
436 if ($CurrentSeqLen != $SeqLen) {
|
|
437 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
|
|
438 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
|
|
439 }
|
|
440 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
|
|
441 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
|
|
442 }
|
|
443 }
|
|
444 }
|
|
445 return ($ID, $Seq, $SeqLen, $Description);
|
|
446 }
|
|
447
|
|
448 # Remove gaps in the sequence and return new sequence...
|
|
449 sub RemoveSequenceGaps {
|
|
450 my($Seq) = @_;
|
|
451 my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
|
|
452
|
|
453 $SeqWithoutGaps = '';
|
|
454 $SeqLen = length($Seq);
|
|
455 for $Index (0 .. ($SeqLen - 1)) {
|
|
456 $Residue = substr($Seq, $Index, 1);
|
|
457 if ($Residue =~ /[A-Z]/i) {
|
|
458 $SeqWithoutGaps .= $Residue;
|
|
459 }
|
|
460 }
|
|
461
|
|
462 return $SeqWithoutGaps;
|
|
463 }
|
|
464
|
|
465 # Using input alignment data map ref containing following keys, generate
|
|
466 # a new hash with same set of keys after residue columns containg only
|
|
467 # gaps have been removed:
|
|
468 #
|
|
469 # {IDs} : Array of IDs in order as they appear in file
|
|
470 # {Count}: ID count...
|
|
471 # {Description}{$ID} : Description data...
|
|
472 # {Sequence}{$ID} : Sequence data...
|
|
473 #
|
|
474 sub RemoveSequenceAlignmentGapColumns {
|
|
475 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
|
|
476
|
|
477 ($AlignmentDataMapRef) = @_;
|
|
478
|
|
479 %NewAlignmentDataMap = ();
|
|
480 @{$NewAlignmentDataMap{IDs}} =();
|
|
481 %{$NewAlignmentDataMap{Description}} =();
|
|
482 %{$NewAlignmentDataMap{Sequence}} =();
|
|
483 $NewAlignmentDataMap{Count} = 0;
|
|
484
|
|
485 # Transfer ID and count information...
|
|
486 for $ID (@{$AlignmentDataMapRef->{IDs}}) {
|
|
487 push @{$NewAlignmentDataMap{IDs}}, $ID;
|
|
488 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
|
|
489 $NewAlignmentDataMap{Sequence}{$ID} = '';
|
|
490 $NewAlignmentDataMap{Count} += 1;
|
|
491 }
|
|
492
|
|
493 # Go over residue columns and transfer the data...
|
|
494 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
|
|
495
|
|
496 $FirstID = $AlignmentDataMapRef->{IDs}[0];
|
|
497 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
|
|
498 $FirstSeqLen = length($FirstSeq);
|
|
499
|
|
500 RES: for $Index (0 .. ($FirstSeqLen - 1)) {
|
|
501 # Is this a gap column?
|
|
502 $GapColumn = 1;
|
|
503 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
|
|
504 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
|
|
505 if ($Res =~ /[A-Z]/i) {
|
|
506 $GapColumn = 0;
|
|
507 last ID;
|
|
508 }
|
|
509 }
|
|
510 if ($GapColumn) {
|
|
511 next RES;
|
|
512 }
|
|
513 # Transfer this residue...
|
|
514 for $ID (@{$AlignmentDataMapRef->{IDs}}) {
|
|
515 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
|
|
516 $NewAlignmentDataMap{Sequence}{$ID} .= $Res;
|
|
517 }
|
|
518 }
|
|
519
|
|
520 return (\%NewAlignmentDataMap);
|
|
521 }
|
|
522
|
|
523 #
|
|
524 # Read sequences file and return a reference to hash with the following keys:
|
|
525 #
|
|
526 # {IDs} - Array of sequence IDs
|
|
527 # {Count} - Number of sequences
|
|
528 # {Description}{$ID} - Sequence description
|
|
529 # {Sequence}{$ID} - Sequence for a specific ID
|
|
530 # {InputFileType} - Sequence file format
|
|
531 # {ConservedAnnotation} - Conserved residue annonation
|
|
532 #
|
|
533 # Note:
|
|
534 # . Conserved residue annotation either exist in the input sequence alignment file or set
|
|
535 # for a file containing same number of residues for all the sequence using the following
|
|
536 # notation: * - Residue conserved; ' ' - Residue not conserved.
|
|
537 #
|
|
538 sub _ReadFileAndSetupSequencesData {
|
|
539 my($SequenceFile, $SequenceType) = @_;
|
|
540 my($SequenceDataMapRef);
|
|
541
|
|
542 $SequenceDataMapRef = undef;
|
|
543
|
|
544 # Read sequence file...
|
|
545 $SequenceDataMapRef = '';
|
|
546 if ($SequenceType =~ /^ClustalW$/i) {
|
|
547 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
|
|
548 }
|
|
549 elsif ($SequenceType =~ /^Pearson$/i) {
|
|
550 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
|
|
551 }
|
|
552 elsif ($SequenceType =~ /^PIR$/i) {
|
|
553 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
|
|
554 }
|
|
555 elsif ($SequenceType =~ /^MSF$/i) {
|
|
556 $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
|
|
557 }
|
|
558 else {
|
|
559 return $SequenceDataMapRef;
|
|
560 }
|
|
561
|
|
562 if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
|
|
563 return ($SequenceDataMapRef);
|
|
564 }
|
|
565 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
|
|
566 return ($SequenceDataMapRef);
|
|
567 }
|
|
568
|
|
569 # Use the first sequence to setup an empty ConservedAnnotation key...
|
|
570 # And mark fully conserved residues...
|
|
571 #
|
|
572 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
|
|
573 $ID = $SequenceDataMapRef->{IDs}[0];
|
|
574 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
|
|
575 $FirstSeqLen = length($FirstSequence);
|
|
576 $SequenceDataMapRef->{ConservedAnnotation} = '';
|
|
577 for $Index (0 .. ($FirstSeqLen - 1)) {
|
|
578 $FirstRes = '';
|
|
579 $ResConserved = 1;
|
|
580 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
|
|
581 $Sequence = $SequenceDataMapRef->{Sequence}{$ID};
|
|
582 $Res = substr($Sequence, $Index, 1);
|
|
583 if (!$FirstRes) {
|
|
584 $FirstRes = $Res;
|
|
585 next ID;
|
|
586 }
|
|
587 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
|
|
588 $ResConserved = 0;
|
|
589 last ID;
|
|
590 }
|
|
591 }
|
|
592 if ($ResConserved) {
|
|
593 $SequenceDataMapRef->{ConservedAnnotation} .= '*';
|
|
594 }
|
|
595 else {
|
|
596 $SequenceDataMapRef->{ConservedAnnotation} .= ' ';
|
|
597 }
|
|
598 }
|
|
599
|
|
600 return ($SequenceDataMapRef);
|
|
601 }
|
|
602
|
|
603 # Read sequence data in ClustalW multiple sequence alignment file and
|
|
604 # return a reference to hash with these keys and values:
|
|
605 #
|
|
606 # {IDs} - Array of sequence IDs
|
|
607 # {Count} - Number of sequences
|
|
608 # {Description}{$ID} - Sequence description
|
|
609 # {Sequence}{$ID} - Sequence for a specific ID
|
|
610 # {InputFileType} - Sequence file format
|
|
611 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
|
|
612 #
|
|
613 #
|
|
614 #
|
|
615 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
|
|
616 #
|
|
617 # '*' indicates positions which have a single, fully conserved residue
|
|
618 #
|
|
619 # ':' indicates that one of the following 'strong' groups is fully conserved: STA
|
|
620 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW
|
|
621
|
|
622 # '.' indicates that one of the following 'weaker' groups is fully conserved:
|
|
623 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
|
|
624 #
|
|
625 # These are all the positively scoring groups that occur in the Gonnet Pam250
|
|
626 # matrix. The strong and weak groups are defined as strong score >0.5 and weak
|
|
627 # score =<0.5 respectively.
|
|
628 #
|
|
629 sub _ReadClustalWFile {
|
|
630 my($SequenceFile) = @_;
|
|
631 my(%SequencesDataMap);
|
|
632
|
|
633 # Initialize data...
|
|
634 %SequencesDataMap = ();
|
|
635 @{$SequencesDataMap{IDs}} = ();
|
|
636 %{$SequencesDataMap{Description}} = ();
|
|
637 %{$SequencesDataMap{Sequence}} = ();
|
|
638 $SequencesDataMap{Count} = 0;
|
|
639 $SequencesDataMap{ConservedAnnotation} = '';
|
|
640 $SequencesDataMap{InputFileType} = 'ClustalW';
|
|
641
|
|
642 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
|
|
643
|
|
644 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
|
|
645
|
|
646 # Ignore the header line...
|
|
647 $Line = <SEQUENCEFILE>;
|
|
648
|
|
649 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
|
|
650 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
|
|
651 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
|
|
652 # Extract it and save...
|
|
653 $LineLength = length($Line);
|
|
654 $AnnotationStart = $LineLength - $SequenceLength;
|
|
655 $AnnotationLength = $SequenceLength;
|
|
656 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
|
|
657 $SequencesDataMap{ConservedAnnotation} .= $Annotation;
|
|
658 }
|
|
659 else {
|
|
660 # Extract ID and sequences...
|
|
661 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
|
|
662 $Sequence =~ s/ //g;
|
|
663 if (!($ID && $Sequence)) {
|
|
664 next LINE;
|
|
665 }
|
|
666
|
|
667 if (exists $SequencesDataMap{Sequence}{$ID}) {
|
|
668 # Append to existing alignment value...
|
|
669 $SequenceLength = length($Sequence);
|
|
670 $SequencesDataMap{Sequence}{$ID} .= $Sequence;
|
|
671 }
|
|
672 else {
|
|
673 # New alignment data...
|
|
674 $SequencesDataMap{Count} += 1;
|
|
675 push @{$SequencesDataMap{IDs}}, $ID;
|
|
676 $SequencesDataMap{Description}{$ID} = $ID;
|
|
677 $SequencesDataMap{Sequence}{$ID} = $Sequence;
|
|
678 $SequenceLength = length($Sequence);
|
|
679 }
|
|
680 }
|
|
681 }
|
|
682 close SEQUENCEFILE;
|
|
683 return (\%SequencesDataMap);
|
|
684 }
|
|
685
|
|
686 # Read Pearson fasta file and return a reference to hash with these keys:
|
|
687 #
|
|
688 # {IDs} - Array of sequence IDs
|
|
689 # {Count} - Number of sequences
|
|
690 # {Description}{$ID} - Sequence description
|
|
691 # {Sequence}{$ID} - Sequence for a specific ID
|
|
692 # {InputFileType} - Sequence file format
|
|
693 # {ConservedAnnotation} - Conserved residue annonation
|
|
694 #
|
|
695 sub _ReadPearsonFastaFile {
|
|
696 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
|
|
697
|
|
698 ($FastaFileName) = @_;
|
|
699
|
|
700 %FastaDataMap = ();
|
|
701 @{$FastaDataMap{IDs}} =();
|
|
702 %{$FastaDataMap{Description}} =();
|
|
703 %{$FastaDataMap{Sequence}} =();
|
|
704 $FastaDataMap{Count} = 0;
|
|
705 $FastaDataMap{InputFileType} = 'Pearson';
|
|
706
|
|
707 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
|
|
708 $ID = '';
|
|
709 $IgnoreID = 0;
|
|
710 LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
|
|
711 if ($Line =~ /^\>/) {
|
|
712 # Start of a new ID...
|
|
713 $Line =~ s/^\>//;
|
|
714 $Line = RemoveLeadingWhiteSpaces($Line);
|
|
715 @LineWords = ();
|
|
716 @LineWords = split / /, $Line;
|
|
717
|
|
718 $ID = $LineWords[0];
|
|
719 $ID =~ s/ //g;
|
|
720 $Description = $Line;
|
|
721
|
|
722 $IgnoreID = 0;
|
|
723 if (exists $FastaDataMap{Sequence}{$ID}) {
|
|
724 $IgnoreID = 1;
|
|
725 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
|
|
726 next LINE;
|
|
727 }
|
|
728 push @{$FastaDataMap{IDs}}, $ID;
|
|
729 $FastaDataMap{Description}{$ID} = $Description;
|
|
730 $FastaDataMap{Count} += 1;
|
|
731 next LINE;
|
|
732 }
|
|
733 if ($IgnoreID) { next LINE; }
|
|
734
|
|
735 # Remove any spaces in the sequence...
|
|
736 $Line =~ s/ //g;
|
|
737 # Sequence data for active ID...
|
|
738 if (exists $FastaDataMap{Sequence}{$ID}) {
|
|
739 $FastaDataMap{Sequence}{$ID} .= $Line;
|
|
740 }
|
|
741 else {
|
|
742 $FastaDataMap{Sequence}{$ID} = $Line;
|
|
743 }
|
|
744 }
|
|
745 close FASTAFILE;
|
|
746 return \%FastaDataMap;
|
|
747 }
|
|
748
|
|
749 # Read PIR fasta file and return a reference to hash with these keys:
|
|
750 #
|
|
751 # {IDs} - Array of sequence IDs
|
|
752 # {Count} - Number of sequences
|
|
753 # {Description}{$ID} - Sequence description
|
|
754 # {Sequence}{$ID} - Sequence for a specific ID
|
|
755 # {InputFileType} - Sequence file format
|
|
756 # {ConservedAnnotation} - Conserved residue annonation
|
|
757 #
|
|
758 # Format:
|
|
759 # A sequence in PIR format consists of:
|
|
760 # One line starting with
|
|
761 # a ">" (greater-than) sign, followed by
|
|
762 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
|
|
763 # a semicolon, followed by
|
|
764 # the sequence identification code (the database ID-code).
|
|
765 # One line containing a textual description of the sequence.
|
|
766 # One or more lines containing the sequence itself. The end of the
|
|
767 # sequence is marked by a "*" (asterisk) character.
|
|
768 #
|
|
769 # A file in PIR format may comprise more than one sequence.
|
|
770 #
|
|
771 # The PIR format is also often referred to as the NBRF format.
|
|
772 #
|
|
773 # Code SequenceType
|
|
774 # P1 Protein (complete)
|
|
775 # F1 Protein (fragment)
|
|
776 # DL DNA (linear)
|
|
777 # DC DNA (circular)
|
|
778 # RL RNA (linear)
|
|
779 # RC RNA (circular)
|
|
780 # N3 tRNA
|
|
781 # N1 Other functional RNA
|
|
782 #
|
|
783
|
|
784 sub _ReadPIRFastaFile {
|
|
785 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
|
|
786
|
|
787 ($FastaFileName) = @_;
|
|
788
|
|
789 %FastaDataMap = ();
|
|
790 @{$FastaDataMap{IDs}} =();
|
|
791 %{$FastaDataMap{Description}} =();
|
|
792 %{$FastaDataMap{Sequence}} =();
|
|
793 %{$FastaDataMap{SequenceTypeCode}} =();
|
|
794 $FastaDataMap{Count} = 0;
|
|
795 $FastaDataMap{InputFileType} = 'PIR';
|
|
796
|
|
797 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
|
|
798 $ID = '';
|
|
799 $ReadingSequenceData = 0;
|
|
800 LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
|
|
801 if ($Line =~ /^\>/) {
|
|
802 # Start of a new ID...
|
|
803 $Line =~ s/^\>//;
|
|
804 $Line = RemoveLeadingWhiteSpaces($Line);
|
|
805 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
|
|
806
|
|
807 # Use next line to retrieve sequence description...
|
|
808 $Line = GetTextLine(\*FASTAFILE);
|
|
809 $Line = RemoveLeadingWhiteSpaces($Line);
|
|
810 $Description = $Line;
|
|
811
|
|
812 if (exists $FastaDataMap{Sequence}{$ID}) {
|
|
813 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
|
|
814 next LINE;
|
|
815 }
|
|
816 $ReadingSequenceData = 1;
|
|
817 push @{$FastaDataMap{IDs}}, $ID;
|
|
818 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
|
|
819 $FastaDataMap{Description}{$ID} = $Description;
|
|
820 $FastaDataMap{Count} += 1;
|
|
821 next LINE;
|
|
822 }
|
|
823 if (!$ReadingSequenceData) { next LINE; }
|
|
824
|
|
825 # Remove any spaces in the sequence...
|
|
826 $Line =~ s/ //g;
|
|
827 if ($Line =~ /[\*]$/) {
|
|
828 # End of sequence...
|
|
829 $ReadingSequenceData = 0;
|
|
830 $Line =~ s/[\*]$//;
|
|
831 }
|
|
832 # Sequence data for active ID...
|
|
833 if (exists $FastaDataMap{Sequence}{$ID}) {
|
|
834 $FastaDataMap{Sequence}{$ID} .= $Line;
|
|
835 }
|
|
836 else {
|
|
837 $FastaDataMap{Sequence}{$ID} = $Line;
|
|
838 }
|
|
839 }
|
|
840 close FASTAFILE;
|
|
841 return \%FastaDataMap;
|
|
842 }
|
|
843
|
|
844 # Read MSF file and return a reference to hash with these keys:
|
|
845 #
|
|
846 # {IDs} : Array of IDs in order as they appear in file
|
|
847 # {Count}: ID count...
|
|
848 # {Description}{$ID} : Description data...
|
|
849 # {Sequence}{$ID} : Sequence data...
|
|
850 #
|
|
851 sub _ReadMSFFile {
|
|
852 my($MSFFileName, $Line, @LineWords, %MSFDataMap);
|
|
853
|
|
854 ($MSFFileName) = @_;
|
|
855
|
|
856 %MSFDataMap = ();
|
|
857 @{$MSFDataMap{IDs}} =();
|
|
858 %{$MSFDataMap{Description}} =();
|
|
859 %{$MSFDataMap{Sequence}} =();
|
|
860 $MSFDataMap{Count} = 0;
|
|
861 $MSFDataMap{InputFileType} = 'MSF';
|
|
862
|
|
863 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
|
|
864
|
|
865 # Collect sequences and IDs...
|
|
866 #
|
|
867 # '//' after the name fields indicates end of header list and start of sequence data.
|
|
868 #
|
|
869 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
|
|
870 %MSFIDsMap = ();
|
|
871 $NameFieldsFound = 0;
|
|
872 LINE: while ($Line = GetTextLine(\*MSFFILE)) {
|
|
873 if ($Line =~ /Name:/) {
|
|
874 $NameFieldsFound++;
|
|
875 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
|
|
876 if ($ID =~ / /) {
|
|
877 ($ID) = $ID =~ /^(.*?)[ ]+/
|
|
878 }
|
|
879 if (exists $MSFIDsMap{$ID}) {
|
|
880 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
|
|
881 next LINE;
|
|
882 }
|
|
883 $MSFIDsMap{$ID} = $ID;
|
|
884 push @{$MSFDataMap{IDs}}, $ID;
|
|
885 $MSFDataMap{Description}{$ID} = $ID;
|
|
886 $MSFDataMap{Count} += 1;
|
|
887 }
|
|
888 elsif ( /\/\// && $NameFieldsFound) {
|
|
889 # End of header list...
|
|
890 last LINE;
|
|
891 }
|
|
892 }
|
|
893 # Collect all sequences...
|
|
894 #
|
|
895 my($FirstField, $SecondField);
|
|
896 while ($Line = GetTextLine(\*MSFFILE)) {
|
|
897 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
|
|
898 if (exists $MSFIDsMap{$FirstField}) {
|
|
899 # It's ID and sequence data...
|
|
900 $ID = $FirstField;
|
|
901 $Sequence = $SecondField;
|
|
902 # Take out spaces and leave the gap characters...
|
|
903 $Sequence =~ s/ //g;
|
|
904 if ($MSFDataMap{Sequence}{$ID}) {
|
|
905 $MSFDataMap{Sequence}{$ID} .= $Sequence;
|
|
906 }
|
|
907 else {
|
|
908 $MSFDataMap{Sequence}{$ID} = $Sequence;
|
|
909 }
|
|
910 }
|
|
911 }
|
|
912
|
|
913 close MSFFILE;
|
|
914 return \%MSFDataMap;
|
|
915 }
|
|
916
|
|
917
|
|
918 1;
|
|
919
|
|
920 __END__
|
|
921
|
|
922 =head1 NAME
|
|
923
|
|
924 SequenceFileUtil
|
|
925
|
|
926 =head1 SYNOPSIS
|
|
927
|
|
928 use SequenceFileUtil ;
|
|
929
|
|
930 use SequenceFileUtil qw(:all);
|
|
931
|
|
932 =head1 DESCRIPTION
|
|
933
|
|
934 B<SequenceFileUtil> module provides the following functions:
|
|
935
|
|
936 AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity,
|
|
937 CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength,
|
|
938 GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile,
|
|
939 IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile,
|
|
940 ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile,
|
|
941 ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns,
|
|
942 RemoveSequenceGaps, WritePearsonFastaSequenceFile
|
|
943 SequenceFileUtil module provides various methods to process sequence
|
|
944 files and retreive appropriate information.
|
|
945
|
|
946 =head1 FUNCTIONS
|
|
947
|
|
948 =over 4
|
|
949
|
|
950 =item B<AreSequenceLengthsIdentical>
|
|
951
|
|
952 $Status = AreSequenceLengthsIdentical($SequencesDataRef);
|
|
953
|
|
954 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1
|
|
955 or 0 based whether lengths of all the sequence is same.
|
|
956
|
|
957 =item B<CalcuatePercentSequenceIdentity>
|
|
958
|
|
959 $PercentIdentity =
|
|
960 AreSequenceLengthsIdenticalAreSequenceLengthsIdentical(
|
|
961 $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]);
|
|
962
|
|
963 Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments
|
|
964 I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the
|
|
965 returned value. By default, gaps are ignored and precision is set up to 1 decimal.
|
|
966
|
|
967 =item B<CalculatePercentSequenceIdentityMatrix>
|
|
968
|
|
969 $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix(
|
|
970 $SequencesDataRef, [$IgnoreGaps,
|
|
971 $Precision]);
|
|
972
|
|
973 Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef>
|
|
974 and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and
|
|
975 I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps
|
|
976 are ignored and precision is set up to 1 decimal.
|
|
977
|
|
978 =item B<GetSequenceLength>
|
|
979
|
|
980 $SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]);
|
|
981
|
|
982 Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling
|
|
983 of gaps. By default, gaps are ignored.
|
|
984
|
|
985 =item B<GetShortestSequence>
|
|
986
|
|
987 ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence(
|
|
988 $SequencesDataRef, [$IgnoreGaps]);
|
|
989
|
|
990 Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID,
|
|
991 $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps
|
|
992 controls handling of gaps in sequences. By default, gaps are ignored.
|
|
993
|
|
994 =item B<GetLongestSequence>
|
|
995
|
|
996 ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence(
|
|
997 $SequencesDataRef, [$IgnoreGaps]);
|
|
998
|
|
999 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>,
|
|
1000 B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument
|
|
1001 $I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored.
|
|
1002
|
|
1003 =item B<IsGapResidue>
|
|
1004
|
|
1005 $Status = AreSequenceLengthsIdentical($Residue);
|
|
1006
|
|
1007 Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is
|
|
1008 considered a gap residue.
|
|
1009
|
|
1010 =item B<IsSupportedSequenceFile>
|
|
1011
|
|
1012 $Status = IsSupportedSequenceFile($SequenceFile);
|
|
1013
|
|
1014 Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence
|
|
1015 format.
|
|
1016
|
|
1017 =item B<IsClustalWSequenceFile>
|
|
1018
|
|
1019 $Status = IsClustalWSequenceFile($SequenceFile);
|
|
1020
|
|
1021 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment
|
|
1022 format.
|
|
1023
|
|
1024 =item B<IsPearsonFastaSequenceFile>
|
|
1025
|
|
1026 $Status = IsPearsonFastaSequenceFile($SequenceFile);
|
|
1027
|
|
1028 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence
|
|
1029 format.
|
|
1030
|
|
1031 =item B<IsPIRFastaSequenceFile>
|
|
1032
|
|
1033 $Status = IsPIRFastaSequenceFile($SequenceFile);
|
|
1034
|
|
1035 Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence
|
|
1036 format.
|
|
1037
|
|
1038 =item B<IsMSFSequenceFile>
|
|
1039
|
|
1040 $Status = IsClustalWSequenceFile($SequenceFile);
|
|
1041
|
|
1042 Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment
|
|
1043 format.
|
|
1044
|
|
1045 =item B<ReadSequenceFile>
|
|
1046
|
|
1047 $SequenceDataMapRef = ReadSequenceFile($SequenceFile);
|
|
1048
|
|
1049 Reads I<SequenceFile> and returns reference to a hash containing following key/value
|
|
1050 pairs:
|
|
1051
|
|
1052 $SequenceDataMapRef->{IDs} - Array of sequence IDs
|
|
1053 $SequenceDataMapRef->{Count} - Number of sequences
|
|
1054 $SequenceDataMapRef->{Description}{$ID} - Sequence description
|
|
1055 $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID
|
|
1056 $SequenceDataMapRef->{Sequence}{InputFileType} - File format
|
|
1057
|
|
1058 =item B<ReadClustalWSequenceFile>
|
|
1059
|
|
1060 $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile);
|
|
1061
|
|
1062 Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value
|
|
1063 pairs as describes in B<ReadSequenceFile> method.
|
|
1064
|
|
1065 =item B<ReadMSFSequenceFile>
|
|
1066
|
|
1067 $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile);
|
|
1068
|
|
1069 Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value
|
|
1070 pairs as describes in B<ReadSequenceFile> method.
|
|
1071
|
|
1072 =item B<ReadPIRFastaSequenceFile>
|
|
1073
|
|
1074 $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile);
|
|
1075
|
|
1076 Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value
|
|
1077 pairs as describes in B<ReadSequenceFile> method.
|
|
1078
|
|
1079 =item B<ReadPearsonFastaSequenceFile>
|
|
1080
|
|
1081 $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile);
|
|
1082
|
|
1083 Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value
|
|
1084 pairs as describes in B<ReadSequenceFile> method.
|
|
1085
|
|
1086 =item B<RemoveSequenceGaps>
|
|
1087
|
|
1088 $SeqWithoutGaps = RemoveSequenceGaps($Sequence);
|
|
1089
|
|
1090 Removes gaps from I<Sequence> and return a sequence without any gaps.
|
|
1091
|
|
1092 =item B<RemoveSequenceAlignmentGapColumns>
|
|
1093
|
|
1094 $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns(
|
|
1095 $AlignmentDataMapRef);
|
|
1096
|
|
1097 Using input alignment data map ref containing following keys, generate a new hash with
|
|
1098 same set of keys after residue columns containg only gaps have been removed:
|
|
1099
|
|
1100 {IDs} : Array of IDs in order as they appear in file
|
|
1101 {Count}: ID count
|
|
1102 {Description}{$ID} : Description data
|
|
1103 {Sequence}{$ID} : Sequence data
|
|
1104
|
|
1105 =item B<WritePearsonFastaSequenceFile>
|
|
1106
|
|
1107 WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef,
|
|
1108 [$MaxLength]);
|
|
1109
|
|
1110 Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence
|
|
1111 file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is
|
|
1112 80.
|
|
1113
|
|
1114 =back
|
|
1115
|
|
1116 =head1 AUTHOR
|
|
1117
|
|
1118 Manish Sud <msud@san.rr.com>
|
|
1119
|
|
1120 =head1 SEE ALSO
|
|
1121
|
|
1122 PDBFileUtil.pm
|
|
1123
|
|
1124 =head1 COPYRIGHT
|
|
1125
|
|
1126 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1127
|
|
1128 This file is part of MayaChemTools.
|
|
1129
|
|
1130 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1131 the terms of the GNU Lesser General Public License as published by the Free
|
|
1132 Software Foundation; either version 3 of the License, or (at your option)
|
|
1133 any later version.
|
|
1134
|
|
1135 =cut
|