comparison lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 package SequenceFileUtil;
2 #
3 # $RCSfile: SequenceFileUtil.pm,v $
4 # $Date: 2015/02/28 20:47:18 $
5 # $Revision: 1.33 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Exporter;
31 use Text::ParseWords;
32 use TextUtil;
33 use FileUtil;
34
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
36
37 @ISA = qw(Exporter);
38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile);
39 @EXPORT_OK = qw();
40
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
42
43 # Compare lengths of all sequences...
44 sub AreSequenceLengthsIdentical {
45 my($SequencesDataRef) = @_;
46 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen);
47
48 $Status = 1;
49 $FirstID = '';
50 $FirstDifferentLenID = '';
51
52 ID: for $ID (@{$SequencesDataRef->{IDs}}) {
53 if (!$FirstID) {
54 $FirstID = $ID;
55 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID});
56 next ID;
57 }
58 $SeqLen = length($SequencesDataRef->{Sequence}{$ID});
59 if ($SeqLen != $FirstSeqLen) {
60 $Status = 0;
61 $FirstDifferentLenID = $ID;
62 last ID;
63 }
64 }
65 return ($Status);
66 }
67
68 # Calculate percent identity between two sequences. By default, gaps are ignored.
69 sub CalcuatePercentSequenceIdentity {
70 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision);
71
72 $PercentIdentity = '';
73 $Precision = 1;
74 $IgnoreGaps = 1;
75 if (@_ == 4) {
76 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_;
77 }
78 elsif (@_ == 3) {
79 ($Sequence1, $Sequence2, $IgnoreGaps) = @_;
80 }
81 elsif (@_ == 2) {
82 ($Sequence1, $Sequence2) = @_;
83 }
84 else {
85 return $PercentIdentity;
86 }
87 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) {
88 return $PercentIdentity;
89 }
90 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount);
91
92 $Sequence1Len = length($Sequence1);
93 $Sequence2Len = length($Sequence2);
94
95 $ResMatchCount = 0;
96 $ResCount = 0;
97 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) {
98 $Residue1 = substr($Sequence1, $Index, 1);
99 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : '';
100 if ($IgnoreGaps) {
101 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) {
102 next RESIDUE;
103 }
104 }
105 if ($Residue1 eq $Residue2) {
106 $ResMatchCount++;
107 }
108 $ResCount++;
109 }
110 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0;
111 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100));
112
113 return $PercentIdentity;
114 }
115
116 # Calculate pairwise identify matrix for all the sequences and return a reference
117 # to a hash with the following keys:
118 #
119 # {IDs} - Sequence IDs
120 # {Count} - Number of IDs
121 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences
122 #
123 sub CalculatePercentSequenceIdentityMatrix {
124 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData);
125
126 $IgnoreGaps = 1;
127 $Precision = 1;
128 if (@_ == 3) {
129 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_;
130 }
131 elsif (@_ == 2) {
132 ($SequencesDataRef, $IgnoreGaps) = @_;
133 }
134 else {
135 ($SequencesDataRef) = @_;
136 }
137
138 %IdentityMatrixData = ();
139 @{$IdentityMatrixData{IDs}} = ();
140 %{$IdentityMatrixData{PercentIdentity}} = ();
141 $IdentityMatrixData{Count} = 0;
142
143 for $ID (@{$SequencesDataRef->{IDs}}) {
144 push @{$IdentityMatrixData{IDs}}, $ID;
145 $IdentityMatrixData{Count} += 1;
146 }
147 # Initialize and calculate percent identity data values...
148 for $RowID (@{$SequencesDataRef->{IDs}}) {
149 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = ();
150 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID};
151 for $ColID (@{$SequencesDataRef->{IDs}}) {
152 $IdentityMatrixData{$RowID}{$ColID} = '';
153 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID};
154 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision);
155 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity;
156 }
157 }
158 return \%IdentityMatrixData;
159 }
160
161 # Retrieve information about shortest sequence...
162 sub GetShortestSequence {
163 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
164
165 $IgnoreGaps = 1;
166 if (@_ == 2) {
167 ($SequencesDataRef, $IgnoreGaps) = @_;
168 }
169 else {
170 ($SequencesDataRef) = @_;
171 }
172
173 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps);
174 return ($ID, $Sequence, $SeqLen, $Description);
175 }
176
177 # Retrieve information about longest sequence..
178 sub GetLongestSequence {
179 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description);
180
181 $IgnoreGaps = 1;
182 if (@_ == 2) {
183 ($SequencesDataRef, $IgnoreGaps) = @_;
184 }
185 else {
186 ($SequencesDataRef) = @_;
187 }
188
189 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps);
190 return ($ID, $Sequence, $SeqLen, $Description);
191 }
192
193 # Get sequence length...
194 sub GetSequenceLength {
195 my($Seq, $SeqLen, $IgnoreGaps);
196
197 $SeqLen = ''; $IgnoreGaps = 1;
198 if (@_ == 2) {
199 ($Seq, $IgnoreGaps) = @_;
200 }
201 else {
202 ($Seq) = @_;
203 }
204 if ($IgnoreGaps) {
205 my($Index, $Residue);
206 $SeqLen = 0;
207 for $Index (0 .. (length($Seq) - 1)) {
208 $Residue = substr($Seq, $Index, 1);
209 if ($Residue =~ /[A-Z]/i) {
210 $SeqLen++;
211 }
212 }
213 }
214 else {
215 $SeqLen = length($Seq);
216 }
217
218 return $SeqLen;
219 }
220
221 # Is it a gap residue...
222 sub IsGapResidue {
223 my($Residue) = @_;
224 my($Status);
225
226 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0;
227
228 return $Status;
229 }
230
231 # Is it a supported sequence file?
232 #
233 # Supported seqence formats are:
234 #
235 # ALN/ClustalW .aln
236 # GCG/MSF .msf
237 # PILEUP/MSF .msf
238 # Fasts(Pearson) .fasta, .fta
239 # NBRF/PIR .pir
240 #
241 sub IsSupportedSequenceFile {
242 my($SequenceFile) = @_;
243 my($Status, $SequenceFormat);
244 $Status = 0; $SequenceFormat = 'NotSupported';
245
246 SEQFORMAT: {
247 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT}
248 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT}
249 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT}
250 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT}
251 $Status = 0; $SequenceFormat = 'NotSupported';
252 }
253 return ($Status, $SequenceFormat);
254 }
255
256 # Is it a ClustalW multiple sequence sequence file...
257 sub IsClustalWSequenceFile {
258 my($SequenceFile) = @_;
259 my($Status, $Line);
260
261 $Status = 0;
262
263 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
264 $Line = GetTextLine(\*SEQUENCEFILE);
265 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0;
266 close SEQUENCEFILE;
267
268 return $Status;
269 }
270
271 # Is it a valid Pearson fasta sequence or alignment file?
272 #
273 sub IsPearsonFastaSequenceFile {
274 my($FastaFile, $Line, $Status);
275
276 ($FastaFile) = @_;
277 $Status = 0;
278
279 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
280 $Line = GetTextLine(\*FASTAFILE);
281
282 # First line starts with > and the fourth character is not ';'; otherwise, it's
283 # PIR FASTA format.
284 if ($Line =~ /^>/) {
285 my($FourthChar);
286 $FourthChar = substr($Line, 3, 1);
287 $Status = ($FourthChar !~ /\;/) ? 1 : 0;
288 }
289 close FASTAFILE;
290
291 return $Status;
292 }
293
294 # Is it a valid NBRF/PIR fasta sequence or alignment file?
295 #
296 sub IsPIRFastaSequenceFile {
297 my($FastaFile, $Line, $Status);
298
299 ($FastaFile) = @_;
300 $Status = 0;
301
302 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n";
303 $Line = GetTextLine(\*FASTAFILE);
304
305 # First line starts with > and the fourth character is ';'; otherwise, it's
306 # a Pearson FASTA format.
307 if ($Line =~ /^>/) {
308 my($FourthChar);
309 $FourthChar = substr($Line, 3, 1);
310 $Status = ($FourthChar =~ /\;/) ? 1 : 0;
311 }
312 close FASTAFILE;
313
314 return $Status;
315 }
316
317 # Is it a valid MSF sequence or alignment file?
318 #
319 sub IsMSFSequenceFile {
320 my($MSFFile) = @_;
321
322 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n";
323
324 my($Line, $Status);
325
326 $Status = 0;
327 # Find a line that contains MSF: keyword and ends with '..'
328 LINE: while ($Line = GetTextLine(\*MSFFILE)) {
329 $Line = RemoveLeadingWhiteSpaces($Line);
330 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) {
331 $Status = 1;
332 last LINE;
333 }
334 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) {
335 # Pileup MSF...
336 $Status = 1;
337 last LINE;
338 }
339 }
340 close MSFFILE;
341
342 return $Status;
343 }
344
345 # Read sequence or sequence alignment file...
346 sub ReadSequenceFile {
347 my($SequenceFile) = @_;
348
349 if (IsPearsonFastaSequenceFile($SequenceFile)) {
350 return ReadPearsonFastaSequenceFile($SequenceFile);
351 }
352 elsif (IsPIRFastaSequenceFile($SequenceFile)) {
353 return ReadPIRFastaSequenceFile($SequenceFile);
354 }
355 elsif (IsMSFSequenceFile($SequenceFile)) {
356 return ReadMSFSequenceFile($SequenceFile);
357 }
358 elsif (IsClustalWSequenceFile($SequenceFile)) {
359 return ReadClustalWSequenceFile($SequenceFile);
360 }
361 else {
362 return undef;
363 }
364 }
365
366 # Read file and setup alignment data...
367 sub ReadClustalWSequenceFile {
368 my($SequenceFile) = @_;
369
370 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW');
371 }
372
373 # Read file and setup alignment data...
374 sub ReadPearsonFastaSequenceFile {
375 my($SequenceFile) = @_;
376
377 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson');
378 }
379
380 # Read file and setup alignment data...
381 sub ReadPIRFastaSequenceFile {
382 my($SequenceFile) = @_;
383
384 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR');
385 }
386
387
388 # Read file and setup sequence data...
389 sub ReadMSFSequenceFile {
390 my($SequenceFile) = @_;
391
392 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF');
393 }
394
395 # Write out a Pearson FASTA file...
396 sub WritePearsonFastaSequenceFile {
397 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence);
398
399 $MaxLength = 80;
400 if (@_ == 3) {
401 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_;
402 }
403 elsif (@_ == 2) {
404 ($SequenceFileName, $SequenceDataRef) = @_;
405 }
406 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n";
407 for $ID (@{$SequenceDataRef->{IDs}}) {
408 $Description = $SequenceDataRef->{Description}{$ID};
409 $Sequence = $SequenceDataRef->{Sequence}{$ID};
410 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n");
411
412 # Description also contains ID...
413 print SEQUENCEFILE ">$Description\n";
414 print SEQUENCEFILE "$WrappedSequence\n";
415 }
416 close SEQUENCEFILE;
417 }
418
419 # Get ID, Sequence and Length for smallest or longest sequence
420 sub _GetShortestOrLongestSequence {
421 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_;
422 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
423
424 ($ID, $Seq, $SeqLen) = ('', '', '');
425 $FirstID = '';
426
427 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) {
428 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID};
429 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps);
430 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID};
431 if (!$FirstID) {
432 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen;
433 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
434 next ID;
435 }
436 if ($CurrentSeqLen != $SeqLen) {
437 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) {
438 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
439 }
440 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) {
441 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription);
442 }
443 }
444 }
445 return ($ID, $Seq, $SeqLen, $Description);
446 }
447
448 # Remove gaps in the sequence and return new sequence...
449 sub RemoveSequenceGaps {
450 my($Seq) = @_;
451 my($SeqWithoutGaps, $SeqLen, $Index, $Residue);
452
453 $SeqWithoutGaps = '';
454 $SeqLen = length($Seq);
455 for $Index (0 .. ($SeqLen - 1)) {
456 $Residue = substr($Seq, $Index, 1);
457 if ($Residue =~ /[A-Z]/i) {
458 $SeqWithoutGaps .= $Residue;
459 }
460 }
461
462 return $SeqWithoutGaps;
463 }
464
465 # Using input alignment data map ref containing following keys, generate
466 # a new hash with same set of keys after residue columns containg only
467 # gaps have been removed:
468 #
469 # {IDs} : Array of IDs in order as they appear in file
470 # {Count}: ID count...
471 # {Description}{$ID} : Description data...
472 # {Sequence}{$ID} : Sequence data...
473 #
474 sub RemoveSequenceAlignmentGapColumns {
475 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap);
476
477 ($AlignmentDataMapRef) = @_;
478
479 %NewAlignmentDataMap = ();
480 @{$NewAlignmentDataMap{IDs}} =();
481 %{$NewAlignmentDataMap{Description}} =();
482 %{$NewAlignmentDataMap{Sequence}} =();
483 $NewAlignmentDataMap{Count} = 0;
484
485 # Transfer ID and count information...
486 for $ID (@{$AlignmentDataMapRef->{IDs}}) {
487 push @{$NewAlignmentDataMap{IDs}}, $ID;
488 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID};
489 $NewAlignmentDataMap{Sequence}{$ID} = '';
490 $NewAlignmentDataMap{Count} += 1;
491 }
492
493 # Go over residue columns and transfer the data...
494 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn);
495
496 $FirstID = $AlignmentDataMapRef->{IDs}[0];
497 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID};
498 $FirstSeqLen = length($FirstSeq);
499
500 RES: for $Index (0 .. ($FirstSeqLen - 1)) {
501 # Is this a gap column?
502 $GapColumn = 1;
503 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) {
504 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
505 if ($Res =~ /[A-Z]/i) {
506 $GapColumn = 0;
507 last ID;
508 }
509 }
510 if ($GapColumn) {
511 next RES;
512 }
513 # Transfer this residue...
514 for $ID (@{$AlignmentDataMapRef->{IDs}}) {
515 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1);
516 $NewAlignmentDataMap{Sequence}{$ID} .= $Res;
517 }
518 }
519
520 return (\%NewAlignmentDataMap);
521 }
522
523 #
524 # Read sequences file and return a reference to hash with the following keys:
525 #
526 # {IDs} - Array of sequence IDs
527 # {Count} - Number of sequences
528 # {Description}{$ID} - Sequence description
529 # {Sequence}{$ID} - Sequence for a specific ID
530 # {InputFileType} - Sequence file format
531 # {ConservedAnnotation} - Conserved residue annonation
532 #
533 # Note:
534 # . Conserved residue annotation either exist in the input sequence alignment file or set
535 # for a file containing same number of residues for all the sequence using the following
536 # notation: * - Residue conserved; ' ' - Residue not conserved.
537 #
538 sub _ReadFileAndSetupSequencesData {
539 my($SequenceFile, $SequenceType) = @_;
540 my($SequenceDataMapRef);
541
542 $SequenceDataMapRef = undef;
543
544 # Read sequence file...
545 $SequenceDataMapRef = '';
546 if ($SequenceType =~ /^ClustalW$/i) {
547 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile);
548 }
549 elsif ($SequenceType =~ /^Pearson$/i) {
550 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile);
551 }
552 elsif ($SequenceType =~ /^PIR$/i) {
553 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile);
554 }
555 elsif ($SequenceType =~ /^MSF$/i) {
556 $SequenceDataMapRef = _ReadMSFFile($SequenceFile);
557 }
558 else {
559 return $SequenceDataMapRef;
560 }
561
562 if (exists $SequenceDataMapRef->{ConservedAnnotation}) {
563 return ($SequenceDataMapRef);
564 }
565 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) {
566 return ($SequenceDataMapRef);
567 }
568
569 # Use the first sequence to setup an empty ConservedAnnotation key...
570 # And mark fully conserved residues...
571 #
572 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index);
573 $ID = $SequenceDataMapRef->{IDs}[0];
574 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID};
575 $FirstSeqLen = length($FirstSequence);
576 $SequenceDataMapRef->{ConservedAnnotation} = '';
577 for $Index (0 .. ($FirstSeqLen - 1)) {
578 $FirstRes = '';
579 $ResConserved = 1;
580 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) {
581 $Sequence = $SequenceDataMapRef->{Sequence}{$ID};
582 $Res = substr($Sequence, $Index, 1);
583 if (!$FirstRes) {
584 $FirstRes = $Res;
585 next ID;
586 }
587 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) {
588 $ResConserved = 0;
589 last ID;
590 }
591 }
592 if ($ResConserved) {
593 $SequenceDataMapRef->{ConservedAnnotation} .= '*';
594 }
595 else {
596 $SequenceDataMapRef->{ConservedAnnotation} .= ' ';
597 }
598 }
599
600 return ($SequenceDataMapRef);
601 }
602
603 # Read sequence data in ClustalW multiple sequence alignment file and
604 # return a reference to hash with these keys and values:
605 #
606 # {IDs} - Array of sequence IDs
607 # {Count} - Number of sequences
608 # {Description}{$ID} - Sequence description
609 # {Sequence}{$ID} - Sequence for a specific ID
610 # {InputFileType} - Sequence file format
611 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , .
612 #
613 #
614 #
615 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean:
616 #
617 # '*' indicates positions which have a single, fully conserved residue
618 #
619 # ':' indicates that one of the following 'strong' groups is fully conserved: STA
620 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW
621
622 # '.' indicates that one of the following 'weaker' groups is fully conserved:
623 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY
624 #
625 # These are all the positively scoring groups that occur in the Gonnet Pam250
626 # matrix. The strong and weak groups are defined as strong score >0.5 and weak
627 # score =<0.5 respectively.
628 #
629 sub _ReadClustalWFile {
630 my($SequenceFile) = @_;
631 my(%SequencesDataMap);
632
633 # Initialize data...
634 %SequencesDataMap = ();
635 @{$SequencesDataMap{IDs}} = ();
636 %{$SequencesDataMap{Description}} = ();
637 %{$SequencesDataMap{Sequence}} = ();
638 $SequencesDataMap{Count} = 0;
639 $SequencesDataMap{ConservedAnnotation} = '';
640 $SequencesDataMap{InputFileType} = 'ClustalW';
641
642 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n";
643
644 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex);
645
646 # Ignore the header line...
647 $Line = <SEQUENCEFILE>;
648
649 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) {
650 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) {
651 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd.
652 # Extract it and save...
653 $LineLength = length($Line);
654 $AnnotationStart = $LineLength - $SequenceLength;
655 $AnnotationLength = $SequenceLength;
656 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength);
657 $SequencesDataMap{ConservedAnnotation} .= $Annotation;
658 }
659 else {
660 # Extract ID and sequences...
661 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/;
662 $Sequence =~ s/ //g;
663 if (!($ID && $Sequence)) {
664 next LINE;
665 }
666
667 if (exists $SequencesDataMap{Sequence}{$ID}) {
668 # Append to existing alignment value...
669 $SequenceLength = length($Sequence);
670 $SequencesDataMap{Sequence}{$ID} .= $Sequence;
671 }
672 else {
673 # New alignment data...
674 $SequencesDataMap{Count} += 1;
675 push @{$SequencesDataMap{IDs}}, $ID;
676 $SequencesDataMap{Description}{$ID} = $ID;
677 $SequencesDataMap{Sequence}{$ID} = $Sequence;
678 $SequenceLength = length($Sequence);
679 }
680 }
681 }
682 close SEQUENCEFILE;
683 return (\%SequencesDataMap);
684 }
685
686 # Read Pearson fasta file and return a reference to hash with these keys:
687 #
688 # {IDs} - Array of sequence IDs
689 # {Count} - Number of sequences
690 # {Description}{$ID} - Sequence description
691 # {Sequence}{$ID} - Sequence for a specific ID
692 # {InputFileType} - Sequence file format
693 # {ConservedAnnotation} - Conserved residue annonation
694 #
695 sub _ReadPearsonFastaFile {
696 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap);
697
698 ($FastaFileName) = @_;
699
700 %FastaDataMap = ();
701 @{$FastaDataMap{IDs}} =();
702 %{$FastaDataMap{Description}} =();
703 %{$FastaDataMap{Sequence}} =();
704 $FastaDataMap{Count} = 0;
705 $FastaDataMap{InputFileType} = 'Pearson';
706
707 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
708 $ID = '';
709 $IgnoreID = 0;
710 LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
711 if ($Line =~ /^\>/) {
712 # Start of a new ID...
713 $Line =~ s/^\>//;
714 $Line = RemoveLeadingWhiteSpaces($Line);
715 @LineWords = ();
716 @LineWords = split / /, $Line;
717
718 $ID = $LineWords[0];
719 $ID =~ s/ //g;
720 $Description = $Line;
721
722 $IgnoreID = 0;
723 if (exists $FastaDataMap{Sequence}{$ID}) {
724 $IgnoreID = 1;
725 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
726 next LINE;
727 }
728 push @{$FastaDataMap{IDs}}, $ID;
729 $FastaDataMap{Description}{$ID} = $Description;
730 $FastaDataMap{Count} += 1;
731 next LINE;
732 }
733 if ($IgnoreID) { next LINE; }
734
735 # Remove any spaces in the sequence...
736 $Line =~ s/ //g;
737 # Sequence data for active ID...
738 if (exists $FastaDataMap{Sequence}{$ID}) {
739 $FastaDataMap{Sequence}{$ID} .= $Line;
740 }
741 else {
742 $FastaDataMap{Sequence}{$ID} = $Line;
743 }
744 }
745 close FASTAFILE;
746 return \%FastaDataMap;
747 }
748
749 # Read PIR fasta file and return a reference to hash with these keys:
750 #
751 # {IDs} - Array of sequence IDs
752 # {Count} - Number of sequences
753 # {Description}{$ID} - Sequence description
754 # {Sequence}{$ID} - Sequence for a specific ID
755 # {InputFileType} - Sequence file format
756 # {ConservedAnnotation} - Conserved residue annonation
757 #
758 # Format:
759 # A sequence in PIR format consists of:
760 # One line starting with
761 # a ">" (greater-than) sign, followed by
762 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by
763 # a semicolon, followed by
764 # the sequence identification code (the database ID-code).
765 # One line containing a textual description of the sequence.
766 # One or more lines containing the sequence itself. The end of the
767 # sequence is marked by a "*" (asterisk) character.
768 #
769 # A file in PIR format may comprise more than one sequence.
770 #
771 # The PIR format is also often referred to as the NBRF format.
772 #
773 # Code SequenceType
774 # P1 Protein (complete)
775 # F1 Protein (fragment)
776 # DL DNA (linear)
777 # DC DNA (circular)
778 # RL RNA (linear)
779 # RC RNA (circular)
780 # N3 tRNA
781 # N1 Other functional RNA
782 #
783
784 sub _ReadPIRFastaFile {
785 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap);
786
787 ($FastaFileName) = @_;
788
789 %FastaDataMap = ();
790 @{$FastaDataMap{IDs}} =();
791 %{$FastaDataMap{Description}} =();
792 %{$FastaDataMap{Sequence}} =();
793 %{$FastaDataMap{SequenceTypeCode}} =();
794 $FastaDataMap{Count} = 0;
795 $FastaDataMap{InputFileType} = 'PIR';
796
797 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n";
798 $ID = '';
799 $ReadingSequenceData = 0;
800 LINE: while ($Line = GetTextLine(\*FASTAFILE)) {
801 if ($Line =~ /^\>/) {
802 # Start of a new ID...
803 $Line =~ s/^\>//;
804 $Line = RemoveLeadingWhiteSpaces($Line);
805 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/;
806
807 # Use next line to retrieve sequence description...
808 $Line = GetTextLine(\*FASTAFILE);
809 $Line = RemoveLeadingWhiteSpaces($Line);
810 $Description = $Line;
811
812 if (exists $FastaDataMap{Sequence}{$ID}) {
813 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n";
814 next LINE;
815 }
816 $ReadingSequenceData = 1;
817 push @{$FastaDataMap{IDs}}, $ID;
818 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode;
819 $FastaDataMap{Description}{$ID} = $Description;
820 $FastaDataMap{Count} += 1;
821 next LINE;
822 }
823 if (!$ReadingSequenceData) { next LINE; }
824
825 # Remove any spaces in the sequence...
826 $Line =~ s/ //g;
827 if ($Line =~ /[\*]$/) {
828 # End of sequence...
829 $ReadingSequenceData = 0;
830 $Line =~ s/[\*]$//;
831 }
832 # Sequence data for active ID...
833 if (exists $FastaDataMap{Sequence}{$ID}) {
834 $FastaDataMap{Sequence}{$ID} .= $Line;
835 }
836 else {
837 $FastaDataMap{Sequence}{$ID} = $Line;
838 }
839 }
840 close FASTAFILE;
841 return \%FastaDataMap;
842 }
843
844 # Read MSF file and return a reference to hash with these keys:
845 #
846 # {IDs} : Array of IDs in order as they appear in file
847 # {Count}: ID count...
848 # {Description}{$ID} : Description data...
849 # {Sequence}{$ID} : Sequence data...
850 #
851 sub _ReadMSFFile {
852 my($MSFFileName, $Line, @LineWords, %MSFDataMap);
853
854 ($MSFFileName) = @_;
855
856 %MSFDataMap = ();
857 @{$MSFDataMap{IDs}} =();
858 %{$MSFDataMap{Description}} =();
859 %{$MSFDataMap{Sequence}} =();
860 $MSFDataMap{Count} = 0;
861 $MSFDataMap{InputFileType} = 'MSF';
862
863 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n";
864
865 # Collect sequences and IDs...
866 #
867 # '//' after the name fields indicates end of header list and start of sequence data.
868 #
869 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap);
870 %MSFIDsMap = ();
871 $NameFieldsFound = 0;
872 LINE: while ($Line = GetTextLine(\*MSFFILE)) {
873 if ($Line =~ /Name:/) {
874 $NameFieldsFound++;
875 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/;
876 if ($ID =~ / /) {
877 ($ID) = $ID =~ /^(.*?)[ ]+/
878 }
879 if (exists $MSFIDsMap{$ID}) {
880 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n";
881 next LINE;
882 }
883 $MSFIDsMap{$ID} = $ID;
884 push @{$MSFDataMap{IDs}}, $ID;
885 $MSFDataMap{Description}{$ID} = $ID;
886 $MSFDataMap{Count} += 1;
887 }
888 elsif ( /\/\// && $NameFieldsFound) {
889 # End of header list...
890 last LINE;
891 }
892 }
893 # Collect all sequences...
894 #
895 my($FirstField, $SecondField);
896 while ($Line = GetTextLine(\*MSFFILE)) {
897 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/;
898 if (exists $MSFIDsMap{$FirstField}) {
899 # It's ID and sequence data...
900 $ID = $FirstField;
901 $Sequence = $SecondField;
902 # Take out spaces and leave the gap characters...
903 $Sequence =~ s/ //g;
904 if ($MSFDataMap{Sequence}{$ID}) {
905 $MSFDataMap{Sequence}{$ID} .= $Sequence;
906 }
907 else {
908 $MSFDataMap{Sequence}{$ID} = $Sequence;
909 }
910 }
911 }
912
913 close MSFFILE;
914 return \%MSFDataMap;
915 }
916
917
918 1;
919
920 __END__
921
922 =head1 NAME
923
924 SequenceFileUtil
925
926 =head1 SYNOPSIS
927
928 use SequenceFileUtil ;
929
930 use SequenceFileUtil qw(:all);
931
932 =head1 DESCRIPTION
933
934 B<SequenceFileUtil> module provides the following functions:
935
936 AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity,
937 CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength,
938 GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile,
939 IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile,
940 ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile,
941 ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns,
942 RemoveSequenceGaps, WritePearsonFastaSequenceFile
943 SequenceFileUtil module provides various methods to process sequence
944 files and retreive appropriate information.
945
946 =head1 FUNCTIONS
947
948 =over 4
949
950 =item B<AreSequenceLengthsIdentical>
951
952 $Status = AreSequenceLengthsIdentical($SequencesDataRef);
953
954 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1
955 or 0 based whether lengths of all the sequence is same.
956
957 =item B<CalcuatePercentSequenceIdentity>
958
959 $PercentIdentity =
960 AreSequenceLengthsIdenticalAreSequenceLengthsIdentical(
961 $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]);
962
963 Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments
964 I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the
965 returned value. By default, gaps are ignored and precision is set up to 1 decimal.
966
967 =item B<CalculatePercentSequenceIdentityMatrix>
968
969 $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix(
970 $SequencesDataRef, [$IgnoreGaps,
971 $Precision]);
972
973 Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef>
974 and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and
975 I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps
976 are ignored and precision is set up to 1 decimal.
977
978 =item B<GetSequenceLength>
979
980 $SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]);
981
982 Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling
983 of gaps. By default, gaps are ignored.
984
985 =item B<GetShortestSequence>
986
987 ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence(
988 $SequencesDataRef, [$IgnoreGaps]);
989
990 Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID,
991 $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps
992 controls handling of gaps in sequences. By default, gaps are ignored.
993
994 =item B<GetLongestSequence>
995
996 ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence(
997 $SequencesDataRef, [$IgnoreGaps]);
998
999 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>,
1000 B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument
1001 $I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored.
1002
1003 =item B<IsGapResidue>
1004
1005 $Status = AreSequenceLengthsIdentical($Residue);
1006
1007 Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is
1008 considered a gap residue.
1009
1010 =item B<IsSupportedSequenceFile>
1011
1012 $Status = IsSupportedSequenceFile($SequenceFile);
1013
1014 Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence
1015 format.
1016
1017 =item B<IsClustalWSequenceFile>
1018
1019 $Status = IsClustalWSequenceFile($SequenceFile);
1020
1021 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment
1022 format.
1023
1024 =item B<IsPearsonFastaSequenceFile>
1025
1026 $Status = IsPearsonFastaSequenceFile($SequenceFile);
1027
1028 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence
1029 format.
1030
1031 =item B<IsPIRFastaSequenceFile>
1032
1033 $Status = IsPIRFastaSequenceFile($SequenceFile);
1034
1035 Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence
1036 format.
1037
1038 =item B<IsMSFSequenceFile>
1039
1040 $Status = IsClustalWSequenceFile($SequenceFile);
1041
1042 Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment
1043 format.
1044
1045 =item B<ReadSequenceFile>
1046
1047 $SequenceDataMapRef = ReadSequenceFile($SequenceFile);
1048
1049 Reads I<SequenceFile> and returns reference to a hash containing following key/value
1050 pairs:
1051
1052 $SequenceDataMapRef->{IDs} - Array of sequence IDs
1053 $SequenceDataMapRef->{Count} - Number of sequences
1054 $SequenceDataMapRef->{Description}{$ID} - Sequence description
1055 $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID
1056 $SequenceDataMapRef->{Sequence}{InputFileType} - File format
1057
1058 =item B<ReadClustalWSequenceFile>
1059
1060 $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile);
1061
1062 Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value
1063 pairs as describes in B<ReadSequenceFile> method.
1064
1065 =item B<ReadMSFSequenceFile>
1066
1067 $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile);
1068
1069 Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value
1070 pairs as describes in B<ReadSequenceFile> method.
1071
1072 =item B<ReadPIRFastaSequenceFile>
1073
1074 $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile);
1075
1076 Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value
1077 pairs as describes in B<ReadSequenceFile> method.
1078
1079 =item B<ReadPearsonFastaSequenceFile>
1080
1081 $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile);
1082
1083 Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value
1084 pairs as describes in B<ReadSequenceFile> method.
1085
1086 =item B<RemoveSequenceGaps>
1087
1088 $SeqWithoutGaps = RemoveSequenceGaps($Sequence);
1089
1090 Removes gaps from I<Sequence> and return a sequence without any gaps.
1091
1092 =item B<RemoveSequenceAlignmentGapColumns>
1093
1094 $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns(
1095 $AlignmentDataMapRef);
1096
1097 Using input alignment data map ref containing following keys, generate a new hash with
1098 same set of keys after residue columns containg only gaps have been removed:
1099
1100 {IDs} : Array of IDs in order as they appear in file
1101 {Count}: ID count
1102 {Description}{$ID} : Description data
1103 {Sequence}{$ID} : Sequence data
1104
1105 =item B<WritePearsonFastaSequenceFile>
1106
1107 WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef,
1108 [$MaxLength]);
1109
1110 Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence
1111 file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is
1112 80.
1113
1114 =back
1115
1116 =head1 AUTHOR
1117
1118 Manish Sud <msud@san.rr.com>
1119
1120 =head1 SEE ALSO
1121
1122 PDBFileUtil.pm
1123
1124 =head1 COPYRIGHT
1125
1126 Copyright (C) 2015 Manish Sud. All rights reserved.
1127
1128 This file is part of MayaChemTools.
1129
1130 MayaChemTools is free software; you can redistribute it and/or modify it under
1131 the terms of the GNU Lesser General Public License as published by the Free
1132 Software Foundation; either version 3 of the License, or (at your option)
1133 any later version.
1134
1135 =cut