Mercurial > repos > deepakjadmin > mayatool3_test2
comparison lib/SequenceFileUtil.pm @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 package SequenceFileUtil; | |
2 # | |
3 # $RCSfile: SequenceFileUtil.pm,v $ | |
4 # $Date: 2015/02/28 20:47:18 $ | |
5 # $Revision: 1.33 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use Exporter; | |
31 use Text::ParseWords; | |
32 use TextUtil; | |
33 use FileUtil; | |
34 | |
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |
36 | |
37 @ISA = qw(Exporter); | |
38 @EXPORT = qw(AreSequenceLengthsIdentical CalcuatePercentSequenceIdentity CalculatePercentSequenceIdentityMatrix GetLongestSequence GetShortestSequence GetSequenceLength IsGapResidue IsSupportedSequenceFile IsClustalWSequenceFile IsPearsonFastaSequenceFile IsMSFSequenceFile ReadSequenceFile RemoveSequenceGaps RemoveSequenceAlignmentGapColumns WritePearsonFastaSequenceFile); | |
39 @EXPORT_OK = qw(); | |
40 | |
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); | |
42 | |
43 # Compare lengths of all sequences... | |
44 sub AreSequenceLengthsIdentical { | |
45 my($SequencesDataRef) = @_; | |
46 my($Status, $ID, $FirstID, $FirstSeqLen, $FirstDifferentLenID, $SeqLen); | |
47 | |
48 $Status = 1; | |
49 $FirstID = ''; | |
50 $FirstDifferentLenID = ''; | |
51 | |
52 ID: for $ID (@{$SequencesDataRef->{IDs}}) { | |
53 if (!$FirstID) { | |
54 $FirstID = $ID; | |
55 $FirstSeqLen = length($SequencesDataRef->{Sequence}{$ID}); | |
56 next ID; | |
57 } | |
58 $SeqLen = length($SequencesDataRef->{Sequence}{$ID}); | |
59 if ($SeqLen != $FirstSeqLen) { | |
60 $Status = 0; | |
61 $FirstDifferentLenID = $ID; | |
62 last ID; | |
63 } | |
64 } | |
65 return ($Status); | |
66 } | |
67 | |
68 # Calculate percent identity between two sequences. By default, gaps are ignored. | |
69 sub CalcuatePercentSequenceIdentity { | |
70 my($Sequence1, $Sequence2, $PercentIdentity, $IgnoreGaps, $Precision); | |
71 | |
72 $PercentIdentity = ''; | |
73 $Precision = 1; | |
74 $IgnoreGaps = 1; | |
75 if (@_ == 4) { | |
76 ($Sequence1, $Sequence2, $IgnoreGaps, $Precision) = @_; | |
77 } | |
78 elsif (@_ == 3) { | |
79 ($Sequence1, $Sequence2, $IgnoreGaps) = @_; | |
80 } | |
81 elsif (@_ == 2) { | |
82 ($Sequence1, $Sequence2) = @_; | |
83 } | |
84 else { | |
85 return $PercentIdentity; | |
86 } | |
87 if (!(IsNotEmpty($Sequence1) && IsNotEmpty($Sequence2))) { | |
88 return $PercentIdentity; | |
89 } | |
90 my($Index, $Identity, $Sequence1Len, $Sequence2Len, $Residue1, $Residue2, $ResMatchCount, $ResCount); | |
91 | |
92 $Sequence1Len = length($Sequence1); | |
93 $Sequence2Len = length($Sequence2); | |
94 | |
95 $ResMatchCount = 0; | |
96 $ResCount = 0; | |
97 RESIDUE: for $Index (0 .. ($Sequence1Len - 1)) { | |
98 $Residue1 = substr($Sequence1, $Index, 1); | |
99 $Residue2 = ($Index < $Sequence2Len) ? substr($Sequence2, $Index, 1) : ''; | |
100 if ($IgnoreGaps) { | |
101 if ($Residue1 !~ /[A-Z]/i || $Residue2 !~ /[A-Z]/i) { | |
102 next RESIDUE; | |
103 } | |
104 } | |
105 if ($Residue1 eq $Residue2) { | |
106 $ResMatchCount++; | |
107 } | |
108 $ResCount++; | |
109 } | |
110 $Identity = $ResCount ? ($ResMatchCount/$ResCount) : 0.0; | |
111 $PercentIdentity = sprintf("%.${Precision}f", ($Identity * 100)); | |
112 | |
113 return $PercentIdentity; | |
114 } | |
115 | |
116 # Calculate pairwise identify matrix for all the sequences and return a reference | |
117 # to a hash with the following keys: | |
118 # | |
119 # {IDs} - Sequence IDs | |
120 # {Count} - Number of IDs | |
121 # {PercentIdentity}{$RowID}{$ColID} - Percent identify for a pair of sequences | |
122 # | |
123 sub CalculatePercentSequenceIdentityMatrix { | |
124 my($SequencesDataRef, $IgnoreGaps, , $Precision, $ID, $RowID, $ColID, $RowIDSeq, $ColIDSeq, $PercentIdentity, %IdentityMatrixData); | |
125 | |
126 $IgnoreGaps = 1; | |
127 $Precision = 1; | |
128 if (@_ == 3) { | |
129 ($SequencesDataRef, $IgnoreGaps, $Precision) = @_; | |
130 } | |
131 elsif (@_ == 2) { | |
132 ($SequencesDataRef, $IgnoreGaps) = @_; | |
133 } | |
134 else { | |
135 ($SequencesDataRef) = @_; | |
136 } | |
137 | |
138 %IdentityMatrixData = (); | |
139 @{$IdentityMatrixData{IDs}} = (); | |
140 %{$IdentityMatrixData{PercentIdentity}} = (); | |
141 $IdentityMatrixData{Count} = 0; | |
142 | |
143 for $ID (@{$SequencesDataRef->{IDs}}) { | |
144 push @{$IdentityMatrixData{IDs}}, $ID; | |
145 $IdentityMatrixData{Count} += 1; | |
146 } | |
147 # Initialize and calculate percent identity data values... | |
148 for $RowID (@{$SequencesDataRef->{IDs}}) { | |
149 %{$IdentityMatrixData{PercentIdentity}{$RowID}} = (); | |
150 $RowIDSeq = $SequencesDataRef->{Sequence}{$RowID}; | |
151 for $ColID (@{$SequencesDataRef->{IDs}}) { | |
152 $IdentityMatrixData{$RowID}{$ColID} = ''; | |
153 $ColIDSeq = $SequencesDataRef->{Sequence}{$ColID}; | |
154 $PercentIdentity = CalcuatePercentSequenceIdentity($RowIDSeq, $ColIDSeq, $IgnoreGaps, $Precision); | |
155 $IdentityMatrixData{PercentIdentity}{$RowID}{$ColID} = $PercentIdentity; | |
156 } | |
157 } | |
158 return \%IdentityMatrixData; | |
159 } | |
160 | |
161 # Retrieve information about shortest sequence... | |
162 sub GetShortestSequence { | |
163 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); | |
164 | |
165 $IgnoreGaps = 1; | |
166 if (@_ == 2) { | |
167 ($SequencesDataRef, $IgnoreGaps) = @_; | |
168 } | |
169 else { | |
170 ($SequencesDataRef) = @_; | |
171 } | |
172 | |
173 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Shortest', $IgnoreGaps); | |
174 return ($ID, $Sequence, $SeqLen, $Description); | |
175 } | |
176 | |
177 # Retrieve information about longest sequence.. | |
178 sub GetLongestSequence { | |
179 my($SequencesDataRef, $IgnoreGaps, $ID, $Sequence, $SeqLen, $Description); | |
180 | |
181 $IgnoreGaps = 1; | |
182 if (@_ == 2) { | |
183 ($SequencesDataRef, $IgnoreGaps) = @_; | |
184 } | |
185 else { | |
186 ($SequencesDataRef) = @_; | |
187 } | |
188 | |
189 ($ID, $Sequence, $SeqLen, $Description) = _GetShortestOrLongestSequence($SequencesDataRef, 'Longest', $IgnoreGaps); | |
190 return ($ID, $Sequence, $SeqLen, $Description); | |
191 } | |
192 | |
193 # Get sequence length... | |
194 sub GetSequenceLength { | |
195 my($Seq, $SeqLen, $IgnoreGaps); | |
196 | |
197 $SeqLen = ''; $IgnoreGaps = 1; | |
198 if (@_ == 2) { | |
199 ($Seq, $IgnoreGaps) = @_; | |
200 } | |
201 else { | |
202 ($Seq) = @_; | |
203 } | |
204 if ($IgnoreGaps) { | |
205 my($Index, $Residue); | |
206 $SeqLen = 0; | |
207 for $Index (0 .. (length($Seq) - 1)) { | |
208 $Residue = substr($Seq, $Index, 1); | |
209 if ($Residue =~ /[A-Z]/i) { | |
210 $SeqLen++; | |
211 } | |
212 } | |
213 } | |
214 else { | |
215 $SeqLen = length($Seq); | |
216 } | |
217 | |
218 return $SeqLen; | |
219 } | |
220 | |
221 # Is it a gap residue... | |
222 sub IsGapResidue { | |
223 my($Residue) = @_; | |
224 my($Status); | |
225 | |
226 $Status = ($Residue !~ /[A-Z]/i ) ? 1 : 0; | |
227 | |
228 return $Status; | |
229 } | |
230 | |
231 # Is it a supported sequence file? | |
232 # | |
233 # Supported seqence formats are: | |
234 # | |
235 # ALN/ClustalW .aln | |
236 # GCG/MSF .msf | |
237 # PILEUP/MSF .msf | |
238 # Fasts(Pearson) .fasta, .fta | |
239 # NBRF/PIR .pir | |
240 # | |
241 sub IsSupportedSequenceFile { | |
242 my($SequenceFile) = @_; | |
243 my($Status, $SequenceFormat); | |
244 $Status = 0; $SequenceFormat = 'NotSupported'; | |
245 | |
246 SEQFORMAT: { | |
247 if (IsClustalWSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'ClustalW'; last SEQFORMAT} | |
248 if (IsPearsonFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'Pearson'; last SEQFORMAT} | |
249 if (IsPIRFastaSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'PIR'; last SEQFORMAT} | |
250 if (IsMSFSequenceFile($SequenceFile)) {$Status = 1; $SequenceFormat = 'MSF'; last SEQFORMAT} | |
251 $Status = 0; $SequenceFormat = 'NotSupported'; | |
252 } | |
253 return ($Status, $SequenceFormat); | |
254 } | |
255 | |
256 # Is it a ClustalW multiple sequence sequence file... | |
257 sub IsClustalWSequenceFile { | |
258 my($SequenceFile) = @_; | |
259 my($Status, $Line); | |
260 | |
261 $Status = 0; | |
262 | |
263 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; | |
264 $Line = GetTextLine(\*SEQUENCEFILE); | |
265 $Status = ($Line =~ /(ClustalW|Clustal W|Clustal)/i ) ? 1 : 0; | |
266 close SEQUENCEFILE; | |
267 | |
268 return $Status; | |
269 } | |
270 | |
271 # Is it a valid Pearson fasta sequence or alignment file? | |
272 # | |
273 sub IsPearsonFastaSequenceFile { | |
274 my($FastaFile, $Line, $Status); | |
275 | |
276 ($FastaFile) = @_; | |
277 $Status = 0; | |
278 | |
279 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; | |
280 $Line = GetTextLine(\*FASTAFILE); | |
281 | |
282 # First line starts with > and the fourth character is not ';'; otherwise, it's | |
283 # PIR FASTA format. | |
284 if ($Line =~ /^>/) { | |
285 my($FourthChar); | |
286 $FourthChar = substr($Line, 3, 1); | |
287 $Status = ($FourthChar !~ /\;/) ? 1 : 0; | |
288 } | |
289 close FASTAFILE; | |
290 | |
291 return $Status; | |
292 } | |
293 | |
294 # Is it a valid NBRF/PIR fasta sequence or alignment file? | |
295 # | |
296 sub IsPIRFastaSequenceFile { | |
297 my($FastaFile, $Line, $Status); | |
298 | |
299 ($FastaFile) = @_; | |
300 $Status = 0; | |
301 | |
302 open FASTAFILE, "$FastaFile" or die "Couldn't open $FastaFile: $!\n"; | |
303 $Line = GetTextLine(\*FASTAFILE); | |
304 | |
305 # First line starts with > and the fourth character is ';'; otherwise, it's | |
306 # a Pearson FASTA format. | |
307 if ($Line =~ /^>/) { | |
308 my($FourthChar); | |
309 $FourthChar = substr($Line, 3, 1); | |
310 $Status = ($FourthChar =~ /\;/) ? 1 : 0; | |
311 } | |
312 close FASTAFILE; | |
313 | |
314 return $Status; | |
315 } | |
316 | |
317 # Is it a valid MSF sequence or alignment file? | |
318 # | |
319 sub IsMSFSequenceFile { | |
320 my($MSFFile) = @_; | |
321 | |
322 open MSFFILE, "$MSFFile" or die "Couldn't open $MSFFile: $!\n"; | |
323 | |
324 my($Line, $Status); | |
325 | |
326 $Status = 0; | |
327 # Find a line that contains MSF: keyword and ends with '..' | |
328 LINE: while ($Line = GetTextLine(\*MSFFILE)) { | |
329 $Line = RemoveLeadingWhiteSpaces($Line); | |
330 if ($Line =~ /MSF:/i && $Line =~ /\.\.[ ]*$/) { | |
331 $Status = 1; | |
332 last LINE; | |
333 } | |
334 elsif ($Line =~ /(!!AA_MULTIPLE_ALIGNMENT|!!NA_MULTIPLE_ALIGNMENT|PILEUP)/i) { | |
335 # Pileup MSF... | |
336 $Status = 1; | |
337 last LINE; | |
338 } | |
339 } | |
340 close MSFFILE; | |
341 | |
342 return $Status; | |
343 } | |
344 | |
345 # Read sequence or sequence alignment file... | |
346 sub ReadSequenceFile { | |
347 my($SequenceFile) = @_; | |
348 | |
349 if (IsPearsonFastaSequenceFile($SequenceFile)) { | |
350 return ReadPearsonFastaSequenceFile($SequenceFile); | |
351 } | |
352 elsif (IsPIRFastaSequenceFile($SequenceFile)) { | |
353 return ReadPIRFastaSequenceFile($SequenceFile); | |
354 } | |
355 elsif (IsMSFSequenceFile($SequenceFile)) { | |
356 return ReadMSFSequenceFile($SequenceFile); | |
357 } | |
358 elsif (IsClustalWSequenceFile($SequenceFile)) { | |
359 return ReadClustalWSequenceFile($SequenceFile); | |
360 } | |
361 else { | |
362 return undef; | |
363 } | |
364 } | |
365 | |
366 # Read file and setup alignment data... | |
367 sub ReadClustalWSequenceFile { | |
368 my($SequenceFile) = @_; | |
369 | |
370 return _ReadFileAndSetupSequencesData($SequenceFile, 'ClustalW'); | |
371 } | |
372 | |
373 # Read file and setup alignment data... | |
374 sub ReadPearsonFastaSequenceFile { | |
375 my($SequenceFile) = @_; | |
376 | |
377 return _ReadFileAndSetupSequencesData($SequenceFile, 'Pearson'); | |
378 } | |
379 | |
380 # Read file and setup alignment data... | |
381 sub ReadPIRFastaSequenceFile { | |
382 my($SequenceFile) = @_; | |
383 | |
384 return _ReadFileAndSetupSequencesData($SequenceFile, 'PIR'); | |
385 } | |
386 | |
387 | |
388 # Read file and setup sequence data... | |
389 sub ReadMSFSequenceFile { | |
390 my($SequenceFile) = @_; | |
391 | |
392 return _ReadFileAndSetupSequencesData($SequenceFile, 'MSF'); | |
393 } | |
394 | |
395 # Write out a Pearson FASTA file... | |
396 sub WritePearsonFastaSequenceFile { | |
397 my($SequenceFileName, $SequenceDataRef, $MaxLength, $ID, $Description, $Sequence, $WrappedSequence); | |
398 | |
399 $MaxLength = 80; | |
400 if (@_ == 3) { | |
401 ($SequenceFileName, $SequenceDataRef, $MaxLength) = @_; | |
402 } | |
403 elsif (@_ == 2) { | |
404 ($SequenceFileName, $SequenceDataRef) = @_; | |
405 } | |
406 open SEQUENCEFILE, ">$SequenceFileName" or die "Can't open $SequenceFileName: $!\n"; | |
407 for $ID (@{$SequenceDataRef->{IDs}}) { | |
408 $Description = $SequenceDataRef->{Description}{$ID}; | |
409 $Sequence = $SequenceDataRef->{Sequence}{$ID}; | |
410 $WrappedSequence = WrapText($Sequence, $MaxLength, "\n"); | |
411 | |
412 # Description also contains ID... | |
413 print SEQUENCEFILE ">$Description\n"; | |
414 print SEQUENCEFILE "$WrappedSequence\n"; | |
415 } | |
416 close SEQUENCEFILE; | |
417 } | |
418 | |
419 # Get ID, Sequence and Length for smallest or longest sequence | |
420 sub _GetShortestOrLongestSequence { | |
421 my($SequencesDataRef, $SequenceType, $IgnoreGaps) = @_; | |
422 my($ID, $Seq, $SeqLen, $Description, $FirstID, $FirstSeqLen, $CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
423 | |
424 ($ID, $Seq, $SeqLen) = ('', '', ''); | |
425 $FirstID = ''; | |
426 | |
427 ID: for $CurrentID (@{$SequencesDataRef->{IDs}}) { | |
428 $CurrentSeq = $IgnoreGaps ? RemoveSequenceGaps($SequencesDataRef->{Sequence}{$CurrentID}) : $SequencesDataRef->{Sequence}{$CurrentID}; | |
429 $CurrentSeqLen = GetSequenceLength($CurrentSeq, $IgnoreGaps); | |
430 $CurrentDescription = $SequencesDataRef->{Description}{$CurrentID}; | |
431 if (!$FirstID) { | |
432 $FirstID = $ID; $FirstSeqLen = $CurrentSeqLen; | |
433 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
434 next ID; | |
435 } | |
436 if ($CurrentSeqLen != $SeqLen) { | |
437 if (($SequenceType =~ /Shortest/i) && ($CurrentSeqLen < $SeqLen)) { | |
438 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
439 } | |
440 elsif (($SequenceType =~ /Longest/i) && ($CurrentSeqLen > $SeqLen) ) { | |
441 ($ID, $Seq, $SeqLen, $Description) = ($CurrentID, $CurrentSeq, $CurrentSeqLen, $CurrentDescription); | |
442 } | |
443 } | |
444 } | |
445 return ($ID, $Seq, $SeqLen, $Description); | |
446 } | |
447 | |
448 # Remove gaps in the sequence and return new sequence... | |
449 sub RemoveSequenceGaps { | |
450 my($Seq) = @_; | |
451 my($SeqWithoutGaps, $SeqLen, $Index, $Residue); | |
452 | |
453 $SeqWithoutGaps = ''; | |
454 $SeqLen = length($Seq); | |
455 for $Index (0 .. ($SeqLen - 1)) { | |
456 $Residue = substr($Seq, $Index, 1); | |
457 if ($Residue =~ /[A-Z]/i) { | |
458 $SeqWithoutGaps .= $Residue; | |
459 } | |
460 } | |
461 | |
462 return $SeqWithoutGaps; | |
463 } | |
464 | |
465 # Using input alignment data map ref containing following keys, generate | |
466 # a new hash with same set of keys after residue columns containg only | |
467 # gaps have been removed: | |
468 # | |
469 # {IDs} : Array of IDs in order as they appear in file | |
470 # {Count}: ID count... | |
471 # {Description}{$ID} : Description data... | |
472 # {Sequence}{$ID} : Sequence data... | |
473 # | |
474 sub RemoveSequenceAlignmentGapColumns { | |
475 my($ID, $AlignmentDataMapRef, %NewAlignmentDataMap); | |
476 | |
477 ($AlignmentDataMapRef) = @_; | |
478 | |
479 %NewAlignmentDataMap = (); | |
480 @{$NewAlignmentDataMap{IDs}} =(); | |
481 %{$NewAlignmentDataMap{Description}} =(); | |
482 %{$NewAlignmentDataMap{Sequence}} =(); | |
483 $NewAlignmentDataMap{Count} = 0; | |
484 | |
485 # Transfer ID and count information... | |
486 for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
487 push @{$NewAlignmentDataMap{IDs}}, $ID; | |
488 $NewAlignmentDataMap{Description}{$ID} = $AlignmentDataMapRef->{Description}{$ID}; | |
489 $NewAlignmentDataMap{Sequence}{$ID} = ''; | |
490 $NewAlignmentDataMap{Count} += 1; | |
491 } | |
492 | |
493 # Go over residue columns and transfer the data... | |
494 my($FirstID, $FirstSeq, $FirstSeqLen, $Index, $Res, $GapColumn); | |
495 | |
496 $FirstID = $AlignmentDataMapRef->{IDs}[0]; | |
497 $FirstSeq = $AlignmentDataMapRef->{Sequence}{$FirstID}; | |
498 $FirstSeqLen = length($FirstSeq); | |
499 | |
500 RES: for $Index (0 .. ($FirstSeqLen - 1)) { | |
501 # Is this a gap column? | |
502 $GapColumn = 1; | |
503 ID: for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
504 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); | |
505 if ($Res =~ /[A-Z]/i) { | |
506 $GapColumn = 0; | |
507 last ID; | |
508 } | |
509 } | |
510 if ($GapColumn) { | |
511 next RES; | |
512 } | |
513 # Transfer this residue... | |
514 for $ID (@{$AlignmentDataMapRef->{IDs}}) { | |
515 $Res = substr($AlignmentDataMapRef->{Sequence}{$ID}, $Index, 1); | |
516 $NewAlignmentDataMap{Sequence}{$ID} .= $Res; | |
517 } | |
518 } | |
519 | |
520 return (\%NewAlignmentDataMap); | |
521 } | |
522 | |
523 # | |
524 # Read sequences file and return a reference to hash with the following keys: | |
525 # | |
526 # {IDs} - Array of sequence IDs | |
527 # {Count} - Number of sequences | |
528 # {Description}{$ID} - Sequence description | |
529 # {Sequence}{$ID} - Sequence for a specific ID | |
530 # {InputFileType} - Sequence file format | |
531 # {ConservedAnnotation} - Conserved residue annonation | |
532 # | |
533 # Note: | |
534 # . Conserved residue annotation either exist in the input sequence alignment file or set | |
535 # for a file containing same number of residues for all the sequence using the following | |
536 # notation: * - Residue conserved; ' ' - Residue not conserved. | |
537 # | |
538 sub _ReadFileAndSetupSequencesData { | |
539 my($SequenceFile, $SequenceType) = @_; | |
540 my($SequenceDataMapRef); | |
541 | |
542 $SequenceDataMapRef = undef; | |
543 | |
544 # Read sequence file... | |
545 $SequenceDataMapRef = ''; | |
546 if ($SequenceType =~ /^ClustalW$/i) { | |
547 $SequenceDataMapRef = _ReadClustalWFile($SequenceFile); | |
548 } | |
549 elsif ($SequenceType =~ /^Pearson$/i) { | |
550 $SequenceDataMapRef = _ReadPearsonFastaFile($SequenceFile); | |
551 } | |
552 elsif ($SequenceType =~ /^PIR$/i) { | |
553 $SequenceDataMapRef = _ReadPIRFastaFile($SequenceFile); | |
554 } | |
555 elsif ($SequenceType =~ /^MSF$/i) { | |
556 $SequenceDataMapRef = _ReadMSFFile($SequenceFile); | |
557 } | |
558 else { | |
559 return $SequenceDataMapRef; | |
560 } | |
561 | |
562 if (exists $SequenceDataMapRef->{ConservedAnnotation}) { | |
563 return ($SequenceDataMapRef); | |
564 } | |
565 if (!(($SequenceDataMapRef->{Count} > 1) && (AreSequenceLengthsIdentical($SequenceDataMapRef)))) { | |
566 return ($SequenceDataMapRef); | |
567 } | |
568 | |
569 # Use the first sequence to setup an empty ConservedAnnotation key... | |
570 # And mark fully conserved residues... | |
571 # | |
572 my($ID, $Sequence, $FirstSequence, $FirstSeqLen, $Res, $FirstRes, $ResConserved, $Index); | |
573 $ID = $SequenceDataMapRef->{IDs}[0]; | |
574 $FirstSequence = $SequenceDataMapRef->{Sequence}{$ID}; | |
575 $FirstSeqLen = length($FirstSequence); | |
576 $SequenceDataMapRef->{ConservedAnnotation} = ''; | |
577 for $Index (0 .. ($FirstSeqLen - 1)) { | |
578 $FirstRes = ''; | |
579 $ResConserved = 1; | |
580 ID: for $ID (@{$SequenceDataMapRef->{IDs}}) { | |
581 $Sequence = $SequenceDataMapRef->{Sequence}{$ID}; | |
582 $Res = substr($Sequence, $Index, 1); | |
583 if (!$FirstRes) { | |
584 $FirstRes = $Res; | |
585 next ID; | |
586 } | |
587 if (($Res !~ /[A-Z]/i) || ($Res ne $FirstRes)) { | |
588 $ResConserved = 0; | |
589 last ID; | |
590 } | |
591 } | |
592 if ($ResConserved) { | |
593 $SequenceDataMapRef->{ConservedAnnotation} .= '*'; | |
594 } | |
595 else { | |
596 $SequenceDataMapRef->{ConservedAnnotation} .= ' '; | |
597 } | |
598 } | |
599 | |
600 return ($SequenceDataMapRef); | |
601 } | |
602 | |
603 # Read sequence data in ClustalW multiple sequence alignment file and | |
604 # return a reference to hash with these keys and values: | |
605 # | |
606 # {IDs} - Array of sequence IDs | |
607 # {Count} - Number of sequences | |
608 # {Description}{$ID} - Sequence description | |
609 # {Sequence}{$ID} - Sequence for a specific ID | |
610 # {InputFileType} - Sequence file format | |
611 # {ConservedAnnotation} - Conserved residue annonations: space, *, : , . | |
612 # | |
613 # | |
614 # | |
615 # And based on ClustalW/X manual, here is what the ConservedAnnonations mean: | |
616 # | |
617 # '*' indicates positions which have a single, fully conserved residue | |
618 # | |
619 # ':' indicates that one of the following 'strong' groups is fully conserved: STA | |
620 # NEQK NHQK NDEQ QHRK MILV MILF HY FYW | |
621 | |
622 # '.' indicates that one of the following 'weaker' groups is fully conserved: | |
623 # CSA ATV SAG STNK STPA SGND SNDEQK NDEQHK NEQHRK FVLIM HFY | |
624 # | |
625 # These are all the positively scoring groups that occur in the Gonnet Pam250 | |
626 # matrix. The strong and weak groups are defined as strong score >0.5 and weak | |
627 # score =<0.5 respectively. | |
628 # | |
629 sub _ReadClustalWFile { | |
630 my($SequenceFile) = @_; | |
631 my(%SequencesDataMap); | |
632 | |
633 # Initialize data... | |
634 %SequencesDataMap = (); | |
635 @{$SequencesDataMap{IDs}} = (); | |
636 %{$SequencesDataMap{Description}} = (); | |
637 %{$SequencesDataMap{Sequence}} = (); | |
638 $SequencesDataMap{Count} = 0; | |
639 $SequencesDataMap{ConservedAnnotation} = ''; | |
640 $SequencesDataMap{InputFileType} = 'ClustalW'; | |
641 | |
642 open SEQUENCEFILE, "$SequenceFile" or die "Couldn't open $SequenceFile: $!\n"; | |
643 | |
644 my($Line, $LineLength, $AnnotationStart, $AnnotationLength, $Annotation, $Sequence, $SequenceLength, $ID, $IDIndex); | |
645 | |
646 # Ignore the header line... | |
647 $Line = <SEQUENCEFILE>; | |
648 | |
649 LINE: while ($Line = GetTextLine(\*SEQUENCEFILE)) { | |
650 if (($Line =~ /^[ \*\:\.]/) && ($Line !~ /[A-Z]/i)) { | |
651 # Annotation for sequences: fully conserverd, weaker or stronger group conserverd. | |
652 # Extract it and save... | |
653 $LineLength = length($Line); | |
654 $AnnotationStart = $LineLength - $SequenceLength; | |
655 $AnnotationLength = $SequenceLength; | |
656 $Annotation = substr($Line, $AnnotationStart, $AnnotationLength); | |
657 $SequencesDataMap{ConservedAnnotation} .= $Annotation; | |
658 } | |
659 else { | |
660 # Extract ID and sequences... | |
661 ($ID, $Sequence)= $Line =~ /^[ ]*(.*?)[ ]+(.*?)[ 01-9]*$/; | |
662 $Sequence =~ s/ //g; | |
663 if (!($ID && $Sequence)) { | |
664 next LINE; | |
665 } | |
666 | |
667 if (exists $SequencesDataMap{Sequence}{$ID}) { | |
668 # Append to existing alignment value... | |
669 $SequenceLength = length($Sequence); | |
670 $SequencesDataMap{Sequence}{$ID} .= $Sequence; | |
671 } | |
672 else { | |
673 # New alignment data... | |
674 $SequencesDataMap{Count} += 1; | |
675 push @{$SequencesDataMap{IDs}}, $ID; | |
676 $SequencesDataMap{Description}{$ID} = $ID; | |
677 $SequencesDataMap{Sequence}{$ID} = $Sequence; | |
678 $SequenceLength = length($Sequence); | |
679 } | |
680 } | |
681 } | |
682 close SEQUENCEFILE; | |
683 return (\%SequencesDataMap); | |
684 } | |
685 | |
686 # Read Pearson fasta file and return a reference to hash with these keys: | |
687 # | |
688 # {IDs} - Array of sequence IDs | |
689 # {Count} - Number of sequences | |
690 # {Description}{$ID} - Sequence description | |
691 # {Sequence}{$ID} - Sequence for a specific ID | |
692 # {InputFileType} - Sequence file format | |
693 # {ConservedAnnotation} - Conserved residue annonation | |
694 # | |
695 sub _ReadPearsonFastaFile { | |
696 my($FastaFileName, $ID, $Description, $Line, $IgnoreID, @LineWords, %FastaDataMap); | |
697 | |
698 ($FastaFileName) = @_; | |
699 | |
700 %FastaDataMap = (); | |
701 @{$FastaDataMap{IDs}} =(); | |
702 %{$FastaDataMap{Description}} =(); | |
703 %{$FastaDataMap{Sequence}} =(); | |
704 $FastaDataMap{Count} = 0; | |
705 $FastaDataMap{InputFileType} = 'Pearson'; | |
706 | |
707 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; | |
708 $ID = ''; | |
709 $IgnoreID = 0; | |
710 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { | |
711 if ($Line =~ /^\>/) { | |
712 # Start of a new ID... | |
713 $Line =~ s/^\>//; | |
714 $Line = RemoveLeadingWhiteSpaces($Line); | |
715 @LineWords = (); | |
716 @LineWords = split / /, $Line; | |
717 | |
718 $ID = $LineWords[0]; | |
719 $ID =~ s/ //g; | |
720 $Description = $Line; | |
721 | |
722 $IgnoreID = 0; | |
723 if (exists $FastaDataMap{Sequence}{$ID}) { | |
724 $IgnoreID = 1; | |
725 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; | |
726 next LINE; | |
727 } | |
728 push @{$FastaDataMap{IDs}}, $ID; | |
729 $FastaDataMap{Description}{$ID} = $Description; | |
730 $FastaDataMap{Count} += 1; | |
731 next LINE; | |
732 } | |
733 if ($IgnoreID) { next LINE; } | |
734 | |
735 # Remove any spaces in the sequence... | |
736 $Line =~ s/ //g; | |
737 # Sequence data for active ID... | |
738 if (exists $FastaDataMap{Sequence}{$ID}) { | |
739 $FastaDataMap{Sequence}{$ID} .= $Line; | |
740 } | |
741 else { | |
742 $FastaDataMap{Sequence}{$ID} = $Line; | |
743 } | |
744 } | |
745 close FASTAFILE; | |
746 return \%FastaDataMap; | |
747 } | |
748 | |
749 # Read PIR fasta file and return a reference to hash with these keys: | |
750 # | |
751 # {IDs} - Array of sequence IDs | |
752 # {Count} - Number of sequences | |
753 # {Description}{$ID} - Sequence description | |
754 # {Sequence}{$ID} - Sequence for a specific ID | |
755 # {InputFileType} - Sequence file format | |
756 # {ConservedAnnotation} - Conserved residue annonation | |
757 # | |
758 # Format: | |
759 # A sequence in PIR format consists of: | |
760 # One line starting with | |
761 # a ">" (greater-than) sign, followed by | |
762 # a two-letter code describing the sequence type code (P1, F1, DL, DC, RL, RC, N3, N1 or XX), followed by | |
763 # a semicolon, followed by | |
764 # the sequence identification code (the database ID-code). | |
765 # One line containing a textual description of the sequence. | |
766 # One or more lines containing the sequence itself. The end of the | |
767 # sequence is marked by a "*" (asterisk) character. | |
768 # | |
769 # A file in PIR format may comprise more than one sequence. | |
770 # | |
771 # The PIR format is also often referred to as the NBRF format. | |
772 # | |
773 # Code SequenceType | |
774 # P1 Protein (complete) | |
775 # F1 Protein (fragment) | |
776 # DL DNA (linear) | |
777 # DC DNA (circular) | |
778 # RL RNA (linear) | |
779 # RC RNA (circular) | |
780 # N3 tRNA | |
781 # N1 Other functional RNA | |
782 # | |
783 | |
784 sub _ReadPIRFastaFile { | |
785 my($FastaFileName, $ID, $Description, $Line, $SequenceTypeCode, $ReadingSequenceData, %FastaDataMap); | |
786 | |
787 ($FastaFileName) = @_; | |
788 | |
789 %FastaDataMap = (); | |
790 @{$FastaDataMap{IDs}} =(); | |
791 %{$FastaDataMap{Description}} =(); | |
792 %{$FastaDataMap{Sequence}} =(); | |
793 %{$FastaDataMap{SequenceTypeCode}} =(); | |
794 $FastaDataMap{Count} = 0; | |
795 $FastaDataMap{InputFileType} = 'PIR'; | |
796 | |
797 open FASTAFILE, "$FastaFileName" or die "Couldn't open $FastaFileName: $!\n"; | |
798 $ID = ''; | |
799 $ReadingSequenceData = 0; | |
800 LINE: while ($Line = GetTextLine(\*FASTAFILE)) { | |
801 if ($Line =~ /^\>/) { | |
802 # Start of a new ID... | |
803 $Line =~ s/^\>//; | |
804 $Line = RemoveLeadingWhiteSpaces($Line); | |
805 ($SequenceTypeCode, $ID) = /^\>(.*?)\;(.*?)$/; | |
806 | |
807 # Use next line to retrieve sequence description... | |
808 $Line = GetTextLine(\*FASTAFILE); | |
809 $Line = RemoveLeadingWhiteSpaces($Line); | |
810 $Description = $Line; | |
811 | |
812 if (exists $FastaDataMap{Sequence}{$ID}) { | |
813 warn "Warning: ID, $ID, in Fasta file already exists. Ignoring ID and sequence data...\n"; | |
814 next LINE; | |
815 } | |
816 $ReadingSequenceData = 1; | |
817 push @{$FastaDataMap{IDs}}, $ID; | |
818 $FastaDataMap{SequenceTypeCode}{$ID} = $SequenceTypeCode; | |
819 $FastaDataMap{Description}{$ID} = $Description; | |
820 $FastaDataMap{Count} += 1; | |
821 next LINE; | |
822 } | |
823 if (!$ReadingSequenceData) { next LINE; } | |
824 | |
825 # Remove any spaces in the sequence... | |
826 $Line =~ s/ //g; | |
827 if ($Line =~ /[\*]$/) { | |
828 # End of sequence... | |
829 $ReadingSequenceData = 0; | |
830 $Line =~ s/[\*]$//; | |
831 } | |
832 # Sequence data for active ID... | |
833 if (exists $FastaDataMap{Sequence}{$ID}) { | |
834 $FastaDataMap{Sequence}{$ID} .= $Line; | |
835 } | |
836 else { | |
837 $FastaDataMap{Sequence}{$ID} = $Line; | |
838 } | |
839 } | |
840 close FASTAFILE; | |
841 return \%FastaDataMap; | |
842 } | |
843 | |
844 # Read MSF file and return a reference to hash with these keys: | |
845 # | |
846 # {IDs} : Array of IDs in order as they appear in file | |
847 # {Count}: ID count... | |
848 # {Description}{$ID} : Description data... | |
849 # {Sequence}{$ID} : Sequence data... | |
850 # | |
851 sub _ReadMSFFile { | |
852 my($MSFFileName, $Line, @LineWords, %MSFDataMap); | |
853 | |
854 ($MSFFileName) = @_; | |
855 | |
856 %MSFDataMap = (); | |
857 @{$MSFDataMap{IDs}} =(); | |
858 %{$MSFDataMap{Description}} =(); | |
859 %{$MSFDataMap{Sequence}} =(); | |
860 $MSFDataMap{Count} = 0; | |
861 $MSFDataMap{InputFileType} = 'MSF'; | |
862 | |
863 open MSFFILE, "$MSFFileName" or die "Couldn't open $MSFFileName: $!\n"; | |
864 | |
865 # Collect sequences and IDs... | |
866 # | |
867 # '//' after the name fields indicates end of header list and start of sequence data. | |
868 # | |
869 my($ID, $Len, $Check, $Weight, $Sequence, $NameFieldsFound, %MSFIDsMap); | |
870 %MSFIDsMap = (); | |
871 $NameFieldsFound = 0; | |
872 LINE: while ($Line = GetTextLine(\*MSFFILE)) { | |
873 if ($Line =~ /Name:/) { | |
874 $NameFieldsFound++; | |
875 ($ID, $Len, $Check, $Weight) = $Line =~ /^[ ]*Name:[ ]+(.*?)[ ]+Len:[ ]+(.*?)[ ]+Check:[ ]+(.*?)[ ]+Weight:[ ]+(.*?)[ ]*$/; | |
876 if ($ID =~ / /) { | |
877 ($ID) = $ID =~ /^(.*?)[ ]+/ | |
878 } | |
879 if (exists $MSFIDsMap{$ID}) { | |
880 warn "Warning: ID, $ID, in MSF file already exists. Ignoring ID and sequence data...\n"; | |
881 next LINE; | |
882 } | |
883 $MSFIDsMap{$ID} = $ID; | |
884 push @{$MSFDataMap{IDs}}, $ID; | |
885 $MSFDataMap{Description}{$ID} = $ID; | |
886 $MSFDataMap{Count} += 1; | |
887 } | |
888 elsif ( /\/\// && $NameFieldsFound) { | |
889 # End of header list... | |
890 last LINE; | |
891 } | |
892 } | |
893 # Collect all sequences... | |
894 # | |
895 my($FirstField, $SecondField); | |
896 while ($Line = GetTextLine(\*MSFFILE)) { | |
897 ($FirstField, $SecondField) = $Line =~ /^[ ]*(.*?)[ ]+(.*?)$/; | |
898 if (exists $MSFIDsMap{$FirstField}) { | |
899 # It's ID and sequence data... | |
900 $ID = $FirstField; | |
901 $Sequence = $SecondField; | |
902 # Take out spaces and leave the gap characters... | |
903 $Sequence =~ s/ //g; | |
904 if ($MSFDataMap{Sequence}{$ID}) { | |
905 $MSFDataMap{Sequence}{$ID} .= $Sequence; | |
906 } | |
907 else { | |
908 $MSFDataMap{Sequence}{$ID} = $Sequence; | |
909 } | |
910 } | |
911 } | |
912 | |
913 close MSFFILE; | |
914 return \%MSFDataMap; | |
915 } | |
916 | |
917 | |
918 1; | |
919 | |
920 __END__ | |
921 | |
922 =head1 NAME | |
923 | |
924 SequenceFileUtil | |
925 | |
926 =head1 SYNOPSIS | |
927 | |
928 use SequenceFileUtil ; | |
929 | |
930 use SequenceFileUtil qw(:all); | |
931 | |
932 =head1 DESCRIPTION | |
933 | |
934 B<SequenceFileUtil> module provides the following functions: | |
935 | |
936 AreSequenceLengthsIdentical, CalcuatePercentSequenceIdentity, | |
937 CalculatePercentSequenceIdentityMatrix, GetLongestSequence, GetSequenceLength, | |
938 GetShortestSequence, IsClustalWSequenceFile, IsGapResidue, IsMSFSequenceFile, | |
939 IsPIRFastaSequenceFile, IsPearsonFastaSequenceFile, IsSupportedSequenceFile, | |
940 ReadClustalWSequenceFile, ReadMSFSequenceFile, ReadPIRFastaSequenceFile, | |
941 ReadPearsonFastaSequenceFile, ReadSequenceFile, RemoveSequenceAlignmentGapColumns, | |
942 RemoveSequenceGaps, WritePearsonFastaSequenceFile | |
943 SequenceFileUtil module provides various methods to process sequence | |
944 files and retreive appropriate information. | |
945 | |
946 =head1 FUNCTIONS | |
947 | |
948 =over 4 | |
949 | |
950 =item B<AreSequenceLengthsIdentical> | |
951 | |
952 $Status = AreSequenceLengthsIdentical($SequencesDataRef); | |
953 | |
954 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns 1 | |
955 or 0 based whether lengths of all the sequence is same. | |
956 | |
957 =item B<CalcuatePercentSequenceIdentity> | |
958 | |
959 $PercentIdentity = | |
960 AreSequenceLengthsIdenticalAreSequenceLengthsIdentical( | |
961 $Sequence1, $Sequence2, [$IgnoreGaps, $Precision]); | |
962 | |
963 Returns percent identity between I<Sequence1> and I<Sequence2>. Optional arguments | |
964 I<IgnoreGaps> and I<Precision> control handling of gaps in sequences and precision of the | |
965 returned value. By default, gaps are ignored and precision is set up to 1 decimal. | |
966 | |
967 =item B<CalculatePercentSequenceIdentityMatrix> | |
968 | |
969 $IdentityMatrixDataRef = CalculatePercentSequenceIdentityMatrix( | |
970 $SequencesDataRef, [$IgnoreGaps, | |
971 $Precision]); | |
972 | |
973 Calculate pairwise percent identity between all the sequences available in I<SequencesDataRef> | |
974 and returns a reference to identity matrix hash. Optional arguments I<IgnoreGaps> and | |
975 I<Precision> control handling of gaps in sequences and precision of the returned value. By default, gaps | |
976 are ignored and precision is set up to 1 decimal. | |
977 | |
978 =item B<GetSequenceLength> | |
979 | |
980 $SeqquenceLength = GetSequenceLength($Sequence, [$IgnoreGaps]); | |
981 | |
982 Returns length of the specified sequence. Optional argument I<IgnoreGaps> controls handling | |
983 of gaps. By default, gaps are ignored. | |
984 | |
985 =item B<GetShortestSequence> | |
986 | |
987 ($ID, $Sequence, $SeqLen, $Description) = GetShortestSequence( | |
988 $SequencesDataRef, [$IgnoreGaps]); | |
989 | |
990 Checks the lengths of all the sequences available in $SequencesDataRef and returns $ID, | |
991 $Sequence, $SeqLen, and $Description values for the shortest sequence. Optional arguments $IgnoreGaps | |
992 controls handling of gaps in sequences. By default, gaps are ignored. | |
993 | |
994 =item B<GetLongestSequence> | |
995 | |
996 ($ID, $Sequence, $SeqLen, $Description) = GetLongestSequence( | |
997 $SequencesDataRef, [$IgnoreGaps]); | |
998 | |
999 Checks the lengths of all the sequences available in I<SequencesDataRef> and returns B<ID>, | |
1000 B<Sequence>, B<SeqLen>, and B<Description> values for the longest sequence. Optional argument | |
1001 $I<IgnoreGaps> controls handling of gaps in sequences. By default, gaps are ignored. | |
1002 | |
1003 =item B<IsGapResidue> | |
1004 | |
1005 $Status = AreSequenceLengthsIdentical($Residue); | |
1006 | |
1007 Returns 1 or 0 based on whether I<Residue> corresponds to a gap. Any character other than A to Z is | |
1008 considered a gap residue. | |
1009 | |
1010 =item B<IsSupportedSequenceFile> | |
1011 | |
1012 $Status = IsSupportedSequenceFile($SequenceFile); | |
1013 | |
1014 Returns 1 or 0 based on whether I<SequenceFile> corresponds to a supported sequence | |
1015 format. | |
1016 | |
1017 =item B<IsClustalWSequenceFile> | |
1018 | |
1019 $Status = IsClustalWSequenceFile($SequenceFile); | |
1020 | |
1021 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Clustal sequence alignment | |
1022 format. | |
1023 | |
1024 =item B<IsPearsonFastaSequenceFile> | |
1025 | |
1026 $Status = IsPearsonFastaSequenceFile($SequenceFile); | |
1027 | |
1028 Returns 1 or 0 based on whether I<SequenceFile> corresponds to Pearson FASTA sequence | |
1029 format. | |
1030 | |
1031 =item B<IsPIRFastaSequenceFile> | |
1032 | |
1033 $Status = IsPIRFastaSequenceFile($SequenceFile); | |
1034 | |
1035 Returns 1 or 0 based on whether I<SequenceFile> corresponds to PIR FASTA sequence | |
1036 format. | |
1037 | |
1038 =item B<IsMSFSequenceFile> | |
1039 | |
1040 $Status = IsClustalWSequenceFile($SequenceFile); | |
1041 | |
1042 Returns 1 or 0 based on whether I<SequenceFile> corresponds to MSF sequence alignment | |
1043 format. | |
1044 | |
1045 =item B<ReadSequenceFile> | |
1046 | |
1047 $SequenceDataMapRef = ReadSequenceFile($SequenceFile); | |
1048 | |
1049 Reads I<SequenceFile> and returns reference to a hash containing following key/value | |
1050 pairs: | |
1051 | |
1052 $SequenceDataMapRef->{IDs} - Array of sequence IDs | |
1053 $SequenceDataMapRef->{Count} - Number of sequences | |
1054 $SequenceDataMapRef->{Description}{$ID} - Sequence description | |
1055 $SequenceDataMapRef->{Sequence}{$ID} - Sequence for a specific ID | |
1056 $SequenceDataMapRef->{Sequence}{InputFileType} - File format | |
1057 | |
1058 =item B<ReadClustalWSequenceFile> | |
1059 | |
1060 $SequenceDataMapRef = ReadClustalWSequenceFile($SequenceFile); | |
1061 | |
1062 Reads ClustalW I<SequenceFile> and returns reference to a hash containing following key/value | |
1063 pairs as describes in B<ReadSequenceFile> method. | |
1064 | |
1065 =item B<ReadMSFSequenceFile> | |
1066 | |
1067 $SequenceDataMapRef = ReadMSFSequenceFile($SequenceFile); | |
1068 | |
1069 Reads MSF I<SequenceFile> and returns reference to a hash containing following key/value | |
1070 pairs as describes in B<ReadSequenceFile> method. | |
1071 | |
1072 =item B<ReadPIRFastaSequenceFile> | |
1073 | |
1074 $SequenceDataMapRef = ReadPIRFastaSequenceFile($SequenceFile); | |
1075 | |
1076 Reads PIR FASTA I<SequenceFile> and returns reference to a hash containing following key/value | |
1077 pairs as describes in B<ReadSequenceFile> method. | |
1078 | |
1079 =item B<ReadPearsonFastaSequenceFile> | |
1080 | |
1081 $SequenceDataMapRef = ReadPearsonFastaSequenceFile($SequenceFile); | |
1082 | |
1083 Reads Pearson FASTA I<SequenceFile> and returns reference to a hash containing following key/value | |
1084 pairs as describes in B<ReadSequenceFile> method. | |
1085 | |
1086 =item B<RemoveSequenceGaps> | |
1087 | |
1088 $SeqWithoutGaps = RemoveSequenceGaps($Sequence); | |
1089 | |
1090 Removes gaps from I<Sequence> and return a sequence without any gaps. | |
1091 | |
1092 =item B<RemoveSequenceAlignmentGapColumns> | |
1093 | |
1094 $NewAlignmentDataMapRef = RemoveSequenceAlignmentGapColumns( | |
1095 $AlignmentDataMapRef); | |
1096 | |
1097 Using input alignment data map ref containing following keys, generate a new hash with | |
1098 same set of keys after residue columns containg only gaps have been removed: | |
1099 | |
1100 {IDs} : Array of IDs in order as they appear in file | |
1101 {Count}: ID count | |
1102 {Description}{$ID} : Description data | |
1103 {Sequence}{$ID} : Sequence data | |
1104 | |
1105 =item B<WritePearsonFastaSequenceFile> | |
1106 | |
1107 WritePearsonFastaSequenceFile($SequenceFileName, $SequenceDataRef, | |
1108 [$MaxLength]); | |
1109 | |
1110 Using sequence data specified via I<SequenceDataRef>, write out a Pearson FASTA sequence | |
1111 file. Optional argument I<MaxLength> controls maximum length sequence in each line; default is | |
1112 80. | |
1113 | |
1114 =back | |
1115 | |
1116 =head1 AUTHOR | |
1117 | |
1118 Manish Sud <msud@san.rr.com> | |
1119 | |
1120 =head1 SEE ALSO | |
1121 | |
1122 PDBFileUtil.pm | |
1123 | |
1124 =head1 COPYRIGHT | |
1125 | |
1126 Copyright (C) 2015 Manish Sud. All rights reserved. | |
1127 | |
1128 This file is part of MayaChemTools. | |
1129 | |
1130 MayaChemTools is free software; you can redistribute it and/or modify it under | |
1131 the terms of the GNU Lesser General Public License as published by the Free | |
1132 Software Foundation; either version 3 of the License, or (at your option) | |
1133 any later version. | |
1134 | |
1135 =cut |