comparison mayachemtools/lib/PDBFileUtil.pm @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 package PDBFileUtil;
2 #
3 # $RCSfile: PDBFileUtil.pm,v $
4 # $Date: 2015/02/28 20:47:18 $
5 # $Revision: 1.36 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Exporter;
31 use Text::ParseWords;
32 use TextUtil;
33 use FileUtil;
34 use TimeUtil ();
35
36 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
37
38 @ISA = qw(Exporter);
39 @EXPORT = qw(GetPDBRecordType GetRecordTypesCount GetAllResidues GetConectRecordLines GetChainsAndResidues GetExperimentalTechnique GetExperimentalTechniqueResolution GetMinMaxCoords IsPDBFile IsAtomRecordType IsConectRecordType IsHeaderRecordType IsHetatmRecordType IsSeqresRecordType IsModelRecordType IsEndmdlRecordType IsTerRecordType IsMasterRecordType ReadPDBFile ParseHeaderRecordLine GenerateHeaderRecordLine GenerateHeaderRecordTimeStamp ParseAtomRecordLine GenerateAtomRecordLine ParseAtomOrHetatmRecordLine GenerateAtomOrHetatmRecordLine GenerateHetatmRecordLine ParseHetatmRecordLine ParseConectRecordLine GenerateConectRecordLine ParseExpdtaRecordLine ParseRemark2ResolutionRecordLine ParseSeqresRecordLine ParseTerRecordLine GenerateTerRecordLine ParseMasterRecordLine GenerateEndRecordLine);
40 @EXPORT_OK = qw();
41
42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
43
44 # Get PDB record type...
45 sub GetPDBRecordType {
46 my($Line) = @_;
47
48 return _GetRecordType($Line);
49 }
50
51 # Is it a PDB file?
52 sub IsPDBFile {
53 my($PDBFile) = @_;
54 my($Line, $Status);
55
56 $Status = 0;
57 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n";
58 $Line = GetTextLine(\*PDBFILE);
59 $Status = ($Line =~ /^HEADER/i) ? 1 : 0;
60 close PDBFILE;
61
62 return $Status;
63 }
64
65 # Is it a atom record type?
66 sub IsAtomRecordType {
67 my($Line) = @_;
68
69 return _IsRecordType($Line, 'ATOM');
70 }
71
72 # Is it a connect record type?
73 sub IsConectRecordType {
74 my($Line) = @_;
75
76 return _IsRecordType($Line, 'CONECT');
77 }
78
79 # Is it a header atom record type?
80 sub IsHeaderRecordType {
81 my($Line) = @_;
82
83 return _IsRecordType($Line, 'HEADER');
84 }
85
86 # Is it a hetro atom record type?
87 sub IsHetatmRecordType {
88 my($Line) = @_;
89
90 return _IsRecordType($Line, 'HETATM');
91 }
92
93 # Is it a seqres record type?
94 sub IsSeqresRecordType {
95 my($Line) = @_;
96
97 return _IsRecordType($Line, 'SEQRES');
98 }
99
100 # Is it a MODEL record type?
101 sub IsModelRecordType {
102 my($Line) = @_;
103
104 return _IsRecordType($Line, 'MODEL');
105 }
106
107 # Is it a ENDMDL record type?
108 sub IsEndmdlRecordType {
109 my($Line) = @_;
110
111 return _IsRecordType($Line, 'ENDMDL');
112 }
113
114 # Is it a TER record type?
115 sub IsTerRecordType {
116 my($Line) = @_;
117
118 return _IsRecordType($Line, 'TER');
119 }
120
121 # Is it a MASTER record type?
122 sub IsMasterRecordType {
123 my($Line) = @_;
124
125 return _IsRecordType($Line, 'MASTER');
126 }
127
128 # Count the number of each record type and a reference to data type with these key/value pairs:
129 # {RecordTypes} - An array of unique record types in order of their presence in the file
130 # {Count}{$RecordType} - Count of each record type
131 # {Lines}{$RecordType} - Optional lines data for a specific record type.
132 #
133 sub GetRecordTypesCount {
134 my($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag, $RecordType, $RecordLine, %RecordTypeDataMap);
135
136 %RecordTypeDataMap = ();
137 @{$RecordTypeDataMap{RecordTypes}} = ();
138 %{$RecordTypeDataMap{Count}} = ();
139 %{$RecordTypeDataMap{Lines}} = ();
140
141 $SpecifiedRecordType = '';
142 $GetRecordLinesFlag = 0;
143 if (@_ == 3) {
144 ($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag) = @_;
145 $SpecifiedRecordType = uc $SpecifiedRecordType;
146 }
147 elsif (@_ == 2) {
148 ($PDBRecordLinesRef, $SpecifiedRecordType) = @_;
149 $SpecifiedRecordType = uc $SpecifiedRecordType;
150 }
151 else {
152 ($PDBRecordLinesRef) = @_;
153 }
154 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
155 $RecordType = _GetRecordType($RecordLine);
156 if ($SpecifiedRecordType && ($SpecifiedRecordType ne $RecordType)) {
157 next LINE;
158 }
159 if (exists $RecordTypeDataMap{Count}{$RecordType}) {
160 # Update count...
161 $RecordTypeDataMap{Count}{$RecordType} += 1;
162
163 if ($GetRecordLinesFlag) {
164 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine;
165 }
166 }
167 else {
168 # New record type...
169 push @{$RecordTypeDataMap{RecordTypes}}, $RecordType;
170 $RecordTypeDataMap{Count}{$RecordType} = 1;
171
172 if ($GetRecordLinesFlag) {
173 @{$RecordTypeDataMap{Lines}{$RecordType}} = ();
174 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine;
175 }
176 }
177 }
178 return (\%RecordTypeDataMap);
179 }
180
181 # Collect CONECT record lines for specific atom number, modified specified data to exclude any atom
182 # number not present in the list of specified atom numbers and return a reference to list of
183 # CONECT record lines.
184 #
185 sub GetConectRecordLines {
186 my($PDBRecordLinesRef, $AtomNumbersMapRef) = @_;
187 my($AtomNumber, $ConectAtomNumber, $RecordLine, @ConectRecordAtomNums, @ConectRecordLines);
188
189 @ConectRecordLines = ();
190 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
191 if (!IsConectRecordType($RecordLine)) {
192 next LINE;
193 }
194 @ConectRecordAtomNums = ();
195 push @ConectRecordAtomNums, ParseConectRecordLine($RecordLine);
196 ATOMNUMBER: for $ConectAtomNumber (@ConectRecordAtomNums) {
197 if (defined $ConectAtomNumber) {
198 $AtomNumber = $ConectAtomNumber;
199 if ($AtomNumber) {
200 if (! exists $AtomNumbersMapRef->{$AtomNumber}) {
201 next LINE;
202 }
203 }
204 }
205 }
206 push @ConectRecordLines, $RecordLine;
207 }
208 return \@ConectRecordLines;
209 }
210
211 # Get chains and residue information using ATOM/HETATM or SEQRES records. And return a reference to a
212 # hash with these keys:
213 #
214 # @{$ChainsDataMap{ChainIDs}} - List of chain IDs with 'None' for no chain identification
215 # @{$ChainsDataMap{Residues}{$ChainID}} - List of residues in order of their appearance in a chain
216 # @{$ChainsDataMap{ResidueNumbers}{$ChainID}} - List of residue numbers in order of their appearance in a chain
217 # %{$ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}} - Count of specific residues in a chain
218 #
219 # Notes:
220 # . Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records.
221 # . In addition to a different chain ID in ATOM/HETATM a TER record also indicates end of an existing chain
222 # and start of a new one: ChainID in ATOM/HETATM records might still be emtpy.
223 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored.
224 #
225 sub GetChainsAndResidues {
226 my($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag);
227
228 $RecordsSource = 'AtomAndHetatm';
229 $GetChainResiduesBeyondTERFlag = 0;
230 $GetRecordLinesFlag = 0;
231
232 if (@_ == 4) {
233 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_;
234 }
235 elsif (@_ == 3) {
236 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag) = @_;
237 }
238 elsif (@_ == 2) {
239 ($PDBRecordLinesRef, $RecordsSource) = @_;
240 }
241 else {
242 ($PDBRecordLinesRef) = @_;
243 }
244
245 if ($RecordsSource =~ /^AtomAndHetatm$/i) {
246 return _GetChainsAndResiduesFromAtomHetatmRecords($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag);
247 }
248 elsif ($RecordsSource =~ /^Seqres$/i) {
249 return _GetChainsAndResiduesFromSeqresRecords($PDBRecordLinesRef);
250 }
251 else {
252 my(%ChainsDataMap);
253 %ChainsDataMap = ();
254 @{$ChainsDataMap{ChainIDs}} = ();
255 %{$ChainsDataMap{Residues}} = ();
256 %{$ChainsDataMap{ResidueNumbers}} = ();
257 %{$ChainsDataMap{ResidueCount}} = ();
258
259 return \%ChainsDataMap;
260 }
261 }
262
263
264 # Get residue information using ATOM/HETATM records and return a reference to a hash with
265 # these keys:
266 #
267 # @{$ResiduesDataMap{ResidueNames}} - List of all the residues
268 # %{$ResiduesDataMap{ResidueCount}{$ResidueName}} - Count of residues
269 # @{$ResiduesDataMap{AtomResidueNames}} - List of all the residues
270 # %{$ResiduesDataMap{AtomResidueCount}{$ResidueName}} - Count of residues in ATOM records
271 # @{$ResiduesDataMap{HetatomResidueNames}} - List of all the residues
272 # %{$ResiduesDataMap{HetatmResidueCount}{$ResidueName}} - Count of residues HETATM records
273 #
274 # Notes:
275 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored.
276 #
277 sub GetAllResidues {
278 my($PDBRecordLinesRef) = @_;
279
280 my($PreviousChainID, $PreviousResidueNumber, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ResiduesDataMap);
281
282 %ResiduesDataMap = ();
283 @{$ResiduesDataMap{ResidueNames}} = ();
284 %{$ResiduesDataMap{ResidueCount}} = ();
285 @{$ResiduesDataMap{AtomResidueNames}} = ();
286 %{$ResiduesDataMap{AtomResidueCount}} = ();
287 @{$ResiduesDataMap{HetatmResidueNames}} = ();
288 %{$ResiduesDataMap{HetatmResidueCount}} = ();
289
290 $PreviousChainID = '';
291 $PreviousResidueNumber = 0;
292
293 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
294 if (IsEndmdlRecordType($RecordLine)) {
295 last LINE;
296 }
297 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
298 next LINE;
299 }
300 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
301
302 if ($PreviousChainID eq $ChainID) {
303 if ($ResidueNumber == $PreviousResidueNumber) {
304 next LINE;
305 }
306 $PreviousResidueNumber = $ResidueNumber;
307 }
308 else {
309 # New chain...
310 $PreviousChainID = $ChainID;
311 $PreviousResidueNumber = $ResidueNumber;
312 }
313
314 # Store the residue and update its count...
315 push @{$ResiduesDataMap{ResidueNames}}, $ResidueName;
316 if (exists $ResiduesDataMap{ResidueCount}{$ResidueName}) {
317 $ResiduesDataMap{ResidueCount}{$ResidueName} += 1;
318 }
319 else {
320 $ResiduesDataMap{ResidueCount}{$ResidueName} = 1;
321 }
322 # Update ATOM residue data...
323 if (IsAtomRecordType($RecordLine)) {
324 push @{$ResiduesDataMap{AtomResidueNames}}, $ResidueName;
325 if (exists $ResiduesDataMap{AtomResidueCount}{$ResidueName}) {
326 $ResiduesDataMap{AtomResidueCount}{$ResidueName} += 1;
327 }
328 else {
329 $ResiduesDataMap{AtomResidueCount}{$ResidueName} = 1;
330 }
331 }
332 # Update HETATM residue data...
333 if (IsHetatmRecordType($RecordLine)) {
334 push @{$ResiduesDataMap{HetatmResidueNames}}, $ResidueName;
335 if (exists $ResiduesDataMap{HetatmResidueCount}{$ResidueName}) {
336 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} += 1;
337 }
338 else {
339 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} = 1;
340 }
341 }
342 }
343
344 return \%ResiduesDataMap;
345 }
346
347 # Return min/max XYZ coordinates for ATOM/HETATM records...
348 sub GetMinMaxCoords {
349 my($PDBRecordLinesRef) = @_;
350
351 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
352
353 ($XMin, $YMin, $ZMin) = (99999) x 3;
354 ($XMax, $YMax, $ZMax) = (-99999) x 3;
355
356 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
357 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
358 next LINE;
359 }
360 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
361
362 $XMin = ($X < $XMin) ? $X : $XMin;
363 $YMin = ($Y < $YMin) ? $Y : $YMin;
364 $ZMin = ($Z < $ZMin) ? $Z : $ZMin;
365
366 $XMax = ($X > $XMax) ? $X : $XMax;
367 $YMax = ($Y > $YMax) ? $Y : $YMax;
368 $ZMax = ($Z > $ZMax) ? $Z : $ZMax;
369 }
370
371 if ($XMin == 99999) { $XMin = undef; }
372 if ($YMin == 99999) { $YMin = undef; }
373 if ($ZMin == 99999) { $ZMin = undef; }
374 if ($XMax == -99999) { $XMax = undef; }
375 if ($YMax == -99999) { $YMax = undef; }
376 if ($ZMax == -99999) { $ZMax = undef; }
377
378 return ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax);
379 }
380
381 # Read PDB file and return reference to record lines..
382 sub ReadPDBFile {
383 my($PDBFile) = @_;
384
385 my($Line, @PDBRecordLines);
386
387 @PDBRecordLines = ();
388 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n";
389 while ($Line = GetTextLine(\*PDBFILE)) {
390 push @PDBRecordLines, $Line;
391 }
392
393 close PDBFILE;
394
395 return (\@PDBRecordLines);
396 }
397
398 #
399 # Get experimental technique information...
400 #
401 sub GetExperimentalTechnique {
402 my($PDBRecordLinesRef) = @_;
403 my($RecordLine, $ContinuationNum, $ExperimentalTechnique);
404
405 $ExperimentalTechnique = undef;
406
407 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
408 if (_IsRecordType($RecordLine, 'EXPDTA')) {
409 ($ContinuationNum, $ExperimentalTechnique) = ParseExpdtaRecordLine($RecordLine);
410 last LINE;
411 }
412 }
413
414 return $ExperimentalTechnique;
415 }
416
417 #
418 # Get experimental technique resolution information...
419 #
420 sub GetExperimentalTechniqueResolution {
421 my($PDBRecordLinesRef) = @_;
422 my($RecordLine, $Resolution, $ResolutionUnits);
423
424 ($Resolution, $ResolutionUnits) = ((undef) x 2);
425
426 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
427 if ($RecordLine =~ /^REMARK 2 RESOLUTION./i) {
428 ($Resolution, $ResolutionUnits) = ParseRemark2ResolutionRecordLine($RecordLine);
429 last LINE;
430 }
431 }
432
433 return ($Resolution, $ResolutionUnits);
434 }
435
436 #
437 # Parse HEADER record line...
438 sub ParseHeaderRecordLine {
439 my($Line) = @_;
440 my($Classification, $DepositionDate, $IDCode) = (undef, undef, undef);
441
442 if ($Line !~ /^HEADER/i) {
443 return ($Classification, $DepositionDate, $IDCode);
444 }
445 my($Length);
446
447 ($Classification, $DepositionDate, $IDCode) = ('') x 3;
448
449 $Length = length $Line;
450
451 if ($Length <= 62) {
452 ($Classification, $DepositionDate) = unpack("x10A40A9", $Line);
453 }
454 else {
455 ($Classification, $DepositionDate, $IDCode) = unpack("x10A40A9x3A4", $Line);
456 }
457
458 $Classification = RemoveLeadingAndTrailingWhiteSpaces($Classification);
459 $DepositionDate =~ s/ //g;
460 $IDCode =~ s/ //g;
461
462 return ($Classification, $DepositionDate, $IDCode);
463 }
464
465 #
466 # Generate HEADER record line...
467 sub GenerateHeaderRecordLine {
468 my($Classification, $Date, $IDCode, $Line);
469
470 $Classification = "Created using MayaChemTools";
471 $Date = GenerateHeaderRecordTimeStamp();
472 if (@_ == 3) {
473 ($IDCode, $Classification, $Date) = @_;
474 }
475 elsif (@_ == 2) {
476 ($IDCode, $Classification) = @_;
477 }
478 elsif (@_ == 1) {
479 ($IDCode) = @_;
480 }
481
482 $Line = sprintf "HEADER %-40.40s%9.9s %4.4s", $Classification, $Date, $IDCode;
483 return $Line;
484 }
485
486 # Generate PDB header time stamp...
487 sub GenerateHeaderRecordTimeStamp {
488 return TimeUtil::PDBFileTimeStamp();
489 }
490
491 #
492 # Parse ATOM record line.
493 #
494 sub ParseAtomRecordLine {
495 my($Line) = @_;
496
497 return _ParseAtomOrHetatmRecordLine($Line);
498 }
499
500 # Generate ATOM record line...
501 sub GenerateAtomRecordLine {
502 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
503
504 return _GenerateAtomOrHetatmRecordLine('ATOM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
505 }
506
507 #
508 # Parse ATOM/HETATm record line.
509 #
510 sub ParseAtomOrHetatmRecordLine {
511 my($Line) = @_;
512
513 return _ParseAtomOrHetatmRecordLine($Line);
514 }
515
516 # Generate ATOM/HETATM record line...
517 sub GenerateAtomOrHetatmRecordLine {
518 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
519
520 return _GenerateAtomOrHetatmRecordLine($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
521 }
522 #
523 # Parse HETATM record line...
524 #
525 sub ParseHetatmRecordLine {
526 my($Line) = @_;
527
528 return _ParseAtomOrHetatmRecordLine($Line);
529 }
530
531 # Generate HETATM record line...
532 sub GenerateHetatmRecordLine {
533 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
534
535 return _GenerateAtomOrHetatmRecordLine('HETATM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
536 }
537
538 # Parse EXPDTA record line...
539 #
540 # EXPDTA format:
541 #
542 #1 - 6 Record name "EXPDTA"
543 # 9 - 10 Continuation continuation Allows concatenation of multiple records.
544 # 11 - 70 SList technique The experimental technique(s) with optional comment describing the sample or experiment.
545 #
546 # The EXPDTA record identifies the experimental technique used. This may refer to the type of radiation and
547 # sample, or include the spectroscopic or modeling technique. Permitted values include:
548 #
549 # ELECTRON DIFFRACTION
550 # FIBER DIFFRACTION
551 # FLUORESCENCE TRANSFER
552 # NEUTRON DIFFRACTION
553 # NMR
554 # THEORETICAL MODEL
555 # X-RAY DIFFRACTION
556 #
557 sub ParseExpdtaRecordLine {
558 my($Line) = @_;
559
560 if ($Line !~ /^EXPDTA/i) {
561 return ((undef) x 2);
562 }
563
564 my($ContinuationNum, $ExperimentalTechnique) = unpack("x8A2A60" , $Line);
565
566 $ContinuationNum =~ s/ //g;
567 $ExperimentalTechnique = RemoveLeadingAndTrailingWhiteSpaces($ExperimentalTechnique);
568
569 return ($ContinuationNum, $ExperimentalTechnique);
570 }
571
572 # Parse REMARK 2 record line...
573 #
574 # REMARK 2 format:
575 #
576 # The second REMARK 2 record has one of two formats. The first is used for diffraction studies, the second
577 # for other types of experiments in which resolution is not relevant, e.g., NMR and theoretical modeling.
578 #
579 #For diffraction experiments:
580 #
581 # 1 - 6 Record name "REMARK"
582 # 10 LString(1) "2"
583 # 12 - 22 LString(11) "RESOLUTION."
584 # 23 - 27 Real(5.2) resolution Resolution.
585 # 29 - 38 LString(10) "ANGSTROMS."
586 #
587 # REMARK 2 when not a diffraction experiment:
588 #
589 # 1 - 6 Record name "REMARK"
590 # 10 LString(1) "2"
591 # 12 - 38 LString(28) "RESOLUTION. NOT APPLICABLE."
592 # 41 - 70 String comment Comment.
593 #
594 sub ParseRemark2ResolutionRecordLine {
595 my($Line) = @_;
596
597 if ($Line !~ /^REMARK 2 RESOLUTION./i) {
598 return ((undef) x 2);
599 }
600
601 my($Resolution, $ResolutionUnits);
602
603 if ($Line =~ /NOT APPLICABLE/i) {
604 ($Resolution, $ResolutionUnits) = ("NOT APPLICABLE", "");
605 }
606 else {
607 ($Resolution, $ResolutionUnits) = unpack("x22A5x1A10" , $Line);
608 }
609
610 $Resolution = RemoveLeadingAndTrailingWhiteSpaces($Resolution);
611
612 $ResolutionUnits = RemoveLeadingAndTrailingWhiteSpaces($ResolutionUnits);
613 $ResolutionUnits =~ s/\.$//;
614
615 return ($Resolution, $ResolutionUnits);
616 }
617
618 #
619 # Parse SEQRES record line...
620 #
621 # SEQRES format:
622 #
623 # 1 - 6 Record name "SEQRES"
624 # 9 - 10 Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain.
625 # 12 - Chain identifier
626 # 14 - 17 Integer numRes Number of residues in the chain
627 # 20 - 22 24 -26 ... ... 68 - 70 Residue name resName Residue name.
628 #
629 sub ParseSeqresRecordLine {
630 my($Line) = @_;
631
632 if ($Line !~ /^SEQRES/i) {
633 return ((undef) x 5);
634 }
635 my($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = unpack("x8A2x1A1x1A4x2A51" , $Line);
636 $RecordSerialNumber =~ s/ //g;
637 $ChainID =~ s/ //g;
638 $NumOfResidues =~ s/ //g;
639 $ResidueNames = RemoveLeadingAndTrailingWhiteSpaces($ResidueNames);
640
641 return ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames);
642 }
643
644 #
645 # Parse CONECT record line...
646 #
647 # CONECT format:
648 #
649 # 1 - 6 Record name "CONECT"
650 # 7 - 11 Atom number
651 # 12 - 16, 17 - 21, 22 - 26, 27 - 31 Atom number of bonded atom
652 #
653 # 32 - 36, 37 - 41 Atom number of hydrogen bonded atom
654 # 42 - 46 Atom number of salt bridged atom
655 # 47 - 51, 52 -56 Atom number of hydrogen bonded atom
656 # 57 - 61 Atom number of salt bridged atom
657 #
658 sub ParseConectRecordLine {
659 my($Line) = @_;
660
661 if ($Line !~ /^CONECT/i) {
662 return ((undef) x 11);
663 }
664 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = map {s/ //g; $_} unpack("x6A5A5A5A5A5A5A5A5A5A5A5", $Line);
665
666 return ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2);
667 }
668
669 # Generate CONECT record line...
670 sub GenerateConectRecordLine {
671 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = @_;
672 my($Line);
673
674 $Line = sprintf "CONECT%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s", $AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2;
675
676 return $Line;
677 }
678
679 #
680 # Parse TER record line...
681 #
682 # TER format:
683 #
684 #1 - 6 Record name "TER "
685 # 7 - 11 Serial number
686 # 18 - 20 Residue name
687 # 22 Chain identifier
688 # 23 - 26 Residue sequence number
689 # 27 Insertion code
690 #
691 sub ParseTerRecordLine {
692 my($Line) = @_;
693
694 if ($Line !~ /^TER/i) {
695 return ((undef) x 5);
696 }
697 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Length);
698
699 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = ('') x 5;
700
701 $Length = length $Line;
702
703 if ($Length <= 17) {
704 ($SerialNumber, $ResidueName) = map {s/ //g; $_} unpack("x6A5", $Line);
705 }
706 elsif ($Length <= 21) {
707 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3", $Line);
708 }
709 else {
710 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3xA1A4A1", $Line);
711 }
712
713 return ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode);
714 }
715
716 # Generate TER record line...
717 sub GenerateTerRecordLine {
718 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Line) = ('') x 6;
719
720 if (@_ == 5) {
721 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = @_;
722 }
723 elsif (@_ == 4) {
724 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber) = @_;
725 }
726 elsif (@_ == 3) {
727 ($SerialNumber, $ResidueName) = @_;
728 }
729 elsif (@_ == 2) {
730 ($SerialNumber, $ResidueName) = @_;
731 }
732 elsif (@_ == 1) {
733 ($SerialNumber) = @_;
734 }
735 $Line = sprintf "TER %5.5s %-3.3s %1.1s%4.4s%1.1s", $SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode;
736
737 return $Line;
738 }
739
740 #
741 # Parse MASTER record line...
742 #
743 # MASTER record format:
744 #
745 #1 - 6 Record name "MASTER"
746 # 11 - 15 Number of REMARK records
747 # 16 - 20 "0"
748 # 21 - 25 Number of HET records
749 # 26 - 30 Number of HELIX records
750 # 31 - 35 Number of SHEET records
751 # 36 - 40 Number of TURN records
752 # 41 - 45 Number of SITE records
753 # 46 - 50 Number of coordinate transformation records (ORIGXn+SCALEn+MTRIXn)
754 # 51 - 55 Number of atomic coordinate records (ATOM+HETATM)
755 # 56 - 60 Number of TER records
756 # 61 - 65 Number of CONECT records
757 # 66 - 70 Number of SEQRES records
758 #
759 sub ParseMasterRecordLine {
760 my($Line) = @_;
761
762 if ($Line !~ /^MASTER/i) {
763 return ((undef) x 11);
764 }
765 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = map {s/ //g; $_} unpack("x6x4A5x5A5A5A5A5A5A5A5A5A5A5", $Line);
766
767 return ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords);
768 }
769
770 # End record...
771 sub GenerateEndRecordLine {
772 my($Line);
773 $Line = 'END ';
774 return $Line;
775 }
776
777 # ATOM/HETATM record format:
778 #
779 # 1 - 6 Record name
780 # 7 - 11 Atom serial number - right justified
781 # 13 - 16 Atom name
782 # 17 Alternate location indicator.
783 # 18 - 20 Residue name - right justified
784 # 22 Chain identifier.
785 # 23 - 26 Residue sequence number - right justified
786 # 27 Code for insertion of residues.
787 # 31 - 38 Real(8.3), Orthogonal coordinates for X in Angstroms.
788 # 39 - 46 Real(8.3), Orthogonal coordinates for Y in Angstroms.
789 # 47 - 54 Real(8.3), Orthogonal coordinates for Z in Angstroms.
790 # 55 - 60 Real(6.2), Occupancy
791 # 61 - 66 Real(6.2), Temperature factor
792 # 73 - 76 LString(4), Segment identifier, left-justified.
793 # 77 - 78 LString(2), Element symbol, right-justified.
794 #79 - 80 LString(2), Charge on the atom.
795 #
796 # Notes:
797 # . Atom names starting with C, N, O and S are left justified starting with column 14
798 # and others are left justified starting with column 13.
799 #
800 # . Six characters (columns) are reserved for atom names, assigned as follows:
801 #
802 # 13 - 14 Chemical symbol - right justified, except for hydrogen atoms
803 #
804 # And for amino acids:
805 #
806 # 15 Remoteness indicator (alphabetic) (A, B, G, D, E, Z and so on)
807 # 16 Branch designator (numeric)
808 #
809 sub _ParseAtomOrHetatmRecordLine {
810 my($Line) = @_;
811
812 if ($Line !~ /^(ATOM|HETATM)/i) {
813 return ((undef) x 15);
814 }
815 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $Length);
816
817 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ('') x 15;
818
819 $Length = length $Line;
820
821 if ($Length <= 72) {
822 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6", $Line);
823 }
824 else {
825 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6x6A4A2A2", $Line);
826 }
827 return($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
828 }
829
830 # Generate ATOM/HETATM record line...
831 sub _GenerateAtomOrHetatmRecordLine {
832 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_;
833 my($Line, $AtomNameFormat);
834
835 if (length($AtomName) >= 4) {
836 # Left justified starting at column 13 for all atom names of length 4...
837 $AtomNameFormat = "%-4.4s";
838 }
839 elsif (IsEmpty($ElementSymbol)) {
840 # No element symbol specified; just guess from atom name to cover most likely cases...
841 $AtomNameFormat = ($AtomName =~ /^(C|N|O|S)/i) ? " %-3.3s" : "%-4.4s";
842 }
843 else {
844 # Element symbol specified...
845 if ($ElementSymbol =~ /^H$/i) {
846 # Hydrogen atom name with <=3 characters is left justified starting at column 14;
847 # Otherwise, left justified starting at column 13.
848 $AtomNameFormat = (length($AtomName) <= 3) ? " %-3.3s" : "%-4.4s";
849 }
850 else {
851 # Non-hydrogen atom name...
852 $AtomNameFormat = (length($ElementSymbol) == 1) ? " %-3.3s" : "%-4.4s";
853 }
854 }
855
856 $Line = sprintf "%-6.6s%5.5s ${AtomNameFormat}%1.1s%3.3s %1.1s%4.4s%1.1s %8.8s%8.8s%8.8s%6.6s%6.6s %-4.4s%2.2s%2.2s", $RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge;
857
858 return $Line;
859 }
860
861 # Check record type...
862 sub _IsRecordType {
863 my($Line, $SpecifiedType) = @_;
864 my($Type, $Status);
865
866 ($Type) = map {s/ //g; $_} unpack("A6", $Line);
867
868 $Status = ($SpecifiedType eq $Type) ? 1 : 0;
869
870 return $Status;
871 }
872
873 # Get record type...
874 sub _GetRecordType {
875 my($Line) = @_;
876 my($Type);
877
878 ($Type) = map {s/ //g; $_} unpack("A6", $Line);
879
880 return $Type;
881 }
882
883 # Get chains and residues data using ATOM/HETATM records...
884 #
885 sub _GetChainsAndResiduesFromAtomHetatmRecords {
886 my($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_;
887
888 my($LineCount, $TotalChainCount, $PreviousResidueNumber, $ChainCount, $DefaultChainID, $DefaultChainLabel, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ChainsDataMap);
889
890 # Do a quick chain count using TER record...
891 $TotalChainCount = 0;
892 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
893 if (IsEndmdlRecordType($RecordLine)) {
894 last LINE;
895 }
896 if (IsTerRecordType($RecordLine)) {
897 $TotalChainCount++;
898 }
899 }
900
901 %ChainsDataMap = ();
902 @{$ChainsDataMap{ChainIDs}} = ();
903 %{$ChainsDataMap{Residues}} = ();
904 %{$ChainsDataMap{ResidueNumbers}} = ();
905 %{$ChainsDataMap{Lines}} = ();
906 %{$ChainsDataMap{ResidueCount}} = ();
907
908 $LineCount = 0;
909 $ChainCount = 0;
910 $DefaultChainLabel = 'None';
911 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
912 $PreviousResidueNumber = 0;
913
914 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
915 $LineCount++;
916 if (IsTerRecordType($RecordLine)) {
917 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
918 $ChainCount++;
919 if ($ChainCount == $TotalChainCount) {
920 last LINE;
921 }
922 else {
923 next LINE;
924 }
925 }
926 elsif (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
927 next LINE;
928 }
929 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
930
931 if (IsEmpty($ChainID)) {
932 $ChainID = $DefaultChainID;
933 }
934 if (exists $ChainsDataMap{Residues}{$ChainID}) {
935 # Data for existing chain...
936 if ($GetRecordLinesFlag) {
937 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
938 }
939
940 if ($ResidueNumber != $PreviousResidueNumber) {
941 # Next residue with in the chain...
942 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
943 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
944
945 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
946 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
947 }
948 else {
949 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
950 }
951 $PreviousResidueNumber = $ResidueNumber;
952 }
953 }
954 else {
955 # Data for new chain...
956 push @{$ChainsDataMap{ChainIDs}}, $ChainID;
957
958 @{$ChainsDataMap{Residues}{$ChainID}} = ();
959 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
960
961 @{$ChainsDataMap{ResidueNumbers}{$ChainID}} = ();
962 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
963
964 @{$ChainsDataMap{Lines}{$ChainID}} = ();
965 if ($GetRecordLinesFlag) {
966 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
967 }
968
969 %{$ChainsDataMap{ResidueCount}{$ChainID}} = ();
970 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
971 $PreviousResidueNumber = $ResidueNumber;
972 }
973 }
974 if (!$GetChainResiduesBeyondTERFlag) {
975 return \%ChainsDataMap;
976 }
977 # Look for any HETATM residues specified outside TER records which could belong to an existing chain...
978 my($LineIndex, $PreviousChainID);
979 $PreviousChainID = '';
980 $PreviousResidueNumber = 0;
981 LINE: for $LineIndex (($LineCount - 1) .. $#{$PDBRecordLinesRef}) {
982 $RecordLine = $PDBRecordLinesRef->[$LineIndex];
983 if (IsEndmdlRecordType($RecordLine)) {
984 last LINE;
985 }
986 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
987 next LINE;
988 }
989 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
990 if (IsEmpty($ChainID)) {
991 # Ignore the chains with no ids...
992 next LINE;
993 }
994 if (! exists($ChainsDataMap{Residues}{$ChainID})) {
995 # Don't collect any new chains after TER record...
996 next LINE;
997 }
998 if ($GetRecordLinesFlag) {
999 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine;
1000 }
1001 if ($ResidueNumber != $PreviousResidueNumber || $ChainID ne $PreviousChainID) {
1002
1003 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName;
1004 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber;
1005
1006 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
1007 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
1008 }
1009 else {
1010 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
1011 }
1012 $PreviousChainID = $ChainID;
1013 $PreviousResidueNumber = $ResidueNumber;
1014 }
1015 }
1016 return \%ChainsDataMap;
1017 }
1018
1019 # Get chains and residues data using SEQRES records...
1020 #
1021 sub _GetChainsAndResiduesFromSeqresRecords {
1022 my($PDBRecordLinesRef) = @_;
1023
1024 my($ChainCount, $DefaultChainLabel, $DefaultChainID, $RecordLine, $RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueName, $ResidueNamesString, @ResidueNamesList, %ChainsDataMap);
1025
1026 %ChainsDataMap = ();
1027 @{$ChainsDataMap{ChainIDs}} = ();
1028 %{$ChainsDataMap{Residues}} = ();
1029 %{$ChainsDataMap{ResidueNumbers}} = ();
1030 %{$ChainsDataMap{ResidueCount}} = ();
1031
1032 $ChainCount = 0;
1033 $DefaultChainLabel = 'None';
1034 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
1035
1036 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
1037 if (!IsSeqresRecordType($RecordLine)) {
1038 next LINE;
1039 }
1040 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNamesString) = ParseSeqresRecordLine($RecordLine);
1041 if ($RecordSerialNumber == 1) {
1042 # Indicates start of a new chain...
1043 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1);
1044 $ChainCount++;
1045 }
1046 if (IsEmpty($ChainID)) {
1047 $ChainID = $DefaultChainID;
1048 }
1049 # Process the residues...
1050 @ResidueNamesList = split /[ ]+/, $ResidueNamesString;
1051
1052 if (exists $ChainsDataMap{Residues}{$ChainID}) {
1053 # Data for existing chain...
1054 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList;
1055 }
1056 else {
1057 # Data for new chain...
1058 push @{$ChainsDataMap{ChainIDs}}, $ChainID;
1059 @{$ChainsDataMap{Residues}{$ChainID}} = ();
1060 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList;
1061 }
1062
1063 # Setup residue count...
1064 for $ResidueName (@ResidueNamesList) {
1065 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) {
1066 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1;
1067 }
1068 else {
1069 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1;
1070 }
1071 }
1072 }
1073 return \%ChainsDataMap;
1074 }
1075
1076 1;
1077
1078 __END__
1079
1080 =head1 NAME
1081
1082 PDBFileUtil
1083
1084 =head1 SYNOPSIS
1085
1086 use PDBFileUtil ;
1087
1088 use PDBFileUtil qw(:all);
1089
1090 =head1 DESCRIPTION
1091
1092 B<PDBFileUtil> module provides the following functions:
1093
1094 GenerateAtomOrHetatmRecordLine, GenerateAtomRecordLine, GenerateConectRecordLine,
1095 GenerateEndRecordLine, GenerateHeaderRecordLine, GenerateHeaderRecordTimeStamp,
1096 GenerateHetatmRecordLine, GenerateTerRecordLine, GetAllResidues,
1097 GetChainsAndResidues, GetConectRecordLines, GetExperimentalTechnique,
1098 GetExperimentalTechniqueResolution, GetMinMaxCoords, GetPDBRecordType,
1099 GetRecordTypesCount, IsAtomRecordType, IsConectRecordType, IsEndmdlRecordType,
1100 IsHeaderRecordType, IsHetatmRecordType, IsMasterRecordType, IsModelRecordType,
1101 IsPDBFile, IsSeqresRecordType, IsTerRecordType, ParseAtomOrHetatmRecordLine,
1102 ParseAtomRecordLine, ParseConectRecordLine, ParseExpdtaRecordLine,
1103 ParseHeaderRecordLine, ParseHetatmRecordLine, ParseMasterRecordLine,
1104 ParseRemark2ResolutionRecordLine, ParseSeqresRecordLine, ParseTerRecordLine,
1105 ReadPDBFile
1106
1107 =head1 METHODS
1108
1109 =over 4
1110
1111 =item B<GenerateAtomOrHetatmRecordLine>
1112
1113 $RecordLine = GenerateAtomOrHetatmRecordLine($RecordType,
1114 $AtomNumber, $AtomName, $AlternateLocation, $ResidueName,
1115 $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z,
1116 $Occupancy, $TemperatureFactor, $SegmentID,
1117 $ElementSymbol, $AtomCharge);
1118
1119 Returns ATOM or HETATM record line.
1120
1121 =item B<GenerateAtomRecordLine>
1122
1123 $RecordLine = GenerateAtomRecordLine($AtomNumber,
1124 $AtomName, $AlternateLocation, $ResidueName, $ChainID,
1125 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy,
1126 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge);
1127
1128 Returns ATOM record line.
1129
1130 =item B<GenerateConectRecordLine>
1131
1132 $RecordLine = GenerateConectRecordLine($AtomNum, $BondedAtomNum1,
1133 $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4,
1134 $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1,
1135 $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2);
1136
1137 Returns CONECT record line.
1138
1139 =item B<GenerateHeaderRecordLine>
1140
1141 $RecordLine = GenerateHeaderRecordLine($IDCode, [$Classification,
1142 $Date]);
1143
1144 Returns HEADER record line.
1145
1146 =item B<GenerateHeaderRecordTimeStamp>
1147
1148 $Date = GenerateHeaderRecordTimeStamp();
1149
1150 Returns PDB header time stamp.
1151
1152 =item B<GenerateHetatmRecordLine>
1153
1154 $RecordLine = GenerateHetatmRecordLine($AtomNumber, $AtomName,
1155 $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber,
1156 $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor,
1157 $SegmentID, $ElementSymbol, $AtomCharge);
1158
1159 Returns HETATM record line.
1160
1161 =item B<GenerateEndRecordLine>
1162
1163 $RecordLine = GenerateEndRecordLine();
1164
1165 Returns END record line.
1166
1167 =item B<GenerateTerRecordLine>
1168
1169 $RecordLine = GenerateTerRecordLine($SerialNumber, [$ResidueName,
1170 $ChainID, $ResidueNumber, $InsertionCode]);
1171
1172 Returns TER record line.
1173
1174 =item B<GetAllResidues>
1175
1176 $ResiduesDataRef = GetAllResidues($PDBRecordLinesRef);
1177
1178 Gets residue information using ATOM/HETATM records and returns a reference to a hash with
1179 following key/value pairs:
1180
1181 $ResiduesDataRef->{ResidueNames} - Array of all the residues
1182 $ResiduesDataRef->{ResidueCount}{$ResidueName} - Count of residues
1183 $ResiduesDataRef->{AtomResidueNames}} - Array of all ATOM residues
1184 $ResiduesDataRef->{AtomResidueCount}{$ResidueName} - Count of
1185 residues in ATOM records
1186 $ResiduesDataRef->{HetatomResidueNames} - List of all HETATM
1187 residues
1188 $ResiduesDataRef->{HetatmResidueCount}{$ResidueName} - Count of
1189 residues HETATM records
1190
1191 ATOM/HETATM records after the first ENDMDL records are simply ingnored.
1192
1193 =item B<GetChainsAndResidues>
1194
1195 $ChainsDataRef = GetChainsAndResidues($PDBRecordLinesRef,
1196 [$RecordsSource, $GetChainResiduesBeyondTERFlag,
1197 $GetRecordLinesFlag]);
1198
1199 Gets chains and residue information using ATOM/HETATM or SEQRES records and returns a reference to a
1200 hash with these keys:
1201
1202 $ChainsDataRef->{ChainIDs} - List of chain IDs with 'None' for
1203 no IDs
1204 $ChainsDataRef->{Residues}{$ChainID} - List of residues in order
1205 of their appearance in a chain
1206 $ChainsDataRef->{ResidueCount}{$ChainID}{$ResidueName} - Count of
1207 residues in a chain
1208
1209 Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records.
1210 ATOM/HETATM records after the first ENDMDL records are simply ingnored.
1211
1212 =item B<GetConectRecordLines>
1213
1214 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef,
1215 $AtomNumbersMapRef);
1216
1217 Collects CONECT record lines for specific atom number, modified specified data to exclude any atom
1218 number not present in the list of specified atom numbers and returns a reference to list of
1219 CONECT record lines.
1220
1221 =item B<GetExperimentalTechnique>
1222
1223 $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef);
1224
1225 Returns I<ExperimentalTechnique> value retrieved from EXPDATA record line.
1226
1227 =item B<GetExperimentalTechniqueResolution>
1228
1229 ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution(
1230 $PDBRecordLinesRef);
1231
1232 Returns I<Resolution> and I<ResolutionUnits> values from REMARK 2 RESOLUTION
1233 record line.
1234
1235 =item B<GetMinMaxCoords>
1236
1237 ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) =
1238 GetMinMaxCoords($PDBRecordLinesRef);
1239
1240 Returns minimum and maximum XYZ coordinates for ATOM/HETATM records.
1241
1242 =item B<GetPDBRecordType>
1243
1244 $RecordType = GetPDBRecordType($RecordLine);
1245
1246 Returns type of I<RecordLine>.
1247
1248 =item B<GetRecordTypesCount>
1249
1250 $RecordTypeDataRef = GetRecordTypesCount($PDBRecordLinesRef,
1251 [$SpecifiedRecordType, $GetRecordLinesFlag]);
1252
1253 Counts the number of each record type or a $SpecifiedRecordType and returns a reference to data
1254 type with following key/value pairs:
1255
1256 $RecordTypeDataRef->{RecordTypes} - An array of unique record types
1257 in order of their presence in the file
1258 $RecordTypeDataRef->{Count}{$RecordType} - Count of each record type
1259 $RecordTypeDataRef->{Lines}{$RecordType} - Optional lines data for a
1260 specific record type.
1261
1262 =item B<IsAtomRecordType>
1263
1264 $Status = IsAtomRecordType($RecordLine);
1265
1266 Returns 1 or 0 based on whether it's a ATOM record line.
1267
1268 =item B<IsConectRecordType>
1269
1270 $Status = IsAtomConectType($RecordLine);
1271
1272 Returns 1 or 0 based on whether it's a CONECT record line.
1273
1274 =item B<IsEndmdlRecordType>
1275
1276 $Status = IsEndmdlRecordType($RecordLine);
1277
1278 Returns 1 or 0 based on whether it's a ENDMDL a record line.
1279
1280 =item B<IsHeaderRecordType>
1281
1282 $Status = IsHeaderRecordType($RecordLine);
1283
1284 Returns 1 or 0 based on whether it's a HEADER a record line.
1285
1286 =item B<IsHetatmRecordType>
1287
1288 $Status = IsHetatmRecordType($RecordLine);
1289
1290 Returns 1 or 0 based on whether it's a HETATM a record line.
1291
1292 =item B<IsMasterRecordType>
1293
1294 $Status = IsMasterRecordType($RecordLine);
1295
1296 Returns 1 or 0 based on whether it's a MASTER a record line.
1297
1298 =item B<IsModelRecordType>
1299
1300 $Status = IsModelRecordType($RecordLine);
1301
1302 Returns 1 or 0 based on whether it's a MODEL record line.
1303
1304 =item B<IsPDBFile>
1305
1306 $Status = IsPDBFile($PDBFile);
1307
1308 Returns 1 or 0 based on whether it's a PDB file.
1309
1310 =item B<IsSeqresRecordType>
1311
1312 $Status = IsSeqresRecordType($RecordLine);
1313
1314 Returns 1 or 0 based on whether it's SEQRES a record line.
1315
1316 =item B<IsTerRecordType>
1317
1318 $Status = IsTerRecordType($RecordLine);
1319
1320 Returns 1 or 0 based on whether it's a TER record line.
1321
1322 =item B<ParseAtomOrHetatmRecordLine>
1323
1324 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID,
1325 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy,
1326 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) =
1327 ParseAtomOrHetatmRecordLine($RecordLine);
1328
1329 Parses ATOM or HETATM record line.
1330
1331 =item B<ParseAtomRecordLine>
1332
1333 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID,
1334 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy,
1335 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) =
1336 ParseAtomRecordLine($RecordLine);
1337
1338 Parses ATOM record line.
1339
1340 =item B<ParseConectRecordLine>
1341
1342 ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3,
1343 $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2,
1344 $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4,
1345 $SaltBridgedAtomNum2) = ParseConectRecordLine($RecordLine);
1346
1347 Parses CONECT record line.
1348
1349 =item B<ParseExpdtaRecordLine>
1350
1351 ($ContinuationNum, $ExperimentalTechnique) = ParseExpdtaRecordLine($Line);
1352
1353 Parses EXPDTA record line.
1354
1355 =item B<ParseHeaderRecordLine>
1356
1357 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($RecordLine);
1358
1359 Parses HEADER record line
1360
1361 =item B<ParseHetatmRecordLine>
1362
1363 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID,
1364 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy,
1365 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) =
1366 ParseHetatmRecordLine($RecordLine);
1367
1368 Parses HETATM record line.
1369
1370 =item B<ParseMasterRecordLine>
1371
1372 ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords,
1373 $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords,
1374 $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords,
1375 $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) =
1376 ParseMasterRecordLine($RecordLine);
1377
1378 Parses MASTER ecord line.
1379
1380 =item B<ParseRemark2ResolutionRecordLine>
1381
1382 ($Resolution, $ResolutionUnits) = ParseRemark2ResolutionRecordLine(
1383 $RecordLine);
1384
1385 Parses REMARK 2 RESOLUTION record line.
1386
1387 =item B<ParseSeqresRecordLine>
1388
1389 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) =
1390 ParseSeqresRecordLine($RecordLine);
1391
1392 Parses SEQRES record line.
1393
1394 =item B<ParseTerRecordLine>
1395
1396 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) =
1397 ParseTerRecordLine($RecordLine);
1398
1399 Parses TER record line.
1400
1401 =item B<ReadPDBFile>
1402
1403 $PDBRecordLinesRef = ReadPDBFile($PDBFile);
1404
1405 Reads PDB file and returns reference to record lines.
1406
1407 =back
1408
1409 =head1 AUTHOR
1410
1411 Manish Sud <msud@san.rr.com>
1412
1413 =head1 SEE ALSO
1414
1415 FileUtil.pm, SequenceFileUtil.pm, TextUtil.pm
1416
1417 =head1 COPYRIGHT
1418
1419 Copyright (C) 2015 Manish Sud. All rights reserved.
1420
1421 This file is part of MayaChemTools.
1422
1423 MayaChemTools is free software; you can redistribute it and/or modify it under
1424 the terms of the GNU Lesser General Public License as published by the Free
1425 Software Foundation; either version 3 of the License, or (at your option)
1426 any later version.
1427
1428 =cut