Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/lib/PDBFileUtil.pm @ 0:73ae111cf86f draft
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 11:55:01 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:73ae111cf86f |
---|---|
1 package PDBFileUtil; | |
2 # | |
3 # $RCSfile: PDBFileUtil.pm,v $ | |
4 # $Date: 2015/02/28 20:47:18 $ | |
5 # $Revision: 1.36 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use Exporter; | |
31 use Text::ParseWords; | |
32 use TextUtil; | |
33 use FileUtil; | |
34 use TimeUtil (); | |
35 | |
36 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); | |
37 | |
38 @ISA = qw(Exporter); | |
39 @EXPORT = qw(GetPDBRecordType GetRecordTypesCount GetAllResidues GetConectRecordLines GetChainsAndResidues GetExperimentalTechnique GetExperimentalTechniqueResolution GetMinMaxCoords IsPDBFile IsAtomRecordType IsConectRecordType IsHeaderRecordType IsHetatmRecordType IsSeqresRecordType IsModelRecordType IsEndmdlRecordType IsTerRecordType IsMasterRecordType ReadPDBFile ParseHeaderRecordLine GenerateHeaderRecordLine GenerateHeaderRecordTimeStamp ParseAtomRecordLine GenerateAtomRecordLine ParseAtomOrHetatmRecordLine GenerateAtomOrHetatmRecordLine GenerateHetatmRecordLine ParseHetatmRecordLine ParseConectRecordLine GenerateConectRecordLine ParseExpdtaRecordLine ParseRemark2ResolutionRecordLine ParseSeqresRecordLine ParseTerRecordLine GenerateTerRecordLine ParseMasterRecordLine GenerateEndRecordLine); | |
40 @EXPORT_OK = qw(); | |
41 | |
42 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); | |
43 | |
44 # Get PDB record type... | |
45 sub GetPDBRecordType { | |
46 my($Line) = @_; | |
47 | |
48 return _GetRecordType($Line); | |
49 } | |
50 | |
51 # Is it a PDB file? | |
52 sub IsPDBFile { | |
53 my($PDBFile) = @_; | |
54 my($Line, $Status); | |
55 | |
56 $Status = 0; | |
57 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; | |
58 $Line = GetTextLine(\*PDBFILE); | |
59 $Status = ($Line =~ /^HEADER/i) ? 1 : 0; | |
60 close PDBFILE; | |
61 | |
62 return $Status; | |
63 } | |
64 | |
65 # Is it a atom record type? | |
66 sub IsAtomRecordType { | |
67 my($Line) = @_; | |
68 | |
69 return _IsRecordType($Line, 'ATOM'); | |
70 } | |
71 | |
72 # Is it a connect record type? | |
73 sub IsConectRecordType { | |
74 my($Line) = @_; | |
75 | |
76 return _IsRecordType($Line, 'CONECT'); | |
77 } | |
78 | |
79 # Is it a header atom record type? | |
80 sub IsHeaderRecordType { | |
81 my($Line) = @_; | |
82 | |
83 return _IsRecordType($Line, 'HEADER'); | |
84 } | |
85 | |
86 # Is it a hetro atom record type? | |
87 sub IsHetatmRecordType { | |
88 my($Line) = @_; | |
89 | |
90 return _IsRecordType($Line, 'HETATM'); | |
91 } | |
92 | |
93 # Is it a seqres record type? | |
94 sub IsSeqresRecordType { | |
95 my($Line) = @_; | |
96 | |
97 return _IsRecordType($Line, 'SEQRES'); | |
98 } | |
99 | |
100 # Is it a MODEL record type? | |
101 sub IsModelRecordType { | |
102 my($Line) = @_; | |
103 | |
104 return _IsRecordType($Line, 'MODEL'); | |
105 } | |
106 | |
107 # Is it a ENDMDL record type? | |
108 sub IsEndmdlRecordType { | |
109 my($Line) = @_; | |
110 | |
111 return _IsRecordType($Line, 'ENDMDL'); | |
112 } | |
113 | |
114 # Is it a TER record type? | |
115 sub IsTerRecordType { | |
116 my($Line) = @_; | |
117 | |
118 return _IsRecordType($Line, 'TER'); | |
119 } | |
120 | |
121 # Is it a MASTER record type? | |
122 sub IsMasterRecordType { | |
123 my($Line) = @_; | |
124 | |
125 return _IsRecordType($Line, 'MASTER'); | |
126 } | |
127 | |
128 # Count the number of each record type and a reference to data type with these key/value pairs: | |
129 # {RecordTypes} - An array of unique record types in order of their presence in the file | |
130 # {Count}{$RecordType} - Count of each record type | |
131 # {Lines}{$RecordType} - Optional lines data for a specific record type. | |
132 # | |
133 sub GetRecordTypesCount { | |
134 my($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag, $RecordType, $RecordLine, %RecordTypeDataMap); | |
135 | |
136 %RecordTypeDataMap = (); | |
137 @{$RecordTypeDataMap{RecordTypes}} = (); | |
138 %{$RecordTypeDataMap{Count}} = (); | |
139 %{$RecordTypeDataMap{Lines}} = (); | |
140 | |
141 $SpecifiedRecordType = ''; | |
142 $GetRecordLinesFlag = 0; | |
143 if (@_ == 3) { | |
144 ($PDBRecordLinesRef, $SpecifiedRecordType, $GetRecordLinesFlag) = @_; | |
145 $SpecifiedRecordType = uc $SpecifiedRecordType; | |
146 } | |
147 elsif (@_ == 2) { | |
148 ($PDBRecordLinesRef, $SpecifiedRecordType) = @_; | |
149 $SpecifiedRecordType = uc $SpecifiedRecordType; | |
150 } | |
151 else { | |
152 ($PDBRecordLinesRef) = @_; | |
153 } | |
154 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
155 $RecordType = _GetRecordType($RecordLine); | |
156 if ($SpecifiedRecordType && ($SpecifiedRecordType ne $RecordType)) { | |
157 next LINE; | |
158 } | |
159 if (exists $RecordTypeDataMap{Count}{$RecordType}) { | |
160 # Update count... | |
161 $RecordTypeDataMap{Count}{$RecordType} += 1; | |
162 | |
163 if ($GetRecordLinesFlag) { | |
164 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; | |
165 } | |
166 } | |
167 else { | |
168 # New record type... | |
169 push @{$RecordTypeDataMap{RecordTypes}}, $RecordType; | |
170 $RecordTypeDataMap{Count}{$RecordType} = 1; | |
171 | |
172 if ($GetRecordLinesFlag) { | |
173 @{$RecordTypeDataMap{Lines}{$RecordType}} = (); | |
174 push @{$RecordTypeDataMap{Lines}{$RecordType}}, $RecordLine; | |
175 } | |
176 } | |
177 } | |
178 return (\%RecordTypeDataMap); | |
179 } | |
180 | |
181 # Collect CONECT record lines for specific atom number, modified specified data to exclude any atom | |
182 # number not present in the list of specified atom numbers and return a reference to list of | |
183 # CONECT record lines. | |
184 # | |
185 sub GetConectRecordLines { | |
186 my($PDBRecordLinesRef, $AtomNumbersMapRef) = @_; | |
187 my($AtomNumber, $ConectAtomNumber, $RecordLine, @ConectRecordAtomNums, @ConectRecordLines); | |
188 | |
189 @ConectRecordLines = (); | |
190 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
191 if (!IsConectRecordType($RecordLine)) { | |
192 next LINE; | |
193 } | |
194 @ConectRecordAtomNums = (); | |
195 push @ConectRecordAtomNums, ParseConectRecordLine($RecordLine); | |
196 ATOMNUMBER: for $ConectAtomNumber (@ConectRecordAtomNums) { | |
197 if (defined $ConectAtomNumber) { | |
198 $AtomNumber = $ConectAtomNumber; | |
199 if ($AtomNumber) { | |
200 if (! exists $AtomNumbersMapRef->{$AtomNumber}) { | |
201 next LINE; | |
202 } | |
203 } | |
204 } | |
205 } | |
206 push @ConectRecordLines, $RecordLine; | |
207 } | |
208 return \@ConectRecordLines; | |
209 } | |
210 | |
211 # Get chains and residue information using ATOM/HETATM or SEQRES records. And return a reference to a | |
212 # hash with these keys: | |
213 # | |
214 # @{$ChainsDataMap{ChainIDs}} - List of chain IDs with 'None' for no chain identification | |
215 # @{$ChainsDataMap{Residues}{$ChainID}} - List of residues in order of their appearance in a chain | |
216 # @{$ChainsDataMap{ResidueNumbers}{$ChainID}} - List of residue numbers in order of their appearance in a chain | |
217 # %{$ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}} - Count of specific residues in a chain | |
218 # | |
219 # Notes: | |
220 # . Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records. | |
221 # . In addition to a different chain ID in ATOM/HETATM a TER record also indicates end of an existing chain | |
222 # and start of a new one: ChainID in ATOM/HETATM records might still be emtpy. | |
223 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. | |
224 # | |
225 sub GetChainsAndResidues { | |
226 my($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); | |
227 | |
228 $RecordsSource = 'AtomAndHetatm'; | |
229 $GetChainResiduesBeyondTERFlag = 0; | |
230 $GetRecordLinesFlag = 0; | |
231 | |
232 if (@_ == 4) { | |
233 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; | |
234 } | |
235 elsif (@_ == 3) { | |
236 ($PDBRecordLinesRef, $RecordsSource, $GetChainResiduesBeyondTERFlag) = @_; | |
237 } | |
238 elsif (@_ == 2) { | |
239 ($PDBRecordLinesRef, $RecordsSource) = @_; | |
240 } | |
241 else { | |
242 ($PDBRecordLinesRef) = @_; | |
243 } | |
244 | |
245 if ($RecordsSource =~ /^AtomAndHetatm$/i) { | |
246 return _GetChainsAndResiduesFromAtomHetatmRecords($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag); | |
247 } | |
248 elsif ($RecordsSource =~ /^Seqres$/i) { | |
249 return _GetChainsAndResiduesFromSeqresRecords($PDBRecordLinesRef); | |
250 } | |
251 else { | |
252 my(%ChainsDataMap); | |
253 %ChainsDataMap = (); | |
254 @{$ChainsDataMap{ChainIDs}} = (); | |
255 %{$ChainsDataMap{Residues}} = (); | |
256 %{$ChainsDataMap{ResidueNumbers}} = (); | |
257 %{$ChainsDataMap{ResidueCount}} = (); | |
258 | |
259 return \%ChainsDataMap; | |
260 } | |
261 } | |
262 | |
263 | |
264 # Get residue information using ATOM/HETATM records and return a reference to a hash with | |
265 # these keys: | |
266 # | |
267 # @{$ResiduesDataMap{ResidueNames}} - List of all the residues | |
268 # %{$ResiduesDataMap{ResidueCount}{$ResidueName}} - Count of residues | |
269 # @{$ResiduesDataMap{AtomResidueNames}} - List of all the residues | |
270 # %{$ResiduesDataMap{AtomResidueCount}{$ResidueName}} - Count of residues in ATOM records | |
271 # @{$ResiduesDataMap{HetatomResidueNames}} - List of all the residues | |
272 # %{$ResiduesDataMap{HetatmResidueCount}{$ResidueName}} - Count of residues HETATM records | |
273 # | |
274 # Notes: | |
275 # . ATOM/HETATM records after the first ENDMDL records are simply ingnored. | |
276 # | |
277 sub GetAllResidues { | |
278 my($PDBRecordLinesRef) = @_; | |
279 | |
280 my($PreviousChainID, $PreviousResidueNumber, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ResiduesDataMap); | |
281 | |
282 %ResiduesDataMap = (); | |
283 @{$ResiduesDataMap{ResidueNames}} = (); | |
284 %{$ResiduesDataMap{ResidueCount}} = (); | |
285 @{$ResiduesDataMap{AtomResidueNames}} = (); | |
286 %{$ResiduesDataMap{AtomResidueCount}} = (); | |
287 @{$ResiduesDataMap{HetatmResidueNames}} = (); | |
288 %{$ResiduesDataMap{HetatmResidueCount}} = (); | |
289 | |
290 $PreviousChainID = ''; | |
291 $PreviousResidueNumber = 0; | |
292 | |
293 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
294 if (IsEndmdlRecordType($RecordLine)) { | |
295 last LINE; | |
296 } | |
297 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { | |
298 next LINE; | |
299 } | |
300 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); | |
301 | |
302 if ($PreviousChainID eq $ChainID) { | |
303 if ($ResidueNumber == $PreviousResidueNumber) { | |
304 next LINE; | |
305 } | |
306 $PreviousResidueNumber = $ResidueNumber; | |
307 } | |
308 else { | |
309 # New chain... | |
310 $PreviousChainID = $ChainID; | |
311 $PreviousResidueNumber = $ResidueNumber; | |
312 } | |
313 | |
314 # Store the residue and update its count... | |
315 push @{$ResiduesDataMap{ResidueNames}}, $ResidueName; | |
316 if (exists $ResiduesDataMap{ResidueCount}{$ResidueName}) { | |
317 $ResiduesDataMap{ResidueCount}{$ResidueName} += 1; | |
318 } | |
319 else { | |
320 $ResiduesDataMap{ResidueCount}{$ResidueName} = 1; | |
321 } | |
322 # Update ATOM residue data... | |
323 if (IsAtomRecordType($RecordLine)) { | |
324 push @{$ResiduesDataMap{AtomResidueNames}}, $ResidueName; | |
325 if (exists $ResiduesDataMap{AtomResidueCount}{$ResidueName}) { | |
326 $ResiduesDataMap{AtomResidueCount}{$ResidueName} += 1; | |
327 } | |
328 else { | |
329 $ResiduesDataMap{AtomResidueCount}{$ResidueName} = 1; | |
330 } | |
331 } | |
332 # Update HETATM residue data... | |
333 if (IsHetatmRecordType($RecordLine)) { | |
334 push @{$ResiduesDataMap{HetatmResidueNames}}, $ResidueName; | |
335 if (exists $ResiduesDataMap{HetatmResidueCount}{$ResidueName}) { | |
336 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} += 1; | |
337 } | |
338 else { | |
339 $ResiduesDataMap{HetatmResidueCount}{$ResidueName} = 1; | |
340 } | |
341 } | |
342 } | |
343 | |
344 return \%ResiduesDataMap; | |
345 } | |
346 | |
347 # Return min/max XYZ coordinates for ATOM/HETATM records... | |
348 sub GetMinMaxCoords { | |
349 my($PDBRecordLinesRef) = @_; | |
350 | |
351 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
352 | |
353 ($XMin, $YMin, $ZMin) = (99999) x 3; | |
354 ($XMax, $YMax, $ZMax) = (-99999) x 3; | |
355 | |
356 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
357 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { | |
358 next LINE; | |
359 } | |
360 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); | |
361 | |
362 $XMin = ($X < $XMin) ? $X : $XMin; | |
363 $YMin = ($Y < $YMin) ? $Y : $YMin; | |
364 $ZMin = ($Z < $ZMin) ? $Z : $ZMin; | |
365 | |
366 $XMax = ($X > $XMax) ? $X : $XMax; | |
367 $YMax = ($Y > $YMax) ? $Y : $YMax; | |
368 $ZMax = ($Z > $ZMax) ? $Z : $ZMax; | |
369 } | |
370 | |
371 if ($XMin == 99999) { $XMin = undef; } | |
372 if ($YMin == 99999) { $YMin = undef; } | |
373 if ($ZMin == 99999) { $ZMin = undef; } | |
374 if ($XMax == -99999) { $XMax = undef; } | |
375 if ($YMax == -99999) { $YMax = undef; } | |
376 if ($ZMax == -99999) { $ZMax = undef; } | |
377 | |
378 return ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax); | |
379 } | |
380 | |
381 # Read PDB file and return reference to record lines.. | |
382 sub ReadPDBFile { | |
383 my($PDBFile) = @_; | |
384 | |
385 my($Line, @PDBRecordLines); | |
386 | |
387 @PDBRecordLines = (); | |
388 open PDBFILE, "$PDBFile" or die "Can't open $PDBFile: $!\n"; | |
389 while ($Line = GetTextLine(\*PDBFILE)) { | |
390 push @PDBRecordLines, $Line; | |
391 } | |
392 | |
393 close PDBFILE; | |
394 | |
395 return (\@PDBRecordLines); | |
396 } | |
397 | |
398 # | |
399 # Get experimental technique information... | |
400 # | |
401 sub GetExperimentalTechnique { | |
402 my($PDBRecordLinesRef) = @_; | |
403 my($RecordLine, $ContinuationNum, $ExperimentalTechnique); | |
404 | |
405 $ExperimentalTechnique = undef; | |
406 | |
407 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
408 if (_IsRecordType($RecordLine, 'EXPDTA')) { | |
409 ($ContinuationNum, $ExperimentalTechnique) = ParseExpdtaRecordLine($RecordLine); | |
410 last LINE; | |
411 } | |
412 } | |
413 | |
414 return $ExperimentalTechnique; | |
415 } | |
416 | |
417 # | |
418 # Get experimental technique resolution information... | |
419 # | |
420 sub GetExperimentalTechniqueResolution { | |
421 my($PDBRecordLinesRef) = @_; | |
422 my($RecordLine, $Resolution, $ResolutionUnits); | |
423 | |
424 ($Resolution, $ResolutionUnits) = ((undef) x 2); | |
425 | |
426 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
427 if ($RecordLine =~ /^REMARK 2 RESOLUTION./i) { | |
428 ($Resolution, $ResolutionUnits) = ParseRemark2ResolutionRecordLine($RecordLine); | |
429 last LINE; | |
430 } | |
431 } | |
432 | |
433 return ($Resolution, $ResolutionUnits); | |
434 } | |
435 | |
436 # | |
437 # Parse HEADER record line... | |
438 sub ParseHeaderRecordLine { | |
439 my($Line) = @_; | |
440 my($Classification, $DepositionDate, $IDCode) = (undef, undef, undef); | |
441 | |
442 if ($Line !~ /^HEADER/i) { | |
443 return ($Classification, $DepositionDate, $IDCode); | |
444 } | |
445 my($Length); | |
446 | |
447 ($Classification, $DepositionDate, $IDCode) = ('') x 3; | |
448 | |
449 $Length = length $Line; | |
450 | |
451 if ($Length <= 62) { | |
452 ($Classification, $DepositionDate) = unpack("x10A40A9", $Line); | |
453 } | |
454 else { | |
455 ($Classification, $DepositionDate, $IDCode) = unpack("x10A40A9x3A4", $Line); | |
456 } | |
457 | |
458 $Classification = RemoveLeadingAndTrailingWhiteSpaces($Classification); | |
459 $DepositionDate =~ s/ //g; | |
460 $IDCode =~ s/ //g; | |
461 | |
462 return ($Classification, $DepositionDate, $IDCode); | |
463 } | |
464 | |
465 # | |
466 # Generate HEADER record line... | |
467 sub GenerateHeaderRecordLine { | |
468 my($Classification, $Date, $IDCode, $Line); | |
469 | |
470 $Classification = "Created using MayaChemTools"; | |
471 $Date = GenerateHeaderRecordTimeStamp(); | |
472 if (@_ == 3) { | |
473 ($IDCode, $Classification, $Date) = @_; | |
474 } | |
475 elsif (@_ == 2) { | |
476 ($IDCode, $Classification) = @_; | |
477 } | |
478 elsif (@_ == 1) { | |
479 ($IDCode) = @_; | |
480 } | |
481 | |
482 $Line = sprintf "HEADER %-40.40s%9.9s %4.4s", $Classification, $Date, $IDCode; | |
483 return $Line; | |
484 } | |
485 | |
486 # Generate PDB header time stamp... | |
487 sub GenerateHeaderRecordTimeStamp { | |
488 return TimeUtil::PDBFileTimeStamp(); | |
489 } | |
490 | |
491 # | |
492 # Parse ATOM record line. | |
493 # | |
494 sub ParseAtomRecordLine { | |
495 my($Line) = @_; | |
496 | |
497 return _ParseAtomOrHetatmRecordLine($Line); | |
498 } | |
499 | |
500 # Generate ATOM record line... | |
501 sub GenerateAtomRecordLine { | |
502 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; | |
503 | |
504 return _GenerateAtomOrHetatmRecordLine('ATOM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
505 } | |
506 | |
507 # | |
508 # Parse ATOM/HETATm record line. | |
509 # | |
510 sub ParseAtomOrHetatmRecordLine { | |
511 my($Line) = @_; | |
512 | |
513 return _ParseAtomOrHetatmRecordLine($Line); | |
514 } | |
515 | |
516 # Generate ATOM/HETATM record line... | |
517 sub GenerateAtomOrHetatmRecordLine { | |
518 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; | |
519 | |
520 return _GenerateAtomOrHetatmRecordLine($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
521 } | |
522 # | |
523 # Parse HETATM record line... | |
524 # | |
525 sub ParseHetatmRecordLine { | |
526 my($Line) = @_; | |
527 | |
528 return _ParseAtomOrHetatmRecordLine($Line); | |
529 } | |
530 | |
531 # Generate HETATM record line... | |
532 sub GenerateHetatmRecordLine { | |
533 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; | |
534 | |
535 return _GenerateAtomOrHetatmRecordLine('HETATM', $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
536 } | |
537 | |
538 # Parse EXPDTA record line... | |
539 # | |
540 # EXPDTA format: | |
541 # | |
542 #1 - 6 Record name "EXPDTA" | |
543 # 9 - 10 Continuation continuation Allows concatenation of multiple records. | |
544 # 11 - 70 SList technique The experimental technique(s) with optional comment describing the sample or experiment. | |
545 # | |
546 # The EXPDTA record identifies the experimental technique used. This may refer to the type of radiation and | |
547 # sample, or include the spectroscopic or modeling technique. Permitted values include: | |
548 # | |
549 # ELECTRON DIFFRACTION | |
550 # FIBER DIFFRACTION | |
551 # FLUORESCENCE TRANSFER | |
552 # NEUTRON DIFFRACTION | |
553 # NMR | |
554 # THEORETICAL MODEL | |
555 # X-RAY DIFFRACTION | |
556 # | |
557 sub ParseExpdtaRecordLine { | |
558 my($Line) = @_; | |
559 | |
560 if ($Line !~ /^EXPDTA/i) { | |
561 return ((undef) x 2); | |
562 } | |
563 | |
564 my($ContinuationNum, $ExperimentalTechnique) = unpack("x8A2A60" , $Line); | |
565 | |
566 $ContinuationNum =~ s/ //g; | |
567 $ExperimentalTechnique = RemoveLeadingAndTrailingWhiteSpaces($ExperimentalTechnique); | |
568 | |
569 return ($ContinuationNum, $ExperimentalTechnique); | |
570 } | |
571 | |
572 # Parse REMARK 2 record line... | |
573 # | |
574 # REMARK 2 format: | |
575 # | |
576 # The second REMARK 2 record has one of two formats. The first is used for diffraction studies, the second | |
577 # for other types of experiments in which resolution is not relevant, e.g., NMR and theoretical modeling. | |
578 # | |
579 #For diffraction experiments: | |
580 # | |
581 # 1 - 6 Record name "REMARK" | |
582 # 10 LString(1) "2" | |
583 # 12 - 22 LString(11) "RESOLUTION." | |
584 # 23 - 27 Real(5.2) resolution Resolution. | |
585 # 29 - 38 LString(10) "ANGSTROMS." | |
586 # | |
587 # REMARK 2 when not a diffraction experiment: | |
588 # | |
589 # 1 - 6 Record name "REMARK" | |
590 # 10 LString(1) "2" | |
591 # 12 - 38 LString(28) "RESOLUTION. NOT APPLICABLE." | |
592 # 41 - 70 String comment Comment. | |
593 # | |
594 sub ParseRemark2ResolutionRecordLine { | |
595 my($Line) = @_; | |
596 | |
597 if ($Line !~ /^REMARK 2 RESOLUTION./i) { | |
598 return ((undef) x 2); | |
599 } | |
600 | |
601 my($Resolution, $ResolutionUnits); | |
602 | |
603 if ($Line =~ /NOT APPLICABLE/i) { | |
604 ($Resolution, $ResolutionUnits) = ("NOT APPLICABLE", ""); | |
605 } | |
606 else { | |
607 ($Resolution, $ResolutionUnits) = unpack("x22A5x1A10" , $Line); | |
608 } | |
609 | |
610 $Resolution = RemoveLeadingAndTrailingWhiteSpaces($Resolution); | |
611 | |
612 $ResolutionUnits = RemoveLeadingAndTrailingWhiteSpaces($ResolutionUnits); | |
613 $ResolutionUnits =~ s/\.$//; | |
614 | |
615 return ($Resolution, $ResolutionUnits); | |
616 } | |
617 | |
618 # | |
619 # Parse SEQRES record line... | |
620 # | |
621 # SEQRES format: | |
622 # | |
623 # 1 - 6 Record name "SEQRES" | |
624 # 9 - 10 Serial number of the SEQRES record for the current chain. Starts at 1 and increments by one each line. Reset to 1 for each chain. | |
625 # 12 - Chain identifier | |
626 # 14 - 17 Integer numRes Number of residues in the chain | |
627 # 20 - 22 24 -26 ... ... 68 - 70 Residue name resName Residue name. | |
628 # | |
629 sub ParseSeqresRecordLine { | |
630 my($Line) = @_; | |
631 | |
632 if ($Line !~ /^SEQRES/i) { | |
633 return ((undef) x 5); | |
634 } | |
635 my($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = unpack("x8A2x1A1x1A4x2A51" , $Line); | |
636 $RecordSerialNumber =~ s/ //g; | |
637 $ChainID =~ s/ //g; | |
638 $NumOfResidues =~ s/ //g; | |
639 $ResidueNames = RemoveLeadingAndTrailingWhiteSpaces($ResidueNames); | |
640 | |
641 return ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames); | |
642 } | |
643 | |
644 # | |
645 # Parse CONECT record line... | |
646 # | |
647 # CONECT format: | |
648 # | |
649 # 1 - 6 Record name "CONECT" | |
650 # 7 - 11 Atom number | |
651 # 12 - 16, 17 - 21, 22 - 26, 27 - 31 Atom number of bonded atom | |
652 # | |
653 # 32 - 36, 37 - 41 Atom number of hydrogen bonded atom | |
654 # 42 - 46 Atom number of salt bridged atom | |
655 # 47 - 51, 52 -56 Atom number of hydrogen bonded atom | |
656 # 57 - 61 Atom number of salt bridged atom | |
657 # | |
658 sub ParseConectRecordLine { | |
659 my($Line) = @_; | |
660 | |
661 if ($Line !~ /^CONECT/i) { | |
662 return ((undef) x 11); | |
663 } | |
664 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = map {s/ //g; $_} unpack("x6A5A5A5A5A5A5A5A5A5A5A5", $Line); | |
665 | |
666 return ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2); | |
667 } | |
668 | |
669 # Generate CONECT record line... | |
670 sub GenerateConectRecordLine { | |
671 my($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2) = @_; | |
672 my($Line); | |
673 | |
674 $Line = sprintf "CONECT%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s%5.5s", $AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2; | |
675 | |
676 return $Line; | |
677 } | |
678 | |
679 # | |
680 # Parse TER record line... | |
681 # | |
682 # TER format: | |
683 # | |
684 #1 - 6 Record name "TER " | |
685 # 7 - 11 Serial number | |
686 # 18 - 20 Residue name | |
687 # 22 Chain identifier | |
688 # 23 - 26 Residue sequence number | |
689 # 27 Insertion code | |
690 # | |
691 sub ParseTerRecordLine { | |
692 my($Line) = @_; | |
693 | |
694 if ($Line !~ /^TER/i) { | |
695 return ((undef) x 5); | |
696 } | |
697 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Length); | |
698 | |
699 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = ('') x 5; | |
700 | |
701 $Length = length $Line; | |
702 | |
703 if ($Length <= 17) { | |
704 ($SerialNumber, $ResidueName) = map {s/ //g; $_} unpack("x6A5", $Line); | |
705 } | |
706 elsif ($Length <= 21) { | |
707 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3", $Line); | |
708 } | |
709 else { | |
710 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = map {s/ //g; $_} unpack("x6A5x6A3xA1A4A1", $Line); | |
711 } | |
712 | |
713 return ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode); | |
714 } | |
715 | |
716 # Generate TER record line... | |
717 sub GenerateTerRecordLine { | |
718 my($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $Line) = ('') x 6; | |
719 | |
720 if (@_ == 5) { | |
721 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = @_; | |
722 } | |
723 elsif (@_ == 4) { | |
724 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber) = @_; | |
725 } | |
726 elsif (@_ == 3) { | |
727 ($SerialNumber, $ResidueName) = @_; | |
728 } | |
729 elsif (@_ == 2) { | |
730 ($SerialNumber, $ResidueName) = @_; | |
731 } | |
732 elsif (@_ == 1) { | |
733 ($SerialNumber) = @_; | |
734 } | |
735 $Line = sprintf "TER %5.5s %-3.3s %1.1s%4.4s%1.1s", $SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode; | |
736 | |
737 return $Line; | |
738 } | |
739 | |
740 # | |
741 # Parse MASTER record line... | |
742 # | |
743 # MASTER record format: | |
744 # | |
745 #1 - 6 Record name "MASTER" | |
746 # 11 - 15 Number of REMARK records | |
747 # 16 - 20 "0" | |
748 # 21 - 25 Number of HET records | |
749 # 26 - 30 Number of HELIX records | |
750 # 31 - 35 Number of SHEET records | |
751 # 36 - 40 Number of TURN records | |
752 # 41 - 45 Number of SITE records | |
753 # 46 - 50 Number of coordinate transformation records (ORIGXn+SCALEn+MTRIXn) | |
754 # 51 - 55 Number of atomic coordinate records (ATOM+HETATM) | |
755 # 56 - 60 Number of TER records | |
756 # 61 - 65 Number of CONECT records | |
757 # 66 - 70 Number of SEQRES records | |
758 # | |
759 sub ParseMasterRecordLine { | |
760 my($Line) = @_; | |
761 | |
762 if ($Line !~ /^MASTER/i) { | |
763 return ((undef) x 11); | |
764 } | |
765 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = map {s/ //g; $_} unpack("x6x4A5x5A5A5A5A5A5A5A5A5A5A5", $Line); | |
766 | |
767 return ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords); | |
768 } | |
769 | |
770 # End record... | |
771 sub GenerateEndRecordLine { | |
772 my($Line); | |
773 $Line = 'END '; | |
774 return $Line; | |
775 } | |
776 | |
777 # ATOM/HETATM record format: | |
778 # | |
779 # 1 - 6 Record name | |
780 # 7 - 11 Atom serial number - right justified | |
781 # 13 - 16 Atom name | |
782 # 17 Alternate location indicator. | |
783 # 18 - 20 Residue name - right justified | |
784 # 22 Chain identifier. | |
785 # 23 - 26 Residue sequence number - right justified | |
786 # 27 Code for insertion of residues. | |
787 # 31 - 38 Real(8.3), Orthogonal coordinates for X in Angstroms. | |
788 # 39 - 46 Real(8.3), Orthogonal coordinates for Y in Angstroms. | |
789 # 47 - 54 Real(8.3), Orthogonal coordinates for Z in Angstroms. | |
790 # 55 - 60 Real(6.2), Occupancy | |
791 # 61 - 66 Real(6.2), Temperature factor | |
792 # 73 - 76 LString(4), Segment identifier, left-justified. | |
793 # 77 - 78 LString(2), Element symbol, right-justified. | |
794 #79 - 80 LString(2), Charge on the atom. | |
795 # | |
796 # Notes: | |
797 # . Atom names starting with C, N, O and S are left justified starting with column 14 | |
798 # and others are left justified starting with column 13. | |
799 # | |
800 # . Six characters (columns) are reserved for atom names, assigned as follows: | |
801 # | |
802 # 13 - 14 Chemical symbol - right justified, except for hydrogen atoms | |
803 # | |
804 # And for amino acids: | |
805 # | |
806 # 15 Remoteness indicator (alphabetic) (A, B, G, D, E, Z and so on) | |
807 # 16 Branch designator (numeric) | |
808 # | |
809 sub _ParseAtomOrHetatmRecordLine { | |
810 my($Line) = @_; | |
811 | |
812 if ($Line !~ /^(ATOM|HETATM)/i) { | |
813 return ((undef) x 15); | |
814 } | |
815 my($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $Length); | |
816 | |
817 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ('') x 15; | |
818 | |
819 $Length = length $Line; | |
820 | |
821 if ($Length <= 72) { | |
822 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6", $Line); | |
823 } | |
824 else { | |
825 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = map {s/ //g; $_} unpack("x6A5xA4A1A3xA1A4A1x3A8A8A8A6A6x6A4A2A2", $Line); | |
826 } | |
827 return($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
828 } | |
829 | |
830 # Generate ATOM/HETATM record line... | |
831 sub _GenerateAtomOrHetatmRecordLine { | |
832 my($RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = @_; | |
833 my($Line, $AtomNameFormat); | |
834 | |
835 if (length($AtomName) >= 4) { | |
836 # Left justified starting at column 13 for all atom names of length 4... | |
837 $AtomNameFormat = "%-4.4s"; | |
838 } | |
839 elsif (IsEmpty($ElementSymbol)) { | |
840 # No element symbol specified; just guess from atom name to cover most likely cases... | |
841 $AtomNameFormat = ($AtomName =~ /^(C|N|O|S)/i) ? " %-3.3s" : "%-4.4s"; | |
842 } | |
843 else { | |
844 # Element symbol specified... | |
845 if ($ElementSymbol =~ /^H$/i) { | |
846 # Hydrogen atom name with <=3 characters is left justified starting at column 14; | |
847 # Otherwise, left justified starting at column 13. | |
848 $AtomNameFormat = (length($AtomName) <= 3) ? " %-3.3s" : "%-4.4s"; | |
849 } | |
850 else { | |
851 # Non-hydrogen atom name... | |
852 $AtomNameFormat = (length($ElementSymbol) == 1) ? " %-3.3s" : "%-4.4s"; | |
853 } | |
854 } | |
855 | |
856 $Line = sprintf "%-6.6s%5.5s ${AtomNameFormat}%1.1s%3.3s %1.1s%4.4s%1.1s %8.8s%8.8s%8.8s%6.6s%6.6s %-4.4s%2.2s%2.2s", $RecordType, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge; | |
857 | |
858 return $Line; | |
859 } | |
860 | |
861 # Check record type... | |
862 sub _IsRecordType { | |
863 my($Line, $SpecifiedType) = @_; | |
864 my($Type, $Status); | |
865 | |
866 ($Type) = map {s/ //g; $_} unpack("A6", $Line); | |
867 | |
868 $Status = ($SpecifiedType eq $Type) ? 1 : 0; | |
869 | |
870 return $Status; | |
871 } | |
872 | |
873 # Get record type... | |
874 sub _GetRecordType { | |
875 my($Line) = @_; | |
876 my($Type); | |
877 | |
878 ($Type) = map {s/ //g; $_} unpack("A6", $Line); | |
879 | |
880 return $Type; | |
881 } | |
882 | |
883 # Get chains and residues data using ATOM/HETATM records... | |
884 # | |
885 sub _GetChainsAndResiduesFromAtomHetatmRecords { | |
886 my($PDBRecordLinesRef, $GetChainResiduesBeyondTERFlag, $GetRecordLinesFlag) = @_; | |
887 | |
888 my($LineCount, $TotalChainCount, $PreviousResidueNumber, $ChainCount, $DefaultChainID, $DefaultChainLabel, $RecordLine, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, %ChainsDataMap); | |
889 | |
890 # Do a quick chain count using TER record... | |
891 $TotalChainCount = 0; | |
892 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
893 if (IsEndmdlRecordType($RecordLine)) { | |
894 last LINE; | |
895 } | |
896 if (IsTerRecordType($RecordLine)) { | |
897 $TotalChainCount++; | |
898 } | |
899 } | |
900 | |
901 %ChainsDataMap = (); | |
902 @{$ChainsDataMap{ChainIDs}} = (); | |
903 %{$ChainsDataMap{Residues}} = (); | |
904 %{$ChainsDataMap{ResidueNumbers}} = (); | |
905 %{$ChainsDataMap{Lines}} = (); | |
906 %{$ChainsDataMap{ResidueCount}} = (); | |
907 | |
908 $LineCount = 0; | |
909 $ChainCount = 0; | |
910 $DefaultChainLabel = 'None'; | |
911 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); | |
912 $PreviousResidueNumber = 0; | |
913 | |
914 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
915 $LineCount++; | |
916 if (IsTerRecordType($RecordLine)) { | |
917 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); | |
918 $ChainCount++; | |
919 if ($ChainCount == $TotalChainCount) { | |
920 last LINE; | |
921 } | |
922 else { | |
923 next LINE; | |
924 } | |
925 } | |
926 elsif (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { | |
927 next LINE; | |
928 } | |
929 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); | |
930 | |
931 if (IsEmpty($ChainID)) { | |
932 $ChainID = $DefaultChainID; | |
933 } | |
934 if (exists $ChainsDataMap{Residues}{$ChainID}) { | |
935 # Data for existing chain... | |
936 if ($GetRecordLinesFlag) { | |
937 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; | |
938 } | |
939 | |
940 if ($ResidueNumber != $PreviousResidueNumber) { | |
941 # Next residue with in the chain... | |
942 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; | |
943 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; | |
944 | |
945 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { | |
946 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; | |
947 } | |
948 else { | |
949 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; | |
950 } | |
951 $PreviousResidueNumber = $ResidueNumber; | |
952 } | |
953 } | |
954 else { | |
955 # Data for new chain... | |
956 push @{$ChainsDataMap{ChainIDs}}, $ChainID; | |
957 | |
958 @{$ChainsDataMap{Residues}{$ChainID}} = (); | |
959 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; | |
960 | |
961 @{$ChainsDataMap{ResidueNumbers}{$ChainID}} = (); | |
962 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; | |
963 | |
964 @{$ChainsDataMap{Lines}{$ChainID}} = (); | |
965 if ($GetRecordLinesFlag) { | |
966 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; | |
967 } | |
968 | |
969 %{$ChainsDataMap{ResidueCount}{$ChainID}} = (); | |
970 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; | |
971 $PreviousResidueNumber = $ResidueNumber; | |
972 } | |
973 } | |
974 if (!$GetChainResiduesBeyondTERFlag) { | |
975 return \%ChainsDataMap; | |
976 } | |
977 # Look for any HETATM residues specified outside TER records which could belong to an existing chain... | |
978 my($LineIndex, $PreviousChainID); | |
979 $PreviousChainID = ''; | |
980 $PreviousResidueNumber = 0; | |
981 LINE: for $LineIndex (($LineCount - 1) .. $#{$PDBRecordLinesRef}) { | |
982 $RecordLine = $PDBRecordLinesRef->[$LineIndex]; | |
983 if (IsEndmdlRecordType($RecordLine)) { | |
984 last LINE; | |
985 } | |
986 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { | |
987 next LINE; | |
988 } | |
989 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); | |
990 if (IsEmpty($ChainID)) { | |
991 # Ignore the chains with no ids... | |
992 next LINE; | |
993 } | |
994 if (! exists($ChainsDataMap{Residues}{$ChainID})) { | |
995 # Don't collect any new chains after TER record... | |
996 next LINE; | |
997 } | |
998 if ($GetRecordLinesFlag) { | |
999 push @{$ChainsDataMap{Lines}{$ChainID}}, $RecordLine; | |
1000 } | |
1001 if ($ResidueNumber != $PreviousResidueNumber || $ChainID ne $PreviousChainID) { | |
1002 | |
1003 push @{$ChainsDataMap{Residues}{$ChainID}}, $ResidueName; | |
1004 push @{$ChainsDataMap{ResidueNumbers}{$ChainID}}, $ResidueNumber; | |
1005 | |
1006 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { | |
1007 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; | |
1008 } | |
1009 else { | |
1010 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; | |
1011 } | |
1012 $PreviousChainID = $ChainID; | |
1013 $PreviousResidueNumber = $ResidueNumber; | |
1014 } | |
1015 } | |
1016 return \%ChainsDataMap; | |
1017 } | |
1018 | |
1019 # Get chains and residues data using SEQRES records... | |
1020 # | |
1021 sub _GetChainsAndResiduesFromSeqresRecords { | |
1022 my($PDBRecordLinesRef) = @_; | |
1023 | |
1024 my($ChainCount, $DefaultChainLabel, $DefaultChainID, $RecordLine, $RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueName, $ResidueNamesString, @ResidueNamesList, %ChainsDataMap); | |
1025 | |
1026 %ChainsDataMap = (); | |
1027 @{$ChainsDataMap{ChainIDs}} = (); | |
1028 %{$ChainsDataMap{Residues}} = (); | |
1029 %{$ChainsDataMap{ResidueNumbers}} = (); | |
1030 %{$ChainsDataMap{ResidueCount}} = (); | |
1031 | |
1032 $ChainCount = 0; | |
1033 $DefaultChainLabel = 'None'; | |
1034 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); | |
1035 | |
1036 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { | |
1037 if (!IsSeqresRecordType($RecordLine)) { | |
1038 next LINE; | |
1039 } | |
1040 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNamesString) = ParseSeqresRecordLine($RecordLine); | |
1041 if ($RecordSerialNumber == 1) { | |
1042 # Indicates start of a new chain... | |
1043 $DefaultChainID = $DefaultChainLabel . ($ChainCount + 1); | |
1044 $ChainCount++; | |
1045 } | |
1046 if (IsEmpty($ChainID)) { | |
1047 $ChainID = $DefaultChainID; | |
1048 } | |
1049 # Process the residues... | |
1050 @ResidueNamesList = split /[ ]+/, $ResidueNamesString; | |
1051 | |
1052 if (exists $ChainsDataMap{Residues}{$ChainID}) { | |
1053 # Data for existing chain... | |
1054 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; | |
1055 } | |
1056 else { | |
1057 # Data for new chain... | |
1058 push @{$ChainsDataMap{ChainIDs}}, $ChainID; | |
1059 @{$ChainsDataMap{Residues}{$ChainID}} = (); | |
1060 push @{$ChainsDataMap{Residues}{$ChainID}}, @ResidueNamesList; | |
1061 } | |
1062 | |
1063 # Setup residue count... | |
1064 for $ResidueName (@ResidueNamesList) { | |
1065 if (exists $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName}) { | |
1066 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} += 1; | |
1067 } | |
1068 else { | |
1069 $ChainsDataMap{ResidueCount}{$ChainID}{$ResidueName} = 1; | |
1070 } | |
1071 } | |
1072 } | |
1073 return \%ChainsDataMap; | |
1074 } | |
1075 | |
1076 1; | |
1077 | |
1078 __END__ | |
1079 | |
1080 =head1 NAME | |
1081 | |
1082 PDBFileUtil | |
1083 | |
1084 =head1 SYNOPSIS | |
1085 | |
1086 use PDBFileUtil ; | |
1087 | |
1088 use PDBFileUtil qw(:all); | |
1089 | |
1090 =head1 DESCRIPTION | |
1091 | |
1092 B<PDBFileUtil> module provides the following functions: | |
1093 | |
1094 GenerateAtomOrHetatmRecordLine, GenerateAtomRecordLine, GenerateConectRecordLine, | |
1095 GenerateEndRecordLine, GenerateHeaderRecordLine, GenerateHeaderRecordTimeStamp, | |
1096 GenerateHetatmRecordLine, GenerateTerRecordLine, GetAllResidues, | |
1097 GetChainsAndResidues, GetConectRecordLines, GetExperimentalTechnique, | |
1098 GetExperimentalTechniqueResolution, GetMinMaxCoords, GetPDBRecordType, | |
1099 GetRecordTypesCount, IsAtomRecordType, IsConectRecordType, IsEndmdlRecordType, | |
1100 IsHeaderRecordType, IsHetatmRecordType, IsMasterRecordType, IsModelRecordType, | |
1101 IsPDBFile, IsSeqresRecordType, IsTerRecordType, ParseAtomOrHetatmRecordLine, | |
1102 ParseAtomRecordLine, ParseConectRecordLine, ParseExpdtaRecordLine, | |
1103 ParseHeaderRecordLine, ParseHetatmRecordLine, ParseMasterRecordLine, | |
1104 ParseRemark2ResolutionRecordLine, ParseSeqresRecordLine, ParseTerRecordLine, | |
1105 ReadPDBFile | |
1106 | |
1107 =head1 METHODS | |
1108 | |
1109 =over 4 | |
1110 | |
1111 =item B<GenerateAtomOrHetatmRecordLine> | |
1112 | |
1113 $RecordLine = GenerateAtomOrHetatmRecordLine($RecordType, | |
1114 $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, | |
1115 $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, | |
1116 $Occupancy, $TemperatureFactor, $SegmentID, | |
1117 $ElementSymbol, $AtomCharge); | |
1118 | |
1119 Returns ATOM or HETATM record line. | |
1120 | |
1121 =item B<GenerateAtomRecordLine> | |
1122 | |
1123 $RecordLine = GenerateAtomRecordLine($AtomNumber, | |
1124 $AtomName, $AlternateLocation, $ResidueName, $ChainID, | |
1125 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, | |
1126 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge); | |
1127 | |
1128 Returns ATOM record line. | |
1129 | |
1130 =item B<GenerateConectRecordLine> | |
1131 | |
1132 $RecordLine = GenerateConectRecordLine($AtomNum, $BondedAtomNum1, | |
1133 $BondedAtomNum2, $BondedAtomNum3, $BondedAtomNum4, | |
1134 $HBondedAtomNum1, $HBondedAtomNum2, $SaltBridgedAtomNum1, | |
1135 $HBondedAtomNum3, $HBondedAtomNum4, $SaltBridgedAtomNum2); | |
1136 | |
1137 Returns CONECT record line. | |
1138 | |
1139 =item B<GenerateHeaderRecordLine> | |
1140 | |
1141 $RecordLine = GenerateHeaderRecordLine($IDCode, [$Classification, | |
1142 $Date]); | |
1143 | |
1144 Returns HEADER record line. | |
1145 | |
1146 =item B<GenerateHeaderRecordTimeStamp> | |
1147 | |
1148 $Date = GenerateHeaderRecordTimeStamp(); | |
1149 | |
1150 Returns PDB header time stamp. | |
1151 | |
1152 =item B<GenerateHetatmRecordLine> | |
1153 | |
1154 $RecordLine = GenerateHetatmRecordLine($AtomNumber, $AtomName, | |
1155 $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, | |
1156 $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, | |
1157 $SegmentID, $ElementSymbol, $AtomCharge); | |
1158 | |
1159 Returns HETATM record line. | |
1160 | |
1161 =item B<GenerateEndRecordLine> | |
1162 | |
1163 $RecordLine = GenerateEndRecordLine(); | |
1164 | |
1165 Returns END record line. | |
1166 | |
1167 =item B<GenerateTerRecordLine> | |
1168 | |
1169 $RecordLine = GenerateTerRecordLine($SerialNumber, [$ResidueName, | |
1170 $ChainID, $ResidueNumber, $InsertionCode]); | |
1171 | |
1172 Returns TER record line. | |
1173 | |
1174 =item B<GetAllResidues> | |
1175 | |
1176 $ResiduesDataRef = GetAllResidues($PDBRecordLinesRef); | |
1177 | |
1178 Gets residue information using ATOM/HETATM records and returns a reference to a hash with | |
1179 following key/value pairs: | |
1180 | |
1181 $ResiduesDataRef->{ResidueNames} - Array of all the residues | |
1182 $ResiduesDataRef->{ResidueCount}{$ResidueName} - Count of residues | |
1183 $ResiduesDataRef->{AtomResidueNames}} - Array of all ATOM residues | |
1184 $ResiduesDataRef->{AtomResidueCount}{$ResidueName} - Count of | |
1185 residues in ATOM records | |
1186 $ResiduesDataRef->{HetatomResidueNames} - List of all HETATM | |
1187 residues | |
1188 $ResiduesDataRef->{HetatmResidueCount}{$ResidueName} - Count of | |
1189 residues HETATM records | |
1190 | |
1191 ATOM/HETATM records after the first ENDMDL records are simply ingnored. | |
1192 | |
1193 =item B<GetChainsAndResidues> | |
1194 | |
1195 $ChainsDataRef = GetChainsAndResidues($PDBRecordLinesRef, | |
1196 [$RecordsSource, $GetChainResiduesBeyondTERFlag, | |
1197 $GetRecordLinesFlag]); | |
1198 | |
1199 Gets chains and residue information using ATOM/HETATM or SEQRES records and returns a reference to a | |
1200 hash with these keys: | |
1201 | |
1202 $ChainsDataRef->{ChainIDs} - List of chain IDs with 'None' for | |
1203 no IDs | |
1204 $ChainsDataRef->{Residues}{$ChainID} - List of residues in order | |
1205 of their appearance in a chain | |
1206 $ChainsDataRef->{ResidueCount}{$ChainID}{$ResidueName} - Count of | |
1207 residues in a chain | |
1208 | |
1209 Chains and residue data can be extacted using either ATOM/HETATM records or SEQRES records. | |
1210 ATOM/HETATM records after the first ENDMDL records are simply ingnored. | |
1211 | |
1212 =item B<GetConectRecordLines> | |
1213 | |
1214 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, | |
1215 $AtomNumbersMapRef); | |
1216 | |
1217 Collects CONECT record lines for specific atom number, modified specified data to exclude any atom | |
1218 number not present in the list of specified atom numbers and returns a reference to list of | |
1219 CONECT record lines. | |
1220 | |
1221 =item B<GetExperimentalTechnique> | |
1222 | |
1223 $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef); | |
1224 | |
1225 Returns I<ExperimentalTechnique> value retrieved from EXPDATA record line. | |
1226 | |
1227 =item B<GetExperimentalTechniqueResolution> | |
1228 | |
1229 ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution( | |
1230 $PDBRecordLinesRef); | |
1231 | |
1232 Returns I<Resolution> and I<ResolutionUnits> values from REMARK 2 RESOLUTION | |
1233 record line. | |
1234 | |
1235 =item B<GetMinMaxCoords> | |
1236 | |
1237 ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = | |
1238 GetMinMaxCoords($PDBRecordLinesRef); | |
1239 | |
1240 Returns minimum and maximum XYZ coordinates for ATOM/HETATM records. | |
1241 | |
1242 =item B<GetPDBRecordType> | |
1243 | |
1244 $RecordType = GetPDBRecordType($RecordLine); | |
1245 | |
1246 Returns type of I<RecordLine>. | |
1247 | |
1248 =item B<GetRecordTypesCount> | |
1249 | |
1250 $RecordTypeDataRef = GetRecordTypesCount($PDBRecordLinesRef, | |
1251 [$SpecifiedRecordType, $GetRecordLinesFlag]); | |
1252 | |
1253 Counts the number of each record type or a $SpecifiedRecordType and returns a reference to data | |
1254 type with following key/value pairs: | |
1255 | |
1256 $RecordTypeDataRef->{RecordTypes} - An array of unique record types | |
1257 in order of their presence in the file | |
1258 $RecordTypeDataRef->{Count}{$RecordType} - Count of each record type | |
1259 $RecordTypeDataRef->{Lines}{$RecordType} - Optional lines data for a | |
1260 specific record type. | |
1261 | |
1262 =item B<IsAtomRecordType> | |
1263 | |
1264 $Status = IsAtomRecordType($RecordLine); | |
1265 | |
1266 Returns 1 or 0 based on whether it's a ATOM record line. | |
1267 | |
1268 =item B<IsConectRecordType> | |
1269 | |
1270 $Status = IsAtomConectType($RecordLine); | |
1271 | |
1272 Returns 1 or 0 based on whether it's a CONECT record line. | |
1273 | |
1274 =item B<IsEndmdlRecordType> | |
1275 | |
1276 $Status = IsEndmdlRecordType($RecordLine); | |
1277 | |
1278 Returns 1 or 0 based on whether it's a ENDMDL a record line. | |
1279 | |
1280 =item B<IsHeaderRecordType> | |
1281 | |
1282 $Status = IsHeaderRecordType($RecordLine); | |
1283 | |
1284 Returns 1 or 0 based on whether it's a HEADER a record line. | |
1285 | |
1286 =item B<IsHetatmRecordType> | |
1287 | |
1288 $Status = IsHetatmRecordType($RecordLine); | |
1289 | |
1290 Returns 1 or 0 based on whether it's a HETATM a record line. | |
1291 | |
1292 =item B<IsMasterRecordType> | |
1293 | |
1294 $Status = IsMasterRecordType($RecordLine); | |
1295 | |
1296 Returns 1 or 0 based on whether it's a MASTER a record line. | |
1297 | |
1298 =item B<IsModelRecordType> | |
1299 | |
1300 $Status = IsModelRecordType($RecordLine); | |
1301 | |
1302 Returns 1 or 0 based on whether it's a MODEL record line. | |
1303 | |
1304 =item B<IsPDBFile> | |
1305 | |
1306 $Status = IsPDBFile($PDBFile); | |
1307 | |
1308 Returns 1 or 0 based on whether it's a PDB file. | |
1309 | |
1310 =item B<IsSeqresRecordType> | |
1311 | |
1312 $Status = IsSeqresRecordType($RecordLine); | |
1313 | |
1314 Returns 1 or 0 based on whether it's SEQRES a record line. | |
1315 | |
1316 =item B<IsTerRecordType> | |
1317 | |
1318 $Status = IsTerRecordType($RecordLine); | |
1319 | |
1320 Returns 1 or 0 based on whether it's a TER record line. | |
1321 | |
1322 =item B<ParseAtomOrHetatmRecordLine> | |
1323 | |
1324 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, | |
1325 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, | |
1326 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = | |
1327 ParseAtomOrHetatmRecordLine($RecordLine); | |
1328 | |
1329 Parses ATOM or HETATM record line. | |
1330 | |
1331 =item B<ParseAtomRecordLine> | |
1332 | |
1333 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, | |
1334 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, | |
1335 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = | |
1336 ParseAtomRecordLine($RecordLine); | |
1337 | |
1338 Parses ATOM record line. | |
1339 | |
1340 =item B<ParseConectRecordLine> | |
1341 | |
1342 ($AtomNum, $BondedAtomNum1, $BondedAtomNum2, $BondedAtomNum3, | |
1343 $BondedAtomNum4, $HBondedAtomNum1, $HBondedAtomNum2, | |
1344 $SaltBridgedAtomNum1, $HBondedAtomNum3, $HBondedAtomNum4, | |
1345 $SaltBridgedAtomNum2) = ParseConectRecordLine($RecordLine); | |
1346 | |
1347 Parses CONECT record line. | |
1348 | |
1349 =item B<ParseExpdtaRecordLine> | |
1350 | |
1351 ($ContinuationNum, $ExperimentalTechnique) = ParseExpdtaRecordLine($Line); | |
1352 | |
1353 Parses EXPDTA record line. | |
1354 | |
1355 =item B<ParseHeaderRecordLine> | |
1356 | |
1357 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($RecordLine); | |
1358 | |
1359 Parses HEADER record line | |
1360 | |
1361 =item B<ParseHetatmRecordLine> | |
1362 | |
1363 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, | |
1364 $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, | |
1365 $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = | |
1366 ParseHetatmRecordLine($RecordLine); | |
1367 | |
1368 Parses HETATM record line. | |
1369 | |
1370 =item B<ParseMasterRecordLine> | |
1371 | |
1372 ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, | |
1373 $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, | |
1374 $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, | |
1375 $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = | |
1376 ParseMasterRecordLine($RecordLine); | |
1377 | |
1378 Parses MASTER ecord line. | |
1379 | |
1380 =item B<ParseRemark2ResolutionRecordLine> | |
1381 | |
1382 ($Resolution, $ResolutionUnits) = ParseRemark2ResolutionRecordLine( | |
1383 $RecordLine); | |
1384 | |
1385 Parses REMARK 2 RESOLUTION record line. | |
1386 | |
1387 =item B<ParseSeqresRecordLine> | |
1388 | |
1389 ($RecordSerialNumber, $ChainID, $NumOfResidues, $ResidueNames) = | |
1390 ParseSeqresRecordLine($RecordLine); | |
1391 | |
1392 Parses SEQRES record line. | |
1393 | |
1394 =item B<ParseTerRecordLine> | |
1395 | |
1396 ($SerialNumber, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode) = | |
1397 ParseTerRecordLine($RecordLine); | |
1398 | |
1399 Parses TER record line. | |
1400 | |
1401 =item B<ReadPDBFile> | |
1402 | |
1403 $PDBRecordLinesRef = ReadPDBFile($PDBFile); | |
1404 | |
1405 Reads PDB file and returns reference to record lines. | |
1406 | |
1407 =back | |
1408 | |
1409 =head1 AUTHOR | |
1410 | |
1411 Manish Sud <msud@san.rr.com> | |
1412 | |
1413 =head1 SEE ALSO | |
1414 | |
1415 FileUtil.pm, SequenceFileUtil.pm, TextUtil.pm | |
1416 | |
1417 =head1 COPYRIGHT | |
1418 | |
1419 Copyright (C) 2015 Manish Sud. All rights reserved. | |
1420 | |
1421 This file is part of MayaChemTools. | |
1422 | |
1423 MayaChemTools is free software; you can redistribute it and/or modify it under | |
1424 the terms of the GNU Lesser General Public License as published by the Free | |
1425 Software Foundation; either version 3 of the License, or (at your option) | |
1426 any later version. | |
1427 | |
1428 =cut |