1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: InfoPDBFiles.pl,v $ 4 # $Date: 2015/02/28 20:46:20 $ 5 # $Revision: 1.36 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use TextUtil; 37 use PDBFileUtil; 38 39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 40 41 # Autoflush STDOUT 42 $| = 1; 43 44 # Starting message... 45 $ScriptName = basename($0); 46 print "\n$ScriptName: Starting...\n\n"; 47 $StartTime = new Benchmark; 48 49 # Get the options and setup script... 50 SetupScriptUsage(); 51 if ($Options{help} || @ARGV < 1) { 52 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 53 } 54 55 my(@PDBFilesList); 56 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 57 58 # Process options... 59 print "Processing options...\n"; 60 my(%OptionsInfo); 61 ProcessOptions(); 62 63 # Setup information about input files... 64 my(%PDBFilesInfo); 65 print "Checking input PDB file(s)...\n"; 66 RetrievePDBFilesInfo(); 67 68 # Process input files.. 69 my($FileIndex); 70 if (@PDBFilesList > 1) { 71 print "\nProcessing PDB files...\n"; 72 } 73 for $FileIndex (0 .. $#PDBFilesList) { 74 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 75 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 76 ListPDBFileInfo($FileIndex); 77 } 78 } 79 ListTotalSizeOfFiles(); 80 81 print "\n$ScriptName:Done...\n\n"; 82 83 $EndTime = new Benchmark; 84 $TotalTime = timediff ($EndTime, $StartTime); 85 print "Total time: ", timestr($TotalTime), "\n"; 86 87 ############################################################################### 88 89 # List appropriate information... 90 sub ListPDBFileInfo { 91 my($Index) = @_; 92 my($PDBFile, $PDBRecordLinesRef); 93 94 $PDBFile = $PDBFilesList[$Index]; 95 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 96 97 # Header informaton... 98 if ($OptionsInfo{ListHeaderInfo}) { 99 ListHeaderInfo($PDBRecordLinesRef); 100 } 101 102 # Experiment informaton... 103 if ($OptionsInfo{ListExperimentalTechniqueInfo}) { 104 ListExperimentalTechniqueInfo($PDBRecordLinesRef); 105 } 106 107 # Total number of records... 108 my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef}; 109 print "\nTotal number of records: $TotalRecordsCount\n"; 110 111 # List record type count information... 112 ListRecordTypesInfo($PDBRecordLinesRef); 113 114 if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) { 115 ListChainsAndResiduesInfo($PDBRecordLinesRef); 116 } 117 if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) { 118 ListAllResiduesInfo($PDBRecordLinesRef); 119 } 120 if ($OptionsInfo{ResidueNumbersInfo}) { 121 ListResidueNumbersInfo($PDBRecordLinesRef); 122 } 123 if ($OptionsInfo{CalculateBoundingBox}) { 124 ListBoundingBox($PDBRecordLinesRef); 125 } 126 127 # File size and modification information... 128 print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n"; 129 print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n"; 130 } 131 132 sub ListHeaderInfo { 133 my($PDBRecordLinesRef) = @_; 134 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode); 135 136 ($Classification, $DepositionDate, $IDCode) = (undef) x 3; 137 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 138 if (IsHeaderRecordType($HeaderRecordLine)) { 139 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 140 } 141 142 $Classification = IsEmpty($Classification) ? 'Not available' : $Classification; 143 $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate; 144 $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode; 145 146 print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n"; 147 } 148 149 # List experimental technique information info... 150 sub ListExperimentalTechniqueInfo { 151 my($PDBRecordLinesRef) = @_; 152 my($ExperimentalTechnique, $Resolution, $ResolutionUnits); 153 154 $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef); 155 print "\nExperimental technique: " . ($ExperimentalTechnique ? $ExperimentalTechnique : "Not available") . "\n"; 156 157 ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution($PDBRecordLinesRef); 158 print "Resolution: " . ($Resolution ? "$Resolution $ResolutionUnits" : "Not available") . "\n"; 159 160 } 161 162 # List record type info... 163 sub ListRecordTypesInfo { 164 my($PDBRecordLinesRef) = @_; 165 my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo); 166 167 $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef); 168 169 @RecordTypeCountInfo = (); 170 if ($OptionsInfo{CountRecordType} =~ /^All$/i) { 171 for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) { 172 $RecordCount = $RecordTypesCountRef->{Count}{$RecordType}; 173 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 174 } 175 } 176 else { 177 for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) { 178 $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0; 179 push @RecordTypeCountInfo, "$RecordType - $RecordCount"; 180 } 181 } 182 print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n"; 183 184 if ($OptionsInfo{CheckMasterRecord}) { 185 CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef); 186 } 187 } 188 189 # List information about residues and chains... 190 sub ListChainsAndResiduesInfo { 191 my($PDBRecordLinesRef) = @_; 192 my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef); 193 194 $CollectChainResiduesBeyondTER = 1; 195 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 196 $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 197 if ($OptionsInfo{CountChains}) { 198 print "\nNumber of chains: $ChainCount \n"; 199 my($ChainID, @ChainIDsList); 200 @ChainIDsList = (); 201 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 202 push @ChainIDsList, CleanupChainID($ChainID); 203 } 204 print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n"; 205 } 206 207 if ($OptionsInfo{CountResiduesInChains}) { 208 my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo); 209 @ResiduesCountInfo = (); 210 $TotalResiduesCount = 0; 211 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 212 $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 213 $TotalResiduesCount += $ResidueCount; 214 $ResidueCountInfo = "Chain ${ChainID} - ${ResidueCount}"; 215 push @ResiduesCountInfo, $ResidueCountInfo; 216 } 217 print "\nNumber of residues in chain(s): "; 218 if ($ChainCount > 1) { 219 print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n"; 220 } 221 else { 222 print "$TotalResiduesCount\n"; 223 } 224 225 # List of residues in each chain... 226 if ($OptionsInfo{DetailLevel} >= 3) { 227 print "List of residues in chain(s): \n"; 228 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 229 if ($ChainCount > 1) { 230 print "Chain ", CleanupChainID($ChainID), ": "; 231 } 232 print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n"; 233 } 234 } 235 } 236 if ($OptionsInfo{ResiduesFrequencyInChains}) { 237 # Setup a hash using residue count as key for sorting the values... 238 my(%ResiduesCountToNameMap); 239 %ResiduesCountToNameMap = (); 240 @{$ResiduesCountToNameMap{ChainIDs}} = (); 241 %{$ResiduesCountToNameMap{ResidueNames}} = (); 242 243 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 244 push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID; 245 %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = (); 246 247 for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 248 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 249 # Setup count value for each chain... 250 if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) { 251 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}"; 252 } 253 else { 254 $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName; 255 } 256 } 257 } 258 # Collect data for all the residues in all the chains... 259 my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap); 260 %AllResiduesNameToCountMap = (); 261 %AllResiduesCountToNameMap = (); 262 if ($ChainCount > 1) { 263 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 264 for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) { 265 $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName}; 266 if (exists $AllResiduesNameToCountMap{$ResidueName}) { 267 $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount; 268 } 269 else { 270 $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount; 271 } 272 } 273 } 274 for $ResidueName (keys %AllResiduesNameToCountMap) { 275 $ResidueCount = $AllResiduesNameToCountMap{$ResidueName}; 276 if (exists $AllResiduesCountToNameMap{$ResidueCount}) { 277 $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}"; 278 } 279 else { 280 $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName; 281 } 282 } 283 } 284 285 # Setup distribution data for individual chains and the grand total as well... 286 my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap); 287 @{$ResiduesFrequencyInfoMap{ChainIDs}} = (); 288 %{$ResiduesFrequencyInfoMap{Frequency}} = (); 289 %{$ResiduesFrequencyInfoMap{PercentFrequency}} = (); 290 291 @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = (); 292 @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = (); 293 294 $TotalResidueCount = 0; 295 296 for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) { 297 push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID; 298 @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = (); 299 @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = (); 300 301 $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 302 $TotalResidueCount += $ChainResidueCount; 303 304 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) { 305 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}; 306 @ResidueNamesList = split /~/, $ResidueNames; 307 for $ResidueName (@ResidueNamesList) { 308 push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}"; 309 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0; 310 push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%"; 311 } 312 } 313 } 314 if ($ChainCount > 1) { 315 for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) { 316 $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount}; 317 @ResidueNamesList = split /~/, $ResidueNames; 318 for $ResidueName (@ResidueNamesList) { 319 push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}"; 320 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 321 push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%"; 322 } 323 } 324 } 325 326 # List distribution of residues 327 print "\nDistribution of residues in chain(s): \n"; 328 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 329 if ($ChainCount > 1) { 330 print "Chain ", CleanupChainID($ChainID), ": "; 331 } 332 print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n"; 333 } 334 if ($OptionsInfo{DetailLevel} >= 2) { 335 print "\nPercent distribution of residues in chain(s): \n"; 336 for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) { 337 if ($ChainCount > 1) { 338 print "Chain ", CleanupChainID($ChainID), ": "; 339 } 340 print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n"; 341 } 342 } 343 if ($ChainCount > 1) { 344 print "\nDistribution of residues across all chains: \n"; 345 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n"; 346 347 if ($OptionsInfo{DetailLevel} >= 2) { 348 print "\nPercent distribution of residues across all chains: \n"; 349 print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n"; 350 } 351 } 352 } 353 } 354 355 # List information about all the residues... 356 sub ListAllResiduesInfo { 357 my($PDBRecordLinesRef) = @_; 358 my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef); 359 360 $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef); 361 $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}}; 362 $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}}; 363 $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}}; 364 365 if ($OptionsInfo{CountResiduesAll}) { 366 print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n"; 367 368 if ($OptionsInfo{DetailLevel} >= 3) { 369 print "List of residues: \n"; 370 if ($AtomResiduesCount) { 371 print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n"; 372 } 373 if ($HetatmResiduesCount) { 374 print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n"; 375 } 376 } 377 } 378 379 if ($OptionsInfo{ResiduesFrequencyAll}) { 380 my($ResidueName, $ResidueCount); 381 382 # Setup a hash using residue count as key for sorting the values... 383 my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap); 384 %ResiduesCountToNameMap = (); 385 %{$ResiduesCountToNameMap{ResidueNames}} = (); 386 387 %AtomResiduesCountToNameMap = (); 388 %{$AtomResiduesCountToNameMap{ResidueNames}} = (); 389 390 %HetatmResiduesCountToNameMap = (); 391 %{$HetatmResiduesCountToNameMap{ResidueNames}} = (); 392 393 for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) { 394 $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName}; 395 if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 396 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 397 } 398 else { 399 $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 400 } 401 } 402 403 if ($OptionsInfo{DetailLevel} >= 1) { 404 for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) { 405 $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName}; 406 if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 407 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 408 } 409 else { 410 $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 411 } 412 } 413 for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) { 414 $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName}; 415 if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) { 416 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}"; 417 } 418 else { 419 $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName; 420 } 421 } 422 } 423 424 # Setup distribution of residues info... 425 my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap); 426 427 @{$ResiduesCountInfoMap{Frequency}} = (); 428 @{$ResiduesCountInfoMap{PercentFrequency}} = (); 429 for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) { 430 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 431 $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 432 @ResidueNamesList = split /~/, $ResidueNames; 433 for $ResidueName (@ResidueNamesList) { 434 push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 435 push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 436 } 437 } 438 if ($OptionsInfo{DetailLevel} >= 1) { 439 @{$AtomResiduesCountInfoMap{Frequency}} = (); 440 @{$AtomResiduesCountInfoMap{PercentFrequency}} = (); 441 for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) { 442 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 443 $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 444 @ResidueNamesList = split /~/, $ResidueNames; 445 for $ResidueName (@ResidueNamesList) { 446 push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 447 push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 448 } 449 } 450 @{$HetatmResiduesCountInfoMap{Frequency}} = (); 451 @{$HetatmResiduesCountInfoMap{PercentFrequency}} = (); 452 for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) { 453 $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0; 454 $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}; 455 @ResidueNamesList = split /~/, $ResidueNames; 456 for $ResidueName (@ResidueNamesList) { 457 push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}"; 458 push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}"; 459 } 460 } 461 } 462 463 # List distribution of residues 464 print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 465 if ($OptionsInfo{DetailLevel} >= 2) { 466 print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 467 } 468 469 if ($OptionsInfo{DetailLevel} >= 1) { 470 print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 471 if ($OptionsInfo{DetailLevel} >= 2) { 472 print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 473 } 474 475 print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n"; 476 if ($OptionsInfo{DetailLevel} >= 2) { 477 print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n"; 478 } 479 } 480 } 481 } 482 483 # List information about residue numbers for each chain... 484 sub ListResidueNumbersInfo { 485 my($PDBRecordLinesRef) = @_; 486 my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs); 487 488 $CollectChainResiduesBeyondTER = 0; 489 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER); 490 491 print "\nATOM/HETATM residue numbers information for chains:\n"; 492 493 for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 494 print "\nChain ID - ", CleanupChainID($ChainID), ""; 495 496 $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}}; 497 498 # Start and end residue numbers... 499 $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0]; 500 $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1]; 501 print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n"; 502 503 # Identify any gaps in residue numbers or non-ascending order residue numbers... 504 $GapResiduePairsCount = 0; 505 $DescendingOrderResiduePairsCount = 0; 506 507 @DescendingOrderResiduePairs = (); 508 @GapResiduePairs = (); 509 510 RESIDUE: for $Index (1 .. ($ResidueCount - 1)) { 511 $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index]; 512 $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1]; 513 514 $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index]; 515 $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1]; 516 517 if ($ResidueNum == ($PreviousResidueNum + 1)) { 518 # All is good... 519 next RESIDUE; 520 } 521 522 # Are residue in descending order? 523 if ($ResidueNum < $PreviousResidueNum) { 524 $DescendingOrderResiduePairsCount++; 525 push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>"; 526 } 527 528 # Track gaps in residue pairs... 529 $GapResiduePairsCount++; 530 $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1; 531 532 push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>"; 533 } 534 535 # Print gaps information... 536 print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None"; 537 if ($GapResiduePairsCount) { 538 print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0); 539 } 540 print "\n"; 541 542 # Print descending residue order information... 543 print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None"; 544 if ($DescendingOrderResiduePairsCount) { 545 print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0); 546 } 547 print "\n"; 548 } 549 } 550 551 # List min/max XYZ coordinates for ATOM/HETATM records... 552 sub ListBoundingBox { 553 my($PDBRecordLinesRef) = @_; 554 my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize); 555 556 ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef); 557 $XSize = abs($XMax - $XMin); 558 $YSize = abs($YMax - $YMin); 559 $ZSize = abs($ZMax - $ZMin); 560 561 $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0; 562 $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0; 563 $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0; 564 565 $XSize = sprintf("%.3f", $XSize) + 0; 566 $YSize = sprintf("%.3f", $YSize) + 0; 567 $ZSize = sprintf("%.3f", $ZSize) + 0; 568 569 print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n"; 570 print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n"; 571 572 } 573 574 # Check master record counts against actual record counts... 575 sub CheckMasterRecord { 576 my($RecordTypesCountRef, $PDBRecordLinesRef) = @_; 577 578 # Get master record information... 579 my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11; 580 my($RecordLine, $MasterRecordFound); 581 $MasterRecordFound = 0; 582 583 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 584 if (IsMasterRecordType($RecordLine)) { 585 ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine); 586 $MasterRecordFound = 1; 587 last LINE; 588 } 589 } 590 if (!$MasterRecordFound) { 591 print "\nWarning: MASTER record is missing.\n"; 592 return; 593 } 594 my(@MasterRecordValidationInfo); 595 @MasterRecordValidationInfo = (); 596 $NumOfRemarkRecords += 0; 597 if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) { 598 push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 599 } 600 $NumOfHetRecords += 0; 601 if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) { 602 push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}."; 603 } 604 $NumOfHelixRecords += 0; 605 if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) { 606 push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}."; 607 } 608 $NumOfSheetRecords += 0; 609 if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) { 610 push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}."; 611 } 612 $NumOfTurnRecords += 0; 613 if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) { 614 push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}."; 615 } 616 $NumOfSiteRecords += 0; 617 if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) { 618 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}."; 619 } 620 621 $NumOfTransformationsRecords += 0; 622 my($RecordsCount, $ID, $RecordID, $RecordLabel); 623 $RecordsCount = 0; 624 for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') { 625 for $ID (1 .. 3) { 626 $RecordID = "${RecordLabel}${ID}"; 627 if (exists $RecordTypesCountRef->{Count}{$RecordID}) { 628 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID}; 629 } 630 } 631 } 632 if ($NumOfTransformationsRecords != $RecordsCount) { 633 push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 634 } 635 636 $RecordsCount = 0; 637 for $RecordLabel ('ATOM', 'HETATM') { 638 if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) { 639 $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel}; 640 } 641 } 642 $NumOfAtomAndHetatmRecords += 0; 643 if ($NumOfAtomAndHetatmRecords != $RecordsCount) { 644 push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount."; 645 } 646 $NumOfTerRecords += 0; 647 if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) { 648 push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}."; 649 } 650 $NumOfConectRecords += 0; 651 if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) { 652 push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}."; 653 } 654 $NumOfSeqresRecords += 0; 655 if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) { 656 push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}."; 657 } 658 659 if (@MasterRecordValidationInfo) { 660 print "\nMASTER record validation: Count mismatches found:\n"; 661 print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n"; 662 } 663 else { 664 print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n"; 665 } 666 } 667 668 # Total size of all the files... 669 sub ListTotalSizeOfFiles { 670 my($FileOkayCount, $TotalSize, $Index); 671 672 $FileOkayCount = 0; 673 $TotalSize = 0; 674 675 for $Index (0 .. $#PDBFilesList) { 676 if ($PDBFilesInfo{FileOkay}[$Index]) { 677 $FileOkayCount++; 678 $TotalSize += $PDBFilesInfo{FileSize}[$Index]; 679 } 680 } 681 if ($FileOkayCount > 1) { 682 print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n"; 683 } 684 685 } 686 687 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any 688 # numbers from label... 689 sub CleanupChainID { 690 my($ChainID) = @_; 691 692 if ($ChainID =~ /^None/i) { 693 return 'None'; 694 } 695 return $ChainID; 696 } 697 698 # Process option values... 699 sub ProcessOptions { 700 %OptionsInfo = (); 701 702 # Setup record types to count... 703 if ($Options{count}) { 704 $OptionsInfo{CountRecordType} = $Options{count}; 705 } 706 else { 707 $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM'; 708 } 709 @{$OptionsInfo{SpecifiedRecordTypes}} =(); 710 if ($OptionsInfo{CountRecordType} !~ /^All$/i) { 711 my(@RecordTypes); 712 @RecordTypes = split /\,/, $OptionsInfo{CountRecordType}; 713 push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes; 714 } 715 $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0; 716 $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0; 717 718 # Residue count is the default. So $Options{residues} is simply ignored. 719 my($CountResidues) = 1; 720 $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 721 $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 722 723 $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0; 724 $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0; 725 726 $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all}) ? 1 : 0; 727 728 $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0; 729 730 $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0; 731 $OptionsInfo{DetailLevel} = $Options{detail}; 732 733 $OptionsInfo{ListExperimentalTechniqueInfo} = ($Options{experiment} || $Options{all}) ? 1 : 0; 734 735 } 736 737 # Retrieve information about PDB files... 738 sub RetrievePDBFilesInfo { 739 my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString); 740 741 %PDBFilesInfo = (); 742 @{$PDBFilesInfo{FileOkay}} = (); 743 @{$PDBFilesInfo{FileSize}} = (); 744 @{$PDBFilesInfo{FileLastModified}} = (); 745 746 FILELIST: for $Index (0 .. $#PDBFilesList) { 747 $PDBFilesInfo{FileOkay}[$Index] = 0; 748 $PDBFilesInfo{FileSize}[$Index] = 0; 749 $PDBFilesInfo{FileLastModified}[$Index] = ''; 750 751 $PDBFile = $PDBFilesList[$Index]; 752 if (!(-e $PDBFile)) { 753 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 754 next FILELIST; 755 } 756 if (!CheckFileType($PDBFile, "pdb")) { 757 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 758 next FILELIST; 759 } 760 if (! open PDBFILE, "$PDBFile") { 761 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 762 next FILELIST; 763 } 764 close PDBFILE; 765 766 $PDBFilesInfo{FileOkay}[$Index] = 1; 767 $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile); 768 ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile); 769 $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString"; 770 } 771 } 772 773 774 # Setup script usage and retrieve command line arguments specified using various options... 775 sub SetupScriptUsage { 776 777 # Retrieve all the options... 778 %Options = (); 779 $Options{count} = ''; 780 $Options{detail} = 1; 781 $Options{residuesmode} = 'Both'; 782 783 if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "experiment|e", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) { 784 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 785 } 786 if ($Options{workingdir}) { 787 if (! -d $Options{workingdir}) { 788 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 789 } 790 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 791 } 792 if (!IsPositiveInteger($Options{detail})) { 793 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 794 } 795 if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) { 796 die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n"; 797 } 798 } 799