MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: InfoPDBFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:20 $
   5 # $Revision: 1.36 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use PDBFileUtil;
  38 
  39 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  40 
  41 # Autoflush STDOUT
  42 $| = 1;
  43 
  44 # Starting message...
  45 $ScriptName = basename($0);
  46 print "\n$ScriptName: Starting...\n\n";
  47 $StartTime = new Benchmark;
  48 
  49 # Get the options and setup script...
  50 SetupScriptUsage();
  51 if ($Options{help} || @ARGV < 1) {
  52   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  53 }
  54 
  55 my(@PDBFilesList);
  56 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
  57 
  58 # Process options...
  59 print "Processing options...\n";
  60 my(%OptionsInfo);
  61 ProcessOptions();
  62 
  63 # Setup information about input files...
  64 my(%PDBFilesInfo);
  65 print "Checking input PDB file(s)...\n";
  66 RetrievePDBFilesInfo();
  67 
  68 # Process input files..
  69 my($FileIndex);
  70 if (@PDBFilesList > 1) {
  71   print "\nProcessing PDB files...\n";
  72 }
  73 for $FileIndex (0 .. $#PDBFilesList) {
  74   if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
  75     print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
  76     ListPDBFileInfo($FileIndex);
  77   }
  78 }
  79 ListTotalSizeOfFiles();
  80 
  81 print "\n$ScriptName:Done...\n\n";
  82 
  83 $EndTime = new Benchmark;
  84 $TotalTime = timediff ($EndTime, $StartTime);
  85 print "Total time: ", timestr($TotalTime), "\n";
  86 
  87 ###############################################################################
  88 
  89 # List appropriate information...
  90 sub ListPDBFileInfo {
  91   my($Index) = @_;
  92   my($PDBFile, $PDBRecordLinesRef);
  93 
  94   $PDBFile = $PDBFilesList[$Index];
  95   $PDBRecordLinesRef = ReadPDBFile($PDBFile);
  96 
  97   # Header informaton...
  98   if ($OptionsInfo{ListHeaderInfo}) {
  99     ListHeaderInfo($PDBRecordLinesRef);
 100   }
 101 
 102   # Experiment informaton...
 103   if ($OptionsInfo{ListExperimentalTechniqueInfo}) {
 104     ListExperimentalTechniqueInfo($PDBRecordLinesRef);
 105   }
 106 
 107   # Total number of records...
 108   my($TotalRecordsCount) = scalar @{$PDBRecordLinesRef};
 109   print "\nTotal number of records: $TotalRecordsCount\n";
 110 
 111   # List record type count information...
 112   ListRecordTypesInfo($PDBRecordLinesRef);
 113 
 114   if ($OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains}) {
 115     ListChainsAndResiduesInfo($PDBRecordLinesRef);
 116   }
 117   if ($OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll}) {
 118     ListAllResiduesInfo($PDBRecordLinesRef);
 119   }
 120   if ($OptionsInfo{ResidueNumbersInfo}) {
 121     ListResidueNumbersInfo($PDBRecordLinesRef);
 122   }
 123   if ($OptionsInfo{CalculateBoundingBox}) {
 124     ListBoundingBox($PDBRecordLinesRef);
 125   }
 126 
 127   # File size and modification information...
 128   print "\nFile size: ", FormatFileSize($PDBFilesInfo{FileSize}[$Index]), " \n";
 129   print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n";
 130 }
 131 
 132 sub ListHeaderInfo {
 133   my($PDBRecordLinesRef) = @_;
 134   my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode);
 135 
 136   ($Classification, $DepositionDate, $IDCode) = (undef) x 3;
 137   $HeaderRecordLine = $PDBRecordLinesRef->[0];
 138   if (IsHeaderRecordType($HeaderRecordLine)) {
 139     ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
 140   }
 141 
 142   $Classification = IsEmpty($Classification) ? 'Not available' : $Classification;
 143   $DepositionDate = IsEmpty($DepositionDate) ? 'Not available' : $DepositionDate;
 144   $IDCode = IsEmpty($IDCode) ? 'Not available' : $IDCode;
 145 
 146   print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n";
 147 }
 148 
 149 # List experimental technique information info...
 150 sub ListExperimentalTechniqueInfo {
 151   my($PDBRecordLinesRef) = @_;
 152   my($ExperimentalTechnique, $Resolution, $ResolutionUnits);
 153 
 154   $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef);
 155   print "\nExperimental technique: " . ($ExperimentalTechnique ? $ExperimentalTechnique : "Not available") . "\n";
 156 
 157   ($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution($PDBRecordLinesRef);
 158   print "Resolution: " . ($Resolution ? "$Resolution $ResolutionUnits" : "Not available") . "\n";
 159 
 160 }
 161 
 162 # List record type info...
 163 sub ListRecordTypesInfo {
 164   my($PDBRecordLinesRef) = @_;
 165   my($RecordType, $RecordCount, $RecordTypesCountRef, @RecordTypeCountInfo);
 166 
 167   $RecordTypesCountRef = GetRecordTypesCount($PDBRecordLinesRef);
 168 
 169   @RecordTypeCountInfo = ();
 170   if ($OptionsInfo{CountRecordType} =~ /^All$/i) {
 171     for $RecordType (@{$RecordTypesCountRef->{RecordTypes}}) {
 172       $RecordCount = $RecordTypesCountRef->{Count}{$RecordType};
 173       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 174     }
 175   }
 176   else {
 177     for $RecordType (@{$OptionsInfo{SpecifiedRecordTypes}}) {
 178       $RecordCount = (exists $RecordTypesCountRef->{Count}{$RecordType}) ? ($RecordTypesCountRef->{Count}{$RecordType}) : 0;
 179       push @RecordTypeCountInfo, "$RecordType - $RecordCount";
 180     }
 181   }
 182   print "Number of individual records: ", JoinWords(\@RecordTypeCountInfo, '; ', 0), "\n";
 183 
 184   if ($OptionsInfo{CheckMasterRecord}) {
 185     CheckMasterRecord($RecordTypesCountRef, $PDBRecordLinesRef);
 186   }
 187 }
 188 
 189 # List information about residues and chains...
 190 sub ListChainsAndResiduesInfo {
 191   my($PDBRecordLinesRef) = @_;
 192   my($ResidueName, $ResidueCount, $ChainCount, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef);
 193 
 194   $CollectChainResiduesBeyondTER = 1;
 195   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 196   $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}};
 197   if ($OptionsInfo{CountChains}) {
 198     print "\nNumber of chains: $ChainCount \n";
 199     my($ChainID, @ChainIDsList);
 200     @ChainIDsList = ();
 201     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 202       push @ChainIDsList, CleanupChainID($ChainID);
 203     }
 204     print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n";
 205   }
 206 
 207   if ($OptionsInfo{CountResiduesInChains}) {
 208     my($TotalResiduesCount, $ResidueCountInfo, @ResiduesCountInfo);
 209     @ResiduesCountInfo = ();
 210     $TotalResiduesCount = 0;
 211     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 212       $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 213       $TotalResiduesCount += $ResidueCount;
 214       $ResidueCountInfo =  "Chain ${ChainID} - ${ResidueCount}";
 215       push @ResiduesCountInfo, $ResidueCountInfo;
 216     }
 217     print "\nNumber of residues in chain(s): ";
 218     if ($ChainCount > 1) {
 219       print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n";
 220     }
 221     else {
 222       print "$TotalResiduesCount\n";
 223     }
 224 
 225     # List of residues in each chain...
 226     if ($OptionsInfo{DetailLevel} >= 3) {
 227       print "List of residues in chain(s): \n";
 228       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 229         if ($ChainCount > 1) {
 230           print "Chain ", CleanupChainID($ChainID), ": ";
 231         }
 232         print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n";
 233       }
 234     }
 235   }
 236   if ($OptionsInfo{ResiduesFrequencyInChains}) {
 237     # Setup a hash using residue count as key for sorting the values...
 238     my(%ResiduesCountToNameMap);
 239     %ResiduesCountToNameMap = ();
 240     @{$ResiduesCountToNameMap{ChainIDs}} = ();
 241     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 242 
 243     for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 244       push @{$ResiduesCountToNameMap{ChainIDs}}, $ChainID;
 245       %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}} = ();
 246 
 247       for $ResidueName (sort keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 248         $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 249         # Setup count value for each chain...
 250         if (exists $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount}) {
 251           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} .= "~${ResidueName}";
 252         }
 253         else {
 254           $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount} = $ResidueName;
 255         }
 256       }
 257     }
 258     # Collect data for all the residues in all the chains...
 259     my(%AllResiduesNameToCountMap, %AllResiduesCountToNameMap);
 260     %AllResiduesNameToCountMap = ();
 261     %AllResiduesCountToNameMap = ();
 262     if ($ChainCount > 1) {
 263       for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 264         for $ResidueName (keys %{$ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}}) {
 265           $ResidueCount = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID}{$ResidueName};
 266           if (exists $AllResiduesNameToCountMap{$ResidueName}) {
 267             $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount;
 268           }
 269           else {
 270             $AllResiduesNameToCountMap{$ResidueName} = $ResidueCount;
 271           }
 272         }
 273       }
 274       for $ResidueName (keys %AllResiduesNameToCountMap) {
 275         $ResidueCount = $AllResiduesNameToCountMap{$ResidueName};
 276         if (exists $AllResiduesCountToNameMap{$ResidueCount}) {
 277           $AllResiduesCountToNameMap{$ResidueCount} .= "~${ResidueName}";
 278         }
 279         else {
 280           $AllResiduesCountToNameMap{$ResidueCount} = $ResidueName;
 281         }
 282       }
 283     }
 284 
 285     # Setup distribution data for individual chains and the grand total as well...
 286     my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ResidueNames, @ResidueNamesList, %ResiduesFrequencyInfoMap);
 287     @{$ResiduesFrequencyInfoMap{ChainIDs}} = ();
 288     %{$ResiduesFrequencyInfoMap{Frequency}} = ();
 289     %{$ResiduesFrequencyInfoMap{PercentFrequency}} = ();
 290 
 291     @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = ();
 292     @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = ();
 293 
 294     $TotalResidueCount = 0;
 295 
 296     for $ChainID (@{$ResiduesCountToNameMap{ChainIDs}}) {
 297       push @{$ResiduesFrequencyInfoMap{ChainIDs}}, $ChainID;
 298       @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = ();
 299       @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = ();
 300 
 301       $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
 302       $TotalResidueCount += $ChainResidueCount;
 303 
 304       for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}{$ChainID}}) {
 305         $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ChainID}{$ResidueCount};
 306         @ResidueNamesList = split /~/, $ResidueNames;
 307         for $ResidueName (@ResidueNamesList) {
 308           push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}";
 309           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0;
 310           push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%";
 311         }
 312       }
 313     }
 314     if ($ChainCount > 1) {
 315       for $ResidueCount (sort {$b <=> $a} keys %AllResiduesCountToNameMap) {
 316         $ResidueNames = $AllResiduesCountToNameMap{$ResidueCount};
 317         @ResidueNamesList = split /~/, $ResidueNames;
 318         for $ResidueName (@ResidueNamesList) {
 319           push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}";
 320           $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 321           push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%";
 322         }
 323       }
 324     }
 325 
 326     # List distribution of residues
 327     print "\nDistribution of residues in chain(s): \n";
 328     for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 329       if ($ChainCount > 1) {
 330         print "Chain ", CleanupChainID($ChainID), ": ";
 331       }
 332       print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n";
 333     }
 334     if ($OptionsInfo{DetailLevel} >= 2) {
 335       print "\nPercent distribution of residues in chain(s): \n";
 336       for $ChainID (@{$ResiduesFrequencyInfoMap{ChainIDs}}) {
 337         if ($ChainCount > 1) {
 338           print "Chain ", CleanupChainID($ChainID), ": ";
 339         }
 340         print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n";
 341       }
 342     }
 343     if ($ChainCount > 1) {
 344       print "\nDistribution of residues across all chains: \n";
 345       print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n";
 346 
 347       if ($OptionsInfo{DetailLevel} >= 2) {
 348         print "\nPercent distribution of residues across all chains: \n";
 349         print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n";
 350       }
 351     }
 352   }
 353 }
 354 
 355 # List information about all the residues...
 356 sub ListAllResiduesInfo {
 357   my($PDBRecordLinesRef) = @_;
 358   my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef);
 359 
 360   $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef);
 361   $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}};
 362   $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}};
 363   $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}};
 364 
 365   if ($OptionsInfo{CountResiduesAll}) {
 366     print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n";
 367 
 368     if ($OptionsInfo{DetailLevel} >= 3) {
 369       print "List of residues: \n";
 370       if ($AtomResiduesCount) {
 371         print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n";
 372       }
 373       if ($HetatmResiduesCount) {
 374         print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n";
 375       }
 376     }
 377   }
 378 
 379   if ($OptionsInfo{ResiduesFrequencyAll}) {
 380     my($ResidueName, $ResidueCount);
 381 
 382     # Setup a hash using residue count as key for sorting the values...
 383     my(%ResiduesCountToNameMap, %AtomResiduesCountToNameMap, %HetatmResiduesCountToNameMap);
 384     %ResiduesCountToNameMap = ();
 385     %{$ResiduesCountToNameMap{ResidueNames}} = ();
 386 
 387     %AtomResiduesCountToNameMap = ();
 388     %{$AtomResiduesCountToNameMap{ResidueNames}} = ();
 389 
 390     %HetatmResiduesCountToNameMap = ();
 391     %{$HetatmResiduesCountToNameMap{ResidueNames}} = ();
 392 
 393     for $ResidueName (keys %{$ResiduesInfoRef->{ResidueCount}}) {
 394       $ResidueCount = $ResiduesInfoRef->{ResidueCount}{$ResidueName};
 395       if (exists $ResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 396         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 397       }
 398       else {
 399         $ResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 400       }
 401     }
 402 
 403     if ($OptionsInfo{DetailLevel} >= 1) {
 404       for $ResidueName (keys %{$ResiduesInfoRef->{AtomResidueCount}}) {
 405         $ResidueCount = $ResiduesInfoRef->{AtomResidueCount}{$ResidueName};
 406         if (exists $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 407           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 408         }
 409         else {
 410           $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 411         }
 412       }
 413       for $ResidueName (keys %{$ResiduesInfoRef->{HetatmResidueCount}}) {
 414         $ResidueCount = $ResiduesInfoRef->{HetatmResidueCount}{$ResidueName};
 415         if (exists $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount}) {
 416           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} .= "~${ResidueName}";
 417         }
 418         else {
 419           $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount} = $ResidueName;
 420         }
 421       }
 422     }
 423 
 424     # Setup distribution of residues info...
 425     my($ResidueNames, $PercentResidueCount, @ResidueNamesList, %ResiduesCountInfoMap, %AtomResiduesCountInfoMap, %HetatmResiduesCountInfoMap);
 426 
 427     @{$ResiduesCountInfoMap{Frequency}} = ();
 428     @{$ResiduesCountInfoMap{PercentFrequency}} = ();
 429     for $ResidueCount (sort {$b <=> $a} keys %{$ResiduesCountToNameMap{ResidueNames}}) {
 430       $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 431       $ResidueNames = $ResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 432       @ResidueNamesList = split /~/, $ResidueNames;
 433       for $ResidueName (@ResidueNamesList) {
 434         push @{$ResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 435         push @{$ResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 436       }
 437     }
 438     if ($OptionsInfo{DetailLevel} >= 1) {
 439       @{$AtomResiduesCountInfoMap{Frequency}} = ();
 440       @{$AtomResiduesCountInfoMap{PercentFrequency}} = ();
 441       for $ResidueCount (sort {$b <=> $a} keys %{$AtomResiduesCountToNameMap{ResidueNames}}) {
 442         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 443         $ResidueNames = $AtomResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 444         @ResidueNamesList = split /~/, $ResidueNames;
 445         for $ResidueName (@ResidueNamesList) {
 446           push @{$AtomResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 447           push @{$AtomResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 448         }
 449       }
 450       @{$HetatmResiduesCountInfoMap{Frequency}} = ();
 451       @{$HetatmResiduesCountInfoMap{PercentFrequency}} = ();
 452       for $ResidueCount (sort {$b <=> $a} keys %{$HetatmResiduesCountToNameMap{ResidueNames}}) {
 453         $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
 454         $ResidueNames = $HetatmResiduesCountToNameMap{ResidueNames}{$ResidueCount};
 455         @ResidueNamesList = split /~/, $ResidueNames;
 456         for $ResidueName (@ResidueNamesList) {
 457           push @{$HetatmResiduesCountInfoMap{Frequency}}, "${ResidueName} - ${ResidueCount}";
 458           push @{$HetatmResiduesCountInfoMap{PercentFrequency}}, "${ResidueName} - ${PercentResidueCount}";
 459         }
 460       }
 461     }
 462 
 463     # List distribution of residues
 464     print "\nDistribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 465     if ($OptionsInfo{DetailLevel} >= 2) {
 466       print "\nPercent distribution of residues: ", JoinWords(\@{$ResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 467     }
 468 
 469     if ($OptionsInfo{DetailLevel} >= 1) {
 470       print "\nDistribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 471       if ($OptionsInfo{DetailLevel} >= 2) {
 472         print "\nPercent distribution of ATOM residues: ", JoinWords(\@{$AtomResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 473       }
 474 
 475       print "\nDistribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{Frequency}},'; ', 0), "\n";
 476       if ($OptionsInfo{DetailLevel} >= 2) {
 477         print "\nPercent distribution of HETATM residues: ", JoinWords(\@{$HetatmResiduesCountInfoMap{PercentFrequency}},'; ', 0), "\n";
 478       }
 479     }
 480   }
 481 }
 482 
 483 # List information about residue numbers for each chain...
 484 sub ListResidueNumbersInfo {
 485   my($PDBRecordLinesRef) = @_;
 486   my($Index, $ResidueCount, $StartResidueNum, $EndResidueNum, $ChainID, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef, $ResidueNum, $PreviousResidueNum, $ResidueName, $PreviousResidueName, $GapResiduePairsCount, $GapLength, $DescendingOrderResiduePairsCount, @DescendingOrderResiduePairs, @GapResiduePairs);
 487 
 488   $CollectChainResiduesBeyondTER = 0;
 489   $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
 490 
 491   print "\nATOM/HETATM residue numbers information for chains:\n";
 492 
 493   for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
 494     print "\nChain ID - ",  CleanupChainID($ChainID), "";
 495 
 496     $ResidueCount = @{$ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}};
 497 
 498     # Start and end residue numbers...
 499     $StartResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[0];
 500     $EndResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$ResidueCount - 1];
 501     print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n";
 502 
 503     # Identify any gaps in residue numbers or non-ascending order residue numbers...
 504     $GapResiduePairsCount = 0;
 505     $DescendingOrderResiduePairsCount = 0;
 506 
 507     @DescendingOrderResiduePairs = ();
 508     @GapResiduePairs = ();
 509 
 510     RESIDUE: for $Index (1 .. ($ResidueCount - 1)) {
 511       $ResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index];
 512       $PreviousResidueNum = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID}[$Index - 1];
 513 
 514       $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index];
 515       $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1];
 516 
 517       if ($ResidueNum == ($PreviousResidueNum + 1)) {
 518         # All is good...
 519         next RESIDUE;
 520       }
 521 
 522       # Are residue in descending order?
 523       if ($ResidueNum < $PreviousResidueNum) {
 524         $DescendingOrderResiduePairsCount++;
 525         push @DescendingOrderResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}>";
 526       }
 527 
 528       # Track gaps in residue pairs...
 529       $GapResiduePairsCount++;
 530       $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1;
 531 
 532       push @GapResiduePairs, "<${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}; $GapLength>";
 533     }
 534 
 535     # Print gaps information...
 536     print "Gaps in residue numbers: ", $GapResiduePairsCount ? "Yes" : "None";
 537     if ($GapResiduePairsCount) {
 538       print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0);
 539     }
 540     print "\n";
 541 
 542     # Print descending residue order information...
 543     print "Residue numbers in descending order: ", $DescendingOrderResiduePairsCount ? "Yes" : "None";
 544     if ($DescendingOrderResiduePairsCount) {
 545       print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0);
 546     }
 547     print "\n";
 548   }
 549 }
 550 
 551 # List min/max XYZ coordinates for ATOM/HETATM records...
 552 sub ListBoundingBox {
 553   my($PDBRecordLinesRef) = @_;
 554   my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax, $XSize, $YSize, $ZSize);
 555 
 556   ($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef);
 557   $XSize = abs($XMax - $XMin);
 558   $YSize = abs($YMax - $YMin);
 559   $ZSize = abs($ZMax - $ZMin);
 560 
 561   $XMin = sprintf("%.3f", $XMin) + 0; $XMax = sprintf("%.3f", $XMax) + 0;
 562   $YMin = sprintf("%.3f", $YMin) + 0; $YMax = sprintf("%.3f", $YMax) + 0;
 563   $ZMin = sprintf("%.3f", $ZMin) + 0; $ZMax = sprintf("%.3f", $ZMax) + 0;
 564 
 565   $XSize = sprintf("%.3f", $XSize) + 0;
 566   $YSize = sprintf("%.3f", $YSize) + 0;
 567   $ZSize = sprintf("%.3f", $ZSize) + 0;
 568 
 569   print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n";
 570   print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n";
 571 
 572 }
 573 
 574 # Check master record counts against actual record counts...
 575 sub CheckMasterRecord {
 576   my($RecordTypesCountRef, $PDBRecordLinesRef) = @_;
 577 
 578   # Get master record information...
 579   my($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = (undef) x 11;
 580   my($RecordLine, $MasterRecordFound);
 581   $MasterRecordFound = 0;
 582 
 583   LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
 584       if (IsMasterRecordType($RecordLine)) {
 585         ($NumOfRemarkRecords, $NumOfHetRecords, $NumOfHelixRecords, $NumOfSheetRecords, $NumOfTurnRecords, $NumOfSiteRecords, $NumOfTransformationsRecords, $NumOfAtomAndHetatmRecords, $NumOfTerRecords, $NumOfConectRecords, $NumOfSeqresRecords) = ParseMasterRecordLine($RecordLine);
 586         $MasterRecordFound = 1;
 587         last LINE;
 588       }
 589   }
 590   if (!$MasterRecordFound) {
 591     print "\nWarning: MASTER record is missing.\n";
 592     return;
 593   }
 594   my(@MasterRecordValidationInfo);
 595   @MasterRecordValidationInfo = ();
 596   $NumOfRemarkRecords += 0;
 597   if (exists($RecordTypesCountRef->{Count}{REMARK}) && $NumOfRemarkRecords != $RecordTypesCountRef->{Count}{REMARK}) {
 598     push @MasterRecordValidationInfo, "Number of REMARK records, $NumOfRemarkRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 599   }
 600   $NumOfHetRecords += 0;
 601   if (exists($RecordTypesCountRef->{Count}{HET}) && $NumOfHetRecords != $RecordTypesCountRef->{Count}{HET}) {
 602     push @MasterRecordValidationInfo, "Number of HET records, $NumOfHetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HET}.";
 603   }
 604   $NumOfHelixRecords += 0;
 605   if (exists($RecordTypesCountRef->{Count}{HELIX}) && $NumOfHelixRecords != $RecordTypesCountRef->{Count}{HELIX}) {
 606     push @MasterRecordValidationInfo, "Number of HELIX records, $NumOfHelixRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{HELIX}.";
 607   }
 608   $NumOfSheetRecords += 0;
 609   if (exists($RecordTypesCountRef->{Count}{SHEET}) && $NumOfSheetRecords != $RecordTypesCountRef->{Count}{SHEET}) {
 610     push @MasterRecordValidationInfo, "Number of SHEET records, $NumOfSheetRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SHEET}.";
 611   }
 612   $NumOfTurnRecords += 0;
 613   if (exists($RecordTypesCountRef->{Count}{TURN}) && $NumOfTurnRecords != $RecordTypesCountRef->{Count}{TURN}) {
 614     push @MasterRecordValidationInfo, "Number of TURN records, $NumOfTurnRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{REMARK}.";
 615   }
 616   $NumOfSiteRecords += 0;
 617   if (exists($RecordTypesCountRef->{Count}{SITE}) && $NumOfSiteRecords != $RecordTypesCountRef->{Count}{SITE}) {
 618     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSiteRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SITE}.";
 619   }
 620 
 621   $NumOfTransformationsRecords += 0;
 622   my($RecordsCount, $ID, $RecordID, $RecordLabel);
 623   $RecordsCount = 0;
 624   for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') {
 625     for $ID (1 .. 3) {
 626       $RecordID = "${RecordLabel}${ID}";
 627       if (exists $RecordTypesCountRef->{Count}{$RecordID}) {
 628         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID};
 629       }
 630     }
 631   }
 632   if ($NumOfTransformationsRecords != $RecordsCount) {
 633     push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $NumOfTransformationsRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 634   }
 635 
 636   $RecordsCount = 0;
 637   for $RecordLabel ('ATOM', 'HETATM') {
 638       if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) {
 639         $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel};
 640       }
 641   }
 642   $NumOfAtomAndHetatmRecords += 0;
 643   if ($NumOfAtomAndHetatmRecords != $RecordsCount) {
 644     push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $NumOfAtomAndHetatmRecords, specified in MASTER record doen't match its explict count, $RecordsCount.";
 645   }
 646   $NumOfTerRecords += 0;
 647   if (exists($RecordTypesCountRef->{Count}{TER}) && $NumOfTerRecords != $RecordTypesCountRef->{Count}{TER}) {
 648     push @MasterRecordValidationInfo, "Number of TER records, $NumOfTerRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{TER}.";
 649   }
 650   $NumOfConectRecords += 0;
 651   if (exists($RecordTypesCountRef->{Count}{CONECT}) && $NumOfConectRecords != $RecordTypesCountRef->{Count}{CONECT}) {
 652     push @MasterRecordValidationInfo, "Number of CONECT records, $NumOfConectRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{CONECT}.";
 653   }
 654   $NumOfSeqresRecords += 0;
 655   if (exists($RecordTypesCountRef->{Count}{SEQRES}) && $NumOfSeqresRecords != $RecordTypesCountRef->{Count}{SEQRES}) {
 656     push @MasterRecordValidationInfo, "Number of SITE records, $NumOfSeqresRecords, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{SEQRES}.";
 657   }
 658 
 659   if (@MasterRecordValidationInfo) {
 660     print "\nMASTER record validation: Count mismatches found:\n";
 661     print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n";
 662   }
 663   else {
 664     print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n";
 665   }
 666 }
 667 
 668 # Total size of all the files...
 669 sub ListTotalSizeOfFiles {
 670   my($FileOkayCount, $TotalSize, $Index);
 671 
 672   $FileOkayCount = 0;
 673   $TotalSize = 0;
 674 
 675   for $Index (0 .. $#PDBFilesList) {
 676     if ($PDBFilesInfo{FileOkay}[$Index]) {
 677       $FileOkayCount++;
 678       $TotalSize += $PDBFilesInfo{FileSize}[$Index];
 679     }
 680   }
 681   if ($FileOkayCount > 1) {
 682     print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
 683   }
 684 
 685 }
 686 
 687 # Empty chain IDs are replaced with "None[1-9]". But for displaying purposes, take out any
 688 # numbers from label...
 689 sub CleanupChainID {
 690   my($ChainID) = @_;
 691 
 692   if ($ChainID =~ /^None/i) {
 693     return 'None';
 694   }
 695   return $ChainID;
 696 }
 697 
 698 # Process option values...
 699 sub ProcessOptions {
 700   %OptionsInfo = ();
 701 
 702   # Setup record types to count...
 703   if ($Options{count}) {
 704     $OptionsInfo{CountRecordType} = $Options{count};
 705   }
 706   else {
 707     $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM';
 708   }
 709   @{$OptionsInfo{SpecifiedRecordTypes}} =();
 710   if ($OptionsInfo{CountRecordType} !~ /^All$/i) {
 711     my(@RecordTypes);
 712     @RecordTypes = split /\,/, $OptionsInfo{CountRecordType};
 713     push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes;
 714   }
 715   $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0;
 716   $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0;
 717 
 718   # Residue count is the default. So $Options{residues} is simply ignored.
 719   my($CountResidues) = 1;
 720   $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 721   $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 722 
 723   $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
 724   $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;
 725 
 726   $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all})  ? 1 : 0;
 727 
 728   $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0;
 729 
 730   $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0;
 731   $OptionsInfo{DetailLevel} = $Options{detail};
 732 
 733   $OptionsInfo{ListExperimentalTechniqueInfo} = ($Options{experiment} || $Options{all}) ? 1 : 0;
 734 
 735 }
 736 
 737 # Retrieve information about PDB files...
 738 sub RetrievePDBFilesInfo {
 739   my($Index, $PDBFile, $ModifiedTimeString, $ModifiedDateString);
 740 
 741   %PDBFilesInfo = ();
 742   @{$PDBFilesInfo{FileOkay}} = ();
 743   @{$PDBFilesInfo{FileSize}} = ();
 744   @{$PDBFilesInfo{FileLastModified}} = ();
 745 
 746   FILELIST: for $Index (0 .. $#PDBFilesList) {
 747     $PDBFilesInfo{FileOkay}[$Index] = 0;
 748     $PDBFilesInfo{FileSize}[$Index] = 0;
 749     $PDBFilesInfo{FileLastModified}[$Index] = '';
 750 
 751     $PDBFile = $PDBFilesList[$Index];
 752     if (!(-e $PDBFile)) {
 753       warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
 754       next FILELIST;
 755     }
 756     if (!CheckFileType($PDBFile, "pdb")) {
 757       warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
 758       next FILELIST;
 759     }
 760     if (! open PDBFILE, "$PDBFile") {
 761       warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
 762       next FILELIST;
 763     }
 764     close PDBFILE;
 765 
 766     $PDBFilesInfo{FileOkay}[$Index] = 1;
 767     $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile);
 768     ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile);
 769     $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
 770   }
 771 }
 772 
 773 
 774 # Setup script usage  and retrieve command line arguments specified using various options...
 775 sub SetupScriptUsage {
 776 
 777   # Retrieve all the options...
 778   %Options = ();
 779   $Options{count} = '';
 780   $Options{detail} = 1;
 781   $Options{residuesmode} = 'Both';
 782 
 783   if (!GetOptions(\%Options, "all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "experiment|e", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s")) {
 784     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 785   }
 786   if ($Options{workingdir}) {
 787     if (! -d $Options{workingdir}) {
 788       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 789     }
 790     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
 791   }
 792   if (!IsPositiveInteger($Options{detail})) {
 793     die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
 794   }
 795   if ($Options{residuesmode} !~ /^(InChains|All|Both)$/i) {
 796     die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n";
 797   }
 798 }
 799