0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: InfoPDBFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.36 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use TextUtil;
|
|
37 use PDBFileUtil;
|
|
38
|
|
# File-level state shared with the subroutines defined further down in this script...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Direct method call instead of the indirect object form "new Benchmark", which
# depends on how the parser resolves the bareword that follows "new"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@PDBFilesList);
@PDBFilesList = ExpandFileNames(\@ARGV, "pdb");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%PDBFilesInfo);
print "Checking input PDB file(s)...\n";
RetrievePDBFilesInfo();

# Process input files; those not marked okay in %PDBFilesInfo are skipped...
my($FileIndex);
if (@PDBFilesList > 1) {
  print "\nProcessing PDB files...\n";
}
for $FileIndex (0 .. $#PDBFilesList) {
  if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
    ListPDBFileInfo($FileIndex);
  }
}
ListTotalSizeOfFiles();

print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
86
|
|
87 ###############################################################################
|
|
88
|
|
# Print the report for one input file: optional header and experimental
# technique sections, record counts, the optional chain/residue/bounding box
# sections selected in %OptionsInfo, and finally file size and modification time.
#
# $Index is the position of the file in @PDBFilesList and %PDBFilesInfo...
sub ListPDBFileInfo {
  my($Index) = @_;

  my $PDBRecordLinesRef = ReadPDBFile($PDBFilesList[$Index]);

  # Header information...
  ListHeaderInfo($PDBRecordLinesRef) if $OptionsInfo{ListHeaderInfo};

  # Experiment information...
  ListExperimentalTechniqueInfo($PDBRecordLinesRef) if $OptionsInfo{ListExperimentalTechniqueInfo};

  # Total number of records...
  my $TotalRecordsCount = scalar @{$PDBRecordLinesRef};
  print "\nTotal number of records: $TotalRecordsCount\n";

  # Record type counts are always listed...
  ListRecordTypesInfo($PDBRecordLinesRef);

  # Optional sections...
  my $ChainsInfoRequested = $OptionsInfo{CountChains} || $OptionsInfo{CountResiduesInChains} || $OptionsInfo{ResiduesFrequencyInChains};
  ListChainsAndResiduesInfo($PDBRecordLinesRef) if $ChainsInfoRequested;

  my $AllResiduesInfoRequested = $OptionsInfo{CountResiduesAll} || $OptionsInfo{ResiduesFrequencyAll};
  ListAllResiduesInfo($PDBRecordLinesRef) if $AllResiduesInfoRequested;

  ListResidueNumbersInfo($PDBRecordLinesRef) if $OptionsInfo{ResidueNumbersInfo};
  ListBoundingBox($PDBRecordLinesRef) if $OptionsInfo{CalculateBoundingBox};

  # File size and modification information...
  my $FileSize = FormatFileSize($PDBFilesInfo{FileSize}[$Index]);
  print "\nFile size: ", $FileSize, " \n";
  print "Last modified: ", $PDBFilesInfo{FileLastModified}[$Index], " \n";
}
|
|
131
|
|
# Print classification, ID code and deposition date taken from the first record
# line when it is a HEADER record; any value which is missing or empty is shown
# as 'Not available'...
sub ListHeaderInfo {
  my($PDBRecordLinesRef) = @_;
  my($Classification, $DepositionDate, $IDCode);

  # Only the first record line is examined...
  my $FirstRecordLine = $PDBRecordLinesRef->[0];
  if (IsHeaderRecordType($FirstRecordLine)) {
    ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($FirstRecordLine);
  }

  # The loop variable aliases each scalar, so the assignment updates it in place...
  for my $Value ($Classification, $DepositionDate, $IDCode) {
    $Value = 'Not available' if IsEmpty($Value);
  }

  print "\nClassification: $Classification\nID: $IDCode\nDeposition date: $DepositionDate\n";
}
|
|
148
|
|
# Print the experimental technique and its resolution; a false value for either
# one is reported as "Not available"...
sub ListExperimentalTechniqueInfo {
  my($PDBRecordLinesRef) = @_;

  my $ExperimentalTechnique = GetExperimentalTechnique($PDBRecordLinesRef);
  my $TechniqueLabel = "Not available";
  if ($ExperimentalTechnique) {
    $TechniqueLabel = $ExperimentalTechnique;
  }
  print "\nExperimental technique: " . $TechniqueLabel . "\n";

  my($Resolution, $ResolutionUnits) = GetExperimentalTechniqueResolution($PDBRecordLinesRef);
  my $ResolutionLabel = "Not available";
  if ($Resolution) {
    $ResolutionLabel = "$Resolution $ResolutionUnits";
  }
  print "Resolution: " . $ResolutionLabel . "\n";
}
|
|
161
|
|
# Print the number of records for each record type: every type found in the
# file when all types are requested, otherwise just the specified types with a
# count of 0 for any type not present. Optionally follow up with validation of
# the MASTER record...
sub ListRecordTypesInfo {
  my($PDBRecordLinesRef) = @_;

  my $CountsInfoRef = GetRecordTypesCount($PDBRecordLinesRef);
  my $ListAllTypes = ($OptionsInfo{CountRecordType} =~ /^All$/i) ? 1 : 0;

  my @CountLabels = ();
  if (!$ListAllTypes) {
    for my $Type (@{$OptionsInfo{SpecifiedRecordTypes}}) {
      my $Count = 0;
      if (exists $CountsInfoRef->{Count}{$Type}) {
        $Count = $CountsInfoRef->{Count}{$Type};
      }
      push @CountLabels, "$Type - $Count";
    }
  }
  else {
    for my $Type (@{$CountsInfoRef->{RecordTypes}}) {
      my $Count = $CountsInfoRef->{Count}{$Type};
      push @CountLabels, "$Type - $Count";
    }
  }
  print "Number of individual records: ", JoinWords(\@CountLabels, '; ', 0), "\n";

  if ($OptionsInfo{CheckMasterRecord}) {
    CheckMasterRecord($CountsInfoRef, $PDBRecordLinesRef);
  }
}
|
|
188
|
|
# List information about chains and the residues in them, as selected by
# %OptionsInfo:
#
#   CountChains               - number of chains and their IDs
#   CountResiduesInChains     - residue count per chain; residue names are also
#                               listed at detail level 3 or higher
#   ResiduesFrequencyInChains - residue frequency per chain and, for multiple
#                               chains, across all chains; percent values are
#                               added at detail level 2 or higher
#
# Chain IDs are passed through CleanupChainID wherever they are displayed.
# Frequency lists are ordered by decreasing count; residues with the same count
# are ordered by name so that the output doesn't depend on hash ordering...
sub ListChainsAndResiduesInfo {
  my($PDBRecordLinesRef) = @_;
  my($ChainID, $ChainCount, $ResidueCount, $CollectChainResiduesBeyondTER, $ChainsAndResiduesInfoRef);

  $CollectChainResiduesBeyondTER = 1;
  $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);
  $ChainCount = @{$ChainsAndResiduesInfoRef->{ChainIDs}};

  if ($OptionsInfo{CountChains}) {
    my(@ChainIDsList);

    print "\nNumber of chains: $ChainCount \n";
    @ChainIDsList = ();
    for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
      push @ChainIDsList, CleanupChainID($ChainID);
    }
    print "Chain IDs: ", JoinWords(\@ChainIDsList, ', ', 0),"\n";
  }

  if ($OptionsInfo{CountResiduesInChains}) {
    my($TotalResiduesCount, @ResiduesCountInfo);

    @ResiduesCountInfo = ();
    $TotalResiduesCount = 0;
    for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
      $ResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
      $TotalResiduesCount += $ResidueCount;
      # Show the cleaned up chain ID, as in the other listings...
      push @ResiduesCountInfo, "Chain " . CleanupChainID($ChainID) . " - ${ResidueCount}";
    }
    print "\nNumber of residues in chain(s): ";
    if ($ChainCount > 1) {
      print "Total - $TotalResiduesCount; ", JoinWords(\@ResiduesCountInfo, '; ', 0),"\n";
    }
    else {
      print "$TotalResiduesCount\n";
    }

    # List of residues in each chain...
    if ($OptionsInfo{DetailLevel} >= 3) {
      print "List of residues in chain(s): \n";
      for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
        if ($ChainCount > 1) {
          print "Chain ", CleanupChainID($ChainID), ": ";
        }
        print JoinWords(\@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}, ', ', 0),"\n";
      }
    }
  }

  if ($OptionsInfo{ResiduesFrequencyInChains}) {
    # Residue names ordered by decreasing count and then by name...
    my $SortResidueNamesByCount = sub {
      my($NameToCountRef) = @_;
      return sort { $NameToCountRef->{$b} <=> $NameToCountRef->{$a} || $a cmp $b } keys %{$NameToCountRef};
    };

    # Setup distribution data for individual chains and the grand total as well...
    my($ChainResidueCount, $PercentResidueCount, $TotalResidueCount, $ChainNameToCountRef, %AllResiduesNameToCountMap, %ResiduesFrequencyInfoMap);

    %AllResiduesNameToCountMap = ();
    %{$ResiduesFrequencyInfoMap{Frequency}} = ();
    %{$ResiduesFrequencyInfoMap{PercentFrequency}} = ();
    @{$ResiduesFrequencyInfoMap{AllChainsFrequency}} = ();
    @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}} = ();

    $TotalResidueCount = 0;

    for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
      @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}} = ();
      @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}} = ();

      $ChainResidueCount = @{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
      $TotalResidueCount += $ChainResidueCount;

      # A chain without any residue count data simply gets empty lists...
      $ChainNameToCountRef = $ChainsAndResiduesInfoRef->{ResidueCount}{$ChainID} || {};

      for my $ResidueName ($SortResidueNamesByCount->($ChainNameToCountRef)) {
        $ResidueCount = $ChainNameToCountRef->{$ResidueName};

        push @{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, "${ResidueName} - ${ResidueCount}";
        # Adding 0 drops any trailing zero from the formatted value...
        $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$ChainResidueCount)*100)) + 0;
        push @{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, "${ResidueName} - ${PercentResidueCount}%";

        # Collect data for all the residues in all the chains...
        $AllResiduesNameToCountMap{$ResidueName} += $ResidueCount;
      }
    }

    if ($ChainCount > 1) {
      for my $ResidueName ($SortResidueNamesByCount->(\%AllResiduesNameToCountMap)) {
        $ResidueCount = $AllResiduesNameToCountMap{$ResidueName};

        push @{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, "${ResidueName} - ${ResidueCount}";
        $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;
        push @{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, "${ResidueName} - ${PercentResidueCount}%";
      }
    }

    # List distribution of residues
    print "\nDistribution of residues in chain(s): \n";
    for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
      if ($ChainCount > 1) {
        print "Chain ", CleanupChainID($ChainID), ": ";
      }
      print JoinWords(\@{$ResiduesFrequencyInfoMap{Frequency}{$ChainID}}, '; ', 0), "\n";
    }
    if ($OptionsInfo{DetailLevel} >= 2) {
      print "\nPercent distribution of residues in chain(s): \n";
      for $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
        if ($ChainCount > 1) {
          print "Chain ", CleanupChainID($ChainID), ": ";
        }
        print JoinWords(\@{$ResiduesFrequencyInfoMap{PercentFrequency}{$ChainID}}, '; ', 0), "\n";
      }
    }
    if ($ChainCount > 1) {
      print "\nDistribution of residues across all chains: \n";
      print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsFrequency}}, '; ', 0), "\n";

      if ($OptionsInfo{DetailLevel} >= 2) {
        print "\nPercent distribution of residues across all chains: \n";
        print JoinWords(\@{$ResiduesFrequencyInfoMap{AllChainsPercentFrequency}}, '; ', 0), "\n";
      }
    }
  }
}
|
|
354
|
|
# List information about all the residues in the file irrespective of chains,
# as selected by %OptionsInfo:
#
#   CountResiduesAll     - total, ATOM and HETATM residue counts; residue names
#                          are also listed at detail level 3 or higher
#   ResiduesFrequencyAll - residue frequency for all residues and, at detail
#                          level 1 or higher, for ATOM and HETATM residues;
#                          percent values are added at detail level 2 or higher
#
# All percent values are relative to the total number of residues. Frequency
# lists are ordered by decreasing count; residues with the same count are
# ordered by name so that the output doesn't depend on hash ordering...
sub ListAllResiduesInfo {
  my($PDBRecordLinesRef) = @_;
  my($TotalResidueCount, $AtomResiduesCount, $HetatmResiduesCount, $ResiduesInfoRef);

  $ResiduesInfoRef = GetAllResidues($PDBRecordLinesRef);
  $TotalResidueCount = @{$ResiduesInfoRef->{ResidueNames}};
  $AtomResiduesCount = @{$ResiduesInfoRef->{AtomResidueNames}};
  $HetatmResiduesCount = @{$ResiduesInfoRef->{HetatmResidueNames}};

  if ($OptionsInfo{CountResiduesAll}) {
    print "\nTotal number of residues: Total - $TotalResidueCount; ATOM residues - $AtomResiduesCount; HETATM residues - $HetatmResiduesCount\n";

    if ($OptionsInfo{DetailLevel} >= 3) {
      print "List of residues: \n";
      if ($AtomResiduesCount) {
        print "ATOM residues: ", JoinWords(\@{$ResiduesInfoRef->{AtomResidueNames}}, ', ', 0), "\n";
      }
      if ($HetatmResiduesCount) {
        print "HETATM residues: ", JoinWords(\@{$ResiduesInfoRef->{HetatmResidueNames}}, ', ', 0), "\n";
      }
    }
  }

  if ($OptionsInfo{ResiduesFrequencyAll}) {
    # Turn a residue name to count map into references to frequency and percent
    # frequency label lists...
    my $SetupFrequencyInfo = sub {
      my($NameToCountRef) = @_;
      my($ResidueCount, $PercentResidueCount, @Frequency, @PercentFrequency);

      @Frequency = ();
      @PercentFrequency = ();
      for my $ResidueName (sort { $NameToCountRef->{$b} <=> $NameToCountRef->{$a} || $a cmp $b } keys %{$NameToCountRef}) {
        $ResidueCount = $NameToCountRef->{$ResidueName};
        # Adding 0 drops any trailing zero from the formatted value...
        $PercentResidueCount = sprintf("%.1f", (($ResidueCount/$TotalResidueCount)*100)) + 0;

        push @Frequency, "${ResidueName} - ${ResidueCount}";
        push @PercentFrequency, "${ResidueName} - ${PercentResidueCount}";
      }
      return (\@Frequency, \@PercentFrequency);
    };

    # List distribution of residues
    my($FrequencyRef, $PercentFrequencyRef);

    ($FrequencyRef, $PercentFrequencyRef) = $SetupFrequencyInfo->($ResiduesInfoRef->{ResidueCount} || {});
    print "\nDistribution of residues: ", JoinWords($FrequencyRef,'; ', 0), "\n";
    if ($OptionsInfo{DetailLevel} >= 2) {
      print "\nPercent distribution of residues: ", JoinWords($PercentFrequencyRef,'; ', 0), "\n";
    }

    if ($OptionsInfo{DetailLevel} >= 1) {
      ($FrequencyRef, $PercentFrequencyRef) = $SetupFrequencyInfo->($ResiduesInfoRef->{AtomResidueCount} || {});
      print "\nDistribution of ATOM residues: ", JoinWords($FrequencyRef,'; ', 0), "\n";
      if ($OptionsInfo{DetailLevel} >= 2) {
        print "\nPercent distribution of ATOM residues: ", JoinWords($PercentFrequencyRef,'; ', 0), "\n";
      }

      ($FrequencyRef, $PercentFrequencyRef) = $SetupFrequencyInfo->($ResiduesInfoRef->{HetatmResidueCount} || {});
      print "\nDistribution of HETATM residues: ", JoinWords($FrequencyRef,'; ', 0), "\n";
      if ($OptionsInfo{DetailLevel} >= 2) {
        print "\nPercent distribution of HETATM residues: ", JoinWords($PercentFrequencyRef,'; ', 0), "\n";
      }
    }
  }
}
|
|
482
|
|
# For each chain, print the number of residues along with the first and last
# residue numbers, followed by any consecutive residue pairs whose numbers
# don't increase by exactly one.
#
# Every such pair is reported as a gap with a length of abs(difference) - 1; a
# pair whose second number is smaller than the first is additionally reported
# as being in descending order...
sub ListResidueNumbersInfo {
  my($PDBRecordLinesRef) = @_;

  my $CollectChainResiduesBeyondTER = 0;
  my $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', $CollectChainResiduesBeyondTER);

  print "\nATOM/HETATM residue numbers information for chains:\n";

  for my $ChainID (@{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
    print "\nChain ID - ", CleanupChainID($ChainID), "";

    my $ResidueNumbersRef = $ChainsAndResiduesInfoRef->{ResidueNumbers}{$ChainID};
    my $ResidueCount = scalar @{$ResidueNumbersRef};

    # Start and end residue numbers...
    my $StartResidueNum = $ResidueNumbersRef->[0];
    my $EndResidueNum = $ResidueNumbersRef->[$ResidueCount - 1];
    print "; Number of residues: $ResidueCount; Start residue number - $StartResidueNum; End residue number - $EndResidueNum\n";

    # Identify any gaps in residue numbers or non-ascending order residue numbers...
    my @GapResiduePairs = ();
    my @DescendingOrderResiduePairs = ();

    for my $Index (1 .. ($ResidueCount - 1)) {
      my($PreviousResidueNum, $ResidueNum) = @{$ResidueNumbersRef}[$Index - 1, $Index];

      # Consecutive residue numbers need no reporting...
      next if $ResidueNum == ($PreviousResidueNum + 1);

      my $ResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index];
      my $PreviousResidueName = $ChainsAndResiduesInfoRef->{Residues}{$ChainID}[$Index - 1];
      my $PairLabel = "${PreviousResidueName}${PreviousResidueNum} - ${ResidueName}${ResidueNum}";

      # Are residues in descending order?
      if ($ResidueNum < $PreviousResidueNum) {
        push @DescendingOrderResiduePairs, "<${PairLabel}>";
      }

      # Track gaps in residue pairs...
      my $GapLength = abs($ResidueNum - $PreviousResidueNum) - 1;
      push @GapResiduePairs, "<${PairLabel}; $GapLength>";
    }
    my $GapResiduePairsCount = scalar @GapResiduePairs;
    my $DescendingOrderResiduePairsCount = scalar @DescendingOrderResiduePairs;

    # Print gaps information...
    print "Gaps in residue numbers: ", ($GapResiduePairsCount ? "Yes" : "None");
    if ($GapResiduePairsCount) {
      print "; Number of gap residue number pairs: $GapResiduePairsCount; Gap residue pairs: <StartRes-EndRes; GapLength> - ", JoinWords(\@GapResiduePairs, "; ", 0);
    }
    print "\n";

    # Print descending residue order information...
    print "Residue numbers in descending order: ", ($DescendingOrderResiduePairsCount ? "Yes" : "None");
    if ($DescendingOrderResiduePairsCount) {
      print "; Number of descending residue number pairs: $DescendingOrderResiduePairsCount; Descending residue number pairs: <StartRes-EndRes> ", JoinWords(\@DescendingOrderResiduePairs, "; ", 0);
    }
    print "\n";
  }
}
|
|
550
|
|
# Print the minimum and maximum XYZ coordinates returned by GetMinMaxCoords
# together with the size of the box along each axis. Sizes are calculated from
# the unrounded coordinates; all values are shown with at most three decimals...
sub ListBoundingBox {
  my($PDBRecordLinesRef) = @_;

  my($XMin, $YMin, $ZMin, $XMax, $YMax, $ZMax) = GetMinMaxCoords($PDBRecordLinesRef);

  my $XSize = abs($XMax - $XMin);
  my $YSize = abs($YMax - $YMin);
  my $ZSize = abs($ZMax - $ZMin);

  # Round in place through the aliased loop variable; adding 0 drops any
  # trailing zeros from the formatted value...
  for my $Value ($XMin, $XMax, $YMin, $YMax, $ZMin, $ZMax, $XSize, $YSize, $ZSize) {
    $Value = sprintf("%.3f", $Value) + 0;
  }

  print "\nBounding box coordinates: <XMin, XMax> - <$XMin, $XMax>; <YMin, YMax> - <$YMin, $YMax>; <ZMin, ZMax> - <$ZMin, $ZMax>;\n";
  print "Bounding box size in angstroms: XSize - $XSize; YSize - $YSize; ZSize - $ZSize\n";
}
|
|
573
|
|
# Check the counts specified in the MASTER record against the explicit counts
# of the corresponding records and print either the list of mismatches or a
# message indicating that all the counts match.
#
# $RecordTypesCountRef supplies the explicit counts through its Count hash keyed
# by record type; $PDBRecordLinesRef is the list of record lines which is
# searched for the first MASTER record. A warning is printed and nothing is
# validated when no MASTER record is found.
#
# Counts for REMARK, HET, HELIX, SHEET, TURN, SITE, TER, CONECT and SEQRES are
# compared only when records of that type are present in the file. The
# transformation count (ORIGXn + SCALEn + MTRIXn) and the ATOM + HETATM count
# are always compared...
sub CheckMasterRecord {
  my($RecordTypesCountRef, $PDBRecordLinesRef) = @_;

  # Names for the values returned by ParseMasterRecordLine, in order...
  my(@MasterFieldNames) = qw(REMARK HET HELIX SHEET TURN SITE Transformations AtomAndHetatm TER CONECT SEQRES);

  # Get master record information...
  my($RecordLine, $MasterRecordFound, %MasterCount);

  %MasterCount = ();
  $MasterRecordFound = 0;
  LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
    if (IsMasterRecordType($RecordLine)) {
      @MasterCount{@MasterFieldNames} = ParseMasterRecordLine($RecordLine);
      $MasterRecordFound = 1;
      last LINE;
    }
  }
  if (!$MasterRecordFound) {
    print "\nWarning: MASTER record is missing.\n";
    return;
  }

  # Convert all the parsed values into numbers before comparing and reporting them...
  my($FieldName);
  for $FieldName (@MasterFieldNames) {
    $MasterCount{$FieldName} += 0;
  }

  my(@MasterRecordValidationInfo);
  @MasterRecordValidationInfo = ();

  # Compare the MASTER count for a record type against its explicit count; the
  # same record type is used for the label and for both of the counts...
  my $ValidateRecordTypeCount = sub {
    my($RecordType) = @_;

    if (exists($RecordTypesCountRef->{Count}{$RecordType}) && $MasterCount{$RecordType} != $RecordTypesCountRef->{Count}{$RecordType}) {
      push @MasterRecordValidationInfo, "Number of $RecordType records, $MasterCount{$RecordType}, specified in MASTER record doen't match its explict count, $RecordTypesCountRef->{Count}{$RecordType}.";
    }
  };

  my($RecordType, $RecordsCount, $ID, $RecordID, $RecordLabel);

  for $RecordType ('REMARK', 'HET', 'HELIX', 'SHEET', 'TURN', 'SITE') {
    $ValidateRecordTypeCount->($RecordType);
  }

  # Transformation records: ORIGX1-3, SCALE1-3 and MTRIX1-3...
  $RecordsCount = 0;
  for $RecordLabel ('ORIGX', 'SCALE', 'MTRIX') {
    for $ID (1 .. 3) {
      $RecordID = "${RecordLabel}${ID}";
      if (exists $RecordTypesCountRef->{Count}{$RecordID}) {
        $RecordsCount += $RecordTypesCountRef->{Count}{$RecordID};
      }
    }
  }
  if ($MasterCount{Transformations} != $RecordsCount) {
    push @MasterRecordValidationInfo, "Number of transformation records (ORIGXn+SCALEn+MTRIXn), $MasterCount{Transformations}, specified in MASTER record doen't match its explict count, $RecordsCount.";
  }

  # ATOM and HETATM records are counted together...
  $RecordsCount = 0;
  for $RecordLabel ('ATOM', 'HETATM') {
    if (exists $RecordTypesCountRef->{Count}{$RecordLabel}) {
      $RecordsCount += $RecordTypesCountRef->{Count}{$RecordLabel};
    }
  }
  if ($MasterCount{AtomAndHetatm} != $RecordsCount) {
    push @MasterRecordValidationInfo, "Number of ATOM + HETATM records, $MasterCount{AtomAndHetatm}, specified in MASTER record doen't match its explict count, $RecordsCount.";
  }

  for $RecordType ('TER', 'CONECT', 'SEQRES') {
    $ValidateRecordTypeCount->($RecordType);
  }

  if (@MasterRecordValidationInfo) {
    print "\nMASTER record validation: Count mismatches found:\n";
    print JoinWords(\@MasterRecordValidationInfo, "\n", 0), "\n";
  }
  else {
    print "\nMASTER record validation: Count values match with the explicit count of the corresponding records.\n";
  }
}
|
|
667
|
|
# Print the combined size of all the input files marked okay in %PDBFilesInfo;
# nothing is printed unless there are at least two such files...
sub ListTotalSizeOfFiles {
  my @OkayFileIndices = grep { $PDBFilesInfo{FileOkay}[$_] } (0 .. $#PDBFilesList);
  my $FileOkayCount = scalar @OkayFileIndices;

  my $TotalSize = 0;
  for my $Index (@OkayFileIndices) {
    $TotalSize += $PDBFilesInfo{FileSize}[$Index];
  }

  if ($FileOkayCount > 1) {
    print "\nTotal size of $FileOkayCount files: ", FormatFileSize($TotalSize), "\n";
  }
}
|
|
686
|
|
# Return a chain ID suitable for display...
#
# Empty chain IDs are replaced with "None[1-9]" labels. Any chain ID starting with "None"
# (matched case-insensitively) is collapsed to plain 'None', which takes out any numbers
# from the label; all other chain IDs are returned unchanged.
#
sub CleanupChainID {
  my($ChainID) = @_;

  return ($ChainID =~ /^None/i) ? 'None' : $ChainID;
}
|
|
697
|
|
# Process option values...
#
# Translates the raw command line values in %Options into the settings stored in
# %OptionsInfo:
#
#   CountRecordType              - value of --count; otherwise 'All' with --all or 'ATOM,HETATM'
#   SpecifiedRecordTypes         - list of record types from CountRecordType; empty for 'All'
#   CountChains, CheckMasterRecord, ResidueNumbersInfo, CalculateBoundingBox,
#   ListHeaderInfo, ListExperimentalTechniqueInfo
#                                - 1 when the corresponding option or --all is set; 0 otherwise
#   CountResiduesInChains, CountResiduesAll
#                                - 1 or 0 based on --residuesmode; residue count is always on
#   ResiduesFrequencyInChains, ResiduesFrequencyAll
#                                - 1 when --frequency or --all is set and --residuesmode matches
#   DetailLevel                  - value of --detail
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Setup record types to count...
  if ($Options{count}) {
    $OptionsInfo{CountRecordType} = $Options{count};
  }
  else {
    $OptionsInfo{CountRecordType} = $Options{all} ? 'All' : 'ATOM,HETATM';
  }
  @{$OptionsInfo{SpecifiedRecordTypes}} = ();
  if ($OptionsInfo{CountRecordType} !~ /^All$/i) {
    my(@RecordTypes);

    # The usage text shows a space after the comma in the record types list. Strip any
    # white space around individual record types and drop empty entries so that values
    # such as "ATOM, HETATM" yield usable record type names...
    @RecordTypes = grep { length } map { my($RecordType) = $_; $RecordType =~ s/^\s+|\s+$//g; $RecordType } split /\,/, $OptionsInfo{CountRecordType};
    push @{$OptionsInfo{SpecifiedRecordTypes}}, @RecordTypes;
  }
  $OptionsInfo{CountChains} = ($Options{chains} || $Options{all}) ? 1 : 0;
  $OptionsInfo{CheckMasterRecord} = ($Options{mastercheck} || $Options{all}) ? 1 : 0;

  # Residue count is the default. So $Options{residues} is simply ignored.
  my($CountResidues) = 1;
  $OptionsInfo{CountResiduesInChains} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
  $OptionsInfo{CountResiduesAll} = (($CountResidues || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;

  # Residue frequency listing is optional and follows the same residues mode...
  $OptionsInfo{ResiduesFrequencyInChains} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(InChains|Both)$/i) ? 1 : 0;
  $OptionsInfo{ResiduesFrequencyAll} = (($Options{frequency} || $Options{all}) && $Options{residuesmode} =~ /^(All|Both)$/i) ? 1 : 0;

  $OptionsInfo{ResidueNumbersInfo} = ($Options{residuenumbers} || $Options{all}) ? 1 : 0;

  $OptionsInfo{CalculateBoundingBox} = ($Options{boundingbox} || $Options{all}) ? 1 : 0;

  $OptionsInfo{ListHeaderInfo} = ($Options{header} || $Options{all}) ? 1 : 0;
  $OptionsInfo{DetailLevel} = $Options{detail};

  $OptionsInfo{ListExperimentalTechniqueInfo} = ($Options{experiment} || $Options{all}) ? 1 : 0;

}
|
|
736
|
|
# Retrieve information about PDB files...
#
# Resets %PDBFilesInfo and fills in the following lists, index-aligned with @PDBFilesList:
#
#   FileOkay         - 1 when the file exists, passes CheckFileType for "pdb" and can be
#                      opened for reading; 0 otherwise
#   FileSize         - value returned by FileSize; 0 for files which are not okay
#   FileLastModified - "time; date" string built from the values returned by
#                      FormattedFileModificationTimeAndDate; '' for files which are not okay
#
# A file failing any of the checks is skipped with a warning and keeps the default values.
#
sub RetrievePDBFilesInfo {
  my($Index, $PDBFile, $PDBFileHandle, $ModifiedTimeString, $ModifiedDateString);

  %PDBFilesInfo = ();
  @{$PDBFilesInfo{FileOkay}} = ();
  @{$PDBFilesInfo{FileSize}} = ();
  @{$PDBFilesInfo{FileLastModified}} = ();

  FILELIST: for $Index (0 .. $#PDBFilesList) {
    # Defaults for a file which is ignored...
    $PDBFilesInfo{FileOkay}[$Index] = 0;
    $PDBFilesInfo{FileSize}[$Index] = 0;
    $PDBFilesInfo{FileLastModified}[$Index] = '';

    $PDBFile = $PDBFilesList[$Index];
    if (!(-e $PDBFile)) {
      warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($PDBFile, "pdb")) {
      warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
      next FILELIST;
    }

    # Make sure the file is readable. Three-argument open with an explicit read mode keeps
    # special characters in the file name (leading >, <, | or white space) from being
    # interpreted as an open mode or a pipe...
    if (!open $PDBFileHandle, "<", $PDBFile) {
      warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $PDBFileHandle;

    $PDBFilesInfo{FileOkay}[$Index] = 1;
    $PDBFilesInfo{FileSize}[$Index] = FileSize($PDBFile);
    ($ModifiedTimeString, $ModifiedDateString) = FormattedFileModificationTimeAndDate($PDBFile);
    $PDBFilesInfo{FileLastModified}[$Index] = "$ModifiedTimeString; $ModifiedDateString";
  }
}
|
|
772
|
|
773
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values (count: '', detail: 1, residuesmode: 'Both'), lets
# GetOptions overlay the values specified on the command line, changes to the working
# directory when one is specified, and dies with an error message for any invalid value.
#
sub SetupScriptUsage {
  my(@OptionsSpecification);

  # Default values for options...
  %Options = (count => '', detail => 1, residuesmode => 'Both');

  # Retrieve all the options...
  @OptionsSpecification = ("all|a", "boundingbox|b", "count|c=s", "chains", "detail|d=i", "experiment|e", "frequency|f", "mastercheck|m", "header", "help|h", "residues", "residuesmode=s", "residuenumbers", "workingdir|w=s");
  unless (GetOptions(\%Options, @OptionsSpecification)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before anything else is validated...
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }

  unless ($Options{residuesmode} =~ /^(InChains|All|Both)$/i) {
    die "Error: The value specified, $Options{residuesmode}, for option \"--ResiduesMode\" is not valid. Allowed values: InChains, All, or Both\n";
  }
}
|
|
799
|
|
800 __END__
|
|
801
|
|
802 =head1 NAME
|
|
803
|
|
804 InfoPDBFiles.pl - List information about PDB files
|
|
805
|
|
806 =head1 SYNOPSIS
|
|
807
|
|
808 InfoPDBFiles.pl PDBFile(s) PDB(s)...
|
|
809
|
|
810 InfoPDBFiles.pl [B<-a, --all>] [B<-b, --BoundingBox>]
|
|
811 [B<-c, --count> "RecordType, [RecordType,...]" | All] [B<--chains>]
|
|
812 [B<-d, --detail> infolevel] [B<-e, --experiment>] [B<-f, --frequency>]
|
|
813 [B<-h, --help>] [B<--header>] [B<-m, --MasterCheck>] [B<--residues>]
|
|
814 [B<--ResiduesMode> InChains | All | Both] [B<--ResidueNumbers>]
|
|
815 [B<-w, --WorkingDir> dirname] PDBFile(s)...
|
|
816
|
|
817 =head1 DESCRIPTION
|
|
818
|
|
819 List information about contents of I<PDBFile(s)>: number of each record type, number of chains,
|
|
820 count and percent distribution of residues in each chain, bounding box and so on.
|
|
821 Multiple PDBFile names are separated by spaces. The valid file extension is I<.pdb>.
|
|
822 All other file name extensions are ignored during the wild card expansion. All the PDB files
|
|
823 in a current directory can be specified either by I<*.pdb> or the current directory name.
|
|
824
|
|
825 In PDB files containing data for multiple models, all ATOM/HETATM records for chains after the first model
|
|
826 are ignored.
|
|
827
|
|
828 =head1 OPTIONS
|
|
829
|
|
830 =over 4
|
|
831
|
|
832 =item B<-a, --all>
|
|
833
|
|
834 List all the available information.
|
|
835
|
|
836 =item B<-b, --BoundingBox>
|
|
837
|
|
838 List min/max XYZ coordinates of ATOM/HETATM records.
|
|
839
|
|
840 =item B<-c, --count> I<RecordType,[RecordType,...]|All>
|
|
841
|
|
842 Types of PDB records to count in I<PDBFile(s)>. You can specify a list of any valid PDB
|
|
843 record type or count all record types found in the files. Possible values: Comma delimited list
|
|
844 of valid I<RecordTypes> or I<All>. Default: I<ATOM,HETATM>. And this is also B<default behavior>.
|
|
845
|
|
846 The list of valid PDB record types includes: I<HEADER, OBSLTE, TITLE, CAVEAT, COMPND, SOURCE, KEYWDS,
|
|
847 EXPDTA, AUTHOR, REVDAT, SPRSDE, JRNL, REMARK, DBREF, SEQADV, SEQRES, MODRES, HET, HETNAM, HETSYN,
|
|
848 FORMUL, HELIX, SHEET, TURN, SSBOND, LINK, HYDBND, SLTBRG, CISPEP, SITE, CRYST1, ORIGX1, ORIGX2, ORIGX3,
|
|
849 SCALE1, SCALE2, SCALE3, MTRIX1, MTRIX2, MTRIX3, TVECT, MODEL, ATOM, SIGATM, ANISOU, SIGUIJ, TER,
|
|
850 HETATM, ENDMDL, CONECT, MASTER, END>.
|
|
851
|
|
852 =item B<--chains>
|
|
853
|
|
854 Count number of chains.
|
|
855
|
|
856 =item B<-d, --detail> I<infolevel>
|
|
857
|
|
858 Level of information to print about PDB during various options. Default: I<1>.
|
|
859 Possible values: I<1, 2 or 3>.
|
|
860
|
|
861 =item B<-e, --experiment>
|
|
862
|
|
863 List experimental technique information along with any applicable resolution.
|
|
864
|
|
865 =item B<-f, --frequency>
|
|
866
|
|
867 List distribution of residues: report count and percent of residues in individual chains and
|
|
868 across all the chains, or for all the residues in the file. The value of option B<--residuesmode>
|
|
869 determines how residues are counted and what is listed. The list is sorted by frequency in
|
|
870 descending order. By default, only residue count values are reported. To list percent distribution
|
|
871 of residues, specify B<-d, --detail> value of I<2> or higher.
|
|
872
|
|
873 =item B<-h, --help>
|
|
874
|
|
875 Print this help message.
|
|
876
|
|
877 =item B<--header>
|
|
878
|
|
879 List header information.
|
|
880
|
|
881 =item B<-m, --MasterCheck>
|
|
882
|
|
883 Check master record by explicitly counting the number of REMARK, HET, HELIX, SHEET, TURN, SITE,
|
|
884 ORIGX, SCALE, MTRIX, ATOM, HETATM, TER, CONECT and SEQRES records and comparing their
|
|
885 values against contents of master record.
|
|
886
|
|
887 =item B<--residues>
|
|
888
|
|
889 Count residues in I<PDBFile(s)>. This is also B<default behavior>.
|
|
890
|
|
891 By default, only residue count values are reported. To list percent distribution of residues,
|
|
892 specify B<-d, --detail> value of I<2> or higher.
|
|
893
|
|
894 =item B<--ResiduesMode> <InChains | All | Both>
|
|
895
|
|
896 Specify how to count residues in I<PDBFile(s)>: Count residues in each chain and across all the chains,
|
|
897 list count of all the residues in the file, or list both. Possible values: I<InChains, All, or Both>.
|
|
898 Default: I<Both>.
|
|
899
|
|
900 =item B<--ResidueNumbers>
|
|
901
|
|
902 List information about ATOM residue numbers in each chain before TER record: start and end residue
|
|
903 number; gaps in residue numbers corresponding to non-sequential residue numbers; residue
|
|
904 numbers not in ascending order.
|
|
905
|
|
906 =item B<-w, --WorkingDir> I<dirname>
|
|
907
|
|
908 Location of working directory. Default: current directory.
|
|
909
|
|
910 =back
|
|
911
|
|
912 =head1 EXAMPLES
|
|
913
|
|
914 To list total number of records and number of chain(s) residues in PDB files, type:
|
|
915
|
|
916 % InfoPDBFiles.pl Sample1.pdb
|
|
917 % InfoPDBFiles.pl Sample2.pdb
|
|
918
|
|
919 To list all available information for PDB file Sample2.pdb, type:
|
|
920
|
|
921 % InfoPDBFiles.pl -a Sample2.pdb
|
|
922
|
|
923 To list all available information for PDB file Sample2.pdb with all available details, type:
|
|
924
|
|
925 % InfoPDBFiles.pl -a -d 3 Sample2.pdb
|
|
926
|
|
927 To count ATOM and HETATM records in Sample2.pdb file, type:
|
|
928
|
|
929 % InfoPDBFiles.pl -c "ATOM,HETATM" Sample2.pdb
|
|
930
|
|
931 To list distribution of residues in chains across the whole PDB file Sample2.pdb along with
|
|
932 percent distribution, type:
|
|
933
|
|
934 % InfoPDBFiles.pl --frequency -d 2 Sample2.pdb
|
|
935
|
|
936 To list distribution of residues only across chains in PDB file Sample2.pdb along with
|
|
937 percent distribution, type:
|
|
938
|
|
939 % InfoPDBFiles.pl --frequency -d 2 --ResiduesMode InChains Sample2.pdb
|
|
940
|
|
941 To list min/max coordinates of the bounding box which encompasses the structure in Sample1.pdb
|
|
942 file, type:
|
|
943
|
|
944 % InfoPDBFiles.pl -b Sample1.pdb
|
|
945
|
|
946 =head1 AUTHOR
|
|
947
|
|
948 Manish Sud <msud@san.rr.com>
|
|
949
|
|
950 =head1 SEE ALSO
|
|
951
|
|
952 ExtractFromPDBFiles.pl, InfoAminoAcids.pl, InfoNucleicAcids.pl, InfoSequenceFiles.pl, ModifyPDBFiles.pl
|
|
953
|
|
954 =head1 COPYRIGHT
|
|
955
|
|
956 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
957
|
|
958 This file is part of MayaChemTools.
|
|
959
|
|
960 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
961 the terms of the GNU Lesser General Public License as published by the Free
|
|
962 Software Foundation; either version 3 of the License, or (at your option)
|
|
963 any later version.
|
|
964
|
|
965 =cut
|