comparison mayachemtools/bin/ExtractFromPDBFiles.pl @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: ExtractFromPDBFiles.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.39 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use PDBFileUtil;
38 use AminoAcids;
39 use SequenceFileUtil;
40
41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
42
43 # Autoflush STDOUT
44 $| = 1;
45
46 # Starting message...
47 $ScriptName = basename($0);
48 print "\n$ScriptName: Starting...\n\n";
49 $StartTime = new Benchmark;
50
51 # Get the options and setup script...
52 SetupScriptUsage();
53 if ($Options{help} || @ARGV < 1) {
54 die GetUsageFromPod("$FindBin::Bin/$ScriptName");
55 }
56
57 my(@PDBFilesList);
58 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
59
60 # Process options...
61 print "Processing options...\n";
62 my(%OptionsInfo);
63 ProcessOptions();
64
65 # Setup information about input files...
66 print "Checking input PDB file(s)...\n";
67 my(%PDBFilesInfo);
68 RetrievePDBFilesInfo();
69
70 # Process input files..
71 my($FileIndex);
72 if (@PDBFilesList > 1) {
73 print "\nProcessing PDB files...\n";
74 }
75 for $FileIndex (0 .. $#PDBFilesList) {
76 if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
77 print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
78 ExtractFromPDBFiles($FileIndex);
79 }
80 }
81 print "\n$ScriptName:Done...\n\n";
82
83 $EndTime = new Benchmark;
84 $TotalTime = timediff ($EndTime, $StartTime);
85 print "Total time: ", timestr($TotalTime), "\n";
86
87 ###############################################################################
88
89 # Extract appropriate information...
90 sub ExtractFromPDBFiles {
91 my($FileIndex) = @_;
92 my($PDBFile, $PDBRecordLinesRef);
93
94 # Get PDB data...
95 $PDBFile = $PDBFilesList[$FileIndex];
96 $PDBRecordLinesRef = ReadPDBFile($PDBFile);
97
98 if ($OptionsInfo{Mode} =~ /Chains/i) {
99 ExtractChains($FileIndex, $PDBRecordLinesRef);
100 }
101 elsif ($OptionsInfo{Mode} =~ /Sequences/i) {
102 ExtractSequences($FileIndex, $PDBRecordLinesRef);
103 }
104 elsif ($OptionsInfo{Mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i) {
105 ExtractByAtoms($FileIndex, $PDBRecordLinesRef);
106 }
107 elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) {
108 ExtractByResidues($FileIndex, $PDBRecordLinesRef);
109 }
110 elsif ($OptionsInfo{Mode} =~ /Distance/i) {
111 ExtractByDistance($FileIndex, $PDBRecordLinesRef);
112 }
113 elsif ($OptionsInfo{Mode} =~ /NonWater/i) {
114 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef);
115 }
116 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) {
117 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef);
118 }
119 }
120
121 # Extract chains and generate new PDB files...
122 #
123 sub ExtractChains {
124 my($FileIndex, $PDBRecordLinesRef) = @_;
125 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap);
126
127 # Get chains and residues data...
128 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1);
129
130 if ($OptionsInfo{CombineChains}) {
131 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
132 print "Generating PDBFileName file $PDBFileName...\n";
133
134 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
135
136 # Write out header and other older recors...
137 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
138 }
139
140 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
141 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
142 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
143
144 if (!$OptionsInfo{CombineChains}) {
145 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
146 print "Generating PDBFileName file $PDBFileName...\n";
147
148 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
149
150 # Write out header and other older recors...
151 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
152 }
153
154 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers
155 # for writing out appropriate CONECT records...
156 %ChainAtomNumbersMap = ();
157 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) {
158 print OUTFILE "$RecordLine\n";
159 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine);
160 $AtomNumber = int $AtomNumber;
161 $ChainAtomNumbersMap{$AtomNumber} = $AtomName;
162 }
163 # Write out TER record using information from last chain record...
164 $AtomNumber += 1;
165 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n";
166
167 # Write out CONECT records...
168 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap);
169
170 for $RecordLine (@{$ConectRecordLinesRef}) {
171 print OUTFILE "$RecordLine\n";
172 }
173
174 if (!$OptionsInfo{CombineChains}) {
175 # Write out END record...
176 print OUTFILE GenerateEndRecordLine(), "\n";
177
178 close OUTFILE;
179 }
180 }
181
182 if ($OptionsInfo{CombineChains}) {
183 # Write out END record...
184 print OUTFILE GenerateEndRecordLine(), "\n";
185
186 close OUTFILE;
187 }
188
189 }
190
191 # Extract sequences for individual chains or combine all the chains...
192 sub ExtractSequences {
193 my($FileIndex, $PDBRecordLinesRef) = @_;
194 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap);
195
196 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
197 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
198 }
199 else {
200 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
201 }
202
203 # Generate sequence data for all the chains...
204 %ChainSequencesDataMap = ();
205 @{$ChainSequencesDataMap{IDs}} = ();
206 %{$ChainSequencesDataMap{Sequence}} = ();
207 %{$ChainSequencesDataMap{Description}} = ();
208
209 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
210 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
211 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
212
213 # Setup sequence ID...
214 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex];
215 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID;
216 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID;
217
218 # Collect sequence data for the chain...
219 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) {
220 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
221 }
222 else {
223 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
224 }
225 # Setup sequence data...
226 $ChainSequence = '';
227 RESIDUE: for $Residue (@{$ChainResiduesRef}) {
228 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue);
229 if (!$StandardResidue) {
230 if ($OptionsInfo{KeepNonStandardSequences}) {
231 $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
232 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n";
233 }
234 else {
235 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n";
236 next RESIDUE;
237 }
238 }
239 $ChainSequence .= $ResidueCode;
240 }
241 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence;
242
243 }
244
245 # Write out the sequence files...
246 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap );
247 if ($OptionsInfo{CombineChainSequences}) {
248 # Combine all the chain sequences...
249 $Sequence = '';
250 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
251 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
252
253 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID};
254 }
255 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";;
256 $SequenceDescription = $SequenceID;
257 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
258
259 print "Generating sequence file $SequenceFileName...\n";
260 %SequencesDataMap = ();
261 @{$SequencesDataMap{IDs}} = ();
262 %{$SequencesDataMap{Sequence}} = ();
263 %{$SequencesDataMap{Description}} = ();
264
265 push @{$SequencesDataMap{IDs}}, $SequenceID;
266 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
267 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
268
269 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
270 }
271 else {
272 # For each specifed chain, write out the sequences...
273 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
274 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
275
276 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
277
278 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex];
279 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID};
280 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID};
281
282 print "Generating sequence file $SequenceFileName...\n";
283 %SequencesDataMap = ();
284 @{$SequencesDataMap{IDs}} = ();
285 %{$SequencesDataMap{Sequence}} = ();
286 %{$SequencesDataMap{Description}} = ();
287
288 push @{$SequencesDataMap{IDs}}, $SequenceID;
289 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
290 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
291
292 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
293 }
294 }
295 }
296
297 # Extract atoms...
298 sub ExtractByAtoms {
299 my($FileIndex, $PDBRecordLinesRef) = @_;
300 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap);
301
302 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
303 print "Generating PDBFileName file $PDBFileName...\n";
304 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
305
306 # Write out header and other older recors...
307 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
308
309 # Write out all ATOM records along with TER and model records to indicate
310 # chains and multiple models..
311 %AtomNumbersMap = ();
312 $ChainRecordCount = 0;
313 for $RecordLine (@{$PDBRecordLinesRef}) {
314 if (CheckRecordType($RecordLine)) {
315 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine);
316
317 # Check atoms...
318 $IgnoreRecord = 1;
319 if ($OptionsInfo{Mode} =~ /^Atoms$/i) {
320 $IgnoreRecord = 0;
321 }
322 elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) {
323 if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) {
324 $IgnoreRecord = 0;
325 }
326 }
327 elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) {
328 if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) {
329 $IgnoreRecord = 0;
330 }
331 }
332 elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) {
333 if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) {
334 $IgnoreRecord = 0;
335 }
336 }
337
338 if (!$IgnoreRecord) {
339 $ChainRecordCount++;
340 print OUTFILE "$RecordLine\n";
341
342 $AtomNumber = int $AtomNumber;
343 $AtomNumbersMap{$AtomNumber} = $AtomName;
344 }
345 }
346 elsif (IsTerRecordType($RecordLine)) {
347 if ($ChainRecordCount) {
348 print OUTFILE GenerateTerRecordLine(), "\n";
349 }
350 $ChainRecordCount = 0;
351 }
352 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
353 print OUTFILE "$RecordLine\n";
354 }
355 }
356
357 # Write out appropriate CONECT records...
358 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
359 for $RecordLine (@{$ConectRecordLinesRef}) {
360 print OUTFILE "$RecordLine\n";
361 }
362
363 # Write out END record...
364 print OUTFILE GenerateEndRecordLine(), "\n";
365
366 close OUTFILE;
367 }
368
369 # Extract residues...
370 sub ExtractByResidues {
371 my($FileIndex, $PDBRecordLinesRef) = @_;
372 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap);
373
374 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
375 print "Generating PDBFileName file $PDBFileName...\n";
376 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
377
378 # Write out header and other older recors...
379 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
380
381 # Write out all ATOM records for specified residues with TER and model records to indicate
382 # chains and multiple models...
383 %AtomNumbersMap = ();
384 $ChainRecordCount = 0;
385 for $RecordLine (@{$PDBRecordLinesRef}) {
386 if (CheckRecordType($RecordLine)) {
387 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine);
388
389 # Check residues...
390 $IgnoreRecord = 1;
391 if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) {
392 if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) {
393 $IgnoreRecord = 0;
394 }
395 }
396 elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
397 if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) {
398 $IgnoreRecord = 0;
399 }
400 }
401 elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
402 if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) {
403 $IgnoreRecord = 0;
404 }
405 }
406 if (!$IgnoreRecord) {
407 $ChainRecordCount++;
408 print OUTFILE "$RecordLine\n";
409 $AtomNumber = int $AtomNumber;
410 $AtomNumbersMap{$AtomNumber} = $AtomName;
411 }
412 }
413 elsif (IsTerRecordType($RecordLine)) {
414 if ($ChainRecordCount) {
415 print OUTFILE GenerateTerRecordLine(), "\n";
416 }
417 $ChainRecordCount = 0;
418 }
419 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
420 print OUTFILE "$RecordLine\n";
421 }
422 }
423
424 # Write out appropriate CONECT records...
425 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
426 for $RecordLine (@{$ConectRecordLinesRef}) {
427 print OUTFILE "$RecordLine\n";
428 }
429 # Write out END record...
430 print OUTFILE GenerateEndRecordLine(), "\n";
431
432 close OUTFILE;
433 }
434
435 # Extract non water records...
436 sub ExtractNonWaterRecords {
437 my($FileIndex, $PDBRecordLinesRef) = @_;
438 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap);
439
440 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
441 print "Generating PDBFileName file $PDBFileName...\n";
442 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
443
444 # Write out header and other older recors...
445 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
446
447 # Write out all ATOM/HETATM non water records along with TER and model records to indicate
448 # chains and multiple models..
449 %AtomNumbersMap = ();
450 $ChainRecordCount = 0;
451 for $RecordLine (@{$PDBRecordLinesRef}) {
452 if (CheckRecordType($RecordLine)) {
453 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine);
454 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) {
455 $ChainRecordCount++;
456 print OUTFILE "$RecordLine\n";
457 $AtomNumber = int $AtomNumber;
458 $AtomNumbersMap{$AtomNumber} = $AtomName;
459 }
460 }
461 elsif (IsTerRecordType($RecordLine)) {
462 if ($ChainRecordCount) {
463 print OUTFILE GenerateTerRecordLine(), "\n";
464 }
465 $ChainRecordCount = 0;
466 }
467 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
468 print OUTFILE "$RecordLine\n";
469 }
470 }
471
472 # Write out appropriate CONECT records...
473 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
474 for $RecordLine (@{$ConectRecordLinesRef}) {
475 print OUTFILE "$RecordLine\n";
476 }
477 # Write out END record...
478 print OUTFILE GenerateEndRecordLine(), "\n";
479
480 close OUTFILE;
481 }
482
483 # Extract non hydrogen records...
484 sub ExtractNonHydrogenRecords {
485 my($FileIndex, $PDBRecordLinesRef) = @_;
486 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap);
487
488 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
489 print "Generating PDBFileName file $PDBFileName...\n";
490 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
491
492 # Write out header and other older recors...
493 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
494
495 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate
496 # chains and multiple models..
497 %AtomNumbersMap = ();
498 $ChainRecordCount = 0;
499 for $RecordLine (@{$PDBRecordLinesRef}) {
500 if (CheckRecordType($RecordLine)) {
501 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
502 if ($ElementSymbol !~ /^H$/i) {
503 $ChainRecordCount++;
504 print OUTFILE "$RecordLine\n";
505 $AtomNumber = int $AtomNumber;
506 $AtomNumbersMap{$AtomNumber} = $AtomName;
507 }
508 }
509 elsif (IsTerRecordType($RecordLine)) {
510 if ($ChainRecordCount) {
511 print OUTFILE GenerateTerRecordLine(), "\n";
512 }
513 $ChainRecordCount = 0;
514 }
515 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
516 print OUTFILE "$RecordLine\n";
517 }
518 }
519
520 # Write out appropriate CONECT records...
521 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
522 for $RecordLine (@{$ConectRecordLinesRef}) {
523 print OUTFILE "$RecordLine\n";
524 }
525 # Write out END record...
526 print OUTFILE GenerateEndRecordLine(), "\n";
527
528 close OUTFILE;
529 }
530
531 # Extract ATOM/HETATM records by distance...
532 sub ExtractByDistance {
533 my($FileIndex, $PDBRecordLinesRef) = @_;
534 my($PDBFileName, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap);
535
536 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
537 print "Generating PDBFileName file $PDBFileName...\n";
538 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
539
540 # Write out header and other older recors...
541 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
542
543 # Setup coordinates of origin to calculate distance...
544 @OriginCoords = ();
545 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]};
546
547 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate
548 # chains and multiple models...
549 %AtomNumbersMap = ();
550
551 %ResiduesDataMap = ();
552 %{$ResiduesDataMap{ID}} = ();
553 %{$ResiduesDataMap{Status}} = ();
554
555 $ChainRecordCount = 0;
556 $RecordLineNum = 0;
557
558 for $RecordLine (@{$PDBRecordLinesRef}) {
559 $RecordLineNum++;
560 if (CheckRecordType($RecordLine)) {
561 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
562 @Coords = (); push @Coords, ($X, $Y, $Z);
563
564 $IgnoreRecord = 1;
565 if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) {
566 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";
567 if (exists $ResiduesDataMap{ID}{$ResidueID}) {
568 # Residue data has been processed; check its selection status...
569 if ($ResiduesDataMap{Status}{$ResidueID}) {
570 $IgnoreRecord = 0;
571 }
572 }
573 else {
574 # Residue hasn't been processed...
575 $ResiduesDataMap{ID}{$ResidueID} = $ResidueID;
576 $ResiduesDataMap{Status}{$ResidueID} = 0;
577 if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) {
578 $IgnoreRecord = 0;
579 $ResiduesDataMap{Status}{$ResidueID} = 1;
580 }
581 }
582 }
583 elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) {
584 if (CheckDistance(\@Coords, \@OriginCoords)) {
585 $IgnoreRecord = 0;
586 }
587 }
588
589 if (!$IgnoreRecord) {
590 $ChainRecordCount++;
591 print OUTFILE "$RecordLine\n";
592 $AtomNumber = int $AtomNumber;
593 $AtomNumbersMap{$AtomNumber} = $AtomName;
594 }
595 }
596 elsif (IsTerRecordType($RecordLine)) {
597 if ($ChainRecordCount) {
598 print OUTFILE GenerateTerRecordLine(), "\n";
599 }
600 $ChainRecordCount = 0;
601 }
602 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
603 print OUTFILE "$RecordLine\n";
604 }
605 }
606
607 # Write out appropriate CONECT records...
608 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
609 for $RecordLine (@{$ConectRecordLinesRef}) {
610 print OUTFILE "$RecordLine\n";
611 }
612
613 # Write out END record...
614 print OUTFILE GenerateEndRecordLine(), "\n";
615
616 close OUTFILE;
617 }
618
619 # Does record type correspond to the specified record type?
620 sub CheckRecordType {
621 my($RecordLine) = @_;
622 my($Status);
623
624 $Status = 0;
625 if ($OptionsInfo{RecordMode} =~ /^Atom$/i) {
626 $Status = IsAtomRecordType($RecordLine) ? 1 : 0;
627 }
628 elsif ($OptionsInfo{RecordMode} =~ /^Hetatm$/i) {
629 $Status = IsHetatmRecordType($RecordLine) ? 1 : 0;
630 }
631 elsif ($OptionsInfo{RecordMode} =~ /^AtomAndHetatm$/i) {
632 $Status = (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0;
633 }
634
635 return $Status;
636 }
637
638 # Does record meets distance citerion specified by the user?
639 sub CheckResidueDistance {
640 my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_;
641 my($Status, $RecordLine, $RecordLineIndex, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $ResidueID, @Coords);
642
643 $Status = 0;
644
645 RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) {
646 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
647 if (!CheckRecordType($RecordLine)) {
648 next RECORDLINE;
649 }
650 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
651 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";
652
653 if ($ResidueID !~ /^$SpecifiedResidueID$/i) {
654 # It's a new residue line...
655 last RECORDLINE;
656 }
657
658 # Check distance...
659 @Coords = (); push @Coords, ($X, $Y, $Z);
660 if (CheckDistance(\@Coords, $OriginCoordsRef)) {
661 # Distance criterion is met for at least one record in the residue...
662 $Status = 1;
663 last RECORDLINE;
664 }
665 }
666 return $Status;
667 }
668
669 # Does record meets distance citerion specified by the user?
670 sub CheckDistance {
671 my($CoordsRef, $OriginCoordsRef) = @_;
672 my($Status, $Index, $Distance, $DistanceSquare);
673
674 $Status = 0;
675
676 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
677 # Go over coordinates of all the atoms in the residue...
678 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef};
679 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) {
680 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2;
681 $Distance = sqrt $DistanceSquare;
682 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) {
683 $Status = 1;
684 last INDEX;
685 }
686 }
687 }
688 else {
689 $DistanceSquare = 0;
690 for $Index (0 .. 2) {
691 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2;
692 }
693 $Distance = sqrt $DistanceSquare;
694 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0;
695 }
696
697 return $Status;
698 }
699
700 # Write out modifed header and other older records...
701 sub WriteHeaderAndOlderRecords {
702 my($OutFileRef, $PDBRecordLinesRef) = @_;
703
704 if ($OptionsInfo{ModifyHeaderRecord}) {
705 # Write out modified HEADER record...
706 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
707 $Classification = 'Data extracted using MayaChemTools';
708 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n";
709 }
710 else {
711 print $OutFileRef $PDBRecordLinesRef->[0], "\n";
712 }
713
714 # Write out any old records...
715 if ($OptionsInfo{KeepOldRecords}) {
716 my($RecordLineIndex, $RecordLine);
717 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file...
718 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) {
719 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
720 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
721 last RECORDLINE;
722 }
723 print $OutFileRef "$RecordLine\n";
724 }
725 }
726 }
727
728 # Get header record information assuming it's the first record...
729 sub GetHeaderRecordInformation {
730 my($PDBRecordLinesRef) = @_;
731 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine);
732
733 ($Classification, $DepositionDate, $IDCode) = ('') x 3;
734 $HeaderRecordLine = $PDBRecordLinesRef->[0];
735 if (IsHeaderRecordType($HeaderRecordLine)) {
736 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
737 }
738 return ($Classification, $DepositionDate, $IDCode);
739 }
740
741 # Get one letter residue code...
742 sub GetResidueCode {
743 my($ResidueName) = @_;
744 my($ResidueCode, $StandardResidue);
745
746 $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
747 $StandardResidue = 0;
748
749 if (length($ResidueName) == 3) {
750 # Assume it's an amino acid...
751 if (AminoAcids::IsAminoAcid($ResidueName)) {
752 # Standard amino acid...
753 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName);
754 $StandardResidue = 1;
755 }
756 }
757 elsif (length($ResidueName) == 1) {
758 # Assume it's a nucleic acid...
759 if ($ResidueName =~ /^(A|G|T|U|C)$/i) {
760 $ResidueCode = $ResidueName;
761 $StandardResidue = 1;
762 }
763 }
764
765 return ($ResidueCode, $StandardResidue);
766 }
767
768 # Process option values...
769 sub ProcessOptions {
770 %OptionsInfo = ();
771 $OptionsInfo{Mode} = $Options{mode};
772
773 my(@SpecifiedChains) = ();
774 if ($Options{chains} =~ /^(First|All)$/i) {
775 $OptionsInfo{ChainsToExtract} = $Options{chains};
776 }
777 else {
778 @SpecifiedChains = split /\,/, $Options{chains};
779 $OptionsInfo{ChainsToExtract} = 'Specified';
780 }
781 @{$OptionsInfo{SpecifiedChains}} = ();
782 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains;
783
784 $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
785
786 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
787
788 ProcessResiduesOptions();
789 ProcessAtomsOptions();
790 ProcessDistanceOptions();
791
792 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames};
793 @{$OptionsInfo{SpecifiedWaterResiduesList}} = ();
794 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = ();
795
796 my(@SpecifiedWaterResiduesList);
797 @SpecifiedWaterResiduesList = ();
798
799 if ($OptionsInfo{Mode} =~ /^NonWater$/i) {
800 my($WaterResidueName);
801 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) {
802 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O');
803 }
804 else {
805 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames};
806 }
807 for $WaterResidueName (@SpecifiedWaterResiduesList) {
808 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName;
809 }
810 }
811 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList;
812
813 $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm");
814
815 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0;
816
817 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0;
818
819 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0;
820 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode};
821 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
822 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords};
823 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix};
824
825 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
826 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
827 }
828
829 # Process specified residue options...
830 sub ProcessResiduesOptions {
831 my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList);
832
833 @SpecifiedResidueNumsList = ();
834 ($StartResidueNum, $EndResNum) = (0, 0);
835
836 @SpecifiedResidueNamesList = ();
837
838 if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) {
839 if (!$Options{residues}) {
840 die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n";
841 }
842 $OptionsInfo{Residues} = $Options{residues};
843 $OptionsInfo{Residues} =~ s/ //g;
844
845 if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
846 @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues};
847 }
848 else {
849 @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues};
850 for $ResidueNum (@SpecifiedResidueNumsList) {
851 if (!IsPositiveInteger($ResidueNum)) {
852 die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n";
853 }
854 }
855 if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
856 if (@SpecifiedResidueNumsList != 2) {
857 die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n";
858 }
859 if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) {
860 die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than end residue number.\n";
861 }
862 ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList;
863 }
864 }
865 }
866
867 @{$OptionsInfo{SpecifiedResidueNumsList}} = ();
868 push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList;
869
870 $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum;
871 $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum;
872
873 @{$OptionsInfo{SpecifiedResidueNamesList}} = ();
874 push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList;
875
876 # Set up a specified residue numbers map...
877 %{$OptionsInfo{SpecifiedResidueNumsMap}} = ();
878 for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) {
879 $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum;
880 }
881
882 # Set up a specified residue names map...
883 %{$OptionsInfo{SpecifiedResidueNamesMap}} = ();
884 for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) {
885 $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName;
886 }
887
888 }
889
890 # Process specified atom options...
891 sub ProcessAtomsOptions {
892 my($AtomNum, $StartAtomNum, $EndAtomNum, $AtomName, @SpecifiedAtomNumsList, @SpecifiedAtomNamesList);
893
894 @SpecifiedAtomNumsList = ();
895 ($StartAtomNum, $EndAtomNum) = (0, 0);
896
897 @SpecifiedAtomNamesList = ();
898
899 if ($OptionsInfo{Mode} =~ /^(AtomNums|AtomsRange|AtomNames)$/i) {
900 if (!$Options{atoms}) {
901 die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n";
902 }
903 $OptionsInfo{Atoms} = $Options{atoms};
904 $OptionsInfo{Atoms} =~ s/ //g;
905
906 if ($OptionsInfo{Mode} =~ /^AtomNames$/i) {
907 @SpecifiedAtomNamesList = split /\,/, $OptionsInfo{Atoms};
908 }
909 else {
910 @SpecifiedAtomNumsList = split /\,/, $OptionsInfo{Atoms};
911 for $AtomNum (@SpecifiedAtomNumsList) {
912 if (!IsPositiveInteger($AtomNum)) {
913 die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n";
914 }
915 }
916 if ($OptionsInfo{Mode} =~ /^AtomsRange$/i) {
917 if (@SpecifiedAtomNumsList != 2) {
918 die "Error: Invalid number of atom number values, ", scalar(@SpecifiedAtomNumsList), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n";
919 }
920 if ($SpecifiedAtomNumsList[0] > $SpecifiedAtomNumsList[1]) {
921 die "Error: Invalid atom number values, @SpecifiedAtomNumsList, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n";
922 }
923 ($StartAtomNum, $EndAtomNum) = @SpecifiedAtomNumsList;
924 }
925 }
926 }
927 elsif ($OptionsInfo{Mode} =~ /^CAlphas$/i) {
928 @SpecifiedAtomNamesList = ("CA");
929 }
930
931 @{$OptionsInfo{SpecifiedAtomNumsList}} = ();
932 push @{$OptionsInfo{SpecifiedAtomNumsList}}, @SpecifiedAtomNumsList;
933
934 $OptionsInfo{SpecifiedStartAtomNum} = $StartAtomNum;
935 $OptionsInfo{SpecifiedEndAtomNum} = $EndAtomNum;
936
937 @{$OptionsInfo{SpecifiedAtomNamesList}} = ();
938 push @{$OptionsInfo{SpecifiedAtomNamesList}}, @SpecifiedAtomNamesList;
939
940 # Set up a specified residue numbers map...
941 %{$OptionsInfo{SpecifiedAtomNumsMap}} = ();
942 for $AtomNum (@{$OptionsInfo{SpecifiedAtomNumsList}}) {
943 $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNum} = $AtomNum;
944 }
945
946 # Set up a specified residue names map...
947 %{$OptionsInfo{SpecifiedAtomNamesMap}} = ();
948 for $AtomName (@{$OptionsInfo{SpecifiedAtomNamesList}}) {
949 $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName} = lc $AtomName;
950 }
951
952 }
953
954 # Process specified distance options...
955 sub ProcessDistanceOptions {
956 my(@SpecifiedDistanceOrigin) = ();
957
958 $OptionsInfo{MaxExtractionDistance} = $Options{distance};
959 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode};
960 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : '';
961 $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode};
962
963 if ($OptionsInfo{Mode} =~ /^Distance$/i) {
964 if (!$Options{distanceorigin}) {
965 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n";
966 }
967 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin};
968 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
969 if (@SpecifiedDistanceOrigin != 2) {
970 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n";
971 }
972 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
973 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n";
974 }
975 }
976 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
977 if (@SpecifiedDistanceOrigin != 2) {
978 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n";
979 }
980 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
981 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n";
982 }
983 }
984 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
985 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) {
986 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n";
987 }
988 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
989 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n";
990 }
991 }
992 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
993 if (@SpecifiedDistanceOrigin != 3) {
994 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n";
995 }
996 my($Value);
997 for $Value (@SpecifiedDistanceOrigin) {
998 if (!IsNumerical($Value)) {
999 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n";
1000 }
1001 }
1002 }
1003 }
1004 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = ();
1005 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin;
1006
1007 }
1008
1009 # Retrieve information about PDB files...
1010 sub RetrievePDBFilesInfo {
1011 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix);
1012
1013 %PDBFilesInfo = ();
1014 @{$PDBFilesInfo{FileOkay}} = ();
1015 @{$PDBFilesInfo{OutFileRoot}} = ();
1016 @{$PDBFilesInfo{OutFileNames}} = ();
1017 @{$PDBFilesInfo{ChainLabels}} = ();
1018 @{$PDBFilesInfo{ChainSequenceIDs}} = ();
1019 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = ();
1020 @{$PDBFilesInfo{SpecifiedChains}} = ();
1021 @{$PDBFilesInfo{DistanceOrigin}} = ();
1022
1023 FILELIST: for $Index (0 .. $#PDBFilesList) {
1024 $PDBFilesInfo{FileOkay}[$Index] = 0;
1025
1026 $PDBFilesInfo{OutFileRoot}[$Index] = '';
1027 @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
1028 @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
1029 @{$PDBFilesInfo{ChainLabels}[$Index]} = ();
1030 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = ();
1031 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = ();
1032 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = ();
1033 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = ();
1034
1035 $PDBFile = $PDBFilesList[$Index];
1036 if (!(-e $PDBFile)) {
1037 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
1038 next FILELIST;
1039 }
1040 if (!CheckFileType($PDBFile, "pdb")) {
1041 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
1042 next FILELIST;
1043 }
1044 if (! open PDBFILE, "$PDBFile") {
1045 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
1046 next FILELIST;
1047 }
1048 close PDBFILE;
1049
1050 # Get PDB data...
1051 $PDBRecordLinesRef = ReadPDBFile($PDBFile);
1052 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
1053 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
1054 }
1055 else {
1056 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
1057 }
1058 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
1059 warn "Warning: Ignoring file $PDBFile: No chains found \n";
1060 next FILELIST;
1061 }
1062
1063 # Make sure specified chains exist in PDB file...
1064 @SpecifiedChains = ();
1065 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) {
1066 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) {
1067 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) {
1068 push @SpecifiedChains, $ChainID;
1069 }
1070 else {
1071 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n";
1072 next FILELIST;
1073 }
1074 }
1075 }
1076 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) {
1077 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0];
1078 }
1079 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) {
1080 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}};
1081 }
1082 # Setup chain labels to use for sequence IDs and generating output files...
1083 @ChainLabels = ();
1084 for $ChainID (@SpecifiedChains) {
1085 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig;
1086 $ChainLabel = "Chain${ChainLabel}";
1087 push @ChainLabels, $ChainLabel;
1088 }
1089
1090 # Make sure specified distance origin is valid...
1091 @DistanceOrigin = ();
1092 if ($OptionsInfo{Mode} =~ /^Distance$/i) {
1093 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) {
1094 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
1095 $RecordType = $OptionsInfo{ExtractionDistanceMode};
1096 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
1097 $RecordFound = 0;
1098 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
1099 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
1100 next LINE;
1101 }
1102 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
1103 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName);
1104 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) {
1105 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) {
1106 $RecordFound = 1;
1107 last LINE;
1108 }
1109 }
1110 }
1111 if (!$RecordFound) {
1112 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
1113 next FILELIST;
1114 }
1115 push @DistanceOrigin, ($X, $Y, $Z);
1116 }
1117 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
1118 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
1119 $SpecifiedChainID = '';
1120 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) {
1121 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
1122 }
1123 else {
1124 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
1125 }
1126 $RecordFound = 0;
1127 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
1128 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
1129 next LINE;
1130 }
1131 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
1132 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName);
1133 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID);
1134 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) {
1135 next LINE;
1136 }
1137 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) {
1138 # Store coordinates for all the atoms...
1139 $RecordFound = 1;
1140 push @DistanceOrigin, ($X, $Y, $Z);
1141 next LINE;
1142 }
1143 }
1144 if (!$RecordFound) {
1145 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
1146 next FILELIST;
1147 }
1148 }
1149 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
1150 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
1151 }
1152 }
1153 # Setup output file names...
1154 @OutFileNames = ();
1155 $FileDir = ""; $FileName = ""; $FileExt = "";
1156 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile);
1157 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) {
1158 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
1159 if ($RootFileName && $RootFileExt) {
1160 $FileName = $RootFileName;
1161 }
1162 else {
1163 $FileName = $OptionsInfo{OutFileRoot};
1164 }
1165 $OutFileRoot = $FileName;
1166 }
1167 else {
1168 $OutFileRoot = $FileName;
1169 }
1170 $Mode = $OptionsInfo{Mode};
1171 if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) {
1172 $OutFileName = '';
1173 if ($Mode =~ /^CAlphas$/i) {
1174 $OutFileName = "${OutFileRoot}CAlphas.pdb";
1175 }
1176 elsif ($Mode =~ /^Atoms$/i) {
1177 $OutFileName = "${OutFileRoot}Atoms.pdb";
1178 }
1179 elsif ($Mode =~ /^AtomNums$/i) {
1180 $OutFileName = "${OutFileRoot}AtomNums.pdb";
1181 }
1182 elsif ($Mode =~ /^AtomsRange$/i) {
1183 $OutFileName = "${OutFileRoot}AtomsRange.pdb";
1184 }
1185 elsif ($Mode =~ /^AtomNames$/i) {
1186 $OutFileName = "${OutFileRoot}AtomNames.pdb";
1187 }
1188 elsif ($Mode =~ /^ResidueNums$/i) {
1189 $OutFileName = "${OutFileRoot}ResidueNums.pdb";
1190 }
1191 elsif ($Mode =~ /^ResiduesRange$/i) {
1192 $OutFileName = "${OutFileRoot}ResiduesRange.pdb";
1193 }
1194 elsif ($Mode =~ /^ResidueNames$/i) {
1195 $OutFileName = "${OutFileRoot}ResidueNames.pdb";
1196 }
1197 elsif ($Mode =~ /^NonWater$/i) {
1198 $OutFileName = "${OutFileRoot}NonWater.pdb";
1199 }
1200 elsif ($Mode =~ /^NonHydrogens$/i) {
1201 $OutFileName = "${OutFileRoot}NonHydrogens.pdb";
1202 }
1203 elsif ($Mode =~ /^Distance$/i) {
1204 my($DistanceMode) = '';
1205 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
1206 $DistanceMode = 'Atom';
1207 }
1208 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
1209 $DistanceMode = 'Hetatm';
1210 }
1211 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
1212 $DistanceMode = 'Residue';
1213 }
1214 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
1215 $DistanceMode = 'XYZ';
1216 }
1217 $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb";
1218 }
1219 push @OutFileNames, $OutFileName;
1220 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1221 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1222 next FILELIST;
1223 }
1224 }
1225 elsif ($Mode =~ /^(Chains|Sequences)$/i) {
1226 if ($OptionsInfo{CombineChainSequences}) {
1227 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta";
1228 push @OutFileNames, $OutFileName;
1229 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1230 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1231 next FILELIST;
1232 }
1233 }
1234 else {
1235 for $ChainLabel (@ChainLabels) {
1236 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta";
1237 push @OutFileNames, $OutFileName;
1238 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
1239 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
1240 next FILELIST;
1241 }
1242 }
1243 }
1244 }
1245 @ChainSequenceIDs = ();
1246 @ChainSequenceIDsPrefix = ();
1247 if ($Mode =~ /^Sequences$/i) {
1248 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix);
1249 ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
1250
1251 if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) {
1252 $IDPrefix = $FileName;
1253 }
1254 elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) {
1255 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : '';
1256 }
1257 else {
1258 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName;
1259 }
1260
1261 for $ChainLabel (@ChainLabels) {
1262 push @ChainSequenceIDsPrefix, $IDPrefix;
1263 push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB";
1264 }
1265 }
1266
1267 $PDBFilesInfo{FileOkay}[$Index] = 1;
1268 $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
1269
1270 push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames;
1271 push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels;
1272 push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix;
1273 push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs;
1274 push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains;
1275 push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin;
1276 }
1277 }
1278
1279
1280 # Setup script usage and retrieve command line arguments specified using various options...
1281 sub SetupScriptUsage {
1282
1283 # Retrieve all the options...
1284 %Options = ();
1285 $Options{chains} = 'First';
1286 $Options{combinechains} = 'no';
1287 $Options{distance} = 10.0;
1288 $Options{distancemode} = 'XYZ';
1289 $Options{distanceselectionmode} = 'ByAtom';
1290 $Options{keepoldrecords} = 'no';
1291 $Options{mode} = 'NonWater';
1292 $Options{modifyheader} = 'yes';
1293 $Options{nonstandardkeep} = 'yes';
1294 $Options{nonstandardcode} = 'X';
1295 $Options{sequencelength} = 80;
1296 $Options{sequenceidprefix} = 'Automatic';
1297 $Options{sequencerecords} = 'Atom';
1298 $Options{waterresiduenames} = 'Automatic';
1299
1300 if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) {
1301 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
1302 }
1303 if ($Options{workingdir}) {
1304 if (! -d $Options{workingdir}) {
1305 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
1306 }
1307 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
1308 }
1309 if ($Options{combinechains} !~ /^(yes|no)$/i) {
1310 die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n";
1311 }
1312 if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) {
1313 die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n";
1314 }
1315 if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) {
1316 die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n";
1317 }
1318 if ($Options{keepoldrecords} !~ /^(yes|no)$/i) {
1319 die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n";
1320 }
1321 if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) {
1322 die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n";
1323 }
1324 if ($Options{modifyheader} !~ /^(yes|no)$/i) {
1325 die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n";
1326 }
1327 if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) {
1328 die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n";
1329 }
1330 if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) {
1331 die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n";
1332 }
1333 if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) {
1334 die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n";
1335 }
1336 if (!IsPositiveInteger($Options{sequencelength})) {
1337 die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
1338 }
1339 if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) {
1340 die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n";
1341 }
1342 if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) {
1343 die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n";
1344 }
1345 }
1346
1347 __END__
1348
1349 =head1 NAME
1350
1351 ExtractFromPDBFiles.pl - Extract specific data from PDBFile(s)
1352
1353 =head1 SYNOPSIS
1354
1355 ExtractFromPDBFiles.pl PDBFile(s)...
1356
1357 ExtractFromPDBFiles.pl [B<-a, --Atoms> "AtomNum, [AtomNum...]" | "StartAtomNum, EndAtomNum" |
1358 "AtomName, [AtomName...]"] [B<-c, --chains> First | All | "ChainID, [ChainID,...]"]
1359 [<--CombineChains> yes | no] [B<-d, --distance> number] [B<--DistanceMode> Atom | Hetatm | Residue | XYZ]
1360 [B<--DistanceOrigin> "AtomNumber, AtomName" | "HetatmNumber, HetAtmName" | "ResidueNumber, ResidueName, [ChainID]" | "X,Y,Z">]
1361 [<--DistanceSelectionMode> ByAtom | ByResidue] [B<-h, --help>] [B<-k, --KeepOldRecords> yes | no]
1362 [B<-m, --mode > Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames |
1363 ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens]
1364 [B<--ModifyHeader> yes | no] [B<--NonStandardKeep> yes | no] [B<--NonStandardCode> character]
1365 [B<-o, --overwrite>] [B<-r, --root> rootname] B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>]
1366 [B<--Residues> "ResidueNum,[ResidueNum...]" | StartResidueNum,EndResiduNum ]
1367 [B<--SequenceLength> number] [B<--SequenceRecords> Atom | SeqRes]
1368 [B<--SequenceIDPrefix> FileName | HeaderRecord | Automatic]
1369 [B<--WaterResidueNames> Automatic | "ResidueName, [ResidueName,...]"]
1370 [B<-w, --WorkingDir> dirname] PDBFile(s)...
1371
1372 =head1 DESCRIPTION
1373
1374 Extract specific data from I<PDBFile(s)> and generate appropriate PDB or sequence file(s).
1375 Multiple PDBFile names are separated by spaces. The valid file extension is I<.pdb>.
1376 All other file name extensions are ignored during the wild card expansion. All the PDB files
1377 in a current directory can be specified either by I<*.pdb> or the current directory name.
1378
1379 During I<Chains> and I<Sequences> values of B<-m, --mode> option, all ATOM/HETAM records
1380 for chains after the first model in PDB fils containing data for multiple models are ignored.
1381
1382 =head1 OPTIONS
1383
1384 =over 4
1385
1386 =item B<-a, --Atoms> I<"AtomNum,[AtomNum...]" | "StartAtomNum,EndAtomNum" | "AtomName,[AtomName...]">
1387
1388 Specify which atom records to extract from I<PDBFiles(s)> during I<AtomNums>,
1389 I<AtomsRange>, and I<AtomNames> value of B<-m, --mode> option: extract records
1390 corresponding to atom numbers specified in a comma delimited list of atom numbers/names,
1391 or with in the range of start and end atom numbers. Possible values: I<"AtomNum[,AtomNum,..]">,
1392 I<StartAtomNum,EndAtomNum>, or I<"AtomName[,AtomName,..]">. Default: I<None>. Examples:
1393
1394 10
1395 15,20
1396 N,CA,C,O
1397
1398 =item B<-c, --chains> I<First | All | ChainID,[ChainID,...]>
1399
1400 Specify which chains to extract from I<PDBFile(s)> during I<Chains | Sequences> value of
1401 B<-m, --mode> option: first chain, all chains, or a specific list of comma delimited chain IDs.
1402 Possible values: I<First | All | ChainID,[ChainID,...]>. Default: I<First>. Examples:
1403
1404 A
1405 A,B
1406 All
1407
1408 =item B<--CombineChains> I<yes | no>
1409
1410 Specify whether to combine extracted chains data into a single file during I<Chains> or
1411 I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. Default: I<no>.
1412
1413 During I<Chains> value of <-m, --mode> option with I<Yes> value of <--CombineChains>,
1414 extracted data for specified chains is written into a single file instead of individual file for each
1415 chain.
1416
1417 During I<Sequences> value of <-m, --mode> option with I<Yes> value of <--CombineChains>,
1418 residues sequences for specified chains are extracted and concatenated into a single sequence
1419 file instead of individual file for each chain.
1420
1421 =item B<-d, --distance> I<number>
1422
1423 Specify distance used to extract ATOM/HETATM recods during I<Distance> value of
1424 B<-m, --mode> option. Default: I<10.0> angstroms.
1425
1426 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
1427 ATOM, HETATM or both.
1428
1429 =item B<--DistanceMode> I<Atom | Hetatm | Residue | XYZ>
1430
1431 Specify how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of
1432 B<-m, --mode> option: extract all the records within a certain distance specifed by B<-d, --distance>
1433 from an atom or hetro atom record, a residue, or any artbitrary point. Possible values: I<Atom |
1434 Hetatm | Residue | XYZ>. Default: I<XYZ>.
1435
1436 During I<Residue> value of B<--distancemode>, distance of ATOM/HETATM records is calculated from
1437 all the atoms in the residue and the records are selected as long as any atom of the residue lies with
1438 in the distace specified using B<-d, --distance> option.
1439
1440 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
1441 ATOM, HETATM or both.
1442
1443 =item B<--DistanceSelectionMode> I<ByAtom | ByResidue>
1444
1445 Specify how how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of
1446 B<-m, --mode> option for all values of B<--DistanceMode> option: extract only those ATOM/HETATM
1447 records that meet specified distance criterion; extract all records corresponding to a residue as
1448 long as one of the ATOM/HETATM record in the residue satisfies specified distance criterion. Possible
1449 values: I<ByAtom, ByResidue>. Default value: I<ByAtom>.
1450
1451 =item B<--DistanceOrigin> I<"AtomNumber,AtomName" | "HetatmNumber,HetAtmName" | "ResidueNumber,ResidueName[,ChainID]" | "X,Y,Z">
1452
1453 This value is B<--distancemode> specific. In general, it identifies a point used to select
1454 other ATOM/HETATMS with in a specific distance from this point.
1455
1456 For I<Atom> value of B<--distancemode>, this option corresponds to an atom specification.
1457 Format: I<AtomNumber,AtomName>. Example:
1458
1459 455,CA
1460
1461 For I<Hetatm> value of B<--distancemode>, this option corresponds to a hetatm specification.
1462 Format: I<HetatmNumber,HetAtmName>. Example:
1463
1464 5295,C1
1465
1466 For I<Residue> value of B<--distancemode>, this option corresponds to a residue specification.
1467 Format: I<ResidueNumber, ResidueName[,ChainID]>. Example:
1468
1469 78,MSE
1470 977,RET,A
1471 978,RET,B
1472
1473 For I<XYZ> value of B<--distancemode>, this option corresponds to a coordinate of an
1474 arbitrary point. Format: I<X,Y,X>. Example:
1475
1476 10.044,19.261,-4.292
1477
1478 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
1479 ATOM, HETATM or both.
1480
1481 =item B<-h, --help>
1482
1483 Print this help message.
1484
1485 =item B<-k, --KeepOldRecords> I<yes | no>
1486
1487 Specify whether to transfer old non ATOM and HETATM records from input PDBFile(s) to new
1488 PDBFile(s) during I<Chains | Atoms | HetAtms | CAlphas | Distance| NonWater | NonHydrogens>
1489 value of B<-m --mode> option. By default, except for the HEADER record, all
1490 other unnecessary non ATOM/HETATM records are dropped during the
1491 generation of new PDB files. Possible values: I<yes | no>. Default: I<no>.
1492
1493 =item B<-m, --mode > I<Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens>
1494
1495 Specify what to extract from I<PDBFile(s)>: I<Chains> - retrieve records for
1496 specified chains; I<Sequences> - generate sequence files for specific chains;
1497 I<Atoms> - extract atom records; I<CAlphas> - extract atom records for alpha
1498 carbon atoms; I<AtomNums> - extract atom records for specified atom numbers;
1499 I<AtomsRange> - extract atom records between specified atom number range;
1500 I<AtomNames> - extract atom records for specified atom names; I<ResidueNums>
1501 - extract records for specified residue numbers; I<ResiduesRange> - extract records
1502 for residues between specified residue number range; I<ResidueNames> - extract
1503 records for specified residue names; I<Distance> - extract records with in a
1504 certain distance from a specific position; I<NonWater> - extract records corresponding
1505 to residues other than water; I<NonHydrogens> - extract non-hydrogen records.
1506
1507 Possible values: I<Chains, Sequences Atoms, CAlphas, AtomNums, AtomsRange,
1508 AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater,
1509 NonHydrogens>. Default value: I<NonWater>
1510
1511 During the generation of new PDB files, unnecessay CONECT records are dropped.
1512
1513 For I<Chains> mode, data for appropriate chains specified by B<--c --chains> option
1514 is extracted from I<PDBFile(s)> and placed into new PDB file(s).
1515
1516 For I<Sequences> mode, residues names using various sequence related options are
1517 extracted for chains specified by B<--c --chains> option from I<PDBFile(s)> and
1518 FASTA sequence file(s) are generated.
1519
1520 For I<Distance> mode, all ATOM/HETATM records with in a distance specified
1521 by B<-d --distance> option from a specific atom, residue or a point indicated by
1522 B<--distancemode> are extracted and placed into new PDB file(s).
1523
1524 For I<NonWater> mode, non water ATOM/HETATM record lines, identified using value of
1525 B<--WaterResidueNames>, are extracted and written to new PDB file(s).
1526
1527 For I<NonHydrogens> mode, ATOM/HETATOM record lines containing element symbol
1528 other than I<H> are extracted and written to new PDB file(s).
1529
1530 For all other options, appropriate ATOM/HETATM records are extracted to generate new
1531 PDB file(s).
1532
1533 B<--RecordMode> option controls type of record lines to extract and process from
1534 I<PDBFile(s)>: ATOM, HETATM or both.
1535
1536 =item B<--ModifyHeader> I<yes | no>
1537
1538 Specify whether to modify HEADER record during the generation of new PDB files
1539 for B<-m, --mode> values of I<Chains | Atoms | CAlphas | Distance>. Possible values:
1540 I<yes | no>. Default: I<yes>. By default, Classification data is replaced by I<Data extracted
1541 using MayaChemTools> before writing out HEADER record.
1542
1543 =item B<--NonStandardKeep> I<yes | no>
1544
1545 Specify whether to include and convert non-standard three letter residue codes into
1546 a code specified using B<--nonstandardcode> option and include them into sequence file(s)
1547 generated during I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>.
1548 Default: I<yes>.
1549
1550 A warning is also printed about the presence of non-standard residues. Any residue other
1551 than standard 20 amino acids and 5 nucleic acid is considered non-standard; additionally,
1552 HETATM residues in chains also tagged as non-standard.
1553
1554 =item B<--NonStandardCode> I<character>
1555
1556 A single character code to use for non-standard residues. Default: I<X>. Possible values:
1557 I<?, -, or X>.
1558
1559 =item B<-o, --overwrite>
1560
1561 Overwrite existing files.
1562
1563 =item B<-r, --root> I<rootname>
1564
1565 New PDB and sequence file name is generated using the root: <Root><Mode>.<Ext>.
1566 Default new file name: <PDBFileName>Chain<ChainID>.pdb for I<Chains> B<mode>;
1567 <PDBFileName>SequenceChain<ChainID>.fasta for I<Sequences> B<mode>;
1568 <PDBFileName>DistanceBy<DistanceMode>.pdb for I<Distance> B<-m, --mode>
1569 <PDBFileName><Mode>.pdb for I<Atoms | CAlphas | NonWater | NonHydrogens> B<-m, --mode>
1570 values. This option is ignored for multiple input files.
1571
1572 =item B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>
1573
1574 Specify type of record lines to extract and process from I<PDBFile(s)> during various
1575 values of B<-m, --mode> option: extract only ATOM record lines; extract only HETATM
1576 record lines; extract both ATOM and HETATM lines. Possible values: I<Atom | Hetatm
1577 | AtomAndHetatm | XYZ>. Default during I<Atoms, CAlphas, AtomNums, AtomsRange,
1578 AtomNames> values of B<-m, --mode> option: I<Atom>; otherwise: I<AtomAndHetatm>.
1579
1580 This option is ignored during I<Chains, Sequences> values of B<-m, --mode> option.
1581
1582 =item B<--Residues> I<"ResidueNum,[ResidueNum...]" | "StartResidueNum,EndResiduNum" | "ResidueName,[ResidueName...]">
1583
1584 Specify which resiude records to extract from I<PDBFiles(s)> during I<ResidueNums>,
1585 I<ResiduesRange>,and I<ResidueNames> value of B<-m, --mode> option: extract records
1586 corresponding to residue numbers specified in a comma delimited list of residue numbers/names,
1587 or with in the range of start and end residue numbers. Possible values: I<"ResidueNum[,ResidueNum,..]">,
1588 I<StartResidueNum,EndResiduNum>, or I<<"ResidueName[,ResidueName,..]">. Default: I<None>. Examples:
1589
1590 20
1591 5,10
1592 TYR,SER,THR
1593
1594 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
1595 ATOM, HETATM or both.
1596
1597 =item B<--SequenceLength> I<number>
1598
1599 Maximum sequence length per line in sequence file(s). Default: I<80>.
1600
1601 =item B<--SequenceRecords> I<Atom | SeqRes>
1602
1603 Specify which records to use for extracting residue names from I<PDBFiles(s)> during
1604 I<Sequences> value of B<-m, --mode> option: use ATOM records to compile a list
1605 of residues in a chain or parse SEQRES record to get a list of residues. Possible values:
1606 I<Atom | SeqRes>. Default: I<Atom>.
1607
1608 =item B<--SequenceIDPrefix> I<FileName | HeaderRecord | Automatic>
1609
1610 Specify how to generate a prefix for sequence IDs during I<Sequences> value
1611 of B<-m, --mode> option: use input file name prefix; retrieve PDB ID from HEADER record;
1612 or automatically decide the method for generating the prefix. The chain IDs are also
1613 appended to the prefix. Possible values: I<FileName | HeaderRecord | Automatic>.
1614 Default: I<Automatic>
1615
1616 =item B<--WaterResidueNames> I<Automatic | "ResidueName,[ResidueName,...]">
1617
1618 Identification of water residues during I<NonWater> value of B<-m, --mode> option. Possible values:
1619 I<Automatic | "ResidueName,[ResidueName,...]">. Default: I<Automatic> - corresponds
1620 to "HOH,WAT,H20". You can also specify a different comma delimited list of residue names
1621 to use for water.
1622
1623 =item B<-w, --WorkingDir> I<dirname>
1624
1625 Location of working directory. Default: current directory.
1626
1627 =back
1628
1629 =head1 EXAMPLES
1630
1631 To extract non-water records from Sample2.pdb file and generate Sample2NonWater.pdb
1632 file, type:
1633
1634 % ExtractFromPDBFiles.pl Sample2.pdb
1635
1636 To extract non-water records corresponding to only ATOM records from Sample2.pdb file
1637 and generate Sample2NonWater.pdb file, type:
1638
1639 % ExtractFromPDBFiles.pl --RecordMode Atom Sample2.pdb
1640
1641 To extract non-water records from Sample2.pdb file using HOH or WAT residue name for water along
1642 with all old non-coordinate records and generate Sample2NewNonWater.pdb file, type:
1643
1644 % ExtractFromPDBFiles.pl -m NonWater --WaterResidueNames "HOH,WAT"
1645 -KeepOldRecords Yes -r Sample2New -o Sample2.pdb
1646
1647 To extract non-hydrogens records from Sample2.pdb file and generate Sample2NonHydrogen.pdb
1648 file, type:
1649
1650 % ExtractFromPDBFiles.pl -m NonHydrogens Sample2.pdb
1651
1652 To extract data for first chain in Sample2.pdb and generate Sample2ChainA.pdb, type
1653 file, type:
1654
1655 % ExtractFromPDBFiles.pl -m chains -o Sample2.pdb
1656
1657 To extract data for both chains in Sample2.pdb and generate Sample2ChainA.pdb and
1658 Sample2ChainB.pdb, type:
1659
1660 % ExtractFromPDBFiles.pl -m chains -c All -o Sample2.pdb
1661
1662 To extract data for alpha carbons in Sample2.pdb and generate Sample2CAlphas.pdb, type:
1663
1664 % ExtractFromPDBFiles.pl -m CAlphas -o Sample2.pdb
1665
1666 To extract records for specific residue numbers in all chains from Sample2.pdb file and generate
1667 Sample2ResidueNums.pdb file, type:
1668
1669 % ExtractFromPDBFiles.pl -m ResidueNums --Residues "3,6"
1670 Sample2.pdb
1671
1672 To extract records for a specific range of residue number in all chains from Sample2.pdb
1673 file and generate Sample2ResiduesRange.pdb file, type:
1674
1675 % ExtractFromPDBFiles.pl -m ResiduesRange --Residues "10,30"
1676 Sample2.pdb
1677
1678 To extract data for all ATOM and HETATM records with in 10 angstrom of an atom specifed by
1679 atom serial number and name "1,N" in Sample2.pdb file and generate Sample2DistanceByAtom.pdb,
1680 type:
1681
1682 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom
1683 --DistanceOrigin "1,N" -k No --distance 10 -o Sample2.pdb
1684
1685 To extract data for all ATOM and HETATM records for complete residues with any atom or hetatm
1686 less than 10 angstrom of an atom specifed by atom serial number and name "1,N" in Sample2.pdb
1687 file and generate Sample2DistanceByAtom.pdb, type:
1688
1689 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom
1690 --DistanceOrigin "1,N" --DistanceSelectionMode ByResidue
1691 -k No --distance 10 -o Sample2.pdb
1692
1693 To extract data for all ATOM and HETATM records with in 25 angstrom of an arbitrary point "0,0,0"
1694 in Sample2.pdb file and generate Sample2DistanceByXYZ.pdb, type:
1695
1696 % ExtractFromPDBFiles.pl -m Distance --DistanceMode XYZ
1697 --DistanceOrigin "0,0,0" -k No --distance 25 -o Sample2.pdb
1698
1699 =head1 AUTHOR
1700
1701 Manish Sud <msud@san.rr.com>
1702
1703 =head1 SEE ALSO
1704
1705 InfoPDBFiles.pl, ModifyPDBFiles.pl
1706
1707 =head1 COPYRIGHT
1708
1709 Copyright (C) 2015 Manish Sud. All rights reserved.
1710
1711 This file is part of MayaChemTools.
1712
1713 MayaChemTools is free software; you can redistribute it and/or modify it under
1714 the terms of the GNU Lesser General Public License as published by the Free
1715 Software Foundation; either version 3 of the License, or (at your option)
1716 any later version.
1717
1718 =cut