0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: ExtractFromPDBFiles.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:19 $
|
|
5 # $Revision: 1.39 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use FileUtil;
|
|
36 use TextUtil;
|
|
37 use PDBFileUtil;
|
|
38 use AminoAcids;
|
|
39 use SequenceFileUtil;
|
|
40
|
|
41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
|
|
42
|
|
43 # Autoflush STDOUT
|
|
44 $| = 1;
|
|
45
|
|
46 # Starting message...
|
|
47 $ScriptName = basename($0);
|
|
48 print "\n$ScriptName: Starting...\n\n";
|
|
49 $StartTime = new Benchmark;
|
|
50
|
|
51 # Get the options and setup script...
|
|
52 SetupScriptUsage();
|
|
53 if ($Options{help} || @ARGV < 1) {
|
|
54 die GetUsageFromPod("$FindBin::Bin/$ScriptName");
|
|
55 }
|
|
56
|
|
57 my(@PDBFilesList);
|
|
58 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb");
|
|
59
|
|
60 # Process options...
|
|
61 print "Processing options...\n";
|
|
62 my(%OptionsInfo);
|
|
63 ProcessOptions();
|
|
64
|
|
65 # Setup information about input files...
|
|
66 print "Checking input PDB file(s)...\n";
|
|
67 my(%PDBFilesInfo);
|
|
68 RetrievePDBFilesInfo();
|
|
69
|
|
70 # Process input files..
|
|
71 my($FileIndex);
|
|
72 if (@PDBFilesList > 1) {
|
|
73 print "\nProcessing PDB files...\n";
|
|
74 }
|
|
75 for $FileIndex (0 .. $#PDBFilesList) {
|
|
76 if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
|
|
77 print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
|
|
78 ExtractFromPDBFiles($FileIndex);
|
|
79 }
|
|
80 }
|
|
81 print "\n$ScriptName:Done...\n\n";
|
|
82
|
|
83 $EndTime = new Benchmark;
|
|
84 $TotalTime = timediff ($EndTime, $StartTime);
|
|
85 print "Total time: ", timestr($TotalTime), "\n";
|
|
86
|
|
87 ###############################################################################
|
|
88
|
|
89 # Extract appropriate information...
|
|
90 sub ExtractFromPDBFiles {
|
|
91 my($FileIndex) = @_;
|
|
92 my($PDBFile, $PDBRecordLinesRef);
|
|
93
|
|
94 # Get PDB data...
|
|
95 $PDBFile = $PDBFilesList[$FileIndex];
|
|
96 $PDBRecordLinesRef = ReadPDBFile($PDBFile);
|
|
97
|
|
98 if ($OptionsInfo{Mode} =~ /Chains/i) {
|
|
99 ExtractChains($FileIndex, $PDBRecordLinesRef);
|
|
100 }
|
|
101 elsif ($OptionsInfo{Mode} =~ /Sequences/i) {
|
|
102 ExtractSequences($FileIndex, $PDBRecordLinesRef);
|
|
103 }
|
|
104 elsif ($OptionsInfo{Mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i) {
|
|
105 ExtractByAtoms($FileIndex, $PDBRecordLinesRef);
|
|
106 }
|
|
107 elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) {
|
|
108 ExtractByResidues($FileIndex, $PDBRecordLinesRef);
|
|
109 }
|
|
110 elsif ($OptionsInfo{Mode} =~ /Distance/i) {
|
|
111 ExtractByDistance($FileIndex, $PDBRecordLinesRef);
|
|
112 }
|
|
113 elsif ($OptionsInfo{Mode} =~ /NonWater/i) {
|
|
114 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef);
|
|
115 }
|
|
116 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) {
|
|
117 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef);
|
|
118 }
|
|
119 }
|
|
120
|
|
121 # Extract chains and generate new PDB files...
|
|
122 #
|
|
123 sub ExtractChains {
|
|
124 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
125 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap);
|
|
126
|
|
127 # Get chains and residues data...
|
|
128 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1);
|
|
129
|
|
130 if ($OptionsInfo{CombineChains}) {
|
|
131 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
132 print "Generating PDBFileName file $PDBFileName...\n";
|
|
133
|
|
134 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
135
|
|
136 # Write out header and other older recors...
|
|
137 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
138 }
|
|
139
|
|
140 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
|
|
141 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
|
|
142 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
|
|
143
|
|
144 if (!$OptionsInfo{CombineChains}) {
|
|
145 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
|
|
146 print "Generating PDBFileName file $PDBFileName...\n";
|
|
147
|
|
148 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
149
|
|
150 # Write out header and other older recors...
|
|
151 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
152 }
|
|
153
|
|
154 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers
|
|
155 # for writing out appropriate CONECT records...
|
|
156 %ChainAtomNumbersMap = ();
|
|
157 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) {
|
|
158 print OUTFILE "$RecordLine\n";
|
|
159 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine);
|
|
160 $AtomNumber = int $AtomNumber;
|
|
161 $ChainAtomNumbersMap{$AtomNumber} = $AtomName;
|
|
162 }
|
|
163 # Write out TER record using information from last chain record...
|
|
164 $AtomNumber += 1;
|
|
165 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n";
|
|
166
|
|
167 # Write out CONECT records...
|
|
168 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap);
|
|
169
|
|
170 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
171 print OUTFILE "$RecordLine\n";
|
|
172 }
|
|
173
|
|
174 if (!$OptionsInfo{CombineChains}) {
|
|
175 # Write out END record...
|
|
176 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
177
|
|
178 close OUTFILE;
|
|
179 }
|
|
180 }
|
|
181
|
|
182 if ($OptionsInfo{CombineChains}) {
|
|
183 # Write out END record...
|
|
184 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
185
|
|
186 close OUTFILE;
|
|
187 }
|
|
188
|
|
189 }
|
|
190
|
|
191 # Extract sequences for individual chains or combine all the chains...
|
|
192 sub ExtractSequences {
|
|
193 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
194 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap);
|
|
195
|
|
196 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
|
|
197 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
|
|
198 }
|
|
199 else {
|
|
200 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
|
|
201 }
|
|
202
|
|
203 # Generate sequence data for all the chains...
|
|
204 %ChainSequencesDataMap = ();
|
|
205 @{$ChainSequencesDataMap{IDs}} = ();
|
|
206 %{$ChainSequencesDataMap{Sequence}} = ();
|
|
207 %{$ChainSequencesDataMap{Description}} = ();
|
|
208
|
|
209 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
|
|
210 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
|
|
211 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];
|
|
212
|
|
213 # Setup sequence ID...
|
|
214 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex];
|
|
215 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID;
|
|
216 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID;
|
|
217
|
|
218 # Collect sequence data for the chain...
|
|
219 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) {
|
|
220 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
|
|
221 }
|
|
222 else {
|
|
223 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}};
|
|
224 }
|
|
225 # Setup sequence data...
|
|
226 $ChainSequence = '';
|
|
227 RESIDUE: for $Residue (@{$ChainResiduesRef}) {
|
|
228 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue);
|
|
229 if (!$StandardResidue) {
|
|
230 if ($OptionsInfo{KeepNonStandardSequences}) {
|
|
231 $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
|
|
232 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n";
|
|
233 }
|
|
234 else {
|
|
235 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n";
|
|
236 next RESIDUE;
|
|
237 }
|
|
238 }
|
|
239 $ChainSequence .= $ResidueCode;
|
|
240 }
|
|
241 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence;
|
|
242
|
|
243 }
|
|
244
|
|
245 # Write out the sequence files...
|
|
246 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap );
|
|
247 if ($OptionsInfo{CombineChainSequences}) {
|
|
248 # Combine all the chain sequences...
|
|
249 $Sequence = '';
|
|
250 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
|
|
251 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
|
|
252
|
|
253 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID};
|
|
254 }
|
|
255 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";;
|
|
256 $SequenceDescription = $SequenceID;
|
|
257 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
258
|
|
259 print "Generating sequence file $SequenceFileName...\n";
|
|
260 %SequencesDataMap = ();
|
|
261 @{$SequencesDataMap{IDs}} = ();
|
|
262 %{$SequencesDataMap{Sequence}} = ();
|
|
263 %{$SequencesDataMap{Description}} = ();
|
|
264
|
|
265 push @{$SequencesDataMap{IDs}}, $SequenceID;
|
|
266 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
|
|
267 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
|
|
268
|
|
269 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
|
|
270 }
|
|
271 else {
|
|
272 # For each specifed chain, write out the sequences...
|
|
273 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
|
|
274 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
|
|
275
|
|
276 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
|
|
277
|
|
278 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex];
|
|
279 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID};
|
|
280 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID};
|
|
281
|
|
282 print "Generating sequence file $SequenceFileName...\n";
|
|
283 %SequencesDataMap = ();
|
|
284 @{$SequencesDataMap{IDs}} = ();
|
|
285 %{$SequencesDataMap{Sequence}} = ();
|
|
286 %{$SequencesDataMap{Description}} = ();
|
|
287
|
|
288 push @{$SequencesDataMap{IDs}}, $SequenceID;
|
|
289 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription;
|
|
290 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence;
|
|
291
|
|
292 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
|
|
293 }
|
|
294 }
|
|
295 }
|
|
296
|
|
297 # Extract atoms...
|
|
298 sub ExtractByAtoms {
|
|
299 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
300 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap);
|
|
301
|
|
302 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
303 print "Generating PDBFileName file $PDBFileName...\n";
|
|
304 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
305
|
|
306 # Write out header and other older recors...
|
|
307 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
308
|
|
309 # Write out all ATOM records along with TER and model records to indicate
|
|
310 # chains and multiple models..
|
|
311 %AtomNumbersMap = ();
|
|
312 $ChainRecordCount = 0;
|
|
313 for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
314 if (CheckRecordType($RecordLine)) {
|
|
315 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine);
|
|
316
|
|
317 # Check atoms...
|
|
318 $IgnoreRecord = 1;
|
|
319 if ($OptionsInfo{Mode} =~ /^Atoms$/i) {
|
|
320 $IgnoreRecord = 0;
|
|
321 }
|
|
322 elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) {
|
|
323 if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) {
|
|
324 $IgnoreRecord = 0;
|
|
325 }
|
|
326 }
|
|
327 elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) {
|
|
328 if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) {
|
|
329 $IgnoreRecord = 0;
|
|
330 }
|
|
331 }
|
|
332 elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) {
|
|
333 if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) {
|
|
334 $IgnoreRecord = 0;
|
|
335 }
|
|
336 }
|
|
337
|
|
338 if (!$IgnoreRecord) {
|
|
339 $ChainRecordCount++;
|
|
340 print OUTFILE "$RecordLine\n";
|
|
341
|
|
342 $AtomNumber = int $AtomNumber;
|
|
343 $AtomNumbersMap{$AtomNumber} = $AtomName;
|
|
344 }
|
|
345 }
|
|
346 elsif (IsTerRecordType($RecordLine)) {
|
|
347 if ($ChainRecordCount) {
|
|
348 print OUTFILE GenerateTerRecordLine(), "\n";
|
|
349 }
|
|
350 $ChainRecordCount = 0;
|
|
351 }
|
|
352 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
|
|
353 print OUTFILE "$RecordLine\n";
|
|
354 }
|
|
355 }
|
|
356
|
|
357 # Write out appropriate CONECT records...
|
|
358 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
|
|
359 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
360 print OUTFILE "$RecordLine\n";
|
|
361 }
|
|
362
|
|
363 # Write out END record...
|
|
364 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
365
|
|
366 close OUTFILE;
|
|
367 }
|
|
368
|
|
369 # Extract residues...
|
|
370 sub ExtractByResidues {
|
|
371 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
372 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap);
|
|
373
|
|
374 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
375 print "Generating PDBFileName file $PDBFileName...\n";
|
|
376 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
377
|
|
378 # Write out header and other older recors...
|
|
379 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
380
|
|
381 # Write out all ATOM records for specified residues with TER and model records to indicate
|
|
382 # chains and multiple models...
|
|
383 %AtomNumbersMap = ();
|
|
384 $ChainRecordCount = 0;
|
|
385 for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
386 if (CheckRecordType($RecordLine)) {
|
|
387 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine);
|
|
388
|
|
389 # Check residues...
|
|
390 $IgnoreRecord = 1;
|
|
391 if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) {
|
|
392 if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) {
|
|
393 $IgnoreRecord = 0;
|
|
394 }
|
|
395 }
|
|
396 elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
|
|
397 if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) {
|
|
398 $IgnoreRecord = 0;
|
|
399 }
|
|
400 }
|
|
401 elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
|
|
402 if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) {
|
|
403 $IgnoreRecord = 0;
|
|
404 }
|
|
405 }
|
|
406 if (!$IgnoreRecord) {
|
|
407 $ChainRecordCount++;
|
|
408 print OUTFILE "$RecordLine\n";
|
|
409 $AtomNumber = int $AtomNumber;
|
|
410 $AtomNumbersMap{$AtomNumber} = $AtomName;
|
|
411 }
|
|
412 }
|
|
413 elsif (IsTerRecordType($RecordLine)) {
|
|
414 if ($ChainRecordCount) {
|
|
415 print OUTFILE GenerateTerRecordLine(), "\n";
|
|
416 }
|
|
417 $ChainRecordCount = 0;
|
|
418 }
|
|
419 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
|
|
420 print OUTFILE "$RecordLine\n";
|
|
421 }
|
|
422 }
|
|
423
|
|
424 # Write out appropriate CONECT records...
|
|
425 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
|
|
426 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
427 print OUTFILE "$RecordLine\n";
|
|
428 }
|
|
429 # Write out END record...
|
|
430 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
431
|
|
432 close OUTFILE;
|
|
433 }
|
|
434
|
|
435 # Extract non water records...
|
|
436 sub ExtractNonWaterRecords {
|
|
437 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
438 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap);
|
|
439
|
|
440 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
441 print "Generating PDBFileName file $PDBFileName...\n";
|
|
442 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
443
|
|
444 # Write out header and other older recors...
|
|
445 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
446
|
|
447 # Write out all ATOM/HETATM non water records along with TER and model records to indicate
|
|
448 # chains and multiple models..
|
|
449 %AtomNumbersMap = ();
|
|
450 $ChainRecordCount = 0;
|
|
451 for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
452 if (CheckRecordType($RecordLine)) {
|
|
453 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine);
|
|
454 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) {
|
|
455 $ChainRecordCount++;
|
|
456 print OUTFILE "$RecordLine\n";
|
|
457 $AtomNumber = int $AtomNumber;
|
|
458 $AtomNumbersMap{$AtomNumber} = $AtomName;
|
|
459 }
|
|
460 }
|
|
461 elsif (IsTerRecordType($RecordLine)) {
|
|
462 if ($ChainRecordCount) {
|
|
463 print OUTFILE GenerateTerRecordLine(), "\n";
|
|
464 }
|
|
465 $ChainRecordCount = 0;
|
|
466 }
|
|
467 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
|
|
468 print OUTFILE "$RecordLine\n";
|
|
469 }
|
|
470 }
|
|
471
|
|
472 # Write out appropriate CONECT records...
|
|
473 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
|
|
474 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
475 print OUTFILE "$RecordLine\n";
|
|
476 }
|
|
477 # Write out END record...
|
|
478 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
479
|
|
480 close OUTFILE;
|
|
481 }
|
|
482
|
|
483 # Extract non hydrogen records...
|
|
484 sub ExtractNonHydrogenRecords {
|
|
485 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
486 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap);
|
|
487
|
|
488 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
489 print "Generating PDBFileName file $PDBFileName...\n";
|
|
490 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
491
|
|
492 # Write out header and other older recors...
|
|
493 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
494
|
|
495 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate
|
|
496 # chains and multiple models..
|
|
497 %AtomNumbersMap = ();
|
|
498 $ChainRecordCount = 0;
|
|
499 for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
500 if (CheckRecordType($RecordLine)) {
|
|
501 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine);
|
|
502 if ($ElementSymbol !~ /^H$/i) {
|
|
503 $ChainRecordCount++;
|
|
504 print OUTFILE "$RecordLine\n";
|
|
505 $AtomNumber = int $AtomNumber;
|
|
506 $AtomNumbersMap{$AtomNumber} = $AtomName;
|
|
507 }
|
|
508 }
|
|
509 elsif (IsTerRecordType($RecordLine)) {
|
|
510 if ($ChainRecordCount) {
|
|
511 print OUTFILE GenerateTerRecordLine(), "\n";
|
|
512 }
|
|
513 $ChainRecordCount = 0;
|
|
514 }
|
|
515 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
|
|
516 print OUTFILE "$RecordLine\n";
|
|
517 }
|
|
518 }
|
|
519
|
|
520 # Write out appropriate CONECT records...
|
|
521 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
|
|
522 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
523 print OUTFILE "$RecordLine\n";
|
|
524 }
|
|
525 # Write out END record...
|
|
526 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
527
|
|
528 close OUTFILE;
|
|
529 }
|
|
530
|
|
531 # Extract ATOM/HETATM records by distance...
|
|
532 sub ExtractByDistance {
|
|
533 my($FileIndex, $PDBRecordLinesRef) = @_;
|
|
534 my($PDBFileName, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap);
|
|
535
|
|
536 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
|
|
537 print "Generating PDBFileName file $PDBFileName...\n";
|
|
538 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n";
|
|
539
|
|
540 # Write out header and other older recors...
|
|
541 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef);
|
|
542
|
|
543 # Setup coordinates of origin to calculate distance...
|
|
544 @OriginCoords = ();
|
|
545 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]};
|
|
546
|
|
547 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate
|
|
548 # chains and multiple models...
|
|
549 %AtomNumbersMap = ();
|
|
550
|
|
551 %ResiduesDataMap = ();
|
|
552 %{$ResiduesDataMap{ID}} = ();
|
|
553 %{$ResiduesDataMap{Status}} = ();
|
|
554
|
|
555 $ChainRecordCount = 0;
|
|
556 $RecordLineNum = 0;
|
|
557
|
|
558 for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
559 $RecordLineNum++;
|
|
560 if (CheckRecordType($RecordLine)) {
|
|
561 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
|
|
562 @Coords = (); push @Coords, ($X, $Y, $Z);
|
|
563
|
|
564 $IgnoreRecord = 1;
|
|
565 if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) {
|
|
566 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";
|
|
567 if (exists $ResiduesDataMap{ID}{$ResidueID}) {
|
|
568 # Residue data has been processed; check its selection status...
|
|
569 if ($ResiduesDataMap{Status}{$ResidueID}) {
|
|
570 $IgnoreRecord = 0;
|
|
571 }
|
|
572 }
|
|
573 else {
|
|
574 # Residue hasn't been processed...
|
|
575 $ResiduesDataMap{ID}{$ResidueID} = $ResidueID;
|
|
576 $ResiduesDataMap{Status}{$ResidueID} = 0;
|
|
577 if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) {
|
|
578 $IgnoreRecord = 0;
|
|
579 $ResiduesDataMap{Status}{$ResidueID} = 1;
|
|
580 }
|
|
581 }
|
|
582 }
|
|
583 elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) {
|
|
584 if (CheckDistance(\@Coords, \@OriginCoords)) {
|
|
585 $IgnoreRecord = 0;
|
|
586 }
|
|
587 }
|
|
588
|
|
589 if (!$IgnoreRecord) {
|
|
590 $ChainRecordCount++;
|
|
591 print OUTFILE "$RecordLine\n";
|
|
592 $AtomNumber = int $AtomNumber;
|
|
593 $AtomNumbersMap{$AtomNumber} = $AtomName;
|
|
594 }
|
|
595 }
|
|
596 elsif (IsTerRecordType($RecordLine)) {
|
|
597 if ($ChainRecordCount) {
|
|
598 print OUTFILE GenerateTerRecordLine(), "\n";
|
|
599 }
|
|
600 $ChainRecordCount = 0;
|
|
601 }
|
|
602 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
|
|
603 print OUTFILE "$RecordLine\n";
|
|
604 }
|
|
605 }
|
|
606
|
|
607 # Write out appropriate CONECT records...
|
|
608 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
|
|
609 for $RecordLine (@{$ConectRecordLinesRef}) {
|
|
610 print OUTFILE "$RecordLine\n";
|
|
611 }
|
|
612
|
|
613 # Write out END record...
|
|
614 print OUTFILE GenerateEndRecordLine(), "\n";
|
|
615
|
|
616 close OUTFILE;
|
|
617 }
|
|
618
|
|
619 # Does record type correspond to the specified record type?
|
|
620 sub CheckRecordType {
|
|
621 my($RecordLine) = @_;
|
|
622 my($Status);
|
|
623
|
|
624 $Status = 0;
|
|
625 if ($OptionsInfo{RecordMode} =~ /^Atom$/i) {
|
|
626 $Status = IsAtomRecordType($RecordLine) ? 1 : 0;
|
|
627 }
|
|
628 elsif ($OptionsInfo{RecordMode} =~ /^Hetatm$/i) {
|
|
629 $Status = IsHetatmRecordType($RecordLine) ? 1 : 0;
|
|
630 }
|
|
631 elsif ($OptionsInfo{RecordMode} =~ /^AtomAndHetatm$/i) {
|
|
632 $Status = (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0;
|
|
633 }
|
|
634
|
|
635 return $Status;
|
|
636 }
|
|
637
|
|
638 # Does record meets distance citerion specified by the user?
|
|
639 sub CheckResidueDistance {
|
|
640 my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_;
|
|
641 my($Status, $RecordLine, $RecordLineIndex, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $ResidueID, @Coords);
|
|
642
|
|
643 $Status = 0;
|
|
644
|
|
645 RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) {
|
|
646 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
|
|
647 if (!CheckRecordType($RecordLine)) {
|
|
648 next RECORDLINE;
|
|
649 }
|
|
650 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
|
|
651 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";
|
|
652
|
|
653 if ($ResidueID !~ /^$SpecifiedResidueID$/i) {
|
|
654 # It's a new residue line...
|
|
655 last RECORDLINE;
|
|
656 }
|
|
657
|
|
658 # Check distance...
|
|
659 @Coords = (); push @Coords, ($X, $Y, $Z);
|
|
660 if (CheckDistance(\@Coords, $OriginCoordsRef)) {
|
|
661 # Distance criterion is met for at least one record in the residue...
|
|
662 $Status = 1;
|
|
663 last RECORDLINE;
|
|
664 }
|
|
665 }
|
|
666 return $Status;
|
|
667 }
|
|
668
|
|
669 # Does record meets distance citerion specified by the user?
|
|
670 sub CheckDistance {
|
|
671 my($CoordsRef, $OriginCoordsRef) = @_;
|
|
672 my($Status, $Index, $Distance, $DistanceSquare);
|
|
673
|
|
674 $Status = 0;
|
|
675
|
|
676 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
|
|
677 # Go over coordinates of all the atoms in the residue...
|
|
678 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef};
|
|
679 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) {
|
|
680 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2;
|
|
681 $Distance = sqrt $DistanceSquare;
|
|
682 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) {
|
|
683 $Status = 1;
|
|
684 last INDEX;
|
|
685 }
|
|
686 }
|
|
687 }
|
|
688 else {
|
|
689 $DistanceSquare = 0;
|
|
690 for $Index (0 .. 2) {
|
|
691 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2;
|
|
692 }
|
|
693 $Distance = sqrt $DistanceSquare;
|
|
694 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0;
|
|
695 }
|
|
696
|
|
697 return $Status;
|
|
698 }
|
|
699
|
|
700 # Write out modifed header and other older records...
|
|
701 sub WriteHeaderAndOlderRecords {
|
|
702 my($OutFileRef, $PDBRecordLinesRef) = @_;
|
|
703
|
|
704 if ($OptionsInfo{ModifyHeaderRecord}) {
|
|
705 # Write out modified HEADER record...
|
|
706 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
|
|
707 $Classification = 'Data extracted using MayaChemTools';
|
|
708 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n";
|
|
709 }
|
|
710 else {
|
|
711 print $OutFileRef $PDBRecordLinesRef->[0], "\n";
|
|
712 }
|
|
713
|
|
714 # Write out any old records...
|
|
715 if ($OptionsInfo{KeepOldRecords}) {
|
|
716 my($RecordLineIndex, $RecordLine);
|
|
717 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file...
|
|
718 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) {
|
|
719 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
|
|
720 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) {
|
|
721 last RECORDLINE;
|
|
722 }
|
|
723 print $OutFileRef "$RecordLine\n";
|
|
724 }
|
|
725 }
|
|
726 }
|
|
727
|
|
728 # Get header record information assuming it's the first record...
|
|
729 sub GetHeaderRecordInformation {
|
|
730 my($PDBRecordLinesRef) = @_;
|
|
731 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine);
|
|
732
|
|
733 ($Classification, $DepositionDate, $IDCode) = ('') x 3;
|
|
734 $HeaderRecordLine = $PDBRecordLinesRef->[0];
|
|
735 if (IsHeaderRecordType($HeaderRecordLine)) {
|
|
736 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine);
|
|
737 }
|
|
738 return ($Classification, $DepositionDate, $IDCode);
|
|
739 }
|
|
740
|
|
741 # Get one letter residue code...
|
|
742 sub GetResidueCode {
|
|
743 my($ResidueName) = @_;
|
|
744 my($ResidueCode, $StandardResidue);
|
|
745
|
|
746 $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
|
|
747 $StandardResidue = 0;
|
|
748
|
|
749 if (length($ResidueName) == 3) {
|
|
750 # Assume it's an amino acid...
|
|
751 if (AminoAcids::IsAminoAcid($ResidueName)) {
|
|
752 # Standard amino acid...
|
|
753 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName);
|
|
754 $StandardResidue = 1;
|
|
755 }
|
|
756 }
|
|
757 elsif (length($ResidueName) == 1) {
|
|
758 # Assume it's a nucleic acid...
|
|
759 if ($ResidueName =~ /^(A|G|T|U|C)$/i) {
|
|
760 $ResidueCode = $ResidueName;
|
|
761 $StandardResidue = 1;
|
|
762 }
|
|
763 }
|
|
764
|
|
765 return ($ResidueCode, $StandardResidue);
|
|
766 }
|
|
767
|
|
768 # Process option values...
|
|
769 sub ProcessOptions {
|
|
770 %OptionsInfo = ();
|
|
771 $OptionsInfo{Mode} = $Options{mode};
|
|
772
|
|
773 my(@SpecifiedChains) = ();
|
|
774 if ($Options{chains} =~ /^(First|All)$/i) {
|
|
775 $OptionsInfo{ChainsToExtract} = $Options{chains};
|
|
776 }
|
|
777 else {
|
|
778 @SpecifiedChains = split /\,/, $Options{chains};
|
|
779 $OptionsInfo{ChainsToExtract} = 'Specified';
|
|
780 }
|
|
781 @{$OptionsInfo{SpecifiedChains}} = ();
|
|
782 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains;
|
|
783
|
|
784 $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
|
|
785
|
|
786 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
|
|
787
|
|
788 ProcessResiduesOptions();
|
|
789 ProcessAtomsOptions();
|
|
790 ProcessDistanceOptions();
|
|
791
|
|
792 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames};
|
|
793 @{$OptionsInfo{SpecifiedWaterResiduesList}} = ();
|
|
794 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = ();
|
|
795
|
|
796 my(@SpecifiedWaterResiduesList);
|
|
797 @SpecifiedWaterResiduesList = ();
|
|
798
|
|
799 if ($OptionsInfo{Mode} =~ /^NonWater$/i) {
|
|
800 my($WaterResidueName);
|
|
801 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) {
|
|
802 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O');
|
|
803 }
|
|
804 else {
|
|
805 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames};
|
|
806 }
|
|
807 for $WaterResidueName (@SpecifiedWaterResiduesList) {
|
|
808 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName;
|
|
809 }
|
|
810 }
|
|
811 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList;
|
|
812
|
|
813 $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm");
|
|
814
|
|
815 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0;
|
|
816
|
|
817 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0;
|
|
818
|
|
819 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0;
|
|
820 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode};
|
|
821 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
|
|
822 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords};
|
|
823 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix};
|
|
824
|
|
825 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
|
|
826 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
|
|
827 }
|
|
828
|
|
829 # Process specified residue options...
|
|
830 sub ProcessResiduesOptions {
|
|
831 my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList);
|
|
832
|
|
833 @SpecifiedResidueNumsList = ();
|
|
834 ($StartResidueNum, $EndResNum) = (0, 0);
|
|
835
|
|
836 @SpecifiedResidueNamesList = ();
|
|
837
|
|
838 if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) {
|
|
839 if (!$Options{residues}) {
|
|
840 die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n";
|
|
841 }
|
|
842 $OptionsInfo{Residues} = $Options{residues};
|
|
843 $OptionsInfo{Residues} =~ s/ //g;
|
|
844
|
|
845 if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
|
|
846 @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues};
|
|
847 }
|
|
848 else {
|
|
849 @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues};
|
|
850 for $ResidueNum (@SpecifiedResidueNumsList) {
|
|
851 if (!IsPositiveInteger($ResidueNum)) {
|
|
852 die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n";
|
|
853 }
|
|
854 }
|
|
855 if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
|
|
856 if (@SpecifiedResidueNumsList != 2) {
|
|
857 die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n";
|
|
858 }
|
|
859 if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) {
|
|
860 die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than end residue number.\n";
|
|
861 }
|
|
862 ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList;
|
|
863 }
|
|
864 }
|
|
865 }
|
|
866
|
|
867 @{$OptionsInfo{SpecifiedResidueNumsList}} = ();
|
|
868 push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList;
|
|
869
|
|
870 $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum;
|
|
871 $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum;
|
|
872
|
|
873 @{$OptionsInfo{SpecifiedResidueNamesList}} = ();
|
|
874 push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList;
|
|
875
|
|
876 # Set up a specified residue numbers map...
|
|
877 %{$OptionsInfo{SpecifiedResidueNumsMap}} = ();
|
|
878 for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) {
|
|
879 $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum;
|
|
880 }
|
|
881
|
|
882 # Set up a specified residue names map...
|
|
883 %{$OptionsInfo{SpecifiedResidueNamesMap}} = ();
|
|
884 for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) {
|
|
885 $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName;
|
|
886 }
|
|
887
|
|
888 }
|
|
889
|
|
890 # Process specified atom options...
|
|
891 sub ProcessAtomsOptions {
|
|
892 my($AtomNum, $StartAtomNum, $EndAtomNum, $AtomName, @SpecifiedAtomNumsList, @SpecifiedAtomNamesList);
|
|
893
|
|
894 @SpecifiedAtomNumsList = ();
|
|
895 ($StartAtomNum, $EndAtomNum) = (0, 0);
|
|
896
|
|
897 @SpecifiedAtomNamesList = ();
|
|
898
|
|
899 if ($OptionsInfo{Mode} =~ /^(AtomNums|AtomsRange|AtomNames)$/i) {
|
|
900 if (!$Options{atoms}) {
|
|
901 die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n";
|
|
902 }
|
|
903 $OptionsInfo{Atoms} = $Options{atoms};
|
|
904 $OptionsInfo{Atoms} =~ s/ //g;
|
|
905
|
|
906 if ($OptionsInfo{Mode} =~ /^AtomNames$/i) {
|
|
907 @SpecifiedAtomNamesList = split /\,/, $OptionsInfo{Atoms};
|
|
908 }
|
|
909 else {
|
|
910 @SpecifiedAtomNumsList = split /\,/, $OptionsInfo{Atoms};
|
|
911 for $AtomNum (@SpecifiedAtomNumsList) {
|
|
912 if (!IsPositiveInteger($AtomNum)) {
|
|
913 die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n";
|
|
914 }
|
|
915 }
|
|
916 if ($OptionsInfo{Mode} =~ /^AtomsRange$/i) {
|
|
917 if (@SpecifiedAtomNumsList != 2) {
|
|
918 die "Error: Invalid number of atom number values, ", scalar(@SpecifiedAtomNumsList), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n";
|
|
919 }
|
|
920 if ($SpecifiedAtomNumsList[0] > $SpecifiedAtomNumsList[1]) {
|
|
921 die "Error: Invalid atom number values, @SpecifiedAtomNumsList, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n";
|
|
922 }
|
|
923 ($StartAtomNum, $EndAtomNum) = @SpecifiedAtomNumsList;
|
|
924 }
|
|
925 }
|
|
926 }
|
|
927 elsif ($OptionsInfo{Mode} =~ /^CAlphas$/i) {
|
|
928 @SpecifiedAtomNamesList = ("CA");
|
|
929 }
|
|
930
|
|
931 @{$OptionsInfo{SpecifiedAtomNumsList}} = ();
|
|
932 push @{$OptionsInfo{SpecifiedAtomNumsList}}, @SpecifiedAtomNumsList;
|
|
933
|
|
934 $OptionsInfo{SpecifiedStartAtomNum} = $StartAtomNum;
|
|
935 $OptionsInfo{SpecifiedEndAtomNum} = $EndAtomNum;
|
|
936
|
|
937 @{$OptionsInfo{SpecifiedAtomNamesList}} = ();
|
|
938 push @{$OptionsInfo{SpecifiedAtomNamesList}}, @SpecifiedAtomNamesList;
|
|
939
|
|
940 # Set up a specified residue numbers map...
|
|
941 %{$OptionsInfo{SpecifiedAtomNumsMap}} = ();
|
|
942 for $AtomNum (@{$OptionsInfo{SpecifiedAtomNumsList}}) {
|
|
943 $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNum} = $AtomNum;
|
|
944 }
|
|
945
|
|
946 # Set up a specified residue names map...
|
|
947 %{$OptionsInfo{SpecifiedAtomNamesMap}} = ();
|
|
948 for $AtomName (@{$OptionsInfo{SpecifiedAtomNamesList}}) {
|
|
949 $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName} = lc $AtomName;
|
|
950 }
|
|
951
|
|
952 }
|
|
953
|
|
954 # Process specified distance options...
|
|
955 sub ProcessDistanceOptions {
|
|
956 my(@SpecifiedDistanceOrigin) = ();
|
|
957
|
|
958 $OptionsInfo{MaxExtractionDistance} = $Options{distance};
|
|
959 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode};
|
|
960 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : '';
|
|
961 $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode};
|
|
962
|
|
963 if ($OptionsInfo{Mode} =~ /^Distance$/i) {
|
|
964 if (!$Options{distanceorigin}) {
|
|
965 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n";
|
|
966 }
|
|
967 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin};
|
|
968 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
|
|
969 if (@SpecifiedDistanceOrigin != 2) {
|
|
970 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n";
|
|
971 }
|
|
972 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
|
|
973 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n";
|
|
974 }
|
|
975 }
|
|
976 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
|
|
977 if (@SpecifiedDistanceOrigin != 2) {
|
|
978 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n";
|
|
979 }
|
|
980 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
|
|
981 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n";
|
|
982 }
|
|
983 }
|
|
984 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
|
|
985 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) {
|
|
986 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n";
|
|
987 }
|
|
988 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
|
|
989 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n";
|
|
990 }
|
|
991 }
|
|
992 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
|
|
993 if (@SpecifiedDistanceOrigin != 3) {
|
|
994 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n";
|
|
995 }
|
|
996 my($Value);
|
|
997 for $Value (@SpecifiedDistanceOrigin) {
|
|
998 if (!IsNumerical($Value)) {
|
|
999 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n";
|
|
1000 }
|
|
1001 }
|
|
1002 }
|
|
1003 }
|
|
1004 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = ();
|
|
1005 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin;
|
|
1006
|
|
1007 }
|
|
1008
|
|
1009 # Retrieve information about PDB files...
|
|
1010 sub RetrievePDBFilesInfo {
|
|
1011 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix);
|
|
1012
|
|
1013 %PDBFilesInfo = ();
|
|
1014 @{$PDBFilesInfo{FileOkay}} = ();
|
|
1015 @{$PDBFilesInfo{OutFileRoot}} = ();
|
|
1016 @{$PDBFilesInfo{OutFileNames}} = ();
|
|
1017 @{$PDBFilesInfo{ChainLabels}} = ();
|
|
1018 @{$PDBFilesInfo{ChainSequenceIDs}} = ();
|
|
1019 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = ();
|
|
1020 @{$PDBFilesInfo{SpecifiedChains}} = ();
|
|
1021 @{$PDBFilesInfo{DistanceOrigin}} = ();
|
|
1022
|
|
1023 FILELIST: for $Index (0 .. $#PDBFilesList) {
|
|
1024 $PDBFilesInfo{FileOkay}[$Index] = 0;
|
|
1025
|
|
1026 $PDBFilesInfo{OutFileRoot}[$Index] = '';
|
|
1027 @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
|
|
1028 @{$PDBFilesInfo{OutFileNames}[$Index]} = ();
|
|
1029 @{$PDBFilesInfo{ChainLabels}[$Index]} = ();
|
|
1030 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = ();
|
|
1031 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = ();
|
|
1032 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = ();
|
|
1033 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = ();
|
|
1034
|
|
1035 $PDBFile = $PDBFilesList[$Index];
|
|
1036 if (!(-e $PDBFile)) {
|
|
1037 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
|
|
1038 next FILELIST;
|
|
1039 }
|
|
1040 if (!CheckFileType($PDBFile, "pdb")) {
|
|
1041 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
|
|
1042 next FILELIST;
|
|
1043 }
|
|
1044 if (! open PDBFILE, "$PDBFile") {
|
|
1045 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
|
|
1046 next FILELIST;
|
|
1047 }
|
|
1048 close PDBFILE;
|
|
1049
|
|
1050 # Get PDB data...
|
|
1051 $PDBRecordLinesRef = ReadPDBFile($PDBFile);
|
|
1052 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
|
|
1053 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
|
|
1054 }
|
|
1055 else {
|
|
1056 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
|
|
1057 }
|
|
1058 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
|
|
1059 warn "Warning: Ignoring file $PDBFile: No chains found \n";
|
|
1060 next FILELIST;
|
|
1061 }
|
|
1062
|
|
1063 # Make sure specified chains exist in PDB file...
|
|
1064 @SpecifiedChains = ();
|
|
1065 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) {
|
|
1066 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) {
|
|
1067 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) {
|
|
1068 push @SpecifiedChains, $ChainID;
|
|
1069 }
|
|
1070 else {
|
|
1071 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n";
|
|
1072 next FILELIST;
|
|
1073 }
|
|
1074 }
|
|
1075 }
|
|
1076 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) {
|
|
1077 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0];
|
|
1078 }
|
|
1079 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) {
|
|
1080 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}};
|
|
1081 }
|
|
1082 # Setup chain labels to use for sequence IDs and generating output files...
|
|
1083 @ChainLabels = ();
|
|
1084 for $ChainID (@SpecifiedChains) {
|
|
1085 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig;
|
|
1086 $ChainLabel = "Chain${ChainLabel}";
|
|
1087 push @ChainLabels, $ChainLabel;
|
|
1088 }
|
|
1089
|
|
1090 # Make sure specified distance origin is valid...
|
|
1091 @DistanceOrigin = ();
|
|
1092 if ($OptionsInfo{Mode} =~ /^Distance$/i) {
|
|
1093 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) {
|
|
1094 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
|
|
1095 $RecordType = $OptionsInfo{ExtractionDistanceMode};
|
|
1096 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
|
|
1097 $RecordFound = 0;
|
|
1098 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
1099 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
|
|
1100 next LINE;
|
|
1101 }
|
|
1102 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
|
|
1103 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName);
|
|
1104 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) {
|
|
1105 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) {
|
|
1106 $RecordFound = 1;
|
|
1107 last LINE;
|
|
1108 }
|
|
1109 }
|
|
1110 }
|
|
1111 if (!$RecordFound) {
|
|
1112 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
|
|
1113 next FILELIST;
|
|
1114 }
|
|
1115 push @DistanceOrigin, ($X, $Y, $Z);
|
|
1116 }
|
|
1117 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
|
|
1118 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
|
|
1119 $SpecifiedChainID = '';
|
|
1120 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) {
|
|
1121 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
|
|
1122 }
|
|
1123 else {
|
|
1124 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
|
|
1125 }
|
|
1126 $RecordFound = 0;
|
|
1127 LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
|
|
1128 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
|
|
1129 next LINE;
|
|
1130 }
|
|
1131 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
|
|
1132 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName);
|
|
1133 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID);
|
|
1134 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) {
|
|
1135 next LINE;
|
|
1136 }
|
|
1137 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) {
|
|
1138 # Store coordinates for all the atoms...
|
|
1139 $RecordFound = 1;
|
|
1140 push @DistanceOrigin, ($X, $Y, $Z);
|
|
1141 next LINE;
|
|
1142 }
|
|
1143 }
|
|
1144 if (!$RecordFound) {
|
|
1145 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
|
|
1146 next FILELIST;
|
|
1147 }
|
|
1148 }
|
|
1149 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
|
|
1150 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
|
|
1151 }
|
|
1152 }
|
|
1153 # Setup output file names...
|
|
1154 @OutFileNames = ();
|
|
1155 $FileDir = ""; $FileName = ""; $FileExt = "";
|
|
1156 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile);
|
|
1157 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) {
|
|
1158 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
|
|
1159 if ($RootFileName && $RootFileExt) {
|
|
1160 $FileName = $RootFileName;
|
|
1161 }
|
|
1162 else {
|
|
1163 $FileName = $OptionsInfo{OutFileRoot};
|
|
1164 }
|
|
1165 $OutFileRoot = $FileName;
|
|
1166 }
|
|
1167 else {
|
|
1168 $OutFileRoot = $FileName;
|
|
1169 }
|
|
1170 $Mode = $OptionsInfo{Mode};
|
|
1171 if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) {
|
|
1172 $OutFileName = '';
|
|
1173 if ($Mode =~ /^CAlphas$/i) {
|
|
1174 $OutFileName = "${OutFileRoot}CAlphas.pdb";
|
|
1175 }
|
|
1176 elsif ($Mode =~ /^Atoms$/i) {
|
|
1177 $OutFileName = "${OutFileRoot}Atoms.pdb";
|
|
1178 }
|
|
1179 elsif ($Mode =~ /^AtomNums$/i) {
|
|
1180 $OutFileName = "${OutFileRoot}AtomNums.pdb";
|
|
1181 }
|
|
1182 elsif ($Mode =~ /^AtomsRange$/i) {
|
|
1183 $OutFileName = "${OutFileRoot}AtomsRange.pdb";
|
|
1184 }
|
|
1185 elsif ($Mode =~ /^AtomNames$/i) {
|
|
1186 $OutFileName = "${OutFileRoot}AtomNames.pdb";
|
|
1187 }
|
|
1188 elsif ($Mode =~ /^ResidueNums$/i) {
|
|
1189 $OutFileName = "${OutFileRoot}ResidueNums.pdb";
|
|
1190 }
|
|
1191 elsif ($Mode =~ /^ResiduesRange$/i) {
|
|
1192 $OutFileName = "${OutFileRoot}ResiduesRange.pdb";
|
|
1193 }
|
|
1194 elsif ($Mode =~ /^ResidueNames$/i) {
|
|
1195 $OutFileName = "${OutFileRoot}ResidueNames.pdb";
|
|
1196 }
|
|
1197 elsif ($Mode =~ /^NonWater$/i) {
|
|
1198 $OutFileName = "${OutFileRoot}NonWater.pdb";
|
|
1199 }
|
|
1200 elsif ($Mode =~ /^NonHydrogens$/i) {
|
|
1201 $OutFileName = "${OutFileRoot}NonHydrogens.pdb";
|
|
1202 }
|
|
1203 elsif ($Mode =~ /^Distance$/i) {
|
|
1204 my($DistanceMode) = '';
|
|
1205 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) {
|
|
1206 $DistanceMode = 'Atom';
|
|
1207 }
|
|
1208 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) {
|
|
1209 $DistanceMode = 'Hetatm';
|
|
1210 }
|
|
1211 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
|
|
1212 $DistanceMode = 'Residue';
|
|
1213 }
|
|
1214 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
|
|
1215 $DistanceMode = 'XYZ';
|
|
1216 }
|
|
1217 $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb";
|
|
1218 }
|
|
1219 push @OutFileNames, $OutFileName;
|
|
1220 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
|
|
1221 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
|
|
1222 next FILELIST;
|
|
1223 }
|
|
1224 }
|
|
1225 elsif ($Mode =~ /^(Chains|Sequences)$/i) {
|
|
1226 if ($OptionsInfo{CombineChainSequences}) {
|
|
1227 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta";
|
|
1228 push @OutFileNames, $OutFileName;
|
|
1229 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
|
|
1230 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
|
|
1231 next FILELIST;
|
|
1232 }
|
|
1233 }
|
|
1234 else {
|
|
1235 for $ChainLabel (@ChainLabels) {
|
|
1236 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta";
|
|
1237 push @OutFileNames, $OutFileName;
|
|
1238 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
|
|
1239 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
|
|
1240 next FILELIST;
|
|
1241 }
|
|
1242 }
|
|
1243 }
|
|
1244 }
|
|
1245 @ChainSequenceIDs = ();
|
|
1246 @ChainSequenceIDsPrefix = ();
|
|
1247 if ($Mode =~ /^Sequences$/i) {
|
|
1248 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix);
|
|
1249 ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
|
|
1250
|
|
1251 if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) {
|
|
1252 $IDPrefix = $FileName;
|
|
1253 }
|
|
1254 elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) {
|
|
1255 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : '';
|
|
1256 }
|
|
1257 else {
|
|
1258 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName;
|
|
1259 }
|
|
1260
|
|
1261 for $ChainLabel (@ChainLabels) {
|
|
1262 push @ChainSequenceIDsPrefix, $IDPrefix;
|
|
1263 push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB";
|
|
1264 }
|
|
1265 }
|
|
1266
|
|
1267 $PDBFilesInfo{FileOkay}[$Index] = 1;
|
|
1268 $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
|
|
1269
|
|
1270 push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames;
|
|
1271 push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels;
|
|
1272 push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix;
|
|
1273 push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs;
|
|
1274 push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains;
|
|
1275 push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin;
|
|
1276 }
|
|
1277 }
|
|
1278
|
|
1279
|
|
1280 # Setup script usage and retrieve command line arguments specified using various options...
|
|
1281 sub SetupScriptUsage {
|
|
1282
|
|
1283 # Retrieve all the options...
|
|
1284 %Options = ();
|
|
1285 $Options{chains} = 'First';
|
|
1286 $Options{combinechains} = 'no';
|
|
1287 $Options{distance} = 10.0;
|
|
1288 $Options{distancemode} = 'XYZ';
|
|
1289 $Options{distanceselectionmode} = 'ByAtom';
|
|
1290 $Options{keepoldrecords} = 'no';
|
|
1291 $Options{mode} = 'NonWater';
|
|
1292 $Options{modifyheader} = 'yes';
|
|
1293 $Options{nonstandardkeep} = 'yes';
|
|
1294 $Options{nonstandardcode} = 'X';
|
|
1295 $Options{sequencelength} = 80;
|
|
1296 $Options{sequenceidprefix} = 'Automatic';
|
|
1297 $Options{sequencerecords} = 'Atom';
|
|
1298 $Options{waterresiduenames} = 'Automatic';
|
|
1299
|
|
1300 if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) {
|
|
1301 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
|
|
1302 }
|
|
1303 if ($Options{workingdir}) {
|
|
1304 if (! -d $Options{workingdir}) {
|
|
1305 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
|
|
1306 }
|
|
1307 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
|
|
1308 }
|
|
1309 if ($Options{combinechains} !~ /^(yes|no)$/i) {
|
|
1310 die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n";
|
|
1311 }
|
|
1312 if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) {
|
|
1313 die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n";
|
|
1314 }
|
|
1315 if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) {
|
|
1316 die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n";
|
|
1317 }
|
|
1318 if ($Options{keepoldrecords} !~ /^(yes|no)$/i) {
|
|
1319 die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n";
|
|
1320 }
|
|
1321 if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) {
|
|
1322 die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n";
|
|
1323 }
|
|
1324 if ($Options{modifyheader} !~ /^(yes|no)$/i) {
|
|
1325 die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n";
|
|
1326 }
|
|
1327 if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) {
|
|
1328 die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n";
|
|
1329 }
|
|
1330 if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) {
|
|
1331 die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n";
|
|
1332 }
|
|
1333 if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) {
|
|
1334 die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n";
|
|
1335 }
|
|
1336 if (!IsPositiveInteger($Options{sequencelength})) {
|
|
1337 die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
|
|
1338 }
|
|
1339 if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) {
|
|
1340 die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n";
|
|
1341 }
|
|
1342 if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) {
|
|
1343 die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n";
|
|
1344 }
|
|
1345 }
|
|
1346
|
|
1347 __END__
|
|
1348
|
|
1349 =head1 NAME
|
|
1350
|
|
1351 ExtractFromPDBFiles.pl - Extract specific data from PDBFile(s)
|
|
1352
|
|
1353 =head1 SYNOPSIS
|
|
1354
|
|
1355 ExtractFromPDBFiles.pl PDBFile(s)...
|
|
1356
|
|
1357 ExtractFromPDBFiles.pl [B<-a, --Atoms> "AtomNum, [AtomNum...]" | "StartAtomNum, EndAtomNum" |
|
|
1358 "AtomName, [AtomName...]"] [B<-c, --chains> First | All | "ChainID, [ChainID,...]"]
|
|
1359 [<--CombineChains> yes | no] [B<-d, --distance> number] [B<--DistanceMode> Atom | Hetatm | Residue | XYZ]
|
|
1360 [B<--DistanceOrigin> "AtomNumber, AtomName" | "HetatmNumber, HetAtmName" | "ResidueNumber, ResidueName, [ChainID]" | "X,Y,Z">]
|
|
1361 [<--DistanceSelectionMode> ByAtom | ByResidue] [B<-h, --help>] [B<-k, --KeepOldRecords> yes | no]
|
|
1362 [B<-m, --mode > Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames |
|
|
1363 ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens]
|
|
1364 [B<--ModifyHeader> yes | no] [B<--NonStandardKeep> yes | no] [B<--NonStandardCode> character]
|
|
1365 [B<-o, --overwrite>] [B<-r, --root> rootname] B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>]
|
|
1366 [B<--Residues> "ResidueNum,[ResidueNum...]" | StartResidueNum,EndResiduNum ]
|
|
1367 [B<--SequenceLength> number] [B<--SequenceRecords> Atom | SeqRes]
|
|
1368 [B<--SequenceIDPrefix> FileName | HeaderRecord | Automatic]
|
|
1369 [B<--WaterResidueNames> Automatic | "ResidueName, [ResidueName,...]"]
|
|
1370 [B<-w, --WorkingDir> dirname] PDBFile(s)...
|
|
1371
|
|
1372 =head1 DESCRIPTION
|
|
1373
|
|
1374 Extract specific data from I<PDBFile(s)> and generate appropriate PDB or sequence file(s).
|
|
1375 Multiple PDBFile names are separated by spaces. The valid file extension is I<.pdb>.
|
|
1376 All other file name extensions are ignored during the wild card expansion. All the PDB files
|
|
1377 in a current directory can be specified either by I<*.pdb> or the current directory name.
|
|
1378
|
|
1379 During I<Chains> and I<Sequences> values of B<-m, --mode> option, all ATOM/HETAM records
|
|
1380 for chains after the first model in PDB fils containing data for multiple models are ignored.
|
|
1381
|
|
1382 =head1 OPTIONS
|
|
1383
|
|
1384 =over 4
|
|
1385
|
|
1386 =item B<-a, --Atoms> I<"AtomNum,[AtomNum...]" | "StartAtomNum,EndAtomNum" | "AtomName,[AtomName...]">
|
|
1387
|
|
1388 Specify which atom records to extract from I<PDBFiles(s)> during I<AtomNums>,
|
|
1389 I<AtomsRange>, and I<AtomNames> value of B<-m, --mode> option: extract records
|
|
1390 corresponding to atom numbers specified in a comma delimited list of atom numbers/names,
|
|
1391 or with in the range of start and end atom numbers. Possible values: I<"AtomNum[,AtomNum,..]">,
|
|
1392 I<StartAtomNum,EndAtomNum>, or I<"AtomName[,AtomName,..]">. Default: I<None>. Examples:
|
|
1393
|
|
1394 10
|
|
1395 15,20
|
|
1396 N,CA,C,O
|
|
1397
|
|
1398 =item B<-c, --chains> I<First | All | ChainID,[ChainID,...]>
|
|
1399
|
|
1400 Specify which chains to extract from I<PDBFile(s)> during I<Chains | Sequences> value of
|
|
1401 B<-m, --mode> option: first chain, all chains, or a specific list of comma delimited chain IDs.
|
|
1402 Possible values: I<First | All | ChainID,[ChainID,...]>. Default: I<First>. Examples:
|
|
1403
|
|
1404 A
|
|
1405 A,B
|
|
1406 All
|
|
1407
|
|
1408 =item B<--CombineChains> I<yes | no>
|
|
1409
|
|
1410 Specify whether to combine extracted chains data into a single file during I<Chains> or
|
|
1411 I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. Default: I<no>.
|
|
1412
|
|
1413 During I<Chains> value of <-m, --mode> option with I<Yes> value of <--CombineChains>,
|
|
1414 extracted data for specified chains is written into a single file instead of individual file for each
|
|
1415 chain.
|
|
1416
|
|
1417 During I<Sequences> value of <-m, --mode> option with I<Yes> value of <--CombineChains>,
|
|
1418 residues sequences for specified chains are extracted and concatenated into a single sequence
|
|
1419 file instead of individual file for each chain.
|
|
1420
|
|
1421 =item B<-d, --distance> I<number>
|
|
1422
|
|
1423 Specify distance used to extract ATOM/HETATM recods during I<Distance> value of
|
|
1424 B<-m, --mode> option. Default: I<10.0> angstroms.
|
|
1425
|
|
1426 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
|
|
1427 ATOM, HETATM or both.
|
|
1428
|
|
1429 =item B<--DistanceMode> I<Atom | Hetatm | Residue | XYZ>
|
|
1430
|
|
1431 Specify how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of
|
|
1432 B<-m, --mode> option: extract all the records within a certain distance specifed by B<-d, --distance>
|
|
1433 from an atom or hetro atom record, a residue, or any artbitrary point. Possible values: I<Atom |
|
|
1434 Hetatm | Residue | XYZ>. Default: I<XYZ>.
|
|
1435
|
|
1436 During I<Residue> value of B<--distancemode>, distance of ATOM/HETATM records is calculated from
|
|
1437 all the atoms in the residue and the records are selected as long as any atom of the residue lies with
|
|
1438 in the distace specified using B<-d, --distance> option.
|
|
1439
|
|
1440 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
|
|
1441 ATOM, HETATM or both.
|
|
1442
|
|
1443 =item B<--DistanceSelectionMode> I<ByAtom | ByResidue>
|
|
1444
|
|
1445 Specify how how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of
|
|
1446 B<-m, --mode> option for all values of B<--DistanceMode> option: extract only those ATOM/HETATM
|
|
1447 records that meet specified distance criterion; extract all records corresponding to a residue as
|
|
1448 long as one of the ATOM/HETATM record in the residue satisfies specified distance criterion. Possible
|
|
1449 values: I<ByAtom, ByResidue>. Default value: I<ByAtom>.
|
|
1450
|
|
1451 =item B<--DistanceOrigin> I<"AtomNumber,AtomName" | "HetatmNumber,HetAtmName" | "ResidueNumber,ResidueName[,ChainID]" | "X,Y,Z">
|
|
1452
|
|
1453 This value is B<--distancemode> specific. In general, it identifies a point used to select
|
|
1454 other ATOM/HETATMS with in a specific distance from this point.
|
|
1455
|
|
1456 For I<Atom> value of B<--distancemode>, this option corresponds to an atom specification.
|
|
1457 Format: I<AtomNumber,AtomName>. Example:
|
|
1458
|
|
1459 455,CA
|
|
1460
|
|
1461 For I<Hetatm> value of B<--distancemode>, this option corresponds to a hetatm specification.
|
|
1462 Format: I<HetatmNumber,HetAtmName>. Example:
|
|
1463
|
|
1464 5295,C1
|
|
1465
|
|
1466 For I<Residue> value of B<--distancemode>, this option corresponds to a residue specification.
|
|
1467 Format: I<ResidueNumber, ResidueName[,ChainID]>. Example:
|
|
1468
|
|
1469 78,MSE
|
|
1470 977,RET,A
|
|
1471 978,RET,B
|
|
1472
|
|
1473 For I<XYZ> value of B<--distancemode>, this option corresponds to a coordinate of an
|
|
1474 arbitrary point. Format: I<X,Y,X>. Example:
|
|
1475
|
|
1476 10.044,19.261,-4.292
|
|
1477
|
|
1478 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
|
|
1479 ATOM, HETATM or both.
|
|
1480
|
|
1481 =item B<-h, --help>
|
|
1482
|
|
1483 Print this help message.
|
|
1484
|
|
1485 =item B<-k, --KeepOldRecords> I<yes | no>
|
|
1486
|
|
1487 Specify whether to transfer old non ATOM and HETATM records from input PDBFile(s) to new
|
|
1488 PDBFile(s) during I<Chains | Atoms | HetAtms | CAlphas | Distance| NonWater | NonHydrogens>
|
|
1489 value of B<-m --mode> option. By default, except for the HEADER record, all
|
|
1490 other unnecessary non ATOM/HETATM records are dropped during the
|
|
1491 generation of new PDB files. Possible values: I<yes | no>. Default: I<no>.
|
|
1492
|
|
1493 =item B<-m, --mode > I<Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens>
|
|
1494
|
|
1495 Specify what to extract from I<PDBFile(s)>: I<Chains> - retrieve records for
|
|
1496 specified chains; I<Sequences> - generate sequence files for specific chains;
|
|
1497 I<Atoms> - extract atom records; I<CAlphas> - extract atom records for alpha
|
|
1498 carbon atoms; I<AtomNums> - extract atom records for specified atom numbers;
|
|
1499 I<AtomsRange> - extract atom records between specified atom number range;
|
|
1500 I<AtomNames> - extract atom records for specified atom names; I<ResidueNums>
|
|
1501 - extract records for specified residue numbers; I<ResiduesRange> - extract records
|
|
1502 for residues between specified residue number range; I<ResidueNames> - extract
|
|
1503 records for specified residue names; I<Distance> - extract records with in a
|
|
1504 certain distance from a specific position; I<NonWater> - extract records corresponding
|
|
1505 to residues other than water; I<NonHydrogens> - extract non-hydrogen records.
|
|
1506
|
|
1507 Possible values: I<Chains, Sequences Atoms, CAlphas, AtomNums, AtomsRange,
|
|
1508 AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater,
|
|
1509 NonHydrogens>. Default value: I<NonWater>
|
|
1510
|
|
1511 During the generation of new PDB files, unnecessay CONECT records are dropped.
|
|
1512
|
|
1513 For I<Chains> mode, data for appropriate chains specified by B<--c --chains> option
|
|
1514 is extracted from I<PDBFile(s)> and placed into new PDB file(s).
|
|
1515
|
|
1516 For I<Sequences> mode, residues names using various sequence related options are
|
|
1517 extracted for chains specified by B<--c --chains> option from I<PDBFile(s)> and
|
|
1518 FASTA sequence file(s) are generated.
|
|
1519
|
|
1520 For I<Distance> mode, all ATOM/HETATM records with in a distance specified
|
|
1521 by B<-d --distance> option from a specific atom, residue or a point indicated by
|
|
1522 B<--distancemode> are extracted and placed into new PDB file(s).
|
|
1523
|
|
1524 For I<NonWater> mode, non water ATOM/HETATM record lines, identified using value of
|
|
1525 B<--WaterResidueNames>, are extracted and written to new PDB file(s).
|
|
1526
|
|
1527 For I<NonHydrogens> mode, ATOM/HETATOM record lines containing element symbol
|
|
1528 other than I<H> are extracted and written to new PDB file(s).
|
|
1529
|
|
1530 For all other options, appropriate ATOM/HETATM records are extracted to generate new
|
|
1531 PDB file(s).
|
|
1532
|
|
1533 B<--RecordMode> option controls type of record lines to extract and process from
|
|
1534 I<PDBFile(s)>: ATOM, HETATM or both.
|
|
1535
|
|
1536 =item B<--ModifyHeader> I<yes | no>
|
|
1537
|
|
1538 Specify whether to modify HEADER record during the generation of new PDB files
|
|
1539 for B<-m, --mode> values of I<Chains | Atoms | CAlphas | Distance>. Possible values:
|
|
1540 I<yes | no>. Default: I<yes>. By default, Classification data is replaced by I<Data extracted
|
|
1541 using MayaChemTools> before writing out HEADER record.
|
|
1542
|
|
1543 =item B<--NonStandardKeep> I<yes | no>
|
|
1544
|
|
1545 Specify whether to include and convert non-standard three letter residue codes into
|
|
1546 a code specified using B<--nonstandardcode> option and include them into sequence file(s)
|
|
1547 generated during I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>.
|
|
1548 Default: I<yes>.
|
|
1549
|
|
1550 A warning is also printed about the presence of non-standard residues. Any residue other
|
|
1551 than standard 20 amino acids and 5 nucleic acid is considered non-standard; additionally,
|
|
1552 HETATM residues in chains also tagged as non-standard.
|
|
1553
|
|
1554 =item B<--NonStandardCode> I<character>
|
|
1555
|
|
1556 A single character code to use for non-standard residues. Default: I<X>. Possible values:
|
|
1557 I<?, -, or X>.
|
|
1558
|
|
1559 =item B<-o, --overwrite>
|
|
1560
|
|
1561 Overwrite existing files.
|
|
1562
|
|
1563 =item B<-r, --root> I<rootname>
|
|
1564
|
|
1565 New PDB and sequence file name is generated using the root: <Root><Mode>.<Ext>.
|
|
1566 Default new file name: <PDBFileName>Chain<ChainID>.pdb for I<Chains> B<mode>;
|
|
1567 <PDBFileName>SequenceChain<ChainID>.fasta for I<Sequences> B<mode>;
|
|
1568 <PDBFileName>DistanceBy<DistanceMode>.pdb for I<Distance> B<-m, --mode>
|
|
1569 <PDBFileName><Mode>.pdb for I<Atoms | CAlphas | NonWater | NonHydrogens> B<-m, --mode>
|
|
1570 values. This option is ignored for multiple input files.
|
|
1571
|
|
1572 =item B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>
|
|
1573
|
|
1574 Specify type of record lines to extract and process from I<PDBFile(s)> during various
|
|
1575 values of B<-m, --mode> option: extract only ATOM record lines; extract only HETATM
|
|
1576 record lines; extract both ATOM and HETATM lines. Possible values: I<Atom | Hetatm
|
|
1577 | AtomAndHetatm | XYZ>. Default during I<Atoms, CAlphas, AtomNums, AtomsRange,
|
|
1578 AtomNames> values of B<-m, --mode> option: I<Atom>; otherwise: I<AtomAndHetatm>.
|
|
1579
|
|
1580 This option is ignored during I<Chains, Sequences> values of B<-m, --mode> option.
|
|
1581
|
|
1582 =item B<--Residues> I<"ResidueNum,[ResidueNum...]" | "StartResidueNum,EndResiduNum" | "ResidueName,[ResidueName...]">
|
|
1583
|
|
1584 Specify which resiude records to extract from I<PDBFiles(s)> during I<ResidueNums>,
|
|
1585 I<ResiduesRange>,and I<ResidueNames> value of B<-m, --mode> option: extract records
|
|
1586 corresponding to residue numbers specified in a comma delimited list of residue numbers/names,
|
|
1587 or with in the range of start and end residue numbers. Possible values: I<"ResidueNum[,ResidueNum,..]">,
|
|
1588 I<StartResidueNum,EndResiduNum>, or I<<"ResidueName[,ResidueName,..]">. Default: I<None>. Examples:
|
|
1589
|
|
1590 20
|
|
1591 5,10
|
|
1592 TYR,SER,THR
|
|
1593
|
|
1594 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>:
|
|
1595 ATOM, HETATM or both.
|
|
1596
|
|
1597 =item B<--SequenceLength> I<number>
|
|
1598
|
|
1599 Maximum sequence length per line in sequence file(s). Default: I<80>.
|
|
1600
|
|
1601 =item B<--SequenceRecords> I<Atom | SeqRes>
|
|
1602
|
|
1603 Specify which records to use for extracting residue names from I<PDBFiles(s)> during
|
|
1604 I<Sequences> value of B<-m, --mode> option: use ATOM records to compile a list
|
|
1605 of residues in a chain or parse SEQRES record to get a list of residues. Possible values:
|
|
1606 I<Atom | SeqRes>. Default: I<Atom>.
|
|
1607
|
|
1608 =item B<--SequenceIDPrefix> I<FileName | HeaderRecord | Automatic>
|
|
1609
|
|
1610 Specify how to generate a prefix for sequence IDs during I<Sequences> value
|
|
1611 of B<-m, --mode> option: use input file name prefix; retrieve PDB ID from HEADER record;
|
|
1612 or automatically decide the method for generating the prefix. The chain IDs are also
|
|
1613 appended to the prefix. Possible values: I<FileName | HeaderRecord | Automatic>.
|
|
1614 Default: I<Automatic>
|
|
1615
|
|
1616 =item B<--WaterResidueNames> I<Automatic | "ResidueName,[ResidueName,...]">
|
|
1617
|
|
1618 Identification of water residues during I<NonWater> value of B<-m, --mode> option. Possible values:
|
|
1619 I<Automatic | "ResidueName,[ResidueName,...]">. Default: I<Automatic> - corresponds
|
|
1620 to "HOH,WAT,H20". You can also specify a different comma delimited list of residue names
|
|
1621 to use for water.
|
|
1622
|
|
1623 =item B<-w, --WorkingDir> I<dirname>
|
|
1624
|
|
1625 Location of working directory. Default: current directory.
|
|
1626
|
|
1627 =back
|
|
1628
|
|
1629 =head1 EXAMPLES
|
|
1630
|
|
1631 To extract non-water records from Sample2.pdb file and generate Sample2NonWater.pdb
|
|
1632 file, type:
|
|
1633
|
|
1634 % ExtractFromPDBFiles.pl Sample2.pdb
|
|
1635
|
|
1636 To extract non-water records corresponding to only ATOM records from Sample2.pdb file
|
|
1637 and generate Sample2NonWater.pdb file, type:
|
|
1638
|
|
1639 % ExtractFromPDBFiles.pl --RecordMode Atom Sample2.pdb
|
|
1640
|
|
1641 To extract non-water records from Sample2.pdb file using HOH or WAT residue name for water along
|
|
1642 with all old non-coordinate records and generate Sample2NewNonWater.pdb file, type:
|
|
1643
|
|
1644 % ExtractFromPDBFiles.pl -m NonWater --WaterResidueNames "HOH,WAT"
|
|
1645 -KeepOldRecords Yes -r Sample2New -o Sample2.pdb
|
|
1646
|
|
1647 To extract non-hydrogens records from Sample2.pdb file and generate Sample2NonHydrogen.pdb
|
|
1648 file, type:
|
|
1649
|
|
1650 % ExtractFromPDBFiles.pl -m NonHydrogens Sample2.pdb
|
|
1651
|
|
1652 To extract data for first chain in Sample2.pdb and generate Sample2ChainA.pdb, type
|
|
1653 file, type:
|
|
1654
|
|
1655 % ExtractFromPDBFiles.pl -m chains -o Sample2.pdb
|
|
1656
|
|
1657 To extract data for both chains in Sample2.pdb and generate Sample2ChainA.pdb and
|
|
1658 Sample2ChainB.pdb, type:
|
|
1659
|
|
1660 % ExtractFromPDBFiles.pl -m chains -c All -o Sample2.pdb
|
|
1661
|
|
1662 To extract data for alpha carbons in Sample2.pdb and generate Sample2CAlphas.pdb, type:
|
|
1663
|
|
1664 % ExtractFromPDBFiles.pl -m CAlphas -o Sample2.pdb
|
|
1665
|
|
1666 To extract records for specific residue numbers in all chains from Sample2.pdb file and generate
|
|
1667 Sample2ResidueNums.pdb file, type:
|
|
1668
|
|
1669 % ExtractFromPDBFiles.pl -m ResidueNums --Residues "3,6"
|
|
1670 Sample2.pdb
|
|
1671
|
|
1672 To extract records for a specific range of residue number in all chains from Sample2.pdb
|
|
1673 file and generate Sample2ResiduesRange.pdb file, type:
|
|
1674
|
|
1675 % ExtractFromPDBFiles.pl -m ResiduesRange --Residues "10,30"
|
|
1676 Sample2.pdb
|
|
1677
|
|
1678 To extract data for all ATOM and HETATM records with in 10 angstrom of an atom specifed by
|
|
1679 atom serial number and name "1,N" in Sample2.pdb file and generate Sample2DistanceByAtom.pdb,
|
|
1680 type:
|
|
1681
|
|
1682 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom
|
|
1683 --DistanceOrigin "1,N" -k No --distance 10 -o Sample2.pdb
|
|
1684
|
|
1685 To extract data for all ATOM and HETATM records for complete residues with any atom or hetatm
|
|
1686 less than 10 angstrom of an atom specifed by atom serial number and name "1,N" in Sample2.pdb
|
|
1687 file and generate Sample2DistanceByAtom.pdb, type:
|
|
1688
|
|
1689 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom
|
|
1690 --DistanceOrigin "1,N" --DistanceSelectionMode ByResidue
|
|
1691 -k No --distance 10 -o Sample2.pdb
|
|
1692
|
|
1693 To extract data for all ATOM and HETATM records with in 25 angstrom of an arbitrary point "0,0,0"
|
|
1694 in Sample2.pdb file and generate Sample2DistanceByXYZ.pdb, type:
|
|
1695
|
|
1696 % ExtractFromPDBFiles.pl -m Distance --DistanceMode XYZ
|
|
1697 --DistanceOrigin "0,0,0" -k No --distance 25 -o Sample2.pdb
|
|
1698
|
|
1699 =head1 AUTHOR
|
|
1700
|
|
1701 Manish Sud <msud@san.rr.com>
|
|
1702
|
|
1703 =head1 SEE ALSO
|
|
1704
|
|
1705 InfoPDBFiles.pl, ModifyPDBFiles.pl
|
|
1706
|
|
1707 =head1 COPYRIGHT
|
|
1708
|
|
1709 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1710
|
|
1711 This file is part of MayaChemTools.
|
|
1712
|
|
1713 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1714 the terms of the GNU Lesser General Public License as published by the Free
|
|
1715 Software Foundation; either version 3 of the License, or (at your option)
|
|
1716 any later version.
|
|
1717
|
|
1718 =cut
|