1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: ExtractFromPDBFiles.pl,v $ 4 # $Date: 2015/02/28 20:46:19 $ 5 # $Revision: 1.39 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use TextUtil; 37 use PDBFileUtil; 38 use AminoAcids; 39 use SequenceFileUtil; 40 41 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 42 43 # Autoflush STDOUT 44 $| = 1; 45 46 # Starting message... 47 $ScriptName = basename($0); 48 print "\n$ScriptName: Starting...\n\n"; 49 $StartTime = new Benchmark; 50 51 # Get the options and setup script... 52 SetupScriptUsage(); 53 if ($Options{help} || @ARGV < 1) { 54 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 55 } 56 57 my(@PDBFilesList); 58 @PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); 59 60 # Process options... 61 print "Processing options...\n"; 62 my(%OptionsInfo); 63 ProcessOptions(); 64 65 # Setup information about input files... 66 print "Checking input PDB file(s)...\n"; 67 my(%PDBFilesInfo); 68 RetrievePDBFilesInfo(); 69 70 # Process input files.. 71 my($FileIndex); 72 if (@PDBFilesList > 1) { 73 print "\nProcessing PDB files...\n"; 74 } 75 for $FileIndex (0 .. $#PDBFilesList) { 76 if ($PDBFilesInfo{FileOkay}[$FileIndex]) { 77 print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; 78 ExtractFromPDBFiles($FileIndex); 79 } 80 } 81 print "\n$ScriptName:Done...\n\n"; 82 83 $EndTime = new Benchmark; 84 $TotalTime = timediff ($EndTime, $StartTime); 85 print "Total time: ", timestr($TotalTime), "\n"; 86 87 ############################################################################### 88 89 # Extract appropriate information... 90 sub ExtractFromPDBFiles { 91 my($FileIndex) = @_; 92 my($PDBFile, $PDBRecordLinesRef); 93 94 # Get PDB data... 95 $PDBFile = $PDBFilesList[$FileIndex]; 96 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 97 98 if ($OptionsInfo{Mode} =~ /Chains/i) { 99 ExtractChains($FileIndex, $PDBRecordLinesRef); 100 } 101 elsif ($OptionsInfo{Mode} =~ /Sequences/i) { 102 ExtractSequences($FileIndex, $PDBRecordLinesRef); 103 } 104 elsif ($OptionsInfo{Mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i) { 105 ExtractByAtoms($FileIndex, $PDBRecordLinesRef); 106 } 107 elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { 108 ExtractByResidues($FileIndex, $PDBRecordLinesRef); 109 } 110 elsif ($OptionsInfo{Mode} =~ /Distance/i) { 111 ExtractByDistance($FileIndex, $PDBRecordLinesRef); 112 } 113 elsif ($OptionsInfo{Mode} =~ /NonWater/i) { 114 ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef); 115 } 116 elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) { 117 ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef); 118 } 119 } 120 121 # Extract chains and generate new PDB files... 122 # 123 sub ExtractChains { 124 my($FileIndex, $PDBRecordLinesRef) = @_; 125 my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap); 126 127 # Get chains and residues data... 128 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1); 129 130 if ($OptionsInfo{CombineChains}) { 131 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 132 print "Generating PDBFileName file $PDBFileName...\n"; 133 134 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 135 136 # Write out header and other older recors... 137 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 138 } 139 140 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 141 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 142 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 143 144 if (!$OptionsInfo{CombineChains}) { 145 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 146 print "Generating PDBFileName file $PDBFileName...\n"; 147 148 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 149 150 # Write out header and other older recors... 151 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 152 } 153 154 # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers 155 # for writing out appropriate CONECT records... 156 %ChainAtomNumbersMap = (); 157 for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) { 158 print OUTFILE "$RecordLine\n"; 159 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine); 160 $AtomNumber = int $AtomNumber; 161 $ChainAtomNumbersMap{$AtomNumber} = $AtomName; 162 } 163 # Write out TER record using information from last chain record... 164 $AtomNumber += 1; 165 print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n"; 166 167 # Write out CONECT records... 168 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap); 169 170 for $RecordLine (@{$ConectRecordLinesRef}) { 171 print OUTFILE "$RecordLine\n"; 172 } 173 174 if (!$OptionsInfo{CombineChains}) { 175 # Write out END record... 176 print OUTFILE GenerateEndRecordLine(), "\n"; 177 178 close OUTFILE; 179 } 180 } 181 182 if ($OptionsInfo{CombineChains}) { 183 # Write out END record... 184 print OUTFILE GenerateEndRecordLine(), "\n"; 185 186 close OUTFILE; 187 } 188 189 } 190 191 # Extract sequences for individual chains or combine all the chains... 192 sub ExtractSequences { 193 my($FileIndex, $PDBRecordLinesRef) = @_; 194 my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap); 195 196 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 197 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 198 } 199 else { 200 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 201 } 202 203 # Generate sequence data for all the chains... 204 %ChainSequencesDataMap = (); 205 @{$ChainSequencesDataMap{IDs}} = (); 206 %{$ChainSequencesDataMap{Sequence}} = (); 207 %{$ChainSequencesDataMap{Description}} = (); 208 209 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 210 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 211 $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; 212 213 # Setup sequence ID... 214 $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex]; 215 push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID; 216 $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID; 217 218 # Collect sequence data for the chain... 219 if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) { 220 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 221 } 222 else { 223 $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; 224 } 225 # Setup sequence data... 226 $ChainSequence = ''; 227 RESIDUE: for $Residue (@{$ChainResiduesRef}) { 228 ($ResidueCode, $StandardResidue) = GetResidueCode($Residue); 229 if (!$StandardResidue) { 230 if ($OptionsInfo{KeepNonStandardSequences}) { 231 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 232 warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n"; 233 } 234 else { 235 warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n"; 236 next RESIDUE; 237 } 238 } 239 $ChainSequence .= $ResidueCode; 240 } 241 $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence; 242 243 } 244 245 # Write out the sequence files... 246 my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap ); 247 if ($OptionsInfo{CombineChainSequences}) { 248 # Combine all the chain sequences... 249 $Sequence = ''; 250 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 251 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 252 253 $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID}; 254 } 255 $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";; 256 $SequenceDescription = $SequenceID; 257 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 258 259 print "Generating sequence file $SequenceFileName...\n"; 260 %SequencesDataMap = (); 261 @{$SequencesDataMap{IDs}} = (); 262 %{$SequencesDataMap{Sequence}} = (); 263 %{$SequencesDataMap{Description}} = (); 264 265 push @{$SequencesDataMap{IDs}}, $SequenceID; 266 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 267 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 268 269 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 270 } 271 else { 272 # For each specifed chain, write out the sequences... 273 for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { 274 $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; 275 276 $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; 277 278 $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex]; 279 $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID}; 280 $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID}; 281 282 print "Generating sequence file $SequenceFileName...\n"; 283 %SequencesDataMap = (); 284 @{$SequencesDataMap{IDs}} = (); 285 %{$SequencesDataMap{Sequence}} = (); 286 %{$SequencesDataMap{Description}} = (); 287 288 push @{$SequencesDataMap{IDs}}, $SequenceID; 289 $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; 290 $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; 291 292 WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); 293 } 294 } 295 } 296 297 # Extract atoms... 298 sub ExtractByAtoms { 299 my($FileIndex, $PDBRecordLinesRef) = @_; 300 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap); 301 302 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 303 print "Generating PDBFileName file $PDBFileName...\n"; 304 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 305 306 # Write out header and other older recors... 307 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 308 309 # Write out all ATOM records along with TER and model records to indicate 310 # chains and multiple models.. 311 %AtomNumbersMap = (); 312 $ChainRecordCount = 0; 313 for $RecordLine (@{$PDBRecordLinesRef}) { 314 if (CheckRecordType($RecordLine)) { 315 ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); 316 317 # Check atoms... 318 $IgnoreRecord = 1; 319 if ($OptionsInfo{Mode} =~ /^Atoms$/i) { 320 $IgnoreRecord = 0; 321 } 322 elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) { 323 if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) { 324 $IgnoreRecord = 0; 325 } 326 } 327 elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) { 328 if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) { 329 $IgnoreRecord = 0; 330 } 331 } 332 elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { 333 if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) { 334 $IgnoreRecord = 0; 335 } 336 } 337 338 if (!$IgnoreRecord) { 339 $ChainRecordCount++; 340 print OUTFILE "$RecordLine\n"; 341 342 $AtomNumber = int $AtomNumber; 343 $AtomNumbersMap{$AtomNumber} = $AtomName; 344 } 345 } 346 elsif (IsTerRecordType($RecordLine)) { 347 if ($ChainRecordCount) { 348 print OUTFILE GenerateTerRecordLine(), "\n"; 349 } 350 $ChainRecordCount = 0; 351 } 352 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 353 print OUTFILE "$RecordLine\n"; 354 } 355 } 356 357 # Write out appropriate CONECT records... 358 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 359 for $RecordLine (@{$ConectRecordLinesRef}) { 360 print OUTFILE "$RecordLine\n"; 361 } 362 363 # Write out END record... 364 print OUTFILE GenerateEndRecordLine(), "\n"; 365 366 close OUTFILE; 367 } 368 369 # Extract residues... 370 sub ExtractByResidues { 371 my($FileIndex, $PDBRecordLinesRef) = @_; 372 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap); 373 374 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 375 print "Generating PDBFileName file $PDBFileName...\n"; 376 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 377 378 # Write out header and other older recors... 379 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 380 381 # Write out all ATOM records for specified residues with TER and model records to indicate 382 # chains and multiple models... 383 %AtomNumbersMap = (); 384 $ChainRecordCount = 0; 385 for $RecordLine (@{$PDBRecordLinesRef}) { 386 if (CheckRecordType($RecordLine)) { 387 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine); 388 389 # Check residues... 390 $IgnoreRecord = 1; 391 if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) { 392 if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) { 393 $IgnoreRecord = 0; 394 } 395 } 396 elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 397 if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) { 398 $IgnoreRecord = 0; 399 } 400 } 401 elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { 402 if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) { 403 $IgnoreRecord = 0; 404 } 405 } 406 if (!$IgnoreRecord) { 407 $ChainRecordCount++; 408 print OUTFILE "$RecordLine\n"; 409 $AtomNumber = int $AtomNumber; 410 $AtomNumbersMap{$AtomNumber} = $AtomName; 411 } 412 } 413 elsif (IsTerRecordType($RecordLine)) { 414 if ($ChainRecordCount) { 415 print OUTFILE GenerateTerRecordLine(), "\n"; 416 } 417 $ChainRecordCount = 0; 418 } 419 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 420 print OUTFILE "$RecordLine\n"; 421 } 422 } 423 424 # Write out appropriate CONECT records... 425 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 426 for $RecordLine (@{$ConectRecordLinesRef}) { 427 print OUTFILE "$RecordLine\n"; 428 } 429 # Write out END record... 430 print OUTFILE GenerateEndRecordLine(), "\n"; 431 432 close OUTFILE; 433 } 434 435 # Extract non water records... 436 sub ExtractNonWaterRecords { 437 my($FileIndex, $PDBRecordLinesRef) = @_; 438 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap); 439 440 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 441 print "Generating PDBFileName file $PDBFileName...\n"; 442 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 443 444 # Write out header and other older recors... 445 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 446 447 # Write out all ATOM/HETATM non water records along with TER and model records to indicate 448 # chains and multiple models.. 449 %AtomNumbersMap = (); 450 $ChainRecordCount = 0; 451 for $RecordLine (@{$PDBRecordLinesRef}) { 452 if (CheckRecordType($RecordLine)) { 453 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine); 454 if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) { 455 $ChainRecordCount++; 456 print OUTFILE "$RecordLine\n"; 457 $AtomNumber = int $AtomNumber; 458 $AtomNumbersMap{$AtomNumber} = $AtomName; 459 } 460 } 461 elsif (IsTerRecordType($RecordLine)) { 462 if ($ChainRecordCount) { 463 print OUTFILE GenerateTerRecordLine(), "\n"; 464 } 465 $ChainRecordCount = 0; 466 } 467 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 468 print OUTFILE "$RecordLine\n"; 469 } 470 } 471 472 # Write out appropriate CONECT records... 473 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 474 for $RecordLine (@{$ConectRecordLinesRef}) { 475 print OUTFILE "$RecordLine\n"; 476 } 477 # Write out END record... 478 print OUTFILE GenerateEndRecordLine(), "\n"; 479 480 close OUTFILE; 481 } 482 483 # Extract non hydrogen records... 484 sub ExtractNonHydrogenRecords { 485 my($FileIndex, $PDBRecordLinesRef) = @_; 486 my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap); 487 488 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 489 print "Generating PDBFileName file $PDBFileName...\n"; 490 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 491 492 # Write out header and other older recors... 493 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 494 495 # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate 496 # chains and multiple models.. 497 %AtomNumbersMap = (); 498 $ChainRecordCount = 0; 499 for $RecordLine (@{$PDBRecordLinesRef}) { 500 if (CheckRecordType($RecordLine)) { 501 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); 502 if ($ElementSymbol !~ /^H$/i) { 503 $ChainRecordCount++; 504 print OUTFILE "$RecordLine\n"; 505 $AtomNumber = int $AtomNumber; 506 $AtomNumbersMap{$AtomNumber} = $AtomName; 507 } 508 } 509 elsif (IsTerRecordType($RecordLine)) { 510 if ($ChainRecordCount) { 511 print OUTFILE GenerateTerRecordLine(), "\n"; 512 } 513 $ChainRecordCount = 0; 514 } 515 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 516 print OUTFILE "$RecordLine\n"; 517 } 518 } 519 520 # Write out appropriate CONECT records... 521 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 522 for $RecordLine (@{$ConectRecordLinesRef}) { 523 print OUTFILE "$RecordLine\n"; 524 } 525 # Write out END record... 526 print OUTFILE GenerateEndRecordLine(), "\n"; 527 528 close OUTFILE; 529 } 530 531 # Extract ATOM/HETATM records by distance... 532 sub ExtractByDistance { 533 my($FileIndex, $PDBRecordLinesRef) = @_; 534 my($PDBFileName, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap); 535 536 $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; 537 print "Generating PDBFileName file $PDBFileName...\n"; 538 open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; 539 540 # Write out header and other older recors... 541 WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); 542 543 # Setup coordinates of origin to calculate distance... 544 @OriginCoords = (); 545 push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]}; 546 547 # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate 548 # chains and multiple models... 549 %AtomNumbersMap = (); 550 551 %ResiduesDataMap = (); 552 %{$ResiduesDataMap{ID}} = (); 553 %{$ResiduesDataMap{Status}} = (); 554 555 $ChainRecordCount = 0; 556 $RecordLineNum = 0; 557 558 for $RecordLine (@{$PDBRecordLinesRef}) { 559 $RecordLineNum++; 560 if (CheckRecordType($RecordLine)) { 561 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 562 @Coords = (); push @Coords, ($X, $Y, $Z); 563 564 $IgnoreRecord = 1; 565 if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) { 566 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; 567 if (exists $ResiduesDataMap{ID}{$ResidueID}) { 568 # Residue data has been processed; check its selection status... 569 if ($ResiduesDataMap{Status}{$ResidueID}) { 570 $IgnoreRecord = 0; 571 } 572 } 573 else { 574 # Residue hasn't been processed... 575 $ResiduesDataMap{ID}{$ResidueID} = $ResidueID; 576 $ResiduesDataMap{Status}{$ResidueID} = 0; 577 if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) { 578 $IgnoreRecord = 0; 579 $ResiduesDataMap{Status}{$ResidueID} = 1; 580 } 581 } 582 } 583 elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) { 584 if (CheckDistance(\@Coords, \@OriginCoords)) { 585 $IgnoreRecord = 0; 586 } 587 } 588 589 if (!$IgnoreRecord) { 590 $ChainRecordCount++; 591 print OUTFILE "$RecordLine\n"; 592 $AtomNumber = int $AtomNumber; 593 $AtomNumbersMap{$AtomNumber} = $AtomName; 594 } 595 } 596 elsif (IsTerRecordType($RecordLine)) { 597 if ($ChainRecordCount) { 598 print OUTFILE GenerateTerRecordLine(), "\n"; 599 } 600 $ChainRecordCount = 0; 601 } 602 elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { 603 print OUTFILE "$RecordLine\n"; 604 } 605 } 606 607 # Write out appropriate CONECT records... 608 $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); 609 for $RecordLine (@{$ConectRecordLinesRef}) { 610 print OUTFILE "$RecordLine\n"; 611 } 612 613 # Write out END record... 614 print OUTFILE GenerateEndRecordLine(), "\n"; 615 616 close OUTFILE; 617 } 618 619 # Does record type correspond to the specified record type? 620 sub CheckRecordType { 621 my($RecordLine) = @_; 622 my($Status); 623 624 $Status = 0; 625 if ($OptionsInfo{RecordMode} =~ /^Atom$/i) { 626 $Status = IsAtomRecordType($RecordLine) ? 1 : 0; 627 } 628 elsif ($OptionsInfo{RecordMode} =~ /^Hetatm$/i) { 629 $Status = IsHetatmRecordType($RecordLine) ? 1 : 0; 630 } 631 elsif ($OptionsInfo{RecordMode} =~ /^AtomAndHetatm$/i) { 632 $Status = (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0; 633 } 634 635 return $Status; 636 } 637 638 # Does record meets distance citerion specified by the user? 639 sub CheckResidueDistance { 640 my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_; 641 my($Status, $RecordLine, $RecordLineIndex, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $ResidueID, @Coords); 642 643 $Status = 0; 644 645 RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) { 646 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 647 if (!CheckRecordType($RecordLine)) { 648 next RECORDLINE; 649 } 650 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 651 $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; 652 653 if ($ResidueID !~ /^$SpecifiedResidueID$/i) { 654 # It's a new residue line... 655 last RECORDLINE; 656 } 657 658 # Check distance... 659 @Coords = (); push @Coords, ($X, $Y, $Z); 660 if (CheckDistance(\@Coords, $OriginCoordsRef)) { 661 # Distance criterion is met for at least one record in the residue... 662 $Status = 1; 663 last RECORDLINE; 664 } 665 } 666 return $Status; 667 } 668 669 # Does record meets distance citerion specified by the user? 670 sub CheckDistance { 671 my($CoordsRef, $OriginCoordsRef) = @_; 672 my($Status, $Index, $Distance, $DistanceSquare); 673 674 $Status = 0; 675 676 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 677 # Go over coordinates of all the atoms in the residue... 678 my($ResidueCoordsCount) = scalar @{$OriginCoordsRef}; 679 INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) { 680 $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2; 681 $Distance = sqrt $DistanceSquare; 682 if ($Distance <= $OptionsInfo{MaxExtractionDistance}) { 683 $Status = 1; 684 last INDEX; 685 } 686 } 687 } 688 else { 689 $DistanceSquare = 0; 690 for $Index (0 .. 2) { 691 $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2; 692 } 693 $Distance = sqrt $DistanceSquare; 694 $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0; 695 } 696 697 return $Status; 698 } 699 700 # Write out modifed header and other older records... 701 sub WriteHeaderAndOlderRecords { 702 my($OutFileRef, $PDBRecordLinesRef) = @_; 703 704 if ($OptionsInfo{ModifyHeaderRecord}) { 705 # Write out modified HEADER record... 706 my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 707 $Classification = 'Data extracted using MayaChemTools'; 708 print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n"; 709 } 710 else { 711 print $OutFileRef $PDBRecordLinesRef->[0], "\n"; 712 } 713 714 # Write out any old records... 715 if ($OptionsInfo{KeepOldRecords}) { 716 my($RecordLineIndex, $RecordLine); 717 # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file... 718 RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) { 719 $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; 720 if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { 721 last RECORDLINE; 722 } 723 print $OutFileRef "$RecordLine\n"; 724 } 725 } 726 } 727 728 # Get header record information assuming it's the first record... 729 sub GetHeaderRecordInformation { 730 my($PDBRecordLinesRef) = @_; 731 my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine); 732 733 ($Classification, $DepositionDate, $IDCode) = ('') x 3; 734 $HeaderRecordLine = $PDBRecordLinesRef->[0]; 735 if (IsHeaderRecordType($HeaderRecordLine)) { 736 ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); 737 } 738 return ($Classification, $DepositionDate, $IDCode); 739 } 740 741 # Get one letter residue code... 742 sub GetResidueCode { 743 my($ResidueName) = @_; 744 my($ResidueCode, $StandardResidue); 745 746 $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; 747 $StandardResidue = 0; 748 749 if (length($ResidueName) == 3) { 750 # Assume it's an amino acid... 751 if (AminoAcids::IsAminoAcid($ResidueName)) { 752 # Standard amino acid... 753 $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName); 754 $StandardResidue = 1; 755 } 756 } 757 elsif (length($ResidueName) == 1) { 758 # Assume it's a nucleic acid... 759 if ($ResidueName =~ /^(A|G|T|U|C)$/i) { 760 $ResidueCode = $ResidueName; 761 $StandardResidue = 1; 762 } 763 } 764 765 return ($ResidueCode, $StandardResidue); 766 } 767 768 # Process option values... 769 sub ProcessOptions { 770 %OptionsInfo = (); 771 $OptionsInfo{Mode} = $Options{mode}; 772 773 my(@SpecifiedChains) = (); 774 if ($Options{chains} =~ /^(First|All)$/i) { 775 $OptionsInfo{ChainsToExtract} = $Options{chains}; 776 } 777 else { 778 @SpecifiedChains = split /\,/, $Options{chains}; 779 $OptionsInfo{ChainsToExtract} = 'Specified'; 780 } 781 @{$OptionsInfo{SpecifiedChains}} = (); 782 push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains; 783 784 $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 785 786 $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; 787 788 ProcessResiduesOptions(); 789 ProcessAtomsOptions(); 790 ProcessDistanceOptions(); 791 792 $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames}; 793 @{$OptionsInfo{SpecifiedWaterResiduesList}} = (); 794 %{$OptionsInfo{SpecifiedWaterResiduesMap}} = (); 795 796 my(@SpecifiedWaterResiduesList); 797 @SpecifiedWaterResiduesList = (); 798 799 if ($OptionsInfo{Mode} =~ /^NonWater$/i) { 800 my($WaterResidueName); 801 if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) { 802 push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O'); 803 } 804 else { 805 @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames}; 806 } 807 for $WaterResidueName (@SpecifiedWaterResiduesList) { 808 $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName; 809 } 810 } 811 push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList; 812 813 $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm"); 814 815 $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0; 816 817 $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0; 818 819 $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0; 820 $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode}; 821 $OptionsInfo{MaxSequenceLength} = $Options{sequencelength}; 822 $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords}; 823 $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix}; 824 825 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; 826 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; 827 } 828 829 # Process specified residue options... 830 sub ProcessResiduesOptions { 831 my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList); 832 833 @SpecifiedResidueNumsList = (); 834 ($StartResidueNum, $EndResNum) = (0, 0); 835 836 @SpecifiedResidueNamesList = (); 837 838 if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { 839 if (!$Options{residues}) { 840 die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n"; 841 } 842 $OptionsInfo{Residues} = $Options{residues}; 843 $OptionsInfo{Residues} =~ s/ //g; 844 845 if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { 846 @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues}; 847 } 848 else { 849 @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues}; 850 for $ResidueNum (@SpecifiedResidueNumsList) { 851 if (!IsPositiveInteger($ResidueNum)) { 852 die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n"; 853 } 854 } 855 if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { 856 if (@SpecifiedResidueNumsList != 2) { 857 die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n"; 858 } 859 if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) { 860 die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than end residue number.\n"; 861 } 862 ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList; 863 } 864 } 865 } 866 867 @{$OptionsInfo{SpecifiedResidueNumsList}} = (); 868 push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList; 869 870 $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum; 871 $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum; 872 873 @{$OptionsInfo{SpecifiedResidueNamesList}} = (); 874 push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList; 875 876 # Set up a specified residue numbers map... 877 %{$OptionsInfo{SpecifiedResidueNumsMap}} = (); 878 for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) { 879 $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum; 880 } 881 882 # Set up a specified residue names map... 883 %{$OptionsInfo{SpecifiedResidueNamesMap}} = (); 884 for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) { 885 $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName; 886 } 887 888 } 889 890 # Process specified atom options... 891 sub ProcessAtomsOptions { 892 my($AtomNum, $StartAtomNum, $EndAtomNum, $AtomName, @SpecifiedAtomNumsList, @SpecifiedAtomNamesList); 893 894 @SpecifiedAtomNumsList = (); 895 ($StartAtomNum, $EndAtomNum) = (0, 0); 896 897 @SpecifiedAtomNamesList = (); 898 899 if ($OptionsInfo{Mode} =~ /^(AtomNums|AtomsRange|AtomNames)$/i) { 900 if (!$Options{atoms}) { 901 die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n"; 902 } 903 $OptionsInfo{Atoms} = $Options{atoms}; 904 $OptionsInfo{Atoms} =~ s/ //g; 905 906 if ($OptionsInfo{Mode} =~ /^AtomNames$/i) { 907 @SpecifiedAtomNamesList = split /\,/, $OptionsInfo{Atoms}; 908 } 909 else { 910 @SpecifiedAtomNumsList = split /\,/, $OptionsInfo{Atoms}; 911 for $AtomNum (@SpecifiedAtomNumsList) { 912 if (!IsPositiveInteger($AtomNum)) { 913 die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n"; 914 } 915 } 916 if ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { 917 if (@SpecifiedAtomNumsList != 2) { 918 die "Error: Invalid number of atom number values, ", scalar(@SpecifiedAtomNumsList), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n"; 919 } 920 if ($SpecifiedAtomNumsList[0] > $SpecifiedAtomNumsList[1]) { 921 die "Error: Invalid atom number values, @SpecifiedAtomNumsList, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n"; 922 } 923 ($StartAtomNum, $EndAtomNum) = @SpecifiedAtomNumsList; 924 } 925 } 926 } 927 elsif ($OptionsInfo{Mode} =~ /^CAlphas$/i) { 928 @SpecifiedAtomNamesList = ("CA"); 929 } 930 931 @{$OptionsInfo{SpecifiedAtomNumsList}} = (); 932 push @{$OptionsInfo{SpecifiedAtomNumsList}}, @SpecifiedAtomNumsList; 933 934 $OptionsInfo{SpecifiedStartAtomNum} = $StartAtomNum; 935 $OptionsInfo{SpecifiedEndAtomNum} = $EndAtomNum; 936 937 @{$OptionsInfo{SpecifiedAtomNamesList}} = (); 938 push @{$OptionsInfo{SpecifiedAtomNamesList}}, @SpecifiedAtomNamesList; 939 940 # Set up a specified residue numbers map... 941 %{$OptionsInfo{SpecifiedAtomNumsMap}} = (); 942 for $AtomNum (@{$OptionsInfo{SpecifiedAtomNumsList}}) { 943 $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNum} = $AtomNum; 944 } 945 946 # Set up a specified residue names map... 947 %{$OptionsInfo{SpecifiedAtomNamesMap}} = (); 948 for $AtomName (@{$OptionsInfo{SpecifiedAtomNamesList}}) { 949 $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName} = lc $AtomName; 950 } 951 952 } 953 954 # Process specified distance options... 955 sub ProcessDistanceOptions { 956 my(@SpecifiedDistanceOrigin) = (); 957 958 $OptionsInfo{MaxExtractionDistance} = $Options{distance}; 959 $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode}; 960 $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : ''; 961 $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode}; 962 963 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 964 if (!$Options{distanceorigin}) { 965 die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n"; 966 } 967 @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin}; 968 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 969 if (@SpecifiedDistanceOrigin != 2) { 970 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n"; 971 } 972 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 973 die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n"; 974 } 975 } 976 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 977 if (@SpecifiedDistanceOrigin != 2) { 978 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n"; 979 } 980 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 981 die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n"; 982 } 983 } 984 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 985 if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) { 986 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n"; 987 } 988 if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { 989 die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n"; 990 } 991 } 992 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 993 if (@SpecifiedDistanceOrigin != 3) { 994 die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n"; 995 } 996 my($Value); 997 for $Value (@SpecifiedDistanceOrigin) { 998 if (!IsNumerical($Value)) { 999 die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n"; 1000 } 1001 } 1002 } 1003 } 1004 @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = (); 1005 push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin; 1006 1007 } 1008 1009 # Retrieve information about PDB files... 1010 sub RetrievePDBFilesInfo { 1011 my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix); 1012 1013 %PDBFilesInfo = (); 1014 @{$PDBFilesInfo{FileOkay}} = (); 1015 @{$PDBFilesInfo{OutFileRoot}} = (); 1016 @{$PDBFilesInfo{OutFileNames}} = (); 1017 @{$PDBFilesInfo{ChainLabels}} = (); 1018 @{$PDBFilesInfo{ChainSequenceIDs}} = (); 1019 @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = (); 1020 @{$PDBFilesInfo{SpecifiedChains}} = (); 1021 @{$PDBFilesInfo{DistanceOrigin}} = (); 1022 1023 FILELIST: for $Index (0 .. $#PDBFilesList) { 1024 $PDBFilesInfo{FileOkay}[$Index] = 0; 1025 1026 $PDBFilesInfo{OutFileRoot}[$Index] = ''; 1027 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 1028 @{$PDBFilesInfo{OutFileNames}[$Index]} = (); 1029 @{$PDBFilesInfo{ChainLabels}[$Index]} = (); 1030 @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = (); 1031 @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = (); 1032 @{$PDBFilesInfo{SpecifiedChains}[$Index]} = (); 1033 @{$PDBFilesInfo{DistanceOrigin}[$Index]} = (); 1034 1035 $PDBFile = $PDBFilesList[$Index]; 1036 if (!(-e $PDBFile)) { 1037 warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; 1038 next FILELIST; 1039 } 1040 if (!CheckFileType($PDBFile, "pdb")) { 1041 warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; 1042 next FILELIST; 1043 } 1044 if (! open PDBFILE, "$PDBFile") { 1045 warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; 1046 next FILELIST; 1047 } 1048 close PDBFILE; 1049 1050 # Get PDB data... 1051 $PDBRecordLinesRef = ReadPDBFile($PDBFile); 1052 if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { 1053 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); 1054 } 1055 else { 1056 $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); 1057 } 1058 if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) { 1059 warn "Warning: Ignoring file $PDBFile: No chains found \n"; 1060 next FILELIST; 1061 } 1062 1063 # Make sure specified chains exist in PDB file... 1064 @SpecifiedChains = (); 1065 if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) { 1066 for $ChainID (@{$OptionsInfo{SpecifiedChains}}) { 1067 if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) { 1068 push @SpecifiedChains, $ChainID; 1069 } 1070 else { 1071 warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n"; 1072 next FILELIST; 1073 } 1074 } 1075 } 1076 elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) { 1077 push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0]; 1078 } 1079 elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) { 1080 push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}}; 1081 } 1082 # Setup chain labels to use for sequence IDs and generating output files... 1083 @ChainLabels = (); 1084 for $ChainID (@SpecifiedChains) { 1085 $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig; 1086 $ChainLabel = "Chain${ChainLabel}"; 1087 push @ChainLabels, $ChainLabel; 1088 } 1089 1090 # Make sure specified distance origin is valid... 1091 @DistanceOrigin = (); 1092 if ($OptionsInfo{Mode} =~ /^Distance$/i) { 1093 if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) { 1094 my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 1095 $RecordType = $OptionsInfo{ExtractionDistanceMode}; 1096 ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1097 $RecordFound = 0; 1098 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 1099 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 1100 next LINE; 1101 } 1102 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 1103 $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName); 1104 if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) { 1105 if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) { 1106 $RecordFound = 1; 1107 last LINE; 1108 } 1109 } 1110 } 1111 if (!$RecordFound) { 1112 warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 1113 next FILELIST; 1114 } 1115 push @DistanceOrigin, ($X, $Y, $Z); 1116 } 1117 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 1118 my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); 1119 $SpecifiedChainID = ''; 1120 if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) { 1121 ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1122 } 1123 else { 1124 ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1125 } 1126 $RecordFound = 0; 1127 LINE: for $RecordLine (@{$PDBRecordLinesRef}) { 1128 if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { 1129 next LINE; 1130 } 1131 ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); 1132 $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName); 1133 $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID); 1134 if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) { 1135 next LINE; 1136 } 1137 if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) { 1138 # Store coordinates for all the atoms... 1139 $RecordFound = 1; 1140 push @DistanceOrigin, ($X, $Y, $Z); 1141 next LINE; 1142 } 1143 } 1144 if (!$RecordFound) { 1145 warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; 1146 next FILELIST; 1147 } 1148 } 1149 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 1150 push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; 1151 } 1152 } 1153 # Setup output file names... 1154 @OutFileNames = (); 1155 $FileDir = ""; $FileName = ""; $FileExt = ""; 1156 ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile); 1157 if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) { 1158 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); 1159 if ($RootFileName && $RootFileExt) { 1160 $FileName = $RootFileName; 1161 } 1162 else { 1163 $FileName = $OptionsInfo{OutFileRoot}; 1164 } 1165 $OutFileRoot = $FileName; 1166 } 1167 else { 1168 $OutFileRoot = $FileName; 1169 } 1170 $Mode = $OptionsInfo{Mode}; 1171 if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) { 1172 $OutFileName = ''; 1173 if ($Mode =~ /^CAlphas$/i) { 1174 $OutFileName = "${OutFileRoot}CAlphas.pdb"; 1175 } 1176 elsif ($Mode =~ /^Atoms$/i) { 1177 $OutFileName = "${OutFileRoot}Atoms.pdb"; 1178 } 1179 elsif ($Mode =~ /^AtomNums$/i) { 1180 $OutFileName = "${OutFileRoot}AtomNums.pdb"; 1181 } 1182 elsif ($Mode =~ /^AtomsRange$/i) { 1183 $OutFileName = "${OutFileRoot}AtomsRange.pdb"; 1184 } 1185 elsif ($Mode =~ /^AtomNames$/i) { 1186 $OutFileName = "${OutFileRoot}AtomNames.pdb"; 1187 } 1188 elsif ($Mode =~ /^ResidueNums$/i) { 1189 $OutFileName = "${OutFileRoot}ResidueNums.pdb"; 1190 } 1191 elsif ($Mode =~ /^ResiduesRange$/i) { 1192 $OutFileName = "${OutFileRoot}ResiduesRange.pdb"; 1193 } 1194 elsif ($Mode =~ /^ResidueNames$/i) { 1195 $OutFileName = "${OutFileRoot}ResidueNames.pdb"; 1196 } 1197 elsif ($Mode =~ /^NonWater$/i) { 1198 $OutFileName = "${OutFileRoot}NonWater.pdb"; 1199 } 1200 elsif ($Mode =~ /^NonHydrogens$/i) { 1201 $OutFileName = "${OutFileRoot}NonHydrogens.pdb"; 1202 } 1203 elsif ($Mode =~ /^Distance$/i) { 1204 my($DistanceMode) = ''; 1205 if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { 1206 $DistanceMode = 'Atom'; 1207 } 1208 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { 1209 $DistanceMode = 'Hetatm'; 1210 } 1211 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { 1212 $DistanceMode = 'Residue'; 1213 } 1214 elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { 1215 $DistanceMode = 'XYZ'; 1216 } 1217 $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb"; 1218 } 1219 push @OutFileNames, $OutFileName; 1220 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1221 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1222 next FILELIST; 1223 } 1224 } 1225 elsif ($Mode =~ /^(Chains|Sequences)$/i) { 1226 if ($OptionsInfo{CombineChainSequences}) { 1227 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta"; 1228 push @OutFileNames, $OutFileName; 1229 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1230 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1231 next FILELIST; 1232 } 1233 } 1234 else { 1235 for $ChainLabel (@ChainLabels) { 1236 $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta"; 1237 push @OutFileNames, $OutFileName; 1238 if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { 1239 warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; 1240 next FILELIST; 1241 } 1242 } 1243 } 1244 } 1245 @ChainSequenceIDs = (); 1246 @ChainSequenceIDsPrefix = (); 1247 if ($Mode =~ /^Sequences$/i) { 1248 my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix); 1249 ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); 1250 1251 if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) { 1252 $IDPrefix = $FileName; 1253 } 1254 elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) { 1255 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : ''; 1256 } 1257 else { 1258 $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName; 1259 } 1260 1261 for $ChainLabel (@ChainLabels) { 1262 push @ChainSequenceIDsPrefix, $IDPrefix; 1263 push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB"; 1264 } 1265 } 1266 1267 $PDBFilesInfo{FileOkay}[$Index] = 1; 1268 $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; 1269 1270 push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames; 1271 push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels; 1272 push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix; 1273 push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs; 1274 push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains; 1275 push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin; 1276 } 1277 } 1278 1279 1280 # Setup script usage and retrieve command line arguments specified using various options... 1281 sub SetupScriptUsage { 1282 1283 # Retrieve all the options... 1284 %Options = (); 1285 $Options{chains} = 'First'; 1286 $Options{combinechains} = 'no'; 1287 $Options{distance} = 10.0; 1288 $Options{distancemode} = 'XYZ'; 1289 $Options{distanceselectionmode} = 'ByAtom'; 1290 $Options{keepoldrecords} = 'no'; 1291 $Options{mode} = 'NonWater'; 1292 $Options{modifyheader} = 'yes'; 1293 $Options{nonstandardkeep} = 'yes'; 1294 $Options{nonstandardcode} = 'X'; 1295 $Options{sequencelength} = 80; 1296 $Options{sequenceidprefix} = 'Automatic'; 1297 $Options{sequencerecords} = 'Atom'; 1298 $Options{waterresiduenames} = 'Automatic'; 1299 1300 if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) { 1301 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1302 } 1303 if ($Options{workingdir}) { 1304 if (! -d $Options{workingdir}) { 1305 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1306 } 1307 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1308 } 1309 if ($Options{combinechains} !~ /^(yes|no)$/i) { 1310 die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n"; 1311 } 1312 if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) { 1313 die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n"; 1314 } 1315 if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) { 1316 die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n"; 1317 } 1318 if ($Options{keepoldrecords} !~ /^(yes|no)$/i) { 1319 die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n"; 1320 } 1321 if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) { 1322 die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n"; 1323 } 1324 if ($Options{modifyheader} !~ /^(yes|no)$/i) { 1325 die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n"; 1326 } 1327 if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) { 1328 die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n"; 1329 } 1330 if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) { 1331 die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n"; 1332 } 1333 if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) { 1334 die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n"; 1335 } 1336 if (!IsPositiveInteger($Options{sequencelength})) { 1337 die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n"; 1338 } 1339 if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) { 1340 die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n"; 1341 } 1342 if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) { 1343 die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n"; 1344 } 1345 } 1346