Mercurial > repos > deepakjadmin > mayatool3_test2
diff bin/ExtractFromPDBFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ExtractFromPDBFiles.pl Wed Jan 20 09:23:18 2016 -0500 @@ -0,0 +1,1718 @@ +#!/usr/bin/perl -w +# +# $RCSfile: ExtractFromPDBFiles.pl,v $ +# $Date: 2015/02/28 20:46:19 $ +# $Revision: 1.39 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use Benchmark; +use FileUtil; +use TextUtil; +use PDBFileUtil; +use AminoAcids; +use SequenceFileUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename($0); +print "\n$ScriptName: Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@PDBFilesList); +@PDBFilesList = ExpandFileNames(\@ARGV, "pdb"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +# Setup information about input files... +print "Checking input PDB file(s)...\n"; +my(%PDBFilesInfo); +RetrievePDBFilesInfo(); + +# Process input files.. +my($FileIndex); +if (@PDBFilesList > 1) { + print "\nProcessing PDB files...\n"; +} +for $FileIndex (0 .. $#PDBFilesList) { + if ($PDBFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $PDBFilesList[$FileIndex]...\n"; + ExtractFromPDBFiles($FileIndex); + } +} +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# Extract appropriate information... +sub ExtractFromPDBFiles { + my($FileIndex) = @_; + my($PDBFile, $PDBRecordLinesRef); + + # Get PDB data... + $PDBFile = $PDBFilesList[$FileIndex]; + $PDBRecordLinesRef = ReadPDBFile($PDBFile); + + if ($OptionsInfo{Mode} =~ /Chains/i) { + ExtractChains($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /Sequences/i) { + ExtractSequences($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i) { + ExtractByAtoms($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { + ExtractByResidues($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /Distance/i) { + ExtractByDistance($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /NonWater/i) { + ExtractNonWaterRecords($FileIndex, $PDBRecordLinesRef); + } + elsif ($OptionsInfo{Mode} =~ /NonHydrogens/i) { + ExtractNonHydrogenRecords($FileIndex, $PDBRecordLinesRef); + } +} + +# Extract chains and generate new PDB files... +# +sub ExtractChains { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($ChainIndex, $ChainID, $ChainLabel, $PDBFileName, $RecordLine, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $AlternateLocation, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap); + + # Get chains and residues data... + $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1); + + if ($OptionsInfo{CombineChains}) { + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + } + + for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { + $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; + $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; + + if (!$OptionsInfo{CombineChains}) { + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; + print "Generating PDBFileName file $PDBFileName...\n"; + + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + } + + # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers + # for writing out appropriate CONECT records... + %ChainAtomNumbersMap = (); + for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) { + print OUTFILE "$RecordLine\n"; + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine); + $AtomNumber = int $AtomNumber; + $ChainAtomNumbersMap{$AtomNumber} = $AtomName; + } + # Write out TER record using information from last chain record... + $AtomNumber += 1; + print OUTFILE GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n"; + + # Write out CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap); + + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + + if (!$OptionsInfo{CombineChains}) { + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; + } + } + + if ($OptionsInfo{CombineChains}) { + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; + } + +} + +# Extract sequences for individual chains or combine all the chains... +sub ExtractSequences { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($ChainIndex, $ChainID, $ChainLabel, $SequenceFileName, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $WrappedChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, $ChainResiduesRef, %ChainSequencesDataMap); + + if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { + $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); + } + else { + $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); + } + + # Generate sequence data for all the chains... + %ChainSequencesDataMap = (); + @{$ChainSequencesDataMap{IDs}} = (); + %{$ChainSequencesDataMap{Sequence}} = (); + %{$ChainSequencesDataMap{Description}} = (); + + for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { + $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; + $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex]; + + # Setup sequence ID... + $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex]; + push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID; + $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID; + + # Collect sequence data for the chain... + if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes/i) { + $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; + } + else { + $ChainResiduesRef = \@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}; + } + # Setup sequence data... + $ChainSequence = ''; + RESIDUE: for $Residue (@{$ChainResiduesRef}) { + ($ResidueCode, $StandardResidue) = GetResidueCode($Residue); + if (!$StandardResidue) { + if ($OptionsInfo{KeepNonStandardSequences}) { + $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; + warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n"; + } + else { + warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n"; + next RESIDUE; + } + } + $ChainSequence .= $ResidueCode; + } + $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence; + + } + + # Write out the sequence files... + my($SequenceID, $SequenceDescription, $Sequence, %SequencesDataMap ); + if ($OptionsInfo{CombineChainSequences}) { + # Combine all the chain sequences... + $Sequence = ''; + for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { + $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; + + $Sequence .= $ChainSequencesDataMap{Sequence}{$ChainID}; + } + $SequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";; + $SequenceDescription = $SequenceID; + $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + + print "Generating sequence file $SequenceFileName...\n"; + %SequencesDataMap = (); + @{$SequencesDataMap{IDs}} = (); + %{$SequencesDataMap{Sequence}} = (); + %{$SequencesDataMap{Description}} = (); + + push @{$SequencesDataMap{IDs}}, $SequenceID; + $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; + $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; + + WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); + } + else { + # For each specifed chain, write out the sequences... + for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) { + $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex]; + + $SequenceFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex]; + + $SequenceID = $ChainSequencesDataMap{IDs}[$ChainIndex]; + $SequenceDescription = $ChainSequencesDataMap{Description}{$ChainID}; + $Sequence = $ChainSequencesDataMap{Sequence}{$ChainID}; + + print "Generating sequence file $SequenceFileName...\n"; + %SequencesDataMap = (); + @{$SequencesDataMap{IDs}} = (); + %{$SequencesDataMap{Sequence}} = (); + %{$SequencesDataMap{Description}} = (); + + push @{$SequencesDataMap{IDs}}, $SequenceID; + $SequencesDataMap{Description}{$SequenceID} = $SequenceDescription; + $SequencesDataMap{Sequence}{$SequenceID} = $Sequence; + + WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength}); + } + } +} + +# Extract atoms... +sub ExtractByAtoms { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap); + + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + + # Write out all ATOM records along with TER and model records to indicate + # chains and multiple models.. + %AtomNumbersMap = (); + $ChainRecordCount = 0; + for $RecordLine (@{$PDBRecordLinesRef}) { + if (CheckRecordType($RecordLine)) { + ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine); + + # Check atoms... + $IgnoreRecord = 1; + if ($OptionsInfo{Mode} =~ /^Atoms$/i) { + $IgnoreRecord = 0; + } + elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) { + if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) { + $IgnoreRecord = 0; + } + } + elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) { + if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) { + $IgnoreRecord = 0; + } + } + elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { + if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) { + $IgnoreRecord = 0; + } + } + + if (!$IgnoreRecord) { + $ChainRecordCount++; + print OUTFILE "$RecordLine\n"; + + $AtomNumber = int $AtomNumber; + $AtomNumbersMap{$AtomNumber} = $AtomName; + } + } + elsif (IsTerRecordType($RecordLine)) { + if ($ChainRecordCount) { + print OUTFILE GenerateTerRecordLine(), "\n"; + } + $ChainRecordCount = 0; + } + elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { + print OUTFILE "$RecordLine\n"; + } + } + + # Write out appropriate CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; +} + +# Extract residues... +sub ExtractByResidues { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap); + + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + + # Write out all ATOM records for specified residues with TER and model records to indicate + # chains and multiple models... + %AtomNumbersMap = (); + $ChainRecordCount = 0; + for $RecordLine (@{$PDBRecordLinesRef}) { + if (CheckRecordType($RecordLine)) { + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber) = ParseAtomRecordLine($RecordLine); + + # Check residues... + $IgnoreRecord = 1; + if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) { + if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) { + $IgnoreRecord = 0; + } + } + elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { + if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) { + $IgnoreRecord = 0; + } + } + elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { + if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) { + $IgnoreRecord = 0; + } + } + if (!$IgnoreRecord) { + $ChainRecordCount++; + print OUTFILE "$RecordLine\n"; + $AtomNumber = int $AtomNumber; + $AtomNumbersMap{$AtomNumber} = $AtomName; + } + } + elsif (IsTerRecordType($RecordLine)) { + if ($ChainRecordCount) { + print OUTFILE GenerateTerRecordLine(), "\n"; + } + $ChainRecordCount = 0; + } + elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { + print OUTFILE "$RecordLine\n"; + } + } + + # Write out appropriate CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; +} + +# Extract non water records... +sub ExtractNonWaterRecords { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap); + + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + + # Write out all ATOM/HETATM non water records along with TER and model records to indicate + # chains and multiple models.. + %AtomNumbersMap = (); + $ChainRecordCount = 0; + for $RecordLine (@{$PDBRecordLinesRef}) { + if (CheckRecordType($RecordLine)) { + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName) = ParseAtomRecordLine($RecordLine); + if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) { + $ChainRecordCount++; + print OUTFILE "$RecordLine\n"; + $AtomNumber = int $AtomNumber; + $AtomNumbersMap{$AtomNumber} = $AtomName; + } + } + elsif (IsTerRecordType($RecordLine)) { + if ($ChainRecordCount) { + print OUTFILE GenerateTerRecordLine(), "\n"; + } + $ChainRecordCount = 0; + } + elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { + print OUTFILE "$RecordLine\n"; + } + } + + # Write out appropriate CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; +} + +# Extract non hydrogen records... +sub ExtractNonHydrogenRecords { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($PDBFileName, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge, $ConectRecordLinesRef, %AtomNumbersMap); + + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + + # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate + # chains and multiple models.. + %AtomNumbersMap = (); + $ChainRecordCount = 0; + for $RecordLine (@{$PDBRecordLinesRef}) { + if (CheckRecordType($RecordLine)) { + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $Occupancy, $TemperatureFactor, $SegmentID, $ElementSymbol, $AtomCharge) = ParseAtomRecordLine($RecordLine); + if ($ElementSymbol !~ /^H$/i) { + $ChainRecordCount++; + print OUTFILE "$RecordLine\n"; + $AtomNumber = int $AtomNumber; + $AtomNumbersMap{$AtomNumber} = $AtomName; + } + } + elsif (IsTerRecordType($RecordLine)) { + if ($ChainRecordCount) { + print OUTFILE GenerateTerRecordLine(), "\n"; + } + $ChainRecordCount = 0; + } + elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { + print OUTFILE "$RecordLine\n"; + } + } + + # Write out appropriate CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; +} + +# Extract ATOM/HETATM records by distance... +sub ExtractByDistance { + my($FileIndex, $PDBRecordLinesRef) = @_; + my($PDBFileName, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap); + + $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0]; + print "Generating PDBFileName file $PDBFileName...\n"; + open OUTFILE, ">$PDBFileName" or die "Error: Can't open $PDBFileName: $! \n"; + + # Write out header and other older recors... + WriteHeaderAndOlderRecords(\*OUTFILE, $PDBRecordLinesRef); + + # Setup coordinates of origin to calculate distance... + @OriginCoords = (); + push @OriginCoords, @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]}; + + # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate + # chains and multiple models... + %AtomNumbersMap = (); + + %ResiduesDataMap = (); + %{$ResiduesDataMap{ID}} = (); + %{$ResiduesDataMap{Status}} = (); + + $ChainRecordCount = 0; + $RecordLineNum = 0; + + for $RecordLine (@{$PDBRecordLinesRef}) { + $RecordLineNum++; + if (CheckRecordType($RecordLine)) { + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); + @Coords = (); push @Coords, ($X, $Y, $Z); + + $IgnoreRecord = 1; + if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) { + $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; + if (exists $ResiduesDataMap{ID}{$ResidueID}) { + # Residue data has been processed; check its selection status... + if ($ResiduesDataMap{Status}{$ResidueID}) { + $IgnoreRecord = 0; + } + } + else { + # Residue hasn't been processed... + $ResiduesDataMap{ID}{$ResidueID} = $ResidueID; + $ResiduesDataMap{Status}{$ResidueID} = 0; + if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) { + $IgnoreRecord = 0; + $ResiduesDataMap{Status}{$ResidueID} = 1; + } + } + } + elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) { + if (CheckDistance(\@Coords, \@OriginCoords)) { + $IgnoreRecord = 0; + } + } + + if (!$IgnoreRecord) { + $ChainRecordCount++; + print OUTFILE "$RecordLine\n"; + $AtomNumber = int $AtomNumber; + $AtomNumbersMap{$AtomNumber} = $AtomName; + } + } + elsif (IsTerRecordType($RecordLine)) { + if ($ChainRecordCount) { + print OUTFILE GenerateTerRecordLine(), "\n"; + } + $ChainRecordCount = 0; + } + elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) { + print OUTFILE "$RecordLine\n"; + } + } + + # Write out appropriate CONECT records... + $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap); + for $RecordLine (@{$ConectRecordLinesRef}) { + print OUTFILE "$RecordLine\n"; + } + + # Write out END record... + print OUTFILE GenerateEndRecordLine(), "\n"; + + close OUTFILE; +} + +# Does record type correspond to the specified record type? +sub CheckRecordType { + my($RecordLine) = @_; + my($Status); + + $Status = 0; + if ($OptionsInfo{RecordMode} =~ /^Atom$/i) { + $Status = IsAtomRecordType($RecordLine) ? 1 : 0; + } + elsif ($OptionsInfo{RecordMode} =~ /^Hetatm$/i) { + $Status = IsHetatmRecordType($RecordLine) ? 1 : 0; + } + elsif ($OptionsInfo{RecordMode} =~ /^AtomAndHetatm$/i) { + $Status = (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0; + } + + return $Status; +} + +# Does record meets distance citerion specified by the user? +sub CheckResidueDistance { + my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_; + my($Status, $RecordLine, $RecordLineIndex, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $ResidueID, @Coords); + + $Status = 0; + + RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) { + $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; + if (!CheckRecordType($RecordLine)) { + next RECORDLINE; + } + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); + $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}"; + + if ($ResidueID !~ /^$SpecifiedResidueID$/i) { + # It's a new residue line... + last RECORDLINE; + } + + # Check distance... + @Coords = (); push @Coords, ($X, $Y, $Z); + if (CheckDistance(\@Coords, $OriginCoordsRef)) { + # Distance criterion is met for at least one record in the residue... + $Status = 1; + last RECORDLINE; + } + } + return $Status; +} + +# Does record meets distance citerion specified by the user? +sub CheckDistance { + my($CoordsRef, $OriginCoordsRef) = @_; + my($Status, $Index, $Distance, $DistanceSquare); + + $Status = 0; + + if ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { + # Go over coordinates of all the atoms in the residue... + my($ResidueCoordsCount) = scalar @{$OriginCoordsRef}; + INDEX: for ($Index = 0; $Index < $ResidueCoordsCount; $Index += 3) { + $DistanceSquare = ($CoordsRef->[0] - $OriginCoordsRef->[$Index])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Index + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Index + 2])**2; + $Distance = sqrt $DistanceSquare; + if ($Distance <= $OptionsInfo{MaxExtractionDistance}) { + $Status = 1; + last INDEX; + } + } + } + else { + $DistanceSquare = 0; + for $Index (0 .. 2) { + $DistanceSquare += ($CoordsRef->[$Index] - $OriginCoordsRef->[$Index])**2; + } + $Distance = sqrt $DistanceSquare; + $Status = ($Distance <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0; + } + + return $Status; +} + +# Write out modifed header and other older records... +sub WriteHeaderAndOlderRecords { + my($OutFileRef, $PDBRecordLinesRef) = @_; + + if ($OptionsInfo{ModifyHeaderRecord}) { + # Write out modified HEADER record... + my($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); + $Classification = 'Data extracted using MayaChemTools'; + print $OutFileRef GenerateHeaderRecordLine($IDCode, $Classification), "\n"; + } + else { + print $OutFileRef $PDBRecordLinesRef->[0], "\n"; + } + + # Write out any old records... + if ($OptionsInfo{KeepOldRecords}) { + my($RecordLineIndex, $RecordLine); + # Skip HEADER record and write out older records all the way upto first MODEL/ATOM/HETATM records from input file... + RECORDLINE: for $RecordLineIndex (1 .. $#{$PDBRecordLinesRef}) { + $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex]; + if (IsModelRecordType($RecordLine) || IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) { + last RECORDLINE; + } + print $OutFileRef "$RecordLine\n"; + } + } +} + +# Get header record information assuming it's the first record... +sub GetHeaderRecordInformation { + my($PDBRecordLinesRef) = @_; + my($Classification, $DepositionDate, $IDCode, $HeaderRecordLine); + + ($Classification, $DepositionDate, $IDCode) = ('') x 3; + $HeaderRecordLine = $PDBRecordLinesRef->[0]; + if (IsHeaderRecordType($HeaderRecordLine)) { + ($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($HeaderRecordLine); + } + return ($Classification, $DepositionDate, $IDCode); +} + +# Get one letter residue code... +sub GetResidueCode { + my($ResidueName) = @_; + my($ResidueCode, $StandardResidue); + + $ResidueCode = $OptionsInfo{NonStandardSequenceCode}; + $StandardResidue = 0; + + if (length($ResidueName) == 3) { + # Assume it's an amino acid... + if (AminoAcids::IsAminoAcid($ResidueName)) { + # Standard amino acid... + $ResidueCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName); + $StandardResidue = 1; + } + } + elsif (length($ResidueName) == 1) { + # Assume it's a nucleic acid... + if ($ResidueName =~ /^(A|G|T|U|C)$/i) { + $ResidueCode = $ResidueName; + $StandardResidue = 1; + } + } + + return ($ResidueCode, $StandardResidue); +} + +# Process option values... +sub ProcessOptions { + %OptionsInfo = (); + $OptionsInfo{Mode} = $Options{mode}; + + my(@SpecifiedChains) = (); + if ($Options{chains} =~ /^(First|All)$/i) { + $OptionsInfo{ChainsToExtract} = $Options{chains}; + } + else { + @SpecifiedChains = split /\,/, $Options{chains}; + $OptionsInfo{ChainsToExtract} = 'Specified'; + } + @{$OptionsInfo{SpecifiedChains}} = (); + push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains; + + $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0; + + ProcessResiduesOptions(); + ProcessAtomsOptions(); + ProcessDistanceOptions(); + + $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames}; + @{$OptionsInfo{SpecifiedWaterResiduesList}} = (); + %{$OptionsInfo{SpecifiedWaterResiduesMap}} = (); + + my(@SpecifiedWaterResiduesList); + @SpecifiedWaterResiduesList = (); + + if ($OptionsInfo{Mode} =~ /^NonWater$/i) { + my($WaterResidueName); + if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) { + push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O'); + } + else { + @SpecifiedWaterResiduesList = split /\,/, $Options{waterresiduenames}; + } + for $WaterResidueName (@SpecifiedWaterResiduesList) { + $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName; + } + } + push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList; + + $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm"); + + $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0; + $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode}; + $OptionsInfo{MaxSequenceLength} = $Options{sequencelength}; + $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords}; + $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix}; + + $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; + $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; +} + +# Process specified residue options... +sub ProcessResiduesOptions { + my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList); + + @SpecifiedResidueNumsList = (); + ($StartResidueNum, $EndResNum) = (0, 0); + + @SpecifiedResidueNamesList = (); + + if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) { + if (!$Options{residues}) { + die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n"; + } + $OptionsInfo{Residues} = $Options{residues}; + $OptionsInfo{Residues} =~ s/ //g; + + if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) { + @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues}; + } + else { + @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues}; + for $ResidueNum (@SpecifiedResidueNumsList) { + if (!IsPositiveInteger($ResidueNum)) { + die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNumes\" or \"ResiduesRange\"value of \"-m --mode\" option: Residue number must be a positive integer.\n"; + } + } + if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) { + if (@SpecifiedResidueNumsList != 2) { + die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n"; + } + if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) { + die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than end residue number.\n"; + } + ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList; + } + } + } + + @{$OptionsInfo{SpecifiedResidueNumsList}} = (); + push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList; + + $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum; + $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum; + + @{$OptionsInfo{SpecifiedResidueNamesList}} = (); + push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList; + + # Set up a specified residue numbers map... + %{$OptionsInfo{SpecifiedResidueNumsMap}} = (); + for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) { + $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum; + } + + # Set up a specified residue names map... + %{$OptionsInfo{SpecifiedResidueNamesMap}} = (); + for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) { + $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName; + } + +} + +# Process specified atom options... +sub ProcessAtomsOptions { + my($AtomNum, $StartAtomNum, $EndAtomNum, $AtomName, @SpecifiedAtomNumsList, @SpecifiedAtomNamesList); + + @SpecifiedAtomNumsList = (); + ($StartAtomNum, $EndAtomNum) = (0, 0); + + @SpecifiedAtomNamesList = (); + + if ($OptionsInfo{Mode} =~ /^(AtomNums|AtomsRange|AtomNames)$/i) { + if (!$Options{atoms}) { + die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n"; + } + $OptionsInfo{Atoms} = $Options{atoms}; + $OptionsInfo{Atoms} =~ s/ //g; + + if ($OptionsInfo{Mode} =~ /^AtomNames$/i) { + @SpecifiedAtomNamesList = split /\,/, $OptionsInfo{Atoms}; + } + else { + @SpecifiedAtomNumsList = split /\,/, $OptionsInfo{Atoms}; + for $AtomNum (@SpecifiedAtomNumsList) { + if (!IsPositiveInteger($AtomNum)) { + die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n"; + } + } + if ($OptionsInfo{Mode} =~ /^AtomsRange$/i) { + if (@SpecifiedAtomNumsList != 2) { + die "Error: Invalid number of atom number values, ", scalar(@SpecifiedAtomNumsList), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n"; + } + if ($SpecifiedAtomNumsList[0] > $SpecifiedAtomNumsList[1]) { + die "Error: Invalid atom number values, @SpecifiedAtomNumsList, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n"; + } + ($StartAtomNum, $EndAtomNum) = @SpecifiedAtomNumsList; + } + } + } + elsif ($OptionsInfo{Mode} =~ /^CAlphas$/i) { + @SpecifiedAtomNamesList = ("CA"); + } + + @{$OptionsInfo{SpecifiedAtomNumsList}} = (); + push @{$OptionsInfo{SpecifiedAtomNumsList}}, @SpecifiedAtomNumsList; + + $OptionsInfo{SpecifiedStartAtomNum} = $StartAtomNum; + $OptionsInfo{SpecifiedEndAtomNum} = $EndAtomNum; + + @{$OptionsInfo{SpecifiedAtomNamesList}} = (); + push @{$OptionsInfo{SpecifiedAtomNamesList}}, @SpecifiedAtomNamesList; + + # Set up a specified residue numbers map... + %{$OptionsInfo{SpecifiedAtomNumsMap}} = (); + for $AtomNum (@{$OptionsInfo{SpecifiedAtomNumsList}}) { + $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNum} = $AtomNum; + } + + # Set up a specified residue names map... + %{$OptionsInfo{SpecifiedAtomNamesMap}} = (); + for $AtomName (@{$OptionsInfo{SpecifiedAtomNamesList}}) { + $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName} = lc $AtomName; + } + +} + +# Process specified distance options... +sub ProcessDistanceOptions { + my(@SpecifiedDistanceOrigin) = (); + + $OptionsInfo{MaxExtractionDistance} = $Options{distance}; + $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode}; + $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : ''; + $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode}; + + if ($OptionsInfo{Mode} =~ /^Distance$/i) { + if (!$Options{distanceorigin}) { + die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n"; + } + @SpecifiedDistanceOrigin = split /\,/, $Options{distanceorigin}; + if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { + if (@SpecifiedDistanceOrigin != 2) { + die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\" : The number of values must be 2.\n"; + } + if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { + die "Error: Invalid atom number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Atom\" value of \"--distancemode\". Allowed values: > 0\n"; + } + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { + if (@SpecifiedDistanceOrigin != 2) { + die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\" : The number of values must be 2.\n"; + } + if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { + die "Error: Invalid hetatm number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Hetatm\" value of \"--distancemode\". Allowed values: > 0\n"; + } + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { + if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) { + die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n"; + } + if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) { + die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n"; + } + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { + if (@SpecifiedDistanceOrigin != 3) { + die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n"; + } + my($Value); + for $Value (@SpecifiedDistanceOrigin) { + if (!IsNumerical($Value)) { + die "Error: Invalid coordinate value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n"; + } + } + } + } + @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = (); + push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin; + +} + +# Retrieve information about PDB files... +sub RetrievePDBFilesInfo { + my($Index, $PDBFile, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix); + + %PDBFilesInfo = (); + @{$PDBFilesInfo{FileOkay}} = (); + @{$PDBFilesInfo{OutFileRoot}} = (); + @{$PDBFilesInfo{OutFileNames}} = (); + @{$PDBFilesInfo{ChainLabels}} = (); + @{$PDBFilesInfo{ChainSequenceIDs}} = (); + @{$PDBFilesInfo{ChainSequenceIDsPrefix}} = (); + @{$PDBFilesInfo{SpecifiedChains}} = (); + @{$PDBFilesInfo{DistanceOrigin}} = (); + + FILELIST: for $Index (0 .. $#PDBFilesList) { + $PDBFilesInfo{FileOkay}[$Index] = 0; + + $PDBFilesInfo{OutFileRoot}[$Index] = ''; + @{$PDBFilesInfo{OutFileNames}[$Index]} = (); + @{$PDBFilesInfo{OutFileNames}[$Index]} = (); + @{$PDBFilesInfo{ChainLabels}[$Index]} = (); + @{$PDBFilesInfo{ChainSequenceIDs}[$Index]} = (); + @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]} = (); + @{$PDBFilesInfo{SpecifiedChains}[$Index]} = (); + @{$PDBFilesInfo{DistanceOrigin}[$Index]} = (); + + $PDBFile = $PDBFilesList[$Index]; + if (!(-e $PDBFile)) { + warn "Warning: Ignoring file $PDBFile: It doesn't exist\n"; + next FILELIST; + } + if (!CheckFileType($PDBFile, "pdb")) { + warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n"; + next FILELIST; + } + if (! open PDBFILE, "$PDBFile") { + warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n"; + next FILELIST; + } + close PDBFILE; + + # Get PDB data... + $PDBRecordLinesRef = ReadPDBFile($PDBFile); + if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) { + $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes'); + } + else { + $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef); + } + if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) { + warn "Warning: Ignoring file $PDBFile: No chains found \n"; + next FILELIST; + } + + # Make sure specified chains exist in PDB file... + @SpecifiedChains = (); + if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) { + for $ChainID (@{$OptionsInfo{SpecifiedChains}}) { + if (exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) { + push @SpecifiedChains, $ChainID; + } + else { + warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n"; + next FILELIST; + } + } + } + elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) { + push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0]; + } + elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) { + push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}}; + } + # Setup chain labels to use for sequence IDs and generating output files... + @ChainLabels = (); + for $ChainID (@SpecifiedChains) { + $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig; + $ChainLabel = "Chain${ChainLabel}"; + push @ChainLabels, $ChainLabel; + } + + # Make sure specified distance origin is valid... + @DistanceOrigin = (); + if ($OptionsInfo{Mode} =~ /^Distance$/i) { + if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) { + my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); + $RecordType = $OptionsInfo{ExtractionDistanceMode}; + ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; + $RecordFound = 0; + LINE: for $RecordLine (@{$PDBRecordLinesRef}) { + if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { + next LINE; + } + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); + $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName); + if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) { + if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) { + $RecordFound = 1; + last LINE; + } + } + } + if (!$RecordFound) { + warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; + next FILELIST; + } + push @DistanceOrigin, ($X, $Y, $Z); + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { + my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine); + $SpecifiedChainID = ''; + if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) { + ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; + } + else { + ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; + } + $RecordFound = 0; + LINE: for $RecordLine (@{$PDBRecordLinesRef}) { + if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) { + next LINE; + } + ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $ChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine); + $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName); + $ChainID = RemoveLeadingAndTrailingWhiteSpaces($ChainID); + if ($SpecifiedChainID && ($SpecifiedChainID ne $ChainID)) { + next LINE; + } + if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) { + # Store coordinates for all the atoms... + $RecordFound = 1; + push @DistanceOrigin, ($X, $Y, $Z); + next LINE; + } + } + if (!$RecordFound) { + warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n"; + next FILELIST; + } + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { + push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; + } + } + # Setup output file names... + @OutFileNames = (); + $FileDir = ""; $FileName = ""; $FileExt = ""; + ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile); + if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) { + my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); + if ($RootFileName && $RootFileExt) { + $FileName = $RootFileName; + } + else { + $FileName = $OptionsInfo{OutFileRoot}; + } + $OutFileRoot = $FileName; + } + else { + $OutFileRoot = $FileName; + } + $Mode = $OptionsInfo{Mode}; + if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) { + $OutFileName = ''; + if ($Mode =~ /^CAlphas$/i) { + $OutFileName = "${OutFileRoot}CAlphas.pdb"; + } + elsif ($Mode =~ /^Atoms$/i) { + $OutFileName = "${OutFileRoot}Atoms.pdb"; + } + elsif ($Mode =~ /^AtomNums$/i) { + $OutFileName = "${OutFileRoot}AtomNums.pdb"; + } + elsif ($Mode =~ /^AtomsRange$/i) { + $OutFileName = "${OutFileRoot}AtomsRange.pdb"; + } + elsif ($Mode =~ /^AtomNames$/i) { + $OutFileName = "${OutFileRoot}AtomNames.pdb"; + } + elsif ($Mode =~ /^ResidueNums$/i) { + $OutFileName = "${OutFileRoot}ResidueNums.pdb"; + } + elsif ($Mode =~ /^ResiduesRange$/i) { + $OutFileName = "${OutFileRoot}ResiduesRange.pdb"; + } + elsif ($Mode =~ /^ResidueNames$/i) { + $OutFileName = "${OutFileRoot}ResidueNames.pdb"; + } + elsif ($Mode =~ /^NonWater$/i) { + $OutFileName = "${OutFileRoot}NonWater.pdb"; + } + elsif ($Mode =~ /^NonHydrogens$/i) { + $OutFileName = "${OutFileRoot}NonHydrogens.pdb"; + } + elsif ($Mode =~ /^Distance$/i) { + my($DistanceMode) = ''; + if ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) { + $DistanceMode = 'Atom'; + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Hetatm$/i) { + $DistanceMode = 'Hetatm'; + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) { + $DistanceMode = 'Residue'; + } + elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) { + $DistanceMode = 'XYZ'; + } + $OutFileName = "${OutFileRoot}DistanceBy${DistanceMode}.pdb"; + } + push @OutFileNames, $OutFileName; + if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { + warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; + next FILELIST; + } + } + elsif ($Mode =~ /^(Chains|Sequences)$/i) { + if ($OptionsInfo{CombineChainSequences}) { + $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta"; + push @OutFileNames, $OutFileName; + if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { + warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; + next FILELIST; + } + } + else { + for $ChainLabel (@ChainLabels) { + $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta"; + push @OutFileNames, $OutFileName; + if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) { + warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n"; + next FILELIST; + } + } + } + } + @ChainSequenceIDs = (); + @ChainSequenceIDsPrefix = (); + if ($Mode =~ /^Sequences$/i) { + my($HeaderRecordLine, $Classification, $DepositionDate, $IDCode, $IDPrefix); + ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef); + + if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) { + $IDPrefix = $FileName; + } + elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) { + $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : ''; + } + else { + $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName; + } + + for $ChainLabel (@ChainLabels) { + push @ChainSequenceIDsPrefix, $IDPrefix; + push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB"; + } + } + + $PDBFilesInfo{FileOkay}[$Index] = 1; + $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; + + push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames; + push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels; + push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix; + push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs; + push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains; + push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin; + } +} + + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Retrieve all the options... + %Options = (); + $Options{chains} = 'First'; + $Options{combinechains} = 'no'; + $Options{distance} = 10.0; + $Options{distancemode} = 'XYZ'; + $Options{distanceselectionmode} = 'ByAtom'; + $Options{keepoldrecords} = 'no'; + $Options{mode} = 'NonWater'; + $Options{modifyheader} = 'yes'; + $Options{nonstandardkeep} = 'yes'; + $Options{nonstandardcode} = 'X'; + $Options{sequencelength} = 80; + $Options{sequenceidprefix} = 'Automatic'; + $Options{sequencerecords} = 'Atom'; + $Options{waterresiduenames} = 'Automatic'; + + if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if ($Options{combinechains} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) { + die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n"; + } + if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) { + die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n"; + } + if ($Options{keepoldrecords} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) { + die "Error: The value specified, $Options{mode}, for option \"m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n"; + } + if ($Options{modifyheader} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) { + die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n"; + } + if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) { + die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n"; + } + if (!IsPositiveInteger($Options{sequencelength})) { + die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n"; + } + if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) { + die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n"; + } + if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) { + die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or AutomaticAtom\n"; + } +} + +__END__ + +=head1 NAME + +ExtractFromPDBFiles.pl - Extract specific data from PDBFile(s) + +=head1 SYNOPSIS + +ExtractFromPDBFiles.pl PDBFile(s)... + +ExtractFromPDBFiles.pl [B<-a, --Atoms> "AtomNum, [AtomNum...]" | "StartAtomNum, EndAtomNum" | +"AtomName, [AtomName...]"] [B<-c, --chains> First | All | "ChainID, [ChainID,...]"] +[<--CombineChains> yes | no] [B<-d, --distance> number] [B<--DistanceMode> Atom | Hetatm | Residue | XYZ] +[B<--DistanceOrigin> "AtomNumber, AtomName" | "HetatmNumber, HetAtmName" | "ResidueNumber, ResidueName, [ChainID]" | "X,Y,Z">] +[<--DistanceSelectionMode> ByAtom | ByResidue] [B<-h, --help>] [B<-k, --KeepOldRecords> yes | no] +[B<-m, --mode > Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | +ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens] +[B<--ModifyHeader> yes | no] [B<--NonStandardKeep> yes | no] [B<--NonStandardCode> character] +[B<-o, --overwrite>] [B<-r, --root> rootname] B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>] +[B<--Residues> "ResidueNum,[ResidueNum...]" | StartResidueNum,EndResiduNum ] +[B<--SequenceLength> number] [B<--SequenceRecords> Atom | SeqRes] +[B<--SequenceIDPrefix> FileName | HeaderRecord | Automatic] +[B<--WaterResidueNames> Automatic | "ResidueName, [ResidueName,...]"] +[B<-w, --WorkingDir> dirname] PDBFile(s)... + +=head1 DESCRIPTION + +Extract specific data from I<PDBFile(s)> and generate appropriate PDB or sequence file(s). +Multiple PDBFile names are separated by spaces. The valid file extension is I<.pdb>. +All other file name extensions are ignored during the wild card expansion. All the PDB files +in a current directory can be specified either by I<*.pdb> or the current directory name. + +During I<Chains> and I<Sequences> values of B<-m, --mode> option, all ATOM/HETAM records +for chains after the first model in PDB fils containing data for multiple models are ignored. + +=head1 OPTIONS + +=over 4 + +=item B<-a, --Atoms> I<"AtomNum,[AtomNum...]" | "StartAtomNum,EndAtomNum" | "AtomName,[AtomName...]"> + +Specify which atom records to extract from I<PDBFiles(s)> during I<AtomNums>, +I<AtomsRange>, and I<AtomNames> value of B<-m, --mode> option: extract records +corresponding to atom numbers specified in a comma delimited list of atom numbers/names, +or with in the range of start and end atom numbers. Possible values: I<"AtomNum[,AtomNum,..]">, +I<StartAtomNum,EndAtomNum>, or I<"AtomName[,AtomName,..]">. Default: I<None>. Examples: + + 10 + 15,20 + N,CA,C,O + +=item B<-c, --chains> I<First | All | ChainID,[ChainID,...]> + +Specify which chains to extract from I<PDBFile(s)> during I<Chains | Sequences> value of +B<-m, --mode> option: first chain, all chains, or a specific list of comma delimited chain IDs. +Possible values: I<First | All | ChainID,[ChainID,...]>. Default: I<First>. Examples: + + A + A,B + All + +=item B<--CombineChains> I<yes | no> + +Specify whether to combine extracted chains data into a single file during I<Chains> or +I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. Default: I<no>. + +During I<Chains> value of <-m, --mode> option with I<Yes> value of <--CombineChains>, +extracted data for specified chains is written into a single file instead of individual file for each +chain. + +During I<Sequences> value of <-m, --mode> option with I<Yes> value of <--CombineChains>, +residues sequences for specified chains are extracted and concatenated into a single sequence +file instead of individual file for each chain. + +=item B<-d, --distance> I<number> + +Specify distance used to extract ATOM/HETATM recods during I<Distance> value of +B<-m, --mode> option. Default: I<10.0> angstroms. + +B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: +ATOM, HETATM or both. + +=item B<--DistanceMode> I<Atom | Hetatm | Residue | XYZ> + +Specify how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of +B<-m, --mode> option: extract all the records within a certain distance specifed by B<-d, --distance> +from an atom or hetro atom record, a residue, or any artbitrary point. Possible values: I<Atom | +Hetatm | Residue | XYZ>. Default: I<XYZ>. + +During I<Residue> value of B<--distancemode>, distance of ATOM/HETATM records is calculated from +all the atoms in the residue and the records are selected as long as any atom of the residue lies with +in the distace specified using B<-d, --distance> option. + +B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: +ATOM, HETATM or both. + +=item B<--DistanceSelectionMode> I<ByAtom | ByResidue> + +Specify how how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of +B<-m, --mode> option for all values of B<--DistanceMode> option: extract only those ATOM/HETATM +records that meet specified distance criterion; extract all records corresponding to a residue as +long as one of the ATOM/HETATM record in the residue satisfies specified distance criterion. Possible +values: I<ByAtom, ByResidue>. Default value: I<ByAtom>. + +=item B<--DistanceOrigin> I<"AtomNumber,AtomName" | "HetatmNumber,HetAtmName" | "ResidueNumber,ResidueName[,ChainID]" | "X,Y,Z"> + +This value is B<--distancemode> specific. In general, it identifies a point used to select +other ATOM/HETATMS with in a specific distance from this point. + +For I<Atom> value of B<--distancemode>, this option corresponds to an atom specification. +Format: I<AtomNumber,AtomName>. Example: + + 455,CA + +For I<Hetatm> value of B<--distancemode>, this option corresponds to a hetatm specification. +Format: I<HetatmNumber,HetAtmName>. Example: + + 5295,C1 + +For I<Residue> value of B<--distancemode>, this option corresponds to a residue specification. +Format: I<ResidueNumber, ResidueName[,ChainID]>. Example: + + 78,MSE + 977,RET,A + 978,RET,B + +For I<XYZ> value of B<--distancemode>, this option corresponds to a coordinate of an +arbitrary point. Format: I<X,Y,X>. Example: + + 10.044,19.261,-4.292 + +B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: +ATOM, HETATM or both. + +=item B<-h, --help> + +Print this help message. + +=item B<-k, --KeepOldRecords> I<yes | no> + +Specify whether to transfer old non ATOM and HETATM records from input PDBFile(s) to new +PDBFile(s) during I<Chains | Atoms | HetAtms | CAlphas | Distance| NonWater | NonHydrogens> +value of B<-m --mode> option. By default, except for the HEADER record, all +other unnecessary non ATOM/HETATM records are dropped during the +generation of new PDB files. Possible values: I<yes | no>. Default: I<no>. + +=item B<-m, --mode > I<Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens> + +Specify what to extract from I<PDBFile(s)>: I<Chains> - retrieve records for +specified chains; I<Sequences> - generate sequence files for specific chains; +I<Atoms> - extract atom records; I<CAlphas> - extract atom records for alpha +carbon atoms; I<AtomNums> - extract atom records for specified atom numbers; +I<AtomsRange> - extract atom records between specified atom number range; +I<AtomNames> - extract atom records for specified atom names; I<ResidueNums> +- extract records for specified residue numbers; I<ResiduesRange> - extract records +for residues between specified residue number range; I<ResidueNames> - extract +records for specified residue names; I<Distance> - extract records with in a +certain distance from a specific position; I<NonWater> - extract records corresponding +to residues other than water; I<NonHydrogens> - extract non-hydrogen records. + +Possible values: I<Chains, Sequences Atoms, CAlphas, AtomNums, AtomsRange, +AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, +NonHydrogens>. Default value: I<NonWater> + +During the generation of new PDB files, unnecessay CONECT records are dropped. + +For I<Chains> mode, data for appropriate chains specified by B<--c --chains> option +is extracted from I<PDBFile(s)> and placed into new PDB file(s). + +For I<Sequences> mode, residues names using various sequence related options are +extracted for chains specified by B<--c --chains> option from I<PDBFile(s)> and +FASTA sequence file(s) are generated. + +For I<Distance> mode, all ATOM/HETATM records with in a distance specified +by B<-d --distance> option from a specific atom, residue or a point indicated by +B<--distancemode> are extracted and placed into new PDB file(s). + +For I<NonWater> mode, non water ATOM/HETATM record lines, identified using value of +B<--WaterResidueNames>, are extracted and written to new PDB file(s). + +For I<NonHydrogens> mode, ATOM/HETATOM record lines containing element symbol +other than I<H> are extracted and written to new PDB file(s). + +For all other options, appropriate ATOM/HETATM records are extracted to generate new +PDB file(s). + +B<--RecordMode> option controls type of record lines to extract and process from +I<PDBFile(s)>: ATOM, HETATM or both. + +=item B<--ModifyHeader> I<yes | no> + +Specify whether to modify HEADER record during the generation of new PDB files +for B<-m, --mode> values of I<Chains | Atoms | CAlphas | Distance>. Possible values: +I<yes | no>. Default: I<yes>. By default, Classification data is replaced by I<Data extracted +using MayaChemTools> before writing out HEADER record. + +=item B<--NonStandardKeep> I<yes | no> + +Specify whether to include and convert non-standard three letter residue codes into +a code specified using B<--nonstandardcode> option and include them into sequence file(s) +generated during I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. +Default: I<yes>. + +A warning is also printed about the presence of non-standard residues. Any residue other +than standard 20 amino acids and 5 nucleic acid is considered non-standard; additionally, +HETATM residues in chains also tagged as non-standard. + +=item B<--NonStandardCode> I<character> + +A single character code to use for non-standard residues. Default: I<X>. Possible values: +I<?, -, or X>. + +=item B<-o, --overwrite> + +Overwrite existing files. + +=item B<-r, --root> I<rootname> + +New PDB and sequence file name is generated using the root: <Root><Mode>.<Ext>. +Default new file name: <PDBFileName>Chain<ChainID>.pdb for I<Chains> B<mode>; +<PDBFileName>SequenceChain<ChainID>.fasta for I<Sequences> B<mode>; +<PDBFileName>DistanceBy<DistanceMode>.pdb for I<Distance> B<-m, --mode> +<PDBFileName><Mode>.pdb for I<Atoms | CAlphas | NonWater | NonHydrogens> B<-m, --mode> +values. This option is ignored for multiple input files. + +=item B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm> + +Specify type of record lines to extract and process from I<PDBFile(s)> during various +values of B<-m, --mode> option: extract only ATOM record lines; extract only HETATM +record lines; extract both ATOM and HETATM lines. Possible values: I<Atom | Hetatm +| AtomAndHetatm | XYZ>. Default during I<Atoms, CAlphas, AtomNums, AtomsRange, +AtomNames> values of B<-m, --mode> option: I<Atom>; otherwise: I<AtomAndHetatm>. + +This option is ignored during I<Chains, Sequences> values of B<-m, --mode> option. + +=item B<--Residues> I<"ResidueNum,[ResidueNum...]" | "StartResidueNum,EndResiduNum" | "ResidueName,[ResidueName...]"> + +Specify which resiude records to extract from I<PDBFiles(s)> during I<ResidueNums>, +I<ResiduesRange>,and I<ResidueNames> value of B<-m, --mode> option: extract records +corresponding to residue numbers specified in a comma delimited list of residue numbers/names, +or with in the range of start and end residue numbers. Possible values: I<"ResidueNum[,ResidueNum,..]">, +I<StartResidueNum,EndResiduNum>, or I<<"ResidueName[,ResidueName,..]">. Default: I<None>. Examples: + + 20 + 5,10 + TYR,SER,THR + +B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: +ATOM, HETATM or both. + +=item B<--SequenceLength> I<number> + +Maximum sequence length per line in sequence file(s). Default: I<80>. + +=item B<--SequenceRecords> I<Atom | SeqRes> + +Specify which records to use for extracting residue names from I<PDBFiles(s)> during +I<Sequences> value of B<-m, --mode> option: use ATOM records to compile a list +of residues in a chain or parse SEQRES record to get a list of residues. Possible values: +I<Atom | SeqRes>. Default: I<Atom>. + +=item B<--SequenceIDPrefix> I<FileName | HeaderRecord | Automatic> + +Specify how to generate a prefix for sequence IDs during I<Sequences> value +of B<-m, --mode> option: use input file name prefix; retrieve PDB ID from HEADER record; +or automatically decide the method for generating the prefix. The chain IDs are also +appended to the prefix. Possible values: I<FileName | HeaderRecord | Automatic>. +Default: I<Automatic> + +=item B<--WaterResidueNames> I<Automatic | "ResidueName,[ResidueName,...]"> + +Identification of water residues during I<NonWater> value of B<-m, --mode> option. Possible values: +I<Automatic | "ResidueName,[ResidueName,...]">. Default: I<Automatic> - corresponds +to "HOH,WAT,H20". You can also specify a different comma delimited list of residue names +to use for water. + +=item B<-w, --WorkingDir> I<dirname> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To extract non-water records from Sample2.pdb file and generate Sample2NonWater.pdb +file, type: + + % ExtractFromPDBFiles.pl Sample2.pdb + +To extract non-water records corresponding to only ATOM records from Sample2.pdb file +and generate Sample2NonWater.pdb file, type: + + % ExtractFromPDBFiles.pl --RecordMode Atom Sample2.pdb + +To extract non-water records from Sample2.pdb file using HOH or WAT residue name for water along +with all old non-coordinate records and generate Sample2NewNonWater.pdb file, type: + + % ExtractFromPDBFiles.pl -m NonWater --WaterResidueNames "HOH,WAT" + -KeepOldRecords Yes -r Sample2New -o Sample2.pdb + +To extract non-hydrogens records from Sample2.pdb file and generate Sample2NonHydrogen.pdb +file, type: + + % ExtractFromPDBFiles.pl -m NonHydrogens Sample2.pdb + +To extract data for first chain in Sample2.pdb and generate Sample2ChainA.pdb, type +file, type: + + % ExtractFromPDBFiles.pl -m chains -o Sample2.pdb + +To extract data for both chains in Sample2.pdb and generate Sample2ChainA.pdb and +Sample2ChainB.pdb, type: + + % ExtractFromPDBFiles.pl -m chains -c All -o Sample2.pdb + +To extract data for alpha carbons in Sample2.pdb and generate Sample2CAlphas.pdb, type: + + % ExtractFromPDBFiles.pl -m CAlphas -o Sample2.pdb + +To extract records for specific residue numbers in all chains from Sample2.pdb file and generate +Sample2ResidueNums.pdb file, type: + + % ExtractFromPDBFiles.pl -m ResidueNums --Residues "3,6" + Sample2.pdb + +To extract records for a specific range of residue number in all chains from Sample2.pdb +file and generate Sample2ResiduesRange.pdb file, type: + + % ExtractFromPDBFiles.pl -m ResiduesRange --Residues "10,30" + Sample2.pdb + +To extract data for all ATOM and HETATM records with in 10 angstrom of an atom specifed by +atom serial number and name "1,N" in Sample2.pdb file and generate Sample2DistanceByAtom.pdb, +type: + + % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom + --DistanceOrigin "1,N" -k No --distance 10 -o Sample2.pdb + +To extract data for all ATOM and HETATM records for complete residues with any atom or hetatm +less than 10 angstrom of an atom specifed by atom serial number and name "1,N" in Sample2.pdb +file and generate Sample2DistanceByAtom.pdb, type: + + % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom + --DistanceOrigin "1,N" --DistanceSelectionMode ByResidue + -k No --distance 10 -o Sample2.pdb + +To extract data for all ATOM and HETATM records with in 25 angstrom of an arbitrary point "0,0,0" +in Sample2.pdb file and generate Sample2DistanceByXYZ.pdb, type: + + % ExtractFromPDBFiles.pl -m Distance --DistanceMode XYZ + --DistanceOrigin "0,0,0" -k No --distance 25 -o Sample2.pdb + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +InfoPDBFiles.pl, ModifyPDBFiles.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut