Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ExtractFromPDBFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: ExtractFromPDBFiles.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:19 $ | |
| 5 # $Revision: 1.39 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use FileUtil; | |
| 36 use TextUtil; | |
| 37 use PDBFileUtil; | |
| 38 use AminoAcids; | |
| 39 use SequenceFileUtil; | |
| 40 | |
# Main script flow: set up usage/options, expand the input file names, check
# the input files and run the requested extraction on each file that passed
# the check.
#
# The "my" variables declared here are file-scoped on purpose: the
# subroutines below read %OptionsInfo, %PDBFilesInfo and @PDBFilesList
# directly.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of the indirect-object form "new Benchmark",
# which the parser can misread when a sub named "new" is in scope.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@PDBFilesList);
@PDBFilesList = ExpandFileNames(\@ARGV, "pdb");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input PDB file(s)...\n";
my(%PDBFilesInfo);
RetrievePDBFilesInfo();

# Process input files; files flagged as not okay during the check are skipped...
my($FileIndex);
if (@PDBFilesList > 1) {
  print "\nProcessing PDB files...\n";
}
for $FileIndex (0 .. $#PDBFilesList) {
  if ($PDBFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $PDBFilesList[$FileIndex]...\n";
    ExtractFromPDBFiles($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 86 | |
| 87 ############################################################################### | |
| 88 | |
# Read the PDB file at the given index and hand its record lines to the
# extraction routine selected by the mode option...
#
# The mode patterns are tried in the order listed and only the first one that
# matches is used. A mode matching none of them still reads the file but
# extracts nothing.
sub ExtractFromPDBFiles {
  my($FileIndex) = @_;

  # Get PDB data...
  my $RecordLinesRef = ReadPDBFile($PDBFilesList[$FileIndex]);

  # Ordered list of [mode pattern, extraction routine] pairs...
  my @ModeHandlers = (
    [ qr/Chains/i, \&ExtractChains ],
    [ qr/Sequences/i, \&ExtractSequences ],
    [ qr/^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i, \&ExtractByAtoms ],
    [ qr/^(ResidueNums|ResiduesRange|ResidueNames)$/i, \&ExtractByResidues ],
    [ qr/Distance/i, \&ExtractByDistance ],
    [ qr/NonWater/i, \&ExtractNonWaterRecords ],
    [ qr/NonHydrogens/i, \&ExtractNonHydrogenRecords ],
  );

  my $Mode = $OptionsInfo{Mode};
  for my $ModeHandler (@ModeHandlers) {
    my($ModePattern, $ExtractorRef) = @{$ModeHandler};
    if ($Mode =~ $ModePattern) {
      return $ExtractorRef->($FileIndex, $RecordLinesRef);
    }
  }
}
| 120 | |
# Extract chains and generate new PDB files...
#
# For the input file at $FileIndex, writes the ATOM/HETATM lines of every
# specified chain, each chain followed by a TER record and by the CONECT
# records GetConectRecordLines returns for that chain's atom numbers. With
# the CombineChains option on, all chains go into the first output file name;
# otherwise each chain is written to its own output file. Dies when an output
# file can't be opened or closed.
#
sub ExtractChains {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($ChainIndex, $ChainID, $PDBFileName, $OutFH, $RecordLine, $ChainRecordCount, $ChainsAndResiduesInfoRef, $AtomNumber, $AtomName, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode, $ConectRecordLinesRef, %ChainAtomNumbersMap);

  # Get chains and residues data...
  $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'AtomAndHetatm', 0, 1);

  if ($OptionsInfo{CombineChains}) {
    $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
    print "Generating PDBFileName file $PDBFileName...\n";

    # Three-argument open: the file name is never parsed for a mode...
    open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

    # Write out header and other older records...
    WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);
  }

  for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
    $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];

    if (!$OptionsInfo{CombineChains}) {
      $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex];
      print "Generating PDBFileName file $PDBFileName...\n";

      open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

      # Write out header and other older records...
      WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);
    }

    # Write out ATOM/HETATM line for chain and collect all ATOM/HETATM serial numbers
    # for writing out appropriate CONECT records...
    %ChainAtomNumbersMap = ();
    $ChainRecordCount = 0;
    for $RecordLine (@{$ChainsAndResiduesInfoRef->{Lines}{$ChainID}}) {
      print {$OutFH} "$RecordLine\n";
      ($AtomNumber, $AtomName, undef, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode) = ParseAtomRecordLine($RecordLine);
      $AtomNumber = int $AtomNumber;
      $ChainAtomNumbersMap{$AtomNumber} = $AtomName;
      $ChainRecordCount++;
    }

    # Write out TER record using information from last chain record. A chain
    # without any record has no last record, so no TER is written for it
    # rather than one built from undefined or previous-chain values...
    if ($ChainRecordCount) {
      $AtomNumber += 1;
      print {$OutFH} GenerateTerRecordLine($AtomNumber, $ResidueName, $AtomChainID, $ResidueNumber, $InsertionCode), "\n";
    }

    # Write out CONECT records...
    $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%ChainAtomNumbersMap);

    for $RecordLine (@{$ConectRecordLinesRef}) {
      print {$OutFH} "$RecordLine\n";
    }

    if (!$OptionsInfo{CombineChains}) {
      # Write out END record...
      print {$OutFH} GenerateEndRecordLine(), "\n";

      # Buffered write errors only surface at close...
      close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
    }
  }

  if ($OptionsInfo{CombineChains}) {
    # Write out END record...
    print {$OutFH} GenerateEndRecordLine(), "\n";

    close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
  }

}
| 190 | |
# Extract sequences for individual chains or combine all the chains...
#
# Builds a one letter code sequence for every specified chain of the input
# file at $FileIndex, using the residues GetChainsAndResidues returns for the
# configured sequence record source. Nonstandard residues are either kept
# using the nonstandard sequence code or dropped, with a warning in both
# cases. The sequences are written out as FASTA files: one file holding the
# concatenation of all chains when CombineChainSequences is on, otherwise one
# file per chain.
sub ExtractSequences {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($ChainIndex, $ChainID, $ChainLabel, $Residue, $ResidueCode, $StandardResidue, $ChainSequence, $ChainSequenceID, $ChainsAndResiduesInfoRef, %ChainSequencesDataMap);

  if ($OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
    $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
  }
  else {
    $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
  }

  # Write a FASTA file containing exactly one sequence...
  my $WriteSequenceFile = sub {
    my($SequenceFileName, $SequenceID, $SequenceDescription, $Sequence) = @_;
    my(%SequencesDataMap);

    print "Generating sequence file $SequenceFileName...\n";
    %SequencesDataMap = (
      IDs => [$SequenceID],
      Description => {$SequenceID => $SequenceDescription},
      Sequence => {$SequenceID => $Sequence},
    );

    WritePearsonFastaSequenceFile($SequenceFileName, \%SequencesDataMap, $OptionsInfo{MaxSequenceLength});
  };

  # Generate sequence data for all the chains...
  %ChainSequencesDataMap = (IDs => [], Sequence => {}, Description => {});

  for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
    $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
    $ChainLabel = $PDBFilesInfo{ChainLabels}[$FileIndex][$ChainIndex];

    # Setup sequence ID...
    $ChainSequenceID = $PDBFilesInfo{ChainSequenceIDs}[$FileIndex][$ChainIndex];
    push @{$ChainSequencesDataMap{IDs}}, $ChainSequenceID;
    $ChainSequencesDataMap{Description}{$ChainID} = $ChainSequenceID;

    # Collect sequence data for the chain; the residues live under the same
    # key for every sequence record source...
    $ChainSequence = '';
    RESIDUE: for $Residue (@{$ChainsAndResiduesInfoRef->{Residues}{$ChainID}}) {
      ($ResidueCode, $StandardResidue) = GetResidueCode($Residue);
      if (!$StandardResidue) {
        if ($OptionsInfo{KeepNonStandardSequences}) {
          $ResidueCode = $OptionsInfo{NonStandardSequenceCode};
          warn "Warning: Keeping nonstandard residue $Residue in $ChainLabel...\n";
        }
        else {
          warn "Warning: Ignoring nonstandard residue $Residue in $ChainLabel...\n";
          next RESIDUE;
        }
      }
      $ChainSequence .= $ResidueCode;
    }
    $ChainSequencesDataMap{Sequence}{$ChainID} = $ChainSequence;
  }

  # Write out the sequence files...
  if ($OptionsInfo{CombineChainSequences}) {
    # Combine all the chain sequences...
    my($CombinedSequence, $CombinedSequenceID);

    $CombinedSequence = '';
    for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
      $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];
      $CombinedSequence .= $ChainSequencesDataMap{Sequence}{$ChainID};
    }
    $CombinedSequenceID = $PDBFilesInfo{ChainSequenceIDsPrefix}[$FileIndex][0] . "_CombinedChains|PDB";

    # The combined sequence uses its ID as description as well...
    $WriteSequenceFile->($PDBFilesInfo{OutFileNames}[$FileIndex][0], $CombinedSequenceID, $CombinedSequenceID, $CombinedSequence);
  }
  else {
    # For each specified chain, write out the sequences...
    for $ChainIndex (0 .. $#{$PDBFilesInfo{SpecifiedChains}[$FileIndex]}) {
      $ChainID = $PDBFilesInfo{SpecifiedChains}[$FileIndex][$ChainIndex];

      $WriteSequenceFile->(
        $PDBFilesInfo{OutFileNames}[$FileIndex][$ChainIndex],
        $ChainSequencesDataMap{IDs}[$ChainIndex],
        $ChainSequencesDataMap{Description}{$ChainID},
        $ChainSequencesDataMap{Sequence}{$ChainID}
      );
    }
  }
}
| 296 | |
# Extract atoms...
#
# Writes a new PDB file for the input file at $FileIndex containing the
# records of the configured record type that are selected by the mode: all
# atoms, atoms with specified names, atoms with specified numbers, or atoms
# within a number range. TER records are regenerated after chains that
# contributed at least one record; MODEL/ENDMDL records are copied; CONECT
# records returned by GetConectRecordLines for the written atom numbers and
# an END record are appended. Dies when the output file can't be opened or
# closed.
sub ExtractByAtoms {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($PDBFileName, $OutFH, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $IgnoreRecord, $ConectRecordLinesRef, %AtomNumbersMap);

  $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
  print "Generating PDBFileName file $PDBFileName...\n";
  # Three-argument open: the file name is never parsed for a mode...
  open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

  # Write out header and other older records...
  WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);

  # Write out all ATOM records along with TER and model records to indicate
  # chains and multiple models..
  %AtomNumbersMap = ();
  $ChainRecordCount = 0;
  for $RecordLine (@{$PDBRecordLinesRef}) {
    if (CheckRecordType($RecordLine)) {
      ($AtomNumber, $AtomName) = ParseAtomRecordLine($RecordLine);

      # Check atoms...
      $IgnoreRecord = 1;
      if ($OptionsInfo{Mode} =~ /^Atoms$/i) {
        $IgnoreRecord = 0;
      }
      elsif ($OptionsInfo{Mode} =~ /^(CAlphas|AtomNames)$/i) {
        if (exists $OptionsInfo{SpecifiedAtomNamesMap}{lc $AtomName}) {
          $IgnoreRecord = 0;
        }
      }
      elsif ($OptionsInfo{Mode} =~ /^AtomNums$/i) {
        if (exists $OptionsInfo{SpecifiedAtomNumsMap}{$AtomNumber}) {
          $IgnoreRecord = 0;
        }
      }
      elsif ($OptionsInfo{Mode} =~ /^AtomsRange$/i) {
        if ($AtomNumber >= $OptionsInfo{SpecifiedStartAtomNum} && $AtomNumber <= $OptionsInfo{SpecifiedEndAtomNum}) {
          $IgnoreRecord = 0;
        }
      }

      if (!$IgnoreRecord) {
        $ChainRecordCount++;
        print {$OutFH} "$RecordLine\n";

        $AtomNumber = int $AtomNumber;
        $AtomNumbersMap{$AtomNumber} = $AtomName;
      }
    }
    elsif (IsTerRecordType($RecordLine)) {
      # Only terminate chains that contributed at least one record...
      if ($ChainRecordCount) {
        print {$OutFH} GenerateTerRecordLine(), "\n";
      }
      $ChainRecordCount = 0;
    }
    elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
      print {$OutFH} "$RecordLine\n";
    }
  }

  # Write out appropriate CONECT records...
  $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
  for $RecordLine (@{$ConectRecordLinesRef}) {
    print {$OutFH} "$RecordLine\n";
  }

  # Write out END record...
  print {$OutFH} GenerateEndRecordLine(), "\n";

  # Buffered write errors only surface at close...
  close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
}
| 368 | |
# Extract residues...
#
# Writes a new PDB file for the input file at $FileIndex containing the
# records of the configured record type whose residue is selected by the
# mode: specified residue numbers, a residue number range, or specified
# residue names. TER records are regenerated after chains that contributed at
# least one record; MODEL/ENDMDL records are copied; CONECT records returned
# by GetConectRecordLines for the written atom numbers and an END record are
# appended. Dies when the output file can't be opened or closed.
sub ExtractByResidues {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($PDBFileName, $OutFH, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ResidueName, $ResidueNumber, $ConectRecordLinesRef, $IgnoreRecord, %AtomNumbersMap);

  $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
  print "Generating PDBFileName file $PDBFileName...\n";
  # Three-argument open: the file name is never parsed for a mode...
  open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

  # Write out header and other older records...
  WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);

  # Write out all ATOM records for specified residues with TER and model records to indicate
  # chains and multiple models...
  %AtomNumbersMap = ();
  $ChainRecordCount = 0;
  for $RecordLine (@{$PDBRecordLinesRef}) {
    if (CheckRecordType($RecordLine)) {
      # Alternate location and chain ID are not needed here...
      ($AtomNumber, $AtomName, undef, $ResidueName, undef, $ResidueNumber) = ParseAtomRecordLine($RecordLine);

      # Check residues...
      $IgnoreRecord = 1;
      if ($OptionsInfo{Mode} =~ /^ResidueNums$/i) {
        if (exists $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNumber}) {
          $IgnoreRecord = 0;
        }
      }
      elsif ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
        if ($ResidueNumber >= $OptionsInfo{SpecifiedStartResidueNum} && $ResidueNumber <= $OptionsInfo{SpecifiedEndResidueNum}) {
          $IgnoreRecord = 0;
        }
      }
      elsif ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
        if (exists $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName}) {
          $IgnoreRecord = 0;
        }
      }

      if (!$IgnoreRecord) {
        $ChainRecordCount++;
        print {$OutFH} "$RecordLine\n";
        $AtomNumber = int $AtomNumber;
        $AtomNumbersMap{$AtomNumber} = $AtomName;
      }
    }
    elsif (IsTerRecordType($RecordLine)) {
      # Only terminate chains that contributed at least one record...
      if ($ChainRecordCount) {
        print {$OutFH} GenerateTerRecordLine(), "\n";
      }
      $ChainRecordCount = 0;
    }
    elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
      print {$OutFH} "$RecordLine\n";
    }
  }

  # Write out appropriate CONECT records...
  $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
  for $RecordLine (@{$ConectRecordLinesRef}) {
    print {$OutFH} "$RecordLine\n";
  }

  # Write out END record...
  print {$OutFH} GenerateEndRecordLine(), "\n";

  # Buffered write errors only surface at close...
  close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
}
| 434 | |
# Extract non water records...
#
# Writes a new PDB file for the input file at $FileIndex containing the
# records of the configured record type whose residue name is not a key of
# the specified water residues map. TER records are regenerated after chains
# that contributed at least one record; MODEL/ENDMDL records are copied;
# CONECT records returned by GetConectRecordLines for the written atom
# numbers and an END record are appended. Dies when the output file can't be
# opened or closed.
sub ExtractNonWaterRecords {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($PDBFileName, $OutFH, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ResidueName, $ConectRecordLinesRef, %AtomNumbersMap);

  $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
  print "Generating PDBFileName file $PDBFileName...\n";
  # Three-argument open: the file name is never parsed for a mode...
  open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

  # Write out header and other older records...
  WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);

  # Write out all ATOM/HETATM non water records along with TER and model records to indicate
  # chains and multiple models..
  %AtomNumbersMap = ();
  $ChainRecordCount = 0;
  for $RecordLine (@{$PDBRecordLinesRef}) {
    if (CheckRecordType($RecordLine)) {
      # Alternate location is not needed here...
      ($AtomNumber, $AtomName, undef, $ResidueName) = ParseAtomRecordLine($RecordLine);
      if (! exists $OptionsInfo{SpecifiedWaterResiduesMap}{$ResidueName} ) {
        $ChainRecordCount++;
        print {$OutFH} "$RecordLine\n";
        $AtomNumber = int $AtomNumber;
        $AtomNumbersMap{$AtomNumber} = $AtomName;
      }
    }
    elsif (IsTerRecordType($RecordLine)) {
      # Only terminate chains that contributed at least one record...
      if ($ChainRecordCount) {
        print {$OutFH} GenerateTerRecordLine(), "\n";
      }
      $ChainRecordCount = 0;
    }
    elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
      print {$OutFH} "$RecordLine\n";
    }
  }

  # Write out appropriate CONECT records...
  $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
  for $RecordLine (@{$ConectRecordLinesRef}) {
    print {$OutFH} "$RecordLine\n";
  }

  # Write out END record...
  print {$OutFH} GenerateEndRecordLine(), "\n";

  # Buffered write errors only surface at close...
  close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
}
| 482 | |
# Extract non hydrogen records...
#
# Writes a new PDB file for the input file at $FileIndex containing the
# records of the configured record type whose element symbol is not exactly
# "H" (case insensitive). TER records are regenerated after chains that
# contributed at least one record; MODEL/ENDMDL records are copied; CONECT
# records returned by GetConectRecordLines for the written atom numbers and
# an END record are appended. Dies when the output file can't be opened or
# closed.
sub ExtractNonHydrogenRecords {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($PDBFileName, $OutFH, $RecordLine, $ChainRecordCount, $AtomNumber, $AtomName, $ElementSymbol, $ConectRecordLinesRef, %AtomNumbersMap);

  $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
  print "Generating PDBFileName file $PDBFileName...\n";
  # Three-argument open: the file name is never parsed for a mode...
  open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

  # Write out header and other older records...
  WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);

  # Write out all ATOM/HETATM non hydrogen records along with TER and model records to indicate
  # chains and multiple models..
  %AtomNumbersMap = ();
  $ChainRecordCount = 0;
  for $RecordLine (@{$PDBRecordLinesRef}) {
    if (CheckRecordType($RecordLine)) {
      # Of the parsed fields only atom number (1st), atom name (2nd) and
      # element symbol (14th) are used...
      ($AtomNumber, $AtomName, $ElementSymbol) = (ParseAtomRecordLine($RecordLine))[0, 1, 13];
      if ($ElementSymbol !~ /^H$/i) {
        $ChainRecordCount++;
        print {$OutFH} "$RecordLine\n";
        $AtomNumber = int $AtomNumber;
        $AtomNumbersMap{$AtomNumber} = $AtomName;
      }
    }
    elsif (IsTerRecordType($RecordLine)) {
      # Only terminate chains that contributed at least one record...
      if ($ChainRecordCount) {
        print {$OutFH} GenerateTerRecordLine(), "\n";
      }
      $ChainRecordCount = 0;
    }
    elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
      print {$OutFH} "$RecordLine\n";
    }
  }

  # Write out appropriate CONECT records...
  $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
  for $RecordLine (@{$ConectRecordLinesRef}) {
    print {$OutFH} "$RecordLine\n";
  }

  # Write out END record...
  print {$OutFH} GenerateEndRecordLine(), "\n";

  # Buffered write errors only surface at close...
  close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
}
| 530 | |
# Extract ATOM/HETATM records by distance...
#
# Writes a new PDB file for the input file at $FileIndex containing the
# records of the configured record type that meet the distance criterion
# relative to the file's distance origin coordinates. In "ByAtom" selection
# mode every record is checked on its own. In "ByResidue" selection mode the
# decision is made once per residue ID (name, number and chain ID), the first
# time the residue is seen, and reused for its remaining records. TER records
# are regenerated after chains that contributed at least one record;
# MODEL/ENDMDL records are copied; CONECT records returned by
# GetConectRecordLines for the written atom numbers and an END record are
# appended. Dies when the output file can't be opened or closed.
sub ExtractByDistance {
  my($FileIndex, $PDBRecordLinesRef) = @_;
  my($PDBFileName, $OutFH, $RecordLine, $RecordLineNum, $ChainRecordCount, $ConectRecordLinesRef, $AtomNumber, $AtomName, $ResidueName, $ChainID, $ResidueNumber, $X, $Y, $Z, $IgnoreRecord, $ResidueID, @OriginCoords, @Coords, %AtomNumbersMap, %ResiduesDataMap);

  $PDBFileName = $PDBFilesInfo{OutFileNames}[$FileIndex][0];
  print "Generating PDBFileName file $PDBFileName...\n";
  # Three-argument open: the file name is never parsed for a mode...
  open $OutFH, '>', $PDBFileName or die "Error: Can't open $PDBFileName: $! \n";

  # Write out header and other older records...
  WriteHeaderAndOlderRecords($OutFH, $PDBRecordLinesRef);

  # Setup coordinates of origin to calculate distance...
  @OriginCoords = @{$PDBFilesInfo{DistanceOrigin}[$FileIndex]};

  # Write out all ATOM records for which meet specified criteria along with TER and model records to indicate
  # chains and multiple models...
  %AtomNumbersMap = ();

  # Residue IDs already seen and their selection status...
  %ResiduesDataMap = (ID => {}, Status => {});

  $ChainRecordCount = 0;
  $RecordLineNum = 0;

  for $RecordLine (@{$PDBRecordLinesRef}) {
    $RecordLineNum++;
    if (CheckRecordType($RecordLine)) {
      # Alternate location and insertion code are not needed here...
      ($AtomNumber, $AtomName, undef, $ResidueName, $ChainID, $ResidueNumber, undef, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
      @Coords = ($X, $Y, $Z);

      $IgnoreRecord = 1;
      if ($OptionsInfo{DistanceSelectionMode} =~ /^ByResidue$/i) {
        $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";
        if (exists $ResiduesDataMap{ID}{$ResidueID}) {
          # Residue data has been processed; check its selection status...
          if ($ResiduesDataMap{Status}{$ResidueID}) {
            $IgnoreRecord = 0;
          }
        }
        else {
          # Residue hasn't been processed; decide for the whole residue by
          # scanning its records starting at the current line...
          $ResiduesDataMap{ID}{$ResidueID} = $ResidueID;
          $ResiduesDataMap{Status}{$ResidueID} = 0;
          if (CheckResidueDistance($ResidueID, $RecordLineNum, $PDBRecordLinesRef, \@OriginCoords)) {
            $IgnoreRecord = 0;
            $ResiduesDataMap{Status}{$ResidueID} = 1;
          }
        }
      }
      elsif ($OptionsInfo{DistanceSelectionMode} =~ /^ByAtom$/i) {
        if (CheckDistance(\@Coords, \@OriginCoords)) {
          $IgnoreRecord = 0;
        }
      }

      if (!$IgnoreRecord) {
        $ChainRecordCount++;
        print {$OutFH} "$RecordLine\n";
        $AtomNumber = int $AtomNumber;
        $AtomNumbersMap{$AtomNumber} = $AtomName;
      }
    }
    elsif (IsTerRecordType($RecordLine)) {
      # Only terminate chains that contributed at least one record...
      if ($ChainRecordCount) {
        print {$OutFH} GenerateTerRecordLine(), "\n";
      }
      $ChainRecordCount = 0;
    }
    elsif (IsModelRecordType($RecordLine) || IsEndmdlRecordType($RecordLine)) {
      print {$OutFH} "$RecordLine\n";
    }
  }

  # Write out appropriate CONECT records...
  $ConectRecordLinesRef = GetConectRecordLines($PDBRecordLinesRef, \%AtomNumbersMap);
  for $RecordLine (@{$ConectRecordLinesRef}) {
    print {$OutFH} "$RecordLine\n";
  }

  # Write out END record...
  print {$OutFH} GenerateEndRecordLine(), "\n";

  # Buffered write errors only surface at close...
  close $OutFH or die "Error: Can't close $PDBFileName: $! \n";
}
| 618 | |
# Is the record line of the type selected by the record mode option?
#
# Returns 1 when the line is an ATOM record in "Atom" mode, a HETATM record
# in "Hetatm" mode, or either of them in "AtomAndHetatm" mode; returns 0
# otherwise, including for any other record mode value.
sub CheckRecordType {
  my($RecordLine) = @_;
  my $RecordMode = $OptionsInfo{RecordMode};

  if ($RecordMode =~ /^Atom$/i) {
    return IsAtomRecordType($RecordLine) ? 1 : 0;
  }
  if ($RecordMode =~ /^Hetatm$/i) {
    return IsHetatmRecordType($RecordLine) ? 1 : 0;
  }
  if ($RecordMode =~ /^AtomAndHetatm$/i) {
    return (IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine)) ? 1 : 0;
  }

  return 0;
}
| 637 | |
# Does any record of the specified residue meet the distance criterion
# specified by the user?
#
# Scans the record lines starting at line number $StartingLineNum (1-based),
# skipping lines that are not of the configured record type, and stops at the
# first record that belongs to a different residue. Returns 1 as soon as one
# record of the residue passes CheckDistance against $OriginCoordsRef and 0
# when none does.
sub CheckResidueDistance {
  my($SpecifiedResidueID, $StartingLineNum, $PDBRecordLinesRef, $OriginCoordsRef) = @_;
  my($Status, $RecordLine, $RecordLineIndex, $ResidueName, $ChainID, $ResidueNumber, $X, $Y, $Z, $ResidueID, @Coords);

  $Status = 0;

  RECORDLINE: for $RecordLineIndex (($StartingLineNum - 1) .. $#{$PDBRecordLinesRef}) {
    $RecordLine = $PDBRecordLinesRef->[$RecordLineIndex];
    if (!CheckRecordType($RecordLine)) {
      next RECORDLINE;
    }
    ($ResidueName, $ChainID, $ResidueNumber, $X, $Y, $Z) = (ParseAtomRecordLine($RecordLine))[3, 4, 5, 7, 8, 9];
    $ResidueID = "${ResidueName}_${ResidueNumber}_${ChainID}";

    # Compare the IDs literally (case insensitive): \Q...\E keeps characters
    # such as "+" or "*" in a residue name or chain ID from being treated as
    # regular expression metacharacters...
    if ($ResidueID !~ /^\Q$SpecifiedResidueID\E$/i) {
      # It's a new residue line...
      last RECORDLINE;
    }

    # Check distance...
    @Coords = ($X, $Y, $Z);
    if (CheckDistance(\@Coords, $OriginCoordsRef)) {
      # Distance criterion is met for at least one record in the residue...
      $Status = 1;
      last RECORDLINE;
    }
  }
  return $Status;
}
| 668 | |
# Are the given coordinates within the maximum extraction distance specified
# by the user?
#
# $CoordsRef holds one X, Y, Z triple. In "Residue" extraction distance mode
# $OriginCoordsRef is read as consecutive X, Y, Z triples and one triple
# within range is enough; in any other mode only its first three values are
# used. Returns 1 when the criterion is met and 0 otherwise.
sub CheckDistance {
  my($CoordsRef, $OriginCoordsRef) = @_;

  if ($OptionsInfo{ExtractionDistanceMode} !~ /^Residue$/i) {
    # Distance to a single origin point...
    my $SumOfSquares = 0;
    $SumOfSquares += ($CoordsRef->[$_] - $OriginCoordsRef->[$_])**2 for (0 .. 2);
    return (sqrt($SumOfSquares) <= $OptionsInfo{MaxExtractionDistance}) ? 1 : 0;
  }

  # Go over coordinates of all the atoms in the residue...
  my $OriginValuesCount = scalar @{$OriginCoordsRef};
  my $Offset = 0;
  while ($Offset < $OriginValuesCount) {
    my $SumOfSquares = ($CoordsRef->[0] - $OriginCoordsRef->[$Offset])**2 + ($CoordsRef->[1] - $OriginCoordsRef->[$Offset + 1])**2 + ($CoordsRef->[2] - $OriginCoordsRef->[$Offset + 2])**2;
    if (sqrt($SumOfSquares) <= $OptionsInfo{MaxExtractionDistance}) {
      return 1;
    }
    $Offset += 3;
  }

  return 0;
}
| 699 | |
# Write out modified header and other older records...
#
# Prints the first output line to $OutFileRef: a HEADER record regenerated
# from the input ID code with a fixed classification when the
# ModifyHeaderRecord option is on, otherwise the first input record line as
# is. With the KeepOldRecords option on, the input record lines after the
# first are copied as well, up to but not including the first
# MODEL/ATOM/HETATM record.
sub WriteHeaderAndOlderRecords {
  my($OutFileRef, $PDBRecordLinesRef) = @_;

  if (!$OptionsInfo{ModifyHeaderRecord}) {
    print {$OutFileRef} $PDBRecordLinesRef->[0], "\n";
  }
  else {
    # Write out modified HEADER record; only the ID code is carried over...
    my(undef, undef, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);
    print {$OutFileRef} GenerateHeaderRecordLine($IDCode, 'Data extracted using MayaChemTools'), "\n";
  }

  # Nothing more to write unless old records are to be kept...
  return if !$OptionsInfo{KeepOldRecords};

  # Skip HEADER record and write out older records all the way up to first MODEL/ATOM/HETATM records from input file...
  for my $LineIndex (1 .. $#{$PDBRecordLinesRef}) {
    my $OldRecordLine = $PDBRecordLinesRef->[$LineIndex];
    last if (IsModelRecordType($OldRecordLine) || IsAtomRecordType($OldRecordLine) || IsHetatmRecordType($OldRecordLine));
    print {$OutFileRef} "$OldRecordLine\n";
  }
}
| 727 | |
# Get header record information assuming it's the first record...
#
# Returns the list (classification, deposition date, ID code) parsed from the
# first record line when it is a HEADER record, and three empty strings
# otherwise.
sub GetHeaderRecordInformation {
  my($PDBRecordLinesRef) = @_;
  my $FirstRecordLine = $PDBRecordLinesRef->[0];

  if (!IsHeaderRecordType($FirstRecordLine)) {
    return ('', '', '');
  }

  my($Classification, $DepositionDate, $IDCode) = ParseHeaderRecordLine($FirstRecordLine);
  return ($Classification, $DepositionDate, $IDCode);
}
| 740 | |
# Map a residue name to its one letter sequence code.
#
# Returns a two element list: the residue code and a flag which is 1 for a
# recognized residue and 0 otherwise. Names which are not recognized get the
# code stored in $OptionsInfo{NonStandardSequenceCode}.
sub GetResidueCode {
  my($ResidueName) = @_;

  my($NonStandardCode) = $OptionsInfo{NonStandardSequenceCode};
  my($NameLength) = length($ResidueName);

  if ($NameLength == 3) {
    # Three letter names are looked up as amino acids...
    return ($NonStandardCode, 0) unless AminoAcids::IsAminoAcid($ResidueName);

    my $OneLetterCode = AminoAcids::GetAminoAcidOneLetterCode($ResidueName);
    return ($OneLetterCode, 1);
  }

  # Single letter names are checked against the nucleic acid letters and used
  # as is...
  if ($NameLength == 1 && $ResidueName =~ /^(A|G|T|U|C)$/i) {
    return ($ResidueName, 1);
  }

  return ($NonStandardCode, 0);
}
| 767 | |
# Process option values...
#
# Rebuilds %OptionsInfo from the raw command line values in %Options. Residue,
# atom, and distance options are delegated to ProcessResiduesOptions,
# ProcessAtomsOptions, and ProcessDistanceOptions.
#
# Blanks are removed from the comma delimited "-c, --chains" and
# "--WaterResidueNames" lists before splitting, the same way it is done for
# "--Residues" and "--Atoms"; otherwise a value such as "A, B" yields the
# entry " B".
sub ProcessOptions {
  %OptionsInfo = ();
  $OptionsInfo{Mode} = $Options{mode};

  # Chains: First, All, or an explicit comma delimited list of chain IDs...
  my(@SpecifiedChains) = ();
  if ($Options{chains} =~ /^(First|All)$/i) {
    $OptionsInfo{ChainsToExtract} = $Options{chains};
  }
  else {
    my($ChainsList) = $Options{chains};
    $ChainsList =~ s/ //g;
    @SpecifiedChains = split /\,/, $ChainsList;
    $OptionsInfo{ChainsToExtract} = 'Specified';
  }
  @{$OptionsInfo{SpecifiedChains}} = ();
  push @{$OptionsInfo{SpecifiedChains}}, @SpecifiedChains;

  # Both settings are driven by the same "--CombineChains" option...
  $OptionsInfo{CombineChains} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{CombineChainSequences} = ($Options{combinechains} =~ /^Yes$/i) ? 1 : 0;

  ProcessResiduesOptions();
  ProcessAtomsOptions();
  ProcessDistanceOptions();

  # Water residue names are set up only for NonWater mode...
  $OptionsInfo{WaterResidueNames} = $Options{waterresiduenames};
  @{$OptionsInfo{SpecifiedWaterResiduesList}} = ();
  %{$OptionsInfo{SpecifiedWaterResiduesMap}} = ();

  my(@SpecifiedWaterResiduesList) = ();
  if ($OptionsInfo{Mode} =~ /^NonWater$/i) {
    my($WaterResidueName);
    if ($OptionsInfo{WaterResidueNames} =~ /Automatic/i) {
      push @SpecifiedWaterResiduesList, ('HOH', 'WAT', 'H2O');
    }
    else {
      my($WaterResidueNamesList) = $Options{waterresiduenames};
      $WaterResidueNamesList =~ s/ //g;
      @SpecifiedWaterResiduesList = split /\,/, $WaterResidueNamesList;
    }
    for $WaterResidueName (@SpecifiedWaterResiduesList) {
      $OptionsInfo{SpecifiedWaterResiduesMap}{$WaterResidueName} = $WaterResidueName;
    }
  }
  push @{$OptionsInfo{SpecifiedWaterResiduesList}}, @SpecifiedWaterResiduesList;

  # Without an explicit "--RecordMode", atom oriented modes default to ATOM
  # records and all other modes to both ATOM and HETATM records...
  $OptionsInfo{RecordMode} = $Options{recordmode} ? $Options{recordmode} : ($Options{mode} =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames)$/i ? "Atom" : "AtomAndHetatm");

  $OptionsInfo{KeepOldRecords} = ($Options{keepoldrecords} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{ModifyHeaderRecord} = ($Options{modifyheader} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{KeepNonStandardSequences} = ($Options{nonstandardkeep} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{NonStandardSequenceCode} = $Options{nonstandardcode};
  $OptionsInfo{MaxSequenceLength} = $Options{sequencelength};
  $OptionsInfo{SequenceRecordSource} = $Options{sequencerecords};
  $OptionsInfo{SequenceIDPrefixSource} = $Options{sequenceidprefix};

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
| 828 | |
# Process specified residue options...
#
# Validates the "--Residues" value for ResidueNums, ResiduesRange, and
# ResidueNames modes and stores the results in %OptionsInfo:
#
#   SpecifiedResidueNumsList, SpecifiedResidueNumsMap   - residue numbers
#   SpecifiedStartResidueNum, SpecifiedEndResidueNum    - range limits or 0
#   SpecifiedResidueNamesList, SpecifiedResidueNamesMap - residue names; map
#                                                         keys are lower case
#
# In any other mode all of these are set to empty lists/maps and 0.
# Dies with an error message on a missing or invalid value.
sub ProcessResiduesOptions {
  my($ResidueNum, $StartResidueNum, $EndResNum, $ResidueName, @SpecifiedResidueNumsList, @SpecifiedResidueNamesList);

  @SpecifiedResidueNumsList = ();
  ($StartResidueNum, $EndResNum) = (0, 0);

  @SpecifiedResidueNamesList = ();

  if ($OptionsInfo{Mode} =~ /^(ResidueNums|ResiduesRange|ResidueNames)$/i) {
    if (!$Options{residues}) {
      die "Error: You must specify a value for \"--Residues\" option in \"ResidueNums, ResiduesRange, or ResidueNames\" \"-m, --mode\". \n";
    }
    # Blanks around list entries are not significant...
    $OptionsInfo{Residues} = $Options{residues};
    $OptionsInfo{Residues} =~ s/ //g;

    if ($OptionsInfo{Mode} =~ /^ResidueNames$/i) {
      @SpecifiedResidueNamesList = split /\,/, $OptionsInfo{Residues};
    }
    else {
      @SpecifiedResidueNumsList = split /\,/, $OptionsInfo{Residues};
      for $ResidueNum (@SpecifiedResidueNumsList) {
        if (!IsPositiveInteger($ResidueNum)) {
          die "Error: Invalid residue number value, $ResidueNum, for \"--Residues\" option during \"ResidueNums\" or \"ResiduesRange\" value of \"-m --mode\" option: Residue number must be a positive integer.\n";
        }
      }
      if ($OptionsInfo{Mode} =~ /^ResiduesRange$/i) {
        if (@SpecifiedResidueNumsList != 2) {
          die "Error: Invalid number of residue number values, ", scalar(@SpecifiedResidueNumsList), ", for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end residue numbers.\n";
        }
        # Identical start and end values are accepted...
        if ($SpecifiedResidueNumsList[0] > $SpecifiedResidueNumsList[1]) {
          die "Error: Invalid residue number values, @SpecifiedResidueNumsList, for \"--Residues\" option during \"ResiduesRange\" value of \"-m --mode\" option: The start residue number must be less than or equal to end residue number.\n";
        }
        ($StartResidueNum, $EndResNum) = @SpecifiedResidueNumsList;
      }
    }
  }

  @{$OptionsInfo{SpecifiedResidueNumsList}} = ();
  push @{$OptionsInfo{SpecifiedResidueNumsList}}, @SpecifiedResidueNumsList;

  $OptionsInfo{SpecifiedStartResidueNum} = $StartResidueNum;
  $OptionsInfo{SpecifiedEndResidueNum} = $EndResNum;

  @{$OptionsInfo{SpecifiedResidueNamesList}} = ();
  push @{$OptionsInfo{SpecifiedResidueNamesList}}, @SpecifiedResidueNamesList;

  # Set up a specified residue numbers map...
  %{$OptionsInfo{SpecifiedResidueNumsMap}} = ();
  for $ResidueNum (@{$OptionsInfo{SpecifiedResidueNumsList}}) {
    $OptionsInfo{SpecifiedResidueNumsMap}{$ResidueNum} = $ResidueNum;
  }

  # Set up a specified residue names map using lower case names...
  %{$OptionsInfo{SpecifiedResidueNamesMap}} = ();
  for $ResidueName (@{$OptionsInfo{SpecifiedResidueNamesList}}) {
    $OptionsInfo{SpecifiedResidueNamesMap}{lc $ResidueName} = lc $ResidueName;
  }

}
| 889 | |
# Validate the "--Atoms" value and record the atom selection in %OptionsInfo.
#
# Keys written: Atoms (only for AtomNums, AtomsRange, and AtomNames modes),
# SpecifiedAtomNumsList, SpecifiedAtomNumsMap, SpecifiedStartAtomNum,
# SpecifiedEndAtomNum, SpecifiedAtomNamesList, and SpecifiedAtomNamesMap
# (keyed by lower case atom name). CAlphas mode selects the atom name CA.
# Dies with an error message on a missing or invalid value.
sub ProcessAtomsOptions {
  my(@AtomNums) = ();
  my(@AtomNames) = ();
  my($RangeStart, $RangeEnd) = (0, 0);
  my($Mode) = $OptionsInfo{Mode};

  if ($Mode =~ /^CAlphas$/i) {
    @AtomNames = ("CA");
  }
  elsif ($Mode =~ /^(AtomNums|AtomsRange|AtomNames)$/i) {
    die "Error: You must specify a value for \"--Atoms\" option in \"AtomNums, AtomsRange, or AtomNames\" \"-m, --mode\". \n" if (!$Options{atoms});

    # Drop blanks before splitting the comma delimited list...
    $OptionsInfo{Atoms} = $Options{atoms};
    $OptionsInfo{Atoms} =~ s/ //g;
    my(@Values) = split /\,/, $OptionsInfo{Atoms};

    if ($Mode =~ /^AtomNames$/i) {
      @AtomNames = @Values;
    }
    else {
      @AtomNums = @Values;

      my($AtomNum);
      for $AtomNum (@AtomNums) {
        die "Error: Invalid atom number value, $AtomNum, for \"--Atoms\" option during \"AtomNums\" or \"AtomsRange\"value of \"-m --mode\" option: Atom number must be a positive integer.\n" unless IsPositiveInteger($AtomNum);
      }

      if ($Mode =~ /^AtomsRange$/i) {
        # A range needs exactly two values in non-descending order...
        die "Error: Invalid number of atom number values, ", scalar(@AtomNums), ", for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The number of values must be 2 corresponding to start and end atom numbers.\n" if (@AtomNums != 2);
        die "Error: Invalid atom number values, @AtomNums, for \"--Atoms\" option during \"AtomsRange\" value of \"-m --mode\" option: The start atom number must be less than end atom number.\n" if ($AtomNums[0] > $AtomNums[1]);

        ($RangeStart, $RangeEnd) = @AtomNums;
      }
    }
  }

  # Lists...
  @{$OptionsInfo{SpecifiedAtomNumsList}} = @AtomNums;
  @{$OptionsInfo{SpecifiedAtomNamesList}} = @AtomNames;

  # Range limits...
  $OptionsInfo{SpecifiedStartAtomNum} = $RangeStart;
  $OptionsInfo{SpecifiedEndAtomNum} = $RangeEnd;

  # Lookup maps for atom numbers and lower case atom names...
  %{$OptionsInfo{SpecifiedAtomNumsMap}} = map { ($_ => $_) } @AtomNums;
  %{$OptionsInfo{SpecifiedAtomNamesMap}} = map { (lc($_) => lc($_)) } @AtomNames;
}
| 953 | |
# Process specified distance options...
#
# Copies the distance related values from %Options into %OptionsInfo and, in
# Distance mode, validates the "--distanceorigin" value against the selected
# "--distancemode":
#
#   Atom, Hetatm - 2 values; the first one must be a positive integer
#   Residue      - 2 or 3 values; the first one must be a positive integer
#   XYZ          - 3 numerical values
#
# The individual origin values are stored, with blanks removed, in
# @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}; the list is empty in all
# other modes. $OptionsInfo{ExtractionDistanceOrigin} keeps the value as typed.
# Dies with an error message on a missing or invalid value.
sub ProcessDistanceOptions {
  my(@SpecifiedDistanceOrigin) = ();

  $OptionsInfo{MaxExtractionDistance} = $Options{distance};
  $OptionsInfo{ExtractionDistanceMode} = $Options{distancemode};
  $OptionsInfo{ExtractionDistanceOrigin} = $Options{distanceorigin} ? $Options{distanceorigin} : '';
  $OptionsInfo{DistanceSelectionMode} = $Options{distanceselectionmode};

  if ($OptionsInfo{Mode} =~ /^Distance$/i) {
    if (!$Options{distanceorigin}) {
      die "Error: You must specify a value for \"--distanceorigin\" option in \"Distance\" \"-m, --mode\". \n";
    }

    # Remove blanks so that values such as "12, CA" match atom and residue
    # names later on...
    my($DistanceOrigin) = $Options{distanceorigin};
    $DistanceOrigin =~ s/ //g;
    @SpecifiedDistanceOrigin = split /\,/, $DistanceOrigin;

    if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) {
      # Atom and Hetatm modes share the same rules and differ only in the
      # names used in messages...
      my($ModeLabel) = ($OptionsInfo{ExtractionDistanceMode} =~ /^Atom$/i) ? 'Atom' : 'Hetatm';
      my($NumberLabel) = lc $ModeLabel;

      if (@SpecifiedDistanceOrigin != 2) {
        die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"$ModeLabel\" value of \"--distancemode\" : The number of values must be 2.\n";
      }
      if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
        die "Error: Invalid $NumberLabel number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"$ModeLabel\" value of \"--distancemode\". Allowed values: > 0\n";
      }
    }
    elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
      if (!(@SpecifiedDistanceOrigin == 2 || @SpecifiedDistanceOrigin == 3)) {
        die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\" : The number of values must be either 2 or 3.\n";
      }
      if (!IsPositiveInteger($SpecifiedDistanceOrigin[0])) {
        die "Error: Invalid residue number value, ", $SpecifiedDistanceOrigin[0], ", for option \"distanceorigin\" option during \"Residue\" value of \"--distancemode\". Allowed values: > 0\n";
      }
    }
    elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
      if (@SpecifiedDistanceOrigin != 3) {
        die "Error: Invalid number of values, ", scalar(@SpecifiedDistanceOrigin), " for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\" : The number of values must be 3.\n";
      }
      my($Value);
      for $Value (@SpecifiedDistanceOrigin) {
        if (!IsNumerical($Value)) {
          # Report the coordinate which actually failed the check...
          die "Error: Invalid coordinate value, ", $Value, ", for option \"distanceorigin\" option during \"XYZ\" value of \"--distancemode\". Allowed values: numerical\n";
        }
      }
    }
  }
  @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} = ();
  push @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}}, @SpecifiedDistanceOrigin;

}
| 1008 | |
# Retrieve information about PDB files...
#
# For each file in @PDBFilesList: check that it exists, is a PDB file, and can
# be opened; read its records; resolve the chains to process; resolve the
# distance origin coordinates in Distance mode; and set up output file names.
#
# Results are stored by file index in %PDBFilesInfo under these keys:
#
#   FileOkay               - 1 when the file passed all checks; otherwise 0
#   OutFileRoot            - root used for output file names
#   OutFileNames           - list of output file names
#   ChainLabels            - list of "Chain<ID>" labels for the chains
#   ChainSequenceIDs       - list of sequence IDs; Sequences mode only
#   ChainSequenceIDsPrefix - list of sequence ID prefixes; Sequences mode only
#   SpecifiedChains        - list of chain IDs to process
#   DistanceOrigin         - list of X, Y, Z values; Distance mode only
#
# A file failing any check is reported using a warning and skipped: FileOkay
# stays 0 and all its lists stay empty.
sub RetrievePDBFilesInfo {
  my($Index, $PDBFile, $PDBFileHandle, $PDBRecordLinesRef, $ChainID, $ChainLabel, $ChainsAndResiduesInfoRef, $Mode, $FileDir, $FileName, $FileExt, $OutFileName, $OutFileRoot, $InfoKey, @SpecifiedChains, @DistanceOrigin, @OutFileNames, @ChainLabels, @ChainSequenceIDs, @ChainSequenceIDsPrefix);

  # Keys in %PDBFilesInfo holding one list per file...
  my(@PerFileListKeys) = qw(OutFileNames ChainLabels ChainSequenceIDs ChainSequenceIDsPrefix SpecifiedChains DistanceOrigin);

  # Output file name suffixes, keyed by lower case mode, for modes which
  # generate a single PDB file...
  my(%OutFileSuffixForMode) = (
    atoms => 'Atoms',
    calphas => 'CAlphas',
    atomnums => 'AtomNums',
    atomsrange => 'AtomsRange',
    atomnames => 'AtomNames',
    residuenums => 'ResidueNums',
    residuesrange => 'ResiduesRange',
    residuenames => 'ResidueNames',
    nonwater => 'NonWater',
    nonhydrogens => 'NonHydrogens',
  );

  # Distance mode names, keyed by lower case mode, used in output file names...
  my(%DistanceModeLabel) = (atom => 'Atom', hetatm => 'Hetatm', residue => 'Residue', xyz => 'XYZ');

  %PDBFilesInfo = ();
  @{$PDBFilesInfo{FileOkay}} = ();
  @{$PDBFilesInfo{OutFileRoot}} = ();
  for $InfoKey (@PerFileListKeys) {
    @{$PDBFilesInfo{$InfoKey}} = ();
  }

  FILELIST: for $Index (0 .. $#PDBFilesList) {
    # Start out with the file marked as not okay and no data...
    $PDBFilesInfo{FileOkay}[$Index] = 0;
    $PDBFilesInfo{OutFileRoot}[$Index] = '';
    for $InfoKey (@PerFileListKeys) {
      @{$PDBFilesInfo{$InfoKey}[$Index]} = ();
    }

    $PDBFile = $PDBFilesList[$Index];
    if (!(-e $PDBFile)) {
      warn "Warning: Ignoring file $PDBFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($PDBFile, "pdb")) {
      warn "Warning: Ignoring file $PDBFile: It's not a PDB file\n";
      next FILELIST;
    }
    # Three argument open with a lexical handle: the file name comes from the
    # command line and must not be interpreted as an open mode or a pipe...
    if (!open $PDBFileHandle, '<', $PDBFile) {
      warn "Warning: Ignoring file $PDBFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    close $PDBFileHandle;

    # Get PDB data...
    $PDBRecordLinesRef = ReadPDBFile($PDBFile);
    if ($OptionsInfo{Mode} =~ /^Sequences$/i && $OptionsInfo{SequenceRecordSource} =~ /^SeqRes$/i) {
      $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef, 'SeqRes');
    }
    else {
      $ChainsAndResiduesInfoRef = GetChainsAndResidues($PDBRecordLinesRef);
    }
    if (!scalar @{$ChainsAndResiduesInfoRef->{ChainIDs}}) {
      warn "Warning: Ignoring file $PDBFile: No chains found \n";
      next FILELIST;
    }

    # Make sure specified chains exist in PDB file...
    @SpecifiedChains = ();
    if ($OptionsInfo{ChainsToExtract} =~ /^Specified$/i) {
      for $ChainID (@{$OptionsInfo{SpecifiedChains}}) {
        if (!exists $ChainsAndResiduesInfoRef->{Residues}{$ChainID}) {
          warn "Warning: Ignoring file $PDBFile: Specified chain, $ChainID, in \"-c, --chains\" option doesn't exist.\n";
          next FILELIST;
        }
        push @SpecifiedChains, $ChainID;
      }
    }
    elsif ($OptionsInfo{ChainsToExtract} =~ /^First$/i) {
      push @SpecifiedChains, $ChainsAndResiduesInfoRef->{ChainIDs}[0];
    }
    elsif ($OptionsInfo{ChainsToExtract} =~ /^All$/i) {
      push @SpecifiedChains, @{$ChainsAndResiduesInfoRef->{ChainIDs}};
    }

    # Setup chain labels to use for sequence IDs and generating output files;
    # a leading "None" in a chain ID is dropped from its label...
    @ChainLabels = ();
    for $ChainID (@SpecifiedChains) {
      $ChainLabel = $ChainID; $ChainLabel =~ s/^None//ig;
      $ChainLabel = "Chain${ChainLabel}";
      push @ChainLabels, $ChainLabel;
    }

    # Make sure specified distance origin is valid...
    @DistanceOrigin = ();
    if ($OptionsInfo{Mode} =~ /^Distance$/i) {
      if ($OptionsInfo{ExtractionDistanceMode} =~ /^(Atom|Hetatm)$/i) {
        # Origin is a single ATOM or HETATM record identified by its number and name...
        my($RecordType, $SpecifiedAtomName, $SpecifiedAtomNumber, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $RecordChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
        $RecordType = $OptionsInfo{ExtractionDistanceMode};
        ($SpecifiedAtomNumber, $SpecifiedAtomName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
        $RecordFound = 0;
        LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
          if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
            next LINE;
          }
          ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $RecordChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
          $AtomName = RemoveLeadingAndTrailingWhiteSpaces($AtomName);
          if (($RecordType =~ /^Atom$/i && IsAtomRecordType($RecordLine)) || ($RecordType =~ /^Hetatm$/i && IsHetatmRecordType($RecordLine))) {
            if ($AtomNumber == $SpecifiedAtomNumber && $AtomName eq $SpecifiedAtomName) {
              # $X, $Y, and $Z now hold coordinates of the origin record...
              $RecordFound = 1;
              last LINE;
            }
          }
        }
        if (!$RecordFound) {
          warn "Warning: Ignoring file $PDBFile: ", uc($RecordType), " record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
          next FILELIST;
        }
        push @DistanceOrigin, ($X, $Y, $Z);
      }
      elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^Residue$/i) {
        # Origin is made up of all the atoms in a residue identified by its
        # number, name, and an optional chain ID...
        my($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID, $RecordFound, $AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $RecordChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z, $RecordLine);
        $SpecifiedChainID = '';
        if (@{$OptionsInfo{SpecifiedExtractionDistanceOrigin}} == 3) {
          ($SpecifiedResidueNumber, $SpecifiedResidueName, $SpecifiedChainID) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
        }
        else {
          ($SpecifiedResidueNumber, $SpecifiedResidueName) = @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
        }
        $RecordFound = 0;
        LINE: for $RecordLine (@{$PDBRecordLinesRef}) {
          if (!(IsAtomRecordType($RecordLine) || IsHetatmRecordType($RecordLine))) {
            next LINE;
          }
          ($AtomNumber, $AtomName, $AlternateLocation, $ResidueName, $RecordChainID, $ResidueNumber, $InsertionCode, $X, $Y, $Z) = ParseAtomRecordLine($RecordLine);
          $ResidueName = RemoveLeadingAndTrailingWhiteSpaces($ResidueName);
          $RecordChainID = RemoveLeadingAndTrailingWhiteSpaces($RecordChainID);
          if ($SpecifiedChainID && ($SpecifiedChainID ne $RecordChainID)) {
            next LINE;
          }
          if ($ResidueNumber == $SpecifiedResidueNumber && $ResidueName eq $SpecifiedResidueName) {
            # Store coordinates for all the atoms...
            $RecordFound = 1;
            push @DistanceOrigin, ($X, $Y, $Z);
          }
        }
        if (!$RecordFound) {
          warn "Warning: Ignoring file $PDBFile: ATOM/HETATM record corresponding to \"--distanceorigin\" option value, $OptionsInfo{ExtractionDistanceOrigin}, doesn't exist.\n";
          next FILELIST;
        }
      }
      elsif ($OptionsInfo{ExtractionDistanceMode} =~ /^XYZ$/i) {
        push @DistanceOrigin, @{$OptionsInfo{SpecifiedExtractionDistanceOrigin}};
      }
    }

    # Setup output file names...
    @OutFileNames = ();
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($PDBFile);
    if ($OptionsInfo{OutFileRoot} && (@PDBFilesList == 1)) {
      # A specified root is used only for a single input file; it also replaces
      # the file name used later on as sequence ID prefix...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $FileName = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    $OutFileRoot = $FileName;

    $Mode = $OptionsInfo{Mode};
    if ($Mode =~ /^(Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResiduesRange|ResidueNames|Distance|NonWater|NonHydrogens)$/i) {
      # One output PDB file named after the mode...
      my($OutFileSuffix);
      if ($Mode =~ /^Distance$/i) {
        my($DistanceModeKey) = lc $OptionsInfo{ExtractionDistanceMode};
        my($DistanceMode) = exists $DistanceModeLabel{$DistanceModeKey} ? $DistanceModeLabel{$DistanceModeKey} : '';
        $OutFileSuffix = "DistanceBy${DistanceMode}";
      }
      else {
        $OutFileSuffix = $OutFileSuffixForMode{lc $Mode};
      }
      $OutFileName = "${OutFileRoot}${OutFileSuffix}.pdb";
      push @OutFileNames, $OutFileName;
      if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
        warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
        next FILELIST;
      }
    }
    elsif ($Mode =~ /^(Chains|Sequences)$/i) {
      if ($OptionsInfo{CombineChainSequences}) {
        # One output file for all the chains...
        $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}ExtractedChains.pdb" : "${OutFileRoot}SequencesChainsCombined.fasta";
        push @OutFileNames, $OutFileName;
        if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
          warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
          next FILELIST;
        }
      }
      else {
        # One output file for each chain...
        for $ChainLabel (@ChainLabels) {
          $OutFileName = ($Mode =~ /^Chains$/i) ? "${OutFileRoot}${ChainLabel}.pdb" : "${OutFileRoot}Sequences${ChainLabel}.fasta";
          push @OutFileNames, $OutFileName;
          if (!$OptionsInfo{OverwriteFiles} && (-e $OutFileName)) {
            warn "Warning: Ignoring file $PDBFile: The file $OutFileName already exists\n";
            next FILELIST;
          }
        }
      }
    }

    # Setup sequence IDs...
    @ChainSequenceIDs = ();
    @ChainSequenceIDsPrefix = ();
    if ($Mode =~ /^Sequences$/i) {
      my($Classification, $DepositionDate, $IDCode, $IDPrefix);
      ($Classification, $DepositionDate, $IDCode) = GetHeaderRecordInformation($PDBRecordLinesRef);

      if ($OptionsInfo{SequenceIDPrefixSource} =~ /^FileName$/i) {
        $IDPrefix = $FileName;
      }
      elsif ($OptionsInfo{SequenceIDPrefixSource} =~ /^HeaderRecord$/i) {
        $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : '';
      }
      else {
        # Automatic: use ID code from header record and fall back to file name...
        $IDPrefix = IsNotEmpty($IDCode) ? $IDCode : $FileName;
      }

      for $ChainLabel (@ChainLabels) {
        push @ChainSequenceIDsPrefix, $IDPrefix;
        push @ChainSequenceIDs, "${IDPrefix}_${ChainLabel}|PDB";
      }
    }

    # All the checks passed: store information for the file...
    $PDBFilesInfo{FileOkay}[$Index] = 1;
    $PDBFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;

    push @{$PDBFilesInfo{OutFileNames}[$Index]}, @OutFileNames;
    push @{$PDBFilesInfo{ChainLabels}[$Index]}, @ChainLabels;
    push @{$PDBFilesInfo{ChainSequenceIDsPrefix}[$Index]}, @ChainSequenceIDsPrefix;
    push @{$PDBFilesInfo{ChainSequenceIDs}[$Index]}, @ChainSequenceIDs;
    push @{$PDBFilesInfo{SpecifiedChains}[$Index]}, @SpecifiedChains;
    push @{$PDBFilesInfo{DistanceOrigin}[$Index]}, @DistanceOrigin;
  }
}
| 1278 | |
| 1279 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, parses the command line using
# GetOptions, changes to the working directory when "-w, --workingdir" is
# specified, and validates option values restricted to a fixed set of choices.
# Dies with an error message on the first invalid value.
sub SetupScriptUsage {

  # Set up default values and retrieve all the options...
  %Options = ();
  $Options{chains} = 'First';
  $Options{combinechains} = 'no';
  $Options{distance} = 10.0;
  $Options{distancemode} = 'XYZ';
  $Options{distanceselectionmode} = 'ByAtom';
  $Options{keepoldrecords} = 'no';
  $Options{mode} = 'NonWater';
  $Options{modifyheader} = 'yes';
  $Options{nonstandardkeep} = 'yes';
  $Options{nonstandardcode} = 'X';
  $Options{sequencelength} = 80;
  $Options{sequenceidprefix} = 'Automatic';
  $Options{sequencerecords} = 'Atom';
  $Options{waterresiduenames} = 'Automatic';

  if (!GetOptions(\%Options, "atoms|a=s", "chains|c=s", "combinechains=s", "distance|d=f", "distancemode=s", "distanceorigin=s", "distanceselectionmode=s", "help|h", "keepoldrecords|k=s", "mode|m=s", "modifyheader=s", "nonstandardkeep=s", "nonstandardcode=s", "overwrite|o", "root|r=s", "recordmode=s", "residues=s", "sequencelength=i", "sequenceidprefix=s", "sequencerecords=s", "waterresiduenames=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  if ($Options{combinechains} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{combinechains}, for option \"--CombineChains\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{distancemode} !~ /^(Atom|Hetatm|Residue|XYZ)$/i) {
    die "Error: The value specified, $Options{distancemode}, for option \"--DistanceMode\" is not valid. Allowed values: Atom, Hetatm, Residue, or XYZ\n";
  }
  if ($Options{distanceselectionmode} !~ /^(ByAtom|ByResidue)$/i) {
    die "Error: The value specified, $Options{distanceselectionmode}, for option \"--DistanceSelectionMode\" is not valid. Allowed values: ByAtom or ByResidue\n";
  }
  if ($Options{keepoldrecords} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{keepoldrecords}, for option \"--KeepOldRecords\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{mode} !~ /^(Chains|Sequences|Atoms|CAlphas|AtomNums|AtomsRange|AtomNames|ResidueNums|ResidueNames|ResiduesRange|Distance|NonWater|NonHydrogens)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, NonHydrogens\n";
  }
  if ($Options{modifyheader} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{modifyheader}, for option \"--ModifyHeader\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{nonstandardkeep} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{nonstandardkeep}, for option \"--NonStandardKeep\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{nonstandardcode} !~ /^(\?|\-|X)$/i) {
    die "Error: The value specified, $Options{nonstandardcode}, for option \"--NonStandardCode\" is not valid. Allowed values: ?, -, or X\n";
  }
  # "--RecordMode" has no default value: validate only when it's specified...
  if ($Options{recordmode} && $Options{recordmode} !~ /^(Atom|Hetatm|AtomAndHetatm)$/i) {
    die "Error: The value specified, $Options{recordmode}, for option \"--RecordMode\" is not valid. Allowed values: Atom, Hetatm, AtomAndHetatm\n";
  }
  if (!IsPositiveInteger($Options{sequencelength})) {
    die "Error: The value specified, $Options{sequencelength}, for option \"--SequenceLength\" is not valid. Allowed values: >0\n";
  }
  if ($Options{sequencerecords} !~ /^(Atom|SeqRes)$/i) {
    die "Error: The value specified, $Options{sequencerecords}, for option \"--SequenceRecords\" is not valid. Allowed values: Atom or SeqRes\n";
  }
  if ($Options{sequenceidprefix} !~ /^(FileName|HeaderRecord|Automatic)$/i) {
    die "Error: The value specified, $Options{sequenceidprefix}, for option \"--SequenceIDPrefix\" is not valid. Allowed values: FileName, HeaderRecord, or Automatic\n";
  }
}
| 1346 | |
| 1347 __END__ | |
| 1348 | |
| 1349 =head1 NAME | |
| 1350 | |
| 1351 ExtractFromPDBFiles.pl - Extract specific data from PDBFile(s) | |
| 1352 | |
| 1353 =head1 SYNOPSIS | |
| 1354 | |
| 1355 ExtractFromPDBFiles.pl PDBFile(s)... | |
| 1356 | |
| 1357 ExtractFromPDBFiles.pl [B<-a, --Atoms> "AtomNum, [AtomNum...]" | "StartAtomNum, EndAtomNum" | | |
| 1358 "AtomName, [AtomName...]"] [B<-c, --chains> First | All | "ChainID, [ChainID,...]"] | |
| 1359 [B<--CombineChains> yes | no] [B<-d, --distance> number] [B<--DistanceMode> Atom | Hetatm | Residue | XYZ] | |
| 1360 [B<--DistanceOrigin> "AtomNumber, AtomName" | "HetatmNumber, HetAtmName" | "ResidueNumber, ResidueName, [ChainID]" | "X,Y,Z"] | |
| 1361 [B<--DistanceSelectionMode> ByAtom | ByResidue] [B<-h, --help>] [B<-k, --KeepOldRecords> yes | no] | |
| 1362 [B<-m, --mode > Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | | |
| 1363 ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens] | |
| 1364 [B<--ModifyHeader> yes | no] [B<--NonStandardKeep> yes | no] [B<--NonStandardCode> character] | |
| 1365 [B<-o, --overwrite>] [B<-r, --root> rootname] [B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm>] | |
| 1366 [B<--Residues> "ResidueNum,[ResidueNum...]" | "StartResidueNum,EndResidueNum"] | |
| 1367 [B<--SequenceLength> number] [B<--SequenceRecords> Atom | SeqRes] | |
| 1368 [B<--SequenceIDPrefix> FileName | HeaderRecord | Automatic] | |
| 1369 [B<--WaterResidueNames> Automatic | "ResidueName, [ResidueName,...]"] | |
| 1370 [B<-w, --WorkingDir> dirname] PDBFile(s)... | |
| 1371 | |
| 1372 =head1 DESCRIPTION | |
| 1373 | |
| 1374 Extract specific data from I<PDBFile(s)> and generate appropriate PDB or sequence file(s). | |
| 1375 Multiple PDBFile names are separated by spaces. The valid file extension is I<.pdb>. | |
| 1376 All other file name extensions are ignored during the wild card expansion. All the PDB files | |
| 1377 in a current directory can be specified either by I<*.pdb> or the current directory name. | |
| 1378 | |
| 1379 During I<Chains> and I<Sequences> values of B<-m, --mode> option, all ATOM/HETATM records | |
| 1380 for chains after the first model in PDB files containing data for multiple models are ignored. | |
| 1381 | |
| 1382 =head1 OPTIONS | |
| 1383 | |
| 1384 =over 4 | |
| 1385 | |
| 1386 =item B<-a, --Atoms> I<"AtomNum,[AtomNum...]" | "StartAtomNum,EndAtomNum" | "AtomName,[AtomName...]"> | |
| 1387 | |
| 1388 Specify which atom records to extract from I<PDBFiles(s)> during I<AtomNums>, | |
| 1389 I<AtomsRange>, and I<AtomNames> value of B<-m, --mode> option: extract records | |
| 1390 corresponding to atom numbers specified in a comma delimited list of atom numbers/names, | |
| 1391 or within the range of start and end atom numbers. Possible values: I<"AtomNum[,AtomNum,..]">, | |
| 1392 I<StartAtomNum,EndAtomNum>, or I<"AtomName[,AtomName,..]">. Default: I<None>. Examples: | |
| 1393 | |
| 1394 10 | |
| 1395 15,20 | |
| 1396 N,CA,C,O | |
| 1397 | |
| 1398 =item B<-c, --chains> I<First | All | ChainID,[ChainID,...]> | |
| 1399 | |
| 1400 Specify which chains to extract from I<PDBFile(s)> during I<Chains | Sequences> value of | |
| 1401 B<-m, --mode> option: first chain, all chains, or a specific list of comma delimited chain IDs. | |
| 1402 Possible values: I<First | All | ChainID,[ChainID,...]>. Default: I<First>. Examples: | |
| 1403 | |
| 1404 A | |
| 1405 A,B | |
| 1406 All | |
| 1407 | |
| 1408 =item B<--CombineChains> I<yes | no> | |
| 1409 | |
| 1410 Specify whether to combine extracted chains data into a single file during I<Chains> or | |
| 1411 I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. Default: I<no>. | |
| 1412 | |
| 1413 During I<Chains> value of B<-m, --mode> option with I<Yes> value of B<--CombineChains>, | |
| 1414 extracted data for specified chains is written into a single file instead of individual file for each | |
| 1415 chain. | |
| 1416 | |
| 1417 During I<Sequences> value of B<-m, --mode> option with I<Yes> value of B<--CombineChains>, | |
| 1418 residues sequences for specified chains are extracted and concatenated into a single sequence | |
| 1419 file instead of individual file for each chain. | |
| 1420 | |
| 1421 =item B<-d, --distance> I<number> | |
| 1422 | |
| 1423 Specify distance used to extract ATOM/HETATM records during I<Distance> value of | |
| 1424 B<-m, --mode> option. Default: I<10.0> angstroms. | |
| 1425 | |
| 1426 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: | |
| 1427 ATOM, HETATM or both. | |
| 1428 | |
| 1429 =item B<--DistanceMode> I<Atom | Hetatm | Residue | XYZ> | |
| 1430 | |
| 1431 Specify how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of | |
| 1432 B<-m, --mode> option: extract all the records within a certain distance specified by B<-d, --distance> | |
| 1433 from an atom or hetero atom record, a residue, or any arbitrary point. Possible values: I<Atom | | |
| 1434 Hetatm | Residue | XYZ>. Default: I<XYZ>. | |
| 1435 | |
| 1436 During I<Residue> value of B<--distancemode>, distance of ATOM/HETATM records is calculated from | |
| 1437 all the atoms in the residue and the records are selected as long as any atom of the residue lies within | |
| 1438 the distance specified using B<-d, --distance> option. | |
| 1439 | |
| 1440 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: | |
| 1441 ATOM, HETATM or both. | |
| 1442 | |
| 1443 =item B<--DistanceSelectionMode> I<ByAtom | ByResidue> | |
| 1444 | |
| 1445 Specify how to extract ATOM/HETATM records from I<PDBFile(s)> during I<Distance> value of | |
| 1446 B<-m, --mode> option for all values of B<--DistanceMode> option: extract only those ATOM/HETATM | |
| 1447 records that meet specified distance criterion; extract all records corresponding to a residue as | |
| 1448 long as one of the ATOM/HETATM records in the residue satisfies specified distance criterion. Possible | |
| 1449 values: I<ByAtom, ByResidue>. Default value: I<ByAtom>. | |
| 1450 | |
| 1451 =item B<--DistanceOrigin> I<"AtomNumber,AtomName" | "HetatmNumber,HetAtmName" | "ResidueNumber,ResidueName[,ChainID]" | "X,Y,Z"> | |
| 1452 | |
| 1453 This value is B<--distancemode> specific. In general, it identifies a point used to select | |
| 1454 other ATOM/HETATM records within a specific distance from this point. | |
| 1455 | |
| 1456 For I<Atom> value of B<--distancemode>, this option corresponds to an atom specification. | |
| 1457 Format: I<AtomNumber,AtomName>. Example: | |
| 1458 | |
| 1459 455,CA | |
| 1460 | |
| 1461 For I<Hetatm> value of B<--distancemode>, this option corresponds to a hetatm specification. | |
| 1462 Format: I<HetatmNumber,HetAtmName>. Example: | |
| 1463 | |
| 1464 5295,C1 | |
| 1465 | |
| 1466 For I<Residue> value of B<--distancemode>, this option corresponds to a residue specification. | |
| 1467 Format: I<ResidueNumber, ResidueName[,ChainID]>. Example: | |
| 1468 | |
| 1469 78,MSE | |
| 1470 977,RET,A | |
| 1471 978,RET,B | |
| 1472 | |
| 1473 For I<XYZ> value of B<--distancemode>, this option corresponds to a coordinate of an | |
| 1474 arbitrary point. Format: I<X,Y,Z>. Example: | |
| 1475 | |
| 1476 10.044,19.261,-4.292 | |
| 1477 | |
| 1478 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: | |
| 1479 ATOM, HETATM or both. | |
| 1480 | |
| 1481 =item B<-h, --help> | |
| 1482 | |
| 1483 Print this help message. | |
| 1484 | |
| 1485 =item B<-k, --KeepOldRecords> I<yes | no> | |
| 1486 | |
| 1487 Specify whether to transfer old non ATOM and HETATM records from input PDBFile(s) to new | |
| 1488 PDBFile(s) during I<Chains | Atoms | HetAtms | CAlphas | Distance | NonWater | NonHydrogens> | |
| 1489 value of B<-m, --mode> option. By default, except for the HEADER record, all | |
| 1490 other unnecessary non ATOM/HETATM records are dropped during the | |
| 1491 generation of new PDB files. Possible values: I<yes | no>. Default: I<no>. | |
| 1492 | |
| 1493 =item B<-m, --mode > I<Chains | Sequences | Atoms | CAlphas | AtomNums | AtomsRange | AtomNames | ResidueNums | ResiduesRange | ResidueNames | Distance | NonWater | NonHydrogens> | |
| 1494 | |
| 1495 Specify what to extract from I<PDBFile(s)>: I<Chains> - retrieve records for | |
| 1496 specified chains; I<Sequences> - generate sequence files for specific chains; | |
| 1497 I<Atoms> - extract atom records; I<CAlphas> - extract atom records for alpha | |
| 1498 carbon atoms; I<AtomNums> - extract atom records for specified atom numbers; | |
| 1499 I<AtomsRange> - extract atom records between specified atom number range; | |
| 1500 I<AtomNames> - extract atom records for specified atom names; I<ResidueNums> | |
| 1501 - extract records for specified residue numbers; I<ResiduesRange> - extract records | |
| 1502 for residues between specified residue number range; I<ResidueNames> - extract | |
| 1503 records for specified residue names; I<Distance> - extract records within a | |
| 1504 certain distance from a specific position; I<NonWater> - extract records corresponding | |
| 1505 to residues other than water; I<NonHydrogens> - extract non-hydrogen records. | |
| 1506 | |
| 1507 Possible values: I<Chains, Sequences, Atoms, CAlphas, AtomNums, AtomsRange, | |
| 1508 AtomNames, ResidueNums, ResiduesRange, ResidueNames, Distance, NonWater, | |
| 1509 NonHydrogens>. Default value: I<NonWater> | |
| 1510 | |
| 1511 During the generation of new PDB files, unnecessary CONECT records are dropped. | |
| 1512 | |
| 1513 For I<Chains> mode, data for appropriate chains specified by B<-c, --chains> option | |
| 1514 is extracted from I<PDBFile(s)> and placed into new PDB file(s). | |
| 1515 | |
| 1516 For I<Sequences> mode, residues names using various sequence related options are | |
| 1517 extracted for chains specified by B<-c, --chains> option from I<PDBFile(s)> and | |
| 1518 FASTA sequence file(s) are generated. | |
| 1519 | |
| 1520 For I<Distance> mode, all ATOM/HETATM records within a distance specified | |
| 1521 by B<-d, --distance> option from a specific atom, residue or a point indicated by | |
| 1522 B<--distancemode> are extracted and placed into new PDB file(s). | |
| 1523 | |
| 1524 For I<NonWater> mode, non water ATOM/HETATM record lines, identified using value of | |
| 1525 B<--WaterResidueNames>, are extracted and written to new PDB file(s). | |
| 1526 | |
| 1527 For I<NonHydrogens> mode, ATOM/HETATM record lines containing element symbol | |
| 1528 other than I<H> are extracted and written to new PDB file(s). | |
| 1529 | |
| 1530 For all other options, appropriate ATOM/HETATM records are extracted to generate new | |
| 1531 PDB file(s). | |
| 1532 | |
| 1533 B<--RecordMode> option controls type of record lines to extract and process from | |
| 1534 I<PDBFile(s)>: ATOM, HETATM or both. | |
| 1535 | |
| 1536 =item B<--ModifyHeader> I<yes | no> | |
| 1537 | |
| 1538 Specify whether to modify HEADER record during the generation of new PDB files | |
| 1539 for B<-m, --mode> values of I<Chains | Atoms | CAlphas | Distance>. Possible values: | |
| 1540 I<yes | no>. Default: I<yes>. By default, Classification data is replaced by I<Data extracted | |
| 1541 using MayaChemTools> before writing out HEADER record. | |
| 1542 | |
| 1543 =item B<--NonStandardKeep> I<yes | no> | |
| 1544 | |
| 1545 Specify whether to include and convert non-standard three letter residue codes into | |
| 1546 a code specified using B<--nonstandardcode> option and include them into sequence file(s) | |
| 1547 generated during I<Sequences> value of B<-m, --mode> option. Possible values: I<yes | no>. | |
| 1548 Default: I<yes>. | |
| 1549 | |
| 1550 A warning is also printed about the presence of non-standard residues. Any residue other | |
| 1551 than standard 20 amino acids and 5 nucleic acids is considered non-standard; additionally, | |
| 1552 HETATM residues in chains are also tagged as non-standard. | |
| 1553 | |
| 1554 =item B<--NonStandardCode> I<character> | |
| 1555 | |
| 1556 A single character code to use for non-standard residues. Default: I<X>. Possible values: | |
| 1557 I<?, -, or X>. | |
| 1558 | |
| 1559 =item B<-o, --overwrite> | |
| 1560 | |
| 1561 Overwrite existing files. | |
| 1562 | |
| 1563 =item B<-r, --root> I<rootname> | |
| 1564 | |
| 1565 New PDB and sequence file name is generated using the root: <Root><Mode>.<Ext>. | |
| 1566 Default new file name: <PDBFileName>Chain<ChainID>.pdb for I<Chains> B<mode>; | |
| 1567 <PDBFileName>SequenceChain<ChainID>.fasta for I<Sequences> B<mode>; | |
| 1568 <PDBFileName>DistanceBy<DistanceMode>.pdb for I<Distance> B<-m, --mode> | |
| 1569 <PDBFileName><Mode>.pdb for I<Atoms | CAlphas | NonWater | NonHydrogens> B<-m, --mode> | |
| 1570 values. This option is ignored for multiple input files. | |
| 1571 | |
| 1572 =item B<--RecordMode> I<Atom | Hetatm | AtomAndHetatm> | |
| 1573 | |
| 1574 Specify type of record lines to extract and process from I<PDBFile(s)> during various | |
| 1575 values of B<-m, --mode> option: extract only ATOM record lines; extract only HETATM | |
| 1576 record lines; extract both ATOM and HETATM lines. Possible values: I<Atom | Hetatm | |
| 1577 | AtomAndHetatm>. Default during I<Atoms, CAlphas, AtomNums, AtomsRange, | |
| 1578 AtomNames> values of B<-m, --mode> option: I<Atom>; otherwise: I<AtomAndHetatm>. | |
| 1579 | |
| 1580 This option is ignored during I<Chains, Sequences> values of B<-m, --mode> option. | |
| 1581 | |
| 1582 =item B<--Residues> I<"ResidueNum,[ResidueNum...]" | "StartResidueNum,EndResidueNum" | "ResidueName,[ResidueName...]"> | |
| 1583 | |
| 1584 Specify which residue records to extract from I<PDBFiles(s)> during I<ResidueNums>, | |
| 1585 I<ResiduesRange>, and I<ResidueNames> value of B<-m, --mode> option: extract records | |
| 1586 corresponding to residue numbers specified in a comma delimited list of residue numbers/names, | |
| 1587 or within the range of start and end residue numbers. Possible values: I<"ResidueNum[,ResidueNum,..]">, | |
| 1588 I<StartResidueNum,EndResidueNum>, or I<"ResidueName[,ResidueName,..]">. Default: I<None>. Examples: | |
| 1589 | |
| 1590 20 | |
| 1591 5,10 | |
| 1592 TYR,SER,THR | |
| 1593 | |
| 1594 B<--RecordMode> option controls type of record lines to extract from I<PDBFile(s)>: | |
| 1595 ATOM, HETATM or both. | |
| 1596 | |
| 1597 =item B<--SequenceLength> I<number> | |
| 1598 | |
| 1599 Maximum sequence length per line in sequence file(s). Default: I<80>. | |
| 1600 | |
| 1601 =item B<--SequenceRecords> I<Atom | SeqRes> | |
| 1602 | |
| 1603 Specify which records to use for extracting residue names from I<PDBFiles(s)> during | |
| 1604 I<Sequences> value of B<-m, --mode> option: use ATOM records to compile a list | |
| 1605 of residues in a chain or parse SEQRES record to get a list of residues. Possible values: | |
| 1606 I<Atom | SeqRes>. Default: I<Atom>. | |
| 1607 | |
| 1608 =item B<--SequenceIDPrefix> I<FileName | HeaderRecord | Automatic> | |
| 1609 | |
| 1610 Specify how to generate a prefix for sequence IDs during I<Sequences> value | |
| 1611 of B<-m, --mode> option: use input file name prefix; retrieve PDB ID from HEADER record; | |
| 1612 or automatically decide the method for generating the prefix. The chain IDs are also | |
| 1613 appended to the prefix. Possible values: I<FileName | HeaderRecord | Automatic>. | |
| 1614 Default: I<Automatic> | |
| 1615 | |
| 1616 =item B<--WaterResidueNames> I<Automatic | "ResidueName,[ResidueName,...]"> | |
| 1617 | |
| 1618 Identification of water residues during I<NonWater> value of B<-m, --mode> option. Possible values: | |
| 1619 I<Automatic | "ResidueName,[ResidueName,...]">. Default: I<Automatic> - corresponds | |
| 1620 to "HOH,WAT,H20". You can also specify a different comma delimited list of residue names | |
| 1621 to use for water. | |
| 1622 | |
| 1623 =item B<-w, --WorkingDir> I<dirname> | |
| 1624 | |
| 1625 Location of working directory. Default: current directory. | |
| 1626 | |
| 1627 =back | |
| 1628 | |
| 1629 =head1 EXAMPLES | |
| 1630 | |
| 1631 To extract non-water records from Sample2.pdb file and generate Sample2NonWater.pdb | |
| 1632 file, type: | |
| 1633 | |
| 1634 % ExtractFromPDBFiles.pl Sample2.pdb | |
| 1635 | |
| 1636 To extract non-water records corresponding to only ATOM records from Sample2.pdb file | |
| 1637 and generate Sample2NonWater.pdb file, type: | |
| 1638 | |
| 1639 % ExtractFromPDBFiles.pl --RecordMode Atom Sample2.pdb | |
| 1640 | |
| 1641 To extract non-water records from Sample2.pdb file using HOH or WAT residue name for water along | |
| 1642 with all old non-coordinate records and generate Sample2NewNonWater.pdb file, type: | |
| 1643 | |
| 1644 % ExtractFromPDBFiles.pl -m NonWater --WaterResidueNames "HOH,WAT" | |
| 1645 -KeepOldRecords Yes -r Sample2New -o Sample2.pdb | |
| 1646 | |
| 1647 To extract non-hydrogens records from Sample2.pdb file and generate Sample2NonHydrogens.pdb | |
| 1648 file, type: | |
| 1649 | |
| 1650 % ExtractFromPDBFiles.pl -m NonHydrogens Sample2.pdb | |
| 1651 | |
| 1652 To extract data for first chain in Sample2.pdb and generate Sample2ChainA.pdb | |
| 1653 file, type: | |
| 1654 | |
| 1655 % ExtractFromPDBFiles.pl -m chains -o Sample2.pdb | |
| 1656 | |
| 1657 To extract data for both chains in Sample2.pdb and generate Sample2ChainA.pdb and | |
| 1658 Sample2ChainB.pdb, type: | |
| 1659 | |
| 1660 % ExtractFromPDBFiles.pl -m chains -c All -o Sample2.pdb | |
| 1661 | |
| 1662 To extract data for alpha carbons in Sample2.pdb and generate Sample2CAlphas.pdb, type: | |
| 1663 | |
| 1664 % ExtractFromPDBFiles.pl -m CAlphas -o Sample2.pdb | |
| 1665 | |
| 1666 To extract records for specific residue numbers in all chains from Sample2.pdb file and generate | |
| 1667 Sample2ResidueNums.pdb file, type: | |
| 1668 | |
| 1669 % ExtractFromPDBFiles.pl -m ResidueNums --Residues "3,6" | |
| 1670 Sample2.pdb | |
| 1671 | |
| 1672 To extract records for a specific range of residue number in all chains from Sample2.pdb | |
| 1673 file and generate Sample2ResiduesRange.pdb file, type: | |
| 1674 | |
| 1675 % ExtractFromPDBFiles.pl -m ResiduesRange --Residues "10,30" | |
| 1676 Sample2.pdb | |
| 1677 | |
| 1678 To extract data for all ATOM and HETATM records within 10 angstrom of an atom specified by | |
| 1679 atom serial number and name "1,N" in Sample2.pdb file and generate Sample2DistanceByAtom.pdb, | |
| 1680 type: | |
| 1681 | |
| 1682 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom | |
| 1683 --DistanceOrigin "1,N" -k No --distance 10 -o Sample2.pdb | |
| 1684 | |
| 1685 To extract data for all ATOM and HETATM records for complete residues with any atom or hetatm | |
| 1686 less than 10 angstrom of an atom specified by atom serial number and name "1,N" in Sample2.pdb | |
| 1687 file and generate Sample2DistanceByAtom.pdb, type: | |
| 1688 | |
| 1689 % ExtractFromPDBFiles.pl -m Distance --DistanceMode Atom | |
| 1690 --DistanceOrigin "1,N" --DistanceSelectionMode ByResidue | |
| 1691 -k No --distance 10 -o Sample2.pdb | |
| 1692 | |
| 1693 To extract data for all ATOM and HETATM records within 25 angstrom of an arbitrary point "0,0,0" | |
| 1694 in Sample2.pdb file and generate Sample2DistanceByXYZ.pdb, type: | |
| 1695 | |
| 1696 % ExtractFromPDBFiles.pl -m Distance --DistanceMode XYZ | |
| 1697 --DistanceOrigin "0,0,0" -k No --distance 25 -o Sample2.pdb | |
| 1698 | |
| 1699 =head1 AUTHOR | |
| 1700 | |
| 1701 Manish Sud <msud@san.rr.com> | |
| 1702 | |
| 1703 =head1 SEE ALSO | |
| 1704 | |
| 1705 InfoPDBFiles.pl, ModifyPDBFiles.pl | |
| 1706 | |
| 1707 =head1 COPYRIGHT | |
| 1708 | |
| 1709 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 1710 | |
| 1711 This file is part of MayaChemTools. | |
| 1712 | |
| 1713 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 1714 the terms of the GNU Lesser General Public License as published by the Free | |
| 1715 Software Foundation; either version 3 of the License, or (at your option) | |
| 1716 any later version. | |
| 1717 | |
| 1718 =cut |
