1 package SDFileUtil; 2 # 3 # $RCSfile: SDFileUtil.pm,v $ 4 # $Date: 2015/02/28 20:47:18 $ 5 # $Revision: 1.49 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Exporter; 31 use Carp; 32 use PeriodicTable qw(IsElement); 33 use TimeUtil (); 34 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 @ISA = qw(Exporter); 38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd); 39 @EXPORT_OK = qw(); 40 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 41 42 # Format data for compounds count line... 43 sub GenerateCmpdCountsLine { 44 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line); 45 46 if (@_ == 5) { 47 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_; 48 } 49 elsif (@_ == 3) { 50 ($AtomCount, $BondCount, $ChiralFlag) = @_; 51 $PropertyCount = 999; 52 $Version = "V2000"; 53 } 54 else { 55 ($AtomCount, $BondCount) = @_; 56 $ChiralFlag = 0; 57 $PropertyCount = 999; 58 $Version = "V2000"; 59 } 60 if ($AtomCount > 999) { 61 croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 62 } 63 $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version; 64 65 return ($Line); 66 } 67 68 # Generate comments line... 69 sub GenerateCmpdCommentsLine { 70 my($Comments) = @_; 71 my($Line); 72 73 $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments; 74 75 return $Line; 76 } 77 78 # Generate molname line... 79 sub GenerateCmpdMolNameLine { 80 my($MolName) = @_; 81 my($Line); 82 83 $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName; 84 85 return $Line; 86 } 87 88 # Generate data for compounds misc info line... 89 sub GenerateCmpdMiscInfoLine { 90 my($ProgramName, $UserInitial, $Code) = @_; 91 my($Date, $Line); 92 93 if (!(defined($ProgramName) && $ProgramName)) { 94 $ProgramName = "MayaChem"; 95 } 96 if (!(defined($UserInitial) && $UserInitial)) { 97 $UserInitial = " "; 98 } 99 if (!(defined($Code) && $Code)) { 100 $Code = "2D"; 101 } 102 103 if (length($ProgramName) > 8) { 104 $ProgramName = substr($ProgramName, 0, 8); 105 } 106 if (length($UserInitial) > 2) { 107 $UserInitial = substr($UserInitial, 0, 2); 108 } 109 if (length($Code) > 2) { 110 $Code = substr($Code, 0, 2); 111 } 112 $Date = GenerateMiscLineDateStamp(); 113 114 $Line = "${UserInitial}${ProgramName}${Date}${Code}"; 115 116 return $Line; 117 } 118 119 # Generate data for compounds misc info line... 120 sub GenerateEmptyCtabBlockLines { 121 my($Date, $Lines); 122 123 if (@_ == 1) { 124 ($Date) = @_; 125 } 126 else { 127 $Date = GenerateMiscLineDateStamp(); 128 } 129 # First line: Blank molname line... 130 # Second line: Misc info... 131 # Third line: Blank comments line... 132 # Fourth line: Counts line reflecting empty structure data block... 133 $Lines = "\n"; 134 $Lines .= " MayaChem${Date}2D\n"; 135 $Lines .= "\n"; 136 $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n"; 137 $Lines .= "M END"; 138 139 return $Lines; 140 } 141 142 # Generate SD file data stamp... 143 sub GenerateMiscLineDateStamp { 144 return TimeUtil::SDFileTimeStamp(); 145 } 146 147 # Generate data for compound atom line... 148 # 149 sub GenerateCmpdAtomLine { 150 my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_; 151 my($Line); 152 153 if (!defined $MassDifference) { 154 $MassDifference = 0; 155 } 156 if (!defined $Charge) { 157 $Charge = 0; 158 } 159 if (!defined $StereoParity) { 160 $StereoParity = 0; 161 } 162 $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i 0 0 0 0 0 0 0 0 0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity; 163 164 return $Line 165 } 166 167 # Generate data for compound bond line... 168 # 169 sub GenerateCmpdBondLine { 170 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_; 171 my($Line); 172 173 if (!defined $BondStereo) { 174 $BondStereo = 0; 175 } 176 $Line = sprintf "%3i%3i%3i%3i 0 0 0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo; 177 178 return $Line 179 } 180 181 # Generate charge property lines for CTAB block... 182 # 183 sub GenerateCmpdChargePropertyLines { 184 my($ChargeValuePairsRef) = @_; 185 186 return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef); 187 } 188 189 # Generate isotope property lines for CTAB block... 190 # 191 sub GenerateCmpdIsotopePropertyLines { 192 my($IsotopeValuePairsRef) = @_; 193 194 return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef); 195 } 196 197 # Generate radical property line property lines for CTAB block... 198 # 199 sub GenerateCmpdRadicalPropertyLines { 200 my($RadicalValuePairsRef) = @_; 201 202 return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef); 203 } 204 205 # Generate atom alias property line property lines for CTAB block... 206 # 207 # Atom alias property line format: 208 # 209 # A aaa 210 # x... 211 # 212 # aaa: Atom number 213 # x: Atom alias in next line 214 # 215 sub GenerateCmpdAtomAliasPropertyLines { 216 my($PropertyValuePairsRef) = @_; 217 my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines); 218 219 @PropertyLines = (); 220 221 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 222 $AtomNum = $PropertyValuePairsRef->[$Index]; 223 $AtomAlias = $PropertyValuePairsRef->[$Index + 1]; 224 225 $Line = "A " . sprintf "%3i", $AtomNum; 226 227 push @PropertyLines, $Line; 228 push @PropertyLines, $AtomAlias; 229 } 230 231 return @PropertyLines; 232 } 233 234 # Generate data header labels and values lines... 235 # 236 sub GenerateCmpdDataHeaderLabelsAndValuesLines { 237 my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_; 238 my($DataLabel, $DataValue, @DataLabels, @DataLines); 239 240 if (!defined $SortDataLabels) { 241 $SortDataLabels = 0; 242 } 243 244 @DataLines = (); 245 @DataLabels = (); 246 if ($SortDataLabels) { 247 push @DataLabels, sort @{$DataHeaderLabelsRef}; 248 } 249 else { 250 push @DataLabels, @{$DataHeaderLabelsRef}; 251 } 252 for $DataLabel (@DataLabels) { 253 $DataValue = ''; 254 if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) { 255 $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel}; 256 } 257 push @DataLines, ("> <${DataLabel}>", "$DataValue", ""); 258 } 259 return @DataLines; 260 } 261 262 # Parse data field header in SD file and return lists of all and common data field 263 # labels. 264 sub GetAllAndCommonCmpdDataHeaderLabels { 265 my($SDFileRef) = @_; 266 my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap); 267 268 $CmpdCount = 0; 269 @DataFieldLabels = (); 270 @CommonDataFieldLabels = (); 271 %DataFieldLabelsMap = (); 272 273 while ($CmpdString = ReadCmpdString($SDFileRef)) { 274 $CmpdCount++; 275 @CmpdLines = split "\n", $CmpdString; 276 # Process compound data header labels and figure out which ones are present for 277 # all the compounds... 278 if (@DataFieldLabels) { 279 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); 280 my(%CmpdDataFieldLabelsMap) = (); 281 # Setup a map for the current labels... 282 for $Label (@CmpdDataFieldLabels) { 283 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; 284 } 285 # Check the presence old labels for this compound; otherwise, mark 'em new... 286 for $Label (@DataFieldLabels) { 287 if (!$CmpdDataFieldLabelsMap{$Label}) { 288 $DataFieldLabelsMap{$Label} = "PresentInSome"; 289 } 290 } 291 # Check the presence this compound in the old labels; otherwise, add 'em... 292 for $Label (@CmpdDataFieldLabels ) { 293 if (!$DataFieldLabelsMap{$Label}) { 294 # It's a new label... 295 push @DataFieldLabels, $Label; 296 $DataFieldLabelsMap{$Label} = "PresentInSome"; 297 } 298 } 299 } 300 else { 301 # Get the initial label set and set up a map... 302 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); 303 for $Label (@DataFieldLabels) { 304 $DataFieldLabelsMap{$Label} = "PresentInAll"; 305 } 306 } 307 } 308 # Identify the common data field labels... 309 @CommonDataFieldLabels = (); 310 for $Label (@DataFieldLabels) { 311 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { 312 push @CommonDataFieldLabels, $Label; 313 } 314 } 315 return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels); 316 } 317 318 # Parse all the data header labels and return 'em as an list... 319 # 320 # Format: 321 # 322 #> Data header line 323 #Data line(s) 324 #Blank line 325 # 326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and 327 # contains at least one of the following: 328 # The field name enclosed in angle brackets. For example: <melting.point> 329 # The field number, DTn , where n represents the number assigned to the field in a MACCS-II database 330 # 331 #Optional information for the data header includes: 332 # The compounds external and internal registry numbers. External registry numbers must be enclosed in parentheses. 333 # Any combination of information 334 # 335 #The following are examples of valid data headers: 336 #> <MELTING.POINT> 337 #> 55 (MD-08974) <BOILING.POINT> DT12 338 #> DT12 55 339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES 340 # 341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$ 342 # 343 sub GetCmpdDataHeaderLabels { 344 my($CmpdLines) = @_; 345 my($CmpdLine, $Label, @Labels); 346 347 @Labels = (); 348 CMPDLINE: for $CmpdLine (@$CmpdLines) { 349 if ($CmpdLine !~ /^>/) { 350 next CMPDLINE; 351 } 352 # Does the line contains field name enclosed in angular brackets? 353 ($Label) = $CmpdLine =~ /<.*?>/g; 354 if (!defined($Label)) { 355 next CMPDLINE; 356 } 357 $Label =~ s/(<|>)//g; 358 push @Labels, $Label; 359 } 360 return (@Labels); 361 } 362 363 # Parse all the data header labels and values 364 sub GetCmpdDataHeaderLabelsAndValues { 365 my($CmpdLines) = @_; 366 my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields); 367 368 %DataFields = (); 369 $ProcessingLabelData = 0; 370 $ValueCount = 0; 371 CMPDLINE: for $CmpdLine (@$CmpdLines) { 372 if ($CmpdLine =~ /^\$\$\$\$/) { 373 last CMPDLINE; 374 } 375 if ($CmpdLine =~ /^>/) { 376 # Does the line contains field name enclosed in angular brackets? 377 ($Label) = $CmpdLine =~ /<.*?>/g; 378 if (defined $Label) { 379 $CurrentLabel = $Label; 380 $CurrentLabel =~ s/(<|>)//g; 381 $ProcessingLabelData = 0; 382 $ValueCount = 0; 383 384 if ($CurrentLabel) { 385 $ProcessingLabelData = 1; 386 $DataFields{$CurrentLabel} = ''; 387 next CMPDLINE; 388 } 389 } 390 else { 391 if (!$ProcessingLabelData) { 392 # Data line containing no <label> as allowed by SDF format. Just ignore it... 393 next CMPDLINE; 394 } 395 } 396 } 397 if (!$ProcessingLabelData) { 398 next CMPDLINE; 399 } 400 if (!(defined($CmpdLine) && length($CmpdLine))) { 401 # Blank line terminates value for a label... 402 $CurrentLabel = ''; 403 $ValueCount = 0; 404 $ProcessingLabelData = 0; 405 next CMPDLINE; 406 } 407 $ValueCount++; 408 $Value = $CmpdLine; 409 410 if ($ValueCount > 1) { 411 $DataFields{$CurrentLabel} .= "\n" . $Value; 412 } 413 else { 414 $DataFields{$CurrentLabel} = $Value; 415 } 416 } 417 return (%DataFields); 418 } 419 420 # Return an updated compoud string after removing data header label along with its 421 # value from the specified compound string... 422 # 423 sub RemoveCmpdDataHeaderLabelAndValue { 424 my($CmpdString, $DataHeaderLabel) = @_; 425 my($Line, $PorcessingDataHeaderLabel, @CmpdLines); 426 427 @CmpdLines = (); 428 $PorcessingDataHeaderLabel = 0; 429 430 CMPDLINE: for $Line (split "\n", $CmpdString) { 431 if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) { 432 $PorcessingDataHeaderLabel = 1; 433 next CMPDLINE; 434 } 435 436 if ($PorcessingDataHeaderLabel) { 437 # Blank line indicates end of fingerprints data value... 438 if ($Line =~ /^\$\$\$\$/) { 439 push @CmpdLines, $Line; 440 $PorcessingDataHeaderLabel = 0; 441 } 442 elsif (!length($Line)) { 443 $PorcessingDataHeaderLabel = 0; 444 } 445 next CMPDLINE; 446 } 447 448 # Track compound lines without fingerprints data... 449 push @CmpdLines, $Line; 450 } 451 452 return join "\n", @CmpdLines; 453 } 454 455 # 456 # Using bond blocks, figure out the number of disconnected fragments and 457 # return their values along with the atom numbers in a string delimited by new 458 # line character. 459 # 460 sub GetCmpdFragments { 461 my($CmpdLines) = @_; 462 my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString); 463 464 # Setup the connection table for each atom... 465 @AtomConnections = (); 466 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 467 for $AtomNum (1 .. $AtomCount) { 468 %{$AtomConnections[$AtomNum]} = (); 469 } 470 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 471 ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 472 if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) { 473 $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType; 474 } 475 if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) { 476 $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType; 477 } 478 } 479 480 #Get set to count fragments... 481 $ProcessedAtomCount = 0; 482 $FragmentNum = 0; 483 %Fragments = (); 484 @ProcessedAtoms = (); 485 for $AtomNum (1 .. $AtomCount) { 486 $ProcessedAtoms[$AtomNum] = 0; 487 } 488 while ($ProcessedAtomCount < $AtomCount) { 489 @ProcessingAtoms = (); 490 @ConnectedAtoms = (); 491 ATOMNUM: for $AtomNum (1 .. $AtomCount) { 492 if (!$ProcessedAtoms[$AtomNum]) { 493 $ProcessedAtomCount++; 494 $ProcessedAtoms[$AtomNum] = 1; 495 push @ProcessingAtoms, $AtomNum; 496 $FragmentNum++; 497 @{$Fragments{$FragmentNum} } = (); 498 push @{$Fragments{$FragmentNum} }, $AtomNum; 499 last ATOMNUM; 500 } 501 } 502 503 # Go over the neighbors and follow the connection trail while collecting the 504 # atoms numbers present in the connected fragment... 505 while (@ProcessingAtoms) { 506 for ($Index = 0; $Index < @ProcessingAtoms; $Index++) { 507 $ProcessAtomNum = $ProcessingAtoms[$Index]; 508 for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]}) { 509 if (!$ProcessedAtoms[$NbrAtomNum]) { 510 $ProcessedAtomCount++; 511 $ProcessedAtoms[$NbrAtomNum] = 1; 512 push @ConnectedAtoms, $NbrAtomNum; 513 push @{ $Fragments{$FragmentNum} }, $NbrAtomNum; 514 } 515 } 516 } 517 @ProcessingAtoms = (); 518 @ProcessingAtoms = @ConnectedAtoms; 519 @ConnectedAtoms = (); 520 } 521 } 522 $FragmentCount = $FragmentNum; 523 $FragmentString = ""; 524 525 # Sort out the fragments by size... 526 for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}} } keys %Fragments ) { 527 # Sort the atoms in a fragment by their numbers... 528 $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} }; 529 if ($FragmentString) { 530 $FragmentString .= "\n" . $AFragmentString; 531 } 532 else { 533 $FragmentString = $AFragmentString; 534 } 535 } 536 return ($FragmentCount, $FragmentString); 537 } 538 539 # Count number of lines present in between 4th and line containg "M END" 540 sub GetCtabLinesCount { 541 my($CmpdLines) = @_; 542 my($LineIndex, $CtabLinesCount); 543 544 $CtabLinesCount = 0; 545 LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) { 546 # 547 # Any line after atom and bond data starting with anything other than space or 548 # a digit indicates end of Ctab atom/bond data block... 549 # 550 if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) { 551 $CtabLinesCount = $LineIndex - 4; 552 last LINE; 553 } 554 } 555 return $CtabLinesCount; 556 } 557 558 # Using atom blocks, count the number of atoms which contain special element 559 # symbols not present in the periodic table. 560 sub GetUnknownAtoms { 561 my($CmpdLines) = @_; 562 my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol); 563 564 $UnknownAtomCount = 0; 565 $UnknownAtoms = ""; 566 $UnknownAtomLines = ""; 567 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 568 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 569 ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 570 if (!IsElement($AtomSymbol)) { 571 $UnknownAtomCount++; 572 $UnknownAtoms .= " $AtomSymbol"; 573 if ($UnknownAtomLines) { 574 $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex]; 575 } 576 else { 577 $UnknownAtomLines = @$CmpdLines[$LineIndex]; 578 } 579 } 580 } 581 return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines); 582 } 583 584 # Check z coordinates of all atoms to see whether any of them is non-zero 585 # which makes the compound geometry three dimensional... 586 # 587 sub IsCmpd3D { 588 my($CmpdLines) = @_; 589 my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ); 590 591 ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 592 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 593 ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]); 594 if ($AtomZ != 0) { 595 return 1; 596 } 597 } 598 return 0; 599 } 600 601 # Check whether it's a 2D compound... 602 # 603 sub IsCmpd2D { 604 my($CmpdLines) = @_; 605 606 return IsCmpd3D($CmpdLines) ? 0 : 1; 607 } 608 609 # Using bond blocks, count the number of bond lines which contain atom numbers 610 # greater than atom count specified in compound count line... 611 # 612 sub GetInvalidAtomNumbers { 613 my($CmpdLines) = @_; 614 my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs); 615 616 ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]); 617 618 $InvalidAtomNumbersCount = 0; 619 $InvalidAtomNumbers = ""; 620 $InvalidAtomNumberLines = ""; 621 622 # Go over bond block lines... 623 LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 624 ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 625 if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) { 626 next LINE; 627 } 628 if ($FirstAtomNum > $AtomCount) { 629 $InvalidAtomNumbersCount++; 630 $InvalidAtomNumbers .= " $FirstAtomNum"; 631 } 632 if ($SecondAtomNum > $AtomCount) { 633 $InvalidAtomNumbersCount++; 634 $InvalidAtomNumbers .= " $SecondAtomNum"; 635 } 636 if ($InvalidAtomNumberLines) { 637 $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex]; 638 } 639 else { 640 $InvalidAtomNumberLines = @$CmpdLines[$LineIndex]; 641 } 642 } 643 # Go over property lines before M END... 644 # 645 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 646 $Line = @$CmpdLines[$LineIndex]; 647 @ValuePairs = (); 648 if ($Line =~ /^M END/i) { 649 last LINE; 650 } 651 @ValuePairs = (); 652 if ($Line =~ /^M CHG/i) { 653 @ValuePairs = ParseCmpdChargePropertyLine($Line); 654 } 655 elsif ($Line =~ /^M RAD/i) { 656 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 657 } 658 elsif ($Line =~ /^M ISO/i) { 659 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 660 } 661 elsif ($Line =~ /^A /i) { 662 my($NextLine); 663 $LineIndex++; 664 $NextLine = @$CmpdLines[$LineIndex]; 665 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 666 } 667 else { 668 next LINE; 669 } 670 671 $InvalidAtomPropertyLine = 0; 672 for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 673 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 674 if ($AtomNum > $AtomCount) { 675 $InvalidAtomPropertyLine = 1; 676 $InvalidAtomNumbersCount++; 677 $InvalidAtomNumbers .= " $AtomNum"; 678 } 679 } 680 if ($InvalidAtomPropertyLine) { 681 if ($InvalidAtomNumberLines) { 682 $InvalidAtomNumberLines .= "\n" . $Line; 683 } 684 else { 685 $InvalidAtomNumberLines = $Line; 686 } 687 } 688 } 689 690 return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines); 691 } 692 693 # Ctab lines: Atom block 694 # 695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee 696 # A10 A10 A10 xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 697 # x,y,z: Atom coordinates 698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified 699 # atom, and LP for lone pair, or R# for Rgroup label 700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits) 701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1, 702 # 4 = doublet radical, 5 = -1, 6 = -2, 7 = -3 703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center 704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4 705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo 706 # configuration of double bond atom must match 707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence 708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh) 709 # rrr: Not used 710 # iii: Not used 711 # mmm: Atom-atom mapping number. 1 - number of atoms 712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted, 713 # 2 = configuration is retained. 714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be 715 # exactly as shown 716 # 717 # Notes: 718 # . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These 719 # values determine chirailty around the chiral center; a non zero value indicates atom 720 # has been marked as chiral center. 721 # 722 sub ParseCmpdAtomLine { 723 my($Line) = @_; 724 my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity); 725 726 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7; 727 if (length($Line) > 31) { 728 ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line); 729 } 730 else { 731 ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line); 732 } 733 return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity); 734 } 735 736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools. 737 # 738 sub MDLChargeToInternalCharge { 739 my($MDLCharge) = @_; 740 my($InternalCharge); 741 742 CHARGE: { 743 if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;} 744 if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;} 745 if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;} 746 if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;} 747 if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;} 748 if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;} 749 if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;} 750 # All other MDL charge values, including 4 corresponding to "doublet radical", 751 # are assigned internal value of 0. 752 $InternalCharge = 0; 753 if ($MDLCharge != 4) { 754 carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned..."; 755 } 756 } 757 return $InternalCharge; 758 } 759 760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files. 761 # 762 sub InternalChargeToMDLCharge { 763 my($InternalCharge) = @_; 764 my($MDLCharge); 765 766 CHARGE: { 767 if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;} 768 if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;} 769 if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;} 770 if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;} 771 if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;} 772 if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;} 773 # All other MDL charge values, including 4 corresponding to "doublet radical", 774 # are assigned internal value of 0. 775 $MDLCharge = 0; 776 } 777 return $MDLCharge; 778 } 779 780 # Ctab lines: Bond block 781 # 782 # Format: 111222tttsssxxxrrrccc 783 # 784 # 111: First atom number. 785 # 222: Second atom number. 786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double, 787 # 6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any 788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down, 789 # Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans, 790 # 3 = Cis or trans (either) double bond 791 # xxx: Not used 792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain 793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center, 794 # Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8 795 # (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible 796 # 797 sub ParseCmpdBondLine { 798 my($Line) = @_; 799 my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 800 801 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line); 802 return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo); 803 } 804 805 # Map MDL bond type value used in SD and MOL files to internal bond order and bond types 806 # values used by MayaChemTools... 807 # 808 sub MDLBondTypeToInternalBondOrder { 809 my($MDLBondType) = @_; 810 my($InternalBondOrder, $InternalBondType); 811 812 $InternalBondType = ''; 813 814 BONDTYPE: { 815 if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;} 816 if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;} 817 if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;} 818 if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic 819 if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic 820 if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic 821 if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic 822 if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic 823 # 824 # Although MDL aromatic bond values are used for query only and explicit Kekule bond order 825 # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders. 826 # 827 # All other MDL bond type values - 5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic, 828 # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for 829 # structure queries by MDL products. 830 # 831 $InternalBondOrder = 1; 832 $InternalBondType = 'Single'; 833 834 carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned..."; 835 } 836 return ($InternalBondOrder, $InternalBondType); 837 } 838 839 # Map internal bond order and bond type values used by MayaChemTools to MDL bond type value used 840 # in SD and MOL files... 841 # 842 sub InternalBondOrderToMDLBondType { 843 my($InternalBondOrder, $InternalBondType) = @_; 844 my($MDLBondType); 845 846 BONDTYPE: { 847 if ($InternalBondOrder == 1) { 848 if ($InternalBondType =~ /^SingleOrDouble$/i) { 849 $MDLBondType = 5; 850 } 851 elsif ($InternalBondType =~ /^SingleOrAromatic$/i) { 852 $MDLBondType = 6; 853 } 854 elsif ($InternalBondType =~ /^Any$/i) { 855 $MDLBondType = 8; 856 } 857 else { 858 $MDLBondType = 1; 859 } 860 $MDLBondType = 1; 861 last BONDTYPE; 862 } 863 if ($InternalBondOrder == 2) { 864 if ($InternalBondType =~ /^DoubleOrAromatic$/i) { 865 $MDLBondType = 7; 866 } 867 else { 868 $MDLBondType = 2; 869 } 870 last BONDTYPE; 871 } 872 if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;} 873 if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;} 874 if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;} 875 876 $MDLBondType = 1; 877 878 carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned..."; 879 } 880 return $MDLBondType; 881 } 882 883 # Third line: Comments - A blank line is also allowed. 884 sub ParseCmpdCommentsLine { 885 my($Line) = @_; 886 my($Comments); 887 888 $Comments = unpack("A80", $Line); 889 890 return ($Comments); 891 } 892 893 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools... 894 # 895 sub MDLBondStereoToInternalBondStereochemistry { 896 my($MDLBondStereo) = @_; 897 my($InternalBondStereo); 898 899 $InternalBondStereo = ''; 900 901 BONDSTEREO: { 902 if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;} 903 if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;} 904 if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;} 905 if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;} 906 if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;} 907 908 $InternalBondStereo = ''; 909 carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type..."; 910 } 911 return $InternalBondStereo; 912 } 913 914 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files... 915 # 916 sub InternalBondStereochemistryToMDLBondStereo { 917 my($InternalBondStereo) = @_; 918 my($MDLBondStereo); 919 920 $MDLBondStereo = 0; 921 922 BONDSTEREO: { 923 if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;} 924 if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;} 925 if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;} 926 if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;} 927 928 $MDLBondStereo = 0; 929 } 930 return $MDLBondStereo; 931 } 932 933 # Fourth line: Counts 934 # 935 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv 936 # 937 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete) 938 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii: 939 # (obsolete); mmm: number of lines of additional properties, including the M END line, No 940 # longer supported, default is set to 999; vvvvvv: version 941 942 sub ParseCmpdCountsLine { 943 my($Line) = @_; 944 my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 945 946 if (length($Line) >= 39) { 947 ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line); 948 } 949 elsif (length($Line) >= 15) { 950 ($PropertyCount, $Version) = ("999", "v2000"); 951 ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line); 952 } 953 else { 954 ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000"); 955 ($AtomCount, $BondCount) = unpack("A3A3", $Line); 956 } 957 958 if ($Version =~ /V3000/i) { 959 # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support 960 # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000 961 # format. 962 croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools..."; 963 } 964 965 return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version); 966 } 967 968 # Second line: Misc info 969 # 970 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR 971 # A2A8 A10 A2I2A10 A12 A6 972 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m), 973 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input, 974 # internal registry number (R) if input through MDL form. A blank line is also allowed. 975 sub ParseCmpdMiscInfoLine { 976 my($Line) = @_; 977 my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 978 979 ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line); 980 return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum); 981 } 982 983 # First line: Molecule name. This line is unformatted, but like all other lines in a 984 # molfile may not extend beyond column 80. A blank line is also allowed. 985 sub ParseCmpdMolNameLine { 986 my($Line) = @_; 987 my($MolName); 988 989 $MolName = unpack("A80", $Line); 990 991 return ($MolName); 992 } 993 994 # Parse atom alias property line in CTAB generic properties block. 995 # 996 # Atom alias property line format: 997 # 998 # A aaa 999 # x... 1000 # 1001 # aaa: Atom number 1002 # x: Atom alias in next line 1003 # 1004 sub ParseCmpdAtomAliasPropertyLine { 1005 my($Line, $NextLine) = @_; 1006 my($Label, $AtomNumber, $AtomAlias); 1007 1008 ($Label, $AtomNumber) = split(' ', $Line); 1009 $AtomAlias = $NextLine; 1010 1011 if (!$AtomAlias) { 1012 carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line..."; 1013 } 1014 1015 return ($AtomNumber, $AtomAlias); 1016 } 1017 1018 # Parse charge property line in CTAB generic properties block. 1019 # 1020 # Charge property line format: 1021 # 1022 # M CHGnn8 aaa vvv ... 1023 # 1024 # nn8: Number of value pairs. Maximum of 8 pairs allowed. 1025 # aaa: Atom number 1026 # vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes 1027 # all charge and radical values in the atom block, forcing a 0 charge on all atoms not 1028 # listed in an M CHG or M RAD line. 1029 # 1030 sub ParseCmpdChargePropertyLine { 1031 my($Line) = @_; 1032 1033 return _ParseCmpdGenericPropertyLine('Charge', $Line); 1034 } 1035 1036 1037 # Parse isotope property line in CTAB generic properties block. 1038 # 1039 # Isoptope property line format: 1040 # 1041 # M ISOnn8 aaa vvv ... 1042 # 1043 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1044 # aaa: Atom number 1045 # vvv: Absolute mass of the atom isotope as a positive integer. When present, this property 1046 # supersedes all isotope values in the atom block. Default (no entry) means natural 1047 # abundance. The difference between this absolute mass value and the natural 1048 # abundance value specified in the PTABLE.DAT file must be within the range of -18 1049 # to +12 1050 # 1051 # Notes: 1052 # . Values correspond to mass numbers... 1053 # 1054 sub ParseCmpdIsotopePropertyLine { 1055 my($Line) = @_; 1056 1057 return _ParseCmpdGenericPropertyLine('Isotope', $Line); 1058 } 1059 1060 # Parse radical property line in CTAB generic properties block. 1061 # 1062 # Radical property line format: 1063 # 1064 # M RADnn8 aaa vvv ... 1065 # 1066 # nn8: Number of value paris. Maximum of 8 pairs allowed. 1067 # aaa: Atom number 1068 # vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When 1069 # present, this property supersedes all charge and radical values in the atom block, 1070 # forcing a 0 (zero) charge and radical on all atoms not listed in an M CHG or 1071 # M RAD line. 1072 # 1073 sub ParseCmpdRadicalPropertyLine { 1074 my($Line) = @_; 1075 1076 return _ParseCmpdGenericPropertyLine('Radical', $Line); 1077 } 1078 1079 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools... 1080 # 1081 sub MDLRadicalToInternalSpinMultiplicity { 1082 my($MDLRadical) = @_; 1083 my($InternalSpinMultiplicity); 1084 1085 $InternalSpinMultiplicity = ''; 1086 1087 SPINMULTIPLICITY: { 1088 if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;} 1089 if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;} 1090 if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;} 1091 if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;} 1092 $InternalSpinMultiplicity = ''; 1093 carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M RAD is not supported..."; 1094 } 1095 return $InternalSpinMultiplicity; 1096 } 1097 1098 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files... 1099 # 1100 sub InternalSpinMultiplicityToMDLRadical { 1101 my($InternalSpinMultiplicity) = @_; 1102 my($MDLRadical); 1103 1104 $MDLRadical = 0; 1105 1106 SPINMULTIPLICITY: { 1107 if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;} 1108 if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;} 1109 if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;} 1110 $MDLRadical = 0; 1111 } 1112 return $MDLRadical; 1113 } 1114 1115 # Process generic CTAB property line... 1116 sub _ParseCmpdGenericPropertyLine { 1117 my($PropertyName, $Line) = @_; 1118 1119 my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs); 1120 1121 @ValuePairs = (); 1122 ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line); 1123 $ValuePairsCount = (scalar @ValuePairs)/2; 1124 if ($ValuesCount != $ValuePairsCount) { 1125 carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount..."; 1126 } 1127 1128 return (@ValuePairs); 1129 } 1130 1131 # Generic CTAB property lines for charge, istope and radical properties... 1132 # 1133 sub _GenerateCmpdGenericPropertyLines { 1134 my($PropertyName, $PropertyValuePairsRef) = @_; 1135 my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines); 1136 1137 @PropertyLines = (); 1138 NAME: { 1139 if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M CHG"; last NAME; } 1140 if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M ISO"; last NAME; } 1141 if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M RAD"; last NAME; } 1142 carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified..."; 1143 return @PropertyLines; 1144 } 1145 1146 # A maximum of 8 property pair values allowed per line... 1147 $PropertyCount = 0; 1148 $Line = ''; 1149 for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) { 1150 if ($PropertyCount > 8) { 1151 # Setup property line... 1152 $Line = "${PropertyLabel} 8${Line}"; 1153 push @PropertyLines, $Line; 1154 1155 $PropertyCount = 0; 1156 $Line = ''; 1157 } 1158 $PropertyCount++; 1159 $AtomNum = $PropertyValuePairsRef->[$Index]; 1160 $PropertyValue = $PropertyValuePairsRef->[$Index + 1]; 1161 $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue; 1162 } 1163 if ($Line) { 1164 $Line = "${PropertyLabel} ${PropertyCount}${Line}"; 1165 push @PropertyLines, $Line; 1166 } 1167 return @PropertyLines; 1168 } 1169 1170 # 1171 # Read compound data into a string and return its value 1172 sub ReadCmpdString { 1173 my($SDFileRef) = @_; 1174 my($CmpdString); 1175 1176 $CmpdString = ""; 1177 LINE: while (defined($_ = <$SDFileRef>)) { 1178 # Change Windows and Mac new line char to UNIX... 1179 s/(\r\n)|(\r)/\n/g; 1180 1181 if (/^\$\$\$\$/) { 1182 # Take out any new line char at the end by explicitly removing it instead of using 1183 # chomp, which might not always work correctly on files generated on a system 1184 # with a value of input line separator different from the current system... 1185 s/\n$//g; 1186 1187 # Doesn't hurt to chomp... 1188 chomp; 1189 1190 $CmpdString .= $_; 1191 last LINE; 1192 } 1193 else { 1194 $CmpdString .= $_; 1195 } 1196 } 1197 return $CmpdString; 1198 } 1199 1200 # Find out the number of fragements in the compounds. And for the compound with 1201 # more than one fragment, remove all the others besides the largest one. 1202 sub WashCmpd { 1203 my($CmpdLines) = @_; 1204 my($WashedCmpdString, $FragmentCount, $Fragments); 1205 1206 $WashedCmpdString = ""; 1207 ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines); 1208 if ($FragmentCount > 1) { 1209 # Go over the compound data for the largest fragment including property 1210 # data... 1211 my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines); 1212 1213 @AllFragments = (); @LargestFragment = (); 1214 %LargestFragmentAtoms = (); 1215 @AllFragments = split "\n", $Fragments; 1216 @LargestFragment = split " ", $AllFragments[0]; 1217 for $Index (0 .. $#LargestFragment) { 1218 # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted 1219 # from lowest to highest old atom numbers... 1220 $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1; 1221 } 1222 @WashedCmpdLines = (); 1223 push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3]; 1224 ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]); 1225 $NewAtomCount = @LargestFragment; 1226 $NewBondCount = 0; 1227 $AtomNum = 0; 1228 # Retrieve the largest fragment atom lines... 1229 for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) { 1230 $AtomNum++; 1231 if ($LargestFragmentAtoms{$AtomNum}) { 1232 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1233 } 1234 } 1235 # Retrieve the largest fragment bond lines... 1236 for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) { 1237 ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]); 1238 if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) { 1239 $NewBondCount++; 1240 # Set up bond line with new atom number mapping... 1241 $FirstNewAtomNum = $LargestFragmentAtoms{$FirstAtomNum}; 1242 $SecondNewAtomNum = $LargestFragmentAtoms{$SecondAtomNum}; 1243 $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo); 1244 push @WashedCmpdLines, $BondLine; 1245 } 1246 } 1247 # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new 1248 # atom numners; Others, property lines before M END line are skipped as atom numbers for 1249 # other properties might not valid anymore... 1250 # 1251 $MENDLineIndex = $LineIndex; 1252 LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) { 1253 $Line = @$CmpdLines[$LineIndex]; 1254 if ($Line =~ /^M END/i) { 1255 push @WashedCmpdLines, "M END"; 1256 $MENDLineIndex = $LineIndex; 1257 last LINE; 1258 } 1259 1260 @ValuePairs = (); 1261 if ($Line =~ /^M CHG/i) { 1262 @ValuePairs = ParseCmpdChargePropertyLine($Line); 1263 } 1264 elsif ($Line =~ /^M RAD/i) { 1265 @ValuePairs = ParseCmpdRadicalPropertyLine($Line); 1266 } 1267 elsif ($Line =~ /^M ISO/i) { 1268 @ValuePairs = ParseCmpdIsotopePropertyLine($Line); 1269 } 1270 elsif ($Line =~ /^A /i) { 1271 my($NextLine); 1272 $LineIndex++; 1273 $NextLine = @$CmpdLines[$LineIndex]; 1274 @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine); 1275 } 1276 else { 1277 next LINE; 1278 } 1279 1280 if (!@ValuePairs) { 1281 next LINE; 1282 } 1283 1284 # Collect values for valid atom numbers with mapping to new atom numbers... 1285 @NewValuePairs = (); 1286 VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) { 1287 $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1]; 1288 if (!exists $LargestFragmentAtoms{$AtomNum}) { 1289 next VALUEINDEX; 1290 } 1291 $NewAtomNum = $LargestFragmentAtoms{$AtomNum}; 1292 push @NewValuePairs, ($NewAtomNum, $Value) 1293 } 1294 if (!@NewValuePairs) { 1295 next LINE; 1296 } 1297 @NewPropertyLines = (); 1298 if ($Line =~ /^M CHG/i) { 1299 @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs); 1300 } 1301 elsif ($Line =~ /^M RAD/i) { 1302 @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs); 1303 } 1304 elsif ($Line =~ /^M ISO/i) { 1305 @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs); 1306 } 1307 elsif ($Line =~ /^A /i) { 1308 @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs); 1309 } 1310 push @WashedCmpdLines, @NewPropertyLines; 1311 } 1312 1313 # Retrieve rest of the data label and value property data... 1314 for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) { 1315 push @WashedCmpdLines, @$CmpdLines[$LineIndex]; 1316 } 1317 # Update atom and bond count line... 1318 $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag); 1319 1320 $WashedCmpdString = join "\n", @WashedCmpdLines; 1321 } 1322 return ($FragmentCount, $Fragments, $WashedCmpdString); 1323 } 1324