MayaChemTools

   1 package SDFileUtil;
   2 #
   3 # $RCSfile: SDFileUtil.pm,v $
   4 # $Date: 2015/02/28 20:47:18 $
   5 # $Revision: 1.49 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Exporter;
  31 use Carp;
  32 use PeriodicTable qw(IsElement);
  33 use TimeUtil ();
  34 
  35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw(GenerateCmpdAtomLine GenerateCmpdBondLine GenerateCmpdChargePropertyLines GenerateCmpdCommentsLine GenerateCmpdCountsLine GenerateCmpdAtomAliasPropertyLines GenerateCmpdIsotopePropertyLines GenerateCmpdDataHeaderLabelsAndValuesLines GenerateCmpdMiscInfoLine GenerateCmpdRadicalPropertyLines GenerateCmpdMolNameLine GenerateEmptyCtabBlockLines GenerateMiscLineDateStamp GetAllAndCommonCmpdDataHeaderLabels GetCmpdDataHeaderLabels GetCmpdDataHeaderLabelsAndValues GetCmpdFragments GetCtabLinesCount GetUnknownAtoms GetInvalidAtomNumbers MDLChargeToInternalCharge InternalChargeToMDLCharge MDLBondTypeToInternalBondOrder InternalBondOrderToMDLBondType MDLBondStereoToInternalBondStereochemistry InternalBondStereochemistryToMDLBondStereo InternalSpinMultiplicityToMDLRadical MDLRadicalToInternalSpinMultiplicity IsCmpd3D IsCmpd2D ParseCmpdAtomLine ParseCmpdBondLine ParseCmpdCommentsLine ParseCmpdCountsLine ParseCmpdMiscInfoLine ParseCmpdMolNameLine ParseCmpdAtomAliasPropertyLine ParseCmpdChargePropertyLine ParseCmpdIsotopePropertyLine ParseCmpdRadicalPropertyLine ReadCmpdString RemoveCmpdDataHeaderLabelAndValue WashCmpd);
  39 @EXPORT_OK = qw();
  40 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  41 
  42 # Format data for compounds count line...
  43 sub GenerateCmpdCountsLine {
  44   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version, $Line);
  45 
  46   if (@_ == 5) {
  47     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = @_;
  48   }
  49   elsif (@_ == 3) {
  50     ($AtomCount, $BondCount, $ChiralFlag) = @_;
  51     $PropertyCount = 999;
  52     $Version = "V2000";
  53   }
  54   else {
  55     ($AtomCount, $BondCount) = @_;
  56     $ChiralFlag = 0;
  57     $PropertyCount = 999;
  58     $Version = "V2000";
  59   }
  60   if ($AtomCount > 999) {
  61     croak "Error: SDFileUtil::GenerateCmpdCountsLine: The atom count, $AtomCount, exceeds maximum of 999 allowed for CTAB version 2000. The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
  62   }
  63   $Line = sprintf "%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%3i%6s", $AtomCount, $BondCount, 0, 0, $ChiralFlag, 0, 0, 0, 0, 0, $PropertyCount, $Version;
  64 
  65   return ($Line);
  66 }
  67 
  68 # Generate comments line...
  69 sub GenerateCmpdCommentsLine {
  70   my($Comments) = @_;
  71   my($Line);
  72 
  73   $Line = (length($Comments) > 80) ? substr($Comments, 0, 80) : $Comments;
  74 
  75   return $Line;
  76 }
  77 
  78 # Generate molname line...
  79 sub GenerateCmpdMolNameLine {
  80   my($MolName) = @_;
  81   my($Line);
  82 
  83   $Line = (length($MolName) > 80) ? substr($MolName, 0, 80) : $MolName;
  84 
  85   return $Line;
  86 }
  87 
  88 # Generate data for compounds misc info line...
  89 sub GenerateCmpdMiscInfoLine {
  90   my($ProgramName, $UserInitial, $Code) = @_;
  91   my($Date, $Line);
  92 
  93   if (!(defined($ProgramName) && $ProgramName)) {
  94     $ProgramName = "MayaChem";
  95   }
  96   if (!(defined($UserInitial) && $UserInitial)) {
  97     $UserInitial = "  ";
  98   }
  99   if (!(defined($Code) && $Code)) {
 100     $Code = "2D";
 101   }
 102 
 103   if (length($ProgramName) > 8) {
 104     $ProgramName = substr($ProgramName, 0, 8);
 105   }
 106   if (length($UserInitial) > 2) {
 107     $UserInitial = substr($UserInitial, 0, 2);
 108   }
 109   if (length($Code) > 2) {
 110     $Code = substr($Code, 0, 2);
 111   }
 112   $Date = GenerateMiscLineDateStamp();
 113 
 114   $Line = "${UserInitial}${ProgramName}${Date}${Code}";
 115 
 116   return $Line;
 117 }
 118 
 119 # Generate data for compounds misc info line...
 120 sub GenerateEmptyCtabBlockLines {
 121   my($Date, $Lines);
 122 
 123   if (@_ == 1) {
 124     ($Date) = @_;
 125   }
 126   else {
 127     $Date = GenerateMiscLineDateStamp();
 128   }
 129   # First line: Blank molname line...
 130   # Second line: Misc info...
 131   # Third line: Blank comments line...
 132   # Fourth line: Counts line reflecting empty structure data block...
 133   $Lines = "\n";
 134   $Lines .= "  MayaChem${Date}2D\n";
 135   $Lines .= "\n";
 136   $Lines .= GenerateCmpdCountsLine(0, 0, 0) . "\n";
 137   $Lines .= "M  END";
 138 
 139   return $Lines;
 140 }
 141 
 142 # Generate SD file data stamp...
 143 sub GenerateMiscLineDateStamp {
 144   return TimeUtil::SDFileTimeStamp();
 145 }
 146 
 147 # Generate data for compound atom line...
 148 #
 149 sub GenerateCmpdAtomLine {
 150   my($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity) = @_;
 151   my($Line);
 152 
 153   if (!defined $MassDifference) {
 154     $MassDifference = 0;
 155   }
 156   if (!defined $Charge) {
 157     $Charge = 0;
 158   }
 159   if (!defined $StereoParity) {
 160     $StereoParity = 0;
 161   }
 162   $Line = sprintf "%10.4f%10.4f%10.4f %-3s%2i%3i%3i  0  0  0  0  0  0  0  0  0", $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity;
 163 
 164   return $Line
 165 }
 166 
 167 # Generate data for compound bond line...
 168 #
 169 sub GenerateCmpdBondLine {
 170   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = @_;
 171   my($Line);
 172 
 173   if (!defined $BondStereo) {
 174     $BondStereo = 0;
 175   }
 176   $Line = sprintf "%3i%3i%3i%3i  0  0  0", $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo;
 177 
 178   return $Line
 179 }
 180 
 181 # Generate charge property lines for CTAB block...
 182 #
 183 sub GenerateCmpdChargePropertyLines {
 184   my($ChargeValuePairsRef) = @_;
 185 
 186   return _GenerateCmpdGenericPropertyLines('Charge', $ChargeValuePairsRef);
 187 }
 188 
 189 # Generate isotope property lines for CTAB block...
 190 #
 191 sub GenerateCmpdIsotopePropertyLines {
 192   my($IsotopeValuePairsRef) = @_;
 193 
 194   return _GenerateCmpdGenericPropertyLines('Isotope', $IsotopeValuePairsRef);
 195 }
 196 
 197 # Generate radical property line property lines for CTAB block...
 198 #
 199 sub GenerateCmpdRadicalPropertyLines {
 200   my($RadicalValuePairsRef) = @_;
 201 
 202   return _GenerateCmpdGenericPropertyLines('Radical', $RadicalValuePairsRef);
 203 }
 204 
 205 # Generate atom alias property line property lines for CTAB block...
 206 #
 207 # Atom alias property line format:
 208 #
 209 # A  aaa
 210 # x...
 211 #
 212 #    aaa: Atom number
 213 #    x: Atom alias in next line
 214 #
 215 sub GenerateCmpdAtomAliasPropertyLines {
 216   my($PropertyValuePairsRef) = @_;
 217   my($Index, $AtomNum, $AtomAlias, $Line, @PropertyLines);
 218 
 219   @PropertyLines = ();
 220 
 221   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
 222     $AtomNum = $PropertyValuePairsRef->[$Index];
 223     $AtomAlias = $PropertyValuePairsRef->[$Index + 1];
 224 
 225     $Line = "A  " . sprintf "%3i", $AtomNum;
 226 
 227     push @PropertyLines, $Line;
 228     push @PropertyLines, $AtomAlias;
 229   }
 230 
 231   return @PropertyLines;
 232 }
 233 
 234 # Generate data header labels and values lines...
 235 #
 236 sub GenerateCmpdDataHeaderLabelsAndValuesLines {
 237   my($DataHeaderLabelsRef, $DataHeaderLabelsAndValuesRef, $SortDataLabels) = @_;
 238   my($DataLabel, $DataValue, @DataLabels, @DataLines);
 239 
 240   if (!defined $SortDataLabels) {
 241     $SortDataLabels = 0;
 242   }
 243 
 244   @DataLines = ();
 245   @DataLabels = ();
 246   if ($SortDataLabels) {
 247     push @DataLabels, sort @{$DataHeaderLabelsRef};
 248   }
 249   else {
 250     push @DataLabels,  @{$DataHeaderLabelsRef};
 251   }
 252   for $DataLabel (@DataLabels) {
 253     $DataValue = '';
 254     if (exists $DataHeaderLabelsAndValuesRef->{$DataLabel}) {
 255       $DataValue = $DataHeaderLabelsAndValuesRef->{$DataLabel};
 256     }
 257     push @DataLines, (">  <${DataLabel}>", "$DataValue", "");
 258   }
 259   return @DataLines;
 260 }
 261 
 262 # Parse data field header in SD file and return lists of all and common data field
 263 # labels.
 264 sub GetAllAndCommonCmpdDataHeaderLabels {
 265   my($SDFileRef) = @_;
 266   my($CmpdCount, $CmpdString, $Label, @CmpdLines, @DataFieldLabels, @CommonDataFieldLabels, %DataFieldLabelsMap);
 267 
 268   $CmpdCount = 0;
 269   @DataFieldLabels = ();
 270   @CommonDataFieldLabels = ();
 271   %DataFieldLabelsMap = ();
 272 
 273   while ($CmpdString = ReadCmpdString($SDFileRef)) {
 274     $CmpdCount++;
 275     @CmpdLines = split "\n", $CmpdString;
 276     # Process compound data header labels and figure out which ones are present for
 277     # all the compounds...
 278     if (@DataFieldLabels) {
 279       my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
 280       my(%CmpdDataFieldLabelsMap) = ();
 281       # Setup a map for the current labels...
 282       for $Label (@CmpdDataFieldLabels) {
 283         $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
 284       }
 285       # Check the presence old labels for this compound; otherwise, mark 'em new...
 286       for $Label (@DataFieldLabels) {
 287         if (!$CmpdDataFieldLabelsMap{$Label}) {
 288           $DataFieldLabelsMap{$Label} = "PresentInSome";
 289         }
 290       }
 291       # Check the presence this compound in the old labels; otherwise, add 'em...
 292       for $Label (@CmpdDataFieldLabels ) {
 293         if (!$DataFieldLabelsMap{$Label}) {
 294           # It's a new label...
 295           push @DataFieldLabels, $Label;
 296           $DataFieldLabelsMap{$Label} = "PresentInSome";
 297         }
 298       }
 299     }
 300     else {
 301       # Get the initial label set and set up a map...
 302       @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
 303       for $Label (@DataFieldLabels) {
 304         $DataFieldLabelsMap{$Label} = "PresentInAll";
 305       }
 306     }
 307   }
 308   # Identify the common data field labels...
 309   @CommonDataFieldLabels = ();
 310   for $Label (@DataFieldLabels) {
 311     if ($DataFieldLabelsMap{$Label} eq "PresentInAll") {
 312       push @CommonDataFieldLabels, $Label;
 313     }
 314   }
 315   return ($CmpdCount, \@DataFieldLabels, \@CommonDataFieldLabels);
 316 }
 317 
 318 # Parse all the data header labels and return 'em as an list...
 319 #
 320 # Format:
 321 #
 322 #> Data header line
 323 #Data line(s)
 324 #Blank line
 325 #
 326 # [Data Header] (one line) precedes each item of data, starts with a greater than (>) sign, and
 327 # contains at least one of the following:
 328 #  The field name enclosed in angle brackets. For example: <melting.point>
 329 #  The field number, DTn , where n represents the number assigned to the field in a MACCS-II database
 330 #
 331 #Optional information for the data header includes:
 332 #  The compound’s external and internal registry numbers. External registry numbers must be enclosed in parentheses.
 333 #  Any combination of information
 334 #
 335 #The following are examples of valid data headers:
 336 #> <MELTING.POINT>
 337 #> 55 (MD-08974) <BOILING.POINT> DT12
 338 #> DT12 55
 339 #> (MD-0894) <BOILING.POINT> FROM ARCHIVES
 340 #
 341 #Notes: Sometimes last blank line is missing and can be just followed by $$$$
 342 #
 343 sub GetCmpdDataHeaderLabels {
 344   my($CmpdLines) = @_;
 345   my($CmpdLine, $Label, @Labels);
 346 
 347   @Labels = ();
 348   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 349     if ($CmpdLine !~ /^>/) {
 350       next CMPDLINE;
 351     }
 352     # Does the line contains field name enclosed in angular brackets?
 353     ($Label) = $CmpdLine =~ /<.*?>/g;
 354     if (!defined($Label)) {
 355       next CMPDLINE;
 356     }
 357     $Label =~ s/(<|>)//g;
 358     push @Labels, $Label;
 359   }
 360   return (@Labels);
 361 }
 362 
 363 # Parse all the data header labels and values
 364 sub GetCmpdDataHeaderLabelsAndValues {
 365   my($CmpdLines) = @_;
 366   my($CmpdLine, $CurrentLabel, $Label, $Value, $ValueCount, $ProcessingLabelData, @Values, %DataFields);
 367 
 368   %DataFields = ();
 369   $ProcessingLabelData = 0;
 370   $ValueCount = 0;
 371   CMPDLINE: for $CmpdLine (@$CmpdLines) {
 372     if ($CmpdLine =~ /^\$\$\$\$/) {
 373       last CMPDLINE;
 374     }
 375     if ($CmpdLine =~ /^>/) {
 376       # Does the line contains field name enclosed in angular brackets?
 377       ($Label) = $CmpdLine =~ /<.*?>/g;
 378       if (defined $Label) {
 379         $CurrentLabel = $Label;
 380         $CurrentLabel =~ s/(<|>)//g;
 381         $ProcessingLabelData = 0;
 382         $ValueCount = 0;
 383 
 384         if ($CurrentLabel) {
 385           $ProcessingLabelData = 1;
 386           $DataFields{$CurrentLabel} = '';
 387           next CMPDLINE;
 388         }
 389       }
 390       else {
 391         if (!$ProcessingLabelData) {
 392           # Data line containing no <label> as allowed by SDF format. Just ignore it...
 393           next CMPDLINE;
 394         }
 395       }
 396     }
 397     if (!$ProcessingLabelData) {
 398       next CMPDLINE;
 399     }
 400     if (!(defined($CmpdLine) && length($CmpdLine))) {
 401       # Blank line terminates value for a label...
 402       $CurrentLabel = '';
 403       $ValueCount = 0;
 404       $ProcessingLabelData = 0;
 405       next CMPDLINE;
 406     }
 407     $ValueCount++;
 408     $Value = $CmpdLine;
 409 
 410     if ($ValueCount > 1) {
 411       $DataFields{$CurrentLabel} .= "\n" . $Value;
 412     }
 413     else {
 414       $DataFields{$CurrentLabel} = $Value;
 415     }
 416   }
 417   return (%DataFields);
 418 }
 419 
 420 # Return an updated compoud string after removing  data header label along with its
 421 # value from the specified compound string...
 422 #
 423 sub RemoveCmpdDataHeaderLabelAndValue {
 424   my($CmpdString, $DataHeaderLabel) = @_;
 425   my($Line, $PorcessingDataHeaderLabel, @CmpdLines);
 426 
 427   @CmpdLines = ();
 428   $PorcessingDataHeaderLabel = 0;
 429 
 430   CMPDLINE: for $Line (split "\n", $CmpdString) {
 431     if ($Line =~ /^>/ && $Line =~ /<$DataHeaderLabel>/i) {
 432       $PorcessingDataHeaderLabel = 1;
 433       next CMPDLINE;
 434     }
 435 
 436     if ($PorcessingDataHeaderLabel) {
 437       # Blank line indicates end of fingerprints data value...
 438       if ($Line =~ /^\$\$\$\$/) {
 439         push @CmpdLines, $Line;
 440         $PorcessingDataHeaderLabel = 0;
 441       }
 442       elsif (!length($Line)) {
 443         $PorcessingDataHeaderLabel = 0;
 444       }
 445       next CMPDLINE;
 446     }
 447 
 448     # Track compound lines without fingerprints data...
 449     push @CmpdLines, $Line;
 450   }
 451 
 452   return join "\n", @CmpdLines;
 453 }
 454 
 455 #
 456 # Using bond blocks, figure out the number of disconnected fragments  and
 457 # return their values along with the atom numbers in a string delimited by new
 458 # line character.
 459 #
 460 sub GetCmpdFragments {
 461   my($CmpdLines) = @_;
 462   my($AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, @AtomConnections, $BondType, $FragmentString, $FragmentCount, $LineIndex, $Index, $AtomNum, $NbrAtomNum, @ProcessedAtoms, $ProcessedAtomCount, $ProcessAtomNum, @ProcessingAtoms, @ConnectedAtoms, %Fragments, $FragmentNum, $AFragmentString);
 463 
 464   # Setup the connection table for each atom...
 465   @AtomConnections = ();
 466   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 467   for $AtomNum (1 .. $AtomCount) {
 468     %{$AtomConnections[$AtomNum]} = ();
 469   }
 470   for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 471     ($FirstAtomNum, $SecondAtomNum, $BondType) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 472     if (!$AtomConnections[$FirstAtomNum]{$SecondAtomNum}) {
 473       $AtomConnections[$FirstAtomNum]{$SecondAtomNum} = $BondType;
 474     }
 475     if (!$AtomConnections[$SecondAtomNum]{$FirstAtomNum}) {
 476       $AtomConnections[$SecondAtomNum]{$FirstAtomNum} = $BondType;
 477     }
 478   }
 479 
 480   #Get set to count fragments...
 481   $ProcessedAtomCount = 0;
 482   $FragmentNum = 0;
 483   %Fragments = ();
 484   @ProcessedAtoms = ();
 485   for $AtomNum (1 .. $AtomCount) {
 486     $ProcessedAtoms[$AtomNum] = 0;
 487   }
 488   while ($ProcessedAtomCount < $AtomCount) {
 489     @ProcessingAtoms = ();
 490     @ConnectedAtoms = ();
 491     ATOMNUM: for $AtomNum (1 .. $AtomCount) {
 492       if (!$ProcessedAtoms[$AtomNum]) {
 493         $ProcessedAtomCount++;
 494         $ProcessedAtoms[$AtomNum] = 1;
 495         push @ProcessingAtoms, $AtomNum;
 496         $FragmentNum++;
 497         @{$Fragments{$FragmentNum} } = ();
 498         push @{$Fragments{$FragmentNum} }, $AtomNum;
 499         last ATOMNUM;
 500       }
 501     }
 502 
 503     # Go over the neighbors and follow the connection trail while collecting the
 504     # atoms numbers present in the connected fragment...
 505     while (@ProcessingAtoms) {
 506       for ($Index = 0; $Index < @ProcessingAtoms; $Index++) {
 507         $ProcessAtomNum = $ProcessingAtoms[$Index];
 508         for $NbrAtomNum (keys %{$AtomConnections[$ProcessAtomNum]})  {
 509           if (!$ProcessedAtoms[$NbrAtomNum]) {
 510             $ProcessedAtomCount++;
 511             $ProcessedAtoms[$NbrAtomNum] = 1;
 512             push @ConnectedAtoms, $NbrAtomNum;
 513             push @{ $Fragments{$FragmentNum} }, $NbrAtomNum;
 514           }
 515         }
 516       }
 517       @ProcessingAtoms = ();
 518       @ProcessingAtoms = @ConnectedAtoms;
 519       @ConnectedAtoms = ();
 520     }
 521   }
 522   $FragmentCount = $FragmentNum;
 523   $FragmentString = "";
 524 
 525   # Sort out the fragments by size...
 526   for $FragmentNum (sort { @{$Fragments{$b}} <=> @{$Fragments{$a}}  } keys %Fragments ) {
 527     # Sort the atoms in a fragment by their numbers...
 528     $AFragmentString = join " ", sort { $a <=> $b } @{ $Fragments{$FragmentNum} };
 529     if ($FragmentString) {
 530       $FragmentString .=  "\n" . $AFragmentString;
 531     }
 532     else {
 533       $FragmentString = $AFragmentString;
 534     }
 535   }
 536   return ($FragmentCount, $FragmentString);
 537 }
 538 
 539 # Count number of lines present in between 4th and line containg "M END"
 540 sub GetCtabLinesCount {
 541   my($CmpdLines) = @_;
 542   my($LineIndex, $CtabLinesCount);
 543 
 544   $CtabLinesCount = 0;
 545  LINE: for ($LineIndex = 4; $LineIndex < @$CmpdLines; $LineIndex++) {
 546     #
 547     # Any line after atom and bond data starting with anything other than space or
 548     # a digit indicates end of Ctab atom/bond data block...
 549     #
 550     if (@$CmpdLines[$LineIndex] !~ /^[0-9 ]/) {
 551       $CtabLinesCount = $LineIndex - 4;
 552       last LINE;
 553     }
 554   }
 555   return $CtabLinesCount;
 556 }
 557 
 558 # Using atom blocks, count the number of atoms which contain special element
 559 # symbols not present in the periodic table.
 560 sub GetUnknownAtoms {
 561   my($CmpdLines) = @_;
 562   my($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines, $LineIndex, $AtomCount, $AtomSymbol);
 563 
 564   $UnknownAtomCount = 0;
 565   $UnknownAtoms = "";
 566   $UnknownAtomLines = "";
 567   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 568   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 569     ($AtomSymbol) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 570     if (!IsElement($AtomSymbol)) {
 571       $UnknownAtomCount++;
 572       $UnknownAtoms .= " $AtomSymbol";
 573       if ($UnknownAtomLines) {
 574         $UnknownAtomLines .= "\n" . @$CmpdLines[$LineIndex];
 575       }
 576       else {
 577         $UnknownAtomLines = @$CmpdLines[$LineIndex];
 578       }
 579     }
 580   }
 581   return ($UnknownAtomCount, $UnknownAtoms, $UnknownAtomLines);
 582 }
 583 
 584 # Check z coordinates of all atoms to see whether any of them is non-zero
 585 # which makes the compound geometry three dimensional...
 586 #
 587 sub IsCmpd3D {
 588   my($CmpdLines) = @_;
 589   my($LineIndex, $AtomCount, $AtomSymbol, $AtomX, $AtomY, $AtomZ);
 590 
 591   ($AtomCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 592   for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
 593     ($AtomSymbol, $AtomX, $AtomY, $AtomZ) = ParseCmpdAtomLine(@$CmpdLines[$LineIndex]);
 594     if ($AtomZ != 0) {
 595       return 1;
 596     }
 597   }
 598   return 0;
 599 }
 600 
 601 # Check whether it's a 2D compound...
 602 #
 603 sub IsCmpd2D {
 604   my($CmpdLines) = @_;
 605 
 606   return IsCmpd3D($CmpdLines) ? 0 : 1;
 607 }
 608 
 609 # Using bond blocks, count the number of bond lines which contain atom numbers
 610 # greater than atom count specified in compound count line...
 611 #
 612 sub GetInvalidAtomNumbers {
 613   my($CmpdLines) = @_;
 614   my($LineIndex, $AtomCount, $BondCount, $FirstAtomNum, $SecondAtomNum, $InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines, $Line, $InvalidAtomPropertyLine, $ValuePairIndex, $AtomNum, $Value, @ValuePairs);
 615 
 616   ($AtomCount, $BondCount) = ParseCmpdCountsLine(@$CmpdLines[3]);
 617 
 618   $InvalidAtomNumbersCount = 0;
 619   $InvalidAtomNumbers = "";
 620   $InvalidAtomNumberLines = "";
 621 
 622   # Go over bond block lines...
 623   LINE: for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
 624     ($FirstAtomNum, $SecondAtomNum) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
 625     if ($FirstAtomNum <= $AtomCount && $SecondAtomNum <= $AtomCount) {
 626       next LINE;
 627     }
 628     if ($FirstAtomNum > $AtomCount) {
 629       $InvalidAtomNumbersCount++;
 630       $InvalidAtomNumbers .= " $FirstAtomNum";
 631     }
 632     if ($SecondAtomNum > $AtomCount) {
 633       $InvalidAtomNumbersCount++;
 634       $InvalidAtomNumbers .= " $SecondAtomNum";
 635     }
 636     if ($InvalidAtomNumberLines) {
 637       $InvalidAtomNumberLines .= "\n" . @$CmpdLines[$LineIndex];
 638     }
 639     else {
 640       $InvalidAtomNumberLines = @$CmpdLines[$LineIndex];
 641     }
 642   }
 643   # Go over property lines before M  END...
 644   #
 645   LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
 646     $Line = @$CmpdLines[$LineIndex];
 647     @ValuePairs = ();
 648     if ($Line =~ /^M  END/i) {
 649       last LINE;
 650     }
 651     @ValuePairs = ();
 652     if ($Line =~ /^M  CHG/i) {
 653       @ValuePairs = ParseCmpdChargePropertyLine($Line);
 654     }
 655     elsif ($Line =~ /^M  RAD/i) {
 656       @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
 657     }
 658     elsif ($Line =~ /^M  ISO/i) {
 659       @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
 660     }
 661     elsif ($Line =~ /^A  /i) {
 662       my($NextLine);
 663       $LineIndex++;
 664       $NextLine = @$CmpdLines[$LineIndex];
 665       @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
 666     }
 667     else {
 668       next LINE;
 669     }
 670 
 671     $InvalidAtomPropertyLine = 0;
 672     for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
 673       $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
 674       if ($AtomNum > $AtomCount) {
 675         $InvalidAtomPropertyLine = 1;
 676         $InvalidAtomNumbersCount++;
 677         $InvalidAtomNumbers .= " $AtomNum";
 678       }
 679     }
 680     if ($InvalidAtomPropertyLine) {
 681       if ($InvalidAtomNumberLines) {
 682         $InvalidAtomNumberLines .= "\n" . $Line;
 683       }
 684       else {
 685         $InvalidAtomNumberLines = $Line;
 686       }
 687     }
 688   }
 689 
 690   return ($InvalidAtomNumbersCount, $InvalidAtomNumbers, $InvalidAtomNumberLines);
 691 }
 692 
 693 # Ctab lines: Atom block
 694 #
 695 # Format: xxxxx.xxxxyyyyy.yyyyzzzzz.zzzz aaaddcccssshhhbbbvvvHHHrrriiimmmnnneee
 696 #         A10       A10       A10       xA3 A2A3 A3 A3 A3 A3 A3 A3 A3 A3 A3 A3
 697 # x,y,z: Atom coordinates
 698 # aaa: Atom symbol. Entry in periodic table or L for atom list, A, Q, * for unspecified
 699 #      atom, and LP for lone pair, or R# for Rgroup label
 700 # dd: Mass difference. -3, -2, -1, 0, 1, 2, 3, 4 (0 for value beyond these limits)
 701 # ccc: Charge. 0 = uncharged or value other than these, 1 = +3, 2 = +2, 3 = +1,
 702 #      4 = doublet radical, 5 = -1, 6 = -2, 7 = -3
 703 # sss: Atom stereo parity. 0 = not stereo, 1 = odd, 2 = even, 3 = either or unmarked stereo center
 704 # hhh: Hydrogen count + 1. 1 = H0, 2 = H1, 3 = H2, 4 = H3, 5 = H4
 705 # bbb: Stereo care box. 0 = ignore stereo configuration of this double bond atom, 1 = stereo
 706 #      configuration of double bond atom must match
 707 # vvv: Valence. 0 = no marking (default)(1 to 14) = (1 to 14) 15 = zero valence
 708 # HHH: H0 designator. 0 = not specified, 1 = no H atoms allowed (redundant due to hhh)
 709 # rrr: Not used
 710 # iii: Not used
 711 # mmm: Atom-atom mapping number. 1 - number of atoms
 712 # nnn: Inversion/retention flag. 0 = property not applied, 1 = configuration is inverted,
 713 #      2 = configuration is retained.
 714 # eee: Exact change flag. 0 = property not applied, 1 = change on atom must be
 715 #      exactly as shown
 716 #
 717 # Notes:
 718 #  . StereoParity: 1 - ClockwiseStereo, 2 - AntiClockwiseStereo; 3 - Either; 0 - none. These
 719 #    values determine chirailty around the chiral center; a non zero value indicates atom
 720 #    has been marked as chiral center.
 721 #
 722 sub ParseCmpdAtomLine {
 723   my($Line) = @_;
 724   my ($LineIndex, $AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity);
 725 
 726   ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = ('') x 7;
 727   if (length($Line) > 31) {
 728     ($AtomX, $AtomY, $AtomZ, $AtomSymbol, $MassDifference, $Charge, $StereoParity) = unpack("A10A10A10xA3A2A3A3", $Line);
 729   }
 730   else {
 731     ($AtomX, $AtomY, $AtomZ, $AtomSymbol) = unpack("A10A10A10", $Line);
 732   }
 733   return ($AtomSymbol, $AtomX, $AtomY, $AtomZ, $MassDifference, $Charge, $StereoParity);
 734 }
 735 
 736 # Map MDL charge value used in SD and MOL files to internal charge used by MayaChemTools.
 737 #
 738 sub MDLChargeToInternalCharge {
 739   my($MDLCharge) = @_;
 740   my($InternalCharge);
 741 
 742   CHARGE: {
 743     if ($MDLCharge == 0) { $InternalCharge = 0; last CHARGE;}
 744     if ($MDLCharge == 1) { $InternalCharge = 3; last CHARGE;}
 745     if ($MDLCharge == 2) { $InternalCharge = 2; last CHARGE;}
 746     if ($MDLCharge == 3) { $InternalCharge = 1; last CHARGE;}
 747     if ($MDLCharge == 5) { $InternalCharge = -1; last CHARGE;}
 748     if ($MDLCharge == 6) { $InternalCharge = -2; last CHARGE;}
 749     if ($MDLCharge == 7) { $InternalCharge = -3; last CHARGE;}
 750     # All other MDL charge values, including 4 corresponding to "doublet radical",
 751     # are assigned internal value of 0.
 752     $InternalCharge = 0;
 753     if ($MDLCharge != 4) {
 754       carp "Warning: MDLChargeToInternalCharge: MDL charge value, $MDLCharge, is not supported: An internal charge value, 0, has been assigned...";
 755     }
 756   }
 757   return $InternalCharge;
 758 }
 759 
 760 # Map internal charge used by MayaChemTools to MDL charge value used in SD and MOL files.
 761 #
 762 sub InternalChargeToMDLCharge {
 763   my($InternalCharge) = @_;
 764   my($MDLCharge);
 765 
 766   CHARGE: {
 767     if ($InternalCharge == 3) { $MDLCharge = 1; last CHARGE;}
 768     if ($InternalCharge == 2) { $MDLCharge = 2; last CHARGE;}
 769     if ($InternalCharge == 1) { $MDLCharge = 3; last CHARGE;}
 770     if ($InternalCharge == -1) { $MDLCharge = 5; last CHARGE;}
 771     if ($InternalCharge == -2) { $MDLCharge = 6; last CHARGE;}
 772     if ($InternalCharge == -3) { $MDLCharge = 7; last CHARGE;}
 773     # All other MDL charge values, including 4 corresponding to "doublet radical",
 774     # are assigned internal value of 0.
 775     $MDLCharge = 0;
 776   }
 777   return $MDLCharge;
 778 }
 779 
 780 # Ctab lines: Bond block
 781 #
 782 # Format: 111222tttsssxxxrrrccc
 783 #
 784 # 111: First atom number.
 785 # 222: Second atom number.
 786 # ttt: Bond type. 1 = Single, 2 = Double, 3 = Triple, 4 = Aromatic, 5 = Single or Double,
 787 #      6 = Single or Aromatic, 7 = Double or Aromatic, 8 = Any
 788 # sss: Bond stereo. Single bonds: 0 = not stereo, 1 = Up, 4 = Either, 6 = Down,
 789 #      Double bonds: 0 = Use x-, y-, z-coords from atom block to determine cis or trans,
 790 #      3 = Cis or trans (either) double bond
 791 # xxx: Not used
 792 # rrr: Bond topology. 0 = Either, 1 = Ring, 2 = Chain
 793 # ccc: Reacting center status. 0 = unmarked, 1 = a center, -1 = not a center,
 794 #      Additional: 2 = no change,4 = bond made/broken, 8 = bond order changes 12 = 4+8
 795 #      (both made/broken and changes); 5 = (4 + 1), 9 = (8 + 1), and 13 = (12 + 1) are also possible
 796 #
 797 sub ParseCmpdBondLine {
 798   my($Line) = @_;
 799   my($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 800 
 801   ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = map {s/ //g; $_} unpack("A3A3A3A3", $Line);
 802   return ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo);
 803 }
 804 
 805 # Map MDL bond type value used in SD and MOL files to internal bond order  and bond types
 806 # values used by MayaChemTools...
 807 #
 808 sub MDLBondTypeToInternalBondOrder {
 809   my($MDLBondType) = @_;
 810   my($InternalBondOrder, $InternalBondType);
 811 
 812   $InternalBondType = '';
 813 
 814   BONDTYPE: {
 815     if ($MDLBondType == 1) { $InternalBondOrder = 1; $InternalBondType = 'Single'; last BONDTYPE;}
 816     if ($MDLBondType == 2) { $InternalBondOrder = 2; $InternalBondType = 'Double'; last BONDTYPE;}
 817     if ($MDLBondType == 3) { $InternalBondOrder = 3; $InternalBondType = 'Triple'; last BONDTYPE;}
 818     if ($MDLBondType == 4) { $InternalBondOrder = 1.5; $InternalBondType = 'Aromatic'; last BONDTYPE;} # Aromatic
 819     if ($MDLBondType == 5) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrDouble'; last BONDTYPE;} # Aromatic
 820     if ($MDLBondType == 6) { $InternalBondOrder = 1; $InternalBondType = 'SingleOrAromatic'; last BONDTYPE;} # Aromatic
 821     if ($MDLBondType == 7) { $InternalBondOrder = 2; $InternalBondType = 'DoubleOrAromatic'; last BONDTYPE;} # Aromatic
 822     if ($MDLBondType == 8) { $InternalBondOrder = 1; $InternalBondType = 'Any'; last BONDTYPE;} # Aromatic
 823     #
 824     # Although MDL aromatic bond values are used for query only and explicit Kekule bond order
 825     # values must be assigned, internal value of 1.5 is allowed to indicate aromatic bond orders.
 826     #
 827     # All other MDL bond type values -  5 = Single or Double, 6 = Single or Aromatic, 7 = Double or Aromatic,
 828     # 8 = Any - are also assigned appropriate internal value of 1: These are meant to be used for
 829     # structure queries by MDL products.
 830     #
 831     $InternalBondOrder = 1;
 832     $InternalBondType = 'Single';
 833 
 834     carp "Warning: MDLBondTypeToInternalBondOrder: MDL bond type value, $MDLBondType, is not supported: An internal bond order value, 0, has been assigned...";
 835   }
 836   return ($InternalBondOrder, $InternalBondType);
 837 }
 838 
 839 # Map internal bond order  and bond type values used by MayaChemTools to MDL bond type value used
 840 # in SD and MOL files...
 841 #
 842 sub InternalBondOrderToMDLBondType {
 843   my($InternalBondOrder, $InternalBondType) = @_;
 844   my($MDLBondType);
 845 
 846   BONDTYPE: {
 847     if ($InternalBondOrder == 1) {
 848       if ($InternalBondType =~ /^SingleOrDouble$/i) {
 849         $MDLBondType = 5;
 850       }
 851       elsif ($InternalBondType =~ /^SingleOrAromatic$/i) {
 852         $MDLBondType = 6;
 853       }
 854       elsif ($InternalBondType =~ /^Any$/i) {
 855         $MDLBondType = 8;
 856       }
 857       else {
 858         $MDLBondType = 1;
 859       }
 860       $MDLBondType = 1;
 861       last BONDTYPE;
 862     }
 863     if ($InternalBondOrder == 2) {
 864       if ($InternalBondType =~ /^DoubleOrAromatic$/i) {
 865         $MDLBondType = 7;
 866       }
 867       else {
 868         $MDLBondType = 2;
 869       }
 870       last BONDTYPE;
 871     }
 872     if ($InternalBondOrder == 3) { $MDLBondType = 3; last BONDTYPE;}
 873     if ($InternalBondOrder == 1.5) { $MDLBondType = 4; last BONDTYPE;}
 874     if ($InternalBondType =~ /^Any$/i) { $MDLBondType = 8; last BONDTYPE;}
 875 
 876     $MDLBondType = 1;
 877 
 878     carp "Warning: InternalBondOrderToMDLBondType: Internal bond order and type values, $InternalBondOrder and $InternalBondType, don't match any valid MDL bond type: MDL bond type value, 1, has been assigned...";
 879   }
 880   return $MDLBondType;
 881 }
 882 
 883 # Third line: Comments - A blank line is also allowed.
 884 sub ParseCmpdCommentsLine {
 885   my($Line) = @_;
 886   my($Comments);
 887 
 888   $Comments = unpack("A80", $Line);
 889 
 890   return ($Comments);
 891 }
 892 
 893 # Map MDL bond stereo value used in SD and MOL files to internal bond stereochemistry values used by MayaChemTools...
 894 #
 895 sub MDLBondStereoToInternalBondStereochemistry {
 896   my($MDLBondStereo) = @_;
 897   my($InternalBondStereo);
 898 
 899   $InternalBondStereo = '';
 900 
 901   BONDSTEREO: {
 902     if ($MDLBondStereo == 1) { $InternalBondStereo = 'Up'; last BONDSTEREO;}
 903     if ($MDLBondStereo == 4) { $InternalBondStereo = 'UpOrDown'; last BONDSTEREO;}
 904     if ($MDLBondStereo == 6) { $InternalBondStereo = 'Down'; last BONDSTEREO;}
 905     if ($MDLBondStereo == 3) { $InternalBondStereo = 'CisOrTrans'; last BONDSTEREO;}
 906     if ($MDLBondStereo == 0) { $InternalBondStereo = 'None'; last BONDSTEREO;}
 907 
 908     $InternalBondStereo = '';
 909     carp "Warning: MDLBondStereoToInternalBondType: MDL bond stereo value, $MDLBondStereo, is not supported: It has been ignored and bond order would be used to determine bond type...";
 910   }
 911   return $InternalBondStereo;
 912 }
 913 
 914 # Map internal bond stereochemistry values used by MayaChemTools to MDL bond stereo value used in SD and MOL files...
 915 #
 916 sub InternalBondStereochemistryToMDLBondStereo {
 917   my($InternalBondStereo) = @_;
 918   my($MDLBondStereo);
 919 
 920   $MDLBondStereo = 0;
 921 
 922   BONDSTEREO: {
 923     if ($InternalBondStereo =~ /^Up$/i) { $MDLBondStereo = 1; last BONDSTEREO;}
 924     if ($InternalBondStereo =~ /^UpOrDown$/i) { $MDLBondStereo = 4; last BONDSTEREO;}
 925     if ($InternalBondStereo =~ /^Down$/) { $MDLBondStereo = 6; last BONDSTEREO;}
 926     if ($InternalBondStereo =~ /^CisOrTrans$/) { $MDLBondStereo = 3; last BONDSTEREO;}
 927 
 928     $MDLBondStereo = 0;
 929   }
 930   return $MDLBondStereo;
 931 }
 932 
 933 # Fourth line: Counts
 934 #
 935 # Format: aaabbblllfffcccsssxxxrrrpppiiimmmvvvvvv
 936 #
 937 # aaa: number of atoms; bbb: number of bonds; lll: number of atom lists; fff: (obsolete)
 938 # ccc: chiral flag: 0=not chiral, 1=chiral; sss: number of stext entries; xxx,rrr,ppp,iii:
 939 # (obsolete); mmm: number of lines of additional properties, including the M END line, No
 940 # longer supported, default is set to 999; vvvvvv: version
 941 
 942 sub ParseCmpdCountsLine {
 943   my($Line) = @_;
 944   my($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 945 
 946   if (length($Line) >= 39) {
 947     ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version) = unpack("A3A3x3x3A3x3x3x3x3x3A3A6", $Line);
 948   }
 949   elsif (length($Line) >= 15) {
 950     ($PropertyCount, $Version) = ("999", "v2000");
 951     ($AtomCount, $BondCount, $ChiralFlag) = unpack("A3A3x3x3A3", $Line);
 952   }
 953   else {
 954     ($ChiralFlag, $PropertyCount, $Version) = ("0", "999", "v2000");
 955     ($AtomCount, $BondCount) = unpack("A3A3", $Line);
 956   }
 957 
 958   if ($Version =~ /V3000/i) {
 959     # Current version of MayaChemTools modules and classes for processing MDL MOL and SD don't support
 960     # V3000. So instead of relying on callers, just exit with an error to disable any processing of V3000
 961     # format.
 962     croak "Error: SDFileUtil::ParseCmpdCountsLine: The Extended Connection Table (V3000) format in MDL MOL and SD files is not supported by the current release of MayaChemTools...";
 963   }
 964 
 965   return ($AtomCount, $BondCount, $ChiralFlag, $PropertyCount, $Version);
 966 }
 967 
 968 # Second line: Misc info
 969 #
 970 # Format: IIPPPPPPPPMMDDYYHHmmddSSssssssssssEEEEEEEEEEEERRRRRR
 971 #         A2A8      A10       A2I2A10       A12         A6
 972 # User's first and last initials (I), program name (P), date/time (M/D/Y,H:m),
 973 # dimensional codes - 2D or 3D (d),scaling factors (S, s), energy (E) if modeling program input,
 974 # internal registry number (R) if input through MDL form. A blank line is also allowed.
 975 sub ParseCmpdMiscInfoLine {
 976   my($Line) = @_;
 977   my($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 978 
 979   ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum) = unpack("A2A8A10A2A2A10A12A6", $Line);
 980   return ($UserInitial, $ProgramName, $Date, $Code, $ScalingFactor1, $ScalingFactor2, $Energy, $RegistryNum);
 981 }
 982 
 983 # First line: Molecule name. This line is unformatted, but like all other lines in a
 984 # molfile may not extend beyond column 80. A blank line is also allowed.
 985 sub ParseCmpdMolNameLine {
 986   my($Line) = @_;
 987   my($MolName);
 988 
 989   $MolName = unpack("A80", $Line);
 990 
 991   return ($MolName);
 992 }
 993 
 994 # Parse atom alias property line in CTAB generic properties block.
 995 #
 996 # Atom alias property line format:
 997 #
 998 # A  aaa
 999 # x...
1000 #
1001 #    aaa: Atom number
1002 #    x: Atom alias in next line
1003 #
1004 sub ParseCmpdAtomAliasPropertyLine {
1005   my($Line, $NextLine) = @_;
1006   my($Label, $AtomNumber, $AtomAlias);
1007 
1008   ($Label, $AtomNumber) = split(' ', $Line);
1009   $AtomAlias = $NextLine;
1010 
1011   if (!$AtomAlias) {
1012     carp "Warning: _ParseCmpdAtomAliasPropertyLine: No atom alias value specified on the line following atom alias property line...";
1013   }
1014 
1015   return ($AtomNumber, $AtomAlias);
1016 }
1017 
1018 # Parse charge property line in CTAB generic properties block.
1019 #
1020 # Charge property line format:
1021 #
1022 # M  CHGnn8 aaa vvv ...
1023 #
1024 #    nn8: Number of value pairs. Maximum of 8 pairs allowed.
1025 #    aaa: Atom number
1026 #    vvv: -15 to +15. Default of 0 = uncharged atom. When present, this property supersedes
1027 #    all charge and radical values in the atom block, forcing a 0 charge on all atoms not
1028 #    listed in an M  CHG or M  RAD line.
1029 #
1030 sub ParseCmpdChargePropertyLine {
1031   my($Line) = @_;
1032 
1033   return _ParseCmpdGenericPropertyLine('Charge', $Line);
1034 }
1035 
1036 
1037 # Parse isotope property line in CTAB generic properties block.
1038 #
1039 # Isoptope property line format:
1040 #
1041 # M  ISOnn8 aaa vvv ...
1042 #
1043 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1044 #    aaa: Atom number
1045 #    vvv: Absolute mass of the atom isotope as a positive integer. When present, this property
1046 #    supersedes all isotope values in the atom block. Default (no entry) means natural
1047 #    abundance. The difference between this absolute mass value and the natural
1048 #    abundance value specified in the PTABLE.DAT file must be within the range of -18
1049 #    to +12
1050 #
1051 # Notes:
1052 #  . Values correspond to mass numbers...
1053 #
1054 sub ParseCmpdIsotopePropertyLine {
1055   my($Line) = @_;
1056 
1057   return _ParseCmpdGenericPropertyLine('Isotope', $Line);
1058 }
1059 
1060 # Parse radical property line in CTAB generic properties block.
1061 #
1062 # Radical property line format:
1063 #
1064 # M  RADnn8 aaa vvv ...
1065 #
1066 #    nn8: Number of value paris. Maximum of 8 pairs allowed.
1067 #    aaa: Atom number
1068 #    vvv: Default of 0 = no radical, 1 = singlet, 2 = doublet, 3 = triplet . When
1069 #    present, this property supersedes all charge and radical values in the atom block,
1070 #    forcing a 0 (zero) charge and radical on all atoms not listed in an M  CHG or
1071 #    M  RAD line.
1072 #
1073 sub ParseCmpdRadicalPropertyLine {
1074   my($Line) = @_;
1075 
1076   return _ParseCmpdGenericPropertyLine('Radical', $Line);
1077 }
1078 
1079 # Map MDL radical stereo value used in SD and MOL files to internal spin multiplicity values used by MayaChemTools...
1080 #
1081 sub MDLRadicalToInternalSpinMultiplicity {
1082   my($MDLRadical) = @_;
1083   my($InternalSpinMultiplicity);
1084 
1085   $InternalSpinMultiplicity = '';
1086 
1087   SPINMULTIPLICITY: {
1088     if ($MDLRadical == 0) { $InternalSpinMultiplicity = 0; last SPINMULTIPLICITY;}
1089     if ($MDLRadical == 1) { $InternalSpinMultiplicity = 1; last SPINMULTIPLICITY;}
1090     if ($MDLRadical == 2) { $InternalSpinMultiplicity = 2; last SPINMULTIPLICITY;}
1091     if ($MDLRadical == 3) { $InternalSpinMultiplicity = 3; last SPINMULTIPLICITY;}
1092     $InternalSpinMultiplicity = '';
1093     carp "Warning: MDLRadicalToInternalSpinMultiplicity: MDL radical value, $MDLRadical, specifed on line M  RAD is not supported...";
1094   }
1095   return $InternalSpinMultiplicity;
1096 }
1097 
1098 # Map internal spin multiplicity values used by MayaChemTools to MDL radical stereo value used in SD and MOL files...
1099 #
1100 sub InternalSpinMultiplicityToMDLRadical {
1101   my($InternalSpinMultiplicity) = @_;
1102   my($MDLRadical);
1103 
1104   $MDLRadical = 0;
1105 
1106   SPINMULTIPLICITY: {
1107     if ($InternalSpinMultiplicity == 1) { $MDLRadical = 1; last SPINMULTIPLICITY;}
1108     if ($InternalSpinMultiplicity == 2) { $MDLRadical = 2; last SPINMULTIPLICITY;}
1109     if ($InternalSpinMultiplicity == 3) { $MDLRadical = 3; last SPINMULTIPLICITY;}
1110     $MDLRadical = 0;
1111   }
1112   return $MDLRadical;
1113 }
1114 
1115 # Process generic CTAB property line...
1116 sub _ParseCmpdGenericPropertyLine {
1117   my($PropertyName, $Line) = @_;
1118 
1119   my($Label, $PropertyLabel, $ValuesCount, $ValuePairsCount, @ValuePairs);
1120 
1121   @ValuePairs = ();
1122   ($Label, $PropertyLabel, $ValuesCount, @ValuePairs) = split(' ', $Line);
1123   $ValuePairsCount = (scalar @ValuePairs)/2;
1124   if ($ValuesCount != $ValuePairsCount) {
1125     carp "Warning: _ParseCmpdGenericPropertyLine: Number of atom number and $PropertyName value paris specified on $Label $PropertyLabel property line, $ValuePairsCount, does not match expected value of $ValuesCount...";
1126   }
1127 
1128   return (@ValuePairs);
1129 }
1130 
1131 # Generic CTAB property lines for charge, istope and radical properties...
1132 #
1133 sub _GenerateCmpdGenericPropertyLines {
1134   my($PropertyName, $PropertyValuePairsRef) = @_;
1135   my($Index, $PropertyLabel, $Line, $PropertyCount, $AtomNum, $PropertyValue, @PropertyLines);
1136 
1137   @PropertyLines = ();
1138   NAME: {
1139     if ($PropertyName =~ /^Charge$/i) { $PropertyLabel = "M  CHG"; last NAME; }
1140     if ($PropertyName =~ /^Isotope$/i) { $PropertyLabel = "M  ISO"; last NAME; }
1141     if ($PropertyName =~ /^Radical$/i) { $PropertyLabel = "M  RAD"; last NAME; }
1142     carp "Warning: _GenerateCmpdGenericPropertyLines: Unknown property name, $PropertyName, specified...";
1143     return @PropertyLines;
1144   }
1145 
1146   # A maximum of 8 property pair values allowed per line...
1147   $PropertyCount = 0;
1148   $Line = '';
1149   for ($Index = 0; $Index < $#{$PropertyValuePairsRef}; $Index += 2) {
1150     if ($PropertyCount > 8) {
1151       # Setup property line...
1152       $Line = "${PropertyLabel}  8${Line}";
1153       push @PropertyLines, $Line;
1154 
1155       $PropertyCount = 0;
1156       $Line = '';
1157     }
1158     $PropertyCount++;
1159     $AtomNum = $PropertyValuePairsRef->[$Index];
1160     $PropertyValue = $PropertyValuePairsRef->[$Index + 1];
1161     $Line .= sprintf " %3i %3i", $AtomNum, $PropertyValue;
1162   }
1163   if ($Line) {
1164     $Line = "${PropertyLabel}  ${PropertyCount}${Line}";
1165     push @PropertyLines, $Line;
1166   }
1167   return @PropertyLines;
1168 }
1169 
1170 #
1171 # Read compound data into a string and return its value
1172 sub ReadCmpdString {
1173   my($SDFileRef) = @_;
1174   my($CmpdString);
1175 
1176   $CmpdString = "";
1177   LINE: while (defined($_ = <$SDFileRef>)) {
1178     # Change Windows and Mac new line char to UNIX...
1179     s/(\r\n)|(\r)/\n/g;
1180 
1181     if (/^\$\$\$\$/) {
1182       # Take out any new line char at the end by explicitly removing it instead of using
1183       # chomp, which might not always work correctly on files generated on a system
1184       # with a value of input line separator different from the current system...
1185       s/\n$//g;
1186 
1187       # Doesn't hurt to chomp...
1188       chomp;
1189 
1190       $CmpdString .=  $_;
1191       last LINE;
1192     }
1193     else {
1194       $CmpdString .=  $_;
1195     }
1196   }
1197   return $CmpdString;
1198 }
1199 
1200 # Find out the number of fragements in the compounds. And for the compound with
1201 # more than one fragment, remove all the others besides the largest one.
1202 sub WashCmpd {
1203   my($CmpdLines) = @_;
1204   my($WashedCmpdString, $FragmentCount, $Fragments);
1205 
1206   $WashedCmpdString = "";
1207   ($FragmentCount, $Fragments) = GetCmpdFragments($CmpdLines);
1208   if ($FragmentCount > 1) {
1209     # Go over the compound data for the largest fragment including property
1210     # data...
1211     my (@AllFragments, @LargestFragment, %LargestFragmentAtoms, @WashedCmpdLines, $Index, $LineIndex, $AtomCount, $BondCount, $NewAtomCount, $NewBondCount, $FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo, $FirstNewAtomNum, $SecondNewAtomNum, $AtomNum, $ChiralFlag, $BondLine, $MENDLineIndex, $Line, $Value, @ValuePairs, @NewValuePairs, $ValuePairIndex, $NewAtomNum, @NewPropertyLines);
1212 
1213     @AllFragments = (); @LargestFragment = ();
1214     %LargestFragmentAtoms = ();
1215     @AllFragments = split "\n", $Fragments;
1216     @LargestFragment = split " ", $AllFragments[0];
1217     for $Index (0 .. $#LargestFragment) {
1218       # Map old atom numbers to new atom numbers as the fragment atom numbers are sorted
1219       # from lowest to highest old atom numbers...
1220       $LargestFragmentAtoms{$LargestFragment[$Index]} = $Index + 1;
1221     }
1222     @WashedCmpdLines = ();
1223     push @WashedCmpdLines, @$CmpdLines[0], @$CmpdLines[1], @$CmpdLines[2], @$CmpdLines[3];
1224     ($AtomCount, $BondCount, $ChiralFlag) = ParseCmpdCountsLine(@$CmpdLines[3]);
1225     $NewAtomCount = @LargestFragment;
1226     $NewBondCount = 0;
1227     $AtomNum = 0;
1228     # Retrieve the largest fragment atom lines...
1229     for ($LineIndex = 4; $LineIndex < (4 + $AtomCount); $LineIndex++) {
1230       $AtomNum++;
1231       if ($LargestFragmentAtoms{$AtomNum}) {
1232         push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1233       }
1234     }
1235     # Retrieve the largest fragment bond lines...
1236     for ($LineIndex = 4 + $AtomCount; $LineIndex < (4 + $AtomCount + $BondCount); $LineIndex++) {
1237       ($FirstAtomNum, $SecondAtomNum, $BondType, $BondStereo) = ParseCmpdBondLine(@$CmpdLines[$LineIndex]);
1238       if ($LargestFragmentAtoms{$FirstAtomNum} && $LargestFragmentAtoms{$SecondAtomNum}) {
1239         $NewBondCount++;
1240         # Set up bond line with new atom number mapping...
1241         $FirstNewAtomNum =  $LargestFragmentAtoms{$FirstAtomNum};
1242         $SecondNewAtomNum =  $LargestFragmentAtoms{$SecondAtomNum};
1243         $BondLine = GenerateCmpdBondLine($FirstNewAtomNum, $SecondNewAtomNum, $BondType, $BondStereo);
1244         push @WashedCmpdLines, $BondLine;
1245       }
1246     }
1247     # Get property lines for CHG, ISO and RAD label and map the old atom numbers to new
1248     # atom numners; Others, property lines before M  END line are skipped as atom numbers for
1249     # other properties might not valid anymore...
1250     #
1251     $MENDLineIndex = $LineIndex;
1252     LINE: for ($LineIndex = (4 + $AtomCount + $BondCount); $LineIndex < @$CmpdLines; $LineIndex++) {
1253       $Line = @$CmpdLines[$LineIndex];
1254       if ($Line =~ /^M  END/i) {
1255         push @WashedCmpdLines, "M  END";
1256         $MENDLineIndex = $LineIndex;
1257         last LINE;
1258       }
1259 
1260       @ValuePairs = ();
1261       if ($Line =~ /^M  CHG/i) {
1262         @ValuePairs = ParseCmpdChargePropertyLine($Line);
1263       }
1264       elsif ($Line =~ /^M  RAD/i) {
1265         @ValuePairs = ParseCmpdRadicalPropertyLine($Line);
1266       }
1267       elsif ($Line =~ /^M  ISO/i) {
1268         @ValuePairs = ParseCmpdIsotopePropertyLine($Line);
1269       }
1270       elsif ($Line =~ /^A  /i) {
1271         my($NextLine);
1272         $LineIndex++;
1273         $NextLine = @$CmpdLines[$LineIndex];
1274         @ValuePairs = ParseCmpdAtomAliasPropertyLine($Line, $NextLine);
1275       }
1276       else {
1277         next LINE;
1278       }
1279 
1280       if (!@ValuePairs) {
1281         next LINE;
1282       }
1283 
1284       # Collect values for valid atom numbers with mapping to new atom numbers...
1285       @NewValuePairs = ();
1286       VALUEINDEX: for ($ValuePairIndex = 0; $ValuePairIndex < $#ValuePairs; $ValuePairIndex += 2) {
1287         $AtomNum = $ValuePairs[$ValuePairIndex]; $Value = $ValuePairs[$ValuePairIndex + 1];
1288         if (!exists $LargestFragmentAtoms{$AtomNum}) {
1289           next VALUEINDEX;
1290         }
1291         $NewAtomNum = $LargestFragmentAtoms{$AtomNum};
1292         push @NewValuePairs, ($NewAtomNum, $Value)
1293       }
1294       if (!@NewValuePairs) {
1295         next LINE;
1296       }
1297       @NewPropertyLines = ();
1298       if ($Line =~ /^M  CHG/i) {
1299         @NewPropertyLines = GenerateCmpdChargePropertyLines(\@NewValuePairs);
1300       }
1301       elsif ($Line =~ /^M  RAD/i) {
1302         @NewPropertyLines = GenerateCmpdRadicalPropertyLines(\@NewValuePairs);
1303       }
1304       elsif ($Line =~ /^M  ISO/i) {
1305         @NewPropertyLines = GenerateCmpdIsotopePropertyLines(\@NewValuePairs);
1306       }
1307       elsif ($Line =~ /^A  /i) {
1308         @NewPropertyLines = GenerateCmpdAtomAliasPropertyLines(\@NewValuePairs);
1309       }
1310       push @WashedCmpdLines, @NewPropertyLines;
1311     }
1312 
1313     # Retrieve rest of the data label and value property data...
1314     for ($LineIndex = (1 + $MENDLineIndex); $LineIndex < @$CmpdLines; $LineIndex++) {
1315       push @WashedCmpdLines, @$CmpdLines[$LineIndex];
1316     }
1317     # Update atom and bond count line...
1318     $WashedCmpdLines[3] = GenerateCmpdCountsLine($NewAtomCount, $NewBondCount, $ChiralFlag);
1319 
1320     $WashedCmpdString = join "\n", @WashedCmpdLines;
1321   }
1322   return ($FragmentCount, $Fragments, $WashedCmpdString);
1323 }
1324