MayaChemTools

   1 package MolecularFormula;
   2 #
   3 # $RCSfile: MolecularFormula.pm,v $
   4 # $Date: 2015/02/28 20:47:18 $
   5 # $Revision: 1.25 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use Carp;
  31 use Text::ParseWords;
  32 use TextUtil;
  33 use PeriodicTable;
  34 
  35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  36 
  37 @ISA = qw(Exporter);
  38 @EXPORT = qw();
  39 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula);
  40 
  41 %EXPORT_TAGS = (all  => [@EXPORT, @EXPORT_OK]);
  42 
  43 #
  44 # Calculate molecular weight assuming its a valid molecular formula...
  45 #
  46 sub CalculateMolecularWeight {
  47   my($MolecularFormula) = @_;
  48   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef);
  49 
  50   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  51   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  52     return undef;
  53   }
  54 
  55   $MolecularWeight = 0;
  56 
  57   for $Index (0 .. $#{$FormulaElementsRef}) {
  58     $ElementSymbol = $FormulaElementsRef->[$Index];
  59     $ElementCount = $FormulaElementCountRef->[$Index];
  60     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
  61     $MolecularWeight += $AtomicWeight * $ElementCount;
  62   }
  63   return $MolecularWeight;
  64 }
  65 
  66 #
  67 # Calculate exact mass assuming it's a valid formula...
  68 #
  69 sub CalculateExactMass {
  70   my($MolecularFormula) = @_;
  71   my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef);
  72 
  73   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
  74   if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
  75     return undef;
  76   }
  77   $ExactMass = 0;
  78 
  79   for $Index (0 .. $#{$FormulaElementsRef}) {
  80     $ElementSymbol = $FormulaElementsRef->[$Index];
  81     $ElementCount = $FormulaElementCountRef->[$Index];
  82     $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol);
  83     if (!defined($RelativeAtomicMass)) {
  84       next ELEMENT;
  85     }
  86     $ExactMass += $RelativeAtomicMass * $ElementCount;
  87   }
  88   return $ExactMass;
  89 }
  90 
  91 
  92 #
  93 # Calculate elemental composition and return reference to arrays
  94 # containing elements and their percent composition...
  95 #
  96 sub CalculateElementalComposition {
  97   my($MolecularFormula) = @_;
  98   my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition);
  99 
 100   $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
 101   if (! defined $MolecularWeight) {
 102     return (undef, undef);
 103   }
 104   ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
 105 
 106   @FormulaElements = ();
 107   @FormulaElementComposition = ();
 108 
 109   if (!$MolecularWeight) {
 110     return ( \@FormulaElements, \@FormulaElementComposition);
 111   }
 112 
 113   $CompositionMultiplier = 100 / $MolecularWeight;
 114 
 115   for $Index (0 .. $#{$FormulaElementsRef}) {
 116     $ElementSymbol = $FormulaElementsRef->[$Index];
 117     $ElementCount = $FormulaElementCountRef->[$Index];
 118     $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
 119     $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier;
 120 
 121     push @FormulaElements, $ElementSymbol;
 122     push @FormulaElementComposition, $Composition;
 123   }
 124 
 125   return ( \@FormulaElements, \@FormulaElementComposition);
 126 }
 127 
 128 # Using refernece to element and its composition arrays, format composition information
 129 # as: Element: Composition;...
 130 #
 131 sub FormatCompositionInfomation {
 132   my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition);
 133 
 134   $Precision = 2;
 135   if (@_ == 3) {
 136     ($ElementsRef, $ElementCompositionRef, $Precision) = @_;
 137   }
 138   else {
 139     ($ElementsRef, $ElementCompositionRef) = @_;
 140   }
 141 
 142   $Composition = '';
 143   for $Index (0 .. $#{$ElementsRef}) {
 144     $ElementSymbol = $ElementsRef->[$Index];
 145     $ElementComposition = $ElementCompositionRef->[$Index];
 146     $ElementComposition = sprintf("%.${Precision}f", $ElementComposition);
 147 
 148     $Composition .= ($Composition) ? '; ' : '';
 149     $Composition .=  "${ElementSymbol}: ${ElementComposition}%";
 150   }
 151 
 152   return $Composition;
 153 }
 154 
 155 #
 156 # Get elements and their count...
 157 #
 158 sub GetElementsAndCount {
 159   my($MolecularFormula) = @_;
 160   my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 161 
 162   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 163 
 164   return ($FormulaElementsRef, $FormulaElementCountRef);
 165 }
 166 
 167 #
 168 # Is it a valid molecular formula?
 169 #
 170 sub IsMolecularFormula {
 171   my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
 172 
 173   ($MolecularFormula) = @_;
 174 
 175   ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
 176   $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0;
 177 
 178   return (wantarray ? ($Status, $ErrorMsg) : $Status);
 179 }
 180 
 181 #
 182 # Process molecular formula. For a valid formula, return references to arrays conatining elements
 183 # and element count; otherwsie, return undef.
 184 #
 185 sub _ProcessMolecularFormula {
 186   my($MolecularFormula) = @_;
 187   my($ErrorMsg) = '';
 188 
 189   $MolecularFormula = _CleanUpFormula($MolecularFormula);
 190 
 191   # Make sure it only contains numbers and letters...
 192   if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) {
 193     $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9';
 194     return (undef, undef, $ErrorMsg);
 195   }
 196 
 197   # Parse the formula...
 198   my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount,  @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap);
 199 
 200   @FormulaElements = (); @ElementCount = ();
 201   %FormulaElementsToCountMap = ();
 202 
 203 # Setup element symbol and count regular expression...
 204 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf
 205 #
 206 
 207   $FormulaElementSpec = qr/
 208                                       \G(                         # $1
 209                                                   (?:
 210                                                       ([A-Z][a-z]?)   # Two or one letter element symbol; $2
 211                                                       ([0-9]*)          # Optionally followed by element count; $3
 212                                                   )
 213                                                   | \( | \[
 214                                                   | \)[0-9]* | \][0-9]*
 215                                                   | .
 216                                             )
 217                                       /x;
 218 
 219   my($ProcessingParenthesis);
 220   $ProcessingParenthesis = 0;
 221   # Go over the formula...
 222   FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) {
 223     ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3);
 224 
 225     # Handle parenthesis in formula to indicate repeating units...
 226     if ($Spec =~ /^(\(|\[)/) {
 227       if ($ProcessingParenthesis) {
 228         $ErrorMsg = "Molecular formula contains multiple level of () or []";
 229         return (undef, undef, $ErrorMsg);
 230       }
 231       $ProcessingParenthesis = 1;
 232       @SubFormulaElements = ();
 233       %SubFormulaElementsToCountMap = ();
 234       next FORMULA;
 235     }
 236     elsif ($Spec =~ /^(\)|\])/) {
 237       $ProcessingParenthesis = 0;
 238 
 239       # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap;
 240       my($RepeatCount, $Symbol, $Count);
 241       $RepeatCount = $Spec;
 242       $RepeatCount =~  s/(\)|\])//g;
 243       if (!$RepeatCount) {
 244         $RepeatCount = 1;
 245       }
 246       # Copy data...
 247       for $Symbol (@SubFormulaElements) {
 248         $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount;
 249         _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count);
 250       }
 251 
 252       # Get ready again...
 253       @SubFormulaElements = ();
 254       %SubFormulaElementsToCountMap = ();
 255 
 256       next FORMULA;
 257     }
 258 
 259     # Retrieve element symbol and count...
 260     $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : '');
 261     $ElementCount = $ElementCount ? $ElementCount : 1;
 262     if (!PeriodicTable::IsElement($ElementSymbol)) {
 263       $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol";
 264       return (undef, undef, $ErrorMsg);
 265     }
 266 
 267     if ($ProcessingParenthesis) {
 268       _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 269     }
 270     else {
 271       _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount);
 272     }
 273   }
 274 
 275   # Setup element count array...
 276   for $ElementSymbol (@FormulaElements) {
 277     $ElementCount = $FormulaElementsToCountMap{$ElementSymbol};
 278     push @ElementCount, $ElementCount;
 279   }
 280 
 281   # Make sure it all adds up to 100%; otherwise, adjust the last value..
 282 
 283   return (\@FormulaElements, \@ElementCount, $ErrorMsg);
 284 }
 285 
 286 # Clean it up...
 287 sub _CleanUpFormula {
 288   my($MolecularFormula) = @_;
 289   #Take out any spaces...
 290   $MolecularFormula =~ s/ //g;
 291 
 292   # Eliminate any charge specifications: +, - or [1-9]+[+-]
 293   # e.g NO+ [Al(H2O)6]3+ [H2NO3]+
 294   if ($MolecularFormula =~ /[\+\-]/) {
 295     if ($MolecularFormula =~ /\][0-9]+[\+\-]/) {
 296       # Bracket followed optionally by number and then, +/- ...
 297       # [Al(H2O)6]3+ ...
 298       $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g;
 299     }
 300     elsif ($MolecularFormula =~ /[\+\-][0-9]*/) {
 301       # +/- followed optionally by a number...
 302       # C37H42N2O6+2, Cu+
 303       $MolecularFormula =~ s/[\+\-][0-9]*//g;
 304     }
 305   }
 306 
 307   # Eliminate any brackets - ] or ) - not followed by numbers:
 308   # e.g. Li[H2PO4]
 309   if ($MolecularFormula !~ /\][0-9]+/) {
 310     $MolecularFormula =~ s/[\[\]]//g;
 311   }
 312   if ($MolecularFormula !~ /\)[0-9]+/) {
 313     $MolecularFormula =~ s/[\(\)]//g;
 314   }
 315   # Change adducts to parenthesis format...
 316   # Na2CO3.10H2O -> Na2CO3(H2O)10
 317   # 3CdSO4.8H2O -> (CdSO4)3(H2O)8
 318   if ($MolecularFormula =~ /\./) {
 319     my($SubFormula, $Count, $Spec);
 320     my(@MolecularFormulaSplits) = split /\./, $MolecularFormula;
 321     $MolecularFormula = '';
 322     for $SubFormula (@MolecularFormulaSplits) {
 323       ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/;
 324       if ($Count) {
 325         $MolecularFormula .= "(${Spec})${Count}";
 326       }
 327       else {
 328         $MolecularFormula .= $Spec;
 329       }
 330     }
 331   }
 332 
 333   return $MolecularFormula;
 334 }
 335 
 336 # Store the element and count...
 337 sub _SetupFormulaElementData {
 338   my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_;
 339 
 340   if (exists $ElementsToCountMapRef->{$Element}) {
 341     $ElementsToCountMapRef->{$Element} += $Count;
 342   }
 343   else {
 344     push @{$ElementsRef}, $Element;
 345     $ElementsToCountMapRef->{$Element} = $Count;
 346   }
 347 }
 348