1 package MolecularFormula; 2 # 3 # $RCSfile: MolecularFormula.pm,v $ 4 # $Date: 2015/02/28 20:47:18 $ 5 # $Revision: 1.25 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use Carp; 31 use Text::ParseWords; 32 use TextUtil; 33 use PeriodicTable; 34 35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS); 36 37 @ISA = qw(Exporter); 38 @EXPORT = qw(); 39 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula); 40 41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]); 42 43 # 44 # Calculate molecular weight assuming its a valid molecular formula... 45 # 46 sub CalculateMolecularWeight { 47 my($MolecularFormula) = @_; 48 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef); 49 50 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 51 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 52 return undef; 53 } 54 55 $MolecularWeight = 0; 56 57 for $Index (0 .. $#{$FormulaElementsRef}) { 58 $ElementSymbol = $FormulaElementsRef->[$Index]; 59 $ElementCount = $FormulaElementCountRef->[$Index]; 60 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 61 $MolecularWeight += $AtomicWeight * $ElementCount; 62 } 63 return $MolecularWeight; 64 } 65 66 # 67 # Calculate exact mass assuming it's a valid formula... 68 # 69 sub CalculateExactMass { 70 my($MolecularFormula) = @_; 71 my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef); 72 73 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 74 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) { 75 return undef; 76 } 77 $ExactMass = 0; 78 79 for $Index (0 .. $#{$FormulaElementsRef}) { 80 $ElementSymbol = $FormulaElementsRef->[$Index]; 81 $ElementCount = $FormulaElementCountRef->[$Index]; 82 $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol); 83 if (!defined($RelativeAtomicMass)) { 84 next ELEMENT; 85 } 86 $ExactMass += $RelativeAtomicMass * $ElementCount; 87 } 88 return $ExactMass; 89 } 90 91 92 # 93 # Calculate elemental composition and return reference to arrays 94 # containing elements and their percent composition... 95 # 96 sub CalculateElementalComposition { 97 my($MolecularFormula) = @_; 98 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition); 99 100 $MolecularWeight = CalculateMolecularWeight($MolecularFormula); 101 if (! defined $MolecularWeight) { 102 return (undef, undef); 103 } 104 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula); 105 106 @FormulaElements = (); 107 @FormulaElementComposition = (); 108 109 if (!$MolecularWeight) { 110 return ( \@FormulaElements, \@FormulaElementComposition); 111 } 112 113 $CompositionMultiplier = 100 / $MolecularWeight; 114 115 for $Index (0 .. $#{$FormulaElementsRef}) { 116 $ElementSymbol = $FormulaElementsRef->[$Index]; 117 $ElementCount = $FormulaElementCountRef->[$Index]; 118 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol); 119 $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier; 120 121 push @FormulaElements, $ElementSymbol; 122 push @FormulaElementComposition, $Composition; 123 } 124 125 return ( \@FormulaElements, \@FormulaElementComposition); 126 } 127 128 # Using refernece to element and its composition arrays, format composition information 129 # as: Element: Composition;... 130 # 131 sub FormatCompositionInfomation { 132 my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition); 133 134 $Precision = 2; 135 if (@_ == 3) { 136 ($ElementsRef, $ElementCompositionRef, $Precision) = @_; 137 } 138 else { 139 ($ElementsRef, $ElementCompositionRef) = @_; 140 } 141 142 $Composition = ''; 143 for $Index (0 .. $#{$ElementsRef}) { 144 $ElementSymbol = $ElementsRef->[$Index]; 145 $ElementComposition = $ElementCompositionRef->[$Index]; 146 $ElementComposition = sprintf("%.${Precision}f", $ElementComposition); 147 148 $Composition .= ($Composition) ? '; ' : ''; 149 $Composition .= "${ElementSymbol}: ${ElementComposition}%"; 150 } 151 152 return $Composition; 153 } 154 155 # 156 # Get elements and their count... 157 # 158 sub GetElementsAndCount { 159 my($MolecularFormula) = @_; 160 my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 161 162 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 163 164 return ($FormulaElementsRef, $FormulaElementCountRef); 165 } 166 167 # 168 # Is it a valid molecular formula? 169 # 170 sub IsMolecularFormula { 171 my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg); 172 173 ($MolecularFormula) = @_; 174 175 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula); 176 $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0; 177 178 return (wantarray ? ($Status, $ErrorMsg) : $Status); 179 } 180 181 # 182 # Process molecular formula. For a valid formula, return references to arrays conatining elements 183 # and element count; otherwsie, return undef. 184 # 185 sub _ProcessMolecularFormula { 186 my($MolecularFormula) = @_; 187 my($ErrorMsg) = ''; 188 189 $MolecularFormula = _CleanUpFormula($MolecularFormula); 190 191 # Make sure it only contains numbers and letters... 192 if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) { 193 $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9'; 194 return (undef, undef, $ErrorMsg); 195 } 196 197 # Parse the formula... 198 my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount, @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap); 199 200 @FormulaElements = (); @ElementCount = (); 201 %FormulaElementsToCountMap = (); 202 203 # Setup element symbol and count regular expression... 204 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf 205 # 206 207 $FormulaElementSpec = qr/ 208 \G( # $1 209 (?: 210 ([A-Z][a-z]?) # Two or one letter element symbol; $2 211 ([0-9]*) # Optionally followed by element count; $3 212 ) 213 | \( | \[ 214 | \)[0-9]* | \][0-9]* 215 | . 216 ) 217 /x; 218 219 my($ProcessingParenthesis); 220 $ProcessingParenthesis = 0; 221 # Go over the formula... 222 FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) { 223 ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3); 224 225 # Handle parenthesis in formula to indicate repeating units... 226 if ($Spec =~ /^(\(|\[)/) { 227 if ($ProcessingParenthesis) { 228 $ErrorMsg = "Molecular formula contains multiple level of () or []"; 229 return (undef, undef, $ErrorMsg); 230 } 231 $ProcessingParenthesis = 1; 232 @SubFormulaElements = (); 233 %SubFormulaElementsToCountMap = (); 234 next FORMULA; 235 } 236 elsif ($Spec =~ /^(\)|\])/) { 237 $ProcessingParenthesis = 0; 238 239 # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap; 240 my($RepeatCount, $Symbol, $Count); 241 $RepeatCount = $Spec; 242 $RepeatCount =~ s/(\)|\])//g; 243 if (!$RepeatCount) { 244 $RepeatCount = 1; 245 } 246 # Copy data... 247 for $Symbol (@SubFormulaElements) { 248 $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount; 249 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count); 250 } 251 252 # Get ready again... 253 @SubFormulaElements = (); 254 %SubFormulaElementsToCountMap = (); 255 256 next FORMULA; 257 } 258 259 # Retrieve element symbol and count... 260 $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : ''); 261 $ElementCount = $ElementCount ? $ElementCount : 1; 262 if (!PeriodicTable::IsElement($ElementSymbol)) { 263 $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol"; 264 return (undef, undef, $ErrorMsg); 265 } 266 267 if ($ProcessingParenthesis) { 268 _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount); 269 } 270 else { 271 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount); 272 } 273 } 274 275 # Setup element count array... 276 for $ElementSymbol (@FormulaElements) { 277 $ElementCount = $FormulaElementsToCountMap{$ElementSymbol}; 278 push @ElementCount, $ElementCount; 279 } 280 281 # Make sure it all adds up to 100%; otherwise, adjust the last value.. 282 283 return (\@FormulaElements, \@ElementCount, $ErrorMsg); 284 } 285 286 # Clean it up... 287 sub _CleanUpFormula { 288 my($MolecularFormula) = @_; 289 #Take out any spaces... 290 $MolecularFormula =~ s/ //g; 291 292 # Eliminate any charge specifications: +, - or [1-9]+[+-] 293 # e.g NO+ [Al(H2O)6]3+ [H2NO3]+ 294 if ($MolecularFormula =~ /[\+\-]/) { 295 if ($MolecularFormula =~ /\][0-9]+[\+\-]/) { 296 # Bracket followed optionally by number and then, +/- ... 297 # [Al(H2O)6]3+ ... 298 $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g; 299 } 300 elsif ($MolecularFormula =~ /[\+\-][0-9]*/) { 301 # +/- followed optionally by a number... 302 # C37H42N2O6+2, Cu+ 303 $MolecularFormula =~ s/[\+\-][0-9]*//g; 304 } 305 } 306 307 # Eliminate any brackets - ] or ) - not followed by numbers: 308 # e.g. Li[H2PO4] 309 if ($MolecularFormula !~ /\][0-9]+/) { 310 $MolecularFormula =~ s/[\[\]]//g; 311 } 312 if ($MolecularFormula !~ /\)[0-9]+/) { 313 $MolecularFormula =~ s/[\(\)]//g; 314 } 315 # Change adducts to parenthesis format... 316 # Na2CO3.10H2O -> Na2CO3(H2O)10 317 # 3CdSO4.8H2O -> (CdSO4)3(H2O)8 318 if ($MolecularFormula =~ /\./) { 319 my($SubFormula, $Count, $Spec); 320 my(@MolecularFormulaSplits) = split /\./, $MolecularFormula; 321 $MolecularFormula = ''; 322 for $SubFormula (@MolecularFormulaSplits) { 323 ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/; 324 if ($Count) { 325 $MolecularFormula .= "(${Spec})${Count}"; 326 } 327 else { 328 $MolecularFormula .= $Spec; 329 } 330 } 331 } 332 333 return $MolecularFormula; 334 } 335 336 # Store the element and count... 337 sub _SetupFormulaElementData { 338 my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_; 339 340 if (exists $ElementsToCountMapRef->{$Element}) { 341 $ElementsToCountMapRef->{$Element} += $Count; 342 } 343 else { 344 push @{$ElementsRef}, $Element; 345 $ElementsToCountMapRef->{$Element} = $Count; 346 } 347 } 348