0
|
1 package MolecularFormula;
|
|
2 #
|
|
3 # $RCSfile: MolecularFormula.pm,v $
|
|
4 # $Date: 2015/02/28 20:47:18 $
|
|
5 # $Revision: 1.25 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability of fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use Carp;
|
|
31 use Text::ParseWords;
|
|
32 use TextUtil;
|
|
33 use PeriodicTable;
|
|
34
|
|
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
|
|
36
|
|
37 @ISA = qw(Exporter);
|
|
38 @EXPORT = qw();
|
|
39 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula);
|
|
40
|
|
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
|
|
42
|
|
43 #
|
|
44 # Calculate molecular weight assuming its a valid molecular formula...
|
|
45 #
|
|
46 sub CalculateMolecularWeight {
|
|
47 my($MolecularFormula) = @_;
|
|
48 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $FormulaElementsRef, $FormulaElementCountRef);
|
|
49
|
|
50 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
|
|
51 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
|
|
52 return undef;
|
|
53 }
|
|
54
|
|
55 $MolecularWeight = 0;
|
|
56
|
|
57 for $Index (0 .. $#{$FormulaElementsRef}) {
|
|
58 $ElementSymbol = $FormulaElementsRef->[$Index];
|
|
59 $ElementCount = $FormulaElementCountRef->[$Index];
|
|
60 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
|
|
61 $MolecularWeight += $AtomicWeight * $ElementCount;
|
|
62 }
|
|
63 return $MolecularWeight;
|
|
64 }
|
|
65
|
|
66 #
|
|
67 # Calculate exact mass assuming it's a valid formula...
|
|
68 #
|
|
69 sub CalculateExactMass {
|
|
70 my($MolecularFormula) = @_;
|
|
71 my($Index, $ElementSymbol, $ElementCount, $ExactMass, $RelativeAtomicMass, $FormulaElementsRef, $FormulaElementCountRef);
|
|
72
|
|
73 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
|
|
74 if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
|
|
75 return undef;
|
|
76 }
|
|
77 $ExactMass = 0;
|
|
78
|
|
79 for $Index (0 .. $#{$FormulaElementsRef}) {
|
|
80 $ElementSymbol = $FormulaElementsRef->[$Index];
|
|
81 $ElementCount = $FormulaElementCountRef->[$Index];
|
|
82 $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol);
|
|
83 if (!defined($RelativeAtomicMass)) {
|
|
84 next ELEMENT;
|
|
85 }
|
|
86 $ExactMass += $RelativeAtomicMass * $ElementCount;
|
|
87 }
|
|
88 return $ExactMass;
|
|
89 }
|
|
90
|
|
91
|
|
92 #
|
|
93 # Calculate elemental composition and return reference to arrays
|
|
94 # containing elements and their percent composition...
|
|
95 #
|
|
96 sub CalculateElementalComposition {
|
|
97 my($MolecularFormula) = @_;
|
|
98 my($Index, $MolecularWeight, $ElementSymbol, $ElementCount, $AtomicWeight, $Composition, $CompositionMultiplier, $FormulaElementsRef, $FormulaElementCountRef, @FormulaElements, @FormulaElementComposition);
|
|
99
|
|
100 $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
|
|
101 if (! defined $MolecularWeight) {
|
|
102 return (undef, undef);
|
|
103 }
|
|
104 ($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
|
|
105
|
|
106 @FormulaElements = ();
|
|
107 @FormulaElementComposition = ();
|
|
108
|
|
109 if (!$MolecularWeight) {
|
|
110 return ( \@FormulaElements, \@FormulaElementComposition);
|
|
111 }
|
|
112
|
|
113 $CompositionMultiplier = 100 / $MolecularWeight;
|
|
114
|
|
115 for $Index (0 .. $#{$FormulaElementsRef}) {
|
|
116 $ElementSymbol = $FormulaElementsRef->[$Index];
|
|
117 $ElementCount = $FormulaElementCountRef->[$Index];
|
|
118 $AtomicWeight = PeriodicTable::GetElementAtomicWeight($ElementSymbol);
|
|
119 $Composition = ($AtomicWeight * $ElementCount) * $CompositionMultiplier;
|
|
120
|
|
121 push @FormulaElements, $ElementSymbol;
|
|
122 push @FormulaElementComposition, $Composition;
|
|
123 }
|
|
124
|
|
125 return ( \@FormulaElements, \@FormulaElementComposition);
|
|
126 }
|
|
127
|
|
128 # Using refernece to element and its composition arrays, format composition information
|
|
129 # as: Element: Composition;...
|
|
130 #
|
|
131 sub FormatCompositionInfomation {
|
|
132 my($Index, $ElementSymbol, $ElementComposition, $ElementsRef, $ElementCompositionRef, $Precision, $Composition);
|
|
133
|
|
134 $Precision = 2;
|
|
135 if (@_ == 3) {
|
|
136 ($ElementsRef, $ElementCompositionRef, $Precision) = @_;
|
|
137 }
|
|
138 else {
|
|
139 ($ElementsRef, $ElementCompositionRef) = @_;
|
|
140 }
|
|
141
|
|
142 $Composition = '';
|
|
143 for $Index (0 .. $#{$ElementsRef}) {
|
|
144 $ElementSymbol = $ElementsRef->[$Index];
|
|
145 $ElementComposition = $ElementCompositionRef->[$Index];
|
|
146 $ElementComposition = sprintf("%.${Precision}f", $ElementComposition);
|
|
147
|
|
148 $Composition .= ($Composition) ? '; ' : '';
|
|
149 $Composition .= "${ElementSymbol}: ${ElementComposition}%";
|
|
150 }
|
|
151
|
|
152 return $Composition;
|
|
153 }
|
|
154
|
|
155 #
|
|
156 # Get elements and their count...
|
|
157 #
|
|
158 sub GetElementsAndCount {
|
|
159 my($MolecularFormula) = @_;
|
|
160 my($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
|
|
161
|
|
162 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
|
|
163
|
|
164 return ($FormulaElementsRef, $FormulaElementCountRef);
|
|
165 }
|
|
166
|
|
167 #
|
|
168 # Is it a valid molecular formula?
|
|
169 #
|
|
170 sub IsMolecularFormula {
|
|
171 my($MolecularFormula, $PrintErrorMsg, $Status, $FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg);
|
|
172
|
|
173 ($MolecularFormula) = @_;
|
|
174
|
|
175 ($FormulaElementsRef, $FormulaElementCountRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);
|
|
176 $Status = (defined($FormulaElementsRef) && defined($FormulaElementCountRef)) ? 1 : 0;
|
|
177
|
|
178 return (wantarray ? ($Status, $ErrorMsg) : $Status);
|
|
179 }
|
|
180
|
|
181 #
|
|
182 # Process molecular formula. For a valid formula, return references to arrays conatining elements
|
|
183 # and element count; otherwsie, return undef.
|
|
184 #
|
|
185 sub _ProcessMolecularFormula {
|
|
186 my($MolecularFormula) = @_;
|
|
187 my($ErrorMsg) = '';
|
|
188
|
|
189 $MolecularFormula = _CleanUpFormula($MolecularFormula);
|
|
190
|
|
191 # Make sure it only contains numbers and letters...
|
|
192 if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) {
|
|
193 $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9';
|
|
194 return (undef, undef, $ErrorMsg);
|
|
195 }
|
|
196
|
|
197 # Parse the formula...
|
|
198 my($ElementSpec, $FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount, @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap);
|
|
199
|
|
200 @FormulaElements = (); @ElementCount = ();
|
|
201 %FormulaElementsToCountMap = ();
|
|
202
|
|
203 # Setup element symbol and count regular expression...
|
|
204 # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf
|
|
205 #
|
|
206
|
|
207 $FormulaElementSpec = qr/
|
|
208 \G( # $1
|
|
209 (?:
|
|
210 ([A-Z][a-z]?) # Two or one letter element symbol; $2
|
|
211 ([0-9]*) # Optionally followed by element count; $3
|
|
212 )
|
|
213 | \( | \[
|
|
214 | \)[0-9]* | \][0-9]*
|
|
215 | .
|
|
216 )
|
|
217 /x;
|
|
218
|
|
219 my($ProcessingParenthesis);
|
|
220 $ProcessingParenthesis = 0;
|
|
221 # Go over the formula...
|
|
222 FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) {
|
|
223 ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3);
|
|
224
|
|
225 # Handle parenthesis in formula to indicate repeating units...
|
|
226 if ($Spec =~ /^(\(|\[)/) {
|
|
227 if ($ProcessingParenthesis) {
|
|
228 $ErrorMsg = "Molecular formula contains multiple level of () or []";
|
|
229 return (undef, undef, $ErrorMsg);
|
|
230 }
|
|
231 $ProcessingParenthesis = 1;
|
|
232 @SubFormulaElements = ();
|
|
233 %SubFormulaElementsToCountMap = ();
|
|
234 next FORMULA;
|
|
235 }
|
|
236 elsif ($Spec =~ /^(\)|\])/) {
|
|
237 $ProcessingParenthesis = 0;
|
|
238
|
|
239 # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap;
|
|
240 my($RepeatCount, $Symbol, $Count);
|
|
241 $RepeatCount = $Spec;
|
|
242 $RepeatCount =~ s/(\)|\])//g;
|
|
243 if (!$RepeatCount) {
|
|
244 $RepeatCount = 1;
|
|
245 }
|
|
246 # Copy data...
|
|
247 for $Symbol (@SubFormulaElements) {
|
|
248 $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount;
|
|
249 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count);
|
|
250 }
|
|
251
|
|
252 # Get ready again...
|
|
253 @SubFormulaElements = ();
|
|
254 %SubFormulaElementsToCountMap = ();
|
|
255
|
|
256 next FORMULA;
|
|
257 }
|
|
258
|
|
259 # Retrieve element symbol and count...
|
|
260 $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : '');
|
|
261 $ElementCount = $ElementCount ? $ElementCount : 1;
|
|
262 if (!PeriodicTable::IsElement($ElementSymbol)) {
|
|
263 $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol";
|
|
264 return (undef, undef, $ErrorMsg);
|
|
265 }
|
|
266
|
|
267 if ($ProcessingParenthesis) {
|
|
268 _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount);
|
|
269 }
|
|
270 else {
|
|
271 _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount);
|
|
272 }
|
|
273 }
|
|
274
|
|
275 # Setup element count array...
|
|
276 for $ElementSymbol (@FormulaElements) {
|
|
277 $ElementCount = $FormulaElementsToCountMap{$ElementSymbol};
|
|
278 push @ElementCount, $ElementCount;
|
|
279 }
|
|
280
|
|
281 # Make sure it all adds up to 100%; otherwise, adjust the last value..
|
|
282
|
|
283 return (\@FormulaElements, \@ElementCount, $ErrorMsg);
|
|
284 }
|
|
285
|
|
286 # Clean it up...
|
|
287 sub _CleanUpFormula {
|
|
288 my($MolecularFormula) = @_;
|
|
289 #Take out any spaces...
|
|
290 $MolecularFormula =~ s/ //g;
|
|
291
|
|
292 # Eliminate any charge specifications: +, - or [1-9]+[+-]
|
|
293 # e.g NO+ [Al(H2O)6]3+ [H2NO3]+
|
|
294 if ($MolecularFormula =~ /[\+\-]/) {
|
|
295 if ($MolecularFormula =~ /\][0-9]+[\+\-]/) {
|
|
296 # Bracket followed optionally by number and then, +/- ...
|
|
297 # [Al(H2O)6]3+ ...
|
|
298 $MolecularFormula =~ s/\][0-9]+[\+\-]/\]/g;
|
|
299 }
|
|
300 elsif ($MolecularFormula =~ /[\+\-][0-9]*/) {
|
|
301 # +/- followed optionally by a number...
|
|
302 # C37H42N2O6+2, Cu+
|
|
303 $MolecularFormula =~ s/[\+\-][0-9]*//g;
|
|
304 }
|
|
305 }
|
|
306
|
|
307 # Eliminate any brackets - ] or ) - not followed by numbers:
|
|
308 # e.g. Li[H2PO4]
|
|
309 if ($MolecularFormula !~ /\][0-9]+/) {
|
|
310 $MolecularFormula =~ s/[\[\]]//g;
|
|
311 }
|
|
312 if ($MolecularFormula !~ /\)[0-9]+/) {
|
|
313 $MolecularFormula =~ s/[\(\)]//g;
|
|
314 }
|
|
315 # Change adducts to parenthesis format...
|
|
316 # Na2CO3.10H2O -> Na2CO3(H2O)10
|
|
317 # 3CdSO4.8H2O -> (CdSO4)3(H2O)8
|
|
318 if ($MolecularFormula =~ /\./) {
|
|
319 my($SubFormula, $Count, $Spec);
|
|
320 my(@MolecularFormulaSplits) = split /\./, $MolecularFormula;
|
|
321 $MolecularFormula = '';
|
|
322 for $SubFormula (@MolecularFormulaSplits) {
|
|
323 ($Count, $Spec) = $SubFormula =~ /^([0-9]*)(.*?)$/;
|
|
324 if ($Count) {
|
|
325 $MolecularFormula .= "(${Spec})${Count}";
|
|
326 }
|
|
327 else {
|
|
328 $MolecularFormula .= $Spec;
|
|
329 }
|
|
330 }
|
|
331 }
|
|
332
|
|
333 return $MolecularFormula;
|
|
334 }
|
|
335
|
|
336 # Store the element and count...
|
|
337 sub _SetupFormulaElementData {
|
|
338 my($ElementsRef, $ElementsToCountMapRef, $Element, $Count) = @_;
|
|
339
|
|
340 if (exists $ElementsToCountMapRef->{$Element}) {
|
|
341 $ElementsToCountMapRef->{$Element} += $Count;
|
|
342 }
|
|
343 else {
|
|
344 push @{$ElementsRef}, $Element;
|
|
345 $ElementsToCountMapRef->{$Element} = $Count;
|
|
346 }
|
|
347 }
|
|
348
|
|
349 1;
|
|
350
|
|
351 __END__
|
|
352
|
|
353 =head1 NAME
|
|
354
|
|
355 MolecularFormula
|
|
356
|
|
357 =head1 SYNOPSIS
|
|
358
|
|
359 use MolecularFormula;
|
|
360
|
|
361 use MolecularFormula qw(:all);
|
|
362
|
|
363 =head1 DESCRIPTION
|
|
364
|
|
365 B<MolecularFormula> module provides the following functions:
|
|
366
|
|
367 CalculateElementalComposition, CalculateExactMass, CalculateMolecularWeight,
|
|
368 FormatCompositionInfomation, GetElementsAndCount, IsMolecularFormula
|
|
369
|
|
370 =head1 FUNCTIONS
|
|
371
|
|
372 =over 4
|
|
373
|
|
374 =item B<CalculateMolecularWeight>
|
|
375
|
|
376 $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
|
|
377
|
|
378 Calculates and returns the molecular weight for a specified I<MolecularFormula>.
|
|
379
|
|
380 =item B<CalculateElementalComposition>
|
|
381
|
|
382 ($ElementsRef, $ElementCompositionRef) =
|
|
383 CalculateElementalComposition($MolecularFormula);
|
|
384
|
|
385 Calculates the percent composition in a specified I<MolecularFormula> and returns references
|
|
386 to arrays containing elements and their percent composition.
|
|
387
|
|
388 =item B<CalculateExactMass>
|
|
389
|
|
390 $ExactMass = CalculateMolecularWeight($MolecularFormula);
|
|
391
|
|
392 Calculates and returns the exact mass for a specified I<MolecularFormula>.
|
|
393
|
|
394 =item B<FormatCompositionInfomation>
|
|
395
|
|
396 $FormattedString = FormatCompositionInfomation($ElementsRef,
|
|
397 $ElementCompositionRef, [$Precision]);
|
|
398
|
|
399 Returns a formatted elemental composition string using references to elements and elemental
|
|
400 composition arrays. Precision is an optional parameter; its default value is I<2>.
|
|
401
|
|
402 =item B<GetElementsAndCount>
|
|
403
|
|
404 ($ElementsRef, $ElementCountRef) = GetElementsAndCount(
|
|
405 $MolecularFormula);
|
|
406
|
|
407 Retrieves elements and their count composition in a specified I<MolecularFormula> and
|
|
408 returns references to arrays containing elements and their count.
|
|
409
|
|
410 =item B<IsMolecularFormula>
|
|
411
|
|
412 $Status = IsMolecularFormula($MolecularFormula);
|
|
413 ($Status, $ErrorMsg) = IsMolecularFormula($MolecularFormula);
|
|
414
|
|
415 Returns 1 or 0 a based on whether it's a valid I<MolecularFormula>.
|
|
416
|
|
417 =back
|
|
418
|
|
419 =head1 AUTHOR
|
|
420
|
|
421 Manish Sud <msud@san.rr.com>
|
|
422
|
|
423 =head1 SEE ALSO
|
|
424
|
|
425 Molecule.pm
|
|
426
|
|
427 =head1 COPYRIGHT
|
|
428
|
|
429 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
430
|
|
431 This file is part of MayaChemTools.
|
|
432
|
|
433 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
434 the terms of the GNU Lesser General Public License as published by the Free
|
|
435 Software Foundation; either version 3 of the License, or (at your option)
|
|
436 any later version.
|
|
437
|
|
438 =cut
|