comparison lib/MolecularFormula.pm @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 package MolecularFormula;
2 #
3 # $RCSfile: MolecularFormula.pm,v $
4 # $Date: 2015/02/28 20:47:18 $
5 # $Revision: 1.25 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use Carp;
31 use Text::ParseWords;
32 use TextUtil;
33 use PeriodicTable;
34
35 use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
36
37 @ISA = qw(Exporter);
38 @EXPORT = qw();
39 @EXPORT_OK = qw(CalculateMolecularWeight CalculateExactMass CalculateElementalComposition FormatCompositionInfomation GetElementsAndCount IsMolecularFormula);
40
41 %EXPORT_TAGS = (all => [@EXPORT, @EXPORT_OK]);
42
#
# Calculate the molecular weight of a formula: the sum, over its elements, of
# atomic weight times element count. Returns undef when the formula can't be
# processed into element and count arrays.
#
sub CalculateMolecularWeight {
    my($MolecularFormula) = @_;

    my($ElementsRef, $CountsRef) = _ProcessMolecularFormula($MolecularFormula);
    unless (defined($ElementsRef) && defined($CountsRef)) {
        return undef;
    }

    # Walk the element list while tracking the matching position in the count list...
    my $Weight = 0;
    my $Position = 0;
    for my $Symbol (@{$ElementsRef}) {
        my $AtomicWeight = PeriodicTable::GetElementAtomicWeight($Symbol);
        $Weight += $AtomicWeight * $CountsRef->[$Position];
        $Position++;
    }

    return $Weight;
}
65
#
# Calculate the exact mass of a formula: the sum, over its elements, of the mass
# reported by PeriodicTable::GetElementMostAbundantNaturalIsotopeMass times the
# element count. Returns undef when the formula can't be processed into element
# and count arrays.
#
# Elements for which no isotope mass is available are left out of the sum.
#
sub CalculateExactMass {
    my($MolecularFormula) = @_;

    my($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
    if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
        return undef;
    }

    my $ExactMass = 0;

    # The loop must carry the ELEMENT label: "next ELEMENT" without a matching label
    # is a fatal runtime error, or worse, jumps to a caller's loop with that label...
    ELEMENT: for my $Index (0 .. $#{$FormulaElementsRef}) {
        my $ElementSymbol = $FormulaElementsRef->[$Index];
        my $ElementCount = $FormulaElementCountRef->[$Index];
        my $RelativeAtomicMass = PeriodicTable::GetElementMostAbundantNaturalIsotopeMass($ElementSymbol);
        if (!defined($RelativeAtomicMass)) {
            next ELEMENT;
        }
        $ExactMass += $RelativeAtomicMass * $ElementCount;
    }

    return $ExactMass;
}
90
91
#
# Calculate elemental composition and return references to two parallel arrays
# containing elements and their percent composition by weight.
#
# Returns (undef, undef) when the formula can't be processed into element and count
# arrays. Returns references to two empty arrays when the molecular weight is zero,
# as the percentages are undefined in that case.
#
sub CalculateElementalComposition {
    my($MolecularFormula) = @_;

    # Parse the formula only once; the weights calculated below serve both the
    # molecular weight and the individual percentages...
    my($FormulaElementsRef, $FormulaElementCountRef) = _ProcessMolecularFormula($MolecularFormula);
    if (!(defined($FormulaElementsRef) && defined($FormulaElementCountRef))) {
        return (undef, undef);
    }

    # Weight contributed by each element: atomic weight times element count...
    my(@ElementWeights) = ();
    my $MolecularWeight = 0;
    for my $Index (0 .. $#{$FormulaElementsRef}) {
        my $AtomicWeight = PeriodicTable::GetElementAtomicWeight($FormulaElementsRef->[$Index]);
        my $ElementWeight = $AtomicWeight * $FormulaElementCountRef->[$Index];

        push @ElementWeights, $ElementWeight;
        $MolecularWeight += $ElementWeight;
    }

    my(@FormulaElements) = ();
    my(@FormulaElementComposition) = ();

    # Avoid division by zero...
    if (!$MolecularWeight) {
        return ( \@FormulaElements, \@FormulaElementComposition);
    }

    my $CompositionMultiplier = 100 / $MolecularWeight;

    for my $Index (0 .. $#{$FormulaElementsRef}) {
        push @FormulaElements, $FormulaElementsRef->[$Index];
        push @FormulaElementComposition, $ElementWeights[$Index] * $CompositionMultiplier;
    }

    return ( \@FormulaElements, \@FormulaElementComposition);
}
127
# Using references to element and composition arrays, format composition information
# as: Element: Composition%; Element: Composition%; ...
#
# The number of decimal places is taken from the third argument only when exactly
# three arguments are passed; otherwise, it's 2.
#
sub FormatCompositionInfomation {
    my($ElementsRef, $ElementCompositionRef) = @_;
    my $Precision = (@_ == 3) ? $_[2] : 2;

    # Collect one "Element: Composition%" entry per element and join at the end...
    my(@Entries) = ();
    for my $Position (0 .. $#{$ElementsRef}) {
        my $Symbol = $ElementsRef->[$Position];
        my $Percent = sprintf("%.${Precision}f", $ElementCompositionRef->[$Position]);

        push @Entries, "${Symbol}: ${Percent}%";
    }

    return join('; ', @Entries);
}
154
#
# Get references to the arrays of elements and their counts for a formula, as
# produced by _ProcessMolecularFormula...
#
sub GetElementsAndCount {
    my($MolecularFormula) = @_;

    # The error message returned as the third value is not passed on...
    my($ElementsRef, $CountsRef) = _ProcessMolecularFormula($MolecularFormula);

    return ($ElementsRef, $CountsRef);
}
166
#
# Is it a valid molecular formula? Returns the status (1 or 0) in scalar context and
# the status along with the error message in list context...
#
sub IsMolecularFormula {
    my($MolecularFormula) = @_;

    my($ElementsRef, $CountsRef, $ErrorMsg) = _ProcessMolecularFormula($MolecularFormula);

    # Valid only when both element and count references came back defined...
    my $Status = 0;
    if (defined($ElementsRef) && defined($CountsRef)) {
        $Status = 1;
    }

    return ($Status, $ErrorMsg) if wantarray;
    return $Status;
}
180
#
# Process molecular formula. For a valid formula, return references to arrays containing
# elements and element counts along with an empty error message; otherwise, return
# (undef, undef, $ErrorMsg).
#
# Elements are listed in order of first appearance and counts for repeated elements are
# summed. A single level of () or [] with an optional repeat count is supported; nested
# or unbalanced groups are reported as errors.
#
sub _ProcessMolecularFormula {
    my($MolecularFormula) = @_;
    my($ErrorMsg) = '';

    $MolecularFormula = _CleanUpFormula($MolecularFormula);

    # Make sure it only contains letters, numbers, parentheses and square brackets...
    if ($MolecularFormula =~ /[^a-zA-Z0-9\(\)\[\]]/) {
        $ErrorMsg = 'Molecular formula contains characters other than a-zA-Z0-9';
        return (undef, undef, $ErrorMsg);
    }

    # Parse the formula...
    my($FormulaElementSpec, $Spec, $ElementSymbol, $ElementCount, @FormulaElements, @ElementCount, %FormulaElementsToCountMap, @SubFormulaElements, %SubFormulaElementsToCountMap);

    @FormulaElements = (); @ElementCount = ();
    %FormulaElementsToCountMap = ();

    # Setup element symbol and count regular expression...
    # IUPAC: http://www.iupac.org/reports/provisional/abstract04/RB-prs310804/Chap4-3.04.pdf
    #
    # Each match consumes one token starting where the previous match ended: an element
    # with optional count, a group opener, a group closer with optional repeat count, or
    # any other single character, which is rejected later as an unknown symbol.
    #
    $FormulaElementSpec = qr/
        \G(                      # Whole token; capture 1
            (?:
                ([A-Z][a-z]?)    # Two or one letter element symbol; capture 2
                ([0-9]*)         # Optionally followed by element count; capture 3
            )
            | \( | \[
            | \)[0-9]* | \][0-9]*
            | .
        )
    /x;

    my($ProcessingParenthesis);
    $ProcessingParenthesis = 0;

    # Go over the formula...
    FORMULA: while ($MolecularFormula =~ /$FormulaElementSpec/gx) {
        ($Spec, $ElementSymbol, $ElementCount) = ($1, $2, $3);

        # Handle parenthesis in formula to indicate repeating units...
        if ($Spec =~ /^(\(|\[)/) {
            if ($ProcessingParenthesis) {
                $ErrorMsg = "Molecular formula contains multiple level of () or []";
                return (undef, undef, $ErrorMsg);
            }
            $ProcessingParenthesis = 1;
            @SubFormulaElements = ();
            %SubFormulaElementsToCountMap = ();
            next FORMULA;
        }
        elsif ($Spec =~ /^(\)|\])/) {
            # A closing ) or ] without a matching opening one can't be interpreted...
            if (!$ProcessingParenthesis) {
                $ErrorMsg = "Molecular formula contains unbalanced () or []";
                return (undef, undef, $ErrorMsg);
            }
            $ProcessingParenthesis = 0;

            # Retrieve repeat count and move data to @FormulaElements and %FormulaElementsToCountMap;
            # a missing or zero repeat count is treated as 1...
            my($RepeatCount, $Symbol, $Count);
            $RepeatCount = $Spec;
            $RepeatCount =~ s/(\)|\])//g;
            if (!$RepeatCount) {
                $RepeatCount = 1;
            }

            # Copy data...
            for $Symbol (@SubFormulaElements) {
                $Count = $SubFormulaElementsToCountMap{$Symbol} * $RepeatCount;
                _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $Symbol, $Count);
            }

            # Get ready again...
            @SubFormulaElements = ();
            %SubFormulaElementsToCountMap = ();

            next FORMULA;
        }

        # Retrieve element symbol and count. For a token that didn't match the element
        # pattern, the token itself is used as the symbol and fails the check below; a
        # missing or zero count is treated as 1...
        $ElementSymbol = ($Spec && !$ElementSymbol) ? $Spec : ($ElementSymbol ? $ElementSymbol : '');
        $ElementCount = $ElementCount ? $ElementCount : 1;
        if (!PeriodicTable::IsElement($ElementSymbol)) {
            $ErrorMsg = "Molecular formula contains unknown elemental symbol $ElementSymbol";
            return (undef, undef, $ErrorMsg);
        }

        if ($ProcessingParenthesis) {
            _SetupFormulaElementData(\@SubFormulaElements, \%SubFormulaElementsToCountMap, $ElementSymbol, $ElementCount);
        }
        else {
            _SetupFormulaElementData(\@FormulaElements, \%FormulaElementsToCountMap, $ElementSymbol, $ElementCount);
        }
    }

    # An opening ( or [ that was never closed would otherwise silently drop the elements
    # collected after it...
    if ($ProcessingParenthesis) {
        $ErrorMsg = "Molecular formula contains unbalanced () or []";
        return (undef, undef, $ErrorMsg);
    }

    # Setup element count array...
    for $ElementSymbol (@FormulaElements) {
        $ElementCount = $FormulaElementsToCountMap{$ElementSymbol};
        push @ElementCount, $ElementCount;
    }

    return (\@FormulaElements, \@ElementCount, $ErrorMsg);
}
285
# Clean up a formula string before parsing: take out spaces and charge specifications,
# drop brackets which carry no repeat count, and rewrite adducts in parenthesis format...
sub _CleanUpFormula {
    my($Formula) = @_;

    # Take out any spaces...
    $Formula =~ s/ //g;

    # Eliminate any charge specifications: +, - or [1-9]+[+-]
    #   e.g NO+ [Al(H2O)6]3+ [H2NO3]+
    #
    # A charge written as a number and sign after a square bracket, as in [Al(H2O)6]3+,
    # is tried first. Only when nothing was replaced that way is a sign optionally
    # followed by a number, as in C37H42N2O6+2 or Cu+, taken out.
    #
    if (!($Formula =~ s/\][0-9]+[\+\-]/\]/g)) {
        $Formula =~ s/[\+\-][0-9]*//g;
    }

    # Eliminate any brackets - [] or () - when no closing one is followed by numbers:
    #   e.g. Li[H2PO4]
    $Formula =~ s/[\[\]]//g unless $Formula =~ /\][0-9]+/;
    $Formula =~ s/[\(\)]//g unless $Formula =~ /\)[0-9]+/;

    # Change adducts to parenthesis format...
    #   Na2CO3.10H2O -> Na2CO3(H2O)10
    #   3CdSO4.8H2O -> (CdSO4)3(H2O)8
    if ($Formula =~ /\./) {
        my(@Parts) = ();
        for my $Piece (split /\./, $Formula) {
            # A leading number becomes the repeat count of a parenthesized unit...
            my($Multiplier, $Unit) = $Piece =~ /^([0-9]*)(.*?)$/;
            push @Parts, ($Multiplier ? "(${Unit})${Multiplier}" : $Unit);
        }
        $Formula = join('', @Parts);
    }

    return $Formula;
}
335
# Store the element and count: an element seen for the first time is appended to the
# elements array with the specified count; otherwise, the count is added to its
# current count...
sub _SetupFormulaElementData {
    my($OrderedElementsRef, $CountMapRef, $Symbol, $Increment) = @_;

    if (!exists $CountMapRef->{$Symbol}) {
        push @{$OrderedElementsRef}, $Symbol;
        return $CountMapRef->{$Symbol} = $Increment;
    }

    return $CountMapRef->{$Symbol} += $Increment;
}
348
349 1;
350
351 __END__
352
353 =head1 NAME
354
355 MolecularFormula
356
357 =head1 SYNOPSIS
358
359 use MolecularFormula;
360
361 use MolecularFormula qw(:all);
362
363 =head1 DESCRIPTION
364
365 B<MolecularFormula> module provides the following functions:
366
367 CalculateElementalComposition, CalculateExactMass, CalculateMolecularWeight,
368 FormatCompositionInfomation, GetElementsAndCount, IsMolecularFormula
369
370 =head1 FUNCTIONS
371
372 =over 4
373
374 =item B<CalculateMolecularWeight>
375
376 $MolecularWeight = CalculateMolecularWeight($MolecularFormula);
377
378 Calculates and returns the molecular weight for a specified I<MolecularFormula>.
379
380 =item B<CalculateElementalComposition>
381
382 ($ElementsRef, $ElementCompositionRef) =
383 CalculateElementalComposition($MolecularFormula);
384
385 Calculates the percent composition in a specified I<MolecularFormula> and returns references
386 to arrays containing elements and their percent composition.
387
388 =item B<CalculateExactMass>
389
390 $ExactMass = CalculateExactMass($MolecularFormula);
391
392 Calculates and returns the exact mass for a specified I<MolecularFormula>.
393
394 =item B<FormatCompositionInfomation>
395
396 $FormattedString = FormatCompositionInfomation($ElementsRef,
397 $ElementCompositionRef, [$Precision]);
398
399 Returns a formatted elemental composition string using references to elements and elemental
400 composition arrays. Precision is an optional parameter; its default value is I<2>.
401
402 =item B<GetElementsAndCount>
403
404 ($ElementsRef, $ElementCountRef) = GetElementsAndCount(
405 $MolecularFormula);
406
407 Retrieves elements and their count composition in a specified I<MolecularFormula> and
408 returns references to arrays containing elements and their count.
409
410 =item B<IsMolecularFormula>
411
412 $Status = IsMolecularFormula($MolecularFormula);
413 ($Status, $ErrorMsg) = IsMolecularFormula($MolecularFormula);
414
415 Returns 1 or 0 based on whether it's a valid I<MolecularFormula>.
416
417 =back
418
419 =head1 AUTHOR
420
421 Manish Sud <msud@san.rr.com>
422
423 =head1 SEE ALSO
424
425 Molecule.pm
426
427 =head1 COPYRIGHT
428
429 Copyright (C) 2015 Manish Sud. All rights reserved.
430
431 This file is part of MayaChemTools.
432
433 MayaChemTools is free software; you can redistribute it and/or modify it under
434 the terms of the GNU Lesser General Public License as published by the Free
435 Software Foundation; either version 3 of the License, or (at your option)
436 any later version.
437
438 =cut