Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ElementalAnalysisTextFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: ElementalAnalysisTextFiles.pl,v $ | |
4 # $Date: 2015/02/28 20:46:19 $ | |
5 # $Revision: 1.28 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use TextUtil; | |
37 use MolecularFormula; | |
38 | |
# File-level state shared with the subroutines defined below:
#   $ScriptName - base name of this script, used in messages
#   %Options    - raw command line options filled in by SetupScriptUsage()
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input files named on the command line, expanded by ExpandFileNames() for
# the csv and tsv extensions...
my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

# Process options...
my(%OptionsInfo);
print "Processing options...\n";
ProcessOptions();

# Per-file information; every entry is an array indexed like @TextFilesList...
print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();
RetrieveColumnsAndLabelsInfo();

# Generate output files...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  # Files flagged as not okay have already been reported with a warning...
  next unless $TextFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $TextFilesList[$FileIndex]...\n";
  PerformElementalAnalysis($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84 | |
85 ############################################################################### | |
86 | |
# Perform elemental analysis...
#
# Writes the output file for the input text file at position $Index in
# @TextFilesList. Each output line holds the input columns listed in
# ColNumsBeforeNew, then one value per entry in
# $OptionsInfo{SpecifiedCalculations}, then the input columns listed in
# ColNumsAfterNew.
#
# Lines without a formula column, or with a formula rejected by
# MolecularFormula::IsMolecularFormula (checked only when
# $OptionsInfo{CheckFormula} is set), are reported through PrintErrorMsg and
# written out with empty calculated values.
#
# Dies when either file can't be opened or the new file can't be closed.
sub PerformElementalAnalysis {
  my($Index) = @_;
  my($TextFile, $NewTextFile, $FormulaCol, $FormulaColLabel, $Line, $NewLine, $FormulaColValue, $InDelim, $ColNum, $Value, $Status, $ErrorMsg, @ColLabels, @LineWords, @ColNumsBeforeNew, @ColNumsAfterNew);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $NewTextFile = $TextFilesInfo{OutFile}[$Index];
  $FormulaCol = $TextFilesInfo{FormulaColNum}[$Index];

  # Label of the formula column as it appears in the input file. The labels
  # collected in @ColLabels below are in output order, with the new columns
  # inserted, so they can't be indexed by an input column number.
  $FormulaColLabel = $TextFilesInfo{ColLabels}[$Index][$FormulaCol];

  @ColNumsBeforeNew = @{$TextFilesInfo{ColNumsBeforeNew}[$Index]};
  @ColNumsAfterNew = @{$TextFilesInfo{ColNumsAfterNew}[$Index]};

  print "Generating new Text file $NewTextFile...\n";
  open NEWTEXTFILE, ">", $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
  open TEXTFILE, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";

  # Skip over column labels from old file...
  $Line = GetTextLine(\*TEXTFILE);

  # Add column labels in new file...
  @ColLabels = ();
  for $ColNum (@ColNumsBeforeNew) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  for $Value (@{$OptionsInfo{SpecifiedCalculations}}) {
    push @ColLabels, $TextFilesInfo{ValueLabelsMap}[$Index]{$Value};
  }
  for $ColNum (@ColNumsAfterNew) {
    push @ColLabels, $TextFilesInfo{ColLabels}[$Index][$ColNum];
  }
  $NewLine = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print NEWTEXTFILE "$NewLine\n";

  # Go over all rows...
  my($LineCount, $ElementsRef, $ElementCompositionRef, $CalculationType, $CalculatedValue, @CalculatedValues);

  # The label line read above counts as line 1...
  $LineCount = 1;
  TEXTLINE: while ($Line = GetTextLine(\*TEXTFILE)) {
    @LineWords = quotewords($InDelim, 0, $Line);
    $LineCount++;

    # Empty placeholders, one per calculation, used for ignored lines...
    @CalculatedValues = ('') x @{$OptionsInfo{SpecifiedCalculations}};

    # $FormulaCol is a zero based index, so it must be strictly less than
    # the number of words on the line...
    if ($FormulaCol >= @LineWords) {
      $ErrorMsg = "Ignoring line $LineCount: Formula column $FormulaColLabel not found";
      PrintErrorMsg($Line, $ErrorMsg);
      ComposeAndWriteNewLine(\*NEWTEXTFILE, \@LineWords, \@ColNumsBeforeNew, \@ColNumsAfterNew, \@CalculatedValues);
      next TEXTLINE;
    }

    # Make sure it's a valid molecular formula...
    $FormulaColValue = $LineWords[$FormulaCol];
    if ($OptionsInfo{CheckFormula}) {
      ($Status, $ErrorMsg) = MolecularFormula::IsMolecularFormula($FormulaColValue);
      if (!$Status) {
        $ErrorMsg = "Ignoring line $LineCount: Formula column $FormulaColLabel value is not valid: $ErrorMsg";
        PrintErrorMsg($Line, $ErrorMsg);
        ComposeAndWriteNewLine(\*NEWTEXTFILE, \@LineWords, \@ColNumsBeforeNew, \@ColNumsAfterNew, \@CalculatedValues);
        next TEXTLINE;
      }
    }

    # Calculate appropriate values and write 'em out...
    @CalculatedValues = ();
    for $CalculationType (@{$OptionsInfo{SpecifiedCalculations}}) {
      if ($CalculationType =~ /^ElementalAnalysis$/i) {
        ($ElementsRef, $ElementCompositionRef) = MolecularFormula::CalculateElementalComposition($FormulaColValue);
        $CalculatedValue = (defined($ElementsRef) && defined($ElementCompositionRef)) ? MolecularFormula::FormatCompositionInfomation($ElementsRef, $ElementCompositionRef, $OptionsInfo{Precision}) : '';
      }
      elsif ($CalculationType =~ /^MolecularWeight$/i) {
        $CalculatedValue = MolecularFormula::CalculateMolecularWeight($FormulaColValue);
        $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : "";
      }
      elsif ($CalculationType =~ /^ExactMass$/i) {
        $CalculatedValue = MolecularFormula::CalculateExactMass($FormulaColValue);
        $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : "";
      }
      else {
        $CalculatedValue = '';
      }
      push @CalculatedValues, $CalculatedValue;
    }

    ComposeAndWriteNewLine(\*NEWTEXTFILE, \@LineWords, \@ColNumsBeforeNew, \@ColNumsAfterNew, \@CalculatedValues);
  }

  # Buffered write errors only surface when the handle is closed...
  close NEWTEXTFILE or die "Error: Couldn't close $NewTextFile: $! \n";
  close TEXTFILE;
}
180 | |
# Write out new line using old and new calculated data...
#
# Arguments:
#   $NewTextFileRef      - file handle reference to print the line to
#   $OldLineWordsRef     - reference to the column values of the input line
#   $ColNumsBeforeNewRef - input column numbers placed before the new values
#   $ColNumsAfterNewRef  - input column numbers placed after the new values
#   $CalculatedValuesRef - reference to the calculated values to insert
#
# The words are joined by JoinWords using the output delimiter and quote
# settings from %OptionsInfo.
sub ComposeAndWriteNewLine {
  my($NewTextFileRef, $OldLineWordsRef, $ColNumsBeforeNewRef, $ColNumsAfterNewRef, $CalculatedValuesRef) = @_;

  # Old columns before, calculated values, old columns after...
  my(@OutWords) = (
    @{$OldLineWordsRef}[@{$ColNumsBeforeNewRef}],
    @{$CalculatedValuesRef},
    @{$OldLineWordsRef}[@{$ColNumsAfterNewRef}],
  );

  my($OutLine) = JoinWords(\@OutWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$OutLine\n";
}
199 | |
# Print out error message...
#
# What gets printed depends on $OptionsInfo{DetailLevel}: at level 2 or
# higher the message is followed by the offending line, at level 1 only the
# message is printed, and below that nothing is printed.
sub PrintErrorMsg {
  my($Line, $ErrorMsg) = @_;
  my($DetailLevel) = $OptionsInfo{DetailLevel};

  return if $DetailLevel < 1;

  my($Message) = ($DetailLevel >= 2) ? "$ErrorMsg: $Line\n" : "$ErrorMsg\n";
  print $Message;
}
211 | |
# Process formula columns and other information...
#
# The formula columns are resolved first: RetrieveFormulaColumnsInfo can clear
# the FileOkay flag of a file, and RetrieveStartColumnsAndValueLabelsInfo
# skips files whose flag is cleared.
sub RetrieveColumnsAndLabelsInfo {
  RetrieveFormulaColumnsInfo();
  RetrieveStartColumnsAndValueLabelsInfo();
}
217 | |
# Make sure specified formula column are okay...
#
# Sets $TextFilesInfo{FormulaColNum}[$Index] to the zero based formula column
# number for each usable input file. The column comes from
# $OptionsInfo{SpecifiedFormulaCol}, interpreted as a column number or label
# according to $OptionsInfo{ColMode}; when nothing was specified, the first
# column whose label contains the word Formula is used.
#
# A file without a usable formula column is reported with a warning and its
# FileOkay flag is cleared.
sub RetrieveFormulaColumnsInfo {
  my($Index, $TextFile, $FormulaColSpecified);

  # Test the length rather than the truth of the value, so that a column
  # label such as "0" still counts as specified...
  $FormulaColSpecified = (defined($OptionsInfo{SpecifiedFormulaCol}) && length($OptionsInfo{SpecifiedFormulaCol})) ? 1 : 0;

  @{$TextFilesInfo{FormulaColNum}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    $TextFilesInfo{FormulaColNum}[$Index] = 0;

    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    my($FormulaColNum, $FormulaColValid);

    $FormulaColNum = 0;
    $FormulaColValid = 0;
    if ($FormulaColSpecified) {
      if ($OptionsInfo{ColMode} =~ /^colnum$/i) {
        # Specified column numbers start at 1...
        if ($OptionsInfo{SpecifiedFormulaCol} <= $TextFilesInfo{ColCount}[$Index]) {
          $FormulaColNum = $OptionsInfo{SpecifiedFormulaCol} - 1;
          $FormulaColValid = 1;
        }
      }
      else {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$OptionsInfo{SpecifiedFormulaCol}})) {
          $FormulaColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$OptionsInfo{SpecifiedFormulaCol}};
          $FormulaColValid = 1;
        }
      }
    }
    else {
      # Grab the first column with the word Formula in its label. Walk the
      # labels by position: the label to number map keeps only the last
      # column for a repeated label.
      my($ColNum);
      COLNUM: for $ColNum (0 .. $#{$TextFilesInfo{ColLabels}[$Index]}) {
        if ($TextFilesInfo{ColLabels}[$Index][$ColNum] =~ /Formula/i) {
          $FormulaColNum = $ColNum;
          $FormulaColValid = 1;
          last COLNUM;
        }
      }
    }

    if ($FormulaColValid) {
      $TextFilesInfo{FormulaColNum}[$Index] = $FormulaColNum;
    }
    else {
      if ($FormulaColSpecified) {
        warn "Warning: Ignoring file $TextFile: Formula column specified, $OptionsInfo{SpecifiedFormulaCol}, using \"-f --formulacol\" option doesn't exist\n";
      }
      else {
        warn "Warning: Ignoring file $TextFile: Column label containing the word Formula doesn't exist\n";
      }
      $TextFilesInfo{FileOkay}[$Index] = 0;
    }
  }
}
274 | |
# Setup starting column number for adding calculated values and
# column labels to use for these values...
#
# For each usable input file, this fills in:
#   $TextFilesInfo{ColNumsBeforeNew}[$Index] - input column numbers written
#                                              before the calculated values
#   $TextFilesInfo{ColNumsAfterNew}[$Index]  - input column numbers written
#                                              after the calculated values
#   $TextFilesInfo{ValueLabelsMap}[$Index]   - calculation name => label of
#                                              its new column
#
# Dies when the column given by $OptionsInfo{StartCol} doesn't exist in one
# of the usable files.
sub RetrieveStartColumnsAndValueLabelsInfo {
  my($Index, $TextFile, $SpecifiedStartColNum, $StartColNum, $Label, $Value, $NewLabel, $Count, $PairIndex);

  # Start column number for inserting new values...
  $SpecifiedStartColNum = "last";
  if (defined($OptionsInfo{StartCol}) && length($OptionsInfo{StartCol})) {
    $SpecifiedStartColNum = $OptionsInfo{StartCol};
  }

  # Column labels for new calculated values. SpecifiedValueLabels is a flat
  # list of name and label pairs...
  my(%NewValueLabels) = (ElementalAnalysis => 'ElementalAnalysis', MolecularWeight => 'MolecularWeight', ExactMass => 'ExactMass');
  for ($PairIndex = 0; $PairIndex < @{$OptionsInfo{SpecifiedValueLabels}}; $PairIndex += 2) {
    $Value = $OptionsInfo{SpecifiedValueLabels}[$PairIndex];
    $Label = $OptionsInfo{SpecifiedValueLabels}[$PairIndex + 1];
    if (exists $NewValueLabels{$Value}) {
      $NewValueLabels{$Value} = $Label;
    }
  }

  @{$TextFilesInfo{ColNumsBeforeNew}} = ();
  @{$TextFilesInfo{ColNumsAfterNew}} = ();
  @{$TextFilesInfo{ValueLabelsMap}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    @{$TextFilesInfo{ColNumsBeforeNew}[$Index]} = ();
    @{$TextFilesInfo{ColNumsAfterNew}[$Index]} = ();
    %{$TextFilesInfo{ValueLabelsMap}[$Index]} = ();

    if (!$TextFilesInfo{FileOkay}[$Index]) {
      next FILELIST;
    }

    # Work out the zero based start column for this file...
    if ($SpecifiedStartColNum !~ /^last$/i) {
      if ($OptionsInfo{ColMode} =~ /^collabel$/i) {
        if (exists($TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedStartColNum})) {
          $StartColNum = $TextFilesInfo{ColLabelToNumMap}[$Index]{$SpecifiedStartColNum};
        }
        else {
          die "Error: Invalid value $SpecifiedStartColNum specified using \"-s --startcol\" option: column name doesn't exist in $TextFile \n";
        }
      }
      else {
        if ($SpecifiedStartColNum > 0 && $SpecifiedStartColNum <= $TextFilesInfo{ColCount}[$Index]) {
          # Convert the specified column number, which starts at 1, into a
          # zero based index. It must come from the specified value and not
          # from whatever $StartColNum held for the previous file.
          $StartColNum = $SpecifiedStartColNum - 1;
        }
        else {
          die "Error: Invalid value $SpecifiedStartColNum specified using \"-s --startcol\" option: column number doesn't exist in $TextFile \n";
        }
      }
    }
    else {
      $StartColNum = $TextFilesInfo{ColCount}[$Index] - 1;
    }

    # Set up columns lists for before and after the addition of calculated column values
    # for each text file...
    my($LastColNum, $Part1EndColNum, $Part2StartColNum);

    $LastColNum = $TextFilesInfo{ColCount}[$Index] - 1;

    if ($OptionsInfo{StartColMode} =~ /^after$/i) {
      # New values go right after the start column...
      $Part1EndColNum = $StartColNum;
      $Part2StartColNum = $StartColNum + 1;
    }
    else {
      # New values go right before the start column...
      $Part1EndColNum = $StartColNum - 1;
      $Part2StartColNum = $StartColNum;
    }

    # A range whose end is below its start is empty, which covers inserting
    # before the first column and after the last one...
    push @{$TextFilesInfo{ColNumsBeforeNew}[$Index]}, (0 .. $Part1EndColNum);
    push @{$TextFilesInfo{ColNumsAfterNew}[$Index]}, ($Part2StartColNum .. $LastColNum);

    # Setup column labels for calculated values...
    for $Value (keys %NewValueLabels) {
      $Label = $NewValueLabels{$Value};

      # Make sure it doesn't already exist: append 2, 3, ... until the label
      # doesn't match an existing column label...
      $Count = 1;
      $NewLabel = $Label;
      while (exists $TextFilesInfo{ColLabelToNumMap}[$Index]{$NewLabel}) {
        $Count++;
        $NewLabel = $Label . $Count;
      }
      $TextFilesInfo{ValueLabelsMap}[$Index]{$Value} = $NewLabel;
    }
  }
}
382 | |
# Retrieve information about input text files...
#
# Resets %TextFilesInfo and, for each file in @TextFilesList, records:
#   FileOkay         - 1 when the file can be processed, 0 otherwise
#   ColCount         - number of column labels on the first line
#   ColLabels        - the column labels, in file order
#   ColLabelToNumMap - column label => zero based column number
#   InDelim          - input delimiter: tab for tsv files, otherwise comma
#                      or semicolon according to the --indelim option
#   OutFile          - name of the output file to generate
#
# A file that is missing, has the wrong type, can't be opened, or whose
# output file name clashes with the input name or with an existing file
# (without --overwrite) is reported with a warning and left flagged as not
# okay.
sub RetrieveTextFilesInfo {
  my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @ColLabels, $OutFileRoot, $OutFile, $ColNum, $ColLabel);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{ColLabelToNumMap}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutFile}} = ();

  FILELIST: for $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start out with the file flagged as not okay...
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutFile}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();
    %{$TextFilesInfo{ColLabelToNumMap}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # Input delimiter...
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three argument open, so the file name is never parsed for a mode...
    if (!open TEXTFILE, "<", $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    # Column labels come from the first line; without a line there are none...
    $Line = GetTextLine(\*TEXTFILE);
    @ColLabels = defined($Line) ? quotewords($InDelim, 0, $Line) : ();
    close TEXTFILE;

    # Output file name. $FileName still holds the input file name from the
    # ParseFileName call above...
    $FileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $FileExt = "tsv";
    }
    if ($Options{root} && (@TextFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName . "ElementalAnalysis";
    }

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($TextFile)) {
      warn "Warning: Ignoring file $TextFile:Output file name, $OutFile, is same as input text file name, $TextFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    # All checks passed: record the information for this file...
    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutFile}[$Index] = "$OutFile";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
    for $ColNum (0 .. $#ColLabels) {
      $ColLabel = $ColLabels[$ColNum];
      $TextFilesInfo{ColLabelToNumMap}[$Index]{$ColLabel} = $ColNum;
    }
  }

}
484 | |
# Process option values...
#
# Copies the raw command line values from %Options into %OptionsInfo and
# derives the values used by the rest of the script: the output delimiter
# character, the quote flag, the list of calculations to perform and the
# list of value name and column label pairs.
#
# Calculation names given through the mode and valuecollabels options are
# matched without regard to case and stored using their canonical spelling,
# because later lookups use them as case sensitive hash keys.
#
# Dies on an invalid formula column number, mode value or valuecollabels
# value.
sub ProcessOptions {
  %OptionsInfo = ();

  # Canonical spelling of each supported calculation name, keyed by its
  # lower case form...
  my(%CanonicalValueNames) = map { (lc($_) => $_) } qw(ElementalAnalysis MolecularWeight ExactMass);

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{ColMode} = $Options{colmode};
  $OptionsInfo{StartColMode} = $Options{startcolmode};

  $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef;

  $OptionsInfo{DetailLevel} = $Options{detail};
  # The fast option skips the formula validation check...
  $OptionsInfo{CheckFormula} = $Options{fast} ? 0 : 1;
  $OptionsInfo{Precision} = $Options{precision};

  $OptionsInfo{InDelim} = $Options{indelim};

  $OptionsInfo{OutDelim} = ($Options{outdelim} =~ /^tab$/i ) ? "\t" : (($Options{outdelim} =~ /^semicolon$/i) ? "\;" : "\,");
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
  $OptionsInfo{OutFileRoot} = defined $Options{root} ? $Options{root} : undef;

  $OptionsInfo{StartCol} = defined $Options{startcol} ? $Options{startcol} : undef;

  $OptionsInfo{FormulaCol} = defined $Options{formulacol} ? $Options{formulacol} : undef;
  $OptionsInfo{SpecifiedFormulaCol} = "";

  if (defined $Options{formulacol}) {
    $OptionsInfo{SpecifiedFormulaCol} = $Options{formulacol};
    if ($Options{colmode} =~ /^colnum$/i) {
      if (!IsPositiveInteger($OptionsInfo{SpecifiedFormulaCol})) {
        die "Error: Invalid value $Options{formulacol} specified using \"-f --formulacol\" option: Allowed values: > 0\n";
      }
    }
  }

  # Setup what to calculate...
  @{$OptionsInfo{SpecifiedCalculations}} = ();
  if ($Options{mode} =~ /^All$/i) {
    @{$OptionsInfo{SpecifiedCalculations}} = qw(ElementalAnalysis MolecularWeight ExactMass);
  }
  else {
    my($Mode, $ModeValue, @SpecifiedModeValues);
    $Mode = $Options{mode};
    $Mode =~ s/ //g;
    @SpecifiedModeValues = split /\,/, $Mode;
    for $ModeValue (@SpecifiedModeValues) {
      if (!exists $CanonicalValueNames{lc($ModeValue)}) {
        if ($ModeValue =~ /^All$/i) {
          die "Error: All value for option \"-m --mode\" is not allowed with other valid values.\n";
        }
        else {
          die "Error: The value specified, $ModeValue, for option \"-m --mode\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, or ExactMass\n";
        }
      }
      push @{$OptionsInfo{SpecifiedCalculations}}, $CanonicalValueNames{lc($ModeValue)};
    }
  }

  $OptionsInfo{ValueColLabels} = defined $Options{valuecollabels} ? $Options{valuecollabels} : undef;
  @{$OptionsInfo{SpecifiedValueLabels}} = ();

  if ($Options{valuecollabels}) {
    my($Index, $Value, $Label, @ValueLabels);
    @ValueLabels = split /\,/, $Options{valuecollabels};
    if (@ValueLabels % 2) {
      die "Error: The value specified, $Options{valuecollabels}, for option \"-v --valuecollabels\" is not valid: It must contain even number of comma delimited values\n";
    }
    # Entries alternate between a value name and its column label; spaces
    # are removed from the name only...
    for ($Index = 0; $Index < @ValueLabels; $Index +=2) {
      $Value = $ValueLabels[$Index];
      $Value =~ s/ //g;
      $Label = $ValueLabels[$Index + 1];
      if (!exists $CanonicalValueNames{lc($Value)}) {
        die "Error: The value specified, $Value, using option \"-v --valuecollabels\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, or ExactMass\n";
      }
      push @{$OptionsInfo{SpecifiedValueLabels}}, ($CanonicalValueNames{lc($Value)}, $Label);
    }
  }
}
566 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with the default values, overlays the values parsed from
# the command line by GetOptions, changes into the working directory when
# one is specified, and validates the option values.
#
# Dies when the command line can't be parsed, the working directory isn't a
# directory or can't be entered, or an option value is not valid.
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{colmode} = "colnum";
  $Options{detail} = 1;
  $Options{mode} = "All";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{precision} = 2;
  $Options{quote} = "yes";
  $Options{startcolmode} = "after";

  if (!GetOptions(\%Options, "colmode|c=s", "detail|d=i", "fast", "formulacol|f=s", "help|h", "indelim=s", "mode|m=s", "outdelim=s", "overwrite|o", "precision|p=i", "quote|q=s", "root|r=s", "startcol|s=s", "startcolmode=s", "valuecollabels|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{colmode} !~ /^(colnum|collabel)$/i) {
    die "Error: The value specified, $Options{colmode}, for option \"-c --colmode\" is not valid. Allowed values: colnum or collabel\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{startcolmode} !~ /^(before|after)$/i) {
    # Report the rejected startcolmode value itself...
    die "Error: The value specified, $Options{startcolmode}, for option \"--startcolmode\" is not valid. Allowed values: before or after\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
612 | |
613 __END__ | |
614 | |
615 =head1 NAME | |
616 | |
617 ElementalAnalysisTextFiles.pl - Perform elemental analysis using formula column in TextFile(s) | |
618 | |
619 =head1 SYNOPSIS | |
620 | |
621 ElementalAnalysisTextFiles.pl TextFile(s)... | |
622 | |
623 ElementalAnalysisTextFiles.pl [B<-c, --colmode> colnum | collabel] [B<-d, --detail> infolevel] [B<--fast>] | |
624 [B<-f, --formulacol> colnum | collabel] [B<-h, --help>] [B<--indelim> comma | semicolon] | |
625 [B<-m, --mode> All | "ElementalAnalysis, [MolecularWeight, ExactMass]"] [B<-o, --overwrite>] | |
626 [B<--outdelim> comma | tab | semicolon] [B<-p, --precision> number] [B<-q, --quote> yes | no] | |
627 [B<-r, --root> rootname] [B<-s, --startcol> colnum | collabel] [B<--startcolmode> before | after] | |
628 [B<-v, --valuecollabels> Name, Label, [Name, Label,...]] [B<-w, --workingdir> dirname] TextFile(s)... | |
629 | |
630 =head1 DESCRIPTION | |
631 | |
632 Perform elemental analysis using molecular formula column specified by a column number or label in | |
633 I<TextFile(s)>. | |
634 | |
635 In addition to straightforward molecular formulas - H2O, HCl, C3H7O2N - | |
636 other supported variations are: Ca3(PO4)2, [PCl4]+, [Fe(CN)6]4-, C37H42N2O6+2, Na2CO3.10H2O, | |
637 8H2S.46H2O, and so on. Charges are simply ignored. Isotope symbols in formulas specification, including | |
638 D and T, are not supported. | |
639 | |
640 The valid file extensions are I<.csv> and I<.tsv> for comma/semicolon and tab delimited | |
641 text files respectively. All other file names are ignored. All the text files in a | |
642 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory | |
643 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file | |
644 which doesn't correspond to the format indicated by B<--indelim> option is ignored. | |
645 | |
646 =head1 OPTIONS | |
647 | |
648 =over 4 | |
649 | |
650 =item B<-c, --colmode> I<colnum | collabel> | |
651 | |
652 Specify how columns are identified in I<TextFile(s)>: using column number or column | |
653 label. Possible values: I<colnum or collabel>. Default value: I<colnum>. | |
654 | |
655 =item B<-d, --detail> I<infolevel> | |
656 | |
657 Level of information to print about lines being ignored. Default: I<1>. Possible values: | |
658 I<1, 2 or 3>. | |
659 | |
660 =item B<-h, --help> | |
661 | |
662 Print this help message. | |
663 | |
664 =item B<--fast> | |
665 | |
666 In this mode, the formula column specified using B<-f, --formulacol> option is assumed | |
667 to contain valid molecular formula data and initial formula validation check is skipped. | |
668 | |
669 =item B<-f, --formulacol> I<col number | col name> | |
670 | |
671 This value is mode specific. It specifies molecular formula column to use for performing | |
672 elemental analysis on I<TextFile(s)>. Possible values: I<col number or col label>. | |
673 Default value: I<first column containing the word formula in its column label>. | |
674 | |
675 =item B<-m, --mode> I<All | "ElementalAnalysis,[MolecularWeight,ExactMass]"> | |
676 | |
677 Specify what values to calculate using molecular formula in I<TextFile(s)>: calculate all supported | |
678 values or specify a comma delimited list of values. Possible values: I<All | "ElementalAnalysis, | |
679 [MolecularWeight, ExactMass]">. Default: I<All> | |
680 | |
681 =item B<--indelim> I<comma | semicolon> | |
682 | |
683 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
684 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
685 delimiter. | |
686 | |
687 =item B<-o, --overwrite> | |
688 | |
689 Overwrite existing files. | |
690 | |
691 =item B<--outdelim> I<comma | tab | semicolon> | |
692 | |
693 Output text file delimiter. Possible values: I<comma, tab, or semicolon> | |
694 Default value: I<comma>. | |
695 | |
696 =item B<-p, --precision> I<number> | |
697 | |
698 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
699 Valid values: positive integers. | |
700 | |
701 =item B<-q, --quote> I<yes | no> | |
702 | |
703 Put quotes around column values in output text file. Possible values: I<yes or | |
704 no>. Default value: I<yes>. | |
705 | |
706 =item B<-r, --root> I<rootname> | |
707 | |
708 New text file name is generated using the root: <Root>.<Ext>. Default new file | |
709 name: <InitialTextFileName>ElementalAnalysis.<Ext>. The csv, and tsv | |
710 <Ext> values are used for comma/semicolon, and tab delimited text files | |
711 respectively. This option is ignored for multiple input files. | |
712 | |
713 =item B<-s, --startcol> I<colnum | collabel> | |
714 | |
715 This value is mode specific. It specifies the column in text files at which | |
716 to start adding calculated column values. For I<colnum> mode, specify | |
717 column number and for I<collabel> mode, specify column label. | |
718 | |
719 Default value: I<last>. Start merge after the last column. | |
720 | |
721 =item B<--startcolmode> I<before | after> | |
722 | |
723 Start adding calculated column values before or after the B<-s, --startcol> value. Possible values: I<before or after>. | |
724 Default value: I<after>. | |
725 | |
726 =item B<-v --valuecollabels> I<Name,Label,[Name,Label,...]> | |
727 | |
728 Specify column labels to use for calculated values. In general, it's a comma delimited | |
729 list of value name and column label pairs. Supported value names: I<ElementalAnalysis, | |
730 MolecularWeight, and ExactMass>. Default labels: I<ElementalAnalysis, MolecularWeight, | |
731 and ExactMass>. | |
732 | |
733 =item B<-w, --workingdir> I<dirname> | |
734 | |
735 Location of working directory. Default: current directory. | |
736 | |
737 =back | |
738 | |
739 =head1 EXAMPLES | |
740 | |
741 To perform elemental analysis, calculate molecular weight and exact mass using formulas | |
742 in a column with the word Formula in its column label and generate a new CSV text | |
743 file NewSample1.csv, type: | |
744 | |
745 % ElementalAnalysisTextFiles.pl -o -r NewSample1 Sample1.csv | |
746 | |
747 To perform elemental analysis using formulas in column number two, use column label | |
748 Analysis for calculated data, and generate a new CSV text file NewSample1.csv, type: | |
749 | |
750 % ElementalAnalysisTextFiles.pl --m ElementalAnalysis --formulacol 2 | |
751 --valuecollabels "ElementalAnalysis,Analysis" -o -r NewSample1 | |
752 Sample1.csv | |
753 | |
754 To calculate molecular weight using formula in column label Formula with four decimal | |
755 precision and generate a new CSV text file NewSample1.csv, type | |
756 | |
757 % ElementalAnalysisTextFiles.pl --m MolecularWeight --colmode collabel | |
758 --formulacol Formula --precision 4 -o -r NewSample1 Sample1.csv | |
759 | |
760 To calculate exact mass using formula in column label Formula with four decimal | |
761 precision, adding column for exact mass right after Formula column, and generate a | |
762 new CSV text file NewSample1.csv, type | |
763 | |
764 % ElementalAnalysisTextFiles.pl --m ExactMass --colmode collabel | |
765 --formulacol Formula --precision 4 --startcolmode after | |
766 --startcol Formula -o -r NewSample1 Sample1.csv | |
767 | |
768 | |
769 =head1 AUTHOR | |
770 | |
771 Manish Sud <msud@san.rr.com> | |
772 | |
773 =head1 SEE ALSO | |
774 | |
775 AnalyzeTextFilesData.pl, InfoTextFiles.pl, ExtractFromTextFiles.pl | |
776 | |
777 =head1 COPYRIGHT | |
778 | |
779 Copyright (C) 2015 Manish Sud. All rights reserved. | |
780 | |
781 This file is part of MayaChemTools. | |
782 | |
783 MayaChemTools is free software; you can redistribute it and/or modify it under | |
784 the terms of the GNU Lesser General Public License as published by the Free | |
785 Software Foundation; either version 3 of the License, or (at your option) | |
786 any later version. | |
787 | |
788 =cut |