1 #!/usr/bin/perl -w 2 # 3 # $RCSfile: ElementalAnalysisSDFiles.pl,v $ 4 # $Date: 2015/02/28 20:46:19 $ 5 # $Revision: 1.24 $ 6 # 7 # Author: Manish Sud <msud@san.rr.com> 8 # 9 # Copyright (C) 2015 Manish Sud. All rights reserved. 10 # 11 # This file is part of MayaChemTools. 12 # 13 # MayaChemTools is free software; you can redistribute it and/or modify it under 14 # the terms of the GNU Lesser General Public License as published by the Free 15 # Software Foundation; either version 3 of the License, or (at your option) any 16 # later version. 17 # 18 # MayaChemTools is distributed in the hope that it will be useful, but without 19 # any warranty; without even the implied warranty of merchantability of fitness 20 # for a particular purpose. See the GNU Lesser General Public License for more 21 # details. 22 # 23 # You should have received a copy of the GNU Lesser General Public License 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, 26 # Boston, MA, 02111-1307, USA. 27 # 28 29 use strict; 30 use FindBin; use lib "$FindBin::Bin/../lib"; 31 use Getopt::Long; 32 use File::Basename; 33 use Text::ParseWords; 34 use Benchmark; 35 use FileUtil; 36 use SDFileUtil; 37 use TextUtil; 38 use MolecularFormula; 39 use FileIO::SDFileIO; 40 use Molecule; 41 42 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); 43 44 # Autoflush STDOUT 45 $| = 1; 46 47 # Starting message... 48 $ScriptName = basename($0); 49 print "\n$ScriptName: Starting...\n\n"; 50 $StartTime = new Benchmark; 51 52 # Get the options and setup script... 53 SetupScriptUsage(); 54 if ($Options{help} || @ARGV < 1) { 55 die GetUsageFromPod("$FindBin::Bin/$ScriptName"); 56 } 57 58 my(@SDFilesList); 59 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); 60 61 print "Processing options...\n"; 62 my(%OptionsInfo); 63 ProcessOptions(); 64 65 print "Checking input SD file(s)...\n"; 66 my(%SDFilesInfo); 67 RetrieveSDFilesInfo(); 68 69 # Generate output files... 70 my($FileIndex); 71 if (@SDFilesList > 1) { 72 print "\nProcessing SD files...\n"; 73 } 74 for $FileIndex (0 .. $#SDFilesList) { 75 if ($SDFilesInfo{FileOkay}[$FileIndex]) { 76 print "\nProcessing file $SDFilesList[$FileIndex]...\n"; 77 PerformElementalAnalysis($FileIndex); 78 } 79 } 80 print "\n$ScriptName:Done...\n\n"; 81 82 $EndTime = new Benchmark; 83 $TotalTime = timediff ($EndTime, $StartTime); 84 print "Total time: ", timestr($TotalTime), "\n"; 85 86 ############################################################################### 87 88 # Perform analysis... 89 sub PerformElementalAnalysis { 90 my($Index) = @_; 91 my($SDFile, $NewSDFile, $KeyDataFieldName, $CmpdCount, $CurrentFormula, $FormulaFieldName, $CmpdString, $Value, $CalculationType, $CalculatedValue, $ErrorMsg, $Status, $ElementsRef, $ElementCompositionRef, $Molecule, @CalculatedValues, @CmpdLines, %DataFieldValuesMap); 92 93 $SDFile = $SDFilesList[$Index]; 94 $NewSDFile = $SDFilesInfo{OutFile}[$Index]; 95 96 print "Generating new SD file $NewSDFile...\n"; 97 open NEWSDFILE, ">$NewSDFile" or die "Error: Couldn't open $NewSDFile: $! \n"; 98 open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n"; 99 100 101 $CmpdCount = 0; 102 $FormulaFieldName = $SDFilesInfo{FormulaFieldName}[$Index]; 103 104 COMPOUND: while ($CmpdString = ReadCmpdString(\*SDFILE)) { 105 $CmpdCount++; 106 @CmpdLines = split "\n", $CmpdString; 107 %DataFieldValuesMap = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); 108 109 @CalculatedValues = (); 110 for $Value (@{$OptionsInfo{SpecifiedCalculations}}) { 111 push @CalculatedValues, ''; 112 } 113 114 $CurrentFormula = undef; 115 116 if ($OptionsInfo{UseStructureData}) { 117 $Molecule = FileIO::SDFileIO::ParseMoleculeString($CmpdString); 118 $CurrentFormula = $Molecule->GetMolecularFormula(); 119 } 120 else { 121 if (!exists $DataFieldValuesMap{$FormulaFieldName}) { 122 $ErrorMsg = "Ignoring compound record $CmpdCount: Formula field $FormulaFieldName not found"; 123 PrintErrorMsg($CmpdString, $ErrorMsg); 124 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues); 125 next COMPOUND; 126 } 127 128 # Make sure it's a valid molecular formula... 129 $CurrentFormula = $DataFieldValuesMap{$FormulaFieldName}; 130 if ($OptionsInfo{CheckFormula}) { 131 ($Status, $ErrorMsg) = MolecularFormula::IsMolecularFormula($CurrentFormula); 132 if (!$Status) { 133 $ErrorMsg = "Ignoring compound record $CmpdCount: Formula field value $CurrentFormula is not valid: $ErrorMsg"; 134 PrintErrorMsg($CmpdString, $ErrorMsg); 135 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues); 136 next COMPOUND; 137 } 138 } 139 } 140 141 # Calculate appropriate values and write 'em out... 142 @CalculatedValues = (); 143 for $CalculationType (@{$OptionsInfo{SpecifiedCalculations}}) { 144 if ($CalculationType =~ /^ElementalAnalysis$/i) { 145 ($ElementsRef, $ElementCompositionRef) = MolecularFormula::CalculateElementalComposition($CurrentFormula); 146 $CalculatedValue = (defined($ElementsRef) && defined($ElementCompositionRef)) ? MolecularFormula::FormatCompositionInfomation($ElementsRef, $ElementCompositionRef, $OptionsInfo{Precision}) : ''; 147 } 148 elsif ($CalculationType =~ /^MolecularWeight$/i) { 149 $CalculatedValue = MolecularFormula::CalculateMolecularWeight($CurrentFormula); 150 $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : ""; 151 } 152 elsif ($CalculationType =~ /^ExactMass$/i) { 153 $CalculatedValue = MolecularFormula::CalculateExactMass($CurrentFormula); 154 $CalculatedValue = (defined($CalculatedValue) && length($CalculatedValue)) ? (sprintf("%.$OptionsInfo{Precision}f", $CalculatedValue)) : ""; 155 } 156 else { 157 $CalculatedValue = ''; 158 } 159 push @CalculatedValues, $CalculatedValue; 160 } 161 WriteNewCompoundRecord($Index, \*NEWSDFILE, \@CmpdLines, \@CalculatedValues, $CurrentFormula); 162 } 163 close NEWSDFILE; 164 close SDFILE; 165 } 166 167 # Write out compound record with calculated values... 168 sub WriteNewCompoundRecord { 169 my($Index, $SDFileRef, $CmpdLinesRef, $CalculatedValuesRef, $MolecularFormula) = @_; 170 171 # Write out compound lines except the last line which contains $$$$... 172 my($LineIndex); 173 for $LineIndex (0 .. ($#{$CmpdLinesRef} - 1)) { 174 print $SDFileRef "$CmpdLinesRef->[$LineIndex]\n"; 175 } 176 177 # Write out calculated values... 178 my($CalcIndex, $FieldName, $FieldValue); 179 for $CalcIndex (0 .. $#{$OptionsInfo{SpecifiedCalculations}}) { 180 $FieldName = $SDFilesInfo{ValueFieldNamesMap}[$Index]{$OptionsInfo{SpecifiedCalculations}[$CalcIndex]}; 181 $FieldValue = $CalculatedValuesRef->[$CalcIndex]; 182 print $SDFileRef "> <$FieldName>\n$FieldValue\n\n"; 183 } 184 185 if ($OptionsInfo{UseStructureData} && $OptionsInfo{WriteOutFormula} && defined($MolecularFormula)) { 186 $FieldName = $SDFilesInfo{ValueFieldNamesMap}[$Index]{MolecularFormula}; 187 $FieldValue = $MolecularFormula; 188 print $SDFileRef "> <$FieldName>\n$FieldValue\n\n"; 189 } 190 191 print $SDFileRef "\$\$\$\$\n"; 192 } 193 194 # Print out error message... 195 sub PrintErrorMsg { 196 my($CmpdString, $ErrorMsg) = @_; 197 198 if ($OptionsInfo{DetailLevel} >= 2 ) { 199 print "$ErrorMsg:\n$CmpdString\n"; 200 } 201 elsif ($OptionsInfo{DetailLevel} >= 1) { 202 print "$ErrorMsg\n"; 203 } 204 } 205 206 # Retrieve information about input SD files... 207 sub RetrieveSDFilesInfo { 208 my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $FormulaFieldName, $Value, $FieldName, $NewFieldName, $Count); 209 210 my(%NewValueFieldNames) = (ElementalAnalysis => 'ElementalAnalysis', MolecularWeight => 'MolecularWeight', ExactMass => 'ExactMass', MolecularFormula => 'MolecularFormula'); 211 if (@{$OptionsInfo{SpecifiedValueFieldNames}}) { 212 for ($Index = 0; $Index < @{$OptionsInfo{SpecifiedValueFieldNames}}; $Index +=2) { 213 $Value = $OptionsInfo{SpecifiedValueFieldNames}[$Index]; 214 $FieldName = $OptionsInfo{SpecifiedValueFieldNames}[$Index + 1]; 215 if (exists $NewValueFieldNames{$Value}) { 216 $NewValueFieldNames{$Value} = $FieldName; 217 } 218 } 219 } 220 221 %SDFilesInfo = (); 222 223 @{$SDFilesInfo{FileOkay}} = (); 224 @{$SDFilesInfo{OutFile}} = (); 225 @{$SDFilesInfo{FormulaFieldName}} = (); 226 @{$SDFilesInfo{ValueFieldNamesMap}} = (); 227 228 FILELIST: for $Index (0 .. $#SDFilesList) { 229 $SDFile = $SDFilesList[$Index]; 230 231 $SDFilesInfo{FileOkay}[$Index] = 0; 232 $SDFilesInfo{OutFile}[$Index] = ''; 233 $SDFilesInfo{FormulaFieldName}[$Index] = ''; 234 235 %{$SDFilesInfo{ValueFieldNamesMap}[$Index]} = (); 236 237 if (!(-e $SDFile)) { 238 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; 239 next FILELIST; 240 } 241 if (!CheckFileType($SDFile, "sd sdf")) { 242 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; 243 next FILELIST; 244 } 245 $FileDir = ""; $FileName = ""; $FileExt = ""; 246 ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); 247 if ($Options{root} && (@SDFilesList == 1)) { 248 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root}); 249 if ($RootFileName && $RootFileExt) { 250 $FileName = $RootFileName; 251 } 252 else { 253 $FileName = $Options{root}; 254 } 255 $OutFileRoot = $FileName; 256 } 257 else { 258 $OutFileRoot = $FileName . "ElementalAnalysis"; 259 } 260 261 $OutFile = $OutFileRoot . ".$FileExt"; 262 if (lc($OutFile) eq lc($SDFile)) { 263 warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n"; 264 next FILELIST; 265 } 266 if (!$Options{overwrite}) { 267 if (-e $OutFile) { 268 warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n"; 269 next FILELIST; 270 } 271 } 272 # Get data field names and values... 273 my($CmpdString, $FieldName, @CmpdLines, @DataFieldNames, %DataFieldNamesMap); 274 @DataFieldNames = (); 275 if (!open(SDFILE, "$SDFile")) { 276 warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; 277 next FILELIST; 278 } 279 $CmpdString = ReadCmpdString(\*SDFILE); 280 close SDFILE; 281 282 @CmpdLines = split "\n", $CmpdString; 283 @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines); 284 %DataFieldNamesMap = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); 285 286 # Setup formula field name... 287 $FormulaFieldName = ''; 288 if ($OptionsInfo{UseDataField}) { 289 if ($OptionsInfo{SpecifiedFormulaFieldName}) { 290 $FormulaFieldName = $OptionsInfo{SpecifiedFormulaFieldName}; 291 } 292 else { 293 FIELDNAME: for $FieldName (@DataFieldNames) { 294 if ($FieldName =~ /Formula/i) { 295 $FormulaFieldName = $FieldName; 296 last FIELDNAME; 297 } 298 } 299 if (!$FormulaFieldName) { 300 warn "Warning: Ignoring file $SDFile: Data field label containing the word Formula doesn't exist\n"; 301 next FILELIST; 302 } 303 } 304 } 305 $SDFilesInfo{FileOkay}[$Index] = 1; 306 $SDFilesInfo{OutFile}[$Index] = $OutFile; 307 $SDFilesInfo{FormulaFieldName}[$Index] = $FormulaFieldName; 308 309 # Setup value data field names for calculated values... 310 for $Value (keys %NewValueFieldNames) { 311 $FieldName = $NewValueFieldNames{$Value}; 312 313 # Make sure it doesn't already exists... 314 $Count = 1; 315 $NewFieldName = $FieldName; 316 while (exists $DataFieldNamesMap{$NewFieldName}) { 317 $Count++; 318 $NewFieldName = $FieldName . $Count; 319 } 320 $SDFilesInfo{ValueFieldNamesMap}[$Index]{$Value} = $NewFieldName; 321 } 322 } 323 } 324 325 # Process option values... 326 sub ProcessOptions { 327 %OptionsInfo = (); 328 329 $OptionsInfo{Mode} = $Options{mode}; 330 $OptionsInfo{FormulaMode} = $Options{formulamode}; 331 332 $OptionsInfo{WriteOutFormula} = ($Options{formulaout} =~ /^Yes$/i) ? 1 : 0; 333 334 $OptionsInfo{UseStructureData} = ($Options{formulamode} =~ /^StructureData$/i) ? 1 : 0; 335 $OptionsInfo{UseDataField} = ($Options{formulamode} =~ /^DataField$/i) ? 1 : 0; 336 337 $OptionsInfo{Fast} = defined $Options{fast} ? $Options{fast} : undef; 338 339 $OptionsInfo{DetailLevel} = $Options{detail}; 340 $OptionsInfo{CheckFormula} = $Options{fast} ? 0 : 1; 341 $OptionsInfo{Precision} = $Options{precision}; 342 343 $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef; 344 345 $OptionsInfo{FormulaField} = defined $Options{formulafield} ? $Options{formulafield} : undef; 346 $OptionsInfo{SpecifiedFormulaFieldName} = ""; 347 348 if (defined $Options{formulafield}) { 349 $OptionsInfo{SpecifiedFormulaFieldName} = $Options{formulafield}; 350 } 351 # Setup what to calculate... 352 @{$OptionsInfo{SpecifiedCalculations}} = (); 353 if ($Options{mode} =~ /^All$/i) { 354 @{$OptionsInfo{SpecifiedCalculations}} = qw(ElementalAnalysis MolecularWeight ExactMass); 355 } 356 else { 357 my($Mode, $ModeValue, @SpecifiedModeValues); 358 $Mode = $Options{mode}; 359 $Mode =~ s/ //g; 360 @SpecifiedModeValues = split /\,/, $Mode; 361 for $ModeValue (@SpecifiedModeValues) { 362 if ($ModeValue !~ /^(ElementalAnalysis|MolecularWeight|ExactMass)$/i) { 363 if ($ModeValue =~ /^All$/i) { 364 die "Error: All value for option \"-m --mode\" is not allowed with other valid values.\n"; 365 } 366 else { 367 die "Error: The value specified, $ModeValue, for option \"-m --mode\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, or ExactMass\n"; 368 } 369 } 370 push @{$OptionsInfo{SpecifiedCalculations}}, $ModeValue; 371 } 372 } 373 374 $OptionsInfo{ValueFieldNames} = defined $Options{valuefieldnames} ? $Options{valuefieldnames} : undef; 375 @{$OptionsInfo{SpecifiedValueFieldNames}} = (); 376 377 if ($Options{valuefieldnames}) { 378 my($Value, $Label, @ValueLabels); 379 @ValueLabels = split /\,/, $Options{valuefieldnames}; 380 if (@ValueLabels % 2) { 381 die "Error: The value specified, $Options{valuefieldnames}, for option \"-v --valuefieldnames\" is not valid: It must contain even number of comma delimited values\n"; 382 } 383 my($Index); 384 for ($Index = 0; $Index < @ValueLabels; $Index +=2) { 385 $Value = $ValueLabels[$Index]; 386 $Value =~ s/ //g; 387 $Label = $ValueLabels[$Index + 1]; 388 if ($Value !~ /^(ElementalAnalysis|MolecularWeight|ExactMass|MolecularFormula)$/i) { 389 die "Error: The value specified, $Value, using option \"-v --valuefieldnames\" is not valid. Allowed values: ElementalAnalysis, MolecularWeight, ExactMass, or MolecularFormula\n"; 390 } 391 push @{$OptionsInfo{SpecifiedValueFieldNames}}, ($Value, $Label); 392 } 393 } 394 } 395 396 # Setup script usage and retrieve command line arguments specified using various options... 397 sub SetupScriptUsage { 398 399 # Retrieve all the options... 400 %Options = (); 401 $Options{detail} = 1; 402 $Options{formulamode} = "DataField"; 403 $Options{formulaout} = "No"; 404 $Options{mode} = "All"; 405 $Options{precision} = 2; 406 407 if (!GetOptions(\%Options, "detail|d=i", "fast", "formulafield=s", "formulamode|f=s", "formulaout=s", "mode|m=s", "help|h", "overwrite|o", "precision|p=i", "root|r=s", "valuefieldnames|v=s", "workingdir|w=s")) { 408 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 409 } 410 if ($Options{workingdir}) { 411 if (! -d $Options{workingdir}) { 412 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 413 } 414 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 415 } 416 if (!IsPositiveInteger($Options{detail})) { 417 die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"; 418 } 419 if ($Options{formulamode} !~ /^(StructureData|DataField)$/i) { 420 die "Error: The value specified, $Options{formulamode}, for option \"-f, --formulamode\" is not valid. Allowed values: StructureData or DataField \n"; 421 } 422 if ($Options{formulaout} !~ /^(Yes|No)$/i) { 423 die "Error: The value specified, $Options{formulaout}, for option \"--formulaout\" is not valid. Allowed values: Yes or No \n"; 424 } 425 if (!IsPositiveInteger($Options{precision})) { 426 die "Error: The value specified, $Options{precision}, for option \"-p --precision\" is not valid. Allowed values: > 0 \n"; 427 } 428 } 429