#!/usr/bin/perl -w
#
# $RCSfile: ExtractFromSDFiles.pl,v $
# $Date: 2015/03/22 19:11:27 $
# $Revision: 1.48 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

# File-scoped state shared with the subroutines defined further down...
my(%Options);
my($ScriptName, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Announce the start and begin timing the run...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage when help is requested or no
# input file is named on the command line...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Expand the command line arguments into the list of SD files to process...
my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files that passed the checks in RetrieveSDFilesInfo are processed...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ExtractFromSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Extract data from a SD file...
#
# Picks the extraction routine matching $OptionsInfo{Mode} (compared without
# regard to case), opens the input and output files for the file at $FileIndex,
# runs the routine and closes the files again.
#
# Dies, before any file is opened, when the mode is not one of the supported
# values.
#
sub ExtractFromSDFile {
  my($FileIndex) = @_;

  # Lower case mode name => routine implementing it...
  my(%ModeHandlers) = (
    "alldatafields"         => \&ExtractAllDataFields,
    "commondatafields"      => \&ExtractCommonDataFields,
    "datafields"            => \&ExtractDataFields,
    "datafieldbylist"       => \&ExtractDataFieldByList,
    "datafielduniquebylist" => \&ExtractDataFieldByList,
    "datafieldnotbylist"    => \&ExtractDataFieldNotByList,
    "datafieldsbyvalue"     => \&ExtractDataFieldsByValue,
    "datafieldsbyregex"     => \&ExtractDataFieldsByRegex,
    "randomcmpds"           => \&ExtractRandomCompounds,
    "molnames"              => \&ExtractMolNames,
    "recordnum"             => \&ExtractRecordNum,
    "recordnums"            => \&ExtractRecordNums,
    "recordrange"           => \&ExtractRecordRange,
    "2dcmpdrecords"         => \&Extract2DCmpdRecords,
    "3dcmpdrecords"         => \&Extract3DCmpdRecords,
  );

  # Validate the mode first so that an invalid value doesn't leave behind
  # freshly created, empty output files...
  my($ModeHandler) = $ModeHandlers{lc $OptionsInfo{Mode}};
  if (!defined $ModeHandler) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $ModeHandler->($FileIndex);
  CloseInputAndOutputFiles();
}

# Extract all data fields...
#
# Uses every data field label collected for the file at $FileIndex as the
# output columns and writes each compound to the text and SD outputs.
#
sub ExtractAllDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract common data fields...
#
# Same as ExtractAllDataFields except that the output columns are the labels
# recorded in CommonDataFieldLabels for the file at $FileIndex.
#
sub ExtractCommonDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract specified data fields...
sub ExtractDataFields {
  # Output columns are the labels given in $OptionsInfo{SpecifiedDataFieldLabels};
  # every compound in the input file is written out...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}

# Extract data fields using a list...
#
# Writes compounds whose value for $OptionsInfo{SpecifiedDataFieldLabel} is a
# key of $OptionsInfo{SpecifiedDataFieldValues}. In DataFieldUniqueByList mode
# only the first compound for each listed value is written; in DataFieldByList
# mode every matching compound is written. In both modes reading stops as soon
# as each listed value has been seen once.
#
# NOTE(review): because of that early stop, DataFieldByList mode does not write
# matching compounds located after the point where the last listed value was
# first seen - confirm this is intended.
#
sub ExtractDataFieldByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Start out with all the listed values marked as not seen in this file...
  for my $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
  }
  my($FoundValuesCount) = 0;
  my($SpecifiedDataFieldLabel) = $OptionsInfo{SpecifiedDataFieldLabel};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    # Compounds without the specified data field can't match...
    next CMPDSTRING unless exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};

    SetupDataValues();

    my($CurrentValue) = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
    next CMPDSTRING unless exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue};

    if ($FoundValuesCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
      my($FirstOccurrence) = 0;
      if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") {
        $FirstOccurrence = 1;
        $FoundValuesCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
      }
      # The two mode names are mutually exclusive, so a compound is written at most once...
      if (($FirstOccurrence && $OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) || $OptionsInfo{Mode} =~ /^DataFieldByList$/i) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }

    last CMPDSTRING if ($FoundValuesCount >= $OptionsInfo{SpecifiedDataFieldValuesCount});
  }
}

# Extract data field whose values are not on the specified list...
#
# Writes compounds which have the data field $OptionsInfo{SpecifiedDataFieldLabel}
# with a non-empty value that is not a key of $OptionsInfo{SpecifiedDataFieldValues}.
#
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($SpecifiedDataFieldLabel) = $OptionsInfo{SpecifiedDataFieldLabel};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    next CMPDSTRING unless exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};

    SetupDataValues();

    my($CurrentValue) = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};

    # Skip empty values and values which are on the specified list...
    next CMPDSTRING if (IsEmpty($CurrentValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue});

    WriteSDFileCmpdString();
    WriteTextFileCmpdData();
  }
}

# Extract data fields by value...
sub ExtractDataFieldsByValue {
  # For each compound, every label in $OptionsInfo{SpecifiedDataFieldLabels} that
  # the compound carries is compared against its specified value using the
  # criterion (eq, le or ge) mapped to that label. Comparisons are numerical when
  # $OptionsInfo{NumericalComparison} is set and string based otherwise. The
  # compound is written when the number of failed criteria does not exceed
  # $OptionsInfo{Violations}. Labels missing from a compound and unrecognized
  # criteria never count as violations...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();

    my($ViolationCount) = 0;
    LABEL: for my $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless exists $SDFilesInfo{DataFieldValues}{$Label};

      my($CurrentValue) = $SDFilesInfo{DataFieldValues}{$Label};
      my($SpecifiedCriterion) = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label};
      my($SpecifiedValue) = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};

      my($Violated) = 0;
      if ($SpecifiedCriterion =~ /^eq$/i) {
        if ($OptionsInfo{NumericalComparison}) {
          $Violated = 1 if ($CurrentValue != $SpecifiedValue);
        }
        else {
          $Violated = 1 if ($CurrentValue ne $SpecifiedValue);
        }
      }
      elsif ($SpecifiedCriterion =~ /^le$/i) {
        if ($OptionsInfo{NumericalComparison}) {
          $Violated = 1 if ($CurrentValue > $SpecifiedValue);
        }
        else {
          $Violated = 1 if ($CurrentValue gt $SpecifiedValue);
        }
      }
      elsif ($SpecifiedCriterion =~ /^ge$/i) {
        if ($OptionsInfo{NumericalComparison}) {
          $Violated = 1 if ($CurrentValue < $SpecifiedValue);
        }
        else {
          $Violated = 1 if ($CurrentValue lt $SpecifiedValue);
        }
      }
      $ViolationCount++ if $Violated;
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract data fields by value using regular expression match...
#
# Works like ExtractDataFieldsByValue except that each specified label maps to
# a regular expression and a criterion of eq (value must match) or ne (value
# must not match). Matching ignores case when $OptionsInfo{RegexIgnoreCase} is
# set.
#
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    SetupDataValues();

    my($ViolationCount) = 0;
    LABEL: for my $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless exists $SDFilesInfo{DataFieldValues}{$Label};

      my($CurrentValue) = $SDFilesInfo{DataFieldValues}{$Label};
      my($SpecifiedRegexCriterion) = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
      my($SpecifiedRegex) = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};

      # The value is matched directly after the criterion test, in statement
      # form, so that the pattern is evaluated only for a recognized criterion...
      my($Violated) = 0;
      if ($SpecifiedRegexCriterion =~ /^eq$/i) {
        if ($OptionsInfo{RegexIgnoreCase}) {
          $Violated = 1 if ($CurrentValue !~ /$SpecifiedRegex/i);
        }
        else {
          $Violated = 1 if ($CurrentValue !~ /$SpecifiedRegex/);
        }
      }
      elsif ($SpecifiedRegexCriterion =~ /^ne$/i) {
        if ($OptionsInfo{RegexIgnoreCase}) {
          $Violated = 1 if ($CurrentValue =~ /$SpecifiedRegex/i);
        }
        else {
          $Violated = 1 if ($CurrentValue =~ /$SpecifiedRegex/);
        }
      }
      $ViolationCount++ if $Violated;
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}

# Extract random compounds...
sub ExtractRandomCompounds {
  # Writes $OptionsInfo{NumOfCmpds} distinct, randomly chosen compounds from the
  # file at $FileIndex, or all of them when the file holds fewer compounds. The
  # random number generator is seeded with $OptionsInfo{Seed}, so a given seed
  # always selects the same records of a given file...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($CmpdCount) = $SDFilesInfo{CmpdCount}[$FileIndex];

  # Can't select more distinct records than the file contains...
  my($NumToSelect) = $OptionsInfo{NumOfCmpds};
  if ($NumToSelect > $CmpdCount) {
    $NumToSelect = $CmpdCount;
  }

  srand($OptionsInfo{Seed});

  # Draw 1-based record numbers until the requested number of distinct ones has
  # been collected; repeated draws simply land on an existing key...
  my(%RandomCmpdIndexMap) = ();
  while (keys(%RandomCmpdIndexMap) < $NumToSelect) {
    my($RandomIndex) = int(rand $CmpdCount) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }

  my($CmpdNum) = 0;
  my($WrittenCount) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    next CMPDSTRING unless exists $RandomCmpdIndexMap{$CmpdNum};

    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # No need to read the rest of the file once all selected records are out...
    $WrittenCount++;
    last CMPDSTRING if ($WrittenCount >= $NumToSelect);
  }
}

# Extract mol names...
#
# Writes the quoted molecule name, parsed from the first line of each compound,
# to the text file under a single "MolName" column.
#
sub ExtractMolNames {
  my($FileIndex) = @_;

  # Assign instead of appending: the label list is shared across input files
  # and must hold exactly one column for every file...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  # Nothing to write to without a text file, as in the other text writers...
  my($NewTextFileRef) = $SDFilesInfo{NewTextFileRef};
  if (!$OptionsInfo{OutputTextFile} || !$NewTextFileRef) {
    return;
  }

  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    my($MolName) = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}

# Extract a specific compound record...
sub ExtractRecordNum {
  # Writes the single compound whose 1-based position in the input file equals
  # $OptionsInfo{RecordNum} and stops reading right after it...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($CmpdNum) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    next CMPDSTRING if ($CmpdNum != $OptionsInfo{RecordNum});

    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
    last CMPDSTRING;
  }
}

# Extract a specific compound records...
#
# Writes the compounds whose 1-based position in the input file is a key of
# $OptionsInfo{RecordNums}. Reading stops at the first non-selected record that
# lies past $OptionsInfo{RecordNumsMax} or that is read after
# $OptionsInfo{RecordNumsCount} records have already been written.
#
sub ExtractRecordNums {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($CmpdNum, $ExtractedCount) = (0, 0);

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;

    if (!exists $OptionsInfo{RecordNums}{$CmpdNum}) {
      # Not a requested record: check whether there is anything left to find...
      if ($CmpdNum > $OptionsInfo{RecordNumsMax} || $ExtractedCount >= $OptionsInfo{RecordNumsCount}) {
        last CMPDSTRING;
      }
      next CMPDSTRING;
    }

    $ExtractedCount++;
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}


# Extract compounds in a specific record range...
sub ExtractRecordRange {
  # Writes the compounds whose 1-based position in the input file lies between
  # $OptionsInfo{StartRecordNum} and $OptionsInfo{EndRecordNum}, both inclusive,
  # and stops reading at the first record past the end of the range...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($CmpdNum) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;

    last CMPDSTRING if ($CmpdNum > $OptionsInfo{EndRecordNum});
    next CMPDSTRING if ($CmpdNum < $OptionsInfo{StartRecordNum});

    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract 2D compound records...
#
# Writes the compounds for which IsCmpd2D returns true.
#
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    next CMPDSTRING unless IsCmpd2D(\@CmpdLines);

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}

# Extract 3D compound records...
sub Extract3DCmpdRecords {
  # Writes the compounds for which IsCmpd3D returns true...
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    next CMPDSTRING unless IsCmpd3D(\@CmpdLines);

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}


# Open input and output files...
#
# Opens the input SD file at $FileIndex for reading and, depending on
# $OptionsInfo{OutputSDFile} and $OptionsInfo{OutputTextFile}, the new SD and
# text files for writing. The file handles are stored in
# $SDFilesInfo{InputSDFileRef}, $SDFilesInfo{NewSDFileRef} and
# $SDFilesInfo{NewTextFileRef}; handles for outputs that are not requested are
# left undefined.
#
# Dies when any of the files can't be opened.
#
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  my($SDFileName) = $SDFilesList[$FileIndex];
  my($NewSDFileName) = $SDFilesInfo{NewSDFileName}[$FileIndex];
  my($NewTextFileName) = $SDFilesInfo{NewTextFileName}[$FileIndex];

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  # Three-argument open keeps the mode apart from the file name, so names with
  # leading or trailing white space or characters such as > and | are taken
  # literally...
  if ($OptionsInfo{OutputSDFile}) {
    open(my $NewSDFileHandle, ">", $NewSDFileName) or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileHandle;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open(my $NewTextFileHandle, ">", $NewTextFileName) or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileHandle;
  }

  open(my $SDFileHandle, "<", $SDFileName) or die "Error: Couldn't open $SDFileName: $! \n";
  $SDFilesInfo{InputSDFileRef} = $SDFileHandle;
}

# Close open input and output files...
sub CloseInputAndOutputFiles {
  # Closes whichever of the new SD, new text and input SD file handles are open
  # and resets the stored handles to undef...
  my($HandleKey);

  # Buffered write errors only show up at close time, so report them...
  for $HandleKey ("NewSDFileRef", "NewTextFileRef") {
    if ($SDFilesInfo{$HandleKey}) {
      close($SDFilesInfo{$HandleKey}) or warn "Warning: Couldn't close output file: $! \n";
    }
  }

  if ($SDFilesInfo{InputSDFileRef}) {
    close $SDFilesInfo{InputSDFileRef};
  }

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;
  $SDFilesInfo{InputSDFileRef} = undef;
}

# Write out column labels for text file...
#
# Prints the labels in $SDFilesInfo{DataLabels}, followed by a
# "StructureDataString" column when $OptionsInfo{OutputStrDataString} is set,
# as one delimited line. Does nothing unless text output is requested.
#
sub WriteTextFileColLabels {
  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }
  my($NewTextFileRef) = $SDFilesInfo{NewTextFileRef};

  # Work on a copy so that the extra column never leaks into the shared labels...
  my(@ColLabels) = @{$SDFilesInfo{DataLabels}};
  if ($OptionsInfo{OutputStrDataString}) {
    push @ColLabels, "StructureDataString";
  }

  my($ColLabelsLine) = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$ColLabelsLine\n";
}

# Setup values for data fields...
#
# Fills $SDFilesInfo{DataValues} with the current compound's value for each
# label in $SDFilesInfo{DataLabels}, using an empty string for labels the
# compound doesn't have.
#
sub SetupDataValues {
  @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}};
}

# Return the structure part of a compound string: everything up to the
# "M  END" line, followed by a canonical "M  END" terminator...
sub _ExtractMolBlock {
  my($CmpdString) = @_;

  # The connection table terminator is "M", two spaces and "END" at the start
  # of a line; any run of blanks is accepted to be tolerant of sloppy writers...
  my($MolString) = split /^M[ \t]+END/m, $CmpdString, 2;
  if (!defined $MolString) {
    $MolString = "";
  }

  return $MolString . "M  END";
}

# Write out structure data and specific data fields to SD file...
#
# Prints the structure of the current compound followed by one data item for
# each label in $SDFilesInfo{DataLabels}, using the values prepared by
# SetupDataValues, and the record terminator. Does nothing unless SD output is
# requested.
#
sub WriteSDFileCmpdData {
  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  my($NewSDFileRef) = $SDFilesInfo{NewSDFileRef};

  my($MolString) = _ExtractMolBlock($SDFilesInfo{CmpdString});
  print $NewSDFileRef "$MolString\n";

  for my $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}

# Write out compound string...
#
# Prints the current compound string unchanged to the new SD file. Does nothing
# unless SD output is requested.
#
sub WriteSDFileCmpdString {
  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  my($NewSDFileRef) = $SDFilesInfo{NewSDFileRef};
  print $NewSDFileRef "$SDFilesInfo{CmpdString}\n";
}

# Write out data for text file...
#
# Prints the values in $SDFilesInfo{DataValues} as one delimited line. When
# $OptionsInfo{OutputStrDataString} is set, the structure of the compound - or
# the complete compound string when $OptionsInfo{StrDataStringWithFields} is
# set - is appended as a last column with its line breaks replaced by
# $OptionsInfo{StrDataStringDelimiter}. Does nothing unless text output is
# requested.
#
sub WriteTextFileCmpdData {
  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  my($NewTextFileRef) = $SDFilesInfo{NewTextFileRef};
  my($DataValuesLine) = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  $DataValuesLine =~ s/\n/;/g;

  if (!$OptionsInfo{OutputStrDataString}) {
    print $NewTextFileRef "$DataValuesLine\n";
    return;
  }

  # Append structure data string...
  my($StrDataString);
  if ($OptionsInfo{StrDataStringWithFields}) {
    $StrDataString = $SDFilesInfo{CmpdString};
  }
  else {
    $StrDataString = _ExtractMolBlock($SDFilesInfo{CmpdString});
  }
  my($StrDataStringDelimiter) = $OptionsInfo{StrDataStringDelimiter};
  $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

  my($OutDelim) = $OptionsInfo{OutDelim};
  my($OutQuoteValue) = $OptionsInfo{OutQuote} ? "\"" : "";

  print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
}

# Retrieve information about input SD files...
sub RetrieveSDFilesInfo {
  # Checks each file in @SDFilesList and records, by file index, in %SDFilesInfo:
  #
  #   FileOkay               - 1 when the file can be processed; 0 otherwise
  #   CmpdCount              - number of compounds read while scanning the file
  #   NewSDFileName          - name of the SD file to generate
  #   NewTextFileName        - name of the text file to generate
  #   AllDataFieldLabels     - data field labels in order of first appearance
  #   CommonDataFieldLabels  - labels present in every compound; only filled in
  #                            for commondatafields mode
  #
  # A file is skipped with a warning when it doesn't exist, isn't a SD file,
  # would be overwritten by its own SD output, has output files which already
  # exist while overwriting is off, or can't be opened. The file contents are
  # scanned only when compounds need to be counted (randomcmpds mode) or column
  # labels are needed for text output.
  #
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileName}} = ();
  @{$SDFilesInfo{NewSDFileName}} = ();

  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{CommonDataFieldLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileName}[$Index] = "";
    $SDFilesInfo{NewSDFileName}[$Index] = "";

    @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
    @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file: the input file name
    # plus the mode specific suffix, unless a root name is specified and there
    # is just one input file...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . $OptionsInfo{FileNameMode};
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $OptionsInfo{OutFileRoot};
      }
    }
    $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
    $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";

    if ($OptionsInfo{OutputSDFile}) {
      if (lc($NewSDFileName) eq lc($SDFile)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{Overwrite}) {
      if ($OptionsInfo{OutputSDFile}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{OutputTextFile}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Three-argument open on a lexical handle: the file name is never parsed
    # for a mode and the handle can't clash with other users of a global name...
    my($SDFileHandle);
    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CountCmpds, $CollectDataFields);
    my($CmpdString, @CmpdLines, @DataFieldLabels, %LabelCmpdCount, @CommonDataFieldLabels);

    $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;

    # Column labels are needed by every mode which writes data field columns to
    # a text file; that includes the 2D and 3D record modes...
    $CollectDataFields = ($OptionsInfo{OutputTextFile} && $OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) ? 1 : 0;

    $CmpdCount = 0;
    if ($CountCmpds || $CollectDataFields) {
      @DataFieldLabels = ();
      @CommonDataFieldLabels = ();
      %LabelCmpdCount = ();
      CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
        $CmpdCount++;
        if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
          # Only the labels of the requested record matter...
          if ($CmpdCount == $OptionsInfo{RecordNum}) {
            @CmpdLines = split "\n", $CmpdString;
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            last CMPDSTRING;
          }
        }
        if ($CollectDataFields) {
          # Keep the labels in order of first appearance and count the number of
          # compounds carrying each one; a label repeated within a compound is
          # counted, and listed, just once...
          my($Label, %SeenInCmpd);
          @CmpdLines = split "\n", $CmpdString;
          %SeenInCmpd = ();
          for $Label (GetCmpdDataHeaderLabels(\@CmpdLines)) {
            if ($SeenInCmpd{$Label}) {
              next;
            }
            $SeenInCmpd{$Label} = 1;
            if (!exists $LabelCmpdCount{$Label}) {
              push @DataFieldLabels, $Label;
              $LabelCmpdCount{$Label} = 0;
            }
            $LabelCmpdCount{$Label}++;
          }
        }
      }
      # Identify the common data field labels: the ones carried by every compound,
      # including compounds without any data fields at all...
      if ($CollectDataFields && $OptionsInfo{Mode} =~ /^commondatafields$/i) {
        @CommonDataFieldLabels = grep { $LabelCmpdCount{$_} == $CmpdCount } @DataFieldLabels;
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
    $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;

    close $SDFileHandle;
  }
}

# Process options...
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{InDelim} = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $OptionsInfo{InDelim} = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $OptionsInfo{InDelim} = "\t";
  }

  $OptionsInfo{OutDelim} = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }

  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};

  $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
  $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ?
1 : 0; 925 926 $OptionsInfo{Violations} = $Options{violations}; 927 $OptionsInfo{Seed} = $Options{seed}; 928 929 930 if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { 931 if ($Options{datafields} || $Options{datafieldsfile}) { 932 if ($Options{datafields} && $Options{datafieldsfile}) { 933 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 934 } 935 } 936 else { 937 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; 938 } 939 } 940 $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef; 941 $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef; 942 943 $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0; 944 945 %{$OptionsInfo{RecordNums}} = (); 946 $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0; 947 948 $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef; 949 950 if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) { 951 if ($Options{record}) { 952 my($Record, @RecordSplit); 953 954 $Record = $Options{record}; 955 $Record =~ s/ //g; 956 957 @RecordSplit = split ",", $Record; 958 959 if ($Options{mode} =~ /^recordnum$/i ) { 960 if (@RecordSplit == 1) { 961 $OptionsInfo{RecordNum} = $RecordSplit[0]; 962 if ($OptionsInfo{RecordNum} <= 0) { 963 die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--records\" is not valid. 
Allowed values: > 0 \n"; 964 } 965 } 966 else { 967 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n"; 968 } 969 } 970 elsif ($Options{mode} =~ /^recordnums$/i ) { 971 my($RecordNum, $RecordCount, @SortedRecordSplit); 972 973 @SortedRecordSplit = sort { $a <=> $b } @RecordSplit; 974 975 $RecordCount = 0; 976 RECORDNUM: for $RecordNum (@SortedRecordSplit) { 977 if (exists $OptionsInfo{RecordNums}{$RecordNum}) { 978 next RECORDNUM; 979 } 980 $RecordCount++; 981 $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum; 982 } 983 $OptionsInfo{RecordNumsCount} = $RecordCount; 984 $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0]; 985 $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit]; 986 } 987 else { 988 if (@RecordSplit == 2) { 989 $OptionsInfo{StartRecordNum} = $RecordSplit[0]; 990 $OptionsInfo{EndRecordNum} = $RecordSplit[1]; 991 if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) { 992 die "Error: The value pair specified, $Options{record}, for option \"--records\" is not valid. 
Allowed values: > 0 \n"; 993 } 994 } 995 else { 996 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n"; 997 } 998 if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) { 999 die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n"; 1000 } 1001 } 1002 } 1003 else { 1004 die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n"; 1005 } 1006 } 1007 1008 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1009 1010 my(@Words, $Line, $Value); 1011 if ($Options{mode} =~ /^datafields$/i) { 1012 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1013 if ($Options{datafields}) { 1014 @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields}; 1015 } 1016 elsif ($Options{datafieldsfile}) { 1017 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; 1018 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1019 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1020 if (@Words) { 1021 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words; 1022 } 1023 } 1024 close DATAFIELDSFILE; 1025 } 1026 } 1027 elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) { 1028 my(@DataFieldsByValueTriplets); 1029 @DataFieldsByValueTriplets = (); 1030 if ($Options{datafields}) { 1031 @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields}; 1032 } 1033 elsif ($Options{datafieldsfile}) { 1034 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1035 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1036 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1037 if (@Words) { 1038 push @DataFieldsByValueTriplets, @Words; 1039 } 1040 } 1041 close DATAFIELDSFILE; 1042 } 1043 if ((@DataFieldsByValueTriplets % 3)) { 1044 if ($Options{datafields}) { 1045 die "Error: Triplets not found in values specified by \"-d --datafields\" option\n"; 1046 } 1047 elsif ($Options{datafieldsfile}) { 1048 die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n"; 1049 } 1050 } 1051 my($Index, $Label, $Value, $Criterion); 1052 1053 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1054 %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = (); 1055 %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = (); 1056 1057 for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) { 1058 $Label = $DataFieldsByValueTriplets[$Index]; 1059 $Value = $DataFieldsByValueTriplets[$Index + 1]; 1060 $Criterion = $DataFieldsByValueTriplets[$Index + 2]; 1061 1062 if ($Criterion =~ /^(eq|le|ge)$/i) { 1063 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 1064 $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value; 1065 $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion; 1066 } 1067 else { 1068 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n"; 1069 } 1070 } 1071 } 1072 elsif ($Options{mode} =~ /^datafieldsbyregex$/i) { 1073 my(@DataFieldsByRegexTriplets); 1074 1075 @DataFieldsByRegexTriplets = (); 1076 if ($Options{datafields}) { 1077 @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields}); 1078 } 1079 elsif ($Options{datafieldsfile}) { 1080 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1081 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1082 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1083 if (@Words) { 1084 push @DataFieldsByRegexTriplets, @Words; 1085 } 1086 } 1087 close DATAFIELDSFILE; 1088 } 1089 if ((@DataFieldsByRegexTriplets % 3)) { 1090 if ($Options{datafields}) { 1091 die "Error: Triplet not found in values specified by \"-d --datafields\" option\n"; 1092 } 1093 elsif ($Options{datafieldsfile}) { 1094 die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n"; 1095 } 1096 } 1097 1098 my($Index, $Label, $Value, $Criterion); 1099 1100 @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); 1101 %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = (); 1102 %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = (); 1103 1104 for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) { 1105 $Label = $DataFieldsByRegexTriplets[$Index]; 1106 $Value = $DataFieldsByRegexTriplets[$Index + 1]; 1107 $Criterion = $DataFieldsByRegexTriplets[$Index + 2]; 1108 1109 if ($Criterion =~ /^(eq|ne)$/i) { 1110 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; 1111 $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value; 1112 $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion; 1113 } 1114 else { 1115 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n"; 1116 } 1117 } 1118 } 1119 elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { 1120 my($Index, @DataFieldAndValuesList); 1121 if ($Options{datafields}) { 1122 @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields}; 1123 } 1124 elsif ($Options{datafieldsfile}) { 1125 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! 
\n"; 1126 while ($Line = GetTextLine(\*DATAFIELDSFILE)) { 1127 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); 1128 if (@Words) { 1129 push @DataFieldAndValuesList, @Words; 1130 } 1131 } 1132 close DATAFIELDSFILE; 1133 } 1134 if (@DataFieldAndValuesList < 2) { 1135 if ($Options{datafields}) { 1136 die "Error: Invalid number of values specified by \"-d --datafields\" option\n"; 1137 } 1138 elsif ($Options{datafieldsfile}) { 1139 die "Error: Invalid number values specified by \"--datafieldsfile\" option\n"; 1140 } 1141 } 1142 1143 $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0]; 1144 $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1; 1145 %{$OptionsInfo{SpecifiedDataFieldValues}} = (); 1146 1147 for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) { 1148 $Value = $DataFieldAndValuesList[$Index]; 1149 $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound"; 1150 } 1151 } 1152 1153 $OptionsInfo{SDFileExt} = "sdf"; 1154 $OptionsInfo{TextFileExt} = "csv"; 1155 1156 if ($Options{outdelim} =~ /^tab$/i) { 1157 $OptionsInfo{TextFileExt} = "tsv"; 1158 } 1159 1160 if ($Options{mode} =~ /^(alldatafields|molnames)$/i) { 1161 $OptionsInfo{OutputSDFile} = 0; 1162 $OptionsInfo{OutputTextFile} = 1; 1163 } 1164 else { 1165 $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0; 1166 $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0; 1167 } 1168 1169 $OptionsInfo{StrDataString} = $Options{strdatastring}; 1170 $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 
1 : 0; 1171 1172 $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter}; 1173 1174 if (IsEmpty($Options{strdatastringdelimiter})) { 1175 die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n"; 1176 } 1177 $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode}; 1178 $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0; 1179 1180 MODE: { 1181 if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; } 1182 if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; } 1183 if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; } 1184 if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; } 1185 if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; } 1186 if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; } 1187 if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; } 1188 if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; } 1189 if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; } 1190 if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; } 1191 if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; } 1192 if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; } 1193 if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . 
"$OptionsInfo{EndRecordNum}"; last MODE; } 1194 if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; } 1195 if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; } 1196 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1197 } 1198 1199 } 1200 1201 # Setup script usage and retrieve command line arguments specified using various options... 1202 sub SetupScriptUsage { 1203 1204 # Retrieve all the options... 1205 %Options = (); 1206 $Options{numofcmpds} = 1; 1207 $Options{mode} = "alldatafields"; 1208 $Options{indelim} = "comma"; 1209 $Options{outdelim} = "comma"; 1210 $Options{output} = "SD"; 1211 $Options{quote} = "yes"; 1212 $Options{regexignorecase} = "yes"; 1213 $Options{valuecomparisonmode} = "numeric"; 1214 $Options{violations} = 0; 1215 $Options{seed} = 123456789; 1216 1217 $Options{strdatastring} = "no"; 1218 $Options{strdatastringdelimiter} = "|"; 1219 $Options{strdatastringmode} = "StrOnly"; 1220 1221 if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) { 1222 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; 1223 } 1224 if ($Options{workingdir}) { 1225 if (! 
-d $Options{workingdir}) { 1226 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; 1227 } 1228 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; 1229 } 1230 if ($Options{numofcmpds} < 1) { 1231 die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n"; 1232 } 1233 if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) { 1234 die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n"; 1235 } 1236 if ($Options{violations} < 0) { 1237 die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n"; 1238 } 1239 if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) { 1240 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; 1241 } 1242 if ($Options{output} !~ /^(SD|text|both)$/i) { 1243 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; 1244 } 1245 if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) { 1246 die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; 1247 } 1248 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { 1249 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. 
Allowed values: comma, tab, or semicolon\n"; 1250 } 1251 if ($Options{quote} !~ /^(yes|no)$/i) { 1252 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; 1253 } 1254 if ($Options{regexignorecase} !~ /^(yes|no)$/i) { 1255 die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n"; 1256 } 1257 if ($Options{strdatastring} !~ /^(yes|no)$/i) { 1258 die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n"; 1259 } 1260 if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) { 1261 die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n"; 1262 } 1263 } 1264