Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ExtractFromSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: ExtractFromSDFiles.pl,v $ | |
| 4 # $Date: 2015/03/22 19:11:27 $ | |
| 5 # $Revision: 1.48 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability or fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use SDFileUtil; | |
| 36 use FileUtil; | |
| 37 use TextUtil; | |
| 38 | |
# Main script flow: parse options, validate the input SD files, then run the
# requested extraction on each usable file and report the elapsed time.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the ambiguous indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files; files flagged as not okay during the check are skipped...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ExtractFromSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 84 | |
| 85 ############################################################################### | |
| 86 | |
# Extract data from a SD file...
#
# Arguments:
#   $FileIndex - index of the file in @SDFilesList and in the per-file arrays
#                of %SDFilesInfo.
#
# Picks the extraction routine for $OptionsInfo{Mode} (compared without regard
# to case), opens the input and output files, runs the routine and closes the
# files. Dies with the list of allowed values when the mode is not recognized;
# the check is done before any file is opened so that an invalid mode doesn't
# create or truncate output files.
sub ExtractFromSDFile {
  my($FileIndex) = @_;
  my($ModeHandler, %ModeHandlersMap);

  %ModeHandlersMap = (
    'alldatafields'         => \&ExtractAllDataFields,
    'commondatafields'      => \&ExtractCommonDataFields,
    'datafields'            => \&ExtractDataFields,
    'datafieldbylist'       => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldnotbylist'    => \&ExtractDataFieldNotByList,
    'datafieldsbyvalue'     => \&ExtractDataFieldsByValue,
    'datafieldsbyregex'     => \&ExtractDataFieldsByRegex,
    'randomcmpds'           => \&ExtractRandomCompounds,
    'molnames'              => \&ExtractMolNames,
    'recordnum'             => \&ExtractRecordNum,
    'recordnums'            => \&ExtractRecordNums,
    'recordrange'           => \&ExtractRecordRange,
    '2dcmpdrecords'         => \&Extract2DCmpdRecords,
    '3dcmpdrecords'         => \&Extract3DCmpdRecords,
  );

  $ModeHandler = $ModeHandlersMap{lc $OptionsInfo{Mode}};
  if (!defined $ModeHandler) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $ModeHandler->($FileIndex);
  CloseInputAndOutputFiles();
}
| 155 | |
# Extract all data fields...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Uses the labels stored in $SDFilesInfo{AllDataFieldLabels} for this file as
# output columns and writes every compound record to the text and/or SD output.
sub ExtractAllDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    # Line up the values with the column labels before writing...
    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
| 173 | |
# Extract common data fields...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Uses the labels stored in $SDFilesInfo{CommonDataFieldLabels} for this file as
# output columns and writes every compound record to the text and/or SD output.
sub ExtractCommonDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    # Line up the values with the column labels before writing...
    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
| 191 | |
# Extract specified data fields...
#
# Arguments:
#   $FileIndex - index of the file being processed (not used to pick labels).
#
# Uses the labels in $OptionsInfo{SpecifiedDataFieldLabels} as output columns
# and writes every compound record to the text and/or SD output.
sub ExtractDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    # Line up the values with the column labels before writing...
    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
| 209 | |
# Extract data fields using a list...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Handles both DataFieldByList and DataFieldUniqueByList modes. A record is a
# candidate when its value for $OptionsInfo{SpecifiedDataFieldLabel} is a key of
# $OptionsInfo{SpecifiedDataFieldValues}:
#   . DataFieldByList: every candidate record is written.
#   . DataFieldUniqueByList: only the first record for each listed value is written.
#
# The scan stops as soon as each listed value has been found once.
# NOTE(review): that early exit also applies to DataFieldByList mode, so later
# records repeating an already found value are not written - confirm it's intended.
sub ExtractDataFieldByList {
  my($FileIndex) = @_;
  my($Value, $SpecifiedDataFieldValuesFoundCount, $SpecifiedDataFieldValuesCount, $CurrentValue, $SpecifiedDataFieldLabel, $WriteAllMatches, $WriteFirstMatchOnly, $FirstMatch, @CmpdLines);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Reset the found status; the hash is shared across input files...
  for $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
  }
  $SpecifiedDataFieldValuesFoundCount = 0;

  # Values which don't change while reading the file...
  $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
  $SpecifiedDataFieldValuesCount = $OptionsInfo{SpecifiedDataFieldValuesCount};
  $WriteAllMatches = ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) ? 1 : 0;
  $WriteFirstMatchOnly = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) {
      next CMPDSTRING;
    }

    SetupDataValues();

    $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel};
    if (!exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) {
      next CMPDSTRING;
    }

    if ($SpecifiedDataFieldValuesFoundCount < $SpecifiedDataFieldValuesCount) {
      $FirstMatch = 0;
      if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") {
        $SpecifiedDataFieldValuesFoundCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found";
        $FirstMatch = 1;
      }
      if ($WriteAllMatches || ($WriteFirstMatchOnly && $FirstMatch)) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }
    if ($SpecifiedDataFieldValuesFoundCount >= $SpecifiedDataFieldValuesCount) {
      last CMPDSTRING;
    }
  }
}
| 260 | |
# Extract data field whose values are not on the specified list...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the records which contain $OptionsInfo{SpecifiedDataFieldLabel} with a
# non-empty value that is not a key of $OptionsInfo{SpecifiedDataFieldValues}.
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue);

  $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    # Records without the data field are never written...
    next RECORD unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

    SetupDataValues();

    # Skip empty values and values present on the specified list...
    $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
    next RECORD if (IsEmpty($FieldValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue});

    WriteSDFileCmpdString();
    WriteTextFileCmpdData();
  }
}
| 292 | |
# Extract data fields by value...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# For each record, every label in $OptionsInfo{SpecifiedDataFieldLabels} which is
# present in the record is compared against its specified value using its
# criterion (eq, le or ge); comparison is numerical or alphanumerical depending
# on $OptionsInfo{NumericalComparison}. Labels missing from the record are not
# counted as violations. The record is written when the number of violated
# criteria doesn't exceed $OptionsInfo{Violations}.
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, $Criterion, $ReferenceValue, $FailedCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    SetupDataValues();
    $FailedCount = 0;

    FIELD: for $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next FIELD unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      $Criterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$FieldLabel};
      $ReferenceValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$FieldLabel};

      if ($OptionsInfo{NumericalComparison}) {
        if ($Criterion =~ /^eq$/i) {
          $FailedCount++ if ($FieldValue != $ReferenceValue);
        }
        elsif ($Criterion =~ /^le$/i) {
          $FailedCount++ if ($FieldValue > $ReferenceValue);
        }
        elsif ($Criterion =~ /^ge$/i) {
          $FailedCount++ if ($FieldValue < $ReferenceValue);
        }
      }
      else {
        if ($Criterion =~ /^eq$/i) {
          $FailedCount++ if ($FieldValue ne $ReferenceValue);
        }
        elsif ($Criterion =~ /^le$/i) {
          $FailedCount++ if ($FieldValue gt $ReferenceValue);
        }
        elsif ($Criterion =~ /^ge$/i) {
          $FailedCount++ if ($FieldValue lt $ReferenceValue);
        }
      }
    }

    if ($FailedCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
| 338 | |
# Extract data fields by value using regular expression match...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# For each record, every label in $OptionsInfo{SpecifiedDataFieldLabels} which is
# present in the record is matched against its specified regular expression:
# criterion "eq" is violated when the value doesn't match and "ne" when it does.
# Matching ignores case when $OptionsInfo{RegexIgnoreCase} is set. The record is
# written when the number of violations doesn't exceed $OptionsInfo{Violations}.
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;
  my($FieldLabel, $FieldValue, $Criterion, $Regex, $MustMatch, $MustNotMatch, $Matched, $FailedCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);

    SetupDataValues();
    $FailedCount = 0;

    FIELD: for $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next FIELD unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      $Criterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$FieldLabel};
      $Regex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$FieldLabel};

      $MustMatch = ($Criterion =~ /^eq$/i) ? 1 : 0;
      $MustNotMatch = ($Criterion =~ /^ne$/i) ? 1 : 0;

      # The regex is applied only for a recognized criterion...
      next FIELD unless ($MustMatch || $MustNotMatch);

      if ($OptionsInfo{RegexIgnoreCase}) {
        $Matched = ($FieldValue =~ /$Regex/i) ? 1 : 0;
      }
      else {
        $Matched = ($FieldValue =~ /$Regex/) ? 1 : 0;
      }

      if (($MustMatch && !$Matched) || ($MustNotMatch && $Matched)) {
        $FailedCount++;
      }
    }

    if ($FailedCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
| 382 | |
# Extract random compounds...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Seeds the random number generator with $OptionsInfo{Seed}, picks
# $OptionsInfo{NumOfCmpds} distinct record numbers between 1 and the compound
# count stored for this file, and writes those records in file order.
#
# Exactly min(NumOfCmpds, compound count) records are picked: numbers are drawn
# until enough distinct ones are collected, so a repeated draw doesn't reduce
# the number of extracted records and no extra record is drawn.
sub ExtractRandomCompounds {
  my($FileIndex) = @_;
  my($CmpdNum, $CmpdCount, $TargetCount, $WrittenCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
  srand($OptionsInfo{Seed});

  # Can't pick more distinct records than the file contains...
  $TargetCount = $OptionsInfo{NumOfCmpds};
  if ($TargetCount > $CmpdCount) {
    $TargetCount = $CmpdCount;
  }

  %RandomCmpdIndexMap = ();
  while (keys(%RandomCmpdIndexMap) < $TargetCount) {
    $RandomIndex = int (rand $CmpdCount) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }

  $CmpdNum = 0;
  $WrittenCount = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
      next CMPDSTRING;
    }

    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # No need to read the rest of the file after the last picked record...
    $WrittenCount++;
    if ($WrittenCount >= $TargetCount) {
      last CMPDSTRING;
    }
  }
}
| 420 | |
# Extract mol names...
#
# Arguments:
#   $FileIndex - index of the file being processed (not used here).
#
# Writes a "MolName" column header followed by one line per compound record
# containing the name parsed from the first line of the record, quoted
# according to $OptionsInfo{OutQuote}. Output goes to the text file only.
sub ExtractMolNames {
  my($FileIndex) = @_;
  my($MolName, $NewTextFileRef, @CmpdLines);

  # Assign instead of push: the label list is shared across input files and a
  # push would add one more "MolName" column header for each additional file...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}
| 436 | |
# Extract a specific compound record...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the record whose position in the file (counting from 1) equals
# $OptionsInfo{RecordNum} and stops reading right after it.
sub ExtractRecordNum {
  my($FileIndex) = @_;
  my($RecordCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $RecordCount = 0;
  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordCount++;
    next RECORD unless ($RecordCount == $OptionsInfo{RecordNum});

    WriteSDFileCmpdString();

    # Data values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
    last RECORD;
  }
}
| 464 | |
# Extract specific compound records...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the records whose position in the file (counting from 1) is a key of
# $OptionsInfo{RecordNums}. Reading stops at the first unlisted record found
# after going past $OptionsInfo{RecordNumsMax} or after writing
# $OptionsInfo{RecordNumsCount} records.
sub ExtractRecordNums {
  my($FileIndex) = @_;
  my($RecordNum, $WrittenCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  ($RecordNum, $WrittenCount) = (0, 0);

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    if (!exists $OptionsInfo{RecordNums}{$RecordNum}) {
      # Unlisted record: quit when there is nothing left to extract...
      if ($RecordNum > $OptionsInfo{RecordNumsMax} || $WrittenCount >= $OptionsInfo{RecordNumsCount}) {
        last RECORD;
      }
      next RECORD;
    }

    $WrittenCount++;
    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
| 496 | |
| 497 | |
# Extract compounds in a specific record range...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the records whose position in the file (counting from 1) lies between
# $OptionsInfo{StartRecordNum} and $OptionsInfo{EndRecordNum}, both inclusive,
# and stops reading at the first record past the end of the range.
sub ExtractRecordRange {
  my($FileIndex) = @_;
  my($RecordNum);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $RecordNum = 0;
  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    last RECORD if ($RecordNum > $OptionsInfo{EndRecordNum});
    next RECORD if ($RecordNum < $OptionsInfo{StartRecordNum});

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
| 526 | |
# Extract 2D compound records...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the records for which IsCmpd2D returns true.
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    next RECORD unless IsCmpd2D(\@RecordLines);

    WriteSDFileCmpdString();

    # Data values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
| 551 | |
# Extract 3D compound records...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Writes the records for which IsCmpd3D returns true.
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  RECORD: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@RecordLines) = split /\n/, $SDFilesInfo{CmpdString};
    next RECORD unless IsCmpd3D(\@RecordLines);

    WriteSDFileCmpdString();

    # Data values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@RecordLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
| 576 | |
| 577 | |
# Open input and output files...
#
# Arguments:
#   $FileIndex - index of the file being processed.
#
# Opens the new SD and/or text file selected by $OptionsInfo{OutputSDFile} and
# $OptionsInfo{OutputTextFile} for writing, opens the input SD file for reading,
# and stores the file handles in $SDFilesInfo{NewSDFileRef},
# $SDFilesInfo{NewTextFileRef} and $SDFilesInfo{InputSDFileRef}. Dies when a
# file can't be opened.
#
# Three-argument open is used with lexical file handles, so characters in a
# file name are never taken as an open mode and handles aren't package globals.
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;
  my($InputSDFileName, $NewSDFileName, $NewTextFileName);

  $InputSDFileName = $SDFilesList[$FileIndex];
  $NewSDFileName = $SDFilesInfo{NewSDFileName}[$FileIndex];
  $NewTextFileName = $SDFilesInfo{NewTextFileName}[$FileIndex];

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open my $NewSDFileHandle, ">", $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileHandle;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open my $NewTextFileHandle, ">", $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileHandle;
  }

  open my $InputSDFileHandle, "<", $InputSDFileName or die "Error: Couldn't open $InputSDFileName: $! \n";
  $SDFilesInfo{InputSDFileRef} = $InputSDFileHandle;
}
| 608 | |
# Close open input and output files...
#
# Closes whichever of the new SD file, new text file and input SD file handles
# are set in %SDFilesInfo and resets all three entries to undef. A failure to
# close an output file is reported as a warning, since errors on buffered
# writes may only show up at close time.
sub CloseInputAndOutputFiles {
  my($OutputFileRefKey);

  for $OutputFileRefKey ("NewSDFileRef", "NewTextFileRef") {
    if ($SDFilesInfo{$OutputFileRefKey}) {
      close($SDFilesInfo{$OutputFileRefKey}) or warn "Warning: Couldn't close output file: $! \n";
    }
  }

  if ($SDFilesInfo{InputSDFileRef}) {
    close $SDFilesInfo{InputSDFileRef};
  }

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;
  $SDFilesInfo{InputSDFileRef} = undef;
}
| 626 | |
# Write out column labels for text file...
#
# Does nothing unless $OptionsInfo{OutputTextFile} is set. Otherwise joins the
# labels in $SDFilesInfo{DataLabels}, followed by a "StructureDataString" label
# when $OptionsInfo{OutputStrDataString} is set, using the output delimiter and
# quote settings, and prints the line to the new text file.
sub WriteTextFileColLabels {
  my($ColLabelsRef, $HeaderLine, $TextFileRef);

  return if (!$OptionsInfo{OutputTextFile});

  $ColLabelsRef = \@{$SDFilesInfo{DataLabels}};
  if ($OptionsInfo{OutputStrDataString}) {
    # Work on a copy so that the structure data label isn't added to the shared labels...
    $ColLabelsRef = [@{$ColLabelsRef}, "StructureDataString"];
  }

  $HeaderLine = JoinWords($ColLabelsRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  $TextFileRef = $SDFilesInfo{NewTextFileRef};
  print $TextFileRef "$HeaderLine\n";
}
| 651 | |
# Setup values for data fields...
#
# Fills $SDFilesInfo{DataValues} with one value for each label in
# $SDFilesInfo{DataLabels}, in the same order: the value found in
# $SDFilesInfo{DataFieldValues} for the label or an empty string when the
# current record doesn't have that label.
sub SetupDataValues {
  my($FieldLabel, @FieldValues);

  @FieldValues = ();
  for $FieldLabel (@{$SDFilesInfo{DataLabels}}) {
    if (exists $SDFilesInfo{DataFieldValues}{$FieldLabel}) {
      push @FieldValues, $SDFilesInfo{DataFieldValues}{$FieldLabel};
    }
    else {
      push @FieldValues, "";
    }
  }
  @{$SDFilesInfo{DataValues}} = @FieldValues;
}
| 656 | |
# Write out structure data and specific data fields to SD file...
#
# Does nothing unless $OptionsInfo{OutputSDFile} is set. Otherwise prints to the
# new SD file the structure part of $SDFilesInfo{CmpdString} (everything up to
# and including the "M  END" line), one "> <label>" data item for each entry in
# $SDFilesInfo{DataLabels} with the matching entry of $SDFilesInfo{DataValues},
# and the "$$$$" record terminator.
sub WriteSDFileCmpdData {
  my($MolString, $Count, $NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # The end of the structure block is marked by "M  END" containing two spaces;
  # with a single space the record is never split and its original data fields
  # would be written out along with a bogus terminator line...
  ($MolString) = split /M  END/, $SDFilesInfo{CmpdString}, 2;
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}
| 676 | |
# Write out compound string...
#
# Prints $SDFilesInfo{CmpdString} followed by a new line to the new SD file;
# does nothing unless $OptionsInfo{OutputSDFile} is set.
sub WriteSDFileCmpdString {
  return unless $OptionsInfo{OutputSDFile};

  my($SDFileRef) = $SDFilesInfo{NewSDFileRef};
  print {$SDFileRef} "$SDFilesInfo{CmpdString}\n";
}
| 688 | |
# Write out data for text file...
#
# Does nothing unless $OptionsInfo{OutputTextFile} is set. Otherwise joins
# $SDFilesInfo{DataValues} using the output delimiter and quote settings and
# prints the line to the new text file. When $OptionsInfo{OutputStrDataString}
# is set, a last column is added containing either the complete compound string
# ($OptionsInfo{StrDataStringWithFields} set) or just its structure part up to
# and including "M  END", with new lines replaced by
# $OptionsInfo{StrDataStringDelimiter}.
sub WriteTextFileCmpdData {
  my($DataValuesLine, $NewTextFileRef);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  if ($DataValuesLine =~ /\n/) {
    $DataValuesLine =~ s/\n/;/g;
  }

  if ($OptionsInfo{OutputStrDataString}) {
    # Append structure data string...
    my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);

    if ($OptionsInfo{StrDataStringWithFields}) {
      $StrDataString = $SDFilesInfo{CmpdString};
    }
    else {
      # The end of the structure block is marked by "M  END" containing two
      # spaces; with a single space the record is never split and the data
      # fields would end up in the structure data string...
      ($StrDataString) = split /M  END/, $SDFilesInfo{CmpdString}, 2;
      $StrDataString .= "M  END";
    }
    $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
    $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

    $OutDelim = $OptionsInfo{OutDelim};
    $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";

    print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
  }
  else {
    print $NewTextFileRef "$DataValuesLine\n";
  }
}
| 728 | |
| 729 # Retrieve information about input SD files... | |
| 730 sub RetrieveSDFilesInfo { | |
| 731 my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount); | |
| 732 | |
| 733 %SDFilesInfo = (); | |
| 734 | |
| 735 @{$SDFilesInfo{FileOkay}} = (); | |
| 736 @{$SDFilesInfo{CmpdCount}} = (); | |
| 737 @{$SDFilesInfo{NewTextFileName}} = (); | |
| 738 @{$SDFilesInfo{NewSDFileName}} = (); | |
| 739 | |
| 740 @{$SDFilesInfo{AllDataFieldLabels}} = (); | |
| 741 @{$SDFilesInfo{CommonDataFieldLabels}} = (); | |
| 742 | |
| 743 FILELIST: for $Index (0 .. $#SDFilesList) { | |
| 744 $SDFile = $SDFilesList[$Index]; | |
| 745 | |
| 746 $SDFilesInfo{FileOkay}[$Index] = 0; | |
| 747 | |
| 748 $SDFilesInfo{CmpdCount}[$Index] = 0; | |
| 749 $SDFilesInfo{NewTextFileName}[$Index] = ""; | |
| 750 $SDFilesInfo{NewSDFileName}[$Index] = ""; | |
| 751 | |
| 752 @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = (); | |
| 753 @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = (); | |
| 754 | |
| 755 if (!(-e $SDFile)) { | |
| 756 warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; | |
| 757 next FILELIST; | |
| 758 } | |
| 759 | |
| 760 if (!CheckFileType($SDFile, "sd sdf")) { | |
| 761 warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; | |
| 762 next FILELIST; | |
| 763 } | |
| 764 | |
| 765 # Generate appropriate name for the new output file. | |
| 766 $FileDir = ""; $FileName = ""; $FileExt = ""; | |
| 767 ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); | |
| 768 $NewFileName = $FileName; | |
| 769 $NewFileName = $FileName . $OptionsInfo{FileNameMode}; | |
| 770 if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) { | |
| 771 my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); | |
| 772 if ($RootFileName && $RootFileExt) { | |
| 773 $NewFileName = $RootFileName; | |
| 774 } | |
| 775 else { | |
| 776 $NewFileName = $OptionsInfo{OutFileRoot}; | |
| 777 } | |
| 778 } | |
| 779 $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}"; | |
| 780 $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}"; | |
| 781 | |
| 782 if ($OptionsInfo{OutputSDFile}) { | |
| 783 if (lc($NewSDFileName) eq lc($SDFile)) { | |
| 784 warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n"; | |
| 785 print "Specify a different name using \"-r --root\" option or use default name.\n"; | |
| 786 next FILELIST; | |
| 787 } | |
| 788 } | |
| 789 | |
| 790 if (!$OptionsInfo{Overwrite}) { | |
| 791 if ($OptionsInfo{OutputSDFile}) { | |
| 792 if (-e $NewSDFileName) { | |
| 793 warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n"; | |
| 794 next FILELIST; | |
| 795 } | |
| 796 } | |
| 797 if ($OptionsInfo{OutputTextFile}) { | |
| 798 if (-e $NewTextFileName) { | |
| 799 warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n"; | |
| 800 next FILELIST; | |
| 801 } | |
| 802 } | |
| 803 } | |
| 804 | |
| 805 if (!open SDFILE, "$SDFile") { | |
| 806 warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; | |
| 807 next FILELIST; | |
| 808 } | |
| 809 | |
| 810 my($CountCmpds, $CollectDataFields); | |
| 811 my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels); | |
| 812 | |
| 813 $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0; | |
| 814 | |
| 815 $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^(datafieldsbyvalue|datafieldsbyregex)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldnotbylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnum$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnums$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 1 : 0; | |
| 816 | |
| 817 $CmpdCount = 0; | |
| 818 if ($CountCmpds || $CollectDataFields) { | |
| 819 @DataFieldLabels = (); | |
| 820 @CommonDataFieldLabels = (); | |
| 821 %DataFieldLabelsMap = (); | |
| 822 CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) { | |
| 823 $CmpdCount++; | |
| 824 if ($OptionsInfo{Mode} =~ /^recordnum$/i) { | |
| 825 if ($CmpdCount == $OptionsInfo{RecordNum}) { | |
| 826 @CmpdLines = split "\n", $CmpdString; | |
| 827 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); | |
| 828 last CMPDSTRING; | |
| 829 } | |
| 830 } | |
| 831 if ($CollectDataFields) { | |
| 832 my($Label); | |
| 833 @CmpdLines = split "\n", $CmpdString; | |
| 834 # Process compound data header labels and figure out which ones are present for | |
| 835 # all the compounds... | |
| 836 if (@DataFieldLabels) { | |
| 837 my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); | |
| 838 my(%CmpdDataFieldLabelsMap) = (); | |
| 839 # Setup a map for the current labels... | |
| 840 for $Label (@CmpdDataFieldLabels) { | |
| 841 $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; | |
| 842 } | |
| 843 # Check the presence old labels for this compound; otherwise, mark 'em new... | |
| 844 for $Label (@DataFieldLabels) { | |
| 845 if (!$CmpdDataFieldLabelsMap{$Label}) { | |
| 846 $DataFieldLabelsMap{$Label} = "PresentInSome"; | |
| 847 } | |
| 848 } | |
| 849 # Check the presence this compound in the old labels; otherwise, add 'em... | |
| 850 for $Label (@CmpdDataFieldLabels ) { | |
| 851 if (!$DataFieldLabelsMap{$Label}) { | |
| 852 # It's a new label... | |
| 853 push @DataFieldLabels, $Label; | |
| 854 $DataFieldLabelsMap{$Label} = "PresentInSome"; | |
| 855 } | |
| 856 } | |
| 857 } | |
| 858 else { | |
| 859 # Get the initial label set and set up a map... | |
| 860 @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); | |
| 861 for $Label (@DataFieldLabels) { | |
| 862 $DataFieldLabelsMap{$Label} = "PresentInAll"; | |
| 863 } | |
| 864 } | |
| 865 # Identify the common data field labels... | |
| 866 if ($Options{mode} =~ /^commondatafields$/i) { | |
| 867 @CommonDataFieldLabels = (); | |
| 868 for $Label (@DataFieldLabels) { | |
| 869 if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { | |
| 870 push @CommonDataFieldLabels, $Label; | |
| 871 } | |
| 872 } | |
| 873 } | |
| 874 } | |
| 875 } | |
| 876 } | |
| 877 | |
| 878 $SDFilesInfo{FileOkay}[$Index] = 1; | |
| 879 | |
| 880 $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName; | |
| 881 $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName; | |
| 882 | |
| 883 $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount; | |
| 884 | |
| 885 push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels; | |
| 886 push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels; | |
| 887 | |
| 888 close SDFILE; | |
| 889 } | |
| 890 } | |
| 891 | |
# Process options...
#
# Copy the command line option values from %Options into %OptionsInfo, map yes/no
# and delimiter names to the values used during processing, and parse the mode
# specific values specified using "--record", "-d --datafields" and "--datafieldsfile"
# options. Missing or invalid mode specific values terminate the script with an
# error message.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Delimiter used to split "-d --datafields" and "--datafieldsfile" values...
  $OptionsInfo{InDelim} = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $OptionsInfo{InDelim} = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $OptionsInfo{InDelim} = "\t";
  }

  $OptionsInfo{OutDelim} = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }

  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};

  $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
  $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;

  $OptionsInfo{Violations} = $Options{violations};
  $OptionsInfo{Seed} = $Options{seed};

  # Data field based modes need exactly one of "-d --datafields" or "--datafieldsfile"...
  if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    if ($Options{datafields} || $Options{datafieldsfile}) {
      if ($Options{datafields} && $Options{datafieldsfile}) {
        die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
      }
    }
    else {
      die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
    }
  }
  $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
  $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;

  $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;

  %{$OptionsInfo{RecordNums}} = ();
  $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;

  $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;

  if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
    if ($Options{record}) {
      my($Record, @RecordSplit);

      # Spaces around record numbers are not significant...
      $Record = $Options{record};
      $Record =~ s/ //g;

      @RecordSplit = split ",", $Record;

      if ($Options{mode} =~ /^recordnum$/i ) {
        if (@RecordSplit == 1) {
          $OptionsInfo{RecordNum} = $RecordSplit[0];
          if ($OptionsInfo{RecordNum} <= 0) {
            die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
        }
      }
      elsif ($Options{mode} =~ /^recordnums$/i ) {
        my($RecordNum, $RecordCount, @SortedRecordSplit);

        # Validate record numbers the same way as it's done for recordnum and
        # recordrange modes before sorting them numerically...
        if (!@RecordSplit) {
          die "Error: No valid values specified using \"--record\" option: at least 1 value is required.\n";
        }
        for $RecordNum (@RecordSplit) {
          if ($RecordNum <= 0) {
            die "Error: The value specified, $RecordNum, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }

        @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;

        # Track unique record numbers along with their count and range...
        $RecordCount = 0;
        RECORDNUM: for $RecordNum (@SortedRecordSplit) {
          if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
            next RECORDNUM;
          }
          $RecordCount++;
          $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
        }
        $OptionsInfo{RecordNumsCount} = $RecordCount;
        $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
        $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
      }
      else {
        if (@RecordSplit == 2) {
          $OptionsInfo{StartRecordNum} = $RecordSplit[0];
          $OptionsInfo{EndRecordNum} = $RecordSplit[1];
          if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
            die "Error: The value pair specified, $Options{record}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
        }
        if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
          die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
        }
      }
    }
    else {
      die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
    }
  }

  @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();

  if ($Options{mode} =~ /^datafields$/i) {
    # A plain list of data field labels...
    if ($Options{datafields}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
    # Triplets of label, value and comparison criterion...
    my(@DataFieldsByValueTriplets);

    @DataFieldsByValueTriplets = ();
    if ($Options{datafields}) {
      @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByValueTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByValueTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
      }
    }
    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByValueTriplets[$Index];
      $Value = $DataFieldsByValueTriplets[$Index + 1];
      $Criterion = $DataFieldsByValueTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|le|ge)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
    # Triplets of label, regular expression and match criterion...
    my(@DataFieldsByRegexTriplets);

    @DataFieldsByRegexTriplets = ();
    if ($Options{datafields}) {
      # Use quotewords instead of split to allow quoted regular expressions...
      @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByRegexTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByRegexTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
      }
    }

    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByRegexTriplets[$Index];
      $Value = $DataFieldsByRegexTriplets[$Index + 1];
      $Criterion = $DataFieldsByRegexTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|ne)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    # A data field label followed by a list of values...
    my($Index, $Value, @DataFieldAndValuesList);

    @DataFieldAndValuesList = ();
    if ($Options{datafields}) {
      @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldAndValuesList = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if (@DataFieldAndValuesList < 2) {
      if ($Options{datafields}) {
        die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
      }
    }

    $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
    $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
    %{$OptionsInfo{SpecifiedDataFieldValues}} = ();

    # Mark all the specified values as not found...
    for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
      $Value = $DataFieldAndValuesList[$Index];
      $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
    }
  }

  $OptionsInfo{SDFileExt} = "sdf";
  $OptionsInfo{TextFileExt} = "csv";

  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{TextFileExt} = "tsv";
  }

  # Only a text file is generated for alldatafields and molnames modes...
  if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
    $OptionsInfo{OutputSDFile} = 0;
    $OptionsInfo{OutputTextFile} = 1;
  }
  else {
    $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
    $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
  }

  $OptionsInfo{StrDataString} = $Options{strdatastring};
  $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};

  if (IsEmpty($Options{strdatastringdelimiter})) {
    die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
  }
  $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
  $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;

  # Setup mode specific name stored as FileNameMode. The existing values, including
  # their spelling, are kept as is: changing them could change file names generated
  # by the code using FileNameMode...
  MODE: {
    if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
    if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
    if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
    if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
    if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
    if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
    if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
    if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
    if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
    if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
    if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
    if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
    if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

}

# Read the file specified using "--datafieldsfile" option and return a list of all
# the words found on its lines after splitting each line using the input delimiter...
#
# The file is opened explicitly for reading using three-argument open: characters
# such as > or | in the file name are never interpreted as an open mode.
#
sub _ReadDataFieldsFileWords {
  my($DataFieldsFile, $InDelim) = @_;
  my($Line, @Words, @FileWords);

  @FileWords = ();

  open DATAFIELDSFILE, "<", $DataFieldsFile or die "Error: Couldn't open $DataFieldsFile: $! \n";
  while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
    @Words = quotewords($InDelim, 0, $Line);
    if (@Words) {
      push @FileWords, @Words;
    }
  }
  close DATAFIELDSFILE;

  return @FileWords;
}
| 1200 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initialize %Options with default values, retrieve command line option values using
# GetOptions, change to the working directory specified using "-w --workingdir" option,
# and validate option values which don't depend on any other option. Invalid values
# terminate the script with an error message.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  # The list of allowed values in the error message matches the values accepted by
  # the regular expression, including datafieldsbyregex...
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
| 1264 | |
| 1265 __END__ | |
| 1266 | |
| 1267 =head1 NAME | |
| 1268 | |
| 1269 ExtractFromSDFiles.pl - Extract specific data from SDFile(s) | |
| 1270 | |
| 1271 =head1 SYNOPSIS | |
| 1272 | |
| 1273 ExtractFromSDFiles.pl SDFile(s)... | |
| 1274 | |
| 1275 ExtractFromSDFiles.pl [B<-h, --help>] | |
| 1276 [B<-d, --datafields> "fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value..."] | |
| 1277 [B<--datafieldsfile> filename] [B<--indelim> comma | tab | semicolon] [B<-m, --mode> alldatafields | | |
| 1278 commondatafields | datafieldnotbylist | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | | |
| 1279 datafielduniquebylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | | |
| 1280 3dcmpdrecords ] [B<-n, --numofcmpds> number] [B<--outdelim> comma | tab | semicolon] | |
| 1281 [B<--output> SD | text | both] [B<-o, --overwrite>] [B<-q, --quote> yes | no] | |
| 1282 [B<--record> recnum | recnums | startrecnum,endrecnum] [B<--RegexIgnoreCase> I<yes or no>] | |
| 1283 [B<-r, --root> rootname] [B<-s, --seed> number] [B<--StrDataString> yes | no] | |
| 1284 [B<--StrDataStringDelimiter> text] [B<--StrDataStringMode> StrOnly | StrAndDataFields] | |
| 1285 [B<--ValueComparisonMode> I<Numeric | Alphanumeric>] | |
| 1286 [B<-v, --violations> number] [B<-w, --workingdir> dirname] SDFile(s)... | |
| 1287 | |
| 1288 =head1 DESCRIPTION | |
| 1289 | |
| 1290 Extract specific data from I<SDFile(s)> and generate appropriate SD or CSV/TSV text | |
| 1291 file(s). The structure data from SDFile(s) is not transferred to CSV/TSV text file(s). | |
| 1292 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf> | |
| 1293 and I<.sd>. All other file names are ignored. All the SD files in a current directory | |
| 1294 can be specified either by I<*.sdf> or the current directory name. | |
| 1295 | |
| 1296 =head1 OPTIONS | |
| 1297 | |
| 1298 =over 4 | |
| 1299 | |
| 1300 =item B<-h, --help> | |
| 1301 | |
| 1302 Print this help message. | |
| 1303 | |
| 1304 =item B<-d, --datafields> I<"fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value,..."> | |
| 1305 | |
| 1306 This value is mode specific. In general, it's a list of comma separated data field labels | |
| 1307 and associated mode specific values. | |
| 1308 | |
| 1309 For I<datafields> mode, input value format is: I<fieldlabel,...>. Examples: | |
| 1310 | |
| 1311 Extreg | |
| 1312 Extreg,CompoundName,ID | |
| 1313 | |
| 1314 For I<datafieldsbyvalue> mode, input value format contains these triplets: | |
| 1315 I<fieldlabel,value, criteria...>. Possible values for criteria: I<le, ge or eq>. | |
| 1316 The value of B<--ValueComparisonMode> indicates whether values are | |
| 1317 compared using numerical or string comparison operators. Default is to consider | |
| 1318 data field values as numerical values and use numerical comparison operators. | |
| 1319 Examples: | |
| 1320 | |
| 1321 MolWt,450,le | |
| 1322 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le | |
| 1323 | |
| 1324 For I<datafieldsbyregex> mode, input value format contains these triplets: | |
| 1325 I<fieldlabel,regex, criteria...>. I<regex> corresponds to any valid regular expression | |
| 1326 and is used to match the values for specified I<fieldlabel>. Possible values for criteria: | |
| 1327 I<eq or ne>. For I<eq> and I<ne> criteria, data field label value is matched with | |
| 1328 regular expression using =~ and !~ respectively. B<--RegexIgnoreCase> option | |
| 1329 value is used to determine whether to ignore letter upper/lower case during | |
| 1330 regular expression match. Examples: | |
| 1331 | |
| 1332 Name,ol,eq | |
| 1333 Name,'^pat',ne | |
| 1334 | |
| 1335 For I<datafieldbylist> and I<datafielduniquebylist> mode, input value format is: | |
| 1336 I<fieldlabel,value1,value2...>. This is equivalent to I<datafieldsbyvalue> mode with | |
| 1337 this input value format: I<fieldlabel,value1,eq,fieldlabel,value2,eq,...>. For | |
| 1338 I<datafielduniquebylist> mode, only unique compounds identified by first occurrence | |
| 1339 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds | |
| 1340 are simply ignored. | |
| 1341 | |
| 1342 For I<datafieldnotbylist> mode, input value format is: I<fieldlabel,value1,value2...>. In this | |
| 1343 mode, the script behaves exactly opposite of I<datafieldbylist> mode, and only those compounds | |
| 1344 are extracted whose data field values don't match any specified data field value. | |
| 1345 | |
| 1346 =item B<--datafieldsfile> I<filename> | |
| 1347 | |
| 1348 Filename which contains various mode specific values. This option provides a way | |
| 1349 to specify mode specific values in a file instead of entering them on the command | |
| 1350 line using B<-d --datafields>. | |
| 1351 | |
| 1352 For I<datafields> mode, input file lines contain comma delimited field labels: | |
| 1353 I<fieldlabel,...>. Example: | |
| 1354 | |
| 1355 Line 1:MolId | |
| 1356 Line 2:"Extreg",CompoundName,ID | |
| 1357 | |
| 1358 For I<datafieldsbyvalue> mode, input file lines contains these comma separated triplets: | |
| 1359 I<fieldlabel,value, criteria>. Possible values for criteria: I<le, ge or eq>. Examples: | |
| 1360 | |
| 1361 Line 1:MolWt,450,le | |
| 1362 | |
| 1363 Line 1:"MolWt",450,le,"LogP",5,le,"SumNumNO",10,le,"SumNHOH",5,le | |
| 1364 | |
| 1365 Line 1:MolWt,450,le | |
| 1366 Line 2:"LogP",5,le | |
| 1367 Line 3:"SumNumNO",10,le | |
| 1368 Line 4: SumNHOH,5,le | |
| 1369 | |
| 1370 For I<datafieldbylist> and I<datafielduniquebylist> mode, input file line format is: | |
| 1371 | |
| 1372 Line 1:fieldlabel; | |
| 1373 Subsequent lines:value1,value2... | |
| 1374 | |
| 1375 For I<datafieldbylist>, I<datafielduniquebylist>, and I<datafieldnotbylist> mode, input file | |
| 1376 line format is: | |
| 1377 | |
| 1378 Line 1:fieldlabel; | |
| 1379 Subsequent lines:value1,value2... | |
| 1380 | |
| 1381 For I<datafielduniquebylist> mode, only unique compounds identified by first occurrence | |
| 1382 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds | |
| 1383 are simply ignored. Example: | |
| 1384 | |
| 1385 Line 1: MolID | |
| 1386 Subsequent Lines: | |
| 1387 907508 | |
| 1388 832291,4642 | |
| 1389 "1254","907303" | |
| 1390 | |
| 1391 =item B<--indelim> I<comma | tab | semicolon> | |
| 1392 | |
| 1393 Delimiter used to specify text values for B<-d --datafields> and B<--datafieldsfile> options. | |
| 1394 Possible values: I<comma, tab, or semicolon>. Default value: I<comma>. | |
| 1395 | |
| 1396 =item B<-m, --mode> I<alldatafields | commondatafields | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | datafielduniquebylist | datafieldnotbylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | 3dcmpdrecords> | |
| 1397 | |
| 1398 Specify what to extract from I<SDFile(s)>. Possible values: I<alldatafields, commondatafields, | |
| 1399 datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, | |
| 1400 molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords>. | |
| 1401 Default value: I<alldatafields>. | |
| 1402 | |
| 1403 For I<alldatafields> and I<molnames> mode, only a CSV/TSV text file is generated; for all | |
| 1404 other modes, however, a SD file is generated by default - you can change the behavior to generate | |
| 1405 text file using I<--output> option. | |
| 1406 | |
| 1407 For I<3DCmpdRecords> mode, only those compounds with at least one non-zero value for Z atomic coordinates | |
| 1408 are retrieved; however, during retrieval of compounds in I<2DCmpdRecords> mode, all Z atomic coordinates must | |
| 1409 be zero. | |
| 1410 | |
| 1411 =item B<-n, --numofcmpds> I<number> | |
| 1412 | |
| 1413 Number of compounds to extract during I<randomcmpds> mode. | |
| 1414 | |
| 1415 =item B<--outdelim> I<comma | tab | semicolon> | |
| 1416 | |
| 1417 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon> | |
| 1418 Default value: I<comma> | |
| 1419 | |
| 1420 =item B<--output> I<SD | text | both> | |
| 1421 | |
| 1422 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<SD>. For | |
| 1423 I<alldatafields> and I<molnames> mode, this option is ignored and only a CSV/TSV text file is generated. | |
| 1424 | |
| 1425 =item B<-o, --overwrite> | |
| 1426 | |
| 1427 Overwrite existing files. | |
| 1428 | |
| 1429 =item B<-q, --quote> I<yes | no> | |
| 1430 | |
| 1431 Put quote around column values in output CSV/TSV text file(s). Possible values: | |
| 1432 I<yes or no>. Default value: I<yes>. | |
| 1433 | |
| 1434 =item B<--record> I<recnum | recnums | startrecnum,endrecnum> | |
| 1435 | |
| 1436 Record number, record numbers or range of records to extract during I<recordnum>, I<recordnums> | |
| 1437 and I<recordrange> mode. Input value format is: <num>, <num1,num2,...> and <startnum, endnum> | |
| 1438 for I<recordnum>, I<recordnums> and I<recordrange> modes respectively. Default value: none. | |
| 1439 | |
| 1440 =item B<--RegexIgnoreCase> I<yes or no> | |
| 1441 | |
| 1442 Specify whether to ignore case during I<datafieldsbyregex> value of B<-m, --mode> option. | |
| 1443 Possible values: I<yes or no>. Default value: I<yes>. | |
| 1444 | |
| 1445 =item B<-r, --root> I<rootname> | |
| 1446 | |
| 1447 New file name is generated using the root: <Root>.<Ext>. Default for new file | |
| 1448 names: <SDFileName><mode>.<Ext>. The file type determines <Ext> value. | |
| 1449 The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab | |
| 1450 delimited text files respectively. This option is ignored for multiple input files. | |
| 1451 | |
| 1452 =item B<-s, --seed> I<number> | |
| 1453 | |
| 1454 Random number seed used for I<randomcmpds> mode. Default: 123456789. | |
| 1455 | |
| 1456 =item B<--StrDataString> I<yes | no> | |
| 1457 | |
| 1458 Specify whether to write out structure data string to CSV/TSV text file(s). Possible values: | |
| 1459 I<yes or no>. Default value: I<no>. | |
| 1460 | |
| 1461 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure | |
| 1462 data lines into a structure data string. | |
| 1463 | |
| 1464 This option is ignored during generation of SD file(s). | |
| 1465 | |
| 1466 =item B<--StrDataStringDelimiter> I<text> | |
| 1467 | |
| 1468 Delimiter for joining multiple structure data lines into a string before writing to CSV/TSV text | |
| 1469 file(s). Possible values: I<any alphanumeric text>. Default value: I<|>. | |
| 1470 | |
| 1471 This option is ignored during generation of SD file(s). | |
| 1472 | |
| 1473 =item B<--StrDataStringMode> I<StrOnly | StrAndDataFields> | |
| 1474 | |
| 1475 Specify whether to include SD data fields and values along with the structure data into structure | |
| 1476 data string before writing it out to CSV/TSV text file(s). Possible values: I<StrOnly or StrAndDataFields>. | |
| 1477 Default value: I<StrOnly>. | |
| 1478 | |
| 1479 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure | |
| 1480 data lines into a structure data string. | |
| 1481 | |
| 1482 This option is ignored during generation of SD file(s). | |
| 1483 | |
| 1484 =item B<--ValueComparisonMode> I<Numeric | Alphanumeric> | |
| 1485 | |
| 1486 Specify how to compare data field values during I<datafieldsbyvalue> mode: Compare | |
| 1487 values using either numeric or string (eq, le, ge) comparison operators. Possible values: | |
| 1488 I<Numeric or Alphanumeric>. Default value: I<Numeric>. | |
| 1489 | |
| 1490 =item B<-v, --violations> I<number> | |
| 1491 | |
| 1492 Number of criterion violations allowed for values specified during I<datafieldsbyvalue> | |
| 1493 and I<datafieldsbyregex> mode. Default value: I<0>. | |
| 1494 | |
| 1495 =item B<-w, --workingdir> I<dirname> | |
| 1496 | |
| 1497 Location of working directory. Default: current directory. | |
| 1498 | |
| 1499 =back | |
| 1500 | |
| 1501 =head1 EXAMPLES | |
| 1502 | |
| 1503 To retrieve all data fields from SD files and generate CSV text files, type: | |
| 1504 | |
| 1505 % ExtractFromSDFiles.pl -o Sample.sdf | |
| 1506 % ExtractFromSDFiles.pl -o *.sdf | |
| 1507 | |
| 1508 To retrieve all data fields from SD file and generate CSV text files containing | |
| 1509 a column with structure data as a string with | as line delimiter, type: | |
| 1510 | |
| 1511 % ExtractFromSDFiles.pl --StrDataString Yes -o Sample.sdf | |
| 1512 | |
| 1513 To retrieve MOL_ID data field from SD file and generate CSV text files containing | |
| 1514 a column with structure data along with all data fields as a string with | as line | |
| 1515 delimiter, type: | |
| 1516 | |
| 1517 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID" --StrDataString Yes | |
| 1518 --StrDataStringMode StrAndDataFields --StrDataStringDelimiter "|" | |
| 1519 --output text -o Sample.sdf | |
| 1520 | |
| 1521 To retrieve common data fields which exist for all the compounds in | |
| 1522 a SD file and generate a TSV text file NewSample.tsv, type: | |
| 1523 | |
| 1524 % ExtractFromSDFiles.pl -m commondatafields --outdelim tab -r NewSample | |
| 1525 --output Text -o Sample.sdf | |
| 1526 | |
| 1527 To retrieve Mol_ID, MolWeight, and CompoundName data fields from a SD file and generate a | |
| 1528 CSV text file NewSample.csv, type: | |
| 1529 | |
| 1530 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID,MolWeight, | |
| 1531 CompoundName" -r NewSample --output Text -o Sample.sdf | |
| 1532 | |
| 1533 To retrieve compounds which meet a specific set of criteria - MolWt <= 450, | |
| 1534 LogP <= 5 and SumNO <= 10 - from a SD file and generate a new SD file NewSample.sdf, | |
| 1535 type: | |
| 1536 | |
| 1537 % ExtractFromSDFiles.pl -m datafieldsbyvalue -d "MolWt,450,le,LogP | |
| 1538 ,5,le,SumNO,10,le" -r NewSample -o Sample.sdf | |
| 1539 | |
| 1540 To retrieve compounds from a SD file with a specific set of values for MolID and | |
| 1541 generate a new SD file NewSample.sdf, type: | |
| 1542 | |
| 1543 % ExtractFromSDFiles.pl -m datafieldbylist -d "Mol_ID,159,4509,4619" | |
| 1544 -r NewSample -o Sample.sdf | |
| 1545 | |
| 1546 To retrieve compounds from a SD file with values for MolID not on a list of specified | |
| 1547 values and generate a new SD file NewSample.sdf, type: | |
| 1548 | |
| 1549 % ExtractFromSDFiles.pl -m datafieldnotbylist -d "Mol_ID,159,4509,4619" | |
| 1550 -r NewSample -o Sample.sdf | |
| 1551 | |
| 1552 To retrieve 10 random compounds from a SD file and generate a new SD file RandomSample.sdf, type: | |
| 1553 | |
| 1554 % ExtractFromSDFiles.pl -m randomcmpds -n 10 -r RandomSample | |
| 1555 -o Sample.sdf | |
| 1556 | |
| 1557 To retrieve compound record number 10 from a SD file and generate a new SD file NewSample.sdf, type: | |
| 1558 | |
| 1559 % ExtractFromSDFiles.pl -m recordnum --record 10 -r NewSample | |
| 1560 -o Sample.sdf | |
| 1561 | |
| 1562 To retrieve compound record numbers 10, 20 and 30 from a SD file and generate a new SD file | |
| 1563 NewSample.sdf, type: | |
| 1564 | |
| 1565 % ExtractFromSDFiles.pl -m recordnums --record 10,20,30 -r NewSample | |
| 1566 -o Sample.sdf | |
| 1567 | |
| 1568 To retrieve compound records between 10 and 20 from SD file and generate a new SD | |
| 1569 file NewSample.sdf, type: | |
| 1570 | |
| 1571 % ExtractFromSDFiles.pl -m recordrange --record 10,20 -r NewSample | |
| 1572 -o Sample.sdf | |
| 1573 | |
| 1574 =head1 AUTHOR | |
| 1575 | |
| 1576 Manish Sud <msud@san.rr.com> | |
| 1577 | |
| 1578 =head1 SEE ALSO | |
| 1579 | |
| 1580 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl | |
| 1581 | |
| 1582 =head1 COPYRIGHT | |
| 1583 | |
| 1584 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 1585 | |
| 1586 This file is part of MayaChemTools. | |
| 1587 | |
| 1588 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 1589 the terms of the GNU Lesser General Public License as published by the Free | |
| 1590 Software Foundation; either version 3 of the License, or (at your option) | |
| 1591 any later version. | |
| 1592 | |
| 1593 =cut |
