Mercurial > repos > deepakjadmin > mayatool3_test2
diff bin/ExtractFromSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/ExtractFromSDFiles.pl Wed Jan 20 09:23:18 2016 -0500 @@ -0,0 +1,1593 @@ +#!/usr/bin/perl -w +# +# $RCSfile: ExtractFromSDFiles.pl,v $ +# $Date: 2015/03/22 19:11:27 $ +# $Revision: 1.48 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use Benchmark; +use SDFileUtil; +use FileUtil; +use TextUtil; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename($0); +print "\n$ScriptName:Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@SDFilesList); +@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +# Collect information about SD files... +print "Checking input SD file(s)...\n"; +my(%SDFilesInfo); +RetrieveSDFilesInfo(); + +# Generate output files... +my($FileIndex); +if (@SDFilesList > 1) { + print "\nProcessing SD files...\n"; +} +for $FileIndex (0 .. $#SDFilesList) { + if ($SDFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $SDFilesList[$FileIndex]...\n"; + ExtractFromSDFile($FileIndex); + } +} +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# Extract data from a SD file... +sub ExtractFromSDFile { + my($FileIndex) = @_; + + OpenInputAndOutputFiles($FileIndex); + + MODE: { + if ($OptionsInfo{Mode} =~ /^AllDataFields$/i) { + ExtractAllDataFields($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^CommonDataFields$/i) { + ExtractCommonDataFields($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^DataFields$/i) { + ExtractDataFields($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^(DataFieldByList|DatafieldUniqueByList)$/i) { + ExtractDataFieldByList($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^DataFieldNotByList$/i) { + ExtractDataFieldNotByList($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^DataFieldsByValue$/i) { + ExtractDataFieldsByValue($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^DataFieldsByRegex$/i) { + ExtractDataFieldsByRegex($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^RandomCmpds$/i) { + ExtractRandomCompounds($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^MolNames$/i) { + ExtractMolNames($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^RecordNum$/i) { + ExtractRecordNum($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^RecordNums$/i) { + ExtractRecordNums($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^RecordRange$/i) { + ExtractRecordRange($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^2DCmpdRecords$/i) { + Extract2DCmpdRecords($FileIndex); + last MODE; + } + if ($OptionsInfo{Mode} =~ /^3DCmpdRecords$/i) { + Extract3DCmpdRecords($FileIndex); + last MODE; + } + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; + } + + CloseInputAndOutputFiles(); +} + +# Extract all data fields... +sub ExtractAllDataFields { + my($FileIndex) = @_; + my(@CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + SetupDataValues(); + WriteTextFileCmpdData(); + WriteSDFileCmpdData(); + } +} + +# Extract common data fields... +sub ExtractCommonDataFields { + my($FileIndex) = @_; + my(@CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + SetupDataValues(); + WriteTextFileCmpdData(); + WriteSDFileCmpdData(); + } +} + +# Extract specified data fields... +sub ExtractDataFields { + my($FileIndex) = @_; + my(@CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}}; + WriteTextFileColLabels(); + + while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + SetupDataValues(); + WriteTextFileCmpdData(); + WriteSDFileCmpdData(); + } +} + +# Extract data fields using a list... +sub ExtractDataFieldByList { + my($FileIndex) = @_; + my($CmpdNum, $Value, $SpecifiedDataFieldValuesFoundCount, $CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + for $Value (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) { + $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound"; + } + $SpecifiedDataFieldValuesFoundCount = 0; + $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel}; + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + $CmpdNum++; + + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) { + next CMPDSTRING; + } + + SetupDataValues(); + + $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel}; + $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}; + + if (exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) { + if ($SpecifiedDataFieldValuesFoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) { + if ($OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} eq "NotFound") { + $SpecifiedDataFieldValuesFoundCount++; + $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue} = "Found"; + if ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) { + WriteSDFileCmpdString(); + WriteTextFileCmpdData(); + } + } + if ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) { + WriteSDFileCmpdString(); + WriteTextFileCmpdData(); + } + } + if ($SpecifiedDataFieldValuesFoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount}) { + last CMPDSTRING; + } + } + } +} + +# Extract data field whose values are not on the specified list... +sub ExtractDataFieldNotByList { + my($FileIndex) = @_; + my($CurrentValue, $SpecifiedDataFieldLabel, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + $SpecifiedDataFieldLabel = $OptionsInfo{SpecifiedDataFieldLabel}; + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + if (!exists $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}) { + next CMPDSTRING; + } + + SetupDataValues(); + + $CurrentValue = $SDFilesInfo{DataFieldValues}{$SpecifiedDataFieldLabel}; + + # Make sure the current value is not empty and is not only specified list of values... + if (IsEmpty($CurrentValue) || exists $OptionsInfo{SpecifiedDataFieldValues}{$CurrentValue}) { + next CMPDSTRING; + } + + WriteSDFileCmpdString(); + WriteTextFileCmpdData(); + } +} + +# Extract data fields by value... +sub ExtractDataFieldsByValue { + my($FileIndex) = @_; + my($Label, $CurrentValue, $SpecifiedCriterion, $SpecifiedValue, $ViolationCount, $Nothing, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + SetupDataValues(); + $ViolationCount = 0; + + for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) { + if (exists $SDFilesInfo{DataFieldValues}{$Label}) { + $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label}; + $SpecifiedCriterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label}; + $SpecifiedValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label}; + + if ($OptionsInfo{NumericalComparison}) { + CRITERION: { + if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue != $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue > $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue < $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + $Nothing = 1; + } + } + else { + CRITERION: { + if ($SpecifiedCriterion =~ /^eq$/i) { if ($CurrentValue ne $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedCriterion =~ /^le$/i) { if ($CurrentValue gt $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedCriterion =~ /^ge$/i) { if ($CurrentValue lt $SpecifiedValue) { $ViolationCount++; last CRITERION; } } + $Nothing = 1; + } + } + } + } + if ($ViolationCount <= $OptionsInfo{Violations}) { + WriteSDFileCmpdString(); + WriteTextFileCmpdData(); + } + } +} + +# Extract data fields by value using regular expression match... +sub ExtractDataFieldsByRegex { + my($FileIndex) = @_; + my($Label, $CurrentValue, $SpecifiedRegexCriterion, $SpecifiedRegex, $ViolationCount, $Nothing, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + + SetupDataValues(); + $ViolationCount = 0; + + for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) { + if (exists $SDFilesInfo{DataFieldValues}{$Label}) { + $CurrentValue = $SDFilesInfo{DataFieldValues}{$Label}; + $SpecifiedRegexCriterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label}; + $SpecifiedRegex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label}; + + if ($OptionsInfo{RegexIgnoreCase}) { + CRITERION: { + if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/i) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/i) { $ViolationCount++; last CRITERION; } } + $Nothing = 1; + } + } + else { + CRITERION: { + if ($SpecifiedRegexCriterion =~ /^eq$/i) { if ($CurrentValue !~ /$SpecifiedRegex/) { $ViolationCount++; last CRITERION; } } + if ($SpecifiedRegexCriterion =~ /^ne$/i) { if ($CurrentValue =~ /$SpecifiedRegex/) { $ViolationCount++; last CRITERION; } } + $Nothing = 1; + } + } + } + } + if ($ViolationCount <= $OptionsInfo{Violations}) { + WriteSDFileCmpdString(); + WriteTextFileCmpdData(); + } + } +} + +# Extract random compounds... +sub ExtractRandomCompounds { + my($FileIndex) = @_; + my($CmpdNum, $CmpdCount, $RandomCycleCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex]; + srand($OptionsInfo{Seed}); + $RandomCycleCount = 0; + + %RandomCmpdIndexMap = (); + while ($RandomCycleCount <= $CmpdCount && $RandomCycleCount <= $OptionsInfo{NumOfCmpds}) { + $RandomCycleCount++; + $RandomIndex = int (rand $CmpdCount) + 1; + $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex; + } + + $CmpdNum = 0; + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + $CmpdNum++; + if (!exists $RandomCmpdIndexMap{$CmpdNum}) { + next CMPDSTRING; + } + + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + } +} + +# Extract mol names... +sub ExtractMolNames { + my($FileIndex) = @_; + my($MolName, $NewTextFileRef, @CmpdLines); + + push @{$SDFilesInfo{DataLabels}}, "MolName"; + WriteTextFileColLabels(); + + $NewTextFileRef = $SDFilesInfo{NewTextFileRef}; + while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote}); + print $NewTextFileRef "$MolName\n"; + } +} + +# Extract a specific compound record... +sub ExtractRecordNum { + my($FileIndex) = @_; + my($CmpdNum, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + $CmpdNum = 0; + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + $CmpdNum++; + if ($CmpdNum != $OptionsInfo{RecordNum}) { + next CMPDSTRING; + } + + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + last CMPDSTRING; + } +} + +# Extract a specific compound records... +sub ExtractRecordNums { + my($FileIndex) = @_; + my($CmpdNum, $CmpdCount, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + $CmpdNum = 0; + $CmpdCount = 0; + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + $CmpdNum++; + + if (exists $OptionsInfo{RecordNums}{$CmpdNum}) { + $CmpdCount++; + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + } + elsif ($CmpdNum > $OptionsInfo{RecordNumsMax} || $CmpdCount >= $OptionsInfo{RecordNumsCount}) { + last CMPDSTRING; + } + } +} + + +# Extract compounds in a specific record range... +sub ExtractRecordRange { + my($FileIndex) = @_; + my($CmpdNum, @CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + $CmpdNum = 0; + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + $CmpdNum++; + + if ($CmpdNum >= $OptionsInfo{StartRecordNum} && $CmpdNum <= $OptionsInfo{EndRecordNum}) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + } + elsif ($CmpdNum > $OptionsInfo{EndRecordNum}) { + last CMPDSTRING; + } + } +} + +# Extract 2D compound records... +sub Extract2DCmpdRecords { + my($FileIndex) = @_; + my(@CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + if (!IsCmpd2D(\@CmpdLines)) { + next CMPDSTRING; + } + + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + } +} + +# Extract 3D compound records... +sub Extract3DCmpdRecords { + my($FileIndex) = @_; + my(@CmpdLines); + + @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]}; + WriteTextFileColLabels(); + + + CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) { + @CmpdLines = split "\n", $SDFilesInfo{CmpdString}; + if (!IsCmpd3D(\@CmpdLines)) { + next CMPDSTRING; + } + + WriteSDFileCmpdString(); + + if ($OptionsInfo{OutputTextFile}) { + %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + SetupDataValues(); + WriteTextFileCmpdData(); + } + } +} + + +# Open input and output files... +sub OpenInputAndOutputFiles { + my($FileIndex) = @_; + + $SDFilesInfo{NewTextFileRef} = undef; + $SDFilesInfo{NewSDFileRef} = undef; + + if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) { + print "Generating files $SDFilesInfo{NewSDFileName}[$FileIndex] and $SDFilesInfo{NewTextFileName}[$FileIndex]...\n"; + } + elsif ($OptionsInfo{OutputSDFile}) { + print "Generating file $SDFilesInfo{NewSDFileName}[$FileIndex]...\n"; + } + else { + print "Generating file $SDFilesInfo{NewTextFileName}[$FileIndex]...\n"; + } + + if ($OptionsInfo{OutputSDFile}) { + open NEWSDFILE, ">$SDFilesInfo{NewSDFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewSDFileName}[$FileIndex]: $! \n"; + $SDFilesInfo{NewSDFileRef} = \*NEWSDFILE; + } + if ($OptionsInfo{OutputTextFile}) { + open NEWTEXTFILE, ">$SDFilesInfo{NewTextFileName}[$FileIndex]" or die "Error: Couldn't open $SDFilesInfo{NewTextFileName}[$FileIndex]: $! \n"; + $SDFilesInfo{NewTextFileRef} = \*NEWTEXTFILE; + } + + open SDFILE, "$SDFilesList[$FileIndex]" or die "Error: Couldn't open $SDFilesList[$FileIndex]: $! \n"; + $SDFilesInfo{InputSDFileRef} = \*SDFILE; + +} + +# Close open input and output files... +sub CloseInputAndOutputFiles { + if ($SDFilesInfo{NewSDFileRef}) { + close $SDFilesInfo{NewSDFileRef}; + } + if ($SDFilesInfo{NewTextFileRef}) { + close $SDFilesInfo{NewTextFileRef}; + } + + if ($SDFilesInfo{InputSDFileRef}) { + close $SDFilesInfo{InputSDFileRef}; + } + + $SDFilesInfo{NewTextFileRef} = undef; + $SDFilesInfo{NewSDFileRef} = undef; + $SDFilesInfo{InputSDFileRef} = undef; +} + +# Write out column labels for text file... +sub WriteTextFileColLabels { + my($ColLabelsLine, $NewTextFileRef); + + if (!$OptionsInfo{OutputTextFile}) { + return; + } + $NewTextFileRef = $SDFilesInfo{NewTextFileRef}; + + if ($OptionsInfo{OutputStrDataString}) { + # Append structure data string label... + my(@DataLabels); + + @DataLabels = (); + push @DataLabels, @{$SDFilesInfo{DataLabels}}; + push @DataLabels, "StructureDataString"; + + $ColLabelsLine = JoinWords(\@DataLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + } + else { + $ColLabelsLine = JoinWords(\@{$SDFilesInfo{DataLabels}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + } + print $NewTextFileRef "$ColLabelsLine\n"; +} + +# Setup values for data fields... +sub SetupDataValues { + @{$SDFilesInfo{DataValues}} = map { exists $SDFilesInfo{DataFieldValues}{$_} ? $SDFilesInfo{DataFieldValues}{$_} : "" } @{$SDFilesInfo{DataLabels}}; +} + +# Write out structure data and specific data fields to SD file... +sub WriteSDFileCmpdData { + my($MolString, $Count, $NewSDFileRef); + + if (!$OptionsInfo{OutputSDFile}) { + return; + } + + $NewSDFileRef = $SDFilesInfo{NewSDFileRef}; + + ($MolString) = split "M END", $SDFilesInfo{CmpdString}; + $MolString .= "M END"; + print $NewSDFileRef "$MolString\n"; + + for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) { + print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n"; + } + print $NewSDFileRef "\$\$\$\$\n"; +} + +# Write out compound string... +sub WriteSDFileCmpdString { + my($NewSDFileRef); + + if (!$OptionsInfo{OutputSDFile}) { + return; + } + + $NewSDFileRef = $SDFilesInfo{NewSDFileRef}; + print $NewSDFileRef "$SDFilesInfo{CmpdString}\n"; +} + +# Write out data for text file... +sub WriteTextFileCmpdData { + my($DataValuesLine, $NewTextFileRef); + + if (!$OptionsInfo{OutputTextFile}) { + return; + } + + $NewTextFileRef = $SDFilesInfo{NewTextFileRef}; + $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}); + + # Handle multiple lines data values for data fields by joining 'em using semicolons... + if ($DataValuesLine =~ /\n/) { + $DataValuesLine =~ s/\n/;/g; + } + + if ($OptionsInfo{OutputStrDataString}) { + # Append structure data string... + my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter); + + if ($OptionsInfo{StrDataStringWithFields}) { + $StrDataString = $SDFilesInfo{CmpdString}; + } + else { + ($StrDataString) = split "M END", $SDFilesInfo{CmpdString}; + $StrDataString .= "M END"; + } + $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter}; + $StrDataString =~ s/\n/$StrDataStringDelimiter/g; + + $OutDelim = $OptionsInfo{OutDelim}; + $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : ""; + + print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n"; + } + else { + print $NewTextFileRef "$DataValuesLine\n"; + } +} + +# Retrieve information about input SD files... +sub RetrieveSDFilesInfo { + my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount); + + %SDFilesInfo = (); + + @{$SDFilesInfo{FileOkay}} = (); + @{$SDFilesInfo{CmpdCount}} = (); + @{$SDFilesInfo{NewTextFileName}} = (); + @{$SDFilesInfo{NewSDFileName}} = (); + + @{$SDFilesInfo{AllDataFieldLabels}} = (); + @{$SDFilesInfo{CommonDataFieldLabels}} = (); + + FILELIST: for $Index (0 .. $#SDFilesList) { + $SDFile = $SDFilesList[$Index]; + + $SDFilesInfo{FileOkay}[$Index] = 0; + + $SDFilesInfo{CmpdCount}[$Index] = 0; + $SDFilesInfo{NewTextFileName}[$Index] = ""; + $SDFilesInfo{NewSDFileName}[$Index] = ""; + + @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = (); + @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = (); + + if (!(-e $SDFile)) { + warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; + next FILELIST; + } + + if (!CheckFileType($SDFile, "sd sdf")) { + warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; + next FILELIST; + } + + # Generate appropriate name for the new output file. + $FileDir = ""; $FileName = ""; $FileExt = ""; + ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); + $NewFileName = $FileName; + $NewFileName = $FileName . $OptionsInfo{FileNameMode}; + if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) { + my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); + if ($RootFileName && $RootFileExt) { + $NewFileName = $RootFileName; + } + else { + $NewFileName = $OptionsInfo{OutFileRoot}; + } + } + $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}"; + $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}"; + + if ($OptionsInfo{OutputSDFile}) { + if (lc($NewSDFileName) eq lc($SDFile)) { + warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n"; + print "Specify a different name using \"-r --root\" option or use default name.\n"; + next FILELIST; + } + } + + if (!$OptionsInfo{Overwrite}) { + if ($OptionsInfo{OutputSDFile}) { + if (-e $NewSDFileName) { + warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n"; + next FILELIST; + } + } + if ($OptionsInfo{OutputTextFile}) { + if (-e $NewTextFileName) { + warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n"; + next FILELIST; + } + } + } + + if (!open SDFILE, "$SDFile") { + warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n"; + next FILELIST; + } + + my($CountCmpds, $CollectDataFields); + my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap,@CommonDataFieldLabels); + + $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0; + + $CollectDataFields = (($OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^(datafieldsbyvalue|datafieldsbyregex)$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldbylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafielduniquebylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^datafieldnotbylist$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnum$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordnums$/i && $OptionsInfo{OutputTextFile}) || ($OptionsInfo{Mode} =~ /^recordrange$/i && $OptionsInfo{OutputTextFile})) ? 1 : 0; + + $CmpdCount = 0; + if ($CountCmpds || $CollectDataFields) { + @DataFieldLabels = (); + @CommonDataFieldLabels = (); + %DataFieldLabelsMap = (); + CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) { + $CmpdCount++; + if ($OptionsInfo{Mode} =~ /^recordnum$/i) { + if ($CmpdCount == $OptionsInfo{RecordNum}) { + @CmpdLines = split "\n", $CmpdString; + @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); + last CMPDSTRING; + } + } + if ($CollectDataFields) { + my($Label); + @CmpdLines = split "\n", $CmpdString; + # Process compound data header labels and figure out which ones are present for + # all the compounds... + if (@DataFieldLabels) { + my (@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines); + my(%CmpdDataFieldLabelsMap) = (); + # Setup a map for the current labels... + for $Label (@CmpdDataFieldLabels) { + $CmpdDataFieldLabelsMap{$Label} = "PresentInSome"; + } + # Check the presence old labels for this compound; otherwise, mark 'em new... + for $Label (@DataFieldLabels) { + if (!$CmpdDataFieldLabelsMap{$Label}) { + $DataFieldLabelsMap{$Label} = "PresentInSome"; + } + } + # Check the presence this compound in the old labels; otherwise, add 'em... + for $Label (@CmpdDataFieldLabels ) { + if (!$DataFieldLabelsMap{$Label}) { + # It's a new label... + push @DataFieldLabels, $Label; + $DataFieldLabelsMap{$Label} = "PresentInSome"; + } + } + } + else { + # Get the initial label set and set up a map... + @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines); + for $Label (@DataFieldLabels) { + $DataFieldLabelsMap{$Label} = "PresentInAll"; + } + } + # Identify the common data field labels... + if ($Options{mode} =~ /^commondatafields$/i) { + @CommonDataFieldLabels = (); + for $Label (@DataFieldLabels) { + if ($DataFieldLabelsMap{$Label} eq "PresentInAll") { + push @CommonDataFieldLabels, $Label; + } + } + } + } + } + } + + $SDFilesInfo{FileOkay}[$Index] = 1; + + $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName; + $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName; + + $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount; + + push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels; + push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels; + + close SDFILE; + } +} + +# Process options... +sub ProcessOptions { + %OptionsInfo = (); + + $OptionsInfo{Mode} = $Options{mode}; + + $OptionsInfo{InDelim} = "\,"; + if ($Options{indelim} =~ /^semicolon$/i) { + $OptionsInfo{InDelim} = "\;"; + } + elsif ($Options{indelim} =~ /^tab$/i) { + $OptionsInfo{InDelim} = "\t"; + } + + $OptionsInfo{OutDelim} = "\,"; + if ($Options{outdelim} =~ /^semicolon$/i) { + $OptionsInfo{OutDelim} = "\;"; + } + elsif ($Options{outdelim} =~ /^tab$/i) { + $OptionsInfo{OutDelim} = "\t"; + } + + $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0; + + $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0; + + $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef; + $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef; + + $OptionsInfo{NumOfCmpds} = $Options{numofcmpds}; + + $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode}; + $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0; + + $OptionsInfo{Violations} = $Options{violations}; + $OptionsInfo{Seed} = $Options{seed}; + + + if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { + if ($Options{datafields} || $Options{datafieldsfile}) { + if ($Options{datafields} && $Options{datafieldsfile}) { + die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; + } + } + else { + die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n"; + } + } + $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef; + $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef; + + $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0; + + %{$OptionsInfo{RecordNums}} = (); + $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0; + + $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef; + + if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) { + if ($Options{record}) { + my($Record, @RecordSplit); + + $Record = $Options{record}; + $Record =~ s/ //g; + + @RecordSplit = split ",", $Record; + + if ($Options{mode} =~ /^recordnum$/i ) { + if (@RecordSplit == 1) { + $OptionsInfo{RecordNum} = $RecordSplit[0]; + if ($OptionsInfo{RecordNum} <= 0) { + die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--records\" is not valid. Allowed values: > 0 \n"; + } + } + else { + die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n"; + } + } + elsif ($Options{mode} =~ /^recordnums$/i ) { + my($RecordNum, $RecordCount, @SortedRecordSplit); + + @SortedRecordSplit = sort { $a <=> $b } @RecordSplit; + + $RecordCount = 0; + RECORDNUM: for $RecordNum (@SortedRecordSplit) { + if (exists $OptionsInfo{RecordNums}{$RecordNum}) { + next RECORDNUM; + } + $RecordCount++; + $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum; + } + $OptionsInfo{RecordNumsCount} = $RecordCount; + $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0]; + $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit]; + } + else { + if (@RecordSplit == 2) { + $OptionsInfo{StartRecordNum} = $RecordSplit[0]; + $OptionsInfo{EndRecordNum} = $RecordSplit[1]; + if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) { + die "Error: The value pair specified, $Options{record}, for option \"--records\" is not valid. Allowed values: > 0 \n"; + } + } + else { + die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n"; + } + if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) { + die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n"; + } + } + } + else { + die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n"; + } + } + + @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); + + my(@Words, $Line, $Value); + if ($Options{mode} =~ /^datafields$/i) { + @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); + if ($Options{datafields}) { + @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields}; + } + elsif ($Options{datafieldsfile}) { + open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; + while ($Line = GetTextLine(\*DATAFIELDSFILE)) { + @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); + if (@Words) { + push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words; + } + } + close DATAFIELDSFILE; + } + } + elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) { + my(@DataFieldsByValueTriplets); + @DataFieldsByValueTriplets = (); + if ($Options{datafields}) { + @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields}; + } + elsif ($Options{datafieldsfile}) { + open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; + while ($Line = GetTextLine(\*DATAFIELDSFILE)) { + @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); + if (@Words) { + push @DataFieldsByValueTriplets, @Words; + } + } + close DATAFIELDSFILE; + } + if ((@DataFieldsByValueTriplets % 3)) { + if ($Options{datafields}) { + die "Error: Triplets not found in values specified by \"-d --datafields\" option\n"; + } + elsif ($Options{datafieldsfile}) { + die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n"; + } + } + my($Index, $Label, $Value, $Criterion); + + @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); + %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = (); + %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = (); + + for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) { + $Label = $DataFieldsByValueTriplets[$Index]; + $Value = $DataFieldsByValueTriplets[$Index + 1]; + $Criterion = $DataFieldsByValueTriplets[$Index + 2]; + + if ($Criterion =~ /^(eq|le|ge)$/i) { + push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; + $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value; + $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion; + } + else { + warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n"; + } + } + } + elsif ($Options{mode} =~ /^datafieldsbyregex$/i) { + my(@DataFieldsByRegexTriplets); + + @DataFieldsByRegexTriplets = (); + if ($Options{datafields}) { + @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields}); + } + elsif ($Options{datafieldsfile}) { + open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; + while ($Line = GetTextLine(\*DATAFIELDSFILE)) { + @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); + if (@Words) { + push @DataFieldsByRegexTriplets, @Words; + } + } + close DATAFIELDSFILE; + } + if ((@DataFieldsByRegexTriplets % 3)) { + if ($Options{datafields}) { + die "Error: Triplet not found in values specified by \"-d --datafields\" option\n"; + } + elsif ($Options{datafieldsfile}) { + die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n"; + } + } + + my($Index, $Label, $Value, $Criterion); + + @{$OptionsInfo{SpecifiedDataFieldLabels}} = (); + %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = (); + %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = (); + + for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) { + $Label = $DataFieldsByRegexTriplets[$Index]; + $Value = $DataFieldsByRegexTriplets[$Index + 1]; + $Criterion = $DataFieldsByRegexTriplets[$Index + 2]; + + if ($Criterion =~ /^(eq|ne)$/i) { + push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label; + $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value; + $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion; + } + else { + warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n"; + } + } + } + elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) { + my($Index, @DataFieldAndValuesList); + if ($Options{datafields}) { + @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields}; + } + elsif ($Options{datafieldsfile}) { + open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n"; + while ($Line = GetTextLine(\*DATAFIELDSFILE)) { + @Words = quotewords($OptionsInfo{InDelim}, 0, $Line); + if (@Words) { + push @DataFieldAndValuesList, @Words; + } + } + close DATAFIELDSFILE; + } + if (@DataFieldAndValuesList < 2) { + if ($Options{datafields}) { + die "Error: Invalid number of values specified by \"-d --datafields\" option\n"; + } + elsif ($Options{datafieldsfile}) { + die "Error: Invalid number values specified by \"--datafieldsfile\" option\n"; + } + } + + $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0]; + $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1; + %{$OptionsInfo{SpecifiedDataFieldValues}} = (); + + for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) { + $Value = $DataFieldAndValuesList[$Index]; + $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound"; + } + } + + $OptionsInfo{SDFileExt} = "sdf"; + $OptionsInfo{TextFileExt} = "csv"; + + if ($Options{outdelim} =~ /^tab$/i) { + $OptionsInfo{TextFileExt} = "tsv"; + } + + if ($Options{mode} =~ /^(alldatafields|molnames)$/i) { + $OptionsInfo{OutputSDFile} = 0; + $OptionsInfo{OutputTextFile} = 1; + } + else { + $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0; + $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0; + } + + $OptionsInfo{StrDataString} = $Options{strdatastring}; + $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter}; + + if (IsEmpty($Options{strdatastringdelimiter})) { + die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n"; + } + $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode}; + $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0; + + MODE: { + if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; } + if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; } + if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; } + if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; } + if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; } + if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; } + if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; } + if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; } + if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; } + if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; } + if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; } + if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; } + if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; } + if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; } + if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; } + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; + } + +} + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Retrieve all the options... + %Options = (); + $Options{numofcmpds} = 1; + $Options{mode} = "alldatafields"; + $Options{indelim} = "comma"; + $Options{outdelim} = "comma"; + $Options{output} = "SD"; + $Options{quote} = "yes"; + $Options{regexignorecase} = "yes"; + $Options{valuecomparisonmode} = "numeric"; + $Options{violations} = 0; + $Options{seed} = 123456789; + + $Options{strdatastring} = "no"; + $Options{strdatastringdelimiter} = "|"; + $Options{strdatastringmode} = "StrOnly"; + + if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if ($Options{numofcmpds} < 1) { + die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n"; + } + if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) { + die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n"; + } + if ($Options{violations} < 0) { + die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n"; + } + if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) { + die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n"; + } + if ($Options{output} !~ /^(SD|text|both)$/i) { + die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n"; + } + if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) { + die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; + } + if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { + die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; + } + if ($Options{quote} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{regexignorecase} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{strdatastring} !~ /^(yes|no)$/i) { + die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n"; + } + if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) { + die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n"; + } +} + +__END__ + +=head1 NAME + +ExtractFromSDFiles.pl - Extract specific data from SDFile(s) + +=head1 SYNOPSIS + +ExtractFromSDFiles.pl SDFile(s)... + +ExtractFromSDFiles.pl [B<-h, --help>] +[B<-d, --datafields> "fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value..."] +[B<--datafieldsfile> filename] [B<--indelim> comma | tab | semicolon] [B<-m, --mode> alldatafields | +commondatafields | | datafieldnotbylist | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | +datafielduniquebylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | +3dcmpdrecords ] [B<-n, --numofcmpds> number] [B<--outdelim> comma | tab | semicolon] +[B<--output> SD | text | both] [B<-o, --overwrite>] [B<-q, --quote> yes | no] +[B<--record> recnum | startrecnum,endrecnum] B<--RegexIgnoreCase> I<yes or no> +[B<-r, --root> rootname] [B<-s, --seed> number] [B<--StrDataString> yes | no] +[B<--StrDataStringDelimiter> text] [B<--StrDataStringMode> StrOnly | StrAndDataFields] +[B<--ValueComparisonMode> I<Numeric | Alphanumeric>] +[B<-v, --violations-> number] [B<-w, --workingdir> dirname] SDFile(s)... + +=head1 DESCRIPTION + +Extract specific data from I<SDFile(s)> and generate appropriate SD or CSV/TSV text +file(s). The structure data from SDFile(s) is not transferred to CSV/TSV text file(s). +Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf> +and I<.sd>. All other file names are ignored. All the SD files in a current directory +can be specified either by I<*.sdf> or the current directory name. + +=head1 OPTIONS + +=over 4 + +=item B<-h, --help> + +Print this help message. + +=item B<-d, --datafields> I<"fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value,..."> + +This value is mode specific. In general, it's a list of comma separated data field labels +and associated mode specific values. + +For I<datafields> mode, input value format is: I<fieldlabel,...>. Examples: + + Extreg + Extreg,CompoundName,ID + +For I<datafieldsbyvalue> mode, input value format contains these triplets: +I<fieldlabel,value, criteria...>. Possible values for criteria: I<le, ge or eq>. +The values of B<--ValueComparisonMode> indicates whether values are +compared numerical or string comarison operators. Default is to consider +data field values as numerical values and use numerical comparison operators. +Examples: + + MolWt,450,le + MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le + +For I<datafieldsbyregex> mode, input value format contains these triplets: +I<fieldlabel,regex, criteria...>. I<regex> corresponds to any valid regular expression +and is used to match the values for specified I<fieldlabel>. Possible values for criteria: + I<eq or ne>. During I<eq> and I<ne> values, data field label value is matched with +regular expression using =~ and !~ respectively. B<--RegexIgnoreCase> option +value is used to determine whether to ignore letter upper/lower case during +regular expression match. Examples: + + Name,ol,eq + Name,'^pat',ne + +For I<datafieldbylist> and I<datafielduniquebylist> mode, input value format is: +I<fieldlabel,value1,value2...>. This is equivalent to I<datafieldsbyvalue> mode with +this input value format:I<fieldlabel,value1,eq,fieldlabel,value2,eq,...>. For +I<datafielduniquebylist> mode, only unique compounds identified by first occurrence +of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds +are simply ignored. + +For I<datafieldnotbylist> mode, input value format is: I<fieldlabel,value1,value2...>. In this +mode, the script behaves exactly opposite of I<datafieldbylist> mode, and only those compounds +are extracted whose data field values don't match any specified data field value. + +=item B<--datafieldsfile> I<filename> + +Filename which contains various mode specific values. This option provides a way +to specify mode specific values in a file instead of entering them on the command +line using B<-d --datafields>. + +For I<datafields> mode, input file lines contain comma delimited field labels: +I<fieldlabel,...>. Example: + + Line 1:MolId + Line 2:"Extreg",CompoundName,ID + +For I<datafieldsbyvalue> mode, input file lines contains these comma separated triplets: +I<fieldlabel,value, criteria>. Possible values for criteria: I<le, ge or eq>. Examples: + + Line 1:MolWt,450,le + + Line 1:"MolWt",450,le,"LogP",5,le,"SumNumNO",10,le,"SumNHOH",5,le + + Line 1:MolWt,450,le + Line 2:"LogP",5,le + Line 3:"SumNumNO",10,le + Line 4: SumNHOH,5,le + +For I<datafieldbylist> and I<datafielduniquebylist> mode, input file line format is: + + Line 1:fieldlabel; + Subsequent lines:value1,value2... + +For I<datafieldbylist>, I<datafielduniquebylist>, and I<datafieldnotbylist> mode, input file +line format is: + + Line 1:fieldlabel; + Subsequent lines:value1,value2... + +For I<datafielduniquebylist> mode, only unique compounds identified by first occurrence +of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds +are simply ignored. Example: + + Line 1: MolID + Subsequent Lines: + 907508 + 832291,4642 + "1254","907303" + +=item B<--indelim> I<comma | tab | semicolon> + +Delimiter used to specify text values for B<-d --datafields> and B<--datafieldsfile> options. +Possible values: I<comma, tab, or semicolon>. Default value: I<comma>. + +=item B<-m, --mode> I<alldatafields | commondatafields | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | datafielduniquebylist | datafieldnotbylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | 3dcmpdrecords> + +Specify what to extract from I<SDFile(s)>. Possible values: I<alldatafields, commondatafields, +datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, +molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords>. +Default value: I<alldatafields>. + +For I<alldatafields> and I<molnames> mode, only a CSV/TSV text file is generated; for all +other modes, however, a SD file is generated by default - you can change the behavior to genereate +text file using I<--output> option. + +For I<3DCmpdRecords> mode, only those compounds with at least one non-zero value for Z atomic coordinates +are retrieved; however, during retrieval of compounds in I<2DCmpdRecords> mode, all Z atomic coordinates must +be zero. + +=item B<-n, --numofcmpds> I<number> + +Number of compouds to extract during I<randomcmpds> mode. + +=item B<--outdelim> I<comma | tab | semicolon> + +Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon> +Default value: I<comma> + +=item B<--output> I<SD | text | both> + +Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<SD>. For +I<alldatafields> and I<molnames> mode, this option is ingored and only a CSV/TSV text file is generated. + +=item B<-o, --overwrite> + +Overwrite existing files. + +=item B<-q, --quote> I<yes | no> + +Put quote around column values in output CSV/TSV text file(s). Possible values: +I<yes or no>. Default value: I<yes>. + +=item B<--record> I<recnum | recnums | startrecnum,endrecnum> + +Record number, record numbers or range of records to extract during I<recordnum>, I<recordnums> +and I<recordrange> mode. Input value format is: <num>, <num1,num2,...> and <startnum, endnum> +for I<recordnum>, I<recordnums> and I<recordrange> modes recpectively. Default value: none. + +=item B<--RegexIgnoreCase> I<yes or no> + +Specify whether to ingnore case during I<datafieldsbyregex> value of B<-m, --mode> option. +Possible values: I<yes or no>. Default value: I<yes>. + +=item B<-r, --root> I<rootname> + +New file name is generated using the root: <Root>.<Ext>. Default for new file +names: <SDFileName><mode>.<Ext>. The file type determines <Ext> value. +The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab +delimited text files respectively.This option is ignored for multiple input files. + +=item B<-s, --seed> I<number> + +Random number seed used for I<randomcmpds> mode. Default:123456789. + +=item B<--StrDataString> I<yes | no> + +Specify whether to write out structure data string to CSV/TSV text file(s). Possible values: +I<yes or no>. Default value: I<no>. + +The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure +data lines into a structure data string. + +This option is ignored during generation of SD file(s). + +=item B<--StrDataStringDelimiter> I<text> + +Delimiter for joining multiple stucture data lines into a string before writing to CSV/TSV text +file(s). Possible values: I<any alphanumeric text>. Default value: I<|>. + +This option is ignored during generation of SD file(s). + +=item B<--StrDataStringMode> I<StrOnly | StrAndDataFields> + +Specify whether to include SD data fields and values along with the structure data into structure +data string before writing it out to CSV/TSV text file(s). Possible values: I<StrOnly or StrAndDataFields>. +Default value: I<StrOnly>. + +The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure +data lines into a structure data string. + +This option is ignored during generation of SD file(s). + +=item B<--ValueComparisonMode> I<Numeric | Alphanumeric> + +Specify how to compare data field values during I<datafieldsbyvalue> mode: Compare +values using either numeric or string ((eq, le, ge) comparison operators. Possible values: +I<Numeric or Alphanumeric>. Defaule value: I<Numeric>. + +=item B<-v, --violations> I<number> + +Number of criterion violations allowed for values specified during I<datafieldsbyvalue> +and I<datafieldsbyregex> mode. Default value: I<0>. + +=item B<-w, --workingdir> I<dirname> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To retrieve all data fields from SD files and generate CSV text files, type: + + % ExtractFromSDFiles.pl -o Sample.sdf + % ExtractFromSDFiles.pl -o *.sdf + +To retrieve all data fields from SD file and generate CSV text files containing +a column with structure data as a string with | as line delimiter, type: + + % ExtractFromSDFiles.pl --StrDataString Yes -o Sample.sdf + +To retrieve MOL_ID data fileld from SD file and generate CSV text files containing +a column with structure data along with all data fields as a string with | as line +delimiter, type: + + % ExtractFromSDFiles.pl -m datafields -d "Mol_ID" --StrDataString Yes + --StrDataStringMode StrAndDataFields --StrDataStringDelimiter "|" + --output text -o Sample.sdf + +To retrieve common data fields which exists for all the compounds in +a SD file and generate a TSV text file NewSample.tsv, type: + + % ExtractFromSDFiles.pl -m commondatafields --outdelim tab -r NewSample + --output Text -o Sample.sdf + +To retrieve MolId, ExtReg, and CompoundName data field from a SD file and generate a +CSV text file NewSample.csv, type: + + % ExtractFromSDFiles.pl -m datafields -d "Mol_ID,MolWeight, + CompoundName" -r NewSample --output Text -o Sample.sdf + +To retrieve compounds from a SD which meet a specific set of criteria - MolWt <= 450, +LogP <= 5 and SumNO < 10 - from a SD file and generate a new SD file NewSample.sdf, +type: + + % ExtractFromSDFiles.pl -m datafieldsbyvalue -d "MolWt,450,le,LogP + ,5,le,SumNO,10" -r NewSample -o Sample.sdf + +To retrive compounds from a SD file with a specific set of values for MolID and +generate a new SD file NewSample.sdf, type: + + % ExtractFromSDFiles.pl -m datafieldbylist -d "Mol_ID,159,4509,4619" + -r NewSample -o Sample.sdf + +To retrive compounds from a SD file with values for MolID not on a list of specified +values and generate a new SD file NewSample.sdf, type: + + % ExtractFromSDFiles.pl -m datafieldnotbylist -d "Mol_ID,159,4509,4619" + -r NewSample -o Sample.sdf + +To retrive 10 random compounds from a SD file and generate a new SD file RandomSample.sdf, type: + + % ExtractFromSDFiles.pl -m randomcmpds -n 10 -r RandomSample + -o Sample.sdf + +To retrive compound record number 10 from a SD file and generate a new SD file NewSample.sdf, type: + + % ExtractFromSDFiles.pl -m recordnum --record 10 -r NewSample + -o Sample.sdf + +To retrive compound record numbers 10, 20 and 30 from a SD file and generate a new SD file +NewSample.sdf, type: + + % ExtractFromSDFiles.pl -m recordnums --record 10,20,30 -r NewSample + -o Sample.sdf + +To retrive compound records between 10 to 20 from SD file and generate a new SD +file NewSample.sdf, type: + + % ExtractFromSDFiles.pl -m recordrange --record 10,20 -r NewSample + -o Sample.sdf + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut