Mercurial > repos > deepakjadmin > mayatool3_test3
diff mayachemtools/bin/MACCSKeysFingerprints.pl @ 0:73ae111cf86f draft
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 11:55:01 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mayachemtools/bin/MACCSKeysFingerprints.pl Wed Jan 20 11:55:01 2016 -0500 @@ -0,0 +1,1718 @@ +#!/usr/bin/perl -w +# +# $RCSfile: MACCSKeysFingerprints.pl,v $ +# $Date: 2015/02/28 20:46:20 $ +# $Revision: 1.31 $ +# +# Author: Manish Sud <msud@san.rr.com> +# +# Copyright (C) 2015 Manish Sud. All rights reserved. +# +# This file is part of MayaChemTools. +# +# MayaChemTools is free software; you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# MayaChemTools is distributed in the hope that it will be useful, but without +# any warranty; without even the implied warranty of merchantability of fitness +# for a particular purpose. See the GNU Lesser General Public License for more +# details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or +# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, +# Boston, MA, 02111-1307, USA. +# + +use strict; +use FindBin; use lib "$FindBin::Bin/../lib"; +use Getopt::Long; +use File::Basename; +use Text::ParseWords; +use Benchmark; +use FileUtil; +use TextUtil; +use SDFileUtil; +use MoleculeFileIO; +use FileIO::FingerprintsSDFileIO; +use FileIO::FingerprintsTextFileIO; +use FileIO::FingerprintsFPFileIO; +use Fingerprints::MACCSKeys; + +my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime); + +# Autoflush STDOUT +$| = 1; + +# Starting message... +$ScriptName = basename($0); +print "\n$ScriptName: Starting...\n\n"; +$StartTime = new Benchmark; + +# Get the options and setup script... +SetupScriptUsage(); +if ($Options{help} || @ARGV < 1) { + die GetUsageFromPod("$FindBin::Bin/$ScriptName"); +} + +my(@SDFilesList); +@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd"); + +# Process options... +print "Processing options...\n"; +my(%OptionsInfo); +ProcessOptions(); + +# Setup information about input files... +print "Checking input SD file(s)...\n"; +my(%SDFilesInfo); +RetrieveSDFilesInfo(); + +# Process input files.. +my($FileIndex); +if (@SDFilesList > 1) { + print "\nProcessing SD files...\n"; +} +for $FileIndex (0 .. $#SDFilesList) { + if ($SDFilesInfo{FileOkay}[$FileIndex]) { + print "\nProcessing file $SDFilesList[$FileIndex]...\n"; + GenerateMACCSKeysFingerprints($FileIndex); + } +} +print "\n$ScriptName:Done...\n\n"; + +$EndTime = new Benchmark; +$TotalTime = timediff ($EndTime, $StartTime); +print "Total time: ", timestr($TotalTime), "\n"; + +############################################################################### + +# Generate fingerprints for a SD file... +# +sub GenerateMACCSKeysFingerprints { + my($FileIndex) = @_; + my($CmpdCount, $IgnoredCmpdCount, $SDFile, $MoleculeFileIO, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); + + $SDFile = $SDFilesList[$FileIndex]; + + # Setup output files... + # + ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex); + + $MoleculeFileIO = new MoleculeFileIO('Name' => $SDFile); + $MoleculeFileIO->Open(); + + $CmpdCount = 0; + $IgnoredCmpdCount = 0; + + COMPOUND: while ($Molecule = $MoleculeFileIO->ReadMolecule()) { + $CmpdCount++; + + # Filter compound data before calculating fingerprints... + if ($OptionsInfo{Filter}) { + if (CheckAndFilterCompound($CmpdCount, $Molecule)) { + $IgnoredCmpdCount++; + next COMPOUND; + } + } + + $MACCSKeysFingerprints = GenerateMoleculeFingerprints($Molecule); + if (!$MACCSKeysFingerprints) { + $IgnoredCmpdCount++; + ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule); + next COMPOUND; + } + + WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); + } + $MoleculeFileIO->Close(); + + if ($NewFPSDFileIO) { + $NewFPSDFileIO->Close(); + } + if ($NewFPTextFileIO) { + $NewFPTextFileIO->Close(); + } + if ($NewFPFileIO) { + $NewFPFileIO->Close(); + } + + WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount); +} + +# Process compound being ignored due to problems in fingerprints geneation... +# +sub ProcessIgnoredCompound { + my($Mode, $CmpdCount, $Molecule) = @_; + my($CmpdID, $DataFieldLabelAndValuesRef); + + $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues(); + $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef); + + MODE: { + if ($Mode =~ /^ContainsNonElementalData$/i) { + warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n"; + next MODE; + } + + if ($Mode =~ /^ContainsNoElementalData$/i) { + warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n"; + next MODE; + } + + if ($Mode =~ /^FingerprintsGenerationFailed$/i) { + warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n"; + next MODE; + } + warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n"; + } +} + +# Check and filter compounds.... +# +sub CheckAndFilterCompound { + my($CmpdCount, $Molecule) = @_; + my($ElementCount, $NonElementCount); + + ($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements(); + + if ($NonElementCount) { + ProcessIgnoredCompound('ContainsNonElementalData', $CmpdCount, $Molecule); + return 1; + } + + if (!$ElementCount) { + ProcessIgnoredCompound('ContainsNoElementalData', $CmpdCount, $Molecule); + return 1; + } + + return 0; +} + +# Write out compounds fingerprints generation summary statistics... +# +sub WriteFingerprintsGenerationSummaryStatistics { + my($CmpdCount, $IgnoredCmpdCount) = @_; + my($ProcessedCmpdCount); + + $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount; + + print "\nNumber of compounds: $CmpdCount\n"; + print "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n"; + print "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n"; +} + +# Open output files... +# +sub SetupAndOpenOutputFiles { + my($FileIndex) = @_; + my($NewFPSDFile, $NewFPFile, $NewFPTextFile, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO, %FingerprintsFileIOParams); + + ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = (undef) x 3; + + # Setup common parameters for fingerprints file IO objects... + # + %FingerprintsFileIOParams = (); + if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) { + %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder}); + } + elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) { + %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat}); + } + + if ($OptionsInfo{SDOutput}) { + $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex]; + print "Generating SD file $NewFPSDFile...\n"; + $NewFPSDFileIO = new FileIO::FingerprintsSDFileIO('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel}); + $NewFPSDFileIO->Open(); + } + + if ($OptionsInfo{FPOutput}) { + $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex]; + print "Generating FP file $NewFPFile...\n"; + $NewFPFileIO = new FileIO::FingerprintsFPFileIO('Name' => $NewFPFile, %FingerprintsFileIOParams); + $NewFPFileIO->Open(); + } + + if ($OptionsInfo{TextOutput}) { + my($ColLabelsRef); + + $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex]; + $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex); + + print "Generating text file $NewFPTextFile...\n"; + $NewFPTextFileIO = new FileIO::FingerprintsTextFileIO('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote}); + $NewFPTextFileIO->Open(); + } + + return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO); +} + +# Write fingerpritns and other data to appropriate output files... +# +sub WriteDataToOutputFiles { + my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_; + my($DataFieldLabelAndValuesRef); + + $DataFieldLabelAndValuesRef = undef; + if ($NewFPTextFileIO || $NewFPFileIO) { + $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues(); + } + + if ($NewFPSDFileIO) { + my($CmpdString); + + $CmpdString = $Molecule->GetInputMoleculeString(); + $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $CmpdString); + } + + if ($NewFPTextFileIO) { + my($ColValuesRef); + + $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef); + $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $ColValuesRef); + } + + if ($NewFPFileIO) { + my($CompoundID); + + $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef); + $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $CompoundID); + } +} + +# Generate approriate column labels for FPText output file... +# +sub SetupFPTextFileCoulmnLabels { + my($FileIndex) = @_; + my($Line, @ColLabels); + + @ColLabels = (); + if ($OptionsInfo{DataFieldsMode} =~ /^All$/i) { + push @ColLabels, @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}; + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) { + push @ColLabels, @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}; + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) { + push @ColLabels, @{$OptionsInfo{SpecifiedDataFields}}; + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) { + push @ColLabels, $OptionsInfo{CompoundIDLabel}; + } + # Add fingerprints label... + push @ColLabels, $OptionsInfo{FingerprintsLabel}; + + return \@ColLabels; +} + +# Generate column values FPText output file.. +# +sub SetupFPTextFileCoulmnValues { + my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_; + my(@ColValues); + + @ColValues = (); + if ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) { + push @ColValues, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef); + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^All$/i) { + @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}; + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^Common$/i) { + @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}; + } + elsif ($OptionsInfo{DataFieldsMode} =~ /^Specify$/i) { + @ColValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$OptionsInfo{SpecifiedDataFields}}; + } + + return \@ColValues; +} + +# Generate compound ID for FP and FPText output files.. +# +sub SetupCmpdIDForOutputFiles { + my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_; + my($CmpdID); + + $CmpdID = ''; + if ($OptionsInfo{CompoundIDMode} =~ /^MolNameOrLabelPrefix$/i) { + my($MolName); + $MolName = $Molecule->GetName(); + $CmpdID = $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}"; + } + elsif ($OptionsInfo{CompoundIDMode} =~ /^LabelPrefix$/i) { + $CmpdID = "$OptionsInfo{CompoundID}${CmpdCount}"; + } + elsif ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i) { + my($SpecifiedDataField); + $SpecifiedDataField = $OptionsInfo{CompoundID}; + $CmpdID = exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : ''; + } + elsif ($OptionsInfo{CompoundIDMode} =~ /^MolName$/i) { + $CmpdID = $Molecule->GetName(); + } + return $CmpdID; +} + +# Generate fingerprints for molecule... +# +sub GenerateMoleculeFingerprints { + my($Molecule) = @_; + my($MACCSKeysFingerprints); + + if ($OptionsInfo{KeepLargestComponent}) { + $Molecule->KeepLargestComponent(); + } + if (!$Molecule->DetectRings()) { + return undef; + } + $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel}); + $Molecule->DetectAromaticity(); + + $MACCSKeysFingerprints = undef; + if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) { + $MACCSKeysFingerprints = new Fingerprints::MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyBits', 'Size' => $OptionsInfo{Size}); + } + elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) { + $MACCSKeysFingerprints = new Fingerprints::MACCSKeys('Molecule' => $Molecule, 'Type' => 'MACCSKeyCount', 'Size' => $OptionsInfo{Size}); + } + else { + die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n"; + } + $MACCSKeysFingerprints->GenerateMACCSKeys(); + + return $MACCSKeysFingerprints; +} + +# Retrieve information about SD files... +# +sub RetrieveSDFilesInfo { + my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef); + + %SDFilesInfo = (); + @{$SDFilesInfo{FileOkay}} = (); + @{$SDFilesInfo{OutFileRoot}} = (); + @{$SDFilesInfo{SDOutFileNames}} = (); + @{$SDFilesInfo{FPOutFileNames}} = (); + @{$SDFilesInfo{TextOutFileNames}} = (); + @{$SDFilesInfo{AllDataFieldsRef}} = (); + @{$SDFilesInfo{CommonDataFieldsRef}} = (); + + $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0; + $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0; + + FILELIST: for $Index (0 .. $#SDFilesList) { + $SDFile = $SDFilesList[$Index]; + + $SDFilesInfo{FileOkay}[$Index] = 0; + $SDFilesInfo{OutFileRoot}[$Index] = ''; + $SDFilesInfo{SDOutFileNames}[$Index] = ''; + $SDFilesInfo{FPOutFileNames}[$Index] = ''; + $SDFilesInfo{TextOutFileNames}[$Index] = ''; + + $SDFile = $SDFilesList[$Index]; + if (!(-e $SDFile)) { + warn "Warning: Ignoring file $SDFile: It doesn't exist\n"; + next FILELIST; + } + if (!CheckFileType($SDFile, "sd sdf")) { + warn "Warning: Ignoring file $SDFile: It's not a SD file\n"; + next FILELIST; + } + + if ($CheckDataField) { + # Make sure data field exists in SD file.. + my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues); + + @CmpdLines = (); + open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n"; + $CmpdString = ReadCmpdString(\*SDFILE); + close SDFILE; + @CmpdLines = split "\n", $CmpdString; + %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines); + $SpecifiedDataField = $OptionsInfo{CompoundID}; + if (!exists $DataFieldValues{$SpecifiedDataField}) { + warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n"; + next FILELIST; + } + } + + $AllDataFieldsRef = ''; + $CommonDataFieldsRef = ''; + if ($CollectDataFields) { + my($CmpdCount); + open SDFILE, "$SDFile" or die "Error: Couldn't open $SDFile: $! \n"; + ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE); + close SDFILE; + } + + # Setup output file names... + $FileDir = ""; $FileName = ""; $FileExt = ""; + ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile); + + $TextOutFileExt = "csv"; + if ($Options{outdelim} =~ /^tab$/i) { + $TextOutFileExt = "tsv"; + } + $SDOutFileExt = $FileExt; + $FPOutFileExt = "fpf"; + + if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) { + my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot}); + if ($RootFileName && $RootFileExt) { + $FileName = $RootFileName; + } + else { + $FileName = $OptionsInfo{OutFileRoot}; + } + $OutFileRoot = $FileName; + } + else { + $OutFileRoot = "${FileName}MACCSKeysFP"; + } + + $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}"; + $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}"; + $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}"; + + if ($OptionsInfo{SDOutput}) { + if ($SDFile =~ /$NewSDFileName/i) { + warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n"; + print "Specify a different name using \"-r --root\" option or use default name.\n"; + next FILELIST; + } + } + + if (!$OptionsInfo{OverwriteFiles}) { + # Check SD and text outout files... + if ($OptionsInfo{SDOutput}) { + if (-e $NewSDFileName) { + warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n"; + next FILELIST; + } + } + if ($OptionsInfo{FPOutput}) { + if (-e $NewFPFileName) { + warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n"; + next FILELIST; + } + } + if ($OptionsInfo{TextOutput}) { + if (-e $NewTextFileName) { + warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n"; + next FILELIST; + } + } + } + + $SDFilesInfo{FileOkay}[$Index] = 1; + + $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot; + $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName; + $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName; + $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName; + + $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef; + $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef; + } +} + +# Process option values... +sub ProcessOptions { + %OptionsInfo = (); + + $OptionsInfo{Mode} = $Options{mode}; + $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel}; + + $OptionsInfo{BitsOrder} = $Options{bitsorder}; + $OptionsInfo{BitStringFormat} = $Options{bitstringformat}; + + $OptionsInfo{CompoundIDMode} = $Options{compoundidmode}; + $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel}; + $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode}; + + $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0; + + my(@SpecifiedDataFields); + @SpecifiedDataFields = (); + + @{$OptionsInfo{SpecifiedDataFields}} = (); + $OptionsInfo{CompoundID} = ''; + + if ($Options{datafieldsmode} =~ /^CompoundID$/i) { + if ($Options{compoundidmode} =~ /^DataField$/i) { + if (!$Options{compoundid}) { + die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n"; + } + $OptionsInfo{CompoundID} = $Options{compoundid}; + } + elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) { + $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd'; + } + } + elsif ($Options{datafieldsmode} =~ /^Specify$/i) { + if (!$Options{datafields}) { + die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n"; + } + @SpecifiedDataFields = split /\,/, $Options{datafields}; + push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields; + } + + $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints'; + + $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{Output} = $Options{output}; + $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0; + $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0; + $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0; + + $OptionsInfo{OutDelim} = $Options{outdelim}; + $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0; + + $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0; + $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0; + + $OptionsInfo{Size} = $Options{size}; + + $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat}; +} + +# Setup script usage and retrieve command line arguments specified using various options... +sub SetupScriptUsage { + + # Retrieve all the options... + %Options = (); + + $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel'; + + $Options{bitsorder} = 'Ascending'; + $Options{bitstringformat} = 'BinaryString'; + + $Options{compoundidmode} = 'LabelPrefix'; + $Options{compoundidlabel} = 'CompoundID'; + $Options{datafieldsmode} = 'CompoundID'; + + $Options{filter} = 'Yes'; + + $Options{keeplargestcomponent} = 'Yes'; + + $Options{mode} = 'MACCSKeyBits'; + + $Options{output} = 'text'; + $Options{outdelim} = 'comma'; + $Options{quote} = 'yes'; + + $Options{size} = 166; + + $Options{vectorstringformat} = 'ValuesString'; + + if (!GetOptions(\%Options, "aromaticitymodel=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "vectorstringformat|v=s", "workingdir|w=s")) { + die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n"; + } + if ($Options{workingdir}) { + if (! -d $Options{workingdir}) { + die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n"; + } + chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n"; + } + if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) { + my(@SupportedModels) = Molecule::GetSupportedAromaticityModels(); + die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n"; + } + if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) { + die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n"; + } + if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) { + die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n"; + } + if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) { + die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"; + } + if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) { + die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n"; + } + if ($Options{filter} !~ /^(Yes|No)$/i) { + die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n"; + } + if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) { + die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n"; + } + if ($Options{mode} !~ /^(MACCSKeyBits|MACCSKeyCount)$/i) { + die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n"; + } + if ($Options{output} !~ /^(SD|FP|text|all)$/i) { + die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n"; + } + if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) { + die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n"; + } + if ($Options{quote} !~ /^(Yes|No)$/i) { + die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n"; + } + if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) { + die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n"; + } + if (!(IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322))) { + die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n"; + } + if ($Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) { + die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n"; + } +} + +__END__ + +=head1 NAME + +MACCSKeysFingerprints.pl - Generate MACCS key fingerprints for SD files + +=head1 SYNOPSIS + +MACCSKeysFingerprints.pl SDFile(s)... + +MACCSKeysFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>] +[B<--BitsOrder> I<Ascending | Descending>] +[B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>] +[B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>] +[B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] +[B<--DataFields> I<"FieldLabel1,FieldLabel2,...">] [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] +[B<-f, --Filter> I<Yes | No>] [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>] +[B<-m, --mode> I<MACCSKeyBits | MACCSKeyCount>] [B<--OutDelim> I<comma | tab | semicolon>] +[B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>] +[B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<-s, --size> I<number>] +[B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>] +[B<-w, --WorkingDir> I<DirName>] + +=head1 DESCRIPTION + +Generate MACCS (Molecular ACCess System) keys fingerprints [ Ref 45-47 ] for I<SDFile(s)> +and create appropriate SD, FP or CSV/TSV text file(s) containing fingerprints bit-vector or +vector strings corresponding to molecular fingerprints. + +Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf> +and I<.sd>. All other file names are ignored. All the SD files in a current directory +can be specified either by I<*.sdf> or the current directory name. + +For each MACCS keys definition, atoms are processed to determine their membership to the key +and the appropriate molecular fingerprints strings are generated. An atom can belong to multiple +MACCS keys. + +For I<MACCSKeyBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing +zeros and ones is generated and for I<MACCSKeyCount> value, a fingerprint vector string +corresponding to number of MACCS keys [ Ref 45-47 ] is generated. + +I<MACCSKeyBits | MACCSKeyCount> values for B<-m, --mode> option along with two possible +I<166 | 322> values of B<-s, --size> supports generation of four different types of MACCS +keys fingerprint: I<MACCS166KeyBits, MACCS166KeyCount, MACCS322KeyBits, MACCS322KeyCount>. + +Example of I<SD> file containing MAACS keys fingerprints string data: + + ... ... + ... ... + $$$$ + ... ... + ... ... + ... ... + 41 44 0 0 0 0 0 0 0 0999 V2000 + -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 + ... ... + 2 3 1 0 0 0 0 + ... ... + M END + > <CmpdID> + Cmpd1 + + > <MACCSKeysFingerprints> + FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;000000000 + 00000000000000000000000000000000100100001001000000001001000000001110001 + 00101010111100011011000100110110000011011110100110111111111111011111111 + 11111111110111000 + + $$$$ + ... ... + ... ... + +Example of I<FP> file containing MAACS keys fingerprints string data: + + # + # Package = MayaChemTools 7.4 + # Release Date = Oct 21, 2010 + # + # TimeStamp = Fri Mar 11 14:57:24 2011 + # + # FingerprintsStringType = FingerprintsBitVector + # + # Description = MACCSKeyBits + # Size = 166 + # BitStringFormat = BinaryString + # BitsOrder = Ascending + # + Cmpd1 00000000000000000000000000000000000000000100100001001000000001... + Cmpd2 00000000000000000000000010000000001000000010000000001000000000... + ... ... + ... .. + +Example of CSV I<Text> file containing MAACS keys fingerprints string data: + + "CompoundID","MACCSKeysFingerprints" + "Cmpd1","FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending; + 00000000000000000000000000000000000000000100100001001000000001001000000 + 00111000100101010111100011011000100110110000011011110100110111111111111 + 01111111111111111110111000" + ... ... + ... ... + +The current release of MayaChemTools generates the following types of MACCS keys +fingerprints bit-vector and vector strings: + + FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000 + 0000000000000000000000000000000001001000010010000000010010000000011100 + 0100101010111100011011000100110110000011011110100110111111111111011111 + 11111111111110111000 + + FingerprintsBitVector;MACCSKeyBits;166;HexadecimalString;Ascending;000 + 000000021210210e845f8d8c60b79dffbffffd1 + + FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011 + 1110011111100101111111000111101100110000000000000011100010000000000000 + 0000000000000000000000000000000000000000000000101000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000011000000000000000000000000000000 + 0000000000000000000000000000000000000000 + + FingerprintsBitVector;MACCSKeyBits;322;HexadecimalString;Ascending;7d7 + e7af3edc000c1100000000000000500000000000000000000000000000000300000000 + 000000000 + + FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri + ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 + 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0 + 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1 + 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1 + + FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri + ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0 + 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... + +=head1 OPTIONS + +=over 4 + +=item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel> + +Specify aromaticity model to use during detection of aromaticity. Possible values in the current +release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel, +ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel +or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>. + +The supported aromaticity model names along with model specific control parameters +are defined in B<AromaticityModelsData.csv>, which is distributed with the current release +and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from +this file during class instantiation and makes it available to method B<DetectAromaticity> +for detecting aromaticity corresponding to a specific model. + +=item B<--BitsOrder> I<Ascending | Descending> + +Bits order to use during generation of fingerprints bit-vector string for I<MACCSKeyBits> value of +B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>. + +I<Ascending> bit order which corresponds to first bit in each byte as the lowest bit as +opposed to the highest bit. + +Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless +of machine order, big-endian or little-endian, vec function always considers first +string byte as the lowest byte and first bit within each byte as the lowest bit. + +=item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString> + +Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by +B<--output> used during I<MACCSKeyBits> value of B<-m, --mode> option. Possible +values: I<BinaryString, HexadecimalString>. Default value: I<BinaryString>. + +I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString> +contains bit values in ASCII hexadecimal format. + +Examples: + + FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000 + 0000000000000000000000000000000001001000010010000000010010000000011100 + 0100101010111100011011000100110110000011011110100110111111111111011111 + 11111111111110111000 + + FingerprintsBitVector;MACCSKeyBits;166;HexadecimalString;Ascending;000 + 000000021210210e845f8d8c60b79dffbffffd1 + + FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011 + 1110011111100101111111000111101100110000000000000011100010000000000000 + 0000000000000000000000000000000000000000000000101000000000000000000000 + 0000000000000000000000000000000000000000000000000000000000000000000000 + 0000000000000000000000000000000000000011000000000000000000000000000000 + 0000000000000000000000000000000000000000 + + FingerprintsBitVector;MACCSKeyBits;322;HexadecimalString;Ascending;7d7 + e7af3edc000c1100000000000000500000000000000000000000000000000300000000 + 000000000 + +=item B<--CompoundID> I<DataFieldName or LabelPrefixString> + +This value is B<--CompoundIDMode> specific and indicates how compound ID is generated. + +For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name +whose value is used as compound ID; otherwise, it's a prefix string used for generating compound +IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which +look like Cmpd<Number>. + +Examples for I<DataField> value of B<--CompoundIDMode>: + + MolID + ExtReg + +Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>: + + Compound + +The value specified above generates compound IDs which correspond to Compound<Number> +instead of default value of Cmpd<Number>. + +=item B<--CompoundIDLabel> I<text> + +Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value +of B<--DataFieldsMode> option. Default: I<CompoundID>. + +=item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix> + +Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated +fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value; +use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination +of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines. + +Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>. +Default: I<LabelPrefix>. + +For I<MolNameAndLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes +precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname +values are replaced with sequential compound IDs. + +This is only used for I<CompoundID> value of B<--DataFieldsMode> option. + +=item B<--DataFields> I<"FieldLabel1,FieldLabel2,..."> + +Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along +with generated fingerprints for I<text | all> values of B<--output> option. + +This is only used for I<Specify> value of B<--DataFieldsMode> option. + +Examples: + + Extreg + MolID,CompoundName + +=item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID> + +Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along +with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD +data field; transfer SD data files common to all compounds; extract specified data fields; +generate a compound ID using molname line, a compound prefix, or a combination of both. +Possible values: I<All | Common | specify | CompoundID>. Default value: I<CompoundID>. + +=item B<-f, --Filter> I<Yes | No> + +Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>. +Default value: I<Yes>. + +By default, compound data is checked before calculating fingerprints and compounds containing +atom data corresponding to non-element symbols or no atom data are ignored. + +=item B<--FingerprintsLabel> I<text> + +SD data label or text file column label to use for fingerprints string in output SD or +CSV/TSV text file(s) specified by B<--output>. Default value: I<MACCSKeyFingerprints>. + +=item B<-h, --help> + +Print this help message. + +=item B<-k, --KeepLargestComponent> I<Yes | No> + +Generate fingerprints for only the largest component in molecule. Possible values: +I<Yes or No>. Default value: I<Yes>. + +For molecules containing multiple connected components, fingerprints can be generated +in two different ways: use all connected components or just the largest connected +component. By default, all atoms except for the largest connected component are +deleted before generation of fingerprints. + +=item B<-m, --mode> I<MACCSKeyBits | MACCSKeyCount> + +Specify type of MACCS keys [ Ref 45-47 ] fingerprints to generate for molecules in I<SDFile(s)>. +Possible values: I<MACCSKeyBits, MACCSKeyCount>. Default value: I<MACCSKeyBits>. + +For I<MACCSKeyBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing +zeros and ones is generated and for I<MACCSKeyCount> value, a fingerprint vector string +corresponding to number of MACCS keys is generated. + +I<MACCSKeyBits | MACCSKeyCount> values for B<-m, --mode> option along with two possible +I<166 | 322> values of B<-s, --size> supports generation of four different types of MACCS +keys fingerprint: I<MACCS166KeyBits, MACCS166KeyCount, MACCS322KeyBits, MACCS322KeyCount>. + +Definition of MACCS keys uses the following atom and bond symbols to define atom and +bond environments: + + Atom symbols for 166 keys [ Ref 47 ]: + + A : Any valid periodic table element symbol + Q : Hetro atoms; any non-C or non-H atom + X : Halogens; F, Cl, Br, I + Z : Others; other than H, C, N, O, Si, P, S, F, Cl, Br, I + + Atom symbols for 322 keys [ Ref 46 ]: + + A : Any valid periodic table element symbol + Q : Hetro atoms; any non-C or non-H atom + X : Others; other than H, C, N, O, Si, P, S, F, Cl, Br, I + Z is neither defined nor used + + Bond types: + + - : Single + = : Double + T : Triple + # : Triple + ~ : Single or double query bond + % : An aromatic query bond + + None : Any bond type; no explicit bond specified + + $ : Ring bond; $ before a bond type specifies ring bond + ! : Chain or non-ring bond; ! before a bond type specifies chain bond + + @ : A ring linkage and the number following it specifies the + atoms position in the line, thus @1 means linked back to the first + atom in the list. + + Aromatic: Kekule or Arom5 + + Kekule: Bonds in 6-membered rings with alternate single/double bonds + or perimeter bonds + Arom5: Bonds in 5-membered rings with two double bonds and a hetro + atom at the apex of the ring. + +MACCS 166 keys [ Ref 45-47 ] are defined as follows: + + Key Description + + 1 ISOTOPE + 2 103 < ATOMIC NO. < 256 + 3 GROUP IVA,VA,VIA PERIODS 4-6 (Ge...) + 4 ACTINIDE + 5 GROUP IIIB,IVB (Sc...) + 6 LANTHANIDE + 7 GROUP VB,VIB,VIIB (V...) + 8 QAAA@1 + 9 GROUP VIII (Fe...) + 10 GROUP IIA (ALKALINE EARTH) + 11 4M RING + 12 GROUP IB,IIB (Cu...) + 13 ON(C)C + 14 S-S + 15 OC(O)O + 16 QAA@1 + 17 CTC + 18 GROUP IIIA (B...) + 19 7M RING + 20 SI + 21 C=C(Q)Q + 22 3M RING + 23 NC(O)O + 24 N-O + 25 NC(N)N + 26 C$=C($A)$A + 27 I + 28 QCH2Q + 29 P + 30 CQ(C)(C)A + 31 QX + 32 CSN + 33 NS + 34 CH2=A + 35 GROUP IA (ALKALI METAL) + 36 S HETEROCYCLE + 37 NC(O)N + 38 NC(C)N + 39 OS(O)O + 40 S-O + 41 CTN + 42 F + 43 QHAQH + 44 OTHER + 45 C=CN + 46 BR + 47 SAN + 48 OQ(O)O + 49 CHARGE + 50 C=C(C)C + 51 CSO + 52 NN + 53 QHAAAQH + 54 QHAAQH + 55 OSO + 56 ON(O)C + 57 O HETEROCYCLE + 58 QSQ + 59 Snot%A%A + 60 S=O + 61 AS(A)A + 62 A$A!A$A + 63 N=O + 64 A$A!S + 65 C%N + 66 CC(C)(C)A + 67 QS + 68 QHQH (&...) + 69 QQH + 70 QNQ + 71 NO + 72 OAAO + 73 S=A + 74 CH3ACH3 + 75 A!N$A + 76 C=C(A)A + 77 NAN + 78 C=N + 79 NAAN + 80 NAAAN + 81 SA(A)A + 82 ACH2QH + 83 QAAAA@1 + 84 NH2 + 85 CN(C)C + 86 CH2QCH2 + 87 X!A$A + 88 S + 89 OAAAO + 90 QHAACH2A + 91 QHAAACH2A + 92 OC(N)C + 93 QCH3 + 94 QN + 95 NAAO + 96 5M RING + 97 NAAAO + 98 QAAAAA@1 + 99 C=C + 100 ACH2N + 101 8M RING + 102 QO + 103 CL + 104 QHACH2A + 105 A$A($A)$A + 106 QA(Q)Q + 107 XA(A)A + 108 CH3AAACH2A + 109 ACH2O + 110 NCO + 111 NACH2A + 112 AA(A)(A)A + 113 Onot%A%A + 114 CH3CH2A + 115 CH3ACH2A + 116 CH3AACH2A + 117 NAO + 118 ACH2CH2A > 1 + 119 N=A + 120 HETEROCYCLIC ATOM > 1 (&...) + 121 N HETEROCYCLE + 122 AN(A)A + 123 OCO + 124 QQ + 125 AROMATIC RING > 1 + 126 A!O!A + 127 A$A!O > 1 (&...) + 128 ACH2AAACH2A + 129 ACH2AACH2A + 130 QQ > 1 (&...) + 131 QH > 1 + 132 OACH2A + 133 A$A!N + 134 X (HALOGEN) + 135 Nnot%A%A + 136 O=A > 1 + 137 HETEROCYCLE + 138 QCH2A > 1 (&...) + 139 OH + 140 O > 3 (&...) + 141 CH3 > 2 (&...) + 142 N > 1 + 143 A$A!O + 144 Anot%A%Anot%A + 145 6M RING > 1 + 146 O > 2 + 147 ACH2CH2A + 148 AQ(A)A + 149 CH3 > 1 + 150 A!A$A!A + 151 NH + 152 OC(C)C + 153 QCH2A + 154 C=O + 155 A!CH2!A + 156 NA(A)A + 157 C-O + 158 C-N + 159 O > 1 + 160 CH3 + 161 N + 162 AROMATIC + 163 6M RING + 164 O + 165 RING + 166 FRAGMENTS + +MACCS 322 keys set as defined in tables 1, 2 and 3 [ Ref 46 ] include: + + . 26 atom properties of type P, as listed in Table 1 + . 32 one-atom environments, as listed in Table 3 + . 264 atom-bond-atom combinations listed in Table 4 + +Total number of keys in three tables is : 322 + +Atom symbol, X, used for 322 keys [ Ref 46 ] doesn't refer to Halogens as it does for 166 keys. In +order to keep the definition of 322 keys consistent with the published definitions, the symbol X is +used to imply "others" atoms, but it's internally mapped to symbol X as defined for 166 keys +during the generation of key values. + +Atom properties-based keys (26): + + Key Description + 1 A(AAA) or AA(A)A - atom with at least three neighbors + 2 Q - heteroatom + 3 Anot%not-A - atom involved in one or more multiple bonds, not aromatic + 4 A(AAAA) or AA(A)(A)A - atom with at least four neighbors + 5 A(QQ) or QA(Q) - atom with at least two heteroatom neighbors + 6 A(QQQ) or QA(Q)Q - atom with at least three heteroatom neighbors + 7 QH - heteroatom with at least one hydrogen attached + 8 CH2(AA) or ACH2A - carbon with at least two single bonds and at least + two hydrogens attached + 9 CH3(A) or ACH3 - carbon with at least one single bond and at least three + hydrogens attached + 10 Halogen + 11 A(-A-A-A) or A-A(-A)-A - atom has at least three single bonds + 12 AAAAAA@1 > 2 - atom is in at least two different six-membered rings + 13 A($A$A$A) or A$A($A)$A - atom has more than two ring bonds + 14 A$A!A$A - atom is at a ring/chain boundary. When a comparison is done + with another atom the path passes through the chain bond. + 15 Anot%A%Anot%A - atom is at an aromatic/nonaromatic boundary. When a + comparison is done with another atom the path + passes through the aromatic bond. + 16 A!A!A - atom with more than one chain bond + 17 A!A$A!A - atom is at a ring/chain boundary. When a comparison is done + with another atom the path passes through the ring bond. + 18 A%Anot%A%A - atom is at an aromatic/nonaromatic boundary. When a + comparison is done with another atom the + path passes through the nonaromatic bond. + 19 HETEROCYCLE - atom is a heteroatom in a ring. + 20 rare properties: atom with five or more neighbors, atom in + four or more rings, or atom types other than + H, C, N, O, S, F, Cl, Br, or I + 21 rare properties: atom has a charge, is an isotope, has two or + more multiple bonds, or has a triple bond. + 22 N - nitrogen + 23 S - sulfur + 24 O - oxygen + 25 A(AA)A(A)A(AA) - atom has two neighbors, each with three or + more neighbors (including the central atom). + 26 CHACH2 - atom has two hydrocarbon (CH2) neighbors + +Atomic environments properties-based keys (32): + + Key Description + 27 C(CC) + 28 C(CCC) + 29 C(CN) + 30 C(CCN) + 31 C(NN) + 32 C(NNC) + 33 C(NNN) + 34 C(CO) + 35 C(CCO) + 36 C(NO) + 37 C(NCO) + 38 C(NNO) + 39 C(OO) + 40 C(COO) + 41 C(NOO) + 42 C(OOO) + 43 Q(CC) + 44 Q(CCC) + 45 Q(CN) + 46 Q(CCN) + 47 Q(NN) + 48 Q(CNN) + 49 Q(NNN) + 50 Q(CO) + 51 Q(CCO) + 52 Q(NO) + 53 Q(CNO) + 54 Q(NNO) + 55 Q(OO) + 56 Q(COO) + 57 Q(NOO) + 58 Q(OOO) + +Note: The first symbol is the central atom, with atoms bonded to the central atom listed in +parentheses. Q is any non-C, non-H atom. If only two atoms are in parentheses, there is +no implication concerning the other atoms bonded to the central atom. + +Atom-Bond-Atom properties-based keys: (264) + + Key Description + 59 C-C + 60 C-N + 61 C-O + 62 C-S + 63 C-Cl + 64 C-P + 65 C-F + 66 C-Br + 67 C-Si + 68 C-I + 69 C-X + 70 N-N + 71 N-O + 72 N-S + 73 N-Cl + 74 N-P + 75 N-F + 76 N-Br + 77 N-Si + 78 N-I + 79 N-X + 80 O-O + 81 O-S + 82 O-Cl + 83 O-P + 84 O-F + 85 O-Br + 86 O-Si + 87 O-I + 88 O-X + 89 S-S + 90 S-Cl + 91 S-P + 92 S-F + 93 S-Br + 94 S-Si + 95 S-I + 96 S-X + 97 Cl-Cl + 98 Cl-P + 99 Cl-F + 100 Cl-Br + 101 Cl-Si + 102 Cl-I + 103 Cl-X + 104 P-P + 105 P-F + 106 P-Br + 107 P-Si + 108 P-I + 109 P-X + 110 F-F + 111 F-Br + 112 F-Si + 113 F-I + 114 F-X + 115 Br-Br + 116 Br-Si + 117 Br-I + 118 Br-X + 119 Si-Si + 120 Si-I + 121 Si-X + 122 I-I + 123 I-X + 124 X-X + 125 C=C + 126 C=N + 127 C=O + 128 C=S + 129 C=Cl + 130 C=P + 131 C=F + 132 C=Br + 133 C=Si + 134 C=I + 135 C=X + 136 N=N + 137 N=O + 138 N=S + 139 N=Cl + 140 N=P + 141 N=F + 142 N=Br + 143 N=Si + 144 N=I + 145 N=X + 146 O=O + 147 O=S + 148 O=Cl + 149 O=P + 150 O=F + 151 O=Br + 152 O=Si + 153 O=I + 154 O=X + 155 S=S + 156 S=Cl + 157 S=P + 158 S=F + 159 S=Br + 160 S=Si + 161 S=I + 162 S=X + 163 Cl=Cl + 164 Cl=P + 165 Cl=F + 166 Cl=Br + 167 Cl=Si + 168 Cl=I + 169 Cl=X + 170 P=P + 171 P=F + 172 P=Br + 173 P=Si + 174 P=I + 175 P=X + 176 F=F + 177 F=Br + 178 F=Si + 179 F=I + 180 F=X + 181 Br=Br + 182 Br=Si + 183 Br=I + 184 Br=X + 185 Si=Si + 186 Si=I + 187 Si=X + 188 I=I + 189 I=X + 190 X=X + 191 C#C + 192 C#N + 193 C#O + 194 C#S + 195 C#Cl + 196 C#P + 197 C#F + 198 C#Br + 199 C#Si + 200 C#I + 201 C#X + 202 N#N + 203 N#O + 204 N#S + 205 N#Cl + 206 N#P + 207 N#F + 208 N#Br + 209 N#Si + 210 N#I + 211 N#X + 212 O#O + 213 O#S + 214 O#Cl + 215 O#P + 216 O#F + 217 O#Br + 218 O#Si + 219 O#I + 220 O#X + 221 S#S + 222 S#Cl + 223 S#P + 224 S#F + 225 S#Br + 226 S#Si + 227 S#I + 228 S#X + 229 Cl#Cl + 230 Cl#P + 231 Cl#F + 232 Cl#Br + 233 Cl#Si + 234 Cl#I + 235 Cl#X + 236 P#P + 237 P#F + 238 P#Br + 239 P#Si + 240 P#I + 241 P#X + 242 F#F + 243 F#Br + 244 F#Si + 245 F#I + 246 F#X + 247 Br#Br + 248 Br#Si + 249 Br#I + 250 Br#X + 251 Si#Si + 252 Si#I + 253 Si#X + 254 I#I + 255 I#X + 256 X#X + 257 C$C + 258 C$N + 259 C$O + 260 C$S + 261 C$Cl + 262 C$P + 263 C$F + 264 C$Br + 265 C$Si + 266 C$I + 267 C$X + 268 N$N + 269 N$O + 270 N$S + 271 N$Cl + 272 N$P + 273 N$F + 274 N$Br + 275 N$Si + 276 N$I + 277 N$X + 278 O$O + 279 O$S + 280 O$Cl + 281 O$P + 282 O$F + 283 O$Br + 284 O$Si + 285 O$I + 286 O$X + 287 S$S + 288 S$Cl + 289 S$P + 290 S$F + 291 S$Br + 292 S$Si + 293 S$I + 294 S$X + 295 Cl$Cl + 296 Cl$P + 297 Cl$F + 298 Cl$Br + 299 Cl$Si + 300 Cl$I + 301 Cl$X + 302 P$P + 303 P$F + 304 P$Br + 305 P$Si + 306 P$I + 307 P$X + 308 F$F + 309 F$Br + 310 F$Si + 311 F$I + 312 F$X + 313 Br$Br + 314 Br$Si + 315 Br$I + 316 Br$X + 317 Si$Si + 318 Si$I + 319 Si$X + 320 I$I + 321 I$X + 322 X$X + +=item B<--OutDelim> I<comma | tab | semicolon> + +Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon> +Default value: I<comma>. + +=item B<--output> I<SD | FP | text | all> + +Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>. + +=item B<-o, --overwrite> + +Overwrite existing files. + +=item B<-q, --quote> I<Yes | No> + +Put quote around column values in output CSV/TSV text file(s). Possible values: +I<Yes or No>. Default value: I<Yes>. + +=item B<-r, --root> I<RootName> + +New file name is generated using the root: <Root>.<Ext>. Default for new file +names: <SDFileName><MACCSKeysFP>.<Ext>. The file type determines <Ext> value. +The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab +delimited text files, respectively.This option is ignored for multiple input files. + +=item B<-s, --size> I<number> + +Size of MACCS keys [ Ref 45-47 ] set to use during fingerprints generation. Possible values: I<166 or 322>. +Default value: I<166>. + +=item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString> + +Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by +B<--output> used during I<MACCSKeyCount> value of B<-m, --mode> option. Possible +values: I<ValuesString, IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | +ValuesAndIDsPairsString>. Defaultvalue: I<ValuesString>. + +Examples: + + FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri + ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 + 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0 + 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1 + 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1 + + FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri + ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0 + 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... + +=item B<-w, --WorkingDir> I<DirName> + +Location of working directory. Default: current directory. + +=back + +=head1 EXAMPLES + +To generate MACCS keys fingerprints of size 166 in binary bit-vector string format +and create a SampleMACCS166FPBin.csv file containing sequential compound IDs along with +fingerprints bit-vector strings data, type: + + % MACCSKeysFingerprints.pl -r SampleMACCS166FPBin -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 in binary bit-vector string format +and create SampleMACCS166FPBin.sdf, SampleMACCS166FPBin.csv and SampleMACCS166FPBin.csv +files containing sequential compound IDs in CSV file along with fingerprints bit-vector strings data, type: + + % MACCSKeysFingerprints.pl --output all -r SampleMACCS166FPBin + -o Sample.sdf + +To generate MACCS keys fingerprints of size 322 in binary bit-vector string format +and create a SampleMACCS322FPBin.csv file containing sequential compound IDs along with +fingerprints bit-vector strings data, type: + + % MACCSKeysFingerprints.pl -size 322 -r SampleMACCS322FPBin -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 corresponding to count of keys in +ValuesString format and create a SampleMACCS166FPCount.csv file containing sequential +compound IDs along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount -r SampleMACCS166FPCount + -o Sample.sdf + +To generate MACCS keys fingerprints of size 322 corresponding to count of keys in +ValuesString format and create a SampleMACCS322FPCount.csv file containing sequential +compound IDs along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount -size 322 + -r SampleMACCS322FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 in hexadecimal bit-vector string format with +ascending bits order and create a SampleMACCS166FPHex.csv file containing compound IDs +from MolName along with fingerprints bit-vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyBits --size 166 --BitStringFormat + HexadecimalString --BitsOrder Ascending --DataFieldsMode CompoundID + --CompoundIDMode MolName -r SampleMACCS166FPBin -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 corresponding to count of keys in +IDsAndValuesString format and create a SampleMACCS166FPCount.csv file containing +compound IDs from MolName line along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166 + --VectorStringFormat IDsAndValuesString --DataFieldsMode CompoundID + --CompoundIDMode MolName -r SampleMACCS166FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 corresponding to count of keys in +IDsAndValuesString format and create a SampleMACCS166FPCount.csv file containing +compound IDs using specified data field along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166 + --VectorStringFormat IDsAndValuesString --DataFieldsMode CompoundID + --CompoundIDMode DataField --CompoundID Mol_ID -r + SampleMACCS166FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 322 corresponding to count of keys in +ValuesString format and create a SampleMACCS322FPCount.tsv file containing compound +IDs derived from combination of molecule name line and an explicit compound prefix +along with fingerprints vector strings data in a column labels MACCSKeyCountFP, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount -size 322 --DataFieldsMode + CompoundID --CompoundIDMode MolnameOrLabelPrefix --CompoundID Cmpd + --CompoundIDLabel MolID --FingerprintsLabel MACCSKeyCountFP --OutDelim + Tab -r SampleMACCS322FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 corresponding to count of keys in +ValuesString format and create a SampleMACCS166FPCount.csv file containing +specific data fields columns along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166 + --VectorStringFormat ValuesString --DataFieldsMode Specify --DataFields + Mol_ID -r SampleMACCS166FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 322 corresponding to count of keys in +ValuesString format and create a SampleMACCS322FPCount.csv file containing +common data fields columns along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 322 + --VectorStringFormat ValuesString --DataFieldsMode Common -r + SampleMACCS322FPCount -o Sample.sdf + +To generate MACCS keys fingerprints of size 166 corresponding to count of keys in +ValuesString format and create SampleMACCS166FPCount.sdf, SampleMACCS166FPCount.fpf and +SampleMACCS166FPCount.csv files containing all data fields columns in CSV file +along with fingerprints vector strings data, type: + + % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166 --output all + --VectorStringFormat ValuesString --DataFieldsMode All -r + SampleMACCS166FPCount -o Sample.sdf + +=head1 AUTHOR + +Manish Sud <msud@san.rr.com> + +=head1 SEE ALSO + +InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl, +ExtendedConnectivityFingerprints.pl, PathLengthFingerprints.pl, +TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl, +TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl + +=head1 COPYRIGHT + +Copyright (C) 2015 Manish Sud. All rights reserved. + +This file is part of MayaChemTools. + +MayaChemTools is free software; you can redistribute it and/or modify it under +the terms of the GNU Lesser General Public License as published by the Free +Software Foundation; either version 3 of the License, or (at your option) +any later version. + +=cut