Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/SimilarityMatricesFingerprints.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 09:23:18 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:4816e4a8ae95 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: SimilarityMatricesFingerprints.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.21 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use File::Copy; | |
| 34 use Text::ParseWords; | |
| 35 use Benchmark; | |
| 36 use FileUtil; | |
| 37 use TextUtil; | |
| 38 use Fingerprints::FingerprintsFileUtil; | |
| 39 use Fingerprints::FingerprintsBitVector; | |
| 40 use Fingerprints::FingerprintsVector; | |
| 41 | |
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as soon as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Call the constructor directly instead of using the indirect object syntax,
# "new Benchmark", which perl may parse differently depending on the subs
# visible at compile time...
$StartTime = Benchmark->new;

# Get the options and setup script. The usage text is printed, and the script
# stops, when help is requested or no input file is specified...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@FingerprintsFilesList);
@FingerprintsFilesList = ExpandFileNames(\@ARGV, "sdf sd fpf fp csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input fingerprints file(s)...\n";
my(%FingerprintsFilesInfo);
RetrieveFingerprintsFilesInfo();

# Process input files; files marked as not okay during the check above are skipped...
my($FileIndex);
if (@FingerprintsFilesList > 1) {
  print "\nProcessing fingerprints files...\n";
}
for $FileIndex (0 .. $#FingerprintsFilesList) {
  if ($FingerprintsFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $FingerprintsFilesList[$FileIndex]...\n";
    GenerateSimilarityMatrices($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Report total run time...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 87 | |
| 88 ############################################################################### | |
| 89 | |
# Generate similarity matrices for the fingerprints file at the specified index
# in the input files list...
#
# The fingerprints data is prepared first, matrices are generated according to
# the fingerprints string mode recorded for the file, and the prepared data is
# cleaned up at the end.
#
sub GenerateSimilarityMatrices {
  my($FileIndex) = @_;

  ProcessFingerprintsData($FileIndex);

  # Bit vector mode takes precedence over vector mode; nothing is generated
  # when neither mode is set for the file...
  MATRICESTYPE: {
    if ($FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$FileIndex]) {
      GenerateSimilarityMatricesForFingerprintsBitVectors($FileIndex);
      last MATRICESTYPE;
    }
    if ($FingerprintsFilesInfo{FingerprintsVectorStringMode}[$FileIndex]) {
      GenerateSimilarityMatricesForFingerprintsVectors($FileIndex);
      last MATRICESTYPE;
    }
  }

  CleanupFingerprintsData($FileIndex);
}
| 106 | |
# Generate one similarity matrix file for each specified bit vector comparison
# measure...
#
# The output file name is made up of the output file root, the comparison
# measure name and the output file extension set up for the input file.
#
sub GenerateSimilarityMatricesForFingerprintsBitVectors {
  my($FileIndex) = @_;
  my($OutFileRoot, $OutFileExt);

  $OutFileRoot = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex];
  $OutFileExt = $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

  for my $SpecifiedMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
    # Map the name as specified to its supported name; method name and
    # parameters are looked up using lower case supported name...
    my $Measure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};
    my $MeasureKey = lc($Measure);

    my $NewTextFile = $OutFileRoot . "${Measure}." . $OutFileExt;
    my $MethodName = $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef}->{$MeasureKey};
    my @MethodParameters = @{$OptionsInfo{SpecifiedBitVectorComparisonsParameterRef}->{$MeasureKey}};

    GenerateSimilarityMatrix($FileIndex, $NewTextFile, $MethodName, \@MethodParameters);
  }
}
| 125 | |
# Generate one matrix file for each combination of specified vector comparison
# measure and vector comparison mode...
#
# The comparison mode is passed as the first method parameter followed by any
# parameters set up for the comparison measure.
#
sub GenerateSimilarityMatricesForFingerprintsVectors {
  my($FileIndex) = @_;
  my($OutFileRoot, $OutFileExt);

  $OutFileRoot = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex];
  $OutFileExt = $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

  for my $SpecifiedMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
    my $Measure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};

    for my $Mode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
      my $NewTextFile = $OutFileRoot . "${Measure}${Mode}." . $OutFileExt;
      my $MethodName = $OptionsInfo{SpecifiedVectorComparisonsMethodRef}->{lc($Measure)};

      my @MethodParameters = ($Mode, @{$OptionsInfo{SpecifiedVectorComparisonsParameterRef}->{lc($Measure)}});

      GenerateSimilarityMatrix($FileIndex, $NewTextFile, $MethodName, \@MethodParameters);
    }
  }
}
| 148 | |
# Calculate similarity matrix and write it out...
#
# Parameters:
#   $FileIndex           - Index of the input file in the fingerprints files list.
#   $NewTextFile         - Name of the output file to generate.
#   $MethodName          - Name of the comparison method invoked on fingerprints objects.
#   $MethodParametersRef - Reference to an array of additional method parameters.
#
# Dies when the output file can't be opened or closed. An unsupported input
# data mode only triggers a warning; the output file then contains just the
# column labels.
#
sub GenerateSimilarityMatrix {
  my($FileIndex, $NewTextFile, $MethodName, $MethodParametersRef) = @_;
  my($NewTextFileRef);

  print "\nGenerating $NewTextFile...\n";

  # Open new file and write out column labels. Three argument open with a
  # lexical file handle keeps characters in the file name from being
  # interpreted as part of the open mode and avoids a global bareword handle...
  open $NewTextFileRef, '>', $NewTextFile or die "Error: Can't open $NewTextFile: $! \n";
  WriteColumnLabels($FileIndex, $NewTextFileRef);

  # Calculate and write out similarity matrix values...
  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    GenerateSimilarityMatrixUsingMemoryData($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef);
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    GenerateSimilarityMatrixUsingFileData($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef);
  }
  else {
    warn "Warning: Input data mode, $OptionsInfo{InputDataMode}, is not supported.\n";
  }

  # Close new text file; errors for buffered writes are reported during close,
  # so check it instead of silently leaving an incomplete matrix file...
  close $NewTextFileRef or die "Error: Can't close $NewTextFile: $! \n";
}
| 175 | |
# Calculate and write out similarity values using fingerprints objects and
# compound IDs already loaded in memory...
#
# Each value is formatted using the specified precision; an undefined or empty
# comparison value is written out as an empty string. Depending on the output
# matrix format, values are written either as delimited rows or as one
# ID1, ID2, value line for each pair.
#
sub GenerateSimilarityMatrixUsingMemoryData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;
  my($Row, $Col, $LastIndex, $ValueFormat, $Delim, $Quote, $RowFingerprints, $ColFingerprints, $RowCmpdID, $Value, $Line, @LineWords);

  $LastIndex = $#{$FingerprintsFilesInfo{FingerprintsObjectsRef}};

  $ValueFormat = "%.$OptionsInfo{Precision}f";
  $Delim = $OptionsInfo{OutDelim};
  $Quote = $OptionsInfo{OutQuoteValue};

  for $Row (0 .. $LastIndex) {
    $RowFingerprints = $FingerprintsFilesInfo{FingerprintsObjectsRef}->[$Row];
    $RowCmpdID = $FingerprintsFilesInfo{CompundIDsRef}->[$Row];

    # Row label...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "${Quote}${RowCmpdID}${Quote}";
    }

    COLINDEX: for $Col (0 .. $LastIndex) {
      next COLINDEX if SkipMatrixData($Row, $Col);

      $ColFingerprints = $FingerprintsFilesInfo{FingerprintsObjectsRef}->[$Col];

      $Value = $RowFingerprints->$MethodName($ColFingerprints, @{$MethodParametersRef});
      if (defined($Value) && length($Value)) {
        # Adding zero turns the formatted string back into a number...
        $Value = sprintf($ValueFormat, $Value) + 0;
      }
      else {
        $Value = '';
      }

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "${Delim}${Quote}${Value}${Quote}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        @LineWords = ($RowCmpdID, $FingerprintsFilesInfo{CompundIDsRef}->[$Col], $Value);
        $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$Line\n";
      }
    }

    # Terminate the row...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "\n";
    }
  }
}
| 218 | |
# Calculate and write out similarity values by retrieving and processing data
# from fingerprints file...
#
# The fingerprints file is read once to supply matrix rows. For each valid row
# entry, the temporary copy of the fingerprints file is opened and read from
# the start to supply matrix columns. Entries with invalid fingerprints data
# are counted and skipped.
#
# Parameters:
#   $FileIndex           - Index of the input file in the fingerprints files list.
#   $NewTextFileRef      - File handle for the output file.
#   $MethodName          - Name of the comparison method invoked on fingerprints objects.
#   $MethodParametersRef - Reference to an array of additional method parameters.
#
sub GenerateSimilarityMatrixUsingFileData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;
  my($RowIndex, $ColIndex, $FingerprintsFileIO, $TmpFingerprintsFileIO, $FingerprintsObject1, $FingerprintsObject2, $CmpdID1, $CmpdID2, $FingerprintsCount, $IgnoredFingerprintsCount, $Value, $Line, @LineWords);

  print "\nReading and processing fingerprints data...\n";

  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});
  $FingerprintsFileIO->Open();

  # Row and column indices count only valid entries and start from 1...
  $RowIndex = 0;
  $FingerprintsCount = 0; $IgnoredFingerprintsCount = 0;

  FINGERPRINTSFILEIO: while ($FingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$FingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next FINGERPRINTSFILEIO;
    }
    $RowIndex++;
    $FingerprintsObject1 = $FingerprintsFileIO->GetFingerprints();
    $CmpdID1 = $FingerprintsFileIO->GetCompoundID();

    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "$OptionsInfo{OutQuoteValue}${CmpdID1}$OptionsInfo{OutQuoteValue}";
    }

    # Force detail level of 1 to avoid duplicate printing of diagnostic messages for invalid
    # fingerprints data...
    $TmpFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
    $TmpFingerprintsFileIO->Open();

    $ColIndex = 0;
    TMPFINGERPRINTSFILEIO: while ($TmpFingerprintsFileIO->Read()) {
      if (!$TmpFingerprintsFileIO->IsFingerprintsDataValid()) {
        next TMPFINGERPRINTSFILEIO;
      }
      $ColIndex++;

      if (SkipMatrixData($RowIndex, $ColIndex)) {
        next TMPFINGERPRINTSFILEIO;
      }

      $FingerprintsObject2 = $TmpFingerprintsFileIO->GetFingerprints();

      # Format value using specified precision; an undefined or empty value
      # is written out as an empty string...
      $Value = $FingerprintsObject1->$MethodName($FingerprintsObject2, @{$MethodParametersRef});
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : '';

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${Value}$OptionsInfo{OutQuoteValue}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        $CmpdID2 = $TmpFingerprintsFileIO->GetCompoundID();

        @LineWords = ($CmpdID1, $CmpdID2, $Value);
        $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$Line\n";
      }
    }
    $TmpFingerprintsFileIO->Close();

    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "\n";
    }
  }

  $FingerprintsFileIO->Close();

  print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
  print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}
| 295 | |
# Check whether the matrix value at the specified row and column position
# needs to be skipped for the output matrix type...
#
# Returns 1 to skip the value and 0 to write it out. Diagonal values are never
# skipped.
#
sub SkipMatrixData {
  my($RowIndex, $ColIndex) = @_;

  # Full matrix: everything is written out...
  return 0 if $OptionsInfo{WriteFullMatrix};

  # Upper triangular matrix: skip values below the diagonal...
  if ($OptionsInfo{WriteUpperTriangularMatrix}) {
    return 1 if $RowIndex > $ColIndex;
    return 0;
  }

  # Lower triangular matrix: skip values above the diagonal...
  if ($OptionsInfo{WriteLowerTriangularMatrix}) {
    return 1 if $RowIndex < $ColIndex;
    return 0;
  }

  return 0;
}
| 313 | |
# Write out column labels to the output file...
#
# For IDPairsAndValue format, a fixed three column header line is written. For
# RowsAndColumns format, the header line contains an empty first label followed
# by compound IDs, which are taken from memory or retrieved by scanning the
# fingerprints file depending on the input data mode. Any other format
# triggers a warning.
#
sub WriteColumnLabels {
  my($FileIndex, $NewTextFileRef) = @_;
  my($HeaderLine, $Delim, $Quote, $FingerprintsFileIO, $CmpdID, @Labels);

  if ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    @Labels = ('CmpdID1', 'CmpdID2', 'Coefficient Value');
    $HeaderLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$HeaderLine\n";
  }
  elsif ($OptionsInfo{OutMatrixFormat} =~ /^RowsAndColumns$/i) {
    if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
      @Labels = ('', @{$FingerprintsFilesInfo{CompundIDsRef}});
      $HeaderLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
      print $NewTextFileRef "$HeaderLine\n";
    }
    elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
      $Delim = $OptionsInfo{OutDelim};
      $Quote = $OptionsInfo{OutQuoteValue};

      # Scan file to retrieve compound IDs...
      #
      print "\nProcessing fingerprints file to generate compound IDs...\n";

      # Force detail level of 1 to avoid diagnostics messages for invalid fingerprints data during
      # retrieval of compound IDs as these get printed out during calculation of matrix...
      #
      $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
      $FingerprintsFileIO->Open();

      # Empty label for the first column containing row labels...
      print $NewTextFileRef "${Quote}${Quote}";

      FINGERPRINTSFILEIO: while ($FingerprintsFileIO->Read()) {
        next FINGERPRINTSFILEIO unless $FingerprintsFileIO->IsFingerprintsDataValid();

        $CmpdID = $FingerprintsFileIO->GetCompoundID();
        print $NewTextFileRef "${Delim}${Quote}${CmpdID}${Quote}";
      }
      $FingerprintsFileIO->Close();

      print $NewTextFileRef "\n";

      print "Processing fingerprints file to generate matrix...\n";
    }
  }
  else {
    warn "Warning: Output matrix format, $OptionsInfo{OutMatrixFormat}, is not supported.\n";
  }
}
| 367 | |
# Prepare fingerprints data for the input file at the specified index...
#
# In LoadInMemory mode, compound IDs and fingerprints objects are read from the
# file and references to them are stored in %FingerprintsFilesInfo. In ScanFile
# mode, the fingerprints file is copied to a temporary file which is read
# during the matrix calculation; the script dies when the copy fails.
#
sub ProcessFingerprintsData {
  my($FileIndex) = @_;

  # Clear out any data left over from a previous file...
  $FingerprintsFilesInfo{CompundIDsRef} = undef;
  $FingerprintsFilesInfo{FingerprintsObjectsRef} = undef;

  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    my $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});

    ($FingerprintsFilesInfo{CompundIDsRef}, $FingerprintsFilesInfo{FingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    my $FingerprintsFile = $FingerprintsFilesList[$FileIndex];
    my $TmpFingerprintsFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    # Copy fingerprints file to a tmp file for calculating similarity matrix...
    print "\nCopying fingerprints file, $FingerprintsFile, to temporary fingperints file, $TmpFingerprintsFile...\n";
    copy($FingerprintsFile, $TmpFingerprintsFile) or die "Error: Couldn't copy $FingerprintsFile to $TmpFingerprintsFile: $! \n";
  }
}
| 394 | |
# Clean up fingerprints data set up for the input file at the specified index...
#
# In LoadInMemory mode, references to the loaded compound IDs and fingerprints
# objects are dropped. In ScanFile mode, the temporary fingerprints file is
# deleted; the script dies when the file can't be deleted.
#
sub CleanupFingerprintsData {
  my($FileIndex) = @_;

  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    for my $DataKey ('CompundIDsRef', 'FingerprintsObjectsRef') {
      $FingerprintsFilesInfo{$DataKey} = undef;
    }
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    # Delete temporary fingerprints file...
    my $TmpFingerprintsFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    print "\nDeleting temporary fingerprints file $TmpFingerprintsFile...\n";
    unlink($TmpFingerprintsFile) or die "Error: Couldn't unlink $TmpFingerprintsFile: $! \n";
  }
}
| 414 | |
# Retrieve information about fingerprints files...
#
# Sets up %FingerprintsFilesInfo with one entry for each file in the input
# files list: file type, delimiter, output file root and extension, temporary
# file name, file IO parameters and fingerprints string modes. FileOkay stays
# 0, with the other values left at their defaults, for any file which is
# ignored with a warning.
#
sub RetrieveFingerprintsFilesInfo {
  my($Index, $InfoKey, $FingerprintsFile, $TmpFingerprintsFile, $FingerprintsFileIO, $FileType, $FileDir, $FileName, $FileExt, $InDelim, $OutFileRoot, $OutFileExt, $OutFile, %FingerprintsFileIOParameters);

  %FingerprintsFilesInfo = ();
  for $InfoKey (qw(FileOkay FileType InDelim OutFileRoot OutFileExt TmpFingerprintsFile FingerprintsFileIOParameters TmpFingerprintsFileIOParameters FingerprintsBitVectorStringMode FingerprintsVectorStringMode)) {
    @{$FingerprintsFilesInfo{$InfoKey}} = ();
  }

  FILELIST: for $Index (0 .. $#FingerprintsFilesList) {
    # Start out with default values so that ignored files have complete entries...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 0;
    $FingerprintsFilesInfo{FileType}[$Index] = '';
    $FingerprintsFilesInfo{InDelim}[$Index] = "";
    $FingerprintsFilesInfo{OutFileRoot}[$Index] = '';
    $FingerprintsFilesInfo{OutFileExt}[$Index] = '';
    $FingerprintsFilesInfo{TmpFingerprintsFile}[$Index] = "";
    $FingerprintsFilesInfo{FingerprintsFileIOParameters}[$Index] = {};
    $FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$Index] = {};
    $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$Index] = 0;
    $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$Index] = 0;

    $FingerprintsFile = $FingerprintsFilesList[$Index];
    if (!(-e $FingerprintsFile)) {
      warn "Warning: Ignoring file $FingerprintsFile: It doesn't exist\n";
      next FILELIST;
    }

    $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
    if (IsEmpty($FileType)) {
      warn "Warning: Ignoring file $FingerprintsFile: It's not a fingerprints file\n";
      next FILELIST;
    }

    ($FileDir, $FileName, $FileExt) = ("", "", "");
    ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);

    # Setup temporary fingerprints file name for scan file mode...
    $TmpFingerprintsFile = "${FileName}Tmp.${FileExt}";

    # NOTE(review): Tab is selected here for tsv files but the file IO parameters
    # for text files below use $OptionsInfo{InDelim} - confirm whether that is intended...
    $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};

    # Setup output file names...
    $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    # A specified output root is used only for a single input file; its
    # extension, if any, is dropped...
    $OutFileRoot = $FileName;
    if ($OptionsInfo{OutFileRoot} && (@FingerprintsFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    if (!$Options{overwrite}) {
      my($SpecifiedMeasure, $Measure, $Mode);

      # Similarity matrices output file names for bit-vector strings...
      for $SpecifiedMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
        $Measure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};
        $OutFile = "${OutFileRoot}${Measure}.${OutFileExt}";
        if (-e $OutFile) {
          warn "Warning: Ignoring file $FingerprintsFile: The file $OutFile already exists.\n";
          next FILELIST;
        }
      }

      # Similarity matrices output file names for vector strings...
      for $SpecifiedMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
        $Measure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};
        for $Mode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
          $OutFile = "${OutFileRoot}${Measure}${Mode}.${OutFileExt}";
          if (-e $OutFile) {
            warn "Warning: Ignoring file $FingerprintsFile: The file $OutFile already exists.\n";
            next FILELIST;
          }
        }
      }
    }

    # Setup FingerprintsFileIO parameters: start with the parameters shared by
    # all file types and add the ones specific to the file type...
    %FingerprintsFileIOParameters = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{Mode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' => $OptionsInfo{Detail});
    if ($FileType =~ /^SD$/i) {
      $FingerprintsFileIOParameters{'FingerprintsFieldLabel'} = $OptionsInfo{FingerprintsField};
      $FingerprintsFileIOParameters{'CompoundIDMode'} = $OptionsInfo{CompoundIDMode};
      $FingerprintsFileIOParameters{'CompoundIDFieldLabel'} = $OptionsInfo{CompoundIDField};
      $FingerprintsFileIOParameters{'CompoundIDPrefix'} = $OptionsInfo{CompoundIDPrefix};
    }
    elsif ($FileType =~ /^FP$/i) {
      # No additional parameters for FP files...
    }
    elsif ($FileType =~ /^Text$/i) {
      $FingerprintsFileIOParameters{'FingerprintsCol'} = $OptionsInfo{FingerprintsCol};
      $FingerprintsFileIOParameters{'ColMode'} = $OptionsInfo{ColMode};
      $FingerprintsFileIOParameters{'CompoundIDCol'} = $OptionsInfo{CompoundIDCol};
      $FingerprintsFileIOParameters{'CompoundIDPrefix'} = $OptionsInfo{CompoundIDPrefix};
      $FingerprintsFileIOParameters{'InDelim'} = $OptionsInfo{InDelim};
    }
    else {
      warn "Warning: File type for fingerprints file, $FingerprintsFile, is not valid. Supported file types: SD, FP or Text\n";
      next FILELIST;
    }

    # Retrieve fingerprints file string mode information...
    $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%FingerprintsFileIOParameters);

    if (!$FingerprintsFileIO || !$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
      warn "Warning: Ignoring fingerprints file $FingerprintsFile: It contains invalid fingerprints data\n";
      next FILELIST;
    }

    # File is okay: record its information...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 1;
    $FingerprintsFilesInfo{FileType}[$Index] = $FileType;
    $FingerprintsFilesInfo{InDelim}[$Index] = $InDelim;
    $FingerprintsFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $FingerprintsFilesInfo{OutFileExt}[$Index] = $OutFileExt;
    $FingerprintsFilesInfo{TmpFingerprintsFile}[$Index] = $TmpFingerprintsFile;

    # IO parameters for the temporary file differ only in the file name...
    $FingerprintsFilesInfo{FingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters };
    $FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters, 'Name' => $TmpFingerprintsFile };

    $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$Index] = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
    $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$Index] = $FingerprintsFileIO->GetFingerprintsVectorStringMode();
  }
}
| 564 | |
# Process option values...
#
# Checks values specified in %Options and sets up %OptionsInfo, including the
# flags derived from output matrix format and type, for use by the rest of the
# script. Dies with an error message for any invalid or conflicting value.
#
sub ProcessOptions {
  my($ColNumMode, $SameColumns, $QuoteValues);

  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{InputDataMode} = $Options{inputdatamode};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{CompoundIDPrefix} = $Options{compoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files; column values
  # must be positive integers in ColNum mode...
  $OptionsInfo{ColMode} = $Options{colmode};
  $ColNumMode = ($Options{colmode} =~ /^ColNum$/i) ? 1 : 0;

  $OptionsInfo{CompoundIDCol} = 'AutoDetect';
  if (IsNotEmpty($Options{compoundidcol})) {
    if ($ColNumMode && !IsPositiveInteger($Options{compoundidcol})) {
      die "Error: Column value, $Options{compoundidcol}, specified using \"--CompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{CompoundIDCol} = $Options{compoundidcol};
  }

  $OptionsInfo{FingerprintsCol} = 'AutoDetect';
  if (IsNotEmpty($Options{fingerprintscol})) {
    if ($ColNumMode && !IsPositiveInteger($Options{fingerprintscol})) {
      die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
  }

  # Compound ID and fingerprints columns must be different: compare as numbers
  # when both are positive integers and as strings otherwise...
  if (IsNotEmpty($Options{compoundidcol}) && IsNotEmpty($Options{fingerprintscol})) {
    if (IsPositiveInteger($Options{compoundidcol}) && IsPositiveInteger($Options{fingerprintscol})) {
      $SameColumns = ($Options{compoundidcol} == $Options{fingerprintscol});
    }
    else {
      $SameColumns = ($Options{compoundidcol} eq $Options{fingerprintscol});
    }
    if ($SameColumns) {
      die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...
  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDField} = '';

  if ($Options{compoundidmode} =~ /^DataField$/i) {
    if (!$Options{compoundidfield}) {
      die "Error: You must specify a value for \"--CompoundIDField\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
    $OptionsInfo{CompoundIDField} = $Options{compoundidfield};
  }

  $OptionsInfo{FingerprintsField} = IsNotEmpty($Options{fingerprintsfield}) ? $Options{fingerprintsfield} : 'AutoDetect';

  if ($Options{compoundidfield} && IsNotEmpty($Options{fingerprintsfield}) && ($Options{compoundidfield} eq $Options{fingerprintsfield})) {
    die "Error: Values specified using \"--CompoundIDField\" and \"--Fingerprintsfield\", $Options{compoundidfield}, must be different.\n";
  }

  $OptionsInfo{Detail} = $Options{detail};

  # Input and output delimiters along with quoting of output values...
  $OptionsInfo{InDelim} = $Options{indelim};
  if ($Options{outdelim} =~ /tab/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }

  $QuoteValues = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{OutQuote} = $QuoteValues;
  $OptionsInfo{OutQuoteValue} = $QuoteValues ? '"' : '';

  # Output matrix format: exactly one of the write flags is set...
  $OptionsInfo{OutMatrixFormat} = $Options{outmatrixformat};

  $OptionsInfo{WriteRowsAndColumns} = 0;
  $OptionsInfo{WriteIDPairsAndValue} = 0;
  if ($OptionsInfo{OutMatrixFormat} =~ /^RowsAndColumns$/i) {
    $OptionsInfo{WriteRowsAndColumns} = 1;
  }
  elsif ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    $OptionsInfo{WriteIDPairsAndValue} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n";
  }

  # Output matrix type: exactly one of the matrix type flags is set...
  $OptionsInfo{OutMatrixType} = $Options{outmatrixtype};

  $OptionsInfo{WriteFullMatrix} = 0;
  $OptionsInfo{WriteUpperTriangularMatrix} = 0;
  $OptionsInfo{WriteLowerTriangularMatrix} = 0;
  if ($OptionsInfo{OutMatrixType} =~ /^FullMatrix$/i) {
    $OptionsInfo{WriteFullMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^UpperTriangularMatrix$/i) {
    $OptionsInfo{WriteUpperTriangularMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^LowerTriangularMatrix$/i) {
    $OptionsInfo{WriteLowerTriangularMatrix} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n";
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Fast mode turns off validation of fingerprints data...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
| 689 | |
# Process options related to comparison of bit vector strings...
#
# Reads the file-level %Options hash and sets up the following %OptionsInfo entries:
#
#   BitVectorComparisonMode                   - option value as specified
#   SpecifiedBitVectorComparisonsRef          - reference to list of specified measure names
#   SpecifiedBitVectorComparisonsNameRef      - lower case measure name => supported measure name
#   SpecifiedBitVectorComparisonsMethodRef    - lower case measure name => "...Coefficient" function/method name
#   SpecifiedBitVectorComparisonsParameterRef - lower case measure name => list of alpha/beta parameters
#   Alpha, Beta                               - validated values; empty string when not needed
#
# Dies for unsupported measure names and for empty or out of range alpha/beta values
# whenever a Tversky or weighted measure has been specified...
#
sub ProcessBitVectorComparisonOptions {
  # Setup supported bit vector similarity coefficients for bit vector strings...
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();

  for $SupportedComparisonMeasure (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    # Similarity coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    $ComparisonMeasure = $SupportedComparisonMeasure;
    $ComparisonMeasure =~ s/Coefficient$//;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup a list of similarity coefficients to use for calculating similarity matrices for bit vector strings...
  my($SpecifiedMeasure, @SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap);

  @SpecifiedComparisonMeasures = ();
  %SpecifiedComparisonMeasuresNameMap = ();
  %SpecifiedComparisonMeasuresMethodMap = ();
  %SpecifiedComparisonMeasuresParameterMap = ();

  if ($Options{bitvectorcomparisonmode} =~ /^All$/i) {
    push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures;
  }
  else {
    # Comma delimited list of similarity coefficients...
    my($BitVectorComparisonMode, @SpecifiedMeasures, @UnsupportedSpecifiedMeasures);

    $BitVectorComparisonMode = $Options{bitvectorcomparisonmode};
    $BitVectorComparisonMode =~ s/ //g;
    @SpecifiedMeasures = split ",", $BitVectorComparisonMode;
    @UnsupportedSpecifiedMeasures = ();

    # Names are matched case insensitively; the spelling used on the command line is retained...
    for $SpecifiedMeasure (@SpecifiedMeasures) {
      if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
        push @SpecifiedComparisonMeasures, $SpecifiedMeasure;
      }
      else {
        push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure;
      }
    }
    if (@UnsupportedSpecifiedMeasures) {
      if (@UnsupportedSpecifiedMeasures > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-b --BitVectorComparisonMode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-b --BitVectorComparisonMode\" is not valid.\n";
      }
      die "Allowed values: ", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
    }
  }
  for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
    $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};
  }

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};
  $OptionsInfo{SpecifiedBitVectorComparisonsRef} = \@SpecifiedComparisonMeasures;
  $OptionsInfo{SpecifiedBitVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap;
  $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap;

  # Make sure valid alpha parameter is specified for Tversky calculation...
  my($SpecifiedMeasure1, $SpecifiedMeasure2);
  $OptionsInfo{Alpha} = '';
  $SpecifiedMeasure1 = 'TverskySimilarity';
  $SpecifiedMeasure2 = 'WeightedTverskySimilarity';
  if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) {
    if (IsEmpty($Options{alpha})) {
      # The measures are selected using "-b, --BitVectorComparisonMode" and not "-m, --mode"...
      die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-b, --BitVectorComparisonMode\". \n";
    }
    my($Alpha);
    $Alpha = $Options{alpha};
    if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $Alpha;
  }

  # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
  # calculations...
  $OptionsInfo{Beta} = '';
  $SpecifiedMeasure1 = 'WeightedTverskySimilarity';
  $SpecifiedMeasure2 = 'WeightedTanimotoSimilarity';
  if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) {
    # Beta has no short form: "-b" is the short form of "--BitVectorComparisonMode"...
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"--beta\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-b, --BitVectorComparisonMode\". \n";
    }
    my($Beta);
    $Beta = $Options{beta};
    if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
      die "Error: The value specified, $Options{beta}, for option \"--beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $Beta;
  }

  # Setup any parameters required for specified comparison method: alpha for Tversky,
  # alpha and beta for WeightedTversky, beta for WeightedTanimoto, and none for the rest...
  for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = ();
    if ($SpecifiedMeasure =~ /^TverskySimilarity$/i) {
      push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha};
    }
    elsif ($SpecifiedMeasure =~ /^WeightedTverskySimilarity$/i) {
      push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha};
      push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta};
    }
    elsif ($SpecifiedMeasure =~ /^WeightedTanimotoSimilarity$/i) {
      push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta};
    }
  }
  $OptionsInfo{SpecifiedBitVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap;
}
| 809 | |
# Process options related to comparison of vector strings...
#
# Reads the file-level %Options hash and sets up the following %OptionsInfo entries:
#
#   VectorComparisonMode                   - option value as specified
#   SpecifiedVectorComparisonsRef          - reference to list of specified measure names
#   SpecifiedVectorComparisonsNameRef      - lower case measure name => supported measure name
#   SpecifiedVectorComparisonsMethodRef    - lower case measure name => original function/method name
#   SpecifiedVectorComparisonsParameterRef - lower case measure name => list containing the fast flag
#   VectorComparisonFormulism              - option value as specified
#   SpecifiedVectorComparisonModesRef      - reference to list of specified formulism names
#
# Dies for unsupported measure or formulism names...
#
sub ProcessVectorComparisonOptions {
  # Setup specified similarity coefficients for vector strings..
  my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();
  for $SupportedComparisonMeasure (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    # Similarity and distance coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name. Names without the
    # suffix are left untouched by the substitution...
    $ComparisonMeasure = $SupportedComparisonMeasure;
    $ComparisonMeasure =~ s/Coefficient$//i;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup a list of similarity and distance coefficients to use for calculating similarity matrices for vector strings...
  my($SpecifiedMeasure, @SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap);

  @SpecifiedComparisonMeasures = ();
  %SpecifiedComparisonMeasuresNameMap = ();
  %SpecifiedComparisonMeasuresMethodMap = ();
  %SpecifiedComparisonMeasuresParameterMap = ();

  if ($Options{vectorcomparisonmode} =~ /^All$/i) {
    push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures;
  }
  else {
    # Comma delimited list of similarity coefficients...
    my($VectorComparisonMode, @SpecifiedMeasures, @UnsupportedSpecifiedMeasures);

    $VectorComparisonMode = $Options{vectorcomparisonmode};
    $VectorComparisonMode =~ s/ //g;
    @SpecifiedMeasures = split ",", $VectorComparisonMode;
    @UnsupportedSpecifiedMeasures = ();

    # Names are matched case insensitively; the spelling used on the command line is retained...
    for $SpecifiedMeasure (@SpecifiedMeasures) {
      if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
        push @SpecifiedComparisonMeasures, $SpecifiedMeasure;
      }
      else {
        push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure;
      }
    }
    if (@UnsupportedSpecifiedMeasures) {
      if (@UnsupportedSpecifiedMeasures > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-v --VectorComparisonMode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-v --VectorComparisonMode\" is not valid.\n";
      }
      die "Allowed values: ", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
    }
  }
  for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
    $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};
  $OptionsInfo{SpecifiedVectorComparisonsRef} = \@SpecifiedComparisonMeasures;
  $OptionsInfo{SpecifiedVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap;
  $OptionsInfo{SpecifiedVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap;

  # Setup specified vector comparison calculation modes...
  my(@SpecifiedVectorComparisonModes);
  @SpecifiedVectorComparisonModes = ();
  if ($Options{vectorcomparisonformulism} =~ /^All$/i) {
    push @SpecifiedVectorComparisonModes, ("AlgebraicForm", "BinaryForm", "SetTheoreticForm");
  }
  else {
    my($VectorComparisonFormulism, $SpecifiedFormulism, @SpecifiedFormulismWords);

    # Take out spaces, as done for the comparison measures list above, so that a value
    # such as "AlgebraicForm, BinaryForm" passes the anchored check below...
    $VectorComparisonFormulism = $Options{vectorcomparisonformulism};
    $VectorComparisonFormulism =~ s/ //g;

    @SpecifiedFormulismWords = split /\,/, $VectorComparisonFormulism;
    for $SpecifiedFormulism (@SpecifiedFormulismWords) {
      if ($SpecifiedFormulism !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
        die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
      }
      push @SpecifiedVectorComparisonModes, $SpecifiedFormulism;
    }
  }
  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonModesRef} = \@SpecifiedVectorComparisonModes;

  # Setup any parameters required for specified comparison method: each measure gets
  # a single flag which is 1 when "-f, --fast" has been specified and 0 otherwise...
  for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = ();
    push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, ($Options{fast} ? 1 : 0);
  }
  $OptionsInfo{SpecifiedVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap;
}
| 905 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds the file-level %Options hash with default values, lets GetOptions replace them
# with values specified on the command line, changes into the working directory when one
# has been specified, and validates the options with a fixed set of allowed values.
#
# Dies with an error message on the first invalid value...
#
sub SetupScriptUsage {

  # Default values; any option specified on the command line takes precedence...
  %Options = (
    alpha => 0.5,
    beta => 1,
    bitvectorcomparisonmode => "TanimotoSimilarity",
    colmode => 'colnum',
    compoundidprefix => 'Cmpd',
    compoundidmode => 'LabelPrefix',
    detail => 1,
    indelim => 'comma',
    outdelim => 'comma',
    inputdatamode => 'LoadInMemory',
    mode => 'AutoDetect',
    outmatrixformat => 'RowsAndColumns',
    outmatrixtype => 'FullMatrix',
    quote => 'yes',
    precision => 2,
    vectorcomparisonmode => "TanimotoSimilarity",
    vectorcomparisonformulism => "AlgebraicForm",
  );

  # Retrieve all the options...
  my(@OptionSpecifications);
  @OptionSpecifications = (
    "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s", "colmode|c=s",
    "compoundidcol=s", "compoundidprefix=s", "compoundidfield=s", "compoundidmode=s",
    "detail|d=i", "fast|f", "fingerprintscol=s", "fingerprintsfield=s", "help|h",
    "indelim=s", "inputdatamode=s", "mode|m=s", "outdelim=s", "overwrite|o",
    "outmatrixformat=s", "outmatrixtype=s", "precision|p=s", "quote|q=s", "root|r=s",
    "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s"
  );
  GetOptions(\%Options, @OptionSpecifications)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before anything else is checked...
  if ($Options{workingdir}) {
    (-d $Options{workingdir})
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate values of the options, one option at a time...
  die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n"
    unless $Options{colmode} =~ /^(ColNum|ColLabel)$/i;

  die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n"
    unless $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{detail});

  die "Error: The value specified, $Options{inputdatamode}, for option \"--InputDataMode\" is not valid. Allowed values: LoadInMemory or ScanFile\n"
    unless $Options{inputdatamode} =~ /^(LoadInMemory|ScanFile)$/i;

  die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n"
    unless $Options{mode} =~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i;

  die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n"
    unless $Options{indelim} =~ /^(comma|semicolon)$/i;

  die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n"
    unless $Options{outdelim} =~ /^(comma|semicolon|tab)$/i;

  die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n"
    unless $Options{outmatrixformat} =~ /^(RowsAndColumns|IDPairsAndValue)$/i;

  die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n"
    unless $Options{outmatrixtype} =~ /^(FullMatrix|UpperTriangularMatrix|LowerTriangularMatrix)$/i;

  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n"
    unless $Options{quote} =~ /^(Yes|No)$/i;

  die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n"
    unless IsPositiveInteger($Options{precision});
}
| 984 | |
| 985 __END__ | |
| 986 | |
| 987 =head1 NAME | |
| 988 | |
| 989 SimilarityMatricesFingerprints.pl - Calculate similarity matrices using fingerprints strings data in SD, FP and CSV/TSV text file(s) | |
| 990 | |
| 991 =head1 SYNOPSIS | |
| 992 | |
| 993 SimilarityMatricesFingerprints.pl SDFile(s) FPFile(s) TextFile(s)... | |
| 994 | |
| 995 SimilarityMatricesFingerprints.pl [B<--alpha> I<number>] [B<--beta> I<number>] | |
| 996 [B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[ TverskySimilarity, ... ]">] | |
| 997 [B<-c, --ColMode> I<ColNum | ColLabel>] [B<--CompoundIDCol> I<col number | col name>] | |
| 998 [B<--CompoundIDPrefix> I<text>] [B<--CompoundIDField> I<DataFieldName>] | |
| 999 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] | |
| 1000 [B<-d, --detail> I<InfoLevel>] [B<-f, --fast>] [B<--FingerprintsCol> I<col number | col name>] | |
| 1001 [B<--FingerprintsField> I<FieldLabel>] [B<-h, --help>] [B<--InDelim> I<comma | semicolon>] | |
| 1002 [B<--InputDataMode> I<LoadInMemory | ScanFile>] | |
| 1003 [B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>] | |
| 1004 [B<--OutDelim> I<comma | tab | semicolon>] [B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue>] | |
| 1005 [B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix>] | |
| 1006 [B<-o, --overwrite>] [B<-p, --precision> I<number>] | |
| 1007 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] | |
| 1008 [B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity, [ ManhattanDistance, ...]">] | |
| 1009 [B<--VectorComparisonFormulism> I<All | "AlgebraicForm, [BinaryForm, SetTheoreticForm]">] | |
| 1010 [B<-w, --WorkingDir> dirname] SDFile(s) FPFile(s) TextFile(s)... | |
| 1011 | |
| 1012 =head1 DESCRIPTION | |
| 1013 | |
| 1014 Calculate similarity matrices using fingerprint bit-vector or vector strings data in I<SD, FP | |
| 1015 and CSV/TSV> text file(s) and generate CSV/TSV text file(s) containing values for specified | |
| 1016 similarity and distance coefficients. | |
| 1017 | |
| 1018 The scripts SimilarityMatrixSDFiles.pl and SimilarityMatrixTextFiles.pl have been removed from the | |
| 1019 current release of MayaChemTools and their functionality merged with this script. | |
| 1020 | |
| 1021 The valid I<SDFile> extensions are I<.sdf> and I<.sd>. All SD files in a current directory | |
| 1022 can be specified either by I<*.sdf> or the current directory name. | |
| 1023 | |
| 1024 The valid I<FPFile> extensions are I<.fpf> and I<.fp>. All FP files in a current directory | |
| 1025 can be specified either by I<*.fpf> or the current directory name. | |
| 1026 | |
| 1027 The valid I<TextFile> extensions are I<.csv> and I<.tsv> for comma/semicolon and tab | |
| 1028 delimited text files respectively. All other file names are ignored. All text files in a | |
| 1029 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory | |
| 1030 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file | |
| 1031 which doesn't correspond to the format indicated by B<--indelim> option is ignored. | |
| 1032 | |
| 1033 Example of I<FP> file containing fingerprints bit-vector string data: | |
| 1034 | |
| 1035 # | |
| 1036 # Package = MayaChemTools 7.4 | |
| 1037 # ReleaseDate = Oct 21, 2010 | |
| 1038 # | |
| 1039 # TimeStamp = Mon Mar 7 15:14:01 2011 | |
| 1040 # | |
| 1041 # FingerprintsStringType = FingerprintsBitVector | |
| 1042 # | |
| 1043 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:... | |
| 1044 # Size = 1024 | |
| 1045 # BitStringFormat = HexadecimalString | |
| 1046 # BitsOrder = Ascending | |
| 1047 # | |
| 1048 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510... | |
| 1049 Cmpd2 000000249400840040100042011001001980410c000000001010088001120... | |
| 1050 ... ... | |
| 1051 ... .. | |
| 1052 | |
| 1053 Example of I<FP> file containing fingerprints vector string data: | |
| 1054 | |
| 1055 # | |
| 1056 # Package = MayaChemTools 7.4 | |
| 1057 # ReleaseDate = Oct 21, 2010 | |
| 1058 # | |
| 1059 # TimeStamp = Mon Mar 7 15:14:01 2011 | |
| 1060 # | |
| 1061 # FingerprintsStringType = FingerprintsVector | |
| 1062 # | |
| 1063 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:... | |
| 1064 # VectorStringFormat = IDsAndValuesString | |
| 1065 # VectorValuesType = NumericalValues | |
| 1066 # | |
| 1067 Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C: | |
| 1068 N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...; | |
| 1069 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2 | |
| 1070 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ... | |
| 1071 Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C | |
| 1072 O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...; | |
| 1073 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2 | |
| 1074 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ... | |
| 1075 ... ... | |
| 1076 ... ... | |
| 1077 | |
| 1078 Example of I<SD> file containing fingerprints bit-vector string data: | |
| 1079 | |
| 1080 ... ... | |
| 1081 ... ... | |
| 1082 $$$$ | |
| 1083 ... ... | |
| 1084 ... ... | |
| 1085 ... ... | |
| 1086 41 44 0 0 0 0 0 0 0 0999 V2000 | |
| 1087 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 | |
| 1088 ... ... | |
| 1089 2 3 1 0 0 0 0 | |
| 1090 ... ... | |
| 1091 M END | |
| 1092 > <CmpdID> | |
| 1093 Cmpd1 | |
| 1094 | |
| 1095 > <PathLengthFingerprints> | |
| 1096 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt | |
| 1097 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66 | |
| 1098 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028 | |
| 1099 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462 | |
| 1100 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a | |
| 1101 aa0660a11014a011d46 | |
| 1102 | |
| 1103 $$$$ | |
| 1104 ... ... | |
| 1105 ... ... | |
| 1106 | |
| 1107 Example of CSV I<Text> file containing fingerprints bit-vector string data: | |
| 1108 | |
| 1109 "CompoundID","PathLengthFingerprints" | |
| 1110 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes | |
| 1111 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4 | |
| 1112 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030 | |
| 1113 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..." | |
| 1114 ... ... | |
| 1115 ... ... | |
| 1116 | |
| 1117 The current release of MayaChemTools supports the following types of fingerprint | |
| 1118 bit-vector and vector strings: | |
| 1119 | |
| 1120 FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi | |
| 1121 us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT | |
| 1122 C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X | |
| 1123 1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A | |
| 1124 TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2 | |
| 1125 -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B... | |
| 1126 | |
| 1127 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS | |
| 1128 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2 | |
| 1129 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1 | |
| 1130 O.X1.BO2;2 4 14 3 10 1 1 1 3 2 | |
| 1131 | |
| 1132 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume | |
| 1133 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F | |
| 1134 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1 | |
| 1135 | |
| 1136 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN | |
| 1137 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C | |
| 1138 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N | |
| 1139 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8 | |
| 1140 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1 | |
| 1141 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0... | |
| 1142 | |
| 1143 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs | |
| 1144 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN | |
| 1145 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3 | |
| 1146 .024 -2.270 | |
| 1147 | |
| 1148 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues; | |
| 1149 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435 | |
| 1150 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1 | |
| 1151 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
| 1152 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
| 1153 | |
| 1154 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi | |
| 1155 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 | |
| 1156 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414 | |
| 1157 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103 | |
| 1158 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338 | |
| 1159 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303... | |
| 1160 | |
| 1161 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes | |
| 1162 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524 | |
| 1163 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 | |
| 1164 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...; | |
| 1165 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2 | |
| 1166 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 | |
| 1167 | |
| 1168 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp | |
| 1169 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100 | |
| 1170 0000000001010000000110000011000000000000100000000000000000000000100001 | |
| 1171 1000000110000000000000000000000000010011000000000000000000000000010000 | |
| 1172 0000000000000000000000000010000000000000000001000000000000000000000000 | |
| 1173 0000000000010000100001000000000000101000000000000000100000000000000... | |
| 1174 | |
| 1175 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu | |
| 1176 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8 | |
| 1177 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567 | |
| 1178 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012 | |
| 1179 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455 | |
| 1180 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404... | |
| 1181 | |
| 1182 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp | |
| 1183 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184 | |
| 1184 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450 | |
| 1185 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430 | |
| 1186 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134 | |
| 1187 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566... | |
| 1188 | |
| 1189 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000 | |
| 1190 0000000000000000000000000000000001001000010010000000010010000000011100 | |
| 1191 0100101010111100011011000100110110000011011110100110111111111111011111 | |
| 1192 11111111111110111000 | |
| 1193 | |
| 1194 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011 | |
| 1195 1110011111100101111111000111101100110000000000000011100010000000000000 | |
| 1196 0000000000000000000000000000000000000000000000101000000000000000000000 | |
| 1197 0000000000000000000000000000000000000000000000000000000000000000000000 | |
| 1198 0000000000000000000000000000000000000011000000000000000000000000000000 | |
| 1199 0000000000000000000000000000000000000000 | |
| 1200 | |
| 1201 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri | |
| 1202 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
| 1203 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 | |
| 1204 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0 | |
| 1205 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1 | |
| 1206 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1 | |
| 1207 | |
| 1208 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri | |
| 1209 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0 | |
| 1210 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0 | |
| 1211 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
| 1212 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0 | |
| 1213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | |
| 1214 | |
| 1215 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng | |
| 1216 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110 | |
| 1217 0100010101011000101001011100110001000010001001101000001001001001001000 | |
| 1218 0010110100000111001001000001001010100100100000000011000000101001011100 | |
| 1219 0010000001000101010100000100111100110111011011011000000010110111001101 | |
| 1220 0101100011000000010001000011000010100011101100001000001000100000000... | |
| 1221 | |
| 1222 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength | |
| 1223 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2 | |
| 1224 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X | |
| 1225 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1 | |
| 1226 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO | |
| 1227 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C.... | |
| 1228 | |
| 1229 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt | |
| 1230 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1 | |
| 1231 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N | |
| 1232 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1 | |
| 1233 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR | |
| 1234 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ... | |
| 1235 | |
| 1236 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD | |
| 1237 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1 | |
| 1238 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3. | |
| 1239 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...; | |
| 1240 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1 | |
| 1241 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1... | |
| 1242 | |
| 1243 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi | |
| 1244 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar | |
| 1245 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H | |
| 1246 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...; | |
| 1247 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4 | |
| 1248 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ... | |
| 1249 | |
| 1250 FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3 | |
| 1251 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4- | |
| 1252 C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO | |
| 1253 2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...; | |
| 1254 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1 | |
| 1255 | |
| 1256 FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica | |
| 1257 lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC | |
| 1258 H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC- | |
| 1259 ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...; | |
| 1260 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 | |
| 1261 | |
| 1262 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M | |
| 1263 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1 | |
| 1264 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1 | |
| 1265 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1 | |
| 1266 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....; | |
| 1267 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2 | |
| 1268 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8... | |
| 1269 | |
| 1270 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1 | |
| 1271 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C | |
| 1272 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3- | |
| 1273 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2 | |
| 1274 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C. | |
| 1275 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7... | |
| 1276 | |
| 1277 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min | |
| 1278 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H | |
| 1279 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2- | |
| 1280 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H | |
| 1281 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...; | |
| 1282 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10 | |
| 1283 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1 | |
| 1284 | |
| 1285 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist | |
| 1286 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0 | |
| 1287 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1 | |
| 1288 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0 | |
| 1289 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0 | |
| 1290 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18... | |
| 1291 | |
| 1292 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize: | |
| 1293 MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1- | |
| 1294 Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1 | |
| 1295 -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1- | |
| 1296 HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...; | |
| 1297 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23 | |
| 1298 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1 | |
| 1299 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ... | |
| 1300 | |
| 1301 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD | |
| 1302 istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106 | |
| 1303 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0 | |
| 1304 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26 | |
| 1305 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0 | |
| 1306 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ... | |
| 1307 | |
| 1308 =head1 OPTIONS | |
| 1309 | |
| 1310 =over 4 | |
| 1311 | |
| 1312 =item B<--alpha> I<number> | |
| 1313 | |
| 1314 Value of alpha parameter for calculating I<Tversky> similarity coefficient specified for | |
| 1315 B<-b, --BitVectorComparisonMode> option. It corresponds to weights assigned for bits set | |
| 1316 to "1" in a pair of fingerprint bit-vectors during the calculation of similarity coefficient. Possible | |
| 1317 values: I<0 to 1>. Default value: I<0.5>. | |
| 1318 | |
| 1319 =item B<--beta> I<number> | |
| 1320 | |
| 1321 Value of beta parameter for calculating I<WeightedTanimoto> and I<WeightedTversky> | |
| 1322 similarity coefficients specified for B<-b, --BitVectorComparisonMode> option. It is used to | |
| 1323 weight the contributions of bits set to "0" during the calculation of similarity coefficients. Possible | |
| 1324 values: I<0 to 1>. Default value of I<1> makes I<WeightedTanimoto> and I<WeightedTversky> | |
| 1325 equivalent to I<Tanimoto> and I<Tversky>. | |
| 1326 | |
| 1327 =item B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[TverskySimilarity,...]"> | |
| 1328 | |
| 1329 Specify what similarity coefficients to use for calculating similarity matrices for fingerprints bit-vector | |
| 1330 strings data values in I<TextFile(s)>: calculate similarity matrices for all supported similarity | |
| 1331 coefficients or specify a comma delimited list of similarity coefficients. Possible values: | |
| 1332 I<All | "TanimotoSimilarity,[TverskySimilarity,...]">. Default: I<TanimotoSimilarity>. | |
| 1333 | |
| 1334 I<All> uses complete list of supported similarity coefficients: I<BaroniUrbaniSimilarity, BuserSimilarity, | |
| 1335 CosineSimilarity, DiceSimilarity, DennisSimilarity, ForbesSimilarity, FossumSimilarity, HamannSimilarity, JacardSimilarity, | |
| 1336 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity, McConnaugheySimilarity, OchiaiSimilarity, | |
| 1337 PearsonSimilarity, RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity, SkoalSneath1Similarity, | |
| 1338 SkoalSneath2Similarity, SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity, YuleSimilarity, | |
| 1339 WeightedTanimotoSimilarity, WeightedTverskySimilarity>. These similarity coefficients are described below. | |
| 1340 | |
| 1341 For two fingerprint bit-vectors A and B of same size, let: | |
| 1342 | |
| 1343 Na = Number of bits set to "1" in A | |
| 1344 Nb = Number of bits set to "1" in B | |
| 1345 Nc = Number of bits set to "1" in both A and B | |
| 1346 Nd = Number of bits set to "0" in both A and B | |
| 1347 | |
| 1348 Nt = Number of bits set to "1" or "0" in A or B (Size of A or B) | |
| 1349 Nt = Na + Nb - Nc + Nd | |
| 1350 | |
| 1351 Na - Nc = Number of bits set to "1" in A but not in B | |
| 1352 Nb - Nc = Number of bits set to "1" in B but not in A | |
| 1353 | |
| 1354 Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of bit-vectors A and B are | |
| 1355 defined as follows: | |
| 1356 | |
| 1357 I<BaroniUrbaniSimilarity>: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser ) | |
| 1358 | |
| 1359 I<BuserSimilarity>: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani ) | |
| 1360 | |
| 1361 I<CosineSimilarity>: Nc / SQRT ( Na * Nb ) (same as Ochiai) | |
| 1362 | |
| 1363 I<DiceSimilarity>: (2 * Nc) / ( Na + Nb ) | |
| 1364 | |
| 1365 I<DennisSimilarity>: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Nt * Na * Nb) | |
| 1366 | |
| 1367 I<ForbesSimilarity>: ( Nt * Nc ) / ( Na * Nb ) | |
| 1368 | |
| 1369 I<FossumSimilarity>: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb ) | |
| 1370 | |
| 1371 I<HamannSimilarity>: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt | |
| 1372 | |
| 1373 I<JaccardSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Tanimoto) | |
| 1374 | |
| 1375 I<Kulczynski1Similarity>: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / ( Na + Nb - 2Nc ) | |
| 1376 | |
| 1377 I<Kulczynski2Similarity>: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + ( Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb ) | |
| 1378 | |
| 1379 I<MatchingSimilarity>: ( Nc + Nd ) / Nt | |
| 1380 | |
| 1381 I<McConnaugheySimilarity>: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / ( Na * Nb ) | |
| 1382 | |
| 1383 I<OchiaiSimilarity>: Nc / SQRT ( Na * Nb ) (same as Cosine) | |
| 1384 | |
| 1385 I<PearsonSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) ) | |
| 1386 | |
| 1387 I<RogersTanimotoSimilarity>: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt) | |
| 1388 | |
| 1389 I<RussellRaoSimilarity>: Nc / Nt | |
| 1390 | |
| 1391 I<SimpsonSimilarity>: Nc / MIN ( Na, Nb) | |
| 1392 | |
| 1393 I<SkoalSneath1Similarity>: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb - Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc ) | |
| 1394 | |
| 1395 I<SkoalSneath2Similarity>: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt ) | |
| 1396 | |
| 1397 I<SkoalSneath3Similarity>: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc ) ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc ) | |
| 1398 | |
| 1399 I<TanimotoSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Jaccard) | |
| 1400 | |
| 1401 I<TverskySimilarity>: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * ( Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb) | |
| 1402 | |
| 1403 I<YuleSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) ) | |
| 1404 | |
| 1405 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which | |
| 1406 are set to "1" in both A and B. In order to take into account all bit positions, modified versions | |
| 1407 of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed. | |
| 1408 | |
| 1409 Let: | |
| 1410 | |
| 1411 Na' = Number of bits set to "0" in A | |
| 1412 Nb' = Number of bits set to "0" in B | |
| 1413 Nc' = Number of bits set to "0" in both A and B | |
| 1414 | |
| 1415 Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / ( Na' + Nb' - Nc' ) | |
| 1416 | |
| 1417 Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb') | |
| 1418 | |
| 1419 Then: | |
| 1420 | |
| 1421 I<WeightedTanimotoSimilarity> = beta * Tanimoto + (1 - beta) * Tanimoto' | |
| 1422 | |
| 1423 I<WeightedTverskySimilarity> = beta * Tversky + (1 - beta) * Tversky' | |
| 1424 | |
| 1425 =item B<-c, --ColMode> I<ColNum | ColLabel> | |
| 1426 | |
| 1427 Specify how columns are identified in I<TextFile(s)>: using column number or column | |
| 1428 label. Possible values: I<ColNum or ColLabel>. Default value: I<ColNum>. | |
| 1429 | |
| 1430 =item B<--CompoundIDCol> I<col number | col name> | |
| 1431 | |
| 1432 This value is B<-c, --ColMode> mode specific. It specifies input I<TextFile(s)> column to use for | |
| 1433 generating compound ID for similarity matrices in output I<TextFile(s)>. Possible values: I<col number | |
| 1434 or col label>. Default value: I<first column containing the word compoundID in its column label or sequentially | |
| 1435 generated IDs>. | |
| 1436 | |
| 1437 =item B<--CompoundIDPrefix> I<text> | |
| 1438 | |
| 1439 Specify compound ID prefix to use during sequential generation of compound IDs for input I<SDFile(s)> | |
| 1440 and I<TextFile(s)>. Default value: I<Cmpd>. The default value generates compound IDs which look | |
| 1441 like Cmpd<Number>. | |
| 1442 | |
| 1443 For input I<SDFile(s)>, this value is only used during I<LabelPrefix | MolNameOrLabelPrefix> values | |
| 1444 of B<--CompoundIDMode> option; otherwise, it's ignored. | |
| 1445 | |
| 1446 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>: | |
| 1447 | |
| 1448 Compound | |
| 1449 | |
| 1450 The value specified above generates compound IDs which correspond to Compound<Number> | |
| 1451 instead of default value of Cmpd<Number>. | |
| 1452 | |
| 1453 =item B<--CompoundIDField> I<DataFieldName> | |
| 1454 | |
| 1455 Specify input I<SDFile(s)> datafield label for generating compound IDs. This value is only used | |
| 1456 during I<DataField> value of B<--CompoundIDMode> option. | |
| 1457 | |
| 1458 Examples for I<DataField> value of B<--CompoundIDMode>: | |
| 1459 | |
| 1460 MolID | |
| 1461 ExtReg | |
| 1462 | |
| 1463 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix> | |
| 1464 | |
| 1465 Specify how to generate compound IDs from input I<SDFile(s)> for similarity matrix CSV/TSV text | |
| 1466 file(s): use a I<SDFile(s)> datafield value; use molname line from I<SDFile(s)>; generate a sequential ID | |
| 1467 with specific prefix; use combination of both MolName and LabelPrefix with usage of LabelPrefix values | |
| 1468 for empty molname lines. | |
| 1469 | |
| 1470 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>. | |
| 1471 Default: I<LabelPrefix>. | |
| 1472 | |
| 1473 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes | |
| 1474 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname | |
| 1475 values are replaced with sequential compound IDs. | |
| 1476 | |
| 1477 =item B<-d, --detail> I<InfoLevel> | |
| 1478 | |
| 1479 Level of information to print about lines being ignored. Default: I<1>. Possible values: | |
| 1480 I<1, 2 or 3>. | |
| 1481 | |
| 1482 =item B<-f, --fast> | |
| 1483 | |
| 1484 In this mode, fingerprints columns specified using B<--FingerprintsCol> for I<TextFile(s)> and | |
| 1485 B<--FingerprintsField> for I<SDFile(s)> are assumed to contain valid fingerprints data and no | |
| 1486 checking is performed before calculating similarity matrices. By default, fingerprints data is | |
| 1487 validated before computing pairwise similarity and distance coefficients. | |
| 1488 | |
| 1489 =item B<--FingerprintsCol> I<col number | col name> | |
| 1490 | |
| 1491 This value is B<-c, --colmode> specific. It specifies fingerprints column to use during | |
| 1492 calculation of similarity matrices for I<TextFile(s)>. Possible values: I<col number or col label>. | |
| 1493 Default value: I<first column containing the word Fingerprints in its column label>. | |
| 1494 | |
| 1495 =item B<--FingerprintsField> I<FieldLabel> | |
| 1496 | |
| 1497 Fingerprints field label to use during calculation of similarity matrices for I<SDFile(s)>. | |
| 1498 Default value: I<first data field label containing the word Fingerprints in its label>. | |
| 1499 | |
| 1500 =item B<-h, --help> | |
| 1501 | |
| 1502 Print this help message. | |
| 1503 | |
| 1504 =item B<--InDelim> I<comma | semicolon> | |
| 1505 | |
| 1506 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
| 1507 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
| 1508 delimiter. | |
| 1509 | |
| 1510 =item B<--InputDataMode> I<LoadInMemory | ScanFile> | |
| 1511 | |
| 1512 Specify how fingerprints bit-vector or vector strings data from I<SD, FP and CSV/TSV> | |
| 1513 fingerprint file(s) is processed: Retrieve, process and load all available fingerprints | |
| 1514 data in memory; Retrieve and process data for fingerprints one at a time. Possible values | |
| 1515 : I<LoadInMemory | ScanFile>. Default: I<LoadInMemory>. | |
| 1516 | |
| 1517 During I<LoadInMemory> value of B<--InputDataMode>, fingerprints bit-vector or vector | |
| 1518 strings data from input file is retrieved, processed, and loaded into memory all at once | |
| 1519 as fingerprints objects for generation of similarity matrices. | |
| 1520 | |
| 1521 During I<ScanFile> value of B<--InputDataMode>, multiple passes over the input fingerprints | |
| 1522 file are performed to retrieve and process fingerprints bit-vector or vector strings data one at | |
| 1523 a time to generate fingerprints objects used during generation of similarity matrices. A temporary | |
| 1524 copy of the input fingerprints file is made at the start and deleted after generating the matrices. | |
| 1525 | |
| 1526 I<ScanFile> value of B<--InputDataMode> allows processing of arbitrarily large fingerprints files | |
| 1527 without any additional memory requirement. | |
| 1528 | |
| 1529 =item B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString> | |
| 1530 | |
| 1531 Format of fingerprint strings data in I<TextFile(s)>: automatically detect format of fingerprints | |
| 1532 string created by MayaChemTools fingerprints generation scripts or explicitly specify its format. | |
| 1533 Possible values: I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>. Default | |
| 1534 value: I<AutoDetect>. | |
| 1535 | |
| 1536 =item B<--OutDelim> I<comma | tab | semicolon> | |
| 1537 | |
| 1538 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>. | |
| 1539 Default value: I<comma>. | |
| 1540 | |
| 1541 =item B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue> | |
| 1542 | |
| 1543 Specify how similarity or distance values calculated for fingerprints vector and bit-vector strings | |
| 1544 are written to the output CSV/TSV text file(s): Generate text files containing rows and columns | |
| 1545 with their labels corresponding to compound IDs and each matrix element value corresponding to | |
| 1546 similarity or distance between corresponding compounds; Generate text files containing rows containing | |
| 1547 compoundIDs for two compounds followed by similarity or distance value between these compounds. | |
| 1548 | |
| 1549 Possible values: I<RowsAndColumns, or IDPairsAndValue>. Default value: I<RowsAndColumns>. | |
| 1550 | |
| 1551 The value of B<--OutMatrixFormat> in conjunction with B<--OutMatrixType> determines type | |
| 1552 of data written to output files and allows generation of up to 6 different output data formats: | |
| 1553 | |
| 1554 OutMatrixFormat OutMatrixType | |
| 1555 | |
| 1556 RowsAndColumns FullMatrix [ DEFAULT ] | |
| 1557 RowsAndColumns UpperTriangularMatrix | |
| 1558 RowsAndColumns LowerTriangularMatrix | |
| 1559 | |
| 1560 IDPairsAndValue FullMatrix | |
| 1561 IDPairsAndValue UpperTriangularMatrix | |
| 1562 IDPairsAndValue LowerTriangularMatrix | |
| 1563 | |
| 1564 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
| 1565 I<FullMatrix> value of B<--OutMatrixType>: | |
| 1566 | |
| 1567 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
| 1568 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ... | |
| 1569 "Cmpd2","0.04","1","0.06","0.05","0.19","0.07",... ... | |
| 1570 "Cmpd3","0.25","0.06","1","0.12","0.22","0.25",... ... | |
| 1571 "Cmpd4","0.13","0.05","0.12","1","0.11","0.13",... ... | |
| 1572 "Cmpd5","0.11","0.19","0.22","0.11","1","0.17",... ... | |
| 1573 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1",... ... | |
| 1574 ... ... .. | |
| 1575 ... ... .. | |
| 1576 ... ... .. | |
| 1577 | |
| 1578 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
| 1579 I<UpperTriangularMatrix> value of B<--OutMatrixType>: | |
| 1580 | |
| 1581 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
| 1582 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ... | |
| 1583 "Cmpd2","1","0.06","0.05","0.19","0.07",... ... | |
| 1584 "Cmpd3","1","0.12","0.22","0.25",... ... | |
| 1585 "Cmpd4","1","0.11","0.13",... ... | |
| 1586 "Cmpd5","1","0.17",... ... | |
| 1587 "Cmpd6","1",... ... | |
| 1588 ... ... .. | |
| 1589 ... ... .. | |
| 1590 ... ... .. | |
| 1591 | |
| 1592 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
| 1593 I<LowerTriangularMatrix> value of B<--OutMatrixType>: | |
| 1594 | |
| 1595 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
| 1596 "Cmpd1","1" | |
| 1597 "Cmpd2","0.04","1" | |
| 1598 "Cmpd3","0.25","0.06","1" | |
| 1599 "Cmpd4","0.13","0.05","0.12","1" | |
| 1600 "Cmpd5","0.11","0.19","0.22","0.11","1" | |
| 1601 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1" | |
| 1602 ... ... .. | |
| 1603 ... ... .. | |
| 1604 ... ... .. | |
| 1605 | |
| 1606 | |
| 1607 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
| 1608 I<FullMatrix> value of B<--OutMatrixType>: | |
| 1609 | |
| 1610 "CmpdID1","CmpdID2","Coefficient Value" | |
| 1611 "Cmpd1","Cmpd1","1" | |
| 1612 "Cmpd1","Cmpd2","0.04" | |
| 1613 "Cmpd1","Cmpd3","0.25" | |
| 1614 "Cmpd1","Cmpd4","0.13" | |
| 1615 ... ... ... | |
| 1616 ... ... ... | |
| 1617 ... ... ... | |
| 1618 "Cmpd2","Cmpd1","0.04" | |
| 1619 "Cmpd2","Cmpd2","1" | |
| 1620 "Cmpd2","Cmpd3","0.06" | |
| 1621 "Cmpd2","Cmpd4","0.05" | |
| 1622 ... ... ... | |
| 1623 ... ... ... | |
| 1624 ... ... ... | |
| 1625 "Cmpd3","Cmpd1","0.25" | |
| 1626 "Cmpd3","Cmpd2","0.06" | |
| 1627 "Cmpd3","Cmpd3","1" | |
| 1628 "Cmpd3","Cmpd4","0.12" | |
| 1629 ... ... ... | |
| 1630 ... ... ... | |
| 1631 ... ... ... | |
| 1632 | |
| 1633 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
| 1634 I<UpperTriangularMatrix> value of B<--OutMatrixType>: | |
| 1635 | |
| 1636 "CmpdID1","CmpdID2","Coefficient Value" | |
| 1637 "Cmpd1","Cmpd1","1" | |
| 1638 "Cmpd1","Cmpd2","0.04" | |
| 1639 "Cmpd1","Cmpd3","0.25" | |
| 1640 "Cmpd1","Cmpd4","0.13" | |
| 1641 ... ... ... | |
| 1642 ... ... ... | |
| 1643 ... ... ... | |
| 1644 "Cmpd2","Cmpd2","1" | |
| 1645 "Cmpd2","Cmpd3","0.06" | |
| 1646 "Cmpd2","Cmpd4","0.05" | |
| 1647 ... ... ... | |
| 1648 ... ... ... | |
| 1649 ... ... ... | |
| 1650 "Cmpd3","Cmpd3","1" | |
| 1651 "Cmpd3","Cmpd4","0.12" | |
| 1652 ... ... ... | |
| 1653 ... ... ... | |
| 1654 ... ... ... | |
| 1655 | |
| 1656 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
| 1657 I<LowerTriangularMatrix> value of B<--OutMatrixType>: | |
| 1658 | |
| 1659 "CmpdID1","CmpdID2","Coefficient Value" | |
| 1660 "Cmpd1","Cmpd1","1" | |
| 1661 "Cmpd2","Cmpd1","0.04" | |
| 1662 "Cmpd2","Cmpd2","1" | |
| 1663 "Cmpd3","Cmpd1","0.25" | |
| 1664 "Cmpd3","Cmpd2","0.06" | |
| 1665 "Cmpd3","Cmpd3","1" | |
| 1666 "Cmpd4","Cmpd1","0.13" | |
| 1667 "Cmpd4","Cmpd2","0.05" | |
| 1668 "Cmpd4","Cmpd3","0.12" | |
| 1669 "Cmpd4","Cmpd4","1" | |
| 1670 ... ... ... | |
| 1671 ... ... ... | |
| 1672 ... ... ... | |
| 1673 | |
| 1674 =item B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix> | |
| 1675 | |
| 1676 Type of similarity or distance matrix to calculate for fingerprints vector and bit-vector strings: | |
| 1677 Calculate full matrix; Calculate lower triangular matrix including diagonal; Calculate upper triangular | |
| 1678 matrix including diagonal. | |
| 1679 | |
| 1680 Possible values: I<FullMatrix, UpperTriangularMatrix, or LowerTriangularMatrix>. Default value: | |
| 1681 I<FullMatrix>. | |
| 1682 | |
| 1683 The value of B<--OutMatrixType> in conjunction with B<--OutMatrixFormat> determines type | |
| 1684 of data written to output files. | |
| 1685 | |
| 1686 =item B<-o, --overwrite> | |
| 1687 | |
| 1688 Overwrite existing files. | |
| 1689 | |
| 1690 =item B<-p, --precision> I<number> | |
| 1691 | |
| 1692 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
| 1693 Valid values: positive integers. | |
| 1694 | |
| 1695 =item B<-q, --quote> I<Yes | No> | |
| 1696 | |
| 1697 Put quote around column values in output CSV/TSV text file(s). Possible values: | |
| 1698 I<Yes or No>. Default value: I<Yes>. | |
| 1699 | |
| 1700 =item B<-r, --root> I<RootName> | |
| 1701 | |
| 1702 New file name is generated using the root: <Root><BitVectorComparisonMode>.<Ext> or | |
| 1703 <Root><VectorComparisonMode><VectorComparisonFormulism>.<Ext>. | |
| 1704 The csv, and tsv <Ext> values are used for comma/semicolon, and tab delimited text files | |
| 1705 respectively. This option is ignored for multiple input files. | |
| 1706 | |
| 1707 =item B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity,[ManhattanDistance,...]"> | |
| 1708 | |
| 1709 Specify what similarity or distance coefficients to use for calculating similarity matrices for | |
| 1710 fingerprint vector strings data values in I<TextFile(s)>: calculate similarity matrices for all | |
| 1711 supported similarity and distance coefficients or specify a comma delimited list of similarity | |
| 1712 and distance coefficients. Possible values: I<All | "TanimotoSimilarity,[ManhattanDistance,...]">. | |
| 1713 Default: I<TanimotoSimilarity>. | |
| 1714 | |
| 1715 The value of B<-v, --VectorComparisonMode>, in conjunction with B<--VectorComparisonFormulism>, | |
| 1716 decides which type of similarity and distance coefficient formulism gets used. | |
| 1717 | |
| 1718 I<All> uses complete list of supported similarity and distance coefficients: I<CosineSimilarity, | |
| 1719 CzekanowskiSimilarity, DiceSimilarity, OchiaiSimilarity, JaccardSimilarity, SorensonSimilarity, TanimotoSimilarity, | |
| 1720 CityBlockDistance, EuclideanDistance, HammingDistance, ManhattanDistance, SoergelDistance>. These | |
| 1721 similarity and distance coefficients are described below. | |
| 1722 | |
| 1723 B<FingerprintsVector.pm> module, used to calculate similarity and distance coefficients, | |
| 1724 provides support to perform comparison between vectors containing three different types of | |
| 1725 values: | |
| 1726 | |
| 1727 Type I: OrderedNumericalValues | |
| 1728 | |
| 1729 . Size of two vectors are same | |
| 1730 . Vectors contain real values in a specific order. For example: MACCS keys | |
| 1731 count, Topological pharmacophore atom pairs and so on. | |
| 1732 | |
| 1733 Type II: UnorderedNumericalValues | |
| 1734 | |
| 1735 . Size of two vectors might not be same | |
| 1736 . Vectors contain unordered real values identified by value IDs. For example: | |
| 1737 Topological atom pairs, Topological atom torsions and so on. | |
| 1738 | |
| 1739 Type III: AlphaNumericalValues | |
| 1740 | |
| 1741 . Size of two vectors might not be same | |
| 1742 . Vectors contain unordered alphanumerical values. For example: Extended | |
| 1743 connectivity fingerprints, atom neighborhood fingerprints. | |
| 1744 | |
| 1745 Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues | |
| 1746 or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues | |
| 1747 using value IDs for UnorderedNumericalValues and values itself for AlphaNumericalValues. | |
| 1748 | |
| 1749 Three forms of similarity and distance calculation between two vectors, specified using B<--VectorComparisonFormulism> | |
| 1750 option, are supported: I<AlgebraicForm, BinaryForm or SetTheoreticForm>. | |
| 1751 | |
| 1752 For I<BinaryForm>, the ordered list of processed final vector values containing the value or | |
| 1753 count of each unique value type is simply converted into a binary vector containing 1s and 0s | |
| 1754 corresponding to presence or absence of values before calculating similarity or distance between | |
| 1755 two vectors. | |
| 1756 | |
| 1757 For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let: | |
| 1758 | |
| 1759 N = Number of values in A or B | |
| 1760 | |
| 1761 Xa = Values of vector A | |
| 1762 Xb = Values of vector B | |
| 1763 | |
| 1764 Xai = Value of ith element in A | |
| 1765 Xbi = Value of ith element in B | |
| 1766 | |
| 1767 SUM = Sum of i over N values | |
| 1768 | |
| 1769 For SetTheoreticForm of calculation between two vectors, let: | |
| 1770 | |
| 1771 SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) ) | |
| 1772 SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) | |
| 1773 | |
| 1774 For BinaryForm of calculation between two vectors, let: | |
| 1775 | |
| 1776 Na = Number of bits set to "1" in A = SUM ( Xai ) | |
| 1777 Nb = Number of bits set to "1" in B = SUM ( Xbi ) | |
| 1778 Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi ) | |
| 1779 Nd = Number of bits set to "0" in both A and B | |
| 1780 = SUM ( 1 - Xai - Xbi + Xai * Xbi) | |
| 1781 | |
| 1782 N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd | |
| 1783 | |
| 1784 Additionally, for BinaryForm various values also correspond to: | |
| 1785 | |
| 1786 Na = | Xa | | |
| 1787 Nb = | Xb | | |
| 1788 Nc = | SetIntersectionXaXb | | |
| 1789 Nd = N - | SetDifferenceXaXb | | |
| 1790 | |
| 1791 | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc | |
| 1792 = | Xa | + | Xb | - | SetIntersectionXaXb | | |
| 1793 | |
| 1794 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B | |
| 1795 in I<AlgebraicForm, BinaryForm and SetTheoreticForm> are defined as follows: | |
| 1796 | |
| 1797 B<CityBlockDistance>: ( same as HammingDistance and ManhattanDistance) | |
| 1798 | |
| 1799 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
| 1800 | |
| 1801 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
| 1802 | |
| 1803 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1804 | |
| 1805 B<CosineSimilarity>: ( same as OchiaiSimilarityCoefficient) | |
| 1806 | |
| 1807 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) | |
| 1808 | |
| 1809 I<BinaryForm>: Nc / SQRT ( Na * Nb) | |
| 1810 | |
| 1811 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) | |
| 1812 | |
| 1813 B<CzekanowskiSimilarity>: ( same as DiceSimilarity and SorensonSimilarity) | |
| 1814 | |
| 1815 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
| 1816 | |
| 1817 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
| 1818 | |
| 1819 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
| 1820 | |
| 1821 B<DiceSimilarity>: ( same as CzekanowskiSimilarity and SorensonSimilarity) | |
| 1822 | |
| 1823 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
| 1824 | |
| 1825 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
| 1826 | |
| 1827 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
| 1828 | |
| 1829 B<EuclideanDistance>: | |
| 1830 | |
| 1831 I<AlgebraicForm>: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) ) | |
| 1832 | |
| 1833 I<BinaryForm>: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc ) | |
| 1834 | |
| 1835 I<SetTheoreticForm>: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) | |
| 1836 | |
| 1837 B<HammingDistance>: ( same as CityBlockDistance and ManhattanDistance) | |
| 1838 | |
| 1839 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
| 1840 | |
| 1841 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
| 1842 | |
| 1843 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1844 | |
| 1845 B<JaccardSimilarity>: ( same as TanimotoSimilarity) | |
| 1846 | |
| 1847 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ) | |
| 1848 | |
| 1849 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) | |
| 1850 | |
| 1851 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1852 | |
| 1853 B<ManhattanDistance>: ( same as CityBlockDistance and HammingDistance) | |
| 1854 | |
| 1855 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
| 1856 | |
| 1857 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
| 1858 | |
| 1859 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1860 | |
| 1861 B<OchiaiSimilarity>: ( same as CosineSimilarity) | |
| 1862 | |
| 1863 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) | |
| 1864 | |
| 1865 I<BinaryForm>: Nc / SQRT ( Na * Nb) | |
| 1866 | |
| 1867 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) | |
| 1868 | |
| 1869 B<SorensonSimilarity>: ( same as CzekanowskiSimilarity and DiceSimilarity) | |
| 1870 | |
| 1871 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
| 1872 | |
| 1873 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
| 1874 | |
| 1875 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
| 1876 | |
| 1877 B<SoergelDistance>: | |
| 1878 | |
| 1879 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) ) | |
| 1880 | |
| 1881 I<BinaryForm>: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc ) | |
| 1882 | |
| 1883 I<SetTheoreticForm>: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1884 | |
| 1885 B<TanimotoSimilarity>: ( same as JaccardSimilarity) | |
| 1886 | |
| 1887 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ) | |
| 1888 | |
| 1889 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) | |
| 1890 | |
| 1891 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
| 1892 | |
| 1893 =item B<--VectorComparisonFormulism> I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]"> | |
| 1894 | |
| 1895 Specify fingerprints vector comparison formulism to use for calculation of similarity and distance | |
| 1896 coefficients during B<-v, --VectorComparisonMode>: use all supported comparison formulisms | |
| 1897 or specify a comma delimited list. Possible values: I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]">. | |
| 1898 Default value: I<AlgebraicForm>. | |
| 1899 | |
| 1900 I<All> uses all three forms of supported vector comparison formulism for values of B<-v, --VectorComparisonMode> | |
| 1901 option. | |
| 1902 | |
| 1903 For fingerprint vector strings containing B<AlphaNumericalValues> data values - B<ExtendedConnectivityFingerprints>, | |
| 1904 B<AtomNeighborhoodsFingerprints> and so on - all three formulisms result in the same value during similarity and distance | |
| 1905 calculations. | |
| 1906 | |
| 1907 =item B<-w, --WorkingDir> I<DirName> | |
| 1908 | |
| 1909 Location of working directory. Default: current directory. | |
| 1910 | |
| 1911 =back | |
| 1912 | |
| 1913 =head1 EXAMPLES | |
| 1914 | |
| 1915 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 1916 bit-vector strings data corresponding to supported fingerprints in text file present in a column | |
| 1917 name containing Fingerprint substring by loading all fingerprints data into memory and create a | |
| 1918 SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name | |
| 1919 containing CompoundID substring, type: | |
| 1920 | |
| 1921 % SimilarityMatricesFingerprints.pl -o SampleFPHex.csv | |
| 1922 | |
| 1923 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 1924 bit-vector strings data corresponding to supported fingerprints in SD File present in a data field | |
| 1925 with Fingerprint substring in its label by loading all fingerprints data into memory and create a | |
| 1926 SampleFPHexTanimotoSimilarity.csv file containing sequentially generated compound IDs with | |
| 1927 Cmpd prefix, type: | |
| 1928 | |
| 1929 % SimilarityMatricesFingerprints.pl -o SampleFPHex.sdf | |
| 1930 | |
| 1931 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 1932 bit-vector strings data corresponding to supported fingerprints in FP file by loading all fingerprints | |
| 1933 data into memory and create a SampleFPHexTanimotoSimilarity.csv file along with compound IDs | |
| 1934 retrieved from FP file, type: | |
| 1935 | |
| 1936 % SimilarityMatricesFingerprints.pl -o SampleFPHex.fpf | |
| 1937 | |
| 1938 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
| 1939 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
| 1940 column name containing Fingerprint substring by loading all fingerprints data into memory and create | |
| 1941 a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name | |
| 1942 containing CompoundID substring, type: | |
| 1943 | |
| 1944 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory | |
| 1945 --OutMatrixFormat RowsAndColumns --OutMatrixType LowerTriangularMatrix | |
| 1946 SampleFPHex.csv | |
| 1947 | |
| 1948 To generate an upper triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
| 1949 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
| 1950 column name containing Fingerprint substring by loading all fingerprints data into memory and create | |
| 1951 a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing compound IDs retrieved | |
| 1952 from column name containing CompoundID substring, type: | |
| 1953 | |
| 1954 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory | |
| 1955 --OutMatrixFormat IDPairsAndValue --OutMatrixType UpperTriangularMatrix | |
| 1956 SampleFPHex.csv | |
| 1957 | |
| 1958 To generate a full similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 1959 bit-vector strings data corresponding to supported fingerprints in text file present in a column | |
| 1960 name containing Fingerprint substring by scanning file without loading all fingerprints data into memory | |
| 1961 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from | |
| 1962 column name containing CompoundID substring, type: | |
| 1963 | |
| 1964 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile | |
| 1965 --OutMatrixFormat RowsAndColumns --OutMatrixType FullMatrix | |
| 1966 SampleFPHex.csv | |
| 1967 | |
| 1968 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
| 1969 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
| 1970 column name containing Fingerprint substring by scanning file without loading all fingerprints data into | |
| 1971 memory and create a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing | |
| 1972 compound IDs retrieved from column name containing CompoundID substring, type: | |
| 1973 | |
| 1974 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile | |
| 1975 --OutMatrixFormat IDPairsAndValue --OutMatrixType LowerTriangularMatrix | |
| 1976 SampleFPHex.csv | |
| 1977 | |
| 1978 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
| 1979 for fingerprints vector strings data corresponding to supported fingerprints in text file present in a column name | |
| 1980 containing Fingerprint substring and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file | |
| 1981 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
| 1982 | |
| 1983 % SimilarityMatricesFingerprints.pl -o SampleFPCount.csv | |
| 1984 | |
| 1985 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
| 1986 for fingerprints vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
| 1987 Fingerprint substring in its label and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file | |
| 1988 containing sequentially generated compound IDs with Cmpd prefix, type: | |
| 1989 | |
| 1990 % SimilarityMatricesFingerprints.pl -o SampleFPCount.sdf | |
| 1991 | |
| 1992 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
| 1993 for fingerprints vector strings data corresponding to supported fingerprints in FP file and create a | |
| 1994 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file along with compound IDs retrieved from FP file, type: | |
| 1995 | |
| 1996 % SimilarityMatricesFingerprints.pl -o SampleFPCount.fpf | |
| 1997 | |
| 1998 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 1999 bit-vector strings data corresponding to supported fingerprints in text file present in a column name | |
| 2000 containing Fingerprint substring and create a SampleFPHexTanimotoSimilarity.csv file in | |
| 2001 IDPairsAndValue format containing compound IDs retrieved from column name containing | |
| 2002 CompoundID substring, type: | |
| 2003 | |
| 2004 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
| 2005 SampleFPHex.csv | |
| 2006 | |
| 2007 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2008 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
| 2009 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file in | |
| 2010 IDPairsAndValue format containing sequentially generated compound IDs with Cmpd prefix, | |
| 2011 type: | |
| 2012 | |
| 2013 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
| 2014 SampleFPHex.sdf | |
| 2015 | |
| 2016 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2017 bit-vector strings data corresponding to supported fingerprints in FP file and create a | |
| 2018 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format along with compound IDs retrieved | |
| 2019 from FP file, type: | |
| 2020 | |
| 2021 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
| 2022 SampleFPHex.fpf | |
| 2023 | |
| 2024 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2025 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
| 2026 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file | |
| 2027 containing compound IDs from mol name line, type: | |
| 2028 | |
| 2029 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolName -o | |
| 2030 SampleFPHex.sdf | |
| 2031 | |
| 2032 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2033 bit-vector strings data corresponding to supported fingerprints present in a data field with | |
| 2034 Fingerprint substring in its label and create a SampleFPBinTanimotoSimilarity.csv file | |
| 2035 containing compound IDs from data field name Mol_ID, type: | |
| 2036 | |
| 2037 % SimilarityMatricesFingerprints.pl --CompoundIDMode DataField | |
| 2038 --CompoundIDField Mol_ID -o SampleFPBin.sdf | |
| 2039 | |
| 2040 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficients | |
| 2041 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a column | |
| 2042 name containing Fingerprint substring and create SampleFPBin[CoefficientName]Similarity.csv files | |
| 2043 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
| 2044 | |
| 2045 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity, | |
| 2046 TanimotoSimilarity" -o SampleFPBin.csv | |
| 2047 | |
| 2048 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficients | |
| 2049 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a data field with | |
| 2050 Fingerprint substring in its label and create SampleFPBin[CoefficientName]Similarity.csv files | |
| 2051 containing sequentially generated compound IDs with Cmpd prefix, type: | |
| 2052 | |
| 2053 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity, | |
| 2054 TanimotoSimilarity" -o SampleFPBin.sdf | |
| 2055 | |
| 2056 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2057 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
| 2058 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]AlgebraicForm.csv | |
| 2059 files containing compound IDs retrieved from column name containing CompoundID substring, type: | |
| 2060 | |
| 2061 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
| 2062 TanimotoSimilarity" -o SampleFPCount.csv | |
| 2063 | |
| 2064 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2065 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
| 2066 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]AlgebraicForm.csv | |
| 2067 files containing sequentially generated compound IDs with Cmpd prefix, type: | |
| 2068 | |
| 2069 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
| 2070 TanimotoSimilarity" -o SampleFPCount.sdf | |
| 2071 | |
| 2072 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2073 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
| 2074 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]Binary.csv | |
| 2075 files containing compound IDs retrieved from column name containing CompoundID substring, type: | |
| 2076 | |
| 2077 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
| 2078 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o | |
| 2079 SampleFPCount.csv | |
| 2080 | |
| 2081 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2082 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
| 2083 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]Binary.csv | |
| 2084 files containing sequentially generated compound IDs with Cmpd prefix, type: | |
| 2085 | |
| 2086 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
| 2087 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o | |
| 2088 SampleFPCount.sdf | |
| 2089 | |
| 2090 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2091 all supported comparison formulisms for fingerprints vector strings data corresponding to supported | |
| 2092 fingerprints present in a column name containing Fingerprint substring and create | |
| 2093 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs retrieved from column | |
| 2094 name containing CompoundID substring, type: | |
| 2095 | |
| 2096 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
| 2097 TanimotoSimilarity" --VectorComparisonFormulism All -o SampleFPCount.csv | |
| 2098 | |
| 2099 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
| 2100 all supported comparison formulisms for fingerprints vector strings data corresponding to supported | |
| 2101 fingerprints present in a data field with Fingerprint substring in its label and create | |
| 2102 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated | |
| 2103 compound IDs with Cmpd prefix, type: | |
| 2104 | |
| 2105 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,TanimotoSimilarity" | |
| 2106 --VectorComparisonFormulism All -o SampleFPCount.sdf | |
| 2107 | |
| 2108 To generate similarity matrices corresponding to all available similarity coefficients for fingerprints | |
| 2109 bit-vector strings data corresponding to supported fingerprints present in a column name | |
| 2110 containing Fingerprint substring and create SampleFPHex[CoefficientName].csv files | |
| 2111 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
| 2112 | |
| 2113 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode | |
| 2114 All --alpha 0.5 --beta 0.5 -o SampleFPHex.csv | |
| 2115 | |
| 2116 To generate similarity matrices corresponding to all available similarity coefficients for fingerprints | |
| 2117 bit-vector strings data corresponding to supported fingerprints present in a data field with Fingerprint | |
| 2118 substring in its label and create SampleFPHex[CoefficientName].csv files containing sequentially | |
| 2119 generated compound IDs with Cmpd prefix, type: | |
| 2120 | |
| 2121 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode | |
| 2122 All --alpha 0.5 --beta 0.5 -o SampleFPHex.sdf | |
| 2123 | |
| 2124 To generate similarity matrices corresponding to all available similarity and distance coefficients using | |
| 2125 all comparison formulisms for fingerprints vector strings data corresponding to supported fingerprints | |
| 2126 present in a column name containing Fingerprint substring and create | |
| 2127 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs | |
| 2128 retrieved from column name containing CompoundID substring, type: | |
| 2129 | |
| 2130 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode | |
| 2131 All --VectorComparisonFormulism All -o SampleFPCount.csv | |
| 2132 | |
| 2133 To generate similarity matrices corresponding to all available similarity and distance coefficients using | |
| 2134 all comparison formulisms for fingerprints vector strings data corresponding to supported fingerprints | |
| 2135 present in a data field with Fingerprint substring in its label and create | |
| 2136 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated | |
| 2137 compound IDs with Cmpd prefix, type: | |
| 2138 | |
| 2139 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode | |
| 2140 All --VectorComparisonFormulism All -o SampleFPCount.sdf | |
| 2141 | |
| 2142 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2143 bit-vector strings data corresponding to supported fingerprints present in a column number 2 | |
| 2144 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column | |
| 2145 number 1, type: | |
| 2146 | |
| 2147 % SimilarityMatricesFingerprints.pl --ColMode ColNum --CompoundIDCol 1 | |
| 2148 --FingerprintsCol 2 -o SampleFPHex.csv | |
| 2149 | |
| 2150 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2151 bit-vector strings data corresponding to supported fingerprints present in a data field name | |
| 2152 Fingerprints and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs | |
| 2153 present in data field name Mol_ID, type: | |
| 2154 | |
| 2155 % SimilarityMatricesFingerprints.pl --FingerprintsField Fingerprints | |
| 2156 --CompoundIDMode DataField --CompoundIDField Mol_ID -o SampleFPHex.sdf | |
| 2157 | |
| 2158 To generate a similarity matrix corresponding to Tversky similarity coefficient for fingerprints | |
| 2159 bit-vector strings data corresponding to supported fingerprints present in a column named Fingerprints | |
| 2160 and create a SampleFPHexTverskySimilarity.tsv file containing compound IDs retrieved from column named | |
| 2161 CompoundID, type: | |
| 2162 | |
| 2163 % SimilarityMatricesFingerprints.pl --BitVectorComparisonMode | |
| 2164 TverskySimilarity --alpha 0.5 --ColMode ColLabel --CompoundIDCol | |
| 2165 CompoundID --FingerprintsCol Fingerprints --OutDelim Tab --quote No | |
| 2166 -o SampleFPHex.csv | |
| 2167 | |
| 2168 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2169 bit-vector strings data corresponding to supported fingerprints present in a data field | |
| 2170 with Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file | |
| 2171 containing compound IDs from molname line or sequentially generated compound IDs | |
| 2172 with Mol prefix, type: | |
| 2173 | |
| 2174 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolnameOrLabelPrefix | |
| 2175 --CompoundIDPrefix Mol -o SampleFPHex.sdf | |
| 2176 | |
| 2177 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
| 2178 bit-vector strings data corresponding to supported fingerprints present in a data field with | |
| 2179 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.tsv file | |
| 2180 containing sequentially generated compound IDs with Cmpd prefix, type: | |
| 2181 | |
| 2182 % SimilarityMatricesFingerprints.pl --OutDelim Tab --quote No -o SampleFPHex.sdf | |
| 2183 | |
| 2184 =head1 AUTHOR | |
| 2185 | |
| 2186 Manish Sud <msud@san.rr.com> | |
| 2187 | |
| 2188 =head1 SEE ALSO | |
| 2189 | |
| 2190 InfoFingerprintsFiles.pl, SimilaritySearchingFingerprints.pl, AtomNeighborhoodsFingerprints.pl, | |
| 2191 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl, | |
| 2192 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl, | |
| 2193 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl | |
| 2194 | |
| 2195 =head1 COPYRIGHT | |
| 2196 | |
| 2197 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 2198 | |
| 2199 This file is part of MayaChemTools. | |
| 2200 | |
| 2201 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 2202 the terms of the GNU Lesser General Public License as published by the Free | |
| 2203 Software Foundation; either version 3 of the License, or (at your option) | |
| 2204 any later version. | |
| 2205 | |
| 2206 =cut |
