Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/SimilarityMatricesFingerprints.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: SimilarityMatricesFingerprints.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.21 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use File::Copy; | |
34 use Text::ParseWords; | |
35 use Benchmark; | |
36 use FileUtil; | |
37 use TextUtil; | |
38 use Fingerprints::FingerprintsFileUtil; | |
39 use Fingerprints::FingerprintsBitVector; | |
40 use Fingerprints::FingerprintsVector; | |
41 | |
# File-scoped state used by the main flow below and by the subroutines defined
# later in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";

# Direct method call instead of indirect object syntax (new Benchmark), which is
# ambiguous to parse and unavailable under newer feature bundles...
$StartTime = Benchmark->new;

# Get the options and setup script; usage is shown when help is requested or no
# file name is specified on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Collect input fingerprints file names from the command line...
my(@FingerprintsFilesList);
@FingerprintsFilesList = ExpandFileNames(\@ARGV, "sdf sd fpf fp csv tsv");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input fingerprints file(s)...\n";
my(%FingerprintsFilesInfo);
RetrieveFingerprintsFilesInfo();

# Process input files: only files flagged as okay during the check above are used...
my($FileIndex);
if (@FingerprintsFilesList > 1) {
  print "\nProcessing fingerprints files...\n";
}
for $FileIndex (0 .. $#FingerprintsFilesList) {
  if ($FingerprintsFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $FingerprintsFilesList[$FileIndex]...\n";
    GenerateSimilarityMatrices($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
87 | |
88 ############################################################################### | |
89 | |
# Generate similarity matrices for the fingerprints file at the specified index...
#
# The fingerprints data is prepared first, matrices are generated according to
# the string mode recorded for the file (bit-vector mode takes precedence over
# vector mode), and the prepared data is cleaned up at the end.
#
sub GenerateSimilarityMatrices {
  my($FileIndex) = @_;
  my($GeneratorRef);

  ProcessFingerprintsData($FileIndex);

  # Pick the generator matching the string mode of the file, if any...
  $GeneratorRef = undef;
  if ($FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$FileIndex]) {
    $GeneratorRef = \&GenerateSimilarityMatricesForFingerprintsBitVectors;
  }
  elsif ($FingerprintsFilesInfo{FingerprintsVectorStringMode}[$FileIndex]) {
    $GeneratorRef = \&GenerateSimilarityMatricesForFingerprintsVectors;
  }
  if (defined $GeneratorRef) {
    $GeneratorRef->($FileIndex);
  }

  CleanupFingerprintsData($FileIndex);
}
106 | |
# Generate one similarity matrix file for each specified bit-vector comparison
# measure...
#
# The output file name is the output root of the input file followed by the
# measure name and the output file extension.
#
sub GenerateSimilarityMatricesForFingerprintsBitVectors {
  my($FileIndex) = @_;

  for my $SpecifiedMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
    # Map the name as specified to the name used in file names...
    my $Measure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};

    my $OutFile = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex] . "${Measure}." . $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

    # Method to invoke on fingerprints objects along with its extra parameters...
    my $Method = $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef}->{lc($Measure)};
    my @Parameters = @{$OptionsInfo{SpecifiedBitVectorComparisonsParameterRef}->{lc($Measure)}};

    GenerateSimilarityMatrix($FileIndex, $OutFile, $Method, \@Parameters);
  }
}
125 | |
# Generate one similarity and/or distance matrix file for each combination of
# specified vector comparison measure and vector comparison mode...
#
# The output file name is the output root of the input file followed by the
# measure name, the comparison mode and the output file extension.
#
sub GenerateSimilarityMatricesForFingerprintsVectors {
  my($FileIndex) = @_;

  for my $SpecifiedMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
    # Map the name as specified to the name used in file names...
    my $Measure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedMeasure)};

    for my $Mode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
      my $OutFile = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex] . "${Measure}${Mode}." . $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

      my $Method = $OptionsInfo{SpecifiedVectorComparisonsMethodRef}->{lc($Measure)};

      # The comparison mode is always the first parameter passed to the method...
      my @Parameters = ($Mode, @{$OptionsInfo{SpecifiedVectorComparisonsParameterRef}->{lc($Measure)}});

      GenerateSimilarityMatrix($FileIndex, $OutFile, $Method, \@Parameters);
    }
  }
}
148 | |
# Calculate similarity matrix and write it out...
#
# Parameters:
#   $FileIndex           - index of the input file in @FingerprintsFilesList
#   $NewTextFile         - name of the output file to create
#   $MethodName          - name of the comparison method invoked on fingerprints objects
#   $MethodParametersRef - reference to a list of additional method parameters
#
# Dies when the output file can't be opened or closed; warns when the input
# data mode is not supported.
#
sub GenerateSimilarityMatrix {
  my($FileIndex, $NewTextFile, $MethodName, $MethodParametersRef) = @_;
  my($NewTextFileRef);

  print "\nGenerating $NewTextFile...\n";

  # Open new file and write out column labels. Three-argument open with a lexical
  # file handle is used so that characters in the file name are never interpreted
  # as an open mode...
  open($NewTextFileRef, ">", $NewTextFile) or die "Error: Can't open $NewTextFile: $! \n";
  WriteColumnLabels($FileIndex, $NewTextFileRef);

  # Calculate and write out similarity matrix values...
  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    GenerateSimilarityMatrixUsingMemoryData($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef);
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    GenerateSimilarityMatrixUsingFileData($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef);
  }
  else {
    warn "Warning: Input data mode, $OptionsInfo{InputDataMode}, is not supported.\n";
  }

  # Close new text file: buffered write errors surface here, so check it...
  close($NewTextFileRef) or die "Error: Can't close $NewTextFile: $! \n";
}
175 | |
# Calculate and write out similarity values using the fingerprints objects and
# compound IDs already loaded into %FingerprintsFilesInfo...
#
# Each row/column pair not skipped by SkipMatrixData is compared by invoking
# $MethodName on the row fingerprints object. Values are written either as
# delimited cells of a row (RowsAndColumns) or as one ID pair line per value
# (IDPairsAndValue).
#
sub GenerateSimilarityMatrixUsingMemoryData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;

  ROW: for my $Row (0 .. $#{$FingerprintsFilesInfo{FingerprintsObjectsRef}}) {
    my $RowFingerprints = $FingerprintsFilesInfo{FingerprintsObjectsRef}->[$Row];
    my $RowCmpdID = $FingerprintsFilesInfo{CompundIDsRef}->[$Row];

    # A row starts with its compound ID...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "$OptionsInfo{OutQuoteValue}${RowCmpdID}$OptionsInfo{OutQuoteValue}";
    }

    COLUMN: for my $Col (0 .. $#{$FingerprintsFilesInfo{FingerprintsObjectsRef}}) {
      next COLUMN if SkipMatrixData($Row, $Col);

      my $ColFingerprints = $FingerprintsFilesInfo{FingerprintsObjectsRef}->[$Col];

      # Format to the specified precision; adding 0 turns the formatted string back
      # into a number. Undefined or empty values are written as empty strings...
      my $Result = $RowFingerprints->$MethodName($ColFingerprints, @{$MethodParametersRef});
      if (defined($Result) && length($Result)) {
        $Result = sprintf("%.$OptionsInfo{Precision}f", $Result) + 0;
      }
      else {
        $Result = '';
      }

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${Result}$OptionsInfo{OutQuoteValue}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        my $ColCmpdID = $FingerprintsFilesInfo{CompundIDsRef}->[$Col];

        my @PairWords = ($RowCmpdID, $ColCmpdID, $Result);
        my $PairLine = JoinWords(\@PairWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$PairLine\n";
      }
    }

    # Terminate the row...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "\n";
    }
  }
}
218 | |
# Calculate and write out similarity values by retrieving and processing data
# from fingerprints file...
#
# The fingerprints file supplies the rows. For each valid row entry, the
# temporary copy of the fingerprints file is opened and scanned from the start
# to supply the columns. Entries for which IsFingerprintsDataValid() returns
# false are ignored: row entries are counted as ignored and column entries are
# skipped without advancing the column index.
#
# Parameters:
#   $FileIndex           - index of the input file in @FingerprintsFilesList
#   $NewTextFileRef      - file handle to write the matrix to
#   $MethodName          - name of the comparison method invoked on fingerprints objects
#   $MethodParametersRef - reference to a list of additional method parameters
#
sub GenerateSimilarityMatrixUsingFileData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;
  my($RowIndex, $ColIndex, $FingerprintsFileIO, $TmpFingerprintsFileIO, $FingerprintsObject1, $FingerprintsObject2, $CmpdID1, $CmpdID2, $FingerprintsCount, $IgnoredFingerprintsCount, $Value, $Line, @LineWords);

  print "\nReading and processing fingerprints data...\n";

  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});
  $FingerprintsFileIO->Open();

  # Row and column indices count valid entries only and start at 1 for the first
  # valid entry; SkipMatrixData compares them relative to each other...
  $RowIndex = 0; $ColIndex = 0;
  $FingerprintsCount = 0; $IgnoredFingerprintsCount = 0;

  FINGERPRINTSFILEIO: while ($FingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$FingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next FINGERPRINTSFILEIO;
    }
    $RowIndex++;
    $FingerprintsObject1 = $FingerprintsFileIO->GetFingerprints();
    $CmpdID1 = $FingerprintsFileIO->GetCompoundID();

    # A row starts with its compound ID...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "$OptionsInfo{OutQuoteValue}${CmpdID1}$OptionsInfo{OutQuoteValue}";
    }

    # Force detail level of 1 to avoid duplicate printing of diagnostic messages for invalid
    # fingerprints data...
    $TmpFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
    $TmpFingerprintsFileIO->Open();

    $ColIndex = 0;
    TMPFINGERPRINTSFILEIO: while ($TmpFingerprintsFileIO->Read()) {
      if (!$TmpFingerprintsFileIO->IsFingerprintsDataValid()) {
        next TMPFINGERPRINTSFILEIO;
      }
      $ColIndex++;

      if (SkipMatrixData($RowIndex, $ColIndex)) {
        next TMPFINGERPRINTSFILEIO;
      }

      $FingerprintsObject2 = $TmpFingerprintsFileIO->GetFingerprints();

      # Format to the specified precision; adding 0 turns the formatted string back
      # into a number. Undefined or empty values are written as empty strings...
      $Value = $FingerprintsObject1->$MethodName($FingerprintsObject2, @{$MethodParametersRef});
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : '';

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${Value}$OptionsInfo{OutQuoteValue}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        $CmpdID2 = $TmpFingerprintsFileIO->GetCompoundID();

        @LineWords = ();
        push @LineWords, ($CmpdID1, $CmpdID2, $Value);
        $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$Line\n";
      }
    }
    $TmpFingerprintsFileIO->Close();

    # Terminate the row...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "\n";
    }
  }

  $FingerprintsFileIO->Close();

  print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
  print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}
295 | |
# Check whether the matrix value at the specified row and column is to be left
# out of the output...
#
# Returns 1 to skip and 0 to write. Nothing is skipped for a full matrix; values
# below the diagonal are skipped for an upper triangular matrix and values above
# the diagonal are skipped for a lower triangular matrix. The full matrix flag
# takes precedence, followed by the upper triangular flag.
#
sub SkipMatrixData {
  my($RowIndex, $ColIndex) = @_;

  if (!$OptionsInfo{WriteFullMatrix}) {
    if ($OptionsInfo{WriteUpperTriangularMatrix}) {
      return ($RowIndex > $ColIndex) ? 1 : 0;
    }
    if ($OptionsInfo{WriteLowerTriangularMatrix}) {
      return ($RowIndex < $ColIndex) ? 1 : 0;
    }
  }

  return 0;
}
313 | |
# Write out the column labels line of a matrix file...
#
# IDPairsAndValue format: a fixed three column header. RowsAndColumns format:
# an empty first cell followed by compound IDs, which are taken from memory in
# LoadInMemory mode and retrieved by scanning the fingerprints file in ScanFile
# mode. A warning is issued for any other output matrix format.
#
sub WriteColumnLabels {
  my($FileIndex, $NewTextFileRef) = @_;
  my($LabelsLine, @Labels);

  if ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    @Labels = ('CmpdID1', 'CmpdID2', 'Coefficient Value');
    $LabelsLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$LabelsLine\n";
    return;
  }

  if ($OptionsInfo{OutMatrixFormat} !~ /^RowsAndColumns$/i) {
    warn "Warning: Output matrix format, $OptionsInfo{OutMatrixFormat}, is not supported.\n";
    return;
  }

  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    # Empty label for the column holding row compound IDs...
    @Labels = ('', @{$FingerprintsFilesInfo{CompundIDsRef}});
    $LabelsLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$LabelsLine\n";
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    # Scan file to retrieve compound IDs...
    #
    print "\nProcessing fingerprints file to generate compound IDs...\n";

    # Force detail level of 1 to avoid diagnostics messages for invalid fingerprints data
    # during retrieval of compound IDs as these get printed out during calculation of matrix...
    #
    my $IDsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
    $IDsFileIO->Open();

    # Empty label for the column holding row compound IDs...
    print $NewTextFileRef "$OptionsInfo{OutQuoteValue}$OptionsInfo{OutQuoteValue}";

    while ($IDsFileIO->Read()) {
      if ($IDsFileIO->IsFingerprintsDataValid()) {
        my $ID = $IDsFileIO->GetCompoundID();
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${ID}$OptionsInfo{OutQuoteValue}";
      }
    }
    $IDsFileIO->Close();

    print $NewTextFileRef "\n";

    print "Processing fingerprints file to generate matrix...\n";
  }
}
367 | |
# Prepare fingerprints data of the specified file for matrix generation...
#
# Any previously loaded compound IDs and fingerprints objects are cleared first.
# LoadInMemory mode: compound IDs and fingerprints objects are read and stored in
# %FingerprintsFilesInfo. ScanFile mode: the fingerprints file is copied to its
# temporary file name; dies when the copy fails.
#
sub ProcessFingerprintsData {
  my($FileIndex) = @_;

  $FingerprintsFilesInfo{CompundIDsRef} = undef;
  $FingerprintsFilesInfo{FingerprintsObjectsRef} = undef;

  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    my($FingerprintsFileIO);

    $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});
    ($FingerprintsFilesInfo{CompundIDsRef}, $FingerprintsFilesInfo{FingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    my($FingerprintsFile, $TmpFingerprintsFile);

    $FingerprintsFile = $FingerprintsFilesList[$FileIndex];
    $TmpFingerprintsFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    # Copy fingerprints file to a tmp file for calculating similarity matrix...
    print "\nCopying fingerprints file, $FingerprintsFile, to temporary fingerprints file, $TmpFingerprintsFile...\n";
    copy($FingerprintsFile, $TmpFingerprintsFile) or die "Error: Couldn't copy $FingerprintsFile to $TmpFingerprintsFile: $! \n";
  }
}
394 | |
# Release fingerprints data prepared for the specified file...
#
# LoadInMemory mode: the loaded compound IDs and fingerprints objects are
# dropped. ScanFile mode: the temporary fingerprints file is deleted; dies when
# it can't be deleted.
#
sub CleanupFingerprintsData {
  my($FileIndex) = @_;

  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    for my $DataKey ('CompundIDsRef', 'FingerprintsObjectsRef') {
      $FingerprintsFilesInfo{$DataKey} = undef;
    }
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    # Delete temporary fingerprints file...
    my $TmpFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    print "\nDeleting temporary fingerprints file $TmpFile...\n";
    unlink($TmpFile) or die "Error: Couldn't unlink $TmpFile: $! \n";
  }
}
414 | |
# Retrieve information about fingerprints files...
#
# Rebuilds the file-scoped %FingerprintsFilesInfo hash. Each key holds an array
# indexed like @FingerprintsFilesList. A file is flagged as okay only after it
# has passed all checks: it exists, its type is recognized, none of its output
# files exist (unless overwriting is allowed) and its fingerprints data is
# reported as valid. A warning is issued for each ignored file.
#
sub RetrieveFingerprintsFilesInfo {
  my($FingerprintsFile, $TmpFingerprintsFile, $FingerprintsFileIO, $FileType, $Index, $FileDir, $FileExt, $FileName, $InDelim, $OutFileRoot, $OutFileExt, $InfoKey, %FingerprintsFileIOParameters);

  %FingerprintsFilesInfo = ();
  for $InfoKey (qw(FileOkay FileType InDelim OutFileRoot OutFileExt TmpFingerprintsFile FingerprintsFileIOParameters TmpFingerprintsFileIOParameters FingerprintsBitVectorStringMode FingerprintsVectorStringMode)) {
    $FingerprintsFilesInfo{$InfoKey} = [];
  }

  FILELIST: for $Index (0 .. $#FingerprintsFilesList) {
    # Start with values marking the file as not okay; these stay in place when
    # the file is ignored...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 0;
    $FingerprintsFilesInfo{FileType}[$Index] = '';
    $FingerprintsFilesInfo{InDelim}[$Index] = "";
    $FingerprintsFilesInfo{OutFileRoot}[$Index] = '';
    $FingerprintsFilesInfo{OutFileExt}[$Index] = '';

    $FingerprintsFilesInfo{FingerprintsFileIOParameters}[$Index] = {};

    $FingerprintsFilesInfo{TmpFingerprintsFile}[$Index] = "";
    $FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$Index] = {};

    $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$Index] = 0;
    $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$Index] = 0;

    $FingerprintsFile = $FingerprintsFilesList[$Index];
    if (!(-e $FingerprintsFile)) {
      warn "Warning: Ignoring file $FingerprintsFile: It doesn't exist\n";
      next FILELIST;
    }

    $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
    if (IsEmpty($FileType)) {
      warn "Warning: Ignoring file $FingerprintsFile: It's not a fingerprints file\n";
      next FILELIST;
    }

    ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);

    # Setup temporary fingerprints file name for scan file mode...
    $TmpFingerprintsFile = "${FileName}Tmp.${FileExt}";

    # NOTE(review): this per file delimiter is only recorded in %FingerprintsFilesInfo;
    # text file IO parameters below receive $OptionsInfo{InDelim} - confirm intended.
    $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};

    # Setup output file names. The extension follows the output delimiter selected
    # by ProcessOptions so that extension and delimiter can't disagree...
    $OutFileExt = ($OptionsInfo{OutDelim} eq "\t") ? "tsv" : "csv";

    # A specified output root is used only for a single input file...
    $OutFileRoot = $FileName;
    if ($OptionsInfo{OutFileRoot} && (@FingerprintsFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    if (!$Options{overwrite}) {
      my($SpecifiedComparisonMeasure, $ComparisonMeasure, $ComparisonMode);

      # Similarity matrices output file names for bit-vector strings...
      for $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
        $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};
        if (-e "${OutFileRoot}${ComparisonMeasure}.${OutFileExt}") {
          warn "Warning: Ignoring file $FingerprintsFile: The file ${OutFileRoot}${ComparisonMeasure}.${OutFileExt} already exists.\n";
          next FILELIST;
        }
      }

      # Similarity matrices output file names for vector strings...
      for $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
        $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};
        for $ComparisonMode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
          if (-e "${OutFileRoot}${ComparisonMeasure}${ComparisonMode}.${OutFileExt}") {
            warn "Warning: Ignoring file $FingerprintsFile: The file ${OutFileRoot}${ComparisonMeasure}${ComparisonMode}.${OutFileExt} already exists.\n";
            next FILELIST;
          }
        }
      }
    }

    # Setup FingerprintsFileIO parameters: the parameters shared by all file types
    # come first, followed by those specific to a file type...
    %FingerprintsFileIOParameters = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{Mode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' => $OptionsInfo{Detail});
    if ($FileType =~ /^SD$/i) {
      $FingerprintsFileIOParameters{FingerprintsFieldLabel} = $OptionsInfo{FingerprintsField};
      $FingerprintsFileIOParameters{CompoundIDMode} = $OptionsInfo{CompoundIDMode};
      $FingerprintsFileIOParameters{CompoundIDFieldLabel} = $OptionsInfo{CompoundIDField};
      $FingerprintsFileIOParameters{CompoundIDPrefix} = $OptionsInfo{CompoundIDPrefix};
    }
    elsif ($FileType =~ /^FP$/i) {
      # No additional parameters...
    }
    elsif ($FileType =~ /^Text$/i) {
      $FingerprintsFileIOParameters{FingerprintsCol} = $OptionsInfo{FingerprintsCol};
      $FingerprintsFileIOParameters{ColMode} = $OptionsInfo{ColMode};
      $FingerprintsFileIOParameters{CompoundIDCol} = $OptionsInfo{CompoundIDCol};
      $FingerprintsFileIOParameters{CompoundIDPrefix} = $OptionsInfo{CompoundIDPrefix};
      $FingerprintsFileIOParameters{InDelim} = $OptionsInfo{InDelim};
    }
    else {
      warn "Warning: File type for fingerprints file, $FingerprintsFile, is not valid. Supported file types: SD, FP or Text\n";
      next FILELIST;
    }

    # Retrieve fingerprints file string mode information...
    $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%FingerprintsFileIOParameters);

    if (!$FingerprintsFileIO) {
      warn "Warning: Ignoring fingerprints file $FingerprintsFile: It contains invalid fingerprints data\n";
      next FILELIST;
    }
    if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
      warn "Warning: Ignoring fingerprints file $FingerprintsFile: It contains invalid fingerprints data\n";
      next FILELIST;
    }

    # All checks passed: record information about the file...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 1;
    $FingerprintsFilesInfo{FileType}[$Index] = $FileType;

    $FingerprintsFilesInfo{InDelim}[$Index] = $InDelim;

    $FingerprintsFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $FingerprintsFilesInfo{OutFileExt}[$Index] = $OutFileExt;

    $FingerprintsFilesInfo{FingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters };

    $FingerprintsFilesInfo{TmpFingerprintsFile}[$Index] = $TmpFingerprintsFile;

    # Parameters for the temporary file differ only in the file name...
    $FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters, 'Name' => $TmpFingerprintsFile };

    $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$Index] = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
    $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$Index] = $FingerprintsFileIO->GetFingerprintsVectorStringMode();
  }
}
564 | |
# Process option values...
#
# Rebuilds the file-scoped %OptionsInfo hash from the command line values in
# %Options. Dies with an error message when column, field, output matrix format
# or output matrix type values are not valid.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{InputDataMode} = $Options{inputdatamode};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{CompoundIDPrefix} = $Options{compoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files: a column value
  # must be a positive integer in ColNum mode; columns not specified are auto
  # detected...
  $OptionsInfo{ColMode} = $Options{colmode};

  if (!IsNotEmpty($Options{compoundidcol})) {
    $OptionsInfo{CompoundIDCol} = 'AutoDetect';
  }
  else {
    if ($Options{colmode} =~ /^ColNum$/i && !IsPositiveInteger($Options{compoundidcol})) {
      die "Error: Column value, $Options{compoundidcol}, specified using \"--CompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{CompoundIDCol} = $Options{compoundidcol};
  }

  if (!IsNotEmpty($Options{fingerprintscol})) {
    $OptionsInfo{FingerprintsCol} = 'AutoDetect';
  }
  else {
    if ($Options{colmode} =~ /^ColNum$/i && !IsPositiveInteger($Options{fingerprintscol})) {
      die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
  }

  # The two columns must differ: compared as numbers when both are positive
  # integers and as strings otherwise...
  if (IsNotEmpty($Options{compoundidcol}) && IsNotEmpty($Options{fingerprintscol})) {
    my $SameColumn;
    if (IsPositiveInteger($Options{compoundidcol}) && IsPositiveInteger($Options{fingerprintscol})) {
      $SameColumn = ($Options{compoundidcol} == $Options{fingerprintscol});
    }
    else {
      $SameColumn = ($Options{compoundidcol} eq $Options{fingerprintscol});
    }
    if ($SameColumn) {
      die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...
  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDField} = '';

  if ($Options{compoundidmode} =~ /^DataField$/i) {
    if (!$Options{compoundidfield}) {
      die "Error: You must specify a value for \"--CompoundIDField\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
    $OptionsInfo{CompoundIDField} = $Options{compoundidfield};
  }

  $OptionsInfo{FingerprintsField} = IsNotEmpty($Options{fingerprintsfield}) ? $Options{fingerprintsfield} : 'AutoDetect';

  if ($Options{compoundidfield} && IsNotEmpty($Options{fingerprintsfield}) && ($Options{compoundidfield} eq $Options{fingerprintsfield})) {
    die "Error: Values specified using \"--CompoundIDField\" and \"--Fingerprintsfield\", $Options{compoundidfield}, must be different.\n";
  }

  $OptionsInfo{Detail} = $Options{detail};

  # Delimiters and quoting: anything other than tab or semicolon leads to comma...
  $OptionsInfo{InDelim} = $Options{indelim};
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{OutQuoteValue} = ($Options{quote} =~ /^Yes$/i) ? '"' : '';

  # Output matrix format: exactly one of the two flags is set...
  $OptionsInfo{OutMatrixFormat} = $Options{outmatrixformat};

  $OptionsInfo{WriteRowsAndColumns} = 0;
  $OptionsInfo{WriteIDPairsAndValue} = 0;
  if ($OptionsInfo{OutMatrixFormat} =~ /^RowsAndColumns$/i) {
    $OptionsInfo{WriteRowsAndColumns} = 1;
  }
  elsif ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    $OptionsInfo{WriteIDPairsAndValue} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n";
  }

  # Output matrix type: exactly one of the three flags is set...
  $OptionsInfo{OutMatrixType} = $Options{outmatrixtype};

  $OptionsInfo{WriteFullMatrix} = 0;
  $OptionsInfo{WriteUpperTriangularMatrix} = 0;
  $OptionsInfo{WriteLowerTriangularMatrix} = 0;
  if ($OptionsInfo{OutMatrixType} =~ /^FullMatrix$/i) {
    $OptionsInfo{WriteFullMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^UpperTriangularMatrix$/i) {
    $OptionsInfo{WriteUpperTriangularMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^LowerTriangularMatrix$/i) {
    $OptionsInfo{WriteLowerTriangularMatrix} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n";
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Data validation is turned off in fast mode...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
689 | |
690 # Process options related to comparison of bit vector strings... | |
691 # | |
692 sub ProcessBitVectorComparisonOptions { | |
693 # Setup supported bit vector similarity coefficients for bit vector strings... | |
694 my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap); | |
695 | |
696 @SupportedComparisonMeasures = (); | |
697 %SupportedComparisonMeasuresNameMap = (); | |
698 %SupportedComparisonMeasuresMethodMap = (); | |
699 | |
700 for $SupportedComparisonMeasure (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) { | |
701 # Similarity coefficient function/method names contain "Coefficient" in their names. | |
702 # So take 'em out and setup a map to original function/method name... | |
703 $ComparisonMeasure = $SupportedComparisonMeasure; | |
704 $ComparisonMeasure =~ s/Coefficient$//; | |
705 | |
706 push @SupportedComparisonMeasures, $ComparisonMeasure; | |
707 $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure; | |
708 $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure; | |
709 } | |
710 | |
711 # Setup a list of similarity coefficients to use for calculating similarity matrices for bit vector strings... | |
712 my($SpecifiedMeasure, @SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap); | |
713 | |
714 @SpecifiedComparisonMeasures = (); | |
715 %SpecifiedComparisonMeasuresNameMap = (); | |
716 %SpecifiedComparisonMeasuresMethodMap = (); | |
717 %SpecifiedComparisonMeasuresParameterMap = (); | |
718 | |
719 if ($Options{bitvectorcomparisonmode} =~ /^All$/i) { | |
720 push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures; | |
721 } | |
722 else { | |
723 # Comma delimited list of similarity coefficients... | |
724 my($BitVectorComparisonMode, @SpecifiedMeasures, @UnsupportedSpecifiedMeasures); | |
725 | |
726 $BitVectorComparisonMode = $Options{bitvectorcomparisonmode}; | |
727 $BitVectorComparisonMode =~ s/ //g; | |
728 @SpecifiedMeasures = split ",", $BitVectorComparisonMode; | |
729 @UnsupportedSpecifiedMeasures = (); | |
730 | |
731 for $SpecifiedMeasure (@SpecifiedMeasures) { | |
732 if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) { | |
733 push @SpecifiedComparisonMeasures, $SpecifiedMeasure; | |
734 } | |
735 else { | |
736 push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure; | |
737 } | |
738 } | |
739 if (@UnsupportedSpecifiedMeasures) { | |
740 if (@UnsupportedSpecifiedMeasures > 1) { | |
741 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-b --BitVectorComparisonMode\" are not valid.\n"; | |
742 } | |
743 else { | |
744 warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-b --BitVectorComparisonMode\" is not valid.\n"; | |
745 } | |
746 die "Allowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n"; | |
747 } | |
748 } | |
749 for $SpecifiedMeasure (@SpecifiedComparisonMeasures) { | |
750 $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)}; | |
751 $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)}; | |
752 } | |
753 | |
754 $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode}; | |
755 $OptionsInfo{SpecifiedBitVectorComparisonsRef} = \@SpecifiedComparisonMeasures; | |
756 $OptionsInfo{SpecifiedBitVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap; | |
757 $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap; | |
758 | |
759 # Make sure valid alpha parameter is specified for Tversky calculation... | |
760 my($SpecifiedMeasure1, $SpecifiedMeasure2); | |
761 $OptionsInfo{Alpha} = ''; | |
762 $SpecifiedMeasure1 = 'TverskySimilarity'; | |
763 $SpecifiedMeasure2 = 'WeightedTverskySimilarity'; | |
764 if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) { | |
765 if (IsEmpty($Options{alpha})) { | |
766 die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-m --mode\". \n"; | |
767 } | |
768 my($Alpha); | |
769 $Alpha = $Options{alpha}; | |
770 if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) { | |
771 die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n"; | |
772 } | |
773 $OptionsInfo{Alpha} = $Alpha; | |
774 } | |
775 | |
776 # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky | |
777 # calculations... | |
778 $OptionsInfo{Beta} = ''; | |
779 $SpecifiedMeasure1 = 'WeightedTverskySimilarity'; | |
780 $SpecifiedMeasure2 = 'WeightedTanimotoSimilarity'; | |
781 if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) { | |
782 if (IsEmpty($Options{beta})) { | |
783 die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-m --mode\". \n"; | |
784 } | |
785 my($Beta); | |
786 $Beta = $Options{beta}; | |
787 if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) { | |
788 die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n"; | |
789 } | |
790 $OptionsInfo{Beta} = $Beta; | |
791 } | |
792 | |
793 # Setup any parameters required for specified comparison menthod... | |
794 for $SpecifiedMeasure (@SpecifiedComparisonMeasures) { | |
795 @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = (); | |
796 if ($SpecifiedMeasure =~ /^TverskySimilarity$/i) { | |
797 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha}; | |
798 } | |
799 elsif ($SpecifiedMeasure =~ /^WeightedTverskySimilarity$/i) { | |
800 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha}; | |
801 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta}; | |
802 } | |
803 elsif ($SpecifiedMeasure =~ /^WeightedTanimotoSimilarity$/i) { | |
804 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta}; | |
805 } | |
806 } | |
807 $OptionsInfo{SpecifiedBitVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap; | |
808 } | |
809 | |
# Process options related to comparison of vector strings...
#
# Reads $Options{vectorcomparisonmode}, $Options{vectorcomparisonformulism} and
# $Options{fast}, validates them, and stores the results in %OptionsInfo under
# these keys:
#
#   VectorComparisonMode                   - option value as specified
#   SpecifiedVectorComparisonsRef          - ref to list of specified measure names
#   SpecifiedVectorComparisonsNameRef      - ref to hash: lc(name) => supported measure name
#   SpecifiedVectorComparisonsMethodRef    - ref to hash: lc(name) => coefficient method name
#   VectorComparisonFormulism              - option value as specified
#   SpecifiedVectorComparisonModesRef      - ref to list of specified formulism names
#   SpecifiedVectorComparisonsParameterRef - ref to hash: lc(name) => ref to list containing fast flag (1 or 0)
#
# Dies with an error message when an unsupported measure or formulism name is
# specified.
#
sub ProcessVectorComparisonOptions {
  # Setup supported similarity and distance coefficients for vector strings..
  my($ComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();
  for my $SupportedComparisonMeasure (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    # Similarity and distance coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    $ComparisonMeasure = $SupportedComparisonMeasure;
    $ComparisonMeasure =~ s/Coefficient$//i;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup a list of similarity and distance coefficients to use for calculating similarity
  # matrices for vector strings...
  my(@SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap);

  @SpecifiedComparisonMeasures = ();
  %SpecifiedComparisonMeasuresNameMap = ();
  %SpecifiedComparisonMeasuresMethodMap = ();
  %SpecifiedComparisonMeasuresParameterMap = ();

  if ($Options{vectorcomparisonmode} =~ /^All$/i) {
    push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures;
  }
  else {
    # Comma delimited list of similarity coefficients...
    my($VectorComparisonMode, @SpecifiedMeasures, @UnsupportedSpecifiedMeasures);

    $VectorComparisonMode = $Options{vectorcomparisonmode};
    $VectorComparisonMode =~ s/ //g;
    @SpecifiedMeasures = split ",", $VectorComparisonMode;
    @UnsupportedSpecifiedMeasures = ();

    # Names are matched case insensitively; the user supplied spelling is retained
    # in the specified measures list...
    for my $SpecifiedMeasure (@SpecifiedMeasures) {
      if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
        push @SpecifiedComparisonMeasures, $SpecifiedMeasure;
      }
      else {
        push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure;
      }
    }
    if (@UnsupportedSpecifiedMeasures) {
      if (@UnsupportedSpecifiedMeasures > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-v --VectorComparisonMode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-v --VectorComparisonMode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
    }
  }
  for my $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
    $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};
  $OptionsInfo{SpecifiedVectorComparisonsRef} = \@SpecifiedComparisonMeasures;
  $OptionsInfo{SpecifiedVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap;
  $OptionsInfo{SpecifiedVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap;

  # Setup specified vector comparison calculation modes...
  #
  # Spaces are removed before splitting the comma delimited list, as it's done for
  # "-v, --VectorComparisonMode" values, to allow values such as "AlgebraicForm, BinaryForm"...
  my($VectorComparisonFormulism, @SpecifiedVectorComparisonModes);

  @SpecifiedVectorComparisonModes = ();
  $VectorComparisonFormulism = $Options{vectorcomparisonformulism};
  $VectorComparisonFormulism =~ s/ //g;

  if ($VectorComparisonFormulism =~ /^All$/i) {
    push @SpecifiedVectorComparisonModes, ("AlgebraicForm", "BinaryForm", "SetTheoreticForm");
  }
  else {
    for my $SpecifiedFormulism (split /\,/, $VectorComparisonFormulism) {
      if ($SpecifiedFormulism !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
        die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
      }
      push @SpecifiedVectorComparisonModes, $SpecifiedFormulism;
    }
  }
  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonModesRef} = \@SpecifiedVectorComparisonModes;

  # Setup any parameters required for specified comparison method: each measure
  # gets a single parameter corresponding to fast flag...
  for my $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = ();
    push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, ($Options{fast} ? 1 : 0);
  }
  $OptionsInfo{SpecifiedVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap;
}
905 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to default values, overlays values retrieved by GetOptions,
# changes to working directory when one is specified, and validates values of
# options with a fixed set of allowed values. Dies with an error message on
# first invalid value.
#
sub SetupScriptUsage {

  # Default values for options...
  %Options = (
    alpha => 0.5,
    beta => 1,
    bitvectorcomparisonmode => "TanimotoSimilarity",
    colmode => 'colnum',
    compoundidprefix => 'Cmpd',
    compoundidmode => 'LabelPrefix',
    detail => 1,
    indelim => 'comma',
    outdelim => 'comma',
    inputdatamode => 'LoadInMemory',
    mode => 'AutoDetect',
    outmatrixformat => 'RowsAndColumns',
    outmatrixtype => 'FullMatrix',
    quote => 'yes',
    precision => 2,
    vectorcomparisonmode => "TanimotoSimilarity",
    vectorcomparisonformulism => "AlgebraicForm",
  );

  # Retrieve all the options...
  my(@OptionSpecs);
  @OptionSpecs = (
    "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s", "colmode|c=s",
    "compoundidcol=s", "compoundidprefix=s", "compoundidfield=s", "compoundidmode=s",
    "detail|d=i", "fast|f", "fingerprintscol=s", "fingerprintsfield=s", "help|h",
    "indelim=s", "inputdatamode=s", "mode|m=s", "outdelim=s", "overwrite|o",
    "outmatrixformat=s", "outmatrixtype=s", "precision|p=s", "quote|q=s", "root|r=s",
    "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to working directory before any further processing...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; order of checks determines which error is reported first...
  $Options{colmode} =~ /^(ColNum|ColLabel)$/i
    or die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";

  $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
    or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

  unless (IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }

  $Options{inputdatamode} =~ /^(LoadInMemory|ScanFile)$/i
    or die "Error: The value specified, $Options{inputdatamode}, for option \"--InputDataMode\" is not valid. Allowed values: LoadInMemory or ScanFile\n";

  $Options{mode} =~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i
    or die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n";

  $Options{indelim} =~ /^(comma|semicolon)$/i
    or die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{outmatrixformat} =~ /^(RowsAndColumns|IDPairsAndValue)$/i
    or die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n";

  $Options{outmatrixtype} =~ /^(FullMatrix|UpperTriangularMatrix|LowerTriangularMatrix)$/i
    or die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n";

  $Options{quote} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

  unless (IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
}
984 | |
985 __END__ | |
986 | |
987 =head1 NAME | |
988 | |
989 SimilarityMatricesFingerprints.pl - Calculate similarity matrices using fingerprints strings data in SD, FP and CSV/TSV text file(s) | |
990 | |
991 =head1 SYNOPSIS | |
992 | |
993 SimilarityMatricesFingerprints.pl SDFile(s) FPFile(s) TextFile(s)... | |
994 | |
995 SimilarityMatricesFingerprints.pl [B<--alpha> I<number>] [B<--beta> I<number>] | |
996 [B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[ TverskySimilarity, ... ]">] | |
997 [B<-c, --ColMode> I<ColNum | ColLabel>] [B<--CompoundIDCol> I<col number | col name>] | |
998 [B<--CompoundIDPrefix> I<text>] [B<--CompoundIDField> I<DataFieldName>] | |
999 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] | |
1000 [B<-d, --detail> I<InfoLevel>] [B<-f, --fast>] [B<--FingerprintsCol> I<col number | col name>] | |
1001 [B<--FingerprintsField> I<FieldLabel>] [B<-h, --help>] [B<--InDelim> I<comma | semicolon>] | |
1002 [B<--InputDataMode> I<LoadInMemory | ScanFile>] | |
1003 [B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>] | |
1004 [B<--OutDelim> I<comma | tab | semicolon>] [B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue>] | |
1005 [B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix>] | |
1006 [B<-o, --overwrite>] [B<-p, --precision> I<number>] | |
1007 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] | |
1008 [B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity, [ ManhattanDistance, ...]">] | |
1009 [B<--VectorComparisonFormulism> I<All | "AlgebraicForm, [BinaryForm, SetTheoreticForm]">] | |
1010 [B<-w, --WorkingDir> dirname] SDFile(s) FPFile(s) TextFile(s)... | |
1011 | |
1012 =head1 DESCRIPTION | |
1013 | |
1014 Calculate similarity matrices using fingerprint bit-vector or vector strings data in I<SD, FP | |
1015 and CSV/TSV> text file(s) and generate CSV/TSV text file(s) containing values for specified | |
1016 similarity and distance coefficients. | |
1017 | |
1018 The scripts SimilarityMatrixSDFiles.pl and SimilarityMatrixTextFiles.pl have been removed from the | |
1019 current release of MayaChemTools and their functionality merged with this script. | |
1020 | |
1021 The valid I<SDFile> extensions are I<.sdf> and I<.sd>. All SD files in the current directory | |
1022 can be specified either by I<*.sdf> or the current directory name. | |
1023 | |
1024 The valid I<FPFile> extensions are I<.fpf> and I<.fp>. All FP files in the current directory | |
1025 can be specified either by I<*.fpf> or the current directory name. | |
1026 | |
1027 The valid I<TextFile> extensions are I<.csv> and I<.tsv> for comma/semicolon and tab | |
1028 delimited text files respectively. All other file names are ignored. All text files in the | |
1029 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory | |
1030 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file | |
1031 which doesn't correspond to the format indicated by B<--indelim> option is ignored. | |
1032 | |
1033 Example of I<FP> file containing fingerprints bit-vector string data: | |
1034 | |
1035 # | |
1036 # Package = MayaChemTools 7.4 | |
1037 # ReleaseDate = Oct 21, 2010 | |
1038 # | |
1039 # TimeStamp = Mon Mar 7 15:14:01 2011 | |
1040 # | |
1041 # FingerprintsStringType = FingerprintsBitVector | |
1042 # | |
1043 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:... | |
1044 # Size = 1024 | |
1045 # BitStringFormat = HexadecimalString | |
1046 # BitsOrder = Ascending | |
1047 # | |
1048 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510... | |
1049 Cmpd2 000000249400840040100042011001001980410c000000001010088001120... | |
1050 ... ... | |
1051 ... .. | |
1052 | |
1053 Example of I<FP> file containing fingerprints vector string data: | |
1054 | |
1055 # | |
1056 # Package = MayaChemTools 7.4 | |
1057 # ReleaseDate = Oct 21, 2010 | |
1058 # | |
1059 # TimeStamp = Mon Mar 7 15:14:01 2011 | |
1060 # | |
1061 # FingerprintsStringType = FingerprintsVector | |
1062 # | |
1063 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:... | |
1064 # VectorStringFormat = IDsAndValuesString | |
1065 # VectorValuesType = NumericalValues | |
1066 # | |
1067 Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C: | |
1068 N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...; | |
1069 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2 | |
1070 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ... | |
1071 Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C | |
1072 O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...; | |
1073 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2 | |
1074 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ... | |
1075 ... ... | |
1076 ... ... | |
1077 | |
1078 Example of I<SD> file containing fingerprints bit-vector string data: | |
1079 | |
1080 ... ... | |
1081 ... ... | |
1082 $$$$ | |
1083 ... ... | |
1084 ... ... | |
1085 ... ... | |
1086 41 44 0 0 0 0 0 0 0 0999 V2000 | |
1087 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 | |
1088 ... ... | |
1089 2 3 1 0 0 0 0 | |
1090 ... ... | |
1091 M END | |
1092 > <CmpdID> | |
1093 Cmpd1 | |
1094 | |
1095 > <PathLengthFingerprints> | |
1096 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt | |
1097 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66 | |
1098 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028 | |
1099 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462 | |
1100 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a | |
1101 aa0660a11014a011d46 | |
1102 | |
1103 $$$$ | |
1104 ... ... | |
1105 ... ... | |
1106 | |
1107 Example of CSV I<Text> file containing fingerprints bit-vector string data: | |
1108 | |
1109 "CompoundID","PathLengthFingerprints" | |
1110 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes | |
1111 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4 | |
1112 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030 | |
1113 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..." | |
1114 ... ... | |
1115 ... ... | |
1116 | |
1117 The current release of MayaChemTools supports the following types of fingerprint | |
1118 bit-vector and vector strings: | |
1119 | |
1120 FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi | |
1121 us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT | |
1122 C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X | |
1123 1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A | |
1124 TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2 | |
1125 -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B... | |
1126 | |
1127 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS | |
1128 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2 | |
1129 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1 | |
1130 O.X1.BO2;2 4 14 3 10 1 1 1 3 2 | |
1131 | |
1132 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume | |
1133 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F | |
1134 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1 | |
1135 | |
1136 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN | |
1137 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C | |
1138 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N | |
1139 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8 | |
1140 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1 | |
1141 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0... | |
1142 | |
1143 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs | |
1144 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN | |
1145 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3 | |
1146 .024 -2.270 | |
1147 | |
1148 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues; | |
1149 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435 | |
1150 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1 | |
1151 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
1152 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
1153 | |
1154 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi | |
1155 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 | |
1156 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414 | |
1157 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103 | |
1158 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338 | |
1159 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303... | |
1160 | |
1161 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes | |
1162 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524 | |
1163 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 | |
1164 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...; | |
1165 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2 | |
1166 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 | |
1167 | |
1168 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp | |
1169 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100 | |
1170 0000000001010000000110000011000000000000100000000000000000000000100001 | |
1171 1000000110000000000000000000000000010011000000000000000000000000010000 | |
1172 0000000000000000000000000010000000000000000001000000000000000000000000 | |
1173 0000000000010000100001000000000000101000000000000000100000000000000... | |
1174 | |
1175 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu | |
1176 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8 | |
1177 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567 | |
1178 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012 | |
1179 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455 | |
1180 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404... | |
1181 | |
1182 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp | |
1183 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184 | |
1184 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450 | |
1185 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430 | |
1186 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134 | |
1187 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566... | |
1188 | |
1189 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000 | |
1190 0000000000000000000000000000000001001000010010000000010010000000011100 | |
1191 0100101010111100011011000100110110000011011110100110111111111111011111 | |
1192 11111111111110111000 | |
1193 | |
1194 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011 | |
1195 1110011111100101111111000111101100110000000000000011100010000000000000 | |
1196 0000000000000000000000000000000000000000000000101000000000000000000000 | |
1197 0000000000000000000000000000000000000000000000000000000000000000000000 | |
1198 0000000000000000000000000000000000000011000000000000000000000000000000 | |
1199 0000000000000000000000000000000000000000 | |
1200 | |
1201 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri | |
1202 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
1203 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0 | |
1204 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0 | |
1205 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1 | |
1206 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1 | |
1207 | |
1208 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri | |
1209 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0 | |
1210 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0 | |
1211 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | |
1212 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0 | |
1213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | |
1214 | |
1215 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng | |
1216 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110 | |
1217 0100010101011000101001011100110001000010001001101000001001001001001000 | |
1218 0010110100000111001001000001001010100100100000000011000000101001011100 | |
1219 0010000001000101010100000100111100110111011011011000000010110111001101 | |
1220 0101100011000000010001000011000010100011101100001000001000100000000... | |
1221 | |
1222 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength | |
1223 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2 | |
1224 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X | |
1225 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1 | |
1226 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO | |
1227 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C.... | |
1228 | |
1229 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt | |
1230 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1 | |
1231 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N | |
1232 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1 | |
1233 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR | |
1234 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ... | |
1235 | |
1236 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD | |
1237 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1 | |
1238 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3. | |
1239 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...; | |
1240 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1 | |
1241 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1... | |
1242 | |
1243 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi | |
1244 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar | |
1245 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H | |
1246 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...; | |
1247 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4 | |
1248 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ... | |
1249 | |
1250 FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3 | |
1251 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4- | |
1252 C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO | |
1253 2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...; | |
1254 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1 | |
1255 | |
1256 FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica | |
1257 lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC | |
1258 H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC- | |
1259 ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...; | |
1260 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2 | |
1261 | |
1262 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M | |
1263 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1 | |
1264 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1 | |
1265 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1 | |
1266 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....; | |
1267 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2 | |
1268 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8... | |
1269 | |
1270 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1 | |
1271 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C | |
1272 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3- | |
1273 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2 | |
1274 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C. | |
1275 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7... | |
1276 | |
1277 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min | |
1278 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H | |
1279 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2- | |
1280 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H | |
1281 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...; | |
1282 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10 | |
1283 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1 | |
1284 | |
1285 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist | |
1286 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0 | |
1287 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1 | |
1288 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0 | |
1289 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0 | |
1290 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18... | |
1291 | |
1292 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize: | |
1293 MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1- | |
1294 Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1 | |
1295 -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1- | |
1296 HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...; | |
1297 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23 | |
1298 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1 | |
1299 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ... | |
1300 | |
1301 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD | |
1302 istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106 | |
1303 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0 | |
1304 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26 | |
1305 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0 | |
1306 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ... | |
1307 | |
1308 =head1 OPTIONS | |
1309 | |
1310 =over 4 | |
1311 | |
1312 =item B<--alpha> I<number> | |
1313 | |
1314 Value of alpha parameter for calculating I<Tversky> similarity coefficient specified for | |
1315 B<-b, --BitVectorComparisonMode> option. It corresponds to weights assigned for bits set | |
1316 to "1" in a pair of fingerprint bit-vectors during the calculation of similarity coefficient. Possible | |
1317 values: I<0 to 1>. Default value: I<0.5>. | |
1318 | |
1319 =item B<--beta> I<number> | |
1320 | |
1321 Value of beta parameter for calculating I<WeightedTanimoto> and I<WeightedTversky> | |
1322 similarity coefficients specified for B<-b, --BitVectorComparisonMode> option. It is used to | |
1323 weight the contributions of bits set to "0" during the calculation of similarity coefficients. Possible | |
1324 values: I<0 to 1>. Default value of I<1> makes I<WeightedTanimoto> and I<WeightedTversky> | |
1325 equivalent to I<Tanimoto> and I<Tversky>. | |
1326 | |
1327 =item B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[TverskySimilarity,...]"> | |
1328 | |
1329 Specify what similarity coefficients to use for calculating similarity matrices for fingerprints bit-vector | |
1330 strings data values in I<TextFile(s)>: calculate similarity matrices for all supported similarity | |
1331 coefficients or specify a comma delimited list of similarity coefficients. Possible values: | |
1332 I<All | "TanimotoSimilarity,[TverskySimilarity,...]">. Default: I<TanimotoSimilarity>. | |
1333 | |
1334 I<All> uses complete list of supported similarity coefficients: I<BaroniUrbaniSimilarity, BuserSimilarity, | |
1335 CosineSimilarity, DiceSimilarity, DennisSimilarity, ForbesSimilarity, FossumSimilarity, HamannSimilarity, JacardSimilarity, | |
1336 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity, McConnaugheySimilarity, OchiaiSimilarity, | |
1337 PearsonSimilarity, RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity, SkoalSneath1Similarity, | |
1338 SkoalSneath2Similarity, SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity, YuleSimilarity, | |
1339 WeightedTanimotoSimilarity, WeightedTverskySimilarity>. These similarity coefficients are described below. | |
1340 | |
1341 For two fingerprint bit-vectors A and B of same size, let: | |
1342 | |
1343 Na = Number of bits set to "1" in A | |
1344 Nb = Number of bits set to "1" in B | |
1345 Nc = Number of bits set to "1" in both A and B | |
1346 Nd = Number of bits set to "0" in both A and B | |
1347 | |
1348 Nt = Number of bits set to "1" or "0" in A or B (Size of A or B) | |
1349 Nt = Na + Nb - Nc + Nd | |
1350 | |
1351 Na - Nc = Number of bits set to "1" in A but not in B | |
1352 Nb - Nc = Number of bits set to "1" in B but not in A | |
1353 | |
1354 Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of bit-vectors A and B are | |
1355 defined as follows: | |
1356 | |
1357 I<BaroniUrbaniSimilarity>: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser ) | |
1358 | |
1359 I<BuserSimilarity>: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani ) | |
1360 | |
1361 I<CosineSimilarity>: Nc / SQRT ( Na * Nb ) (same as Ochiai) | |
1362 | |
1363 I<DiceSimilarity>: (2 * Nc) / ( Na + Nb ) | |
1364 | |
1365 I<DennisSimilarity>: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Nt * Na * Nb) | |
1366 | |
1367 I<ForbesSimilarity>: ( Nt * Nc ) / ( Na * Nb ) | |
1368 | |
1369 I<FossumSimilarity>: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb ) | |
1370 | |
1371 I<HamannSimilarity>: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt | |
1372 | |
1373 I<JaccardSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Tanimoto) | |
1374 | |
1375 I<Kulczynski1Similarity>: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / ( Na + Nb - 2Nc ) | |
1376 | |
1377 I<Kulczynski2Similarity>: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + ( Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb ) | |
1378 | |
1379 I<MatchingSimilarity>: ( Nc + Nd ) / Nt | |
1380 | |
1381 I<McConnaugheySimilarity>: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / ( Na * Nb ) | |
1382 | |
1383 I<OchiaiSimilarity>: Nc / SQRT ( Na * Nb ) (same as Cosine) | |
1384 | |
1385 I<PearsonSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) ) | |
1386 | |
1387 I<RogersTanimotoSimilarity>: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt) | |
1388 | |
1389 I<RussellRaoSimilarity>: Nc / Nt | |
1390 | |
1391 I<SimpsonSimilarity>: Nc / MIN ( Na, Nb) | |
1392 | |
1393 I<SkoalSneath1Similarity>: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb - Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc ) | |
1394 | |
1395 I<SkoalSneath2Similarity>: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt ) | |
1396 | |
1397 I<SkoalSneath3Similarity>: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc ) ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc ) | |
1398 | |
1399 I<TanimotoSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Jaccard) | |
1400 | |
1401 I<TverskySimilarity>: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * ( Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb) | |
1402 | |
1403 I<YuleSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) ) | |
1404 | |
1405 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which | |
1406 are set to "1" in both A and B. In order to take into account all bit positions, modified versions | |
1407 of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed. | |
1408 | |
1409 Let: | |
1410 | |
1411 Na' = Number of bits set to "0" in A | |
1412 Nb' = Number of bits set to "0" in B | |
1413 Nc' = Number of bits set to "0" in both A and B | |
1414 | |
1415 Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / ( Na' + Nb' - Nc' ) | |
1416 | |
1417 Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb') | |
1418 | |
1419 Then: | |
1420 | |
1421 I<WeightedTanimotoSimilarity> = beta * Tanimoto + (1 - beta) * Tanimoto' | |
1422 | |
1423 I<WeightedTverskySimilarity> = beta * Tversky + (1 - beta) * Tversky' | |
1424 | |
1425 =item B<-c, --ColMode> I<ColNum | ColLabel> | |
1426 | |
1427 Specify how columns are identified in I<TextFile(s)>: using column number or column | |
1428 label. Possible values: I<ColNum or ColLabel>. Default value: I<ColNum>. | |
1429 | |
1430 =item B<--CompoundIDCol> I<col number | col name> | |
1431 | |
1432 This value is B<-c, --ColMode> mode specific. It specifies input I<TextFile(s)> column to use for | |
1433 generating compound ID for similarity matrices in output I<TextFile(s)>. Possible values: I<col number | |
1434 or col label>. Default value: I<first column containing the word compoundID in its column label or sequentially | |
1435 generated IDs>. | |
1436 | |
1437 =item B<--CompoundIDPrefix> I<text> | |
1438 | |
1439 Specify compound ID prefix to use during sequential generation of compound IDs for input I<SDFile(s)> | |
1440 and I<TextFile(s)>. Default value: I<Cmpd>. The default value generates compound IDs which look | |
1441 like Cmpd<Number>. | |
1442 | |
1443 For input I<SDFile(s)>, this value is only used during I<LabelPrefix | MolNameOrLabelPrefix> values | |
1444 of B<--CompoundIDMode> option; otherwise, it's ignored. | |
1445 | |
1446 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>: | |
1447 | |
1448 Compound | |
1449 | |
1450 The value specified above generates compound IDs which correspond to Compound<Number> | |
1451 instead of default value of Cmpd<Number>. | |
1452 | |
1453 =item B<--CompoundIDField> I<DataFieldName> | |
1454 | |
1455 Specify input I<SDFile(s)> datafield label for generating compound IDs. This value is only used | |
1456 during I<DataField> value of B<--CompoundIDMode> option. | |
1457 | |
1458 Examples for I<DataField> value of B<--CompoundIDMode>: | |
1459 | |
1460 MolID | |
1461 ExtReg | |
1462 | |
1463 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix> | |
1464 | |
1465 Specify how to generate compound IDs from input I<SDFile(s)> for similarity matrix CSV/TSV text | |
1466 file(s): use a I<SDFile(s)> datafield value; use molname line from I<SDFile(s)>; generate a sequential ID | |
1467 with specific prefix; use combination of both MolName and LabelPrefix with usage of LabelPrefix values | |
1468 for empty molname lines. | |
1469 | |
1470 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>. | |
1471 Default: I<LabelPrefix>. | |
1472 | |
1473 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes | |
1474 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname | |
1475 values are replaced with sequential compound IDs. | |
1476 | |
1477 =item B<-d, --detail> I<InfoLevel> | |
1478 | |
1479 Level of information to print about lines being ignored. Default: I<1>. Possible values: | |
1480 I<1, 2 or 3>. | |
1481 | |
1482 =item B<-f, --fast> | |
1483 | |
1484 In this mode, fingerprints columns specified using B<--FingerprintsCol> for I<TextFile(s)> and | |
1485 B<--FingerprintsField> for I<SDFile(s)> are assumed to contain valid fingerprints data and no | |
1486 checking is performed before calculating similarity matrices. By default, fingerprints data is | |
1487 validated before computing pairwise similarity and distance coefficients. | |
1488 | |
1489 =item B<--FingerprintsCol> I<col number | col name> | |
1490 | |
1491 This value is B<-c, --ColMode> specific. It specifies fingerprints column to use during | |
1492 calculation of similarity matrices for I<TextFile(s)>. Possible values: I<col number or col label>. | |
1493 Default value: I<first column containing the word Fingerprints in its column label>. | |
1494 | |
1495 =item B<--FingerprintsField> I<FieldLabel> | |
1496 | |
1497 Fingerprints field label to use during calculation of similarity matrices for I<SDFile(s)>. | |
1498 Default value: I<first data field label containing the word Fingerprints in its label>. | |
1499 | |
1500 =item B<-h, --help> | |
1501 | |
1502 Print this help message. | |
1503 | |
1504 =item B<--InDelim> I<comma | semicolon> | |
1505 | |
1506 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>. | |
1507 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a | |
1508 delimiter. | |
1509 | |
1510 =item B<--InputDataMode> I<LoadInMemory | ScanFile> | |
1511 | |
1512 Specify how fingerprints bit-vector or vector strings data from I<SD, FP and CSV/TSV> | |
1513 fingerprint file(s) is processed: Retrieve, process and load all available fingerprints | |
1514 data in memory; Retrieve and process data for fingerprints one at a time. Possible values: | |
1515 I<LoadInMemory | ScanFile>. Default: I<LoadInMemory>. | |
1516 | |
1517 During I<LoadInMemory> value of B<--InputDataMode>, fingerprints bit-vector or vector | |
1518 strings data from input file is retrieved, processed, and loaded into memory all at once | |
1519 as fingerprints objects for generation of similarity matrices. | |
1520 | |
1521 During I<ScanFile> value of B<--InputDataMode>, multiple passes over the input fingerprints | |
1522 file are performed to retrieve and process fingerprints bit-vector or vector strings data one at | |
1523 a time to generate fingerprints objects used during generation of similarity matrices. A temporary | |
1524 copy of the input fingerprints file is made at the start and deleted after generating the matrices. | |
1525 | |
1526 I<ScanFile> value of B<--InputDataMode> allows processing of arbitrarily large fingerprints files | |
1527 without any additional memory requirement. | |
1528 | |
1529 =item B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString> | |
1530 | |
1531 Format of fingerprint strings data in I<TextFile(s)>: automatically detect format of fingerprints | |
1532 string created by MayaChemTools fingerprints generation scripts or explicitly specify its format. | |
1533 Possible values: I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>. Default | |
1534 value: I<AutoDetect>. | |
1535 | |
1536 =item B<--OutDelim> I<comma | tab | semicolon> | |
1537 | |
1538 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>. | |
1539 Default value: I<comma>. | |
1540 | |
1541 =item B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue> | |
1542 | |
1543 Specify how similarity or distance values calculated for fingerprints vector and bit-vector strings | |
1544 are written to the output CSV/TSV text file(s): Generate text files containing rows and columns | |
1545 with their labels corresponding to compound IDs and each matrix element value corresponding to | |
1546 similarity or distance between corresponding compounds; Generate text files containing rows containing | |
1547 compoundIDs for two compounds followed by similarity or distance value between these compounds. | |
1548 | |
1549 Possible values: I<RowsAndColumns, or IDPairsAndValue>. Default value: I<RowsAndColumns>. | |
1550 | |
1551 The value of B<--OutMatrixFormat> in conjunction with B<--OutMatrixType> determines type | |
1552 of data written to output files and allows generation of up to 6 different output data formats: | |
1553 | |
1554 OutMatrixFormat OutMatrixType | |
1555 | |
1556 RowsAndColumns FullMatrix [ DEFAULT ] | |
1557 RowsAndColumns UpperTriangularMatrix | |
1558 RowsAndColumns LowerTriangularMatrix | |
1559 | |
1560 IDPairsAndValue FullMatrix | |
1561 IDPairsAndValue UpperTriangularMatrix | |
1562 IDPairsAndValue LowerTriangularMatrix | |
1563 | |
1564 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
1565 I<FullMatrix> value of B<--OutMatrixType>: | |
1566 | |
1567 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
1568 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ... | |
1569 "Cmpd2","0.04","1","0.06","0.05","0.19","0.07",... ... | |
1570 "Cmpd3","0.25","0.06","1","0.12","0.22","0.25",... ... | |
1571 "Cmpd4","0.13","0.05","0.12","1","0.11","0.13",... ... | |
1572 "Cmpd5","0.11","0.19","0.22","0.11","1","0.17",... ... | |
1573 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1",... ... | |
1574 ... ... .. | |
1575 ... ... .. | |
1576 ... ... .. | |
1577 | |
1578 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
1579 I<UpperTriangularMatrix> value of B<--OutMatrixType>: | |
1580 | |
1581 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
1582 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ... | |
1583 "Cmpd2","1","0.06","0.05","0.19","0.07",... ... | |
1584 "Cmpd3","1","0.12","0.22","0.25",... ... | |
1585 "Cmpd4","1","0.11","0.13",... ... | |
1586 "Cmpd5","1","0.17",... ... | |
1587 "Cmpd6","1",... ... | |
1588 ... ... .. | |
1589 ... ... .. | |
1590 ... ... .. | |
1591 | |
1592 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for | |
1593 I<LowerTriangularMatrix> value of B<--OutMatrixType>: | |
1594 | |
1595 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ... | |
1596 "Cmpd1","1" | |
1597 "Cmpd2","0.04","1" | |
1598 "Cmpd3","0.25","0.06","1" | |
1599 "Cmpd4","0.13","0.05","0.12","1" | |
1600 "Cmpd5","0.11","0.19","0.22","0.11","1" | |
1601 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1" | |
1602 ... ... .. | |
1603 ... ... .. | |
1604 ... ... .. | |
1605 | |
1606 | |
1607 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
1608 I<FullMatrix> value of B<--OutMatrixType>: | |
1609 | |
1610 "CmpdID1","CmpdID2","Coefficient Value" | |
1611 "Cmpd1","Cmpd1","1" | |
1612 "Cmpd1","Cmpd2","0.04" | |
1613 "Cmpd1","Cmpd3","0.25" | |
1614 "Cmpd1","Cmpd4","0.13" | |
1615 ... ... ... | |
1616 ... ... ... | |
1617 ... ... ... | |
1618 "Cmpd2","Cmpd1","0.04" | |
1619 "Cmpd2","Cmpd2","1" | |
1620 "Cmpd2","Cmpd3","0.06" | |
1621 "Cmpd2","Cmpd4","0.05" | |
1622 ... ... ... | |
1623 ... ... ... | |
1624 ... ... ... | |
1625 "Cmpd3","Cmpd1","0.25" | |
1626 "Cmpd3","Cmpd2","0.06" | |
1627 "Cmpd3","Cmpd3","1" | |
1628 "Cmpd3","Cmpd4","0.12" | |
1629 ... ... ... | |
1630 ... ... ... | |
1631 ... ... ... | |
1632 | |
1633 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
1634 I<UpperTriangularMatrix> value of B<--OutMatrixType>: | |
1635 | |
1636 "CmpdID1","CmpdID2","Coefficient Value" | |
1637 "Cmpd1","Cmpd1","1" | |
1638 "Cmpd1","Cmpd2","0.04" | |
1639 "Cmpd1","Cmpd3","0.25" | |
1640 "Cmpd1","Cmpd4","0.13" | |
1641 ... ... ... | |
1642 ... ... ... | |
1643 ... ... ... | |
1644 "Cmpd2","Cmpd2","1" | |
1645 "Cmpd2","Cmpd3","0.06" | |
1646 "Cmpd2","Cmpd4","0.05" | |
1647 ... ... ... | |
1648 ... ... ... | |
1649 ... ... ... | |
1650 "Cmpd3","Cmpd3","1" | |
1651 "Cmpd3","Cmpd4","0.12" | |
1652 ... ... ... | |
1653 ... ... ... | |
1654 ... ... ... | |
1655 | |
1656 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for | |
1657 I<LowerTriangularMatrix> value of B<--OutMatrixType>: | |
1658 | |
1659 "CmpdID1","CmpdID2","Coefficient Value" | |
1660 "Cmpd1","Cmpd1","1" | |
1661 "Cmpd2","Cmpd1","0.04" | |
1662 "Cmpd2","Cmpd2","1" | |
1663 "Cmpd3","Cmpd1","0.25" | |
1664 "Cmpd3","Cmpd2","0.06" | |
1665 "Cmpd3","Cmpd3","1" | |
1666 "Cmpd4","Cmpd1","0.13" | |
1667 "Cmpd4","Cmpd2","0.05" | |
1668 "Cmpd4","Cmpd3","0.12" | |
1669 "Cmpd4","Cmpd4","1" | |
1670 ... ... ... | |
1671 ... ... ... | |
1672 ... ... ... | |
1673 | |
1674 =item B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix> | |
1675 | |
1676 Type of similarity or distance matrix to calculate for fingerprints vector and bit-vector strings: | |
1677 Calculate full matrix; Calculate lower triangular matrix including diagonal; Calculate upper triangular | |
1678 matrix including diagonal. | |
1679 | |
1680 Possible values: I<FullMatrix, UpperTriangularMatrix, or LowerTriangularMatrix>. Default value: | |
1681 I<FullMatrix>. | |
1682 | |
1683 The value of B<--OutMatrixType> in conjunction with B<--OutMatrixFormat> determines type | |
1684 of data written to output files. | |
1685 | |
1686 =item B<-o, --overwrite> | |
1687 | |
1688 Overwrite existing files. | |
1689 | |
1690 =item B<-p, --precision> I<number> | |
1691 | |
1692 Precision of calculated values in the output file. Default: up to I<2> decimal places. | |
1693 Valid values: positive integers. | |
1694 | |
1695 =item B<-q, --quote> I<Yes | No> | |
1696 | |
1697 Put quote around column values in output CSV/TSV text file(s). Possible values: | |
1698 I<Yes or No>. Default value: I<Yes>. | |
1699 | |
1700 =item B<-r, --root> I<RootName> | |
1701 | |
1702 New file name is generated using the root: <Root><BitVectorComparisonMode>.<Ext> or | |
1703 <Root><VectorComparisonMode><VectorComparisonFormulism>.<Ext>. | |
1704 The csv, and tsv <Ext> values are used for comma/semicolon, and tab delimited text files | |
1705 respectively. This option is ignored for multiple input files. | |
1706 | |
1707 =item B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity,[ManhattanDistance,...]"> | |
1708 | |
1709 Specify what similarity or distance coefficients to use for calculating similarity matrices for | |
1710 fingerprint vector strings data values in I<TextFile(s)>: calculate similarity matrices for all | |
1711 supported similarity and distance coefficients or specify a comma delimited list of similarity | |
1712 and distance coefficients. Possible values: I<All | "TanimotoSimilarity,[ManhattanDistance,...]">. | |
1713 Default: I<TanimotoSimilarity>. | |
1714 | |
1715 The value of B<-v, --VectorComparisonMode>, in conjunction with B<--VectorComparisonFormulism>, | |
1716 decides which type of similarity and distance coefficient formulism gets used. | |
1717 | |
1718 I<All> uses complete list of supported similarity and distance coefficients: I<CosineSimilarity, | |
1719 CzekanowskiSimilarity, DiceSimilarity, OchiaiSimilarity, JaccardSimilarity, SorensonSimilarity, TanimotoSimilarity, | |
1720 CityBlockDistance, EuclideanDistance, HammingDistance, ManhattanDistance, SoergelDistance>. These | |
1721 similarity and distance coefficients are described below. | |
1722 | |
1723 B<FingerprintsVector.pm> module, used to calculate similarity and distance coefficients, | |
1724 provides support to perform comparison between vectors containing three different types of | |
1725 values: | |
1726 | |
1727 Type I: OrderedNumericalValues | |
1728 | |
1729 . Size of two vectors are same | |
1730 . Vectors contain real values in a specific order. For example: MACCS keys | |
1731 count, Topological pharmacophore atom pairs and so on. | |
1732 | |
1733 Type II: UnorderedNumericalValues | |
1734 | |
1735 . Size of two vectors might not be same | |
1736 . Vectors contain unordered real values identified by value IDs. For example: | |
1737 Topological atom pairs, Topological atom torsions and so on | |
1738 | |
1739 Type III: AlphaNumericalValues | |
1740 | |
1741 . Size of two vectors might not be same | |
1742 . Vectors contain unordered alphanumerical values. For example: Extended | |
1743 connectivity fingerprints, atom neighborhood fingerprints. | |
1744 | |
1745 Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues | |
1746 or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues | |
1747 using value IDs for UnorderedNumericalValues and the values themselves for AlphaNumericalValues. | |
1748 | |
1749 Three forms of similarity and distance calculation between two vectors, specified using B<--VectorComparisonFormulism> | |
1750 option, are supported: I<AlgebraicForm, BinaryForm or SetTheoreticForm>. | |
1751 | |
1752 For I<BinaryForm>, the ordered list of processed final vector values containing the value or | |
1753 count of each unique value type is simply converted into a binary vector containing 1s and 0s | |
1754 corresponding to presence or absence of values before calculating similarity or distance between | |
1755 two vectors. | |
1756 | |
1757 For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let: | |
1758 | |
1759 N = Number of values in A or B | |
1760 | |
1761 Xa = Values of vector A | |
1762 Xb = Values of vector B | |
1763 | |
1764 Xai = Value of ith element in A | |
1765 Xbi = Value of ith element in B | |
1766 | |
1767 SUM = Sum of i over N values | |
1768 | |
1769 For SetTheoreticForm of calculation between two vectors, let: | |
1770 | |
1771 SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) ) | |
1772 SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) | |
1773 | |
1774 For BinaryForm of calculation between two vectors, let: | |
1775 | |
1776 Na = Number of bits set to "1" in A = SUM ( Xai ) | |
1777 Nb = Number of bits set to "1" in B = SUM ( Xbi ) | |
1778 Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi ) | |
1779 Nd = Number of bits set to "0" in both A and B | |
1780 = SUM ( 1 - Xai - Xbi + Xai * Xbi) | |
1781 | |
1782 N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd | |
1783 | |
1784 Additionally, for BinaryForm various values also correspond to: | |
1785 | |
1786 Na = | Xa | | |
1787 Nb = | Xb | | |
1788 Nc = | SetIntersectionXaXb | | |
1789 Nd = N - | SetDifferenceXaXb | | |
1790 | |
1791 | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc | |
1792 = | Xa | + | Xb | - | SetIntersectionXaXb | | |
1793 | |
1794 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B | |
1795 in I<AlgebraicForm, BinaryForm and SetTheoreticForm> are defined as follows: | |
1796 | |
1797 B<CityBlockDistance>: ( same as HammingDistance and ManhattanDistance) | |
1798 | |
1799 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
1800 | |
1801 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
1802 | |
1803 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
1804 | |
1805 B<CosineSimilarity>: ( same as OchiaiSimilarity) | |
1806 | |
1807 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) | |
1808 | |
1809 I<BinaryForm>: Nc / SQRT ( Na * Nb) | |
1810 | |
1811 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) | |
1812 | |
1813 B<CzekanowskiSimilarity>: ( same as DiceSimilarity and SorensonSimilarity) | |
1814 | |
1815 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
1816 | |
1817 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
1818 | |
1819 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
1820 | |
1821 B<DiceSimilarity>: ( same as CzekanowskiSimilarity and SorensonSimilarity) | |
1822 | |
1823 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
1824 | |
1825 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
1826 | |
1827 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
1828 | |
1829 B<EuclideanDistance>: | |
1830 | |
1831 I<AlgebraicForm>: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) ) | |
1832 | |
1833 I<BinaryForm>: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc ) | |
1834 | |
1835 I<SetTheoreticForm>: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) | |
1836 | |
1837 B<HammingDistance>: ( same as CityBlockDistance and ManhattanDistance) | |
1838 | |
1839 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
1840 | |
1841 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
1842 | |
1843 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
1844 | |
1845 B<JaccardSimilarity>: ( same as TanimotoSimilarity) | |
1846 | |
1847 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ) | |
1848 | |
1849 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) | |
1850 | |
1851 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
1852 | |
1853 B<ManhattanDistance>: ( same as CityBlockDistance and HammingDistance) | |
1854 | |
1855 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) | |
1856 | |
1857 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc | |
1858 | |
1859 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) | |
1860 | |
1861 B<OchiaiSimilarity>: ( same as CosineSimilarity) | |
1862 | |
1863 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) ) | |
1864 | |
1865 I<BinaryForm>: Nc / SQRT ( Na * Nb) | |
1866 | |
1867 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) ) | |
1868 | |
1869 B<SorensonSimilarity>: ( same as CzekanowskiSimilarity and DiceSimilarity) | |
1870 | |
1871 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) ) | |
1872 | |
1873 I<BinaryForm>: 2 * Nc / ( Na + Nb ) | |
1874 | |
1875 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) ) | |
1876 | |
1877 B<SoergelDistance>: | |
1878 | |
1879 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) ) | |
1880 | |
1881 I<BinaryForm>: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc ) | |
1882 | |
1883 I<SetTheoreticForm>: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
1884 | |
1885 B<TanimotoSimilarity>: ( same as JaccardSimilarity) | |
1886 | |
1887 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) ) | |
1888 | |
1889 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) | |
1890 | |
1891 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) ) | |
1892 | |
1893 =item B<--VectorComparisonFormulism> I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]"> | |
1894 | |
1895 Specify fingerprints vector comparison formulism to use for calculation of similarity and distance | |
1896 coefficients during B<-v, --VectorComparisonMode>: use all supported comparison formulisms | |
1897 or specify a comma delimited list. Possible values: I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]">. | |
1898 Default value: I<AlgebraicForm>. | |
1899 | |
1900 I<All> uses all three forms of supported vector comparison formulism for values of B<-v, --VectorComparisonMode> | |
1901 option. | |
1902 | |
1903 For fingerprint vector strings containing B<AlphaNumericalValues> data values - B<ExtendedConnectivityFingerprints>, | |
1904 B<AtomNeighborhoodsFingerprints> and so on - all three formulisms result in the same value during similarity and distance | |
1905 calculations. | |
1906 | |
1907 =item B<-w, --WorkingDir> I<DirName> | |
1908 | |
1909 Location of working directory. Default: current directory. | |
1910 | |
1911 =back | |
1912 | |
1913 =head1 EXAMPLES | |
1914 | |
1915 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
1916 bit-vector strings data corresponding to supported fingerprints in text file present in a column | |
1917 name containing Fingerprint substring by loading all fingerprints data into memory and create a | |
1918 SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name | |
1919 containing CompoundID substring, type: | |
1920 | |
1921 % SimilarityMatricesFingerprints.pl -o SampleFPHex.csv | |
1922 | |
1923 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
1924 bit-vector strings data corresponding to supported fingerprints in SD File present in a data field | |
1925 with Fingerprint substring in its label by loading all fingerprints data into memory and create a | |
1926 SampleFPHexTanimotoSimilarity.csv file containing sequentially generated compound IDs with | |
1927 Cmpd prefix, type: | |
1928 | |
1929 % SimilarityMatricesFingerprints.pl -o SampleFPHex.sdf | |
1930 | |
1931 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
1932 bit-vector strings data corresponding to supported fingerprints in FP file by loading all fingerprints | |
1933 data into memory and create a SampleFPHexTanimotoSimilarity.csv file along with compound IDs | |
1934 retrieved from FP file, type: | |
1935 | |
1936 % SimilarityMatricesFingerprints.pl -o SampleFPHex.fpf | |
1937 | |
1938 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
1939 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
1940 column name containing Fingerprint substring by loading all fingerprints data into memory and create | |
1941 a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name | |
1942 containing CompoundID substring, type: | |
1943 | |
1944 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory | |
1945 --OutMatrixFormat RowsAndColumns --OutMatrixType LowerTriangularMatrix | |
1946 SampleFPHex.csv | |
1947 | |
1948 To generate an upper triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
1949 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
1950 column name containing Fingerprint substring by loading all fingerprints data into memory and create | |
1951 a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing compound IDs retrieved | |
1952 from column name containing CompoundID substring, type: | |
1953 | |
1954 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory | |
1955 --OutMatrixFormat IDPairsAndValue --OutMatrixType UpperTriangularMatrix | |
1956 SampleFPHex.csv | |
1957 | |
1958 To generate a full similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
1959 bit-vector strings data corresponding to supported fingerprints in text file present in a column | |
1960 name containing Fingerprint substring by scanning file without loading all fingerprints data into memory | |
1961 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from | |
1962 column name containing CompoundID substring, type: | |
1963 | |
1964 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile | |
1965 --OutMatrixFormat RowsAndColumns --OutMatrixType FullMatrix | |
1966 SampleFPHex.csv | |
1967 | |
1968 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for | |
1969 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a | |
1970 column name containing Fingerprint substring by scanning file without loading all fingerprints data into | |
1971 memory and create a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing | |
1972 compound IDs retrieved from column name containing CompoundID substring, type: | |
1973 | |
1974 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile | |
1975 --OutMatrixFormat IDPairsAndValue --OutMatrixType LowerTriangularMatrix | |
1976 SampleFPHex.csv | |
1977 | |
1978 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
1979 for fingerprints vector strings data corresponding to supported fingerprints in text file present in a column name | |
1980 containing Fingerprint substring and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file | |
1981 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
1982 | |
1983 % SimilarityMatricesFingerprints.pl -o SampleFPCount.csv | |
1984 | |
1985 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
1986 for fingerprints vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
1987 Fingerprint substring in its label and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file | |
1988 containing sequentially generated compound IDs with Cmpd prefix, type: | |
1989 | |
1990 % SimilarityMatricesFingerprints.pl -o SampleFPCount.sdf | |
1991 | |
1992 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism | |
1993 for fingerprints vector strings data corresponding to supported fingerprints in FP file and create a | |
1994 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file along with compound IDs retrieved from FP file, type: | |
1995 | |
1996 % SimilarityMatricesFingerprints.pl -o SampleFPCount.fpf | |
1997 | |
1998 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
1999 bit-vector strings data corresponding to supported fingerprints in text file present in a column name | |
2000 containing Fingerprint substring and create a SampleFPHexTanimotoSimilarity.csv file in | |
2001 IDPairsAndValue format containing compound IDs retrieved from column name containing | |
2002 CompoundID substring, type: | |
2003 | |
2004 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
2005 SampleFPHex.csv | |
2006 | |
2007 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2008 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
2009 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file in | |
2010 IDPairsAndValue format containing sequentially generated compound IDs with Cmpd prefix, | |
2011 type: | |
2012 | |
2013 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
2014 SampleFPHex.sdf | |
2015 | |
2016 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2017 bit-vector strings data corresponding to supported fingerprints in FP file and create a | |
2018 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format along with compound IDs retrieved | |
2019 from FP file, type: | |
2020 | |
2021 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o | |
2022 SampleFPHex.fpf | |
2023 | |
2024 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2025 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with | |
2026 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file | |
2027 containing compound IDs from mol name line, type: | |
2028 | |
2029 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolName -o | |
2030 SampleFPHex.sdf | |
2031 | |
2032 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2033 bit-vector strings data corresponding to supported fingerprints present in a data field with | |
2034 Fingerprint substring in its label and create a SampleFPBinTanimotoSimilarity.csv file | |
2035 containing compound IDs from data field name Mol_ID, type: | |
2036 | |
2037 % SimilarityMatricesFingerprints.pl --CompoundIDMode DataField | |
2038 --CompoundIDField Mol_ID -o SampleFPBin.sdf | |
2039 | |
2040 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficient | |
2041 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a column | |
2042 name containing Fingerprint substring and create SampleFPBin[CoefficientName]Similarity.csv files | |
2043 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
2044 | |
2045 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity, | |
2046 TanimotoSimilarity" -o SampleFPBin.csv | |
2047 | |
2048 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficient | |
2049 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a data field with | |
2050 Fingerprint substring in its label and create SampleFPBin[CoefficientName]Similarity.csv files | |
2051 containing sequentially generated compound IDs with Cmpd prefix, type: | |
2052 | |
2053 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity, | |
2054 TanimotoSimilarity" -o SampleFPBin.sdf | |
2055 | |
2056 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2057 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
2058 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]AlgebraicForm.csv | |
2059 files containing compound IDs retrieved from column name containing CompoundID substring, type: | |
2060 | |
2061 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
2062 TanimotoSimilarity" -o SampleFPCount.csv | |
2063 | |
2064 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2065 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
2066 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]AlgebraicForm.csv | |
2067 files containing sequentially generated compound IDs with Cmpd prefix, type: | |
2068 | |
2069 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
2070 TanimotoSimilarity" -o SampleFPCount.sdf | |
2071 | |
2072 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2073 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
2074 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]BinaryForm.csv | |
2075 files containing compound IDs retrieved from column name containing CompoundID substring, type: | |
2076 | |
2077 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
2078 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o | |
2079 SampleFPCount.csv | |
2080 | |
2081 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2082 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in | |
2083 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]BinaryForm.csv | |
2084 files containing sequentially generated compound IDs with Cmpd prefix, type: | |
2085 | |
2086 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
2087 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o | |
2088 SampleFPCount.sdf | |
2089 | |
2090 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2091 all supported comparison formulisms for fingerprints vector strings data corresponding to supported | |
2092 fingerprints present in a column name containing Fingerprint substring and create | |
2093 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs retrieved from column | |
2094 name containing CompoundID substring, type: | |
2095 | |
2096 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance, | |
2097 TanimotoSimilarity" --VectorComparisonFormulism All -o SampleFPCount.csv | |
2098 | |
2099 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using | |
2100 all supported comparison formulisms for fingerprints vector strings data corresponding to supported | |
2101 fingerprints present in a data field with Fingerprint substring in its label and create | |
2102 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated | |
2103 compound IDs with Cmpd prefix, type: | |
2104 | |
2105 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,TanimotoSimilarity" | |
2106 --VectorComparisonFormulism All -o SampleFPCount.sdf | |
2107 | |
2108 To generate similarity matrices corresponding to all available similarity coefficient for fingerprints | |
2109 bit-vector strings data corresponding to supported fingerprints present in a column name | |
2110 containing Fingerprint substring and create SampleFPHex[CoefficientName].csv files | |
2111 containing compound IDs retrieved from column name containing CompoundID substring, type: | |
2112 | |
2113 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode | |
2114 All --alpha 0.5 --beta 0.5 -o SampleFPHex.csv | |
2115 | |
2116 To generate similarity matrices corresponding to all available similarity coefficient for fingerprints | |
2117 bit-vector strings data corresponding to supported fingerprints present in a data field with Fingerprint | |
2118 substring in its label and create SampleFPHex[CoefficientName].csv files containing sequentially | |
2119 generated compound IDs with Cmpd prefix, type: | |
2120 | |
2121 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode | |
2122 All --alpha 0.5 --beta 0.5 -o SampleFPHex.sdf | |
2123 | |
2124 To generate similarity matrices corresponding to all available similarity and distance coefficients using | |
2125 all comparison formulism for fingerprints vector strings data corresponding to supported fingerprints | |
2126 present in a column name containing Fingerprint substring and create | |
2127 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs | |
2128 retrieved from column name containing CompoundID substring, type: | |
2129 | |
2130 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode | |
2131 All --VectorComparisonFormulism All -o SampleFPCount.csv | |
2132 | |
2133 To generate similarity matrices corresponding to all available similarity and distance coefficients using | |
2134 all comparison formulism for fingerprints vector strings data corresponding to supported fingerprints | |
2135 present in a data field with Fingerprint substring in its label and create | |
2136 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated | |
2137 compound IDs with Cmpd prefix, type: | |
2138 | |
2139 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode | |
2140 All --VectorComparisonFormulism All -o SampleFPCount.sdf | |
2141 | |
2142 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2143 bit-vector strings data corresponding to supported fingerprints present in column number 2 | |
2144 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column | |
2145 number 1, type: | |
2146 | |
2147 % SimilarityMatricesFingerprints.pl --ColMode ColNum --CompoundIDCol 1 | |
2148 --FingerprintsCol 2 -o SampleFPHex.csv | |
2149 | |
2150 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2151 bit-vector strings data corresponding to supported fingerprints present in a data field name | |
2152 Fingerprints and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs | |
2153 present in data field name Mol_ID, type: | |
2154 | |
2155 % SimilarityMatricesFingerprints.pl --FingerprintsField Fingerprints | |
2156 --CompoundIDMode DataField --CompoundIDField Mol_ID -o SampleFPHex.sdf | |
2157 | |
2158 To generate a similarity matrix corresponding to Tversky similarity coefficient for fingerprints | |
2159 bit-vector strings data corresponding to supported fingerprints present in a column named Fingerprints | |
2160 and create a SampleFPHexTverskySimilarity.tsv file containing compound IDs retrieved from column named | |
2161 CompoundID, type: | |
2162 | |
2163 % SimilarityMatricesFingerprints.pl --BitVectorComparisonMode | |
2164 TverskySimilarity --alpha 0.5 --ColMode ColLabel --CompoundIDCol | |
2165 CompoundID --FingerprintsCol Fingerprints --OutDelim Tab --quote No | |
2166 -o SampleFPHex.csv | |
2167 | |
2168 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2169 bit-vector strings data corresponding to supported fingerprints present in a data field | |
2170 with Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file | |
2171 containing compound IDs from molname line or sequentially generated compound IDs | |
2172 with Mol prefix, type: | |
2173 | |
2174 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolnameOrLabelPrefix | |
2175 --CompoundIDPrefix Mol -o SampleFPHex.sdf | |
2176 | |
2177 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints | |
2178 bit-vector strings data corresponding to supported fingerprints present in a data field with | |
2179 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.tsv file | |
2180 containing sequentially generated compound IDs with Cmpd prefix, type: | |
2181 | |
2182 % SimilarityMatricesFingerprints.pl --OutDelim Tab --quote No -o SampleFPHex.sdf | |
2183 | |
2184 =head1 AUTHOR | |
2185 | |
2186 Manish Sud <msud@san.rr.com> | |
2187 | |
2188 =head1 SEE ALSO | |
2189 | |
2190 InfoFingerprintsFiles.pl, SimilaritySearchingFingerprints.pl, AtomNeighborhoodsFingerprints.pl, | |
2191 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl, | |
2192 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl, | |
2193 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl | |
2194 | |
2195 =head1 COPYRIGHT | |
2196 | |
2197 Copyright (C) 2015 Manish Sud. All rights reserved. | |
2198 | |
2199 This file is part of MayaChemTools. | |
2200 | |
2201 MayaChemTools is free software; you can redistribute it and/or modify it under | |
2202 the terms of the GNU Lesser General Public License as published by the Free | |
2203 Software Foundation; either version 3 of the License, or (at your option) | |
2204 any later version. | |
2205 | |
2206 =cut |