comparison bin/SimilarityMatricesFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: SimilarityMatricesFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.21 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use File::Copy;
34 use Text::ParseWords;
35 use Benchmark;
36 use FileUtil;
37 use TextUtil;
38 use Fingerprints::FingerprintsFileUtil;
39 use Fingerprints::FingerprintsBitVector;
40 use Fingerprints::FingerprintsVector;
41
# Script-wide state: script name, raw command line options and timing information...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Disable STDOUT buffering so that progress messages show up right away...
$| = 1;

# Announce startup and start the clock...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Set up command line options; print usage and quit when help is requested or no
# command line arguments are left...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Build the list of input fingerprints files from the command line arguments...
my(@FingerprintsFilesList);
@FingerprintsFilesList = ExpandFileNames(\@ARGV, "sdf sd fpf fp csv tsv");

# Process options: fills in %OptionsInfo used by the subroutines below...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Check input files: fills in %FingerprintsFilesInfo, including the FileOkay flags...
print "Checking input fingerprints file(s)...\n";
my(%FingerprintsFilesInfo);
RetrieveFingerprintsFilesInfo();

# Generate similarity matrices for each input file which passed the checks...
my($FileIndex);
print "\nProcessing fingerprints files...\n" if (@FingerprintsFilesList > 1);

FILEINDEX: for $FileIndex (0 .. $#FingerprintsFilesList) {
  next FILEINDEX unless $FingerprintsFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $FingerprintsFilesList[$FileIndex]...\n";
  GenerateSimilarityMatrices($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
87
88 ###############################################################################
89
# Generate similarity matrices for the fingerprints file at position $FileIndex in
# @FingerprintsFilesList...
#
# The fingerprints data is prepared first, the matrices are generated by the routine
# matching the fingerprints string mode recorded for the file (bit vector or vector),
# and the prepared data is cleaned up at the end. No matrices are generated for a
# file flagged with neither mode.
#
sub GenerateSimilarityMatrices {
  my($FileIndex) = @_;

  ProcessFingerprintsData($FileIndex);

  # Pick the generator; bit vector string mode takes precedence over vector string mode...
  my $GeneratorRef = $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$FileIndex] ? \&GenerateSimilarityMatricesForFingerprintsBitVectors
    : $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$FileIndex] ? \&GenerateSimilarityMatricesForFingerprintsVectors
    : undef;

  if (defined $GeneratorRef) {
    $GeneratorRef->($FileIndex);
  }

  CleanupFingerprintsData($FileIndex);
}
106
# Generate one similarity matrix file for each specified bit vector comparison measure...
#
# The output file name is the output file root for the file followed by the comparison
# measure name and the output file extension. The method name and its parameters are
# looked up in %OptionsInfo using the lower case measure name.
#
sub GenerateSimilarityMatricesForFingerprintsBitVectors {
  my($FileIndex) = @_;

  my $OutFileRoot = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex];
  my $OutFileExt = $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

  for my $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
    my $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};
    my $MeasureKey = lc($ComparisonMeasure);

    my $NewTextFile = $OutFileRoot . "${ComparisonMeasure}." . $OutFileExt;
    my $MethodName = $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef}->{$MeasureKey};
    my @MethodParameters = @{$OptionsInfo{SpecifiedBitVectorComparisonsParameterRef}->{$MeasureKey}};

    GenerateSimilarityMatrix($FileIndex, $NewTextFile, $MethodName, \@MethodParameters);
  }
}
125
# Generate one matrix file for each combination of specified vector comparison measure
# and vector comparison mode...
#
# The output file name is the output file root for the file followed by the comparison
# measure name, the comparison mode and the output file extension. The comparison mode
# is passed to the comparison method ahead of any measure specific parameters.
#
sub GenerateSimilarityMatricesForFingerprintsVectors {
  my($FileIndex) = @_;

  my $OutFileRoot = $FingerprintsFilesInfo{OutFileRoot}[$FileIndex];
  my $OutFileExt = $FingerprintsFilesInfo{OutFileExt}[$FileIndex];

  for my $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
    my $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};

    for my $ComparisonMode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
      my $MeasureKey = lc($ComparisonMeasure);

      my $NewTextFile = $OutFileRoot . "${ComparisonMeasure}${ComparisonMode}." . $OutFileExt;
      my $MethodName = $OptionsInfo{SpecifiedVectorComparisonsMethodRef}->{$MeasureKey};
      my @MethodParameters = ($ComparisonMode, @{$OptionsInfo{SpecifiedVectorComparisonsParameterRef}->{$MeasureKey}});

      GenerateSimilarityMatrix($FileIndex, $NewTextFile, $MethodName, \@MethodParameters);
    }
  }
}
148
149 # Calculate similarity matrix and write it out...
150 #
151 sub GenerateSimilarityMatrix {
152 my($FileIndex, $NewTextFile, $MethodName, $MethodParametersRef) = @_;
153
154 print "\nGenerating $NewTextFile...\n";
155
156 # Open new file and write out column labels...
157 open NEWTEXTFILE, ">$NewTextFile" or die "Error: Can't open $NewTextFile: $! \n";
158 WriteColumnLabels($FileIndex, \*NEWTEXTFILE);
159
160 # Calculate and write out similarity matrix values...
161 if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
162 GenerateSimilarityMatrixUsingMemoryData($FileIndex, \*NEWTEXTFILE, $MethodName, $MethodParametersRef);
163 }
164 elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
165 GenerateSimilarityMatrixUsingFileData($FileIndex, \*NEWTEXTFILE, $MethodName, $MethodParametersRef);
166 }
167 else {
168 warn "Warning: Input data mode, $OptionsInfo{InputDataMode}, is not supported.\n";
169 }
170
171 # Close new text file...
172 close NEWTEXTFILE;
173
174 }
175
# Calculate similarity values from fingerprints objects already loaded in memory and
# write them out to the file handle $NewTextFileRef...
#
# Every pair of fingerprints objects not skipped by SkipMatrixData is compared by invoking
# $MethodName on the row object with the column object and the method parameters. A defined,
# non-empty value is formatted using $OptionsInfo{Precision} decimal places and numified
# by adding 0; anything else is written out as an empty string.
#
# In rows and columns format, each row starts with the row compound ID followed by a
# delimiter and value for each column written; in ID pairs and value format, each value
# is written on its own line along with both compound IDs.
#
sub GenerateSimilarityMatrixUsingMemoryData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;

  my $ObjectsRef = $FingerprintsFilesInfo{FingerprintsObjectsRef};
  my $CompoundIDsRef = $FingerprintsFilesInfo{CompundIDsRef};
  my $LastIndex = $#{$ObjectsRef};

  for my $RowIndex (0 .. $LastIndex) {
    my $RowObject = $ObjectsRef->[$RowIndex];
    my $RowID = $CompoundIDsRef->[$RowIndex];

    # Row label...
    print $NewTextFileRef "$OptionsInfo{OutQuoteValue}${RowID}$OptionsInfo{OutQuoteValue}" if $OptionsInfo{WriteRowsAndColumns};

    for my $ColIndex (0 .. $LastIndex) {
      next if SkipMatrixData($RowIndex, $ColIndex);

      my $Value = $RowObject->$MethodName($ObjectsRef->[$ColIndex], @{$MethodParametersRef});
      if (defined($Value) && length($Value)) {
        $Value = sprintf("%.$OptionsInfo{Precision}f", $Value) + 0;
      }
      else {
        $Value = '';
      }

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${Value}$OptionsInfo{OutQuoteValue}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        my @LineWords = ($RowID, $CompoundIDsRef->[$ColIndex], $Value);
        my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$Line\n";
      }
    }

    # Terminate the row...
    print $NewTextFileRef "\n" if $OptionsInfo{WriteRowsAndColumns};
  }
}
218
# Calculate and write out similarity values by retrieving and processing data
# from fingerprints file...
#
# Parameters:
#   $FileIndex           - Index of the input file in @FingerprintsFilesList
#   $NewTextFileRef      - File handle for writing out matrix values
#   $MethodName          - Name of the comparison method invoked on fingerprints objects
#   $MethodParametersRef - Reference to an array of additional method parameters
#
# The fingerprints file is read one entry at a time to provide row data. For each valid
# row entry, the temporary fingerprints file is opened and read from its start to provide
# column data. Row and column indices start from 1 and only count valid entries.
#
# A summary of the number of entries read, processed and ignored is printed at the end.
#
sub GenerateSimilarityMatrixUsingFileData {
  my($FileIndex, $NewTextFileRef, $MethodName, $MethodParametersRef) = @_;
  my($RowIndex, $ColIndex, $FingerprintsFileIO, $TmpFingerprintsFileIO, $FingerprintsObject1, $FingerprintsObject2, $CmpdID1, $CmpdID2, $FingerprintsCount, $IgnoredFingerprintsCount, $Value, $Line, @LineWords);

  print "\nReading and processing fingerprints data...\n";

  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});
  $FingerprintsFileIO->Open();

  $RowIndex = 0; $ColIndex = 0;
  $FingerprintsCount = 0; $IgnoredFingerprintsCount = 0;

  FINGERPRINTSFILEIO: while ($FingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$FingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next FINGERPRINTSFILEIO;
    }
    $RowIndex++;
    $FingerprintsObject1 = $FingerprintsFileIO->GetFingerprints();
    $CmpdID1 = $FingerprintsFileIO->GetCompoundID();

    # Row label...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "$OptionsInfo{OutQuoteValue}${CmpdID1}$OptionsInfo{OutQuoteValue}";
    }

    # Force detail level of 1 to avoid duplicate printing of diagnostic messages for invalid
    # fingerprints data...
    $TmpFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
    $TmpFingerprintsFileIO->Open();

    $ColIndex = 0;
    TMPFINGERPRINTSFILEIO: while ($TmpFingerprintsFileIO->Read()) {
      if (!$TmpFingerprintsFileIO->IsFingerprintsDataValid()) {
        next TMPFINGERPRINTSFILEIO;
      }
      $ColIndex++;

      if (SkipMatrixData($RowIndex, $ColIndex)) {
        next TMPFINGERPRINTSFILEIO;
      }

      $FingerprintsObject2 = $TmpFingerprintsFileIO->GetFingerprints();

      # Format defined, non-empty values to the specified precision and numify by adding 0;
      # write out anything else as an empty string...
      $Value = $FingerprintsObject1->$MethodName($FingerprintsObject2, @{$MethodParametersRef});
      $Value = (defined($Value) && length($Value)) ? (sprintf("%.$OptionsInfo{Precision}f", $Value) + 0) : '';

      if ($OptionsInfo{WriteRowsAndColumns}) {
        print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${Value}$OptionsInfo{OutQuoteValue}";
      }
      elsif ($OptionsInfo{WriteIDPairsAndValue}) {
        $CmpdID2 = $TmpFingerprintsFileIO->GetCompoundID();

        @LineWords = ();
        push @LineWords, ($CmpdID1, $CmpdID2, $Value);
        $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
        print $NewTextFileRef "$Line\n";
      }
    }
    $TmpFingerprintsFileIO->Close();

    # Terminate the row...
    if ($OptionsInfo{WriteRowsAndColumns}) {
      print $NewTextFileRef "\n";
    }
  }

  $FingerprintsFileIO->Close();

  print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
  print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}
295
# Decide whether the matrix value at ($RowIndex, $ColIndex) is left out of the output...
#
# Returns 1 to skip the value and 0 to write it out. Nothing is skipped for a full matrix;
# values below the diagonal are skipped for an upper triangular matrix and values above
# the diagonal for a lower triangular matrix. Nothing is skipped when no matrix type
# flag is set in %OptionsInfo.
#
sub SkipMatrixData {
  my($RowIndex, $ColIndex) = @_;

  if ($OptionsInfo{WriteFullMatrix}) {
    return 0;
  }

  if ($OptionsInfo{WriteUpperTriangularMatrix}) {
    if ($RowIndex > $ColIndex) {
      return 1;
    }
    return 0;
  }

  if ($OptionsInfo{WriteLowerTriangularMatrix}) {
    if ($RowIndex < $ColIndex) {
      return 1;
    }
    return 0;
  }

  return 0;
}
313
# Write out the column labels line to the file handle $NewTextFileRef...
#
# In ID pairs and value format, three fixed labels are written. In rows and columns
# format, an empty label is written followed by compound IDs: the IDs are taken from
# data already loaded in memory or, in scan file mode, collected by reading through
# the fingerprints file and skipping entries with invalid fingerprints data. Any other
# output matrix format generates a warning.
#
sub WriteColumnLabels {
  my($FileIndex, $NewTextFileRef) = @_;
  my($Line);

  if ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    $Line = JoinWords(['CmpdID1', 'CmpdID2', 'Coefficient Value'], $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
    return;
  }

  if ($OptionsInfo{OutMatrixFormat} !~ /^RowsAndColumns$/i) {
    warn "Warning: Output matrix format, $OptionsInfo{OutMatrixFormat}, is not supported.\n";
    return;
  }

  # Rows and columns format...
  if ($OptionsInfo{InputDataMode} =~ /^LoadInMemory$/i) {
    my @LineWords = ('', @{$FingerprintsFilesInfo{CompundIDsRef}});
    $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
  elsif ($OptionsInfo{InputDataMode} =~ /^ScanFile$/i) {
    # Scan file to retrieve compound IDs...
    print "\nProcessing fingerprints file to generate compound IDs...\n";

    # Force detail level of 1 to avoid diagnostics messages for invalid fingerprints data
    # during retrieval of compound IDs as these get printed out during calculation of matrix...
    my $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]}, "DetailLevel" => 1);
    $FingerprintsFileIO->Open();

    # Empty label for the first column...
    print $NewTextFileRef "$OptionsInfo{OutQuoteValue}$OptionsInfo{OutQuoteValue}";

    while ($FingerprintsFileIO->Read()) {
      next unless $FingerprintsFileIO->IsFingerprintsDataValid();

      my $CmpdID = $FingerprintsFileIO->GetCompoundID();
      print $NewTextFileRef "$OptionsInfo{OutDelim}$OptionsInfo{OutQuoteValue}${CmpdID}$OptionsInfo{OutQuoteValue}";
    }
    $FingerprintsFileIO->Close();

    print $NewTextFileRef "\n";

    print "Processing fingerprints file to generate matrix...\n";
  }
}
367
# Prepare fingerprints data for the file at position $FileIndex...
#
# Any previously loaded compound IDs and fingerprints objects are discarded. In load in
# memory mode, compound IDs and fingerprints objects are read from the fingerprints file
# and stored in %FingerprintsFilesInfo. In scan file mode, the fingerprints file is copied
# to its temporary fingerprints file instead; the script dies when the copy fails.
#
sub ProcessFingerprintsData {
  my($FileIndex) = @_;

  $FingerprintsFilesInfo{CompundIDsRef} = undef;
  $FingerprintsFilesInfo{FingerprintsObjectsRef} = undef;

  my $InputDataMode = $OptionsInfo{InputDataMode};

  if ($InputDataMode =~ /^LoadInMemory$/i) {
    my $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{FingerprintsFileIOParameters}[$FileIndex]});

    ($FingerprintsFilesInfo{CompundIDsRef}, $FingerprintsFilesInfo{FingerprintsObjectsRef}) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($FingerprintsFileIO);
  }
  elsif ($InputDataMode =~ /^ScanFile$/i) {
    my $FingerprintsFile = $FingerprintsFilesList[$FileIndex];
    my $TmpFingerprintsFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    # Copy fingerprints file to a tmp file for calculating similarity matrix...
    print "\nCopying fingerprints file, $FingerprintsFile, to temporary fingperints file, $TmpFingerprintsFile...\n";
    copy($FingerprintsFile, $TmpFingerprintsFile) or die "Error: Couldn't copy $FingerprintsFile to $TmpFingerprintsFile: $! \n";
  }
}
394
# Release fingerprints data set up for the file at position $FileIndex...
#
# In load in memory mode, the references to compound IDs and fingerprints objects are
# cleared. In scan file mode, the temporary fingerprints file is deleted; the script
# dies when the file can't be deleted.
#
sub CleanupFingerprintsData {
  my($FileIndex) = @_;

  my $InputDataMode = $OptionsInfo{InputDataMode};

  if ($InputDataMode =~ /^LoadInMemory$/i) {
    for my $DataKey ('CompundIDsRef', 'FingerprintsObjectsRef') {
      $FingerprintsFilesInfo{$DataKey} = undef;
    }
  }
  elsif ($InputDataMode =~ /^ScanFile$/i) {
    # Delete temporary fingerprints file...
    my $TmpFingerprintsFile = $FingerprintsFilesInfo{TmpFingerprintsFile}[$FileIndex];

    print "\nDeleting temporary fingerprints file $TmpFingerprintsFile...\n";
    unlink($TmpFingerprintsFile) or die "Error: Couldn't unlink $TmpFingerprintsFile: $! \n";
  }
}
414
# Check input fingerprints files and collect per file information in %FingerprintsFilesInfo...
#
# %FingerprintsFilesInfo holds parallel arrays indexed by position in @FingerprintsFilesList.
# All entries for a file start out with default values and its FileOkay flag stays 0, with
# a warning, when any of the following holds:
#
#   . The file doesn't exist
#   . No fingerprints file type could be determined for it or the type is not SD, FP or Text
#   . Overwrite is not requested and one of the possible output files already exists
#   . No fingerprints file IO object could be created or its data is not valid
#
# For files passing all checks, the file type, input delimiter, output file root and
# extension, temporary fingerprints file name, file IO parameters for the file and its
# temporary copy, and fingerprints string mode flags are recorded.
#
sub RetrieveFingerprintsFilesInfo {
  my($Index, $InfoKey);

  %FingerprintsFilesInfo = ();
  for $InfoKey (qw(FileOkay FileType InDelim OutFileRoot OutFileExt TmpFingerprintsFile FingerprintsFileIOParameters TmpFingerprintsFileIOParameters FingerprintsBitVectorStringMode FingerprintsVectorStringMode)) {
    $FingerprintsFilesInfo{$InfoKey} = [];
  }

  FILELIST: for $Index (0 .. $#FingerprintsFilesList) {
    # Start out with default values for the file...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 0;
    for $InfoKey (qw(FileType InDelim OutFileRoot OutFileExt TmpFingerprintsFile)) {
      $FingerprintsFilesInfo{$InfoKey}[$Index] = '';
    }
    for $InfoKey (qw(FingerprintsFileIOParameters TmpFingerprintsFileIOParameters)) {
      $FingerprintsFilesInfo{$InfoKey}[$Index] = {};
    }
    for $InfoKey (qw(FingerprintsBitVectorStringMode FingerprintsVectorStringMode)) {
      $FingerprintsFilesInfo{$InfoKey}[$Index] = 0;
    }

    my $FingerprintsFile = $FingerprintsFilesList[$Index];
    unless (-e $FingerprintsFile) {
      warn "Warning: Ignoring file $FingerprintsFile: It doesn't exist\n";
      next FILELIST;
    }

    my $FileType = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
    if (IsEmpty($FileType)) {
      warn "Warning: Ignoring file $FingerprintsFile: It's not a fingerprints file\n";
      next FILELIST;
    }

    my($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);

    # Setup temporary fingerprints file name for scan file mode...
    my $TmpFingerprintsFile = "${FileName}Tmp.${FileExt}";

    my $InDelim = ($FileExt =~ /^tsv$/i) ? 'Tab' : $OptionsInfo{InDelim};

    # Setup output file names; a specified root is only used for a single input file...
    my $OutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

    my $OutFileRoot = $FileName;
    if ($OptionsInfo{OutFileRoot} && (@FingerprintsFilesList == 1)) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    if (!$Options{overwrite}) {
      # Collect names of all possible output files: bit vector comparisons first followed
      # by vector comparisons for each comparison mode...
      my(@PossibleOutFiles, $SpecifiedComparisonMeasure, $ComparisonMeasure, $ComparisonMode, $PossibleOutFile);

      @PossibleOutFiles = ();
      for $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedBitVectorComparisonsRef}}) {
        $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};
        push @PossibleOutFiles, "${OutFileRoot}${ComparisonMeasure}.${OutFileExt}";
      }
      for $SpecifiedComparisonMeasure (@{$OptionsInfo{SpecifiedVectorComparisonsRef}}) {
        $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonsNameRef}->{lc($SpecifiedComparisonMeasure)};
        for $ComparisonMode (@{$OptionsInfo{SpecifiedVectorComparisonModesRef}}) {
          push @PossibleOutFiles, "${OutFileRoot}${ComparisonMeasure}${ComparisonMode}.${OutFileExt}";
        }
      }

      # Ignore the input file as soon as any one of its output files is found...
      for $PossibleOutFile (@PossibleOutFiles) {
        if (-e $PossibleOutFile) {
          warn "Warning: Ignoring file $FingerprintsFile: The file $PossibleOutFile already exists.\n";
          next FILELIST;
        }
      }
    }

    # Setup FingerprintsFileIO parameters: parameters shared by all file types followed by
    # file type specific parameters...
    my %FingerprintsFileIOParameters = ('Name' => $FingerprintsFile, 'Mode' => 'Read', 'FingerprintsStringMode' => $OptionsInfo{Mode}, 'ValidateData' => $OptionsInfo{ValidateData}, 'DetailLevel' => $OptionsInfo{Detail});

    if ($FileType =~ /^SD$/i) {
      $FingerprintsFileIOParameters{FingerprintsFieldLabel} = $OptionsInfo{FingerprintsField};
      $FingerprintsFileIOParameters{CompoundIDMode} = $OptionsInfo{CompoundIDMode};
      $FingerprintsFileIOParameters{CompoundIDFieldLabel} = $OptionsInfo{CompoundIDField};
      $FingerprintsFileIOParameters{CompoundIDPrefix} = $OptionsInfo{CompoundIDPrefix};
    }
    elsif ($FileType =~ /^FP$/i) {
      # No additional parameters...
    }
    elsif ($FileType =~ /^Text$/i) {
      $FingerprintsFileIOParameters{FingerprintsCol} = $OptionsInfo{FingerprintsCol};
      $FingerprintsFileIOParameters{ColMode} = $OptionsInfo{ColMode};
      $FingerprintsFileIOParameters{CompoundIDCol} = $OptionsInfo{CompoundIDCol};
      $FingerprintsFileIOParameters{CompoundIDPrefix} = $OptionsInfo{CompoundIDPrefix};
      $FingerprintsFileIOParameters{InDelim} = $OptionsInfo{InDelim};
    }
    else {
      warn "Warning: File type for fingerprints file, $FingerprintsFile, is not valid. Supported file types: SD, FP or Text\n";
      next FILELIST;
    }

    # Retrieve fingerprints file string mode information...
    my $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%FingerprintsFileIOParameters);

    if (!$FingerprintsFileIO) {
      warn "Warning: Ignoring fingerprints file $FingerprintsFile: It contains invalid fingerprints data\n";
      next FILELIST;
    }
    if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
      warn "Warning: Ignoring fingerprints file $FingerprintsFile: It contains invalid fingerprints data\n";
      next FILELIST;
    }
    my $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
    my $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();

    # All checks passed: record information for the file...
    $FingerprintsFilesInfo{FileOkay}[$Index] = 1;
    $FingerprintsFilesInfo{FileType}[$Index] = $FileType;
    $FingerprintsFilesInfo{InDelim}[$Index] = $InDelim;
    $FingerprintsFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $FingerprintsFilesInfo{OutFileExt}[$Index] = $OutFileExt;
    $FingerprintsFilesInfo{TmpFingerprintsFile}[$Index] = $TmpFingerprintsFile;

    # File IO parameters for the temporary file only differ in the file name...
    $FingerprintsFilesInfo{FingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters };
    $FingerprintsFilesInfo{TmpFingerprintsFileIOParameters}[$Index] = { %FingerprintsFileIOParameters, 'Name' => $TmpFingerprintsFile };

    $FingerprintsFilesInfo{FingerprintsBitVectorStringMode}[$Index] = $FingerprintsBitVectorStringMode;
    $FingerprintsFilesInfo{FingerprintsVectorStringMode}[$Index] = $FingerprintsVectorStringMode;
  }
}
564
# Validate command line option values in %Options and set up %OptionsInfo...
#
# %OptionsInfo is reset and filled in with processed values used throughout the script.
# The script dies with an error message when a column number is not a positive integer
# in ColNum column mode, compound ID and fingerprints columns or fields are the same,
# compound ID field is missing in DataField compound ID mode, or an unsupported output
# matrix format or type is specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{InputDataMode} = $Options{inputdatamode};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{CompoundIDPrefix} = $Options{compoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files...
  $OptionsInfo{ColMode} = $Options{colmode};

  $OptionsInfo{CompoundIDCol} = 'AutoDetect';
  if (IsNotEmpty($Options{compoundidcol})) {
    if ($Options{colmode} =~ /^ColNum$/i && !IsPositiveInteger($Options{compoundidcol})) {
      die "Error: Column value, $Options{compoundidcol}, specified using \"--CompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{CompoundIDCol} = $Options{compoundidcol};
  }

  $OptionsInfo{FingerprintsCol} = 'AutoDetect';
  if (IsNotEmpty($Options{fingerprintscol})) {
    if ($Options{colmode} =~ /^ColNum$/i && !IsPositiveInteger($Options{fingerprintscol})) {
      die "Error: Column value, $Options{fingerprintscol}, specified using \"--FingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{FingerprintsCol} = $Options{fingerprintscol};
  }

  # Compare column values as numbers when both are positive integers and as strings otherwise...
  if (IsNotEmpty($Options{compoundidcol}) && IsNotEmpty($Options{fingerprintscol})) {
    my $SameColumn = (IsPositiveInteger($Options{compoundidcol}) && IsPositiveInteger($Options{fingerprintscol}))
      ? ($Options{compoundidcol} == $Options{fingerprintscol})
      : ($Options{compoundidcol} eq $Options{fingerprintscol});

    if ($SameColumn) {
      die "Error: Values specified using \"--CompoundIDCol\" and \"--FingerprintsCol\", $Options{compoundidcol}, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...
  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDField} = '';

  if ($Options{compoundidmode} =~ /^DataField$/i) {
    unless ($Options{compoundidfield}) {
      die "Error: You must specify a value for \"--CompoundIDField\" option in \"DataField\" \"--CompoundIDMode\". \n";
    }
    $OptionsInfo{CompoundIDField} = $Options{compoundidfield};
  }

  $OptionsInfo{FingerprintsField} = IsNotEmpty($Options{fingerprintsfield}) ? $Options{fingerprintsfield} : 'AutoDetect';

  if ($Options{compoundidfield} && IsNotEmpty($Options{fingerprintsfield}) && ($Options{compoundidfield} eq $Options{fingerprintsfield})) {
    die "Error: Values specified using \"--CompoundIDField\" and \"--Fingerprintsfield\", $Options{compoundidfield}, must be different.\n";
  }

  $OptionsInfo{Detail} = $Options{detail};

  # Input and output delimiters; output delimiter defaults to comma...
  $OptionsInfo{InDelim} = $Options{indelim};

  if ($Options{outdelim} =~ /tab/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = ";";
  }
  else {
    $OptionsInfo{OutDelim} = ",";
  }

  my $QuoteValues = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{OutQuote} = $QuoteValues;
  $OptionsInfo{OutQuoteValue} = $QuoteValues ? '"' : '';

  # Output matrix format...
  $OptionsInfo{OutMatrixFormat} = $Options{outmatrixformat};

  $OptionsInfo{WriteRowsAndColumns} = 0;
  $OptionsInfo{WriteIDPairsAndValue} = 0;

  if ($OptionsInfo{OutMatrixFormat} =~ /^RowsAndColumns$/i) {
    $OptionsInfo{WriteRowsAndColumns} = 1;
  }
  elsif ($OptionsInfo{OutMatrixFormat} =~ /^IDPairsAndValue$/i) {
    $OptionsInfo{WriteIDPairsAndValue} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n";
  }

  # Output matrix type...
  $OptionsInfo{OutMatrixType} = $Options{outmatrixtype};

  $OptionsInfo{WriteFullMatrix} = 0;
  $OptionsInfo{WriteUpperTriangularMatrix} = 0;
  $OptionsInfo{WriteLowerTriangularMatrix} = 0;

  if ($OptionsInfo{OutMatrixType} =~ /^FullMatrix$/i) {
    $OptionsInfo{WriteFullMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^UpperTriangularMatrix$/i) {
    $OptionsInfo{WriteUpperTriangularMatrix} = 1;
  }
  elsif ($OptionsInfo{OutMatrixType} =~ /^LowerTriangularMatrix$/i) {
    $OptionsInfo{WriteLowerTriangularMatrix} = 1;
  }
  else {
    die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n";
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Fast mode turns off validation of fingerprints data...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
689
690 # Process options related to comparion of bit vector strings...
691 #
692 sub ProcessBitVectorComparisonOptions {
693 # Setup supported bit vector similarity coefficients for bit vector strings...
694 my($ComparisonMeasure, $SupportedComparisonMeasure, @SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);
695
696 @SupportedComparisonMeasures = ();
697 %SupportedComparisonMeasuresNameMap = ();
698 %SupportedComparisonMeasuresMethodMap = ();
699
700 for $SupportedComparisonMeasure (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
701 # Similarity coefficient function/method names contain "Coefficient" in their names.
702 # So take 'em out and setup a map to original function/method name...
703 $ComparisonMeasure = $SupportedComparisonMeasure;
704 $ComparisonMeasure =~ s/Coefficient$//;
705
706 push @SupportedComparisonMeasures, $ComparisonMeasure;
707 $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
708 $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
709 }
710
711 # Setup a list of similarity coefficients to use for calculating similarity matrices for bit vector strings...
712 my($SpecifiedMeasure, @SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap);
713
714 @SpecifiedComparisonMeasures = ();
715 %SpecifiedComparisonMeasuresNameMap = ();
716 %SpecifiedComparisonMeasuresMethodMap = ();
717 %SpecifiedComparisonMeasuresParameterMap = ();
718
719 if ($Options{bitvectorcomparisonmode} =~ /^All$/i) {
720 push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures;
721 }
722 else {
723 # Comma delimited list of similarity coefficients...
724 my($BitVectorComparisonMode, @SpecifiedMeasures, @UnsupportedSpecifiedMeasures);
725
726 $BitVectorComparisonMode = $Options{bitvectorcomparisonmode};
727 $BitVectorComparisonMode =~ s/ //g;
728 @SpecifiedMeasures = split ",", $BitVectorComparisonMode;
729 @UnsupportedSpecifiedMeasures = ();
730
731 for $SpecifiedMeasure (@SpecifiedMeasures) {
732 if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
733 push @SpecifiedComparisonMeasures, $SpecifiedMeasure;
734 }
735 else {
736 push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure;
737 }
738 }
739 if (@UnsupportedSpecifiedMeasures) {
740 if (@UnsupportedSpecifiedMeasures > 1) {
741 warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-b --BitVectorComparisonMode\" are not valid.\n";
742 }
743 else {
744 warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-b --BitVectorComparisonMode\" is not valid.\n";
745 }
746 die "Allowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
747 }
748 }
749 for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
750 $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
751 $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};
752 }
753
754 $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};
755 $OptionsInfo{SpecifiedBitVectorComparisonsRef} = \@SpecifiedComparisonMeasures;
756 $OptionsInfo{SpecifiedBitVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap;
757 $OptionsInfo{SpecifiedBitVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap;
758
759 # Make sure valid alpha parameter is specified for Tversky calculation...
760 my($SpecifiedMeasure1, $SpecifiedMeasure2);
761 $OptionsInfo{Alpha} = '';
762 $SpecifiedMeasure1 = 'TverskySimilarity';
763 $SpecifiedMeasure2 = 'WeightedTverskySimilarity';
764 if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) {
765 if (IsEmpty($Options{alpha})) {
766 die "Error: You must specify a value for \"-a, --alpha\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-m --mode\". \n";
767 }
768 my($Alpha);
769 $Alpha = $Options{alpha};
770 if (!(IsFloat($Alpha) && $Alpha >=0 && $Alpha <= 1)) {
771 die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
772 }
773 $OptionsInfo{Alpha} = $Alpha;
774 }
775
776 # Make sure valid beta parameter is specified for WeightedTanimoto and WeightedTversky
777 # calculations...
778 $OptionsInfo{Beta} = '';
779 $SpecifiedMeasure1 = 'WeightedTverskySimilarity';
780 $SpecifiedMeasure2 = 'WeightedTanimotoSimilarity';
781 if ($SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure1)} || $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure2)}) {
782 if (IsEmpty($Options{beta})) {
783 die "Error: You must specify a value for \"-b, --beta\" option in \"$SpecifiedMeasure1, $SpecifiedMeasure2, or All\" \"-m --mode\". \n";
784 }
785 my($Beta);
786 $Beta = $Options{beta};
787 if (!(IsFloat($Beta) && $Beta >=0 && $Beta <= 1)) {
788 die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
789 }
790 $OptionsInfo{Beta} = $Beta;
791 }
792
793 # Setup any parameters required for specified comparison menthod...
794 for $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
795 @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = ();
796 if ($SpecifiedMeasure =~ /^TverskySimilarity$/i) {
797 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha};
798 }
799 elsif ($SpecifiedMeasure =~ /^WeightedTverskySimilarity$/i) {
800 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Alpha};
801 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta};
802 }
803 elsif ($SpecifiedMeasure =~ /^WeightedTanimotoSimilarity$/i) {
804 push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, $OptionsInfo{Beta};
805 }
806 }
807 $OptionsInfo{SpecifiedBitVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap;
808 }
809
# Process options related to comparison of fingerprints vector strings...
#
# Reads $Options{vectorcomparisonmode}, $Options{vectorcomparisonformulism} and
# $Options{fast}, validates them, and stores the following in %OptionsInfo:
#
#   VectorComparisonMode                   - option value as specified
#   SpecifiedVectorComparisonsRef          - ref to list of specified measure names
#   SpecifiedVectorComparisonsNameRef      - ref to map: lc(measure) => supported measure name
#   SpecifiedVectorComparisonsMethodRef    - ref to map: lc(measure) => coefficient function/method name
#   VectorComparisonFormulism              - option value as specified
#   SpecifiedVectorComparisonModesRef      - ref to list of specified calculation modes
#   SpecifiedVectorComparisonsParameterRef - ref to map: lc(measure) => [ fast flag (1 or 0) ]
#
# Dies with an error message for any unsupported measure or calculation mode.
#
sub ProcessVectorComparisonOptions {
  # Setup supported similarity and distance coefficients for vector strings...
  my(@SupportedComparisonMeasures, %SupportedComparisonMeasuresNameMap, %SupportedComparisonMeasuresMethodMap);

  @SupportedComparisonMeasures = ();
  %SupportedComparisonMeasuresNameMap = ();
  %SupportedComparisonMeasuresMethodMap = ();
  for my $SupportedComparisonMeasure (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    # Similarity and distance coefficient function/method names contain "Coefficient" in their names.
    # So take 'em out and setup a map to original function/method name...
    my($ComparisonMeasure);

    $ComparisonMeasure = $SupportedComparisonMeasure;
    $ComparisonMeasure =~ s/Coefficient$//i;

    push @SupportedComparisonMeasures, $ComparisonMeasure;
    $SupportedComparisonMeasuresNameMap{lc($ComparisonMeasure)} = $ComparisonMeasure;
    $SupportedComparisonMeasuresMethodMap{lc($ComparisonMeasure)} = $SupportedComparisonMeasure;
  }

  # Setup a list of similarity and distance coefficients to use for calculating similarity
  # matrices for vector strings...
  my(@SpecifiedComparisonMeasures, %SpecifiedComparisonMeasuresNameMap, %SpecifiedComparisonMeasuresMethodMap, %SpecifiedComparisonMeasuresParameterMap);

  @SpecifiedComparisonMeasures = ();
  %SpecifiedComparisonMeasuresNameMap = ();
  %SpecifiedComparisonMeasuresMethodMap = ();
  %SpecifiedComparisonMeasuresParameterMap = ();

  if ($Options{vectorcomparisonmode} =~ /^All$/i) {
    push @SpecifiedComparisonMeasures, @SupportedComparisonMeasures;
  }
  else {
    # Comma delimited list of similarity coefficients; embedded spaces are ignored...
    my($VectorComparisonMode, @UnsupportedSpecifiedMeasures);

    $VectorComparisonMode = $Options{vectorcomparisonmode};
    $VectorComparisonMode =~ s/ //g;
    @UnsupportedSpecifiedMeasures = ();

    for my $SpecifiedMeasure (split ",", $VectorComparisonMode) {
      if (exists($SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)})) {
        push @SpecifiedComparisonMeasures, $SpecifiedMeasure;
      }
      else {
        push @UnsupportedSpecifiedMeasures, $SpecifiedMeasure;
      }
    }
    if (@UnsupportedSpecifiedMeasures) {
      # Report all invalid values at once before listing the allowed ones...
      if (@UnsupportedSpecifiedMeasures > 1) {
        warn "Error: The values specified - ", JoinWords(\@UnsupportedSpecifiedMeasures, ", ", 0)," - for option \"-v --VectorComparisonMode\" are not valid.\n";
      }
      else {
        warn "Error: The value specified, @UnsupportedSpecifiedMeasures, for option \"-v --VectorComparisonMode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@SupportedComparisonMeasures, ", ", 0), "\n";
    }
  }
  for my $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    $SpecifiedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresMethodMap{lc($SpecifiedMeasure)};
    $SpecifiedComparisonMeasuresNameMap{lc($SpecifiedMeasure)} = $SupportedComparisonMeasuresNameMap{lc($SpecifiedMeasure)};
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};
  $OptionsInfo{SpecifiedVectorComparisonsRef} = \@SpecifiedComparisonMeasures;
  $OptionsInfo{SpecifiedVectorComparisonsNameRef} = \%SpecifiedComparisonMeasuresNameMap;
  $OptionsInfo{SpecifiedVectorComparisonsMethodRef} = \%SpecifiedComparisonMeasuresMethodMap;

  # Setup specified vector comparison calculation modes...
  my(@SpecifiedVectorComparisonModes);
  @SpecifiedVectorComparisonModes = ();
  if ($Options{vectorcomparisonformulism} =~ /^All$/i) {
    push @SpecifiedVectorComparisonModes, ("AlgebraicForm", "BinaryForm", "SetTheoreticForm");
  }
  else {
    # Comma delimited list of calculation modes. Drop embedded spaces before
    # splitting, as is done for the comparison measures, so that a value such
    # as "AlgebraicForm, BinaryForm" passes the anchored check below...
    my($VectorComparisonFormulism);

    $VectorComparisonFormulism = $Options{vectorcomparisonformulism};
    $VectorComparisonFormulism =~ s/ //g;

    for my $SpecifiedFormulism (split /\,/, $VectorComparisonFormulism) {
      if ($SpecifiedFormulism !~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
        die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
      }
      push @SpecifiedVectorComparisonModes, $SpecifiedFormulism;
    }
  }
  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonModesRef} = \@SpecifiedVectorComparisonModes;

  # Setup any parameters required for specified comparison method: each measure
  # gets a single parameter indicating whether "-f, --fast" was specified...
  for my $SpecifiedMeasure (@SpecifiedComparisonMeasures) {
    @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}} = ();
    push @{$SpecifiedComparisonMeasuresParameterMap{lc($SpecifiedMeasure)}}, ($Options{fast} ? 1 : 0);
  }
  $OptionsInfo{SpecifiedVectorComparisonsParameterRef} = \%SpecifiedComparisonMeasuresParameterMap;
}
905
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash to its default values, overlays whatever
# was given on the command line via GetOptions, changes to the working
# directory when one is specified, and dies with an error message on the
# first option whose value fails validation.
#
sub SetupScriptUsage {

  # Default values for options; command line values override these...
  %Options = (
    alpha                     => 0.5,
    beta                      => 1,
    bitvectorcomparisonmode   => "TanimotoSimilarity",
    colmode                   => 'colnum',
    compoundidprefix          => 'Cmpd',
    compoundidmode            => 'LabelPrefix',
    detail                    => 1,
    indelim                   => 'comma',
    outdelim                  => 'comma',
    inputdatamode             => 'LoadInMemory',
    mode                      => 'AutoDetect',
    outmatrixformat           => 'RowsAndColumns',
    outmatrixtype             => 'FullMatrix',
    quote                     => 'yes',
    precision                 => 2,
    vectorcomparisonmode      => "TanimotoSimilarity",
    vectorcomparisonformulism => "AlgebraicForm",
  );

  # Retrieve all the options...
  my(@OptionSpecs);
  @OptionSpecs = (
    "alpha=f", "beta=f", "bitvectorcomparisonmode|b=s", "colmode|c=s",
    "compoundidcol=s", "compoundidprefix=s", "compoundidfield=s", "compoundidmode=s",
    "detail|d=i", "fast|f", "fingerprintscol=s", "fingerprintsfield=s", "help|h",
    "indelim=s", "inputdatamode=s", "mode|m=s", "outdelim=s", "overwrite|o",
    "outmatrixformat=s", "outmatrixtype=s", "precision|p=s", "quote|q=s", "root|r=s",
    "vectorcomparisonmode|v=s", "vectorcomparisonformulism=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before anything else looks at files...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; the first invalid value encountered is reported...
  $Options{colmode} =~ /^(ColNum|ColLabel)$/i
    or die "Error: The value specified, $Options{colmode}, for option \"-c, --ColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";

  $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
    or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

  IsPositiveInteger($Options{detail})
    or die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";

  $Options{inputdatamode} =~ /^(LoadInMemory|ScanFile)$/i
    or die "Error: The value specified, $Options{inputdatamode}, for option \"--InputDataMode\" is not valid. Allowed values: LoadInMemory or ScanFile\n";

  $Options{mode} =~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i
    or die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n";

  $Options{indelim} =~ /^(comma|semicolon)$/i
    or die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{outmatrixformat} =~ /^(RowsAndColumns|IDPairsAndValue)$/i
    or die "Error: The value specified, $Options{outmatrixformat}, for option \"--OutMatrixFormat\" is not valid. Allowed values: RowsAndColumns or IDPairsAndValue\n";

  $Options{outmatrixtype} =~ /^(FullMatrix|UpperTriangularMatrix|LowerTriangularMatrix)$/i
    or die "Error: The value specified, $Options{outmatrixtype}, for option \"--OutMatrixType\" is not valid. Allowed values: FullMatrix, UpperTriangularMatrix or LowerTriangularMatrix\n";

  $Options{quote} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

  IsPositiveInteger($Options{precision})
    or die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
}
984
985 __END__
986
987 =head1 NAME
988
989 SimilarityMatricesFingerprints.pl - Calculate similarity matrices using fingerprints strings data in SD, FP and CSV/TSV text file(s)
990
991 =head1 SYNOPSIS
992
993 SimilarityMatricesFingerprints.pl SDFile(s) FPFile(s) TextFile(s)...
994
995 SimilarityMatricesFingerprints.pl [B<--alpha> I<number>] [B<--beta> I<number>]
996 [B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[ TverskySimilarity, ... ]">]
997 [B<-c, --ColMode> I<ColNum | ColLabel>] [B<--CompoundIDCol> I<col number | col name>]
998 [B<--CompoundIDPrefix> I<text>] [B<--CompoundIDField> I<DataFieldName>]
999 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
1000 [B<-d, --detail> I<InfoLevel>] [B<-f, --fast>] [B<--FingerprintsCol> I<col number | col name>]
1001 [B<--FingerprintsField> I<FieldLabel>] [B<-h, --help>] [B<--InDelim> I<comma | semicolon>]
1002 [B<--InputDataMode> I<LoadInMemory | ScanFile>]
1003 [B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>]
1004 [B<--OutDelim> I<comma | tab | semicolon>] [B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue>]
1005 [B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix>]
1006 [B<-o, --overwrite>] [B<-p, --precision> I<number>]
1007 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>]
1008 [B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity, [ ManhattanDistance, ...]">]
1009 [B<--VectorComparisonFormulism> I<All | "AlgebraicForm, [BinaryForm, SetTheoreticForm]">]
1010 [B<-w, --WorkingDir> dirname] SDFile(s) FPFile(s) TextFile(s)...
1011
1012 =head1 DESCRIPTION
1013
1014 Calculate similarity matrices using fingerprint bit-vector or vector strings data in I<SD, FP
1015 and CSV/TSV> text file(s) and generate CSV/TSV text file(s) containing values for specified
1016 similarity and distance coefficients.
1017
1018 The scripts SimilarityMatrixSDFiles.pl and SimilarityMatrixTextFiles.pl have been removed from the
1019 current release of MayaChemTools and their functionality merged with this script.
1020
1021 The valid I<SDFile> extensions are I<.sdf> and I<.sd>. All SD files in a current directory
1022 can be specified either by I<*.sdf> or the current directory name.
1023
1024 The valid I<FPFile> extensions are I<.fpf> and I<.fp>. All FP files in a current directory
1025 can be specified either by I<*.fpf> or the current directory name.
1026
1027 The valid I<TextFile> extensions are I<.csv> and I<.tsv> for comma/semicolon and tab
1028 delimited text files respectively. All other file names are ignored. All text files in a
1029 current directory can be specified by I<*.csv>, I<*.tsv>, or the current directory
1030 name. The B<--indelim> option determines the format of I<TextFile(s)>. Any file
1031 which doesn't correspond to the format indicated by B<--indelim> option is ignored.
1032
1033 Example of I<FP> file containing fingerprints bit-vector string data:
1034
1035 #
1036 # Package = MayaChemTools 7.4
1037 # ReleaseDate = Oct 21, 2010
1038 #
1039 # TimeStamp = Mon Mar 7 15:14:01 2011
1040 #
1041 # FingerprintsStringType = FingerprintsBitVector
1042 #
1043 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
1044 # Size = 1024
1045 # BitStringFormat = HexadecimalString
1046 # BitsOrder = Ascending
1047 #
1048 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
1049 Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
1050 ... ...
1051 ... ..
1052
1053 Example of I<FP> file containing fingerprints vector string data:
1054
1055 #
1056 # Package = MayaChemTools 7.4
1057 # ReleaseDate = Oct 21, 2010
1058 #
1059 # TimeStamp = Mon Mar 7 15:14:01 2011
1060 #
1061 # FingerprintsStringType = FingerprintsVector
1062 #
1063 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
1064 # VectorStringFormat = IDsAndValuesString
1065 # VectorValuesType = NumericalValues
1066 #
1067 Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C:
1068 N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...;
1069 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2
1070 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ...
1071 Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C
1072 O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...;
1073 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2
1074 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ...
1075 ... ...
1076 ... ...
1077
1078 Example of I<SD> file containing fingerprints bit-vector string data:
1079
1080 ... ...
1081 ... ...
1082 $$$$
1083 ... ...
1084 ... ...
1085 ... ...
1086 41 44 0 0 0 0 0 0 0 0999 V2000
1087 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1088 ... ...
1089 2 3 1 0 0 0 0
1090 ... ...
1091 M END
1092 > <CmpdID>
1093 Cmpd1
1094
1095 > <PathLengthFingerprints>
1096 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
1097 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
1098 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
1099 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
1100 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
1101 aa0660a11014a011d46
1102
1103 $$$$
1104 ... ...
1105 ... ...
1106
1107 Example of CSV I<Text> file containing fingerprints bit-vector string data:
1108
1109 "CompoundID","PathLengthFingerprints"
1110 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
1111 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
1112 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
1113 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
1114 ... ...
1115 ... ...
1116
1117 The current release of MayaChemTools supports the following types of fingerprint
1118 bit-vector and vector strings:
1119
1120 FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi
1121 us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT
1122 C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X
1123 1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A
1124 TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2
1125 -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B...
1126
1127 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
1128 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
1129 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
1130 O.X1.BO2;2 4 14 3 10 1 1 1 3 2
1131
1132 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
1133 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
1134 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
1135
1136 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
1137 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
1138 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
1139 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
1140 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
1141 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
1142
1143 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
1144 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
1145 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
1146 .024 -2.270
1147
1148 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
1149 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435
1150 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
1151 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1152 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1153
1154 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
1155 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
1156 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
1157 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
1158 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
1159 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
1160
1161 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
1162 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
1163 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
1164 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
1165 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
1166 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
1167
1168 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
1169 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
1170 0000000001010000000110000011000000000000100000000000000000000000100001
1171 1000000110000000000000000000000000010011000000000000000000000000010000
1172 0000000000000000000000000010000000000000000001000000000000000000000000
1173 0000000000010000100001000000000000101000000000000000100000000000000...
1174
1175 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
1176 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
1177 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
1178 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
1179 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
1180 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
1181
1182 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
1183 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
1184 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
1185 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
1186 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
1187 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
1188
1189 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
1190 0000000000000000000000000000000001001000010010000000010010000000011100
1191 0100101010111100011011000100110110000011011110100110111111111111011111
1192 11111111111110111000
1193
1194 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
1195 1110011111100101111111000111101100110000000000000011100010000000000000
1196 0000000000000000000000000000000000000000000000101000000000000000000000
1197 0000000000000000000000000000000000000000000000000000000000000000000000
1198 0000000000000000000000000000000000000011000000000000000000000000000000
1199 0000000000000000000000000000000000000000
1200
1201 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
1202 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1203 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
1204 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
1205 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
1206 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
1207
1208 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
1209 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
1210 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
1211 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1212 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
1213 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1214
1215 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1216 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
1217 0100010101011000101001011100110001000010001001101000001001001001001000
1218 0010110100000111001001000001001010100100100000000011000000101001011100
1219 0010000001000101010100000100111100110111011011011000000010110111001101
1220 0101100011000000010001000011000010100011101100001000001000100000000...
1221
1222 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
1223 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
1224 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
1225 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
1226 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
1227 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
1228
1229 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
1230 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
1231 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
1232 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
1233 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
1234 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
1235
1236 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
1237 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
1238 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
1239 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
1240 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
1241 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
1242
1243 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
1244 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar
1245 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H
1246 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...;
1247 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
1248 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
1249
1250 FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3
1251 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-
1252 C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO
1253 2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...;
1254 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1
1255
1256 FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica
1257 lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC
1258 H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC-
1259 ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...;
1260 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
1261
1262 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
1263 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
1264 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
1265 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
1266 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
1267 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
1268 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
1269
1270 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
1271 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C
1272 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3-
1273 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2
1274 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C.
1275 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7...
1276
1277 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
1278 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
1279 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
1280 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
1281 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
1282 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
1283 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
1284
1285 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
1286 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
1287 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
1288 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
1289 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
1290 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
1291
1292 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize:
1293 MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1-
1294 Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1
1295 -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1-
1296 HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...;
1297 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23
1298 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1
1299 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ...
1300
1301 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD
1302 istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106
1303 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0
1304 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26
1305 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0
1306 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ...
1307
1308 =head1 OPTIONS
1309
1310 =over 4
1311
1312 =item B<--alpha> I<number>
1313
1314 Value of alpha parameter for calculating I<Tversky> similarity coefficient specified for
1315 B<-b, --BitVectorComparisonMode> option. It corresponds to weights assigned for bits set
1316 to "1" in a pair of fingerprint bit-vectors during the calculation of similarity coefficient. Possible
1317 values: I<0 to 1>. Default value: I<0.5>.
1318
1319 =item B<--beta> I<number>
1320
1321 Value of beta parameter for calculating I<WeightedTanimoto> and I<WeightedTversky>
1322 similarity coefficients specified for B<-b, --BitVectorComparisonMode> option. It is used to
1323 weight the contributions of bits set to "0" during the calculation of similarity coefficients. Possible
1324 values: I<0 to 1>. Default value of I<1> makes I<WeightedTanimoto> and I<WeightedTversky>
1325 equivalent to I<Tanimoto> and I<Tversky>.
1326
1327 =item B<-b, --BitVectorComparisonMode> I<All | "TanimotoSimilarity,[TverskySimilarity,...]">
1328
1329 Specify what similarity coefficients to use for calculating similarity matrices for fingerprints bit-vector
1330 strings data values in I<TextFile(s)>: calculate similarity matrices for all supported similarity
1331 coefficients or specify a comma delimited list of similarity coefficients. Possible values:
1332 I<All | "TanimotoSimilarity,[TverskySimilarity,...]">. Default: I<TanimotoSimilarity>
1333
1334 I<All> uses complete list of supported similarity coefficients: I<BaroniUrbaniSimilarity, BuserSimilarity,
1335 CosineSimilarity, DiceSimilarity, DennisSimilarity, ForbesSimilarity, FossumSimilarity, HamannSimilarity, JacardSimilarity,
1336 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity, McConnaugheySimilarity, OchiaiSimilarity,
1337 PearsonSimilarity, RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity, SkoalSneath1Similarity,
1338 SkoalSneath2Similarity, SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity, YuleSimilarity,
1339 WeightedTanimotoSimilarity, WeightedTverskySimilarity>. These similarity coefficients are described below.
1340
1341 For two fingerprint bit-vectors A and B of same size, let:
1342
1343 Na = Number of bits set to "1" in A
1344 Nb = Number of bits set to "1" in B
1345 Nc = Number of bits set to "1" in both A and B
1346 Nd = Number of bits set to "0" in both A and B
1347
1348 Nt = Number of bits set to "1" or "0" in A or B (Size of A or B)
1349 Nt = Na + Nb - Nc + Nd
1350
1351 Na - Nc = Number of bits set to "1" in A but not in B
1352 Nb - Nc = Number of bits set to "1" in B but not in A
1353
1354 Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of bit-vectors A and B are
1355 defined as follows:
1356
1357 I<BaroniUrbaniSimilarity>: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser )
1358
1359 I<BuserSimilarity>: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani )
1360
1361 I<CosineSimilarity>: Nc / SQRT ( Na * Nb ) (same as Ochiai)
1362
1363 I<DiceSimilarity>: (2 * Nc) / ( Na + Nb )
1364
1365 I<DennisSimilarity>: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Nt * Na * Nb)
1366
1367 I<ForbesSimilarity>: ( Nt * Nc ) / ( Na * Nb )
1368
1369 I<FossumSimilarity>: ( Nt * ( ( Nc - 1/2 ) ** 2 ) / ( Na * Nb )
1370
1371 I<HamannSimilarity>: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt
1372
1373 I<JaccardSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Tanimoto)
1374
1375 I<Kulczynski1Similarity>: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / ( Na + Nb - 2Nc )
1376
1377 I<Kulczynski2Similarity>: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + ( Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb )
1378
1379 I<MatchingSimilarity>: ( Nc + Nd ) / Nt
1380
1381 I<McConnaugheySimilarity>: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / ( Na * Nb )
1382
1383 I<OchiaiSimilarity>: Nc / SQRT ( Na * Nb ) (same as Cosine)
1384
1385 I<PearsonSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) )
1386
1387 I<RogersTanimotoSimilarity>: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt)
1388
1389 I<RussellRaoSimilarity>: Nc / Nt
1390
1391 I<SimpsonSimilarity>: Nc / MIN ( Na, Nb)
1392
1393 I<SkoalSneath1Similarity>: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb - Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc )
1394
1395 I<SkoalSneath2Similarity>: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
1396
1397 I<SkoalSneath3Similarity>: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc ) ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc )
1398
1399 I<TanimotoSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Jaccard)
1400
1401 I<TverskySimilarity>: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * ( Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb)
1402
1403 I<YuleSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) )
1404
1405 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which
1406 are set to "1" in both A and B. In order to take into account all bit positions, modified versions
1407 of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
1408
1409 Let:
1410
1411 Na' = Number of bits set to "0" in A
1412 Nb' = Number of bits set to "0" in B
1413 Nc' = Number of bits set to "0" in both A and B
1414
1415 Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / ( Na' + Nb' - Nc' )
1416
1417 Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb')
1418
1419 Then:
1420
1421 I<WeightedTanimotoSimilarity> = beta * Tanimoto + (1 - beta) * Tanimoto'
1422
1423 I<WeightedTverskySimilarity> = beta * Tversky + (1 - beta) * Tversky'
1424
1425 =item B<-c, --ColMode> I<ColNum | ColLabel>
1426
1427 Specify how columns are identified in I<TextFile(s)>: using column number or column
1428 label. Possible values: I<ColNum or ColLabel>. Default value: I<ColNum>.
1429
1430 =item B<--CompoundIDCol> I<col number | col name>
1431
1432 This value is B<-c, --ColMode> mode specific. It specifies input I<TextFile(s)> column to use for
1433 generating compound ID for similarity matrices in output I<TextFile(s)>. Possible values: I<col number
1434 or col label>. Default value: I<first column containing the word compoundID in its column label or sequentially
1435 generated IDs>.
1436
1437 =item B<--CompoundIDPrefix> I<text>
1438
1439 Specify compound ID prefix to use during sequential generation of compound IDs for input I<SDFile(s)>
1440 and I<TextFile(s)>. Default value: I<Cmpd>. The default value generates compound IDs which look
1441 like Cmpd<Number>.
1442
1443 For input I<SDFile(s)>, this value is only used during I<LabelPrefix | MolNameOrLabelPrefix> values
1444 of B<--CompoundIDMode> option; otherwise, it's ignored.
1445
1446 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1447
1448 Compound
1449
1450 The values specified above generates compound IDs which correspond to Compound<Number>
1451 instead of default value of Cmpd<Number>.
1452
1453 =item B<--CompoundIDField> I<DataFieldName>
1454
1455 Specify input I<SDFile(s)> datafield label for generating compound IDs. This value is only used
1456 during I<DataField> value of B<--CompoundIDMode> option.
1457
1458 Examples for I<DataField> value of B<--CompoundIDMode>:
1459
1460 MolID
1461 ExtReg
1462
1463 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1464
1465 Specify how to generate compound IDs from input I<SDFile(s)> for similarity matrix CSV/TSV text
1466 file(s): use a I<SDFile(s)> datafield value; use molname line from I<SDFile(s)>; generate a sequential ID
1467 with specific prefix; use combination of both MolName and LabelPrefix with usage of LabelPrefix values
1468 for empty molname lines.
1469
1470 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1471 Default: I<LabelPrefix>.
1472
1473 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1474 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1475 values are replaced with sequential compound IDs.
1476
1477 =item B<-d, --detail> I<InfoLevel>
1478
1479 Level of information to print about lines being ignored. Default: I<1>. Possible values:
1480 I<1, 2 or 3>.
1481
1482 =item B<-f, --fast>
1483
1484 In this mode, fingerprints columns specified using B<--FingerprintsCol> for I<TextFile(s)> and
1485 B<--FingerprintsField> for I<SDFile(s)> are assumed to contain valid fingerprints data and no
1486 checking is performed before calculating similarity matrices. By default, fingerprints data is
1487 validated before computing pairwise similarity and distance coefficients.
1488
1489 =item B<--FingerprintsCol> I<col number | col name>
1490
1491 This value is B<-c, --colmode> specific. It specifies fingerprints column to use during
1492 calculation of similarity matrices for I<TextFile(s)>. Possible values: I<col number or col label>.
1493 Default value: I<first column containing the word Fingerprints in its column label>.
1494
1495 =item B<--FingerprintsField> I<FieldLabel>
1496
1497 Fingerprints field label to use during calculation of similarity matrices for I<SDFile(s)>.
1498 Default value: I<first data field label containing the word Fingerprints in its label>
1499
1500 =item B<-h, --help>
1501
1502 Print this help message.
1503
1504 =item B<--InDelim> I<comma | semicolon>
1505
1506 Input delimiter for CSV I<TextFile(s)>. Possible values: I<comma or semicolon>.
1507 Default value: I<comma>. For TSV files, this option is ignored and I<tab> is used as a
1508 delimiter.
1509
1510 =item B<--InputDataMode> I<LoadInMemory | ScanFile>
1511
1512 Specify how fingerprints bit-vector or vector strings data from I<SD, FP and CSV/TSV>
1513 fingerprint file(s) is processed: Retrieve, process and load all available fingerprints
1514 data in memory; Retrieve and process data for fingerprints one at a time. Possible values
1515 : I<LoadInMemory | ScanFile>. Default: I<LoadInMemory>.
1516
1517 During I<LoadInMemory> value of B<--InputDataMode>, fingerprints bit-vector or vector
1518 strings data from input file is retrieved, processed, and loaded into memory all at once
1519 as fingerprints objects for generation of similarity matrices.
1520
1521 During I<ScanFile> value of B<--InputDataMode>, multiple passes over the input fingerprints
1522 file are performed to retrieve and process fingerprints bit-vector or vector strings data one at
1523 a time to generate fingerprints objects used during generation of similarity matrices. A temporary
1524 copy of the input fingerprints file is made at the start and deleted after generating the matrices.
1525
1526 I<ScanFile> value of B<--InputDataMode> allows processing of arbitrarily large fingerprints files
1527 without any additional memory requirement.
1528
1529 =item B<-m, --mode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>
1530
1531 Format of fingerprint strings data in I<TextFile(s)>: automatically detect format of fingerprints
1532 string created by MayaChemTools fingerprints generation scripts or explicitly specify its format.
1533 Possible values: I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>. Default
1534 value: I<AutoDetect>.
1535
1536 =item B<--OutDelim> I<comma | tab | semicolon>
1537
1538 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1539 Default value: I<comma>.
1540
1541 =item B<--OutMatrixFormat> I<RowsAndColumns | IDPairsAndValue>
1542
1543 Specify how similarity or distance values calculated for fingerprints vector and bit-vector strings
1544 are written to the output CSV/TSV text file(s): Generate text files containing rows and columns
1545 with their labels corresponding to compound IDs and each matrix element value corresponding to
1546 similarity or distance between corresponding compounds; Generate text files containing rows containing
1547 compoundIDs for two compounds followed by similarity or distance value between these compounds.
1548
1549 Possible values: I<RowsAndColumns, or IDPairsAndValue>. Default value: I<RowsAndColumns>.
1550
1551 The value of B<--OutMatrixFormat> in conjunction with B<--OutMatrixType> determines type
1552 of data written to output files and allows generation of up to 6 different output data formats:
1553
1554 OutMatrixFormat OutMatrixType
1555
1556 RowsAndColumns FullMatrix [ DEFAULT ]
1557 RowsAndColumns UpperTriangularMatrix
1558 RowsAndColumns LowerTriangularMatrix
1559
1560 IDPairsAndValue FullMatrix
1561 IDPairsAndValue UpperTriangularMatrix
1562 IDPairsAndValue LowerTriangularMatrix
1563
1564 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for
1565 I<FullMatrix> value of B<--OutMatrixType>:
1566
1567 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
1568 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ...
1569 "Cmpd2","0.04","1","0.06","0.05","0.19","0.07",... ...
1570 "Cmpd3","0.25","0.06","1","0.12","0.22","0.25",... ...
1571 "Cmpd4","0.13","0.05","0.12","1","0.11","0.13",... ...
1572 "Cmpd5","0.11","0.19","0.22","0.11","1","0.17",... ...
1573 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1",... ...
1574 ... ... ..
1575 ... ... ..
1576 ... ... ..
1577
1578 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for
1579 I<UpperTriangularMatrix> value of B<--OutMatrixType>:
1580
1581 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
1582 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ...
1583 "Cmpd2","1","0.06","0.05","0.19","0.07",... ...
1584 "Cmpd3","1","0.12","0.22","0.25",... ...
1585 "Cmpd4","1","0.11","0.13",... ...
1586 "Cmpd5","1","0.17",... ...
1587 "Cmpd6","1",... ...
1588 ... ... ..
1589 ... ... ..
1590 ... ... ..
1591
1592 Example of data in output file for I<RowsAndColumns> B<--OutMatrixFormat> value for
1593 I<LowerTriangularMatrix> value of B<--OutMatrixType>:
1594
1595 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
1596 "Cmpd1","1"
1597 "Cmpd2","0.04","1"
1598 "Cmpd3","0.25","0.06","1"
1599 "Cmpd4","0.13","0.05","0.12","1"
1600 "Cmpd5","0.11","0.19","0.22","0.11","1"
1601 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1"
1602 ... ... ..
1603 ... ... ..
1604 ... ... ..
1605
1606
1607 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for
1608 I<FullMatrix> value of B<--OutMatrixType>:
1609
1610 "CmpdID1","CmpdID2","Coefficient Value"
1611 "Cmpd1","Cmpd1","1"
1612 "Cmpd1","Cmpd2","0.04"
1613 "Cmpd1","Cmpd3","0.25"
1614 "Cmpd1","Cmpd4","0.13"
1615 ... ... ...
1616 ... ... ...
1617 ... ... ...
1618 "Cmpd2","Cmpd1","0.04"
1619 "Cmpd2","Cmpd2","1"
1620 "Cmpd2","Cmpd3","0.06"
1621 "Cmpd2","Cmpd4","0.05"
1622 ... ... ...
1623 ... ... ...
1624 ... ... ...
1625 "Cmpd3","Cmpd1","0.25"
1626 "Cmpd3","Cmpd2","0.06"
1627 "Cmpd3","Cmpd3","1"
1628 "Cmpd3","Cmpd4","0.12"
1629 ... ... ...
1630 ... ... ...
1631 ... ... ...
1632
1633 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for
1634 I<UpperTriangularMatrix> value of B<--OutMatrixType>:
1635
1636 "CmpdID1","CmpdID2","Coefficient Value"
1637 "Cmpd1","Cmpd1","1"
1638 "Cmpd1","Cmpd2","0.04"
1639 "Cmpd1","Cmpd3","0.25"
1640 "Cmpd1","Cmpd4","0.13"
1641 ... ... ...
1642 ... ... ...
1643 ... ... ...
1644 "Cmpd2","Cmpd2","1"
1645 "Cmpd2","Cmpd3","0.06"
1646 "Cmpd2","Cmpd4","0.05"
1647 ... ... ...
1648 ... ... ...
1649 ... ... ...
1650 "Cmpd3","Cmpd3","1"
1651 "Cmpd3","Cmpd4","0.12"
1652 ... ... ...
1653 ... ... ...
1654 ... ... ...
1655
1656 Example of data in output file for I<IDPairsAndValue> B<--OutMatrixFormat> value for
1657 I<LowerTriangularMatrix> value of B<--OutMatrixType>:
1658
1659 "CmpdID1","CmpdID2","Coefficient Value"
1660 "Cmpd1","Cmpd1","1"
1661 "Cmpd2","Cmpd1","0.04"
1662 "Cmpd2","Cmpd2","1"
1663 "Cmpd3","Cmpd1","0.25"
1664 "Cmpd3","Cmpd2","0.06"
1665 "Cmpd3","Cmpd3","1"
1666 "Cmpd4","Cmpd1","0.13"
1667 "Cmpd4","Cmpd2","0.05"
1668 "Cmpd4","Cmpd3","0.12"
1669 "Cmpd4","Cmpd4","1"
1670 ... ... ...
1671 ... ... ...
1672 ... ... ...
1673
1674 =item B<--OutMatrixType> I<FullMatrix | UpperTriangularMatrix | LowerTriangularMatrix>
1675
1676 Type of similarity or distance matrix to calculate for fingerprints vector and bit-vector strings:
1677 Calculate full matrix; Calculate lower triangular matrix including diagonal; Calculate upper triangular
1678 matrix including diagonal.
1679
1680 Possible values: I<FullMatrix, UpperTriangularMatrix, or LowerTriangularMatrix>. Default value:
1681 I<FullMatrix>.
1682
1683 The value of B<--OutMatrixType> in conjunction with B<--OutMatrixFormat> determines type
1684 of data written to output files.
1685
1686 =item B<-o, --overwrite>
1687
1688 Overwrite existing files
1689
1690 =item B<-p, --precision> I<number>
1691
1692 Precision of calculated values in the output file. Default: up to I<2> decimal places.
1693 Valid values: positive integers.
1694
1695 =item B<-q, --quote> I<Yes | No>
1696
1697 Put quote around column values in output CSV/TSV text file(s). Possible values:
1698 I<Yes or No>. Default value: I<Yes>.
1699
1700 =item B<-r, --root> I<RootName>
1701
1702 New file name is generated using the root: <Root><BitVectorComparisonMode>.<Ext> or
1703 <Root><VectorComparisonMode><VectorComparisonFormulism>.<Ext>.
1704 The csv, and tsv <Ext> values are used for comma/semicolon, and tab delimited text files
1705 respectively. This option is ignored for multiple input files.
1706
1707 =item B<-v, --VectorComparisonMode> I<All | "TanimotoSimilarity,[ManhattanDistance,...]">
1708
1709 Specify what similarity or distance coefficients to use for calculating similarity matrices for
1710 fingerprint vector strings data values in I<TextFile(s)>: calculate similarity matrices for all
1711 supported similarity and distance coefficients or specify a comma delimited list of similarity
1712 and distance coefficients. Possible values: I<All | "TanimotoSimilarity,[ManhattanDistance,..]">.
1713 Default: I<TanimotoSimilarity>.
1714
1715 The value of B<-v, --VectorComparisonMode>, in conjunction with B<--VectorComparisonFormulism>,
1716 decides which type of similarity and distance coefficient formulism gets used.
1717
1718 I<All> uses complete list of supported similarity and distance coefficients: I<CosineSimilarity,
1719 CzekanowskiSimilarity, DiceSimilarity, OchiaiSimilarity, JaccardSimilarity, SorensonSimilarity, TanimotoSimilarity,
1720 CityBlockDistance, EuclideanDistance, HammingDistance, ManhattanDistance, SoergelDistance>. These
1721 similarity and distance coefficients are described below.
1722
1723 B<FingerprintsVector.pm> module, used to calculate similarity and distance coefficients,
1724 provides support to perform comparison between vectors containing three different types of
1725 values:
1726
1727 Type I: OrderedNumericalValues
1728
1729 . Size of two vectors are same
1730 . Vectors contain real values in a specific order. For example: MACCS keys
1731 count, Topological pharmacophore atom pairs and so on.
1732
1733 Type II: UnorderedNumericalValues
1734
1735 . Size of two vectors might not be same
1736 . Vectors contain unordered real value identified by value IDs. For example:
1737 Topological atom pairs, Topological atom torsions and so on
1738
1739 Type III: AlphaNumericalValues
1740
1741 . Size of two vectors might not be same
1742 . Vectors contain unordered alphanumerical values. For example: Extended
1743 connectivity fingerprints, atom neighborhood fingerprints.
1744
1745 Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues
1746 or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues
1747 using value IDs for UnorderedNumericalValues and values themselves for AlphaNumericalValues.
1748
1749 Three forms of similarity and distance calculation between two vectors, specified using B<--VectorComparisonFormulism>
1750 option, are supported: I<AlgebraicForm, BinaryForm or SetTheoreticForm>.
1751
1752 For I<BinaryForm>, the ordered list of processed final vector values containing the value or
1753 count of each unique value type is simply converted into a binary vector containing 1s and 0s
1754 corresponding to presence or absence of values before calculating similarity or distance between
1755 two vectors.
1756
1757 For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let:
1758
1759 N = Number of values in A or B
1760
1761 Xa = Values of vector A
1762 Xb = Values of vector B
1763
1764 Xai = Value of ith element in A
1765 Xbi = Value of ith element in B
1766
1767 SUM = Sum of i over N values
1768
1769 For SetTheoreticForm of calculation between two vectors, let:
1770
1771 SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
1772 SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) )
1773
1774 For BinaryForm of calculation between two vectors, let:
1775
1776 Na = Number of bits set to "1" in A = SUM ( Xai )
1777 Nb = Number of bits set to "1" in B = SUM ( Xbi )
1778 Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
1779 Nd = Number of bits set to "0" in both A and B
1780 = SUM ( 1 - Xai - Xbi + Xai * Xbi)
1781
1782 N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
1783
1784 Additionally, for BinaryForm various values also correspond to:
1785
1786 Na = | Xa |
1787 Nb = | Xb |
1788 Nc = | SetIntersectionXaXb |
1789 Nd = N - | SetDifferenceXaXb |
1790
1791 | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc
1792 = | Xa | + | Xb | - | SetIntersectionXaXb |
1793
1794 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B
1795 in I<AlgebraicForm, BinaryForm and SetTheoreticForm> are defined as follows:
1796
1797 B<CityBlockDistance>: ( same as HammingDistance and ManhattanDistance)
1798
1799 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
1800
1801 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
1802
1803 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
1804
1805 B<CosineSimilarity>: ( same as OchiaiSimilarityCoefficient)
1806
1807 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
1808
1809 I<BinaryForm>: Nc / SQRT ( Na * Nb)
1810
1811 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
1812
1813 B<CzekanowskiSimilarity>: ( same as DiceSimilarity and SorensonSimilarity)
1814
1815 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
1816
1817 I<BinaryForm>: 2 * Nc / ( Na + Nb )
1818
1819 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
1820
1821 B<DiceSimilarity>: ( same as CzekanowskiSimilarity and SorensonSimilarity)
1822
1823 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
1824
1825 I<BinaryForm>: 2 * Nc / ( Na + Nb )
1826
1827 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
1828
1829 B<EuclideanDistance>:
1830
1831 I<AlgebraicForm>: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) )
1832
1833 I<BinaryForm>: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc )
1834
1835 I<SetTheoreticForm>: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) )
1836
1837 B<HammingDistance>: ( same as CityBlockDistance and ManhattanDistance)
1838
1839 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
1840
1841 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
1842
1843 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
1844
1845 B<JaccardSimilarity>: ( same as TanimotoSimilarity)
1846
1847 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
1848
1849 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
1850
1851 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
1852
1853 B<ManhattanDistance>: ( same as CityBlockDistance and HammingDistance)
1854
1855 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
1856
1857 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
1858
1859 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
1860
1861 B<OchiaiSimilarity>: ( same as CosineSimilarity)
1862
1863 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
1864
1865 I<BinaryForm>: Nc / SQRT ( Na * Nb)
1866
1867 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
1868
1869 B<SorensonSimilarity>: ( same as CzekanowskiSimilarity and DiceSimilarity)
1870
1871 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
1872
1873 I<BinaryForm>: 2 * Nc / ( Na + Nb )
1874
1875 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
1876
1877 B<SoergelDistance>:
1878
1879 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) )
1880
1881 I<BinaryForm>: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc )
1882
1883 I<SetTheoreticForm>: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
1884
1885 B<TanimotoSimilarity>: ( same as JaccardSimilarity)
1886
1887 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
1888
1889 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
1890
1891 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
1892
1893 =item B<--VectorComparisonFormulism> I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]">
1894
1895 Specify fingerprints vector comparison formulism to use for calculation of similarity and distance
1896 coefficients during B<-v, --VectorComparisonMode>: use all supported comparison formulisms
1897 or specify a comma delimited list of formulisms. Possible values: I<All | "AlgebraicForm,[BinaryForm,SetTheoreticForm]">.
1898 Default value: I<AlgebraicForm>.
1899
1900 I<All> uses all three forms of supported vector comparison formulism for values of B<-v, --VectorComparisonMode>
1901 option.
1902
1903 For fingerprint vector strings containing B<AlphaNumericalValues> data values - B<ExtendedConnectivityFingerprints>,
1904 B<AtomNeighborhoodsFingerprints> and so on - all three formulisms result in the same value during similarity and distance
1905 calculations.
1906
1907 =item B<-w, --WorkingDir> I<DirName>
1908
1909 Location of working directory. Default: current directory.
1910
1911 =back
1912
1913 =head1 EXAMPLES
1914
1915 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
1916 bit-vector strings data corresponding to supported fingerprints in text file present in a column
1917 name containing Fingerprint substring by loading all fingerprints data into memory and create a
1918 SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name
1919 containing CompoundID substring, type:
1920
1921 % SimilarityMatricesFingerprints.pl -o SampleFPHex.csv
1922
1923 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
1924 bit-vector strings data corresponding to supported fingerprints in SD File present in a data field
1925 with Fingerprint substring in its label by loading all fingerprints data into memory and create a
1926 SampleFPHexTanimotoSimilarity.csv file containing sequentially generated compound IDs with
1927 Cmpd prefix, type:
1928
1929 % SimilarityMatricesFingerprints.pl -o SampleFPHex.sdf
1930
1931 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
1932 bit-vector strings data corresponding to supported fingerprints in FP file by loading all fingerprints
1933 data into memory and create a SampleFPHexTanimotoSimilarity.csv file along with compound IDs
1934 retrieved from FP file, type:
1935
1936 % SimilarityMatricesFingerprints.pl -o SampleFPHex.fpf
1937
1938 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for
1939 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a
1940 column name containing Fingerprint substring by loading all fingerprints data into memory and create
1941 a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column name
1942 containing CompoundID substring, type:
1943
1944 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory
1945 --OutMatrixFormat RowsAndColumns --OutMatrixType LowerTriangularMatrix
1946 SampleFPHex.csv
1947
1948 To generate an upper triangular similarity matrix corresponding to Tanimoto similarity coefficient for
1949 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a
1950 column name containing Fingerprint substring by loading all fingerprints data into memory and create
1951 a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing compound IDs retrieved
1952 from column name containing CompoundID substring, type:
1953
1954 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory
1955 --OutMatrixFormat IDPairsAndValue --OutMatrixType UpperTriangularMatrix
1956 SampleFPHex.csv
1957
1958 To generate a full similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
1959 bit-vector strings data corresponding to supported fingerprints in text file present in a column
1960 name containing Fingerprint substring by scanning file without loading all fingerprints data into memory
1961 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from
1962 column name containing CompoundID substring, type:
1963
1964 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile
1965 --OutMatrixFormat RowsAndColumns --OutMatrixType FullMatrix
1966 SampleFPHex.csv
1967
1968 To generate a lower triangular similarity matrix corresponding to Tanimoto similarity coefficient for
1969 fingerprints bit-vector strings data corresponding to supported fingerprints in text file present in a
1970 column name containing Fingerprint substring by scanning file without loading all fingerprints data into
1971 memory and create a SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format containing
1972 compound IDs retrieved from column name containing CompoundID substring, type:
1973
1974 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile
1975 --OutMatrixFormat IDPairsAndValue --OutMatrixType LowerTriangularMatrix
1976 SampleFPHex.csv
1977
1978 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism
1979 for fingerprints vector strings data corresponding to supported fingerprints in text file present in a column name
1980 containing Fingerprint substring and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file
1981 containing compound IDs retrieved from column name containing CompoundID substring, type:
1982
1983 % SimilarityMatricesFingerprints.pl -o SampleFPCount.csv
1984
1985 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism
1986 for fingerprints vector strings data corresponding to supported fingerprints in SD file present in a data field with
1987 Fingerprint substring in its label and create a SampleFPCountTanimotoSimilarityAlgebraicForm.csv file
1988 containing sequentially generated compound IDs with Cmpd prefix, type:
1989
1990 % SimilarityMatricesFingerprints.pl -o SampleFPCount.sdf
1991
1992 To generate a similarity matrix corresponding to Tanimoto similarity coefficient using algebraic formulism
1993 for fingerprints vector strings data corresponding to supported fingerprints in FP file and create a
1994 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file along with compound IDs retrieved from FP file, type:
1995
1996 % SimilarityMatricesFingerprints.pl -o SampleFPCount.fpf
1997
1998 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
1999 bit-vector strings data corresponding to supported fingerprints in text file present in a column name
2000 containing Fingerprint substring and create a SampleFPHexTanimotoSimilarity.csv file in
2001 IDPairsAndValue format containing compound IDs retrieved from column name containing
2002 CompoundID substring, type:
2003
2004 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
2005 SampleFPHex.csv
2006
2007 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2008 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with
2009 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file in
2010 IDPairsAndValue format containing sequentially generated compound IDs with Cmpd prefix,
2011 type:
2012
2013 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
2014 SampleFPHex.sdf
2015
2016 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2017 bit-vector strings data corresponding to supported fingerprints in FP file and create a
2018 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format along with compound IDs retrieved
2019 from FP file, type:
2020
2021 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
2022 SampleFPHex.fpf
2023
2024 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2025 bit-vector strings data corresponding to supported fingerprints in SD file present in a data field with
2026 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file
2027 containing compound IDs from mol name line, type:
2028
2029 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolName -o
2030 SampleFPHex.sdf
2031
2032 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2033 bit-vector strings data corresponding to supported fingerprints present in a data field with
2034 Fingerprint substring in its label and create a SampleFPBinTanimotoSimilarity.csv file
2035 containing compound IDs from data field name Mol_ID, type:
2036
2037 % SimilarityMatricesFingerprints.pl --CompoundIDMode DataField
2038 --CompoundIDField Mol_ID -o SampleFPBin.sdf
2039
2040 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficients
2041 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a column
2042 name containing Fingerprint substring and create SampleFPBin[CoefficientName]Similarity.csv files
2043 containing compound IDs retrieved from column name containing CompoundID substring, type:
2044
2045 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity,
2046 TanimotoSimilarity" -o SampleFPBin.csv
2047
2048 To generate similarity matrices corresponding to Buser, Dice and Tanimoto similarity coefficients
2049 for fingerprints bit-vector strings data corresponding to supported fingerprints present in a data field with
2050 Fingerprint substring in its label and create SampleFPBin[CoefficientName]Similarity.csv files
2051 containing sequentially generated compound IDs with Cmpd prefix, type:
2052
2053 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity,
2054 TanimotoSimilarity" -o SampleFPBin.sdf
2055
2056 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2057 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in
2058 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]AlgebraicForm.csv
2059 files containing compound IDs retrieved from column name containing CompoundID substring, type:
2060
2061 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
2062 TanimotoSimilarity" -o SampleFPCount.csv
2063
2064 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2065 algebraic formulism for fingerprints vector strings data corresponding to supported fingerprints present in
2066 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]AlgebraicForm.csv
2067 files containing sequentially generated compound IDs with Cmpd prefix, type:
2068
2069 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
2070 TanimotoSimilarity" -o SampleFPCount.sdf
2071
2072 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2073 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in
2074 a column name containing Fingerprint substring and create SampleFPCount[CoefficientName]Binary.csv
2075 files containing compound IDs retrieved from column name containing CompoundID substring, type:
2076
2077 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
2078 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o
2079 SampleFPCount.csv
2080
2081 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2082 binary formulism for fingerprints vector strings data corresponding to supported fingerprints present in
2083 a data field with Fingerprint substring in its label and create SampleFPCount[CoefficientName]Binary.csv
2084 files containing sequentially generated compound IDs with Cmpd prefix, type:
2085
2086 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
2087 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o
2088 SampleFPCount.sdf
2089
2090 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2091 all supported comparison formulisms for fingerprints vector strings data corresponding to supported
2092 fingerprints present in a column name containing Fingerprint substring and create
2093 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs retrieved from column
2094 name containing CompoundID substring, type:
2095
2096 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
2097 TanimotoSimilarity" --VectorComparisonFormulism All -o SampleFPCount.csv
2098
2099 To generate similarity matrices corresponding to CityBlock distance and Tanimoto similarity coefficients using
2100 all supported comparison formulisms for fingerprints vector strings data corresponding to supported
2101 fingerprints present in a data field with Fingerprint substring in its label and create
2102 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated
2103 compound IDs with Cmpd prefix, type:
2104
2105 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,TanimotoSimilarity"
2106 --VectorComparisonFormulism All -o SampleFPCount.sdf
2107
2108 To generate similarity matrices corresponding to all available similarity coefficients for fingerprints
2109 bit-vector strings data corresponding to supported fingerprints present in a column name
2110 containing Fingerprint substring and create SampleFPHex[CoefficientName].csv files
2111 containing compound IDs retrieved from column name containing CompoundID substring, type:
2112
2113 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode
2114 All --alpha 0.5 --beta 0.5 -o SampleFPHex.csv
2115
2116 To generate similarity matrices corresponding to all available similarity coefficients for fingerprints
2117 bit-vector strings data corresponding to supported fingerprints present in a data field with Fingerprint
2118 substring in its label and create SampleFPHex[CoefficientName].csv files containing sequentially
2119 generated compound IDs with Cmpd prefix, type:
2120
2121 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode
2122 All --alpha 0.5 --beta 0.5 -o SampleFPHex.sdf
2123
2124 To generate similarity matrices corresponding to all available similarity and distance coefficients using
2125 all comparison formulisms for fingerprints vector strings data corresponding to supported fingerprints
2126 present in a column name containing Fingerprint substring and create
2127 SampleFPCount[CoefficientName][FormulismName].csv files containing compound IDs
2128 retrieved from column name containing CompoundID substring, type:
2129
2130 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode
2131 All --VectorComparisonFormulism All -o SampleFPCount.csv
2132
2133 To generate similarity matrices corresponding to all available similarity and distance coefficients using
2134 all comparison formulisms for fingerprints vector strings data corresponding to supported fingerprints
2135 present in a data field with Fingerprint substring in its label and create
2136 SampleFPCount[CoefficientName][FormulismName].csv files containing sequentially generated
2137 compound IDs with Cmpd prefix, type:
2138
2139 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode
2140 All --VectorComparisonFormulism All -o SampleFPCount.sdf
2141
2142 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2143 bit-vector strings data corresponding to supported fingerprints present in column number 2
2144 and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved from column
2145 number 1, type:
2146
2147 % SimilarityMatricesFingerprints.pl --ColMode ColNum --CompoundIDCol 1
2148 --FingerprintsCol 2 -o SampleFPHex.csv
2149
2150 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2151 bit-vector strings data corresponding to supported fingerprints present in a data field name
2152 Fingerprints and create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs
2153 present in data field name Mol_ID, type:
2154
2155 % SimilarityMatricesFingerprints.pl --FingerprintsField Fingerprints
2156 --CompoundIDMode DataField --CompoundIDField Mol_ID -o SampleFPHex.sdf
2157
2158 To generate a similarity matrix corresponding to Tversky similarity coefficient for fingerprints
2159 bit-vector strings data corresponding to supported fingerprints present in a column named Fingerprints
2160 and create a SampleFPHexTverskySimilarity.tsv file containing compound IDs retrieved from column named
2161 CompoundID, type:
2162
2163 % SimilarityMatricesFingerprints.pl --BitVectorComparisonMode
2164 TverskySimilarity --alpha 0.5 --ColMode ColLabel --CompoundIDCol
2165 CompoundID --FingerprintsCol Fingerprints --OutDelim Tab --quote No
2166 -o SampleFPHex.csv
2167
2168 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2169 bit-vector strings data corresponding to supported fingerprints present in a data field
2170 with Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.csv file
2171 containing compound IDs from molname line or sequentially generated compound IDs
2172 with Mol prefix, type:
2173
2174 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolnameOrLabelPrefix
2175 --CompoundIDPrefix Mol -o SampleFPHex.sdf
2176
2177 To generate a similarity matrix corresponding to Tanimoto similarity coefficient for fingerprints
2178 bit-vector strings data corresponding to supported fingerprints present in a data field with
2179 Fingerprint substring in its label and create a SampleFPHexTanimotoSimilarity.tsv file
2180 containing sequentially generated compound IDs with Cmpd prefix, type:
2181
2182 % SimilarityMatricesFingerprints.pl --OutDelim Tab --quote No -o SampleFPHex.sdf
2183
2184 =head1 AUTHOR
2185
2186 Manish Sud <msud@san.rr.com>
2187
2188 =head1 SEE ALSO
2189
2190 InfoFingerprintsFiles.pl, SimilaritySearchingFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
2191 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
2192 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
2193 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
2194
2195 =head1 COPYRIGHT
2196
2197 Copyright (C) 2015 Manish Sud. All rights reserved.
2198
2199 This file is part of MayaChemTools.
2200
2201 MayaChemTools is free software; you can redistribute it and/or modify it under
2202 the terms of the GNU Lesser General Public License as published by the Free
2203 Software Foundation; either version 3 of the License, or (at your option)
2204 any later version.
2205
2206 =cut