comparison bin/SimilaritySearchingFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: SimilaritySearchingFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:21 $
5 # $Revision: 1.18 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use StatisticsUtil;
39 use PseudoHeap;
40 use Fingerprints::FingerprintsFileUtil;
41 use Fingerprints::FingerprintsBitVector;
42 use Fingerprints::FingerprintsVector;
43
# Script level state shared with the subroutines defined later in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as soon as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script. Usage is shown and the script stops unless exactly
# two command line arguments are left after option processing...
SetupScriptUsage();
if ($Options{help} || @ARGV != 2) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Process reference and database file names...
my(@FingerprintsFilesList);
ProcessFingerprintsFileNames();

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about fingerprints input and SD/text output files...
my(%FingerprintsFilesInfo, %OutputFilesInfo, %SimilaritySearchInfo);
print "Checking and retrieving information from reference and database fingerprints files...\n";
RetrieveFingerprintsFilesInfo();

# Perform similarity search...
print "Performing similarity search...\n";
my(%SimilaritySearchResults, %DatabaseFingerprintsFileData);
PerformSimilaritySearch();

print "\n$ScriptName:Done...\n\n";

# Report elapsed time since the starting message...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84
85 ###############################################################################
86
# Drive the whole similarity search: read reference fingerprints, set up the results
# containers, go over database fingerprints, and write out the results files...
#
sub PerformSimilaritySearch {

  # Reference fingerprints are read first; the steps below rely on the reference
  # compound IDs stored in %SimilaritySearchInfo...
  print "\nProcessing fingerprints data for reference molecules...\n";
  ReadReferenceFingerprintsData();

  # Results containers must exist before any comparison values are collected...
  InitializeSimilaritySearchResults();

  GenerateSimilaritySearchResults();

  WriteSimilaritySearchResultFiles();
}
98
# Find similar molecules from database molecules for individual or multiple reference molecules...
#
# Each valid database fingerprints entry is compared against all reference fingerprints.
# During IndividualReference mode, every defined comparison value is collected for its
# reference molecule. During MultipleReferences mode, the defined comparison values for a
# database molecule are fused into one value, which is collected unless it's undefined.
#
# Entries with invalid fingerprints data are counted and skipped. Summary counts are
# printed after the database file has been processed.
#
sub GenerateSimilaritySearchResults {
  my($DatabaseFingerprintsFileIO, $FingerprintsCount, $IgnoredFingerprintsCount);

  print "Processing fingerprints data for database molecules...\n";

  ($FingerprintsCount, $IgnoredFingerprintsCount) = (0) x 2;

  $DatabaseFingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}});
  $DatabaseFingerprintsFileIO->Open();

  DATABASEFP: while ($DatabaseFingerprintsFileIO->Read()) {
    $FingerprintsCount++;

    if (!$DatabaseFingerprintsFileIO->IsFingerprintsDataValid()) {
      $IgnoredFingerprintsCount++;
      next DATABASEFP;
    }
    my $DatabaseFingerprintsObject = $DatabaseFingerprintsFileIO->GetFingerprints();
    my $DatabaseCmpdID = $DatabaseFingerprintsFileIO->GetCompoundID();

    # Comparison values are only accumulated during MultipleReferences mode and always
    # start out empty for each database molecule...
    my @ComparisonValues = ();

    REFERENCEFP: for my $ReferenceIndex (0 .. $#{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
      my $ReferenceCmpdID = $SimilaritySearchInfo{ReferenceCmpdIDsRef}->[$ReferenceIndex];
      my $ReferenceFingerprintsObject = $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef}->[$ReferenceIndex];

      # An undefined value implies a failed comparison or a value not meeting the cutoff...
      my $ComparisonValue = CompareReferenceAndDatabaseFingerprintsPair($ReferenceFingerprintsObject, $DatabaseFingerprintsObject);
      if (!defined $ComparisonValue) {
        next REFERENCEFP;
      }

      if ($SimilaritySearchInfo{IndividualReferenceMode}) {
        CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID);
      }
      elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
        push @ComparisonValues, $ComparisonValue;
      }
    }

    if ($SimilaritySearchInfo{MultipleReferencesMode}) {
      my $FusedComparisonValue = CalculateGroupFusionComparisonValue(\@ComparisonValues);
      if (!defined $FusedComparisonValue) {
        next DATABASEFP;
      }
      # No reference compound ID is passed for a fused value...
      CollectSimilaritySearchResults($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $FusedComparisonValue);
    }
  }
  $DatabaseFingerprintsFileIO->Close();

  print "Number of fingerprints data entries in database fingerprints file: $FingerprintsCount\n";
  print "Number of fingerprints data entries processed successfully: ", ($FingerprintsCount - $IgnoredFingerprintsCount) , "\n";
  print "Number of fingerprints data entries ignored due to missing/invalid data: $IgnoredFingerprintsCount\n\n";
}
158
# Compare one reference fingerprints object against one database fingerprints object by
# invoking the configured comparison method, by name, on the reference object...
#
# Returns the comparison value formatted to the specified precision. Returns undef
# when the comparison fails, after a warning, or when a cutoff is being applied and the
# value doesn't meet it: values >= cutoff are kept for KeepTop, otherwise values <= cutoff.
#
sub CompareReferenceAndDatabaseFingerprintsPair {
  my($ReferenceFingerprintsObject, $DatabaseFingerprintsObject) = @_;

  my $MethodName = $SimilaritySearchInfo{ComparisonMethod};
  my $Value = $ReferenceFingerprintsObject->$MethodName($DatabaseFingerprintsObject, @{$SimilaritySearchInfo{ComparisonMethodParameters}});

  if (!defined $Value) {
    warn "Warning: Ignoring fingerprints data for reference compound ID ", $ReferenceFingerprintsObject->GetID(), ": Its comparison with database compound ID, ", $DatabaseFingerprintsObject->GetID(), ", failed.\n";
    return undef;
  }

  # The cutoff check below is performed on the formatted value...
  $Value = sprintf("%.$OptionsInfo{Precision}f", $Value);

  if (!$SimilaritySearchInfo{ApplyComparisonCutoff}) {
    return $Value;
  }

  # Apply comparison cutoff...
  my $Cutoff = $SimilaritySearchInfo{ComparisonCutoff};
  my $MeetsCutoff;
  if ($SimilaritySearchInfo{KeepTop}) {
    $MeetsCutoff = ($Value >= $Cutoff);
  }
  else {
    $MeetsCutoff = ($Value <= $Cutoff);
  }

  return $MeetsCutoff ? $Value : undef;
}
184
# Fuse comparison values for a database molecule into a single value using the group
# fusion method reference set up in %SimilaritySearchInfo...
#
# Takes a reference to a list of comparison values and returns undef for an empty list.
# Values are sorted, descending for KeepTop and ascending otherwise, when sorting has
# been requested, and trimmed to the first kNN values when kNN is in use. The fused
# value is formatted to the specified precision only for fusion methods flagged for it.
#
sub CalculateGroupFusionComparisonValue {
  my($ComparisonValuesRef) = @_;

  if (!@{$ComparisonValuesRef}) {
    return undef;
  }

  if ($SimilaritySearchInfo{SortComparisonValues}) {
    my @SortedValues;

    if ($SimilaritySearchInfo{KeepTop}) {
      @SortedValues = sort { $b <=> $a } @{$ComparisonValuesRef};
    }
    else {
      @SortedValues = sort { $a <=> $b } @{$ComparisonValuesRef};
    }

    # Keep only top kNN values for group fusion...
    if ($SimilaritySearchInfo{UsekNN} && ($OptionsInfo{kNN} < scalar @SortedValues)) {
      splice @SortedValues, $OptionsInfo{kNN};
    }

    # Work on the sorted copy; caller's list stays untouched...
    $ComparisonValuesRef = \@SortedValues;
  }

  my $FusedValue = $SimilaritySearchInfo{GroupFusionMethodRef}->($ComparisonValuesRef);

  if ($SimilaritySearchInfo{ApplyPrecisionDuringFusion}) {
    $FusedValue = sprintf("%.$OptionsInfo{Precision}f", $FusedValue);
  }

  return $FusedValue;
}
211
# Add a comparison value and database compound ID pair to the appropriate results heap...
#
# A defined reference compound ID selects the heap for that reference molecule; without
# it, the pair goes to the heap stored under the ResultsPseudoHeap key. Database file data
# for the compound is collected as well when input file data collection is turned on.
#
sub CollectSimilaritySearchResults {
  my($DatabaseFingerprintsFileIO, $DatabaseCmpdID, $ComparisonValue, $ReferenceCmpdID) = @_;

  my $ResultsKey = (defined $ReferenceCmpdID) ? $ReferenceCmpdID : 'ResultsPseudoHeap';
  $SimilaritySearchResults{$ResultsKey}->AddKeyValuePair($ComparisonValue, $DatabaseCmpdID);

  if ($FingerprintsFilesInfo{Database}{CollectInputFileData}) {
    CollectDatabaseFileData($DatabaseCmpdID, $DatabaseFingerprintsFileIO);
  }
}
228
# Initialize similarity results for individual or multiple reference molecules...
#
# Any earlier results and collected database file data are cleared. During
# IndividualReference mode, a PseudoHeap is set up for each reference compound ID; during
# MultipleReferences mode, a single PseudoHeap is stored under the ResultsPseudoHeap key.
# The heap type is KeepTopN or KeepBottomN based on KeepTop and its maximum size comes
# from the MaxSimilarMolecules option.
#
sub InitializeSimilaritySearchResults {

  %SimilaritySearchResults = ();

  # All heaps share the same parameters...
  my @PseudoHeapParameters = ('Type' => ($SimilaritySearchInfo{KeepTop} ? 'KeepTopN' : 'KeepBottomN'), 'KeyType' => 'Numeric', 'MaxSize' => $OptionsInfo{MaxSimilarMolecules});

  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    for my $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
      $SimilaritySearchResults{$ReferenceCmpdID} = PseudoHeap->new(@PseudoHeapParameters);
    }
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    $SimilaritySearchResults{ResultsPseudoHeap} = PseudoHeap->new(@PseudoHeapParameters);
  }

  %DatabaseFingerprintsFileData = ();
}
247
# Write out results SD and/or CSV/TSV text files for individual or multiple reference molecules...
#
# Output files are opened by SetupAndOpenOutputFiles. Results are written in the order
# of sorted heap keys: per reference compound ID during IndividualReference mode, and
# from the ResultsPseudoHeap heap during MultipleReferences mode.
#
# Dies when an output file can't be closed; buffered write errors only show up at
# close time and would otherwise go unnoticed, leaving an incomplete results file.
#
sub WriteSimilaritySearchResultFiles {
  my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles();

  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    for my $ReferenceCmpdID (@{$SimilaritySearchInfo{ReferenceCmpdIDsRef}}) {
      my $ResultsPseudoHeap = $SimilaritySearchResults{$ReferenceCmpdID};

      for my $ComparisonValue ($ResultsPseudoHeap->GetSortedKeys()) {
        for my $DatabaseCmpdID ($ResultsPseudoHeap->GetKeyValues($ComparisonValue)) {
          WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID);
        }
      }
    }
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    my $ResultsPseudoHeap = $SimilaritySearchResults{ResultsPseudoHeap};

    for my $ComparisonValue ($ResultsPseudoHeap->GetSortedKeys()) {
      for my $DatabaseCmpdID ($ResultsPseudoHeap->GetKeyValues($ComparisonValue)) {
        WriteDataToOutputFiles($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID);
      }
    }
  }

  if ($NewSDFileRef) {
    close $NewSDFileRef or die "Error: Couldn't close $OutputFilesInfo{SDOutFileName}: $! \n";
  }
  if ($NewTextFileRef) {
    close $NewTextFileRef or die "Error: Couldn't close $OutputFilesInfo{TextOutFileName}: $! \n";
  }
}
279
# Write out one similarity search result to SD and/or text output files, whichever file
# handles have been passed...
#
# The reference compound ID is optional; it's only written out when defined. The SD
# entry consists of molecule data, compound IDs, comparison value, any database data
# fields, and the record terminator. The text line consists of the same IDs and value
# followed by any database data fields or columns.
#
sub WriteDataToOutputFiles {
  my($NewSDFileRef, $NewTextFileRef, $ComparisonValue, $DatabaseCmpdID, $ReferenceCmpdID) = @_;

  my $ReferenceIDAvailable = defined $ReferenceCmpdID;

  if ($NewSDFileRef) {
    WriteMolStringDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);

    if ($ReferenceIDAvailable) {
      print $NewSDFileRef "> <ReferenceCmpdID>\n$ReferenceCmpdID\n\n";
    }
    print $NewSDFileRef "> <DatabaseCmpdID>\n$DatabaseCmpdID\n\n> <ComparisonValue>\n$ComparisonValue\n\n";

    WriteDatabaseDataToSDOutputFile($DatabaseCmpdID, $NewSDFileRef);
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($NewTextFileRef) {
    my @Words = $ReferenceIDAvailable ? ($ReferenceCmpdID) : ();
    push @Words, $DatabaseCmpdID, $ComparisonValue;

    # Append database data only when some data fields or columns are to be written out...
    my $OutputDatabaseData = $FingerprintsFilesInfo{Database}{OutputDataFields} || $FingerprintsFilesInfo{Database}{OutputDataCols};
    if ($OutputDatabaseData) {
      push @Words, RetrieveDatabaseDataForTextOutputFile($DatabaseCmpdID);
    }

    print $NewTextFileRef JoinWords(\@Words, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote}), "\n";
  }
}
310
# Open output files...
#
# Returns a list containing SD and text output file handles, in that order. A handle is
# undef when the corresponding output type hasn't been requested. Column labels are
# written to the text file right after it's opened.
#
# Files are opened using three-argument open and lexical file handles: the output file
# name is always treated as a plain file name and no package level handles are left
# behind. Dies when a file can't be opened.
#
sub SetupAndOpenOutputFiles {
  my($NewSDFileRef, $NewTextFileRef) = (undef) x 2;

  if ($OptionsInfo{SDOutput}) {
    my $NewSDFile = $OutputFilesInfo{SDOutFileName};
    print "Generating SD file $NewSDFile...\n";
    open $NewSDFileRef, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
  }

  if ($OptionsInfo{TextOutput}) {
    my $NewTextFile = $OutputFilesInfo{TextOutFileName};
    print "Generating text file $NewTextFile...\n";
    open $NewTextFileRef, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";

    WriteTextFileCoulmnLabels($NewTextFileRef);
  }

  return ($NewSDFileRef, $NewTextFileRef);
}
336
# Write the column labels line to the text output file...
#
# The line starts with compound ID and comparison value labels for the current search
# mode; a reference compound ID label is present only during IndividualReference mode.
# Labels for database data fields or, failing that, database data columns follow.
#
sub WriteTextFileCoulmnLabels {
  my($NewTextFileRef) = @_;

  my @Labels = ();
  if ($SimilaritySearchInfo{IndividualReferenceMode}) {
    @Labels = qw(ReferenceCompoundID DatabaseCompoundID ComparisonValue);
  }
  elsif ($SimilaritySearchInfo{MultipleReferencesMode}) {
    @Labels = qw(DatabaseCompoundID ComparisonValue);
  }

  # Labels for any other database fingerprints file data going to the output file...
  my @DatabaseLabels = ();
  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    @DatabaseLabels = @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    @DatabaseLabels = @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}};
  }
  push @Labels, @DatabaseLabels;

  my $LabelsLine = JoinWords(\@Labels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$LabelsLine\n";
}
363
# Write molecule string data to SD output file...
#
# When compound strings have been collected from the database file, everything in front
# of the molfile terminator line in the stored compound string is written out followed
# by the terminator; otherwise, an empty molecule data block is written out.
#
# The terminator of MDL molfile data is 'M', two spaces, and 'END'. Matching and writing
# it with a single space fails to find the end of molecule data in valid SD data and
# generates an invalid SD file.
#
sub WriteMolStringDataToSDOutputFile {
  my($DatabaseCmpdID, $NewSDFileRef) = @_;

  if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
    # Build the tag explicitly so that the two spaces can't get lost...
    my $MolEndTag = 'M' . (' ' x 2) . 'END';

    my($MolString) = split /\Q$MolEndTag\E/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
    print $NewSDFileRef "$MolString\n$MolEndTag\n";
  }
  else {
    # Just write out an empty molecule data string...
    print $NewSDFileRef SDFileUtil::GenerateEmptyCtabBlockLines(), "\n";
  }
}
380
# Write data for a database compound, retrieved from SD or text database file, to SD
# output file as data field label and value pairs...
#
# For data fields, labels come either from the stored compound string itself, when
# current data fields are to be written out, or from the list of data fields to output;
# a missing value is written out as an empty string. For data columns, labels and values
# are looked up by column number.
#
sub WriteDatabaseDataToSDOutputFile {
  my($DatabaseCmpdID, $NewSDFileRef) = @_;

  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    my @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
    my %ValueOfLabel = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    my @Labels;
    if ($FingerprintsFilesInfo{Database}{OutputCurrentDataFields}) {
      @Labels = GetCmpdDataHeaderLabels(\@CmpdLines);
    }
    else {
      @Labels = @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
    }

    for my $Label (@Labels) {
      my $Value = exists $ValueOfLabel{$Label} ? $ValueOfLabel{$Label} : '';
      print $NewSDFileRef "> <$Label>\n$Value\n\n";
    }
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    for my $ColNum (@{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}}) {
      my $Label = $FingerprintsFilesInfo{Database}{DataColNumToLabelMap}{$ColNum};
      my $Value = $DatabaseFingerprintsFileData{$DatabaseCmpdID}->[$ColNum];
      print $NewSDFileRef "> <$Label>\n$Value\n\n";
    }
  }
}
407
# Retrieve database data from SD or text database file for text output file...
#
# Returns a list of values for a database compound ID in the same order as the data
# fields or data columns to output; a missing value is returned as an empty string.
#
# Data column values are picked up by the column numbers to output, the way it's done
# for SD output. Using positions 0 .. N-1 instead returns values from wrong columns for
# any column selection other than the leading columns. An empty value is returned for
# each column to output when no data line is available for the compound.
#
sub RetrieveDatabaseDataForTextOutputFile {
  my($DatabaseCmpdID) = @_;

  if ($FingerprintsFilesInfo{Database}{OutputDataFields}) {
    my(@CmpdLines, %DataFieldLabelAndValues);

    @CmpdLines = split /\n/, $DatabaseFingerprintsFileData{$DatabaseCmpdID};
    %DataFieldLabelAndValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);

    return map { exists $DataFieldLabelAndValues{$_} ? $DataFieldLabelAndValues{$_} : ''} @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}};
  }
  elsif ($FingerprintsFilesInfo{Database}{OutputDataCols}) {
    my @DataColNums = @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}};

    if (exists $DatabaseFingerprintsFileData{$DatabaseCmpdID}) {
      my $DataLineWordsRef = $DatabaseFingerprintsFileData{$DatabaseCmpdID};
      return map { defined $DataLineWordsRef->[$_] ? $DataLineWordsRef->[$_] : '' } @DataColNums;
    }
    else {
      return ('') x scalar @DataColNums;
    }
  }

  # Nothing to retrieve; callers only get here when no database data is to be written out...
  return;
}
430
# Store data for a database compound, needed later on to generate results files, in
# %DatabaseFingerprintsFileData keyed by database compound ID...
#
# The compound string is stored during collection of compound string data and a
# reference to the list of data line words is stored during collection of data lines.
# Nothing is done for a compound ID which already has an entry.
#
sub CollectDatabaseFileData {
  my($DatabaseCmpdID, $DatabaseFingerprintsFileIO) = @_;

  # Data for a compound is collected only once...
  return if exists $DatabaseFingerprintsFileData{$DatabaseCmpdID};

  if ($FingerprintsFilesInfo{Database}{CollectCmpdStringData}) {
    $DatabaseFingerprintsFileData{$DatabaseCmpdID} = $DatabaseFingerprintsFileIO->GetCompoundString();
  }

  if ($FingerprintsFilesInfo{Database}{CollectDataLine}) {
    $DatabaseFingerprintsFileData{$DatabaseCmpdID} = [ $DatabaseFingerprintsFileIO->GetDataLineWords() ];
  }

}
452
# Read all fingerprints data from reference fingerprints file and keep references to the
# lists of reference compound IDs and fingerprints objects in %SimilaritySearchInfo...
#
sub ReadReferenceFingerprintsData {
  my $ReferenceFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}});

  my($CmpdIDsRef, $FingerprintsObjectsRef) = Fingerprints::FingerprintsFileUtil::ReadAndProcessFingerpritsData($ReferenceFileIO);

  $SimilaritySearchInfo{ReferenceCmpdIDsRef} = $CmpdIDsRef;
  $SimilaritySearchInfo{ReferenceFingerprintsObjectsRef} = $FingerprintsObjectsRef;

}
462
# Reset and fill in information about reference and database fingerprints files,
# fingerprints comparison, and output files...
#
# The first and second entries in the fingerprints files list are taken as reference and
# database file names.
#
sub RetrieveFingerprintsFilesInfo {

  # Start out with a clean slate...
  %FingerprintsFilesInfo = ();
  %OutputFilesInfo = ();
  %SimilaritySearchInfo = ();

  # Set up reference and database file names...
  my($ReferenceFile, $DatabaseFile) = @FingerprintsFilesList[0, 1];
  $FingerprintsFilesInfo{Reference} = { FileName => $ReferenceFile };
  $FingerprintsFilesInfo{Database} = { FileName => $DatabaseFile };

  # Information about both files must be available before comparison setup...
  RetrieveReferenceFingerprintsFileInfo();
  RetrieveDatabaseFingerprintsFileInfo();

  # Fingerprints comparison method and associated method parameters...
  SetupReferenceAndDatabaseFingerprintsComparisonInfo();

  # Information for output files...
  RetrieveOutputFilesInfo();
}
488
# Setup reference and database fingerprints comparison method and associated method parameters...
#
# The following keys are set up in %SimilaritySearchInfo: IndividualReferenceMode,
# MultipleReferencesMode, ComparisonMethod, ComparisonMethodParameters, ComparisonCutoff,
# KeepTop, ApplyComparisonCutoff, GroupFusionMethodRef, ApplyPrecisionDuringFusion, UsekNN,
# and SortComparisonValues.
#
# Dies when the first reference and database fingerprints string types or modes don't
# match, or for unsupported mode, fingerprints string type, or group fusion rule values.
# A mismatch in fingerprints string descriptions only leads to a warning.
#
sub SetupReferenceAndDatabaseFingerprintsComparisonInfo {

  # Make sure reference and database fingerprints string match. The database values are
  # quoted using \Q...\E to match them as literal text: any regular expression
  # metacharacters in them would otherwise change the outcome of the comparison...
  if (($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType} !~ /^\Q$FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}\E$/i) ||
      ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsBitVectorStringMode}) ||
      ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode} != $FingerprintsFilesInfo{Database}{FingerprintsVectorStringMode}) ) {
    die "Error: First reference fingerprints string type, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringType}, must match first database fingerprints type, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringType}.\n";
  }

  if ($FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription} !~ /^\Q$FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}\E$/i) {
    warn "Warning: First reference fingerprints string description, $FingerprintsFilesInfo{Reference}{FirstFingerprintsStringDescription}, doesn't match first database fingerprints string description, $FingerprintsFilesInfo{Database}{FirstFingerprintsStringDescription}.\n";
  }

  # Setup individual reference and multiple references search mode...
  $SimilaritySearchInfo{IndividualReferenceMode} = undef;
  $SimilaritySearchInfo{MultipleReferencesMode} = undef;

  if ($OptionsInfo{Mode} =~ /^IndividualReference$/i) {
    $SimilaritySearchInfo{IndividualReferenceMode} = 1;
  }
  elsif ($OptionsInfo{Mode} =~ /^MultipleReferences$/i) {
    $SimilaritySearchInfo{MultipleReferencesMode} = 1;
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
  }

  # Set up reference and database fingerprints similarity search method and parameters...
  my($ComparisonMeasure, $ComparisonMethod, $ApplyComparisonCutoff, $ComparisonCutoff, $KeepTop, @ComparisonMethodParameters);

  $SimilaritySearchInfo{ComparisonMethod} = '';
  @{$SimilaritySearchInfo{ComparisonMethodParameters}} = ();

  $SimilaritySearchInfo{ComparisonCutoff} = '';
  $SimilaritySearchInfo{KeepTop} = '';

  $ComparisonMeasure = ''; $ComparisonMethod = '';
  @ComparisonMethodParameters = ();

  FINGERPRINTSTYPE: {
    if ($FingerprintsFilesInfo{Reference}{FingerprintsBitVectorStringMode}) {
      $ComparisonMeasure = $OptionsInfo{SpecifiedBitVectorComparisonMeasure};
      $ComparisonMethod = $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod};

      # Only these bit vector comparison measures take additional parameters...
      if ($ComparisonMeasure =~ /^TverskySimilarity$/i) {
        push @ComparisonMethodParameters, $OptionsInfo{Alpha};
      }
      elsif ($ComparisonMeasure =~ /^WeightedTverskySimilarity$/i) {
        push @ComparisonMethodParameters, $OptionsInfo{Alpha};
        push @ComparisonMethodParameters, $OptionsInfo{Beta};
      }
      elsif ($ComparisonMeasure =~ /^WeightedTanimotoSimilarity$/i) {
        push @ComparisonMethodParameters, $OptionsInfo{Beta};
      }

      last FINGERPRINTSTYPE;
    }
    if ($FingerprintsFilesInfo{Reference}{FingerprintsVectorStringMode}) {
      my($SkipValuesCheck);

      $ComparisonMeasure = $OptionsInfo{SpecifiedVectorComparisonMeasure};
      $ComparisonMethod = $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod};

      # Vector comparison methods are passed comparison mode and skip values check flag...
      push @ComparisonMethodParameters, $OptionsInfo{SpecifiedVectorComparisonMode};

      $SkipValuesCheck = $OptionsInfo{Fast} ? 1 : 0;
      push @ComparisonMethodParameters, $SkipValuesCheck;

      last FINGERPRINTSTYPE;
    }
    die "Error: Uknown fingerprints string type. Supported values: FingerprintsBitVectorString or FingerprintsVectorString.\n";
  }

  # Cutoff is always applied during individual reference mode; its application during
  # multiple references mode is controlled by GroupFusionApplyCutoff option...
  $ApplyComparisonCutoff = $SimilaritySearchInfo{IndividualReferenceMode} ? 1 : (($SimilaritySearchInfo{MultipleReferencesMode} && $OptionsInfo{GroupFusionApplyCutoff}) ? 1 : 0);

  # For distance methods, similar molecules have the smallest values: keep bottom values
  # during similarity search. It's the other way around for similarity methods...
  $ComparisonCutoff = ''; $KeepTop = '';
  if ($ComparisonMethod =~ /Distance/i) {
    $ComparisonCutoff = $OptionsInfo{DistanceCutoff};
    $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 0 : 1;
  }
  else {
    $ComparisonCutoff = $OptionsInfo{SimilarityCutoff};
    $KeepTop = ($OptionsInfo{SearchMode} =~ /^SimilaritySearch$/i) ? 1 : 0;
  }

  $SimilaritySearchInfo{ComparisonMethod} = $ComparisonMethod;
  @{$SimilaritySearchInfo{ComparisonMethodParameters}} = @ComparisonMethodParameters;

  $SimilaritySearchInfo{ComparisonCutoff} = $ComparisonCutoff;
  $SimilaritySearchInfo{KeepTop} = $KeepTop;
  $SimilaritySearchInfo{ApplyComparisonCutoff} = $ApplyComparisonCutoff;

  # Setup references to group fusion methods...
  $SimilaritySearchInfo{GroupFusionMethodRef} = undef;
  $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = undef;

  FUSIONRULE: {
    if ($OptionsInfo{GroupFusionRule} =~ /^Max$/i) {
      # It's always the first value in the appropriated sorted list using value of KeepTop...
      $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[0]; };
      last FUSIONRULE;
    }
    if ($OptionsInfo{GroupFusionRule} =~ /^Min$/i) {
      # It's always the last value in the appropriated sorted list using value of KeepTop...
      $SimilaritySearchInfo{GroupFusionMethodRef} = sub { my($ComparisonValuesRef) = @_; return $ComparisonValuesRef->[$#{$ComparisonValuesRef}]; };
      last FUSIONRULE;
    }
    if ($OptionsInfo{GroupFusionRule} =~ /^Mean$/i) {
      $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Mean;
      $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
      last FUSIONRULE;
    }
    if ($OptionsInfo{GroupFusionRule} =~ /^Median$/i) {
      $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Median;
      $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
      last FUSIONRULE;
    }
    if ($OptionsInfo{GroupFusionRule} =~ /^Sum$/i) {
      $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Sum;
      $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
      last FUSIONRULE;
    }
    if ($OptionsInfo{GroupFusionRule} =~ /^Euclidean$/i) {
      $SimilaritySearchInfo{GroupFusionMethodRef} = \&StatisticsUtil::Euclidean;
      $SimilaritySearchInfo{ApplyPrecisionDuringFusion} = 1;
      last FUSIONRULE;
    }
    die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
  }

  # Comparison values need to be sorted to pick Max/Min values or the top kNN values...
  $SimilaritySearchInfo{UsekNN} = ($OptionsInfo{kNN} !~ /^All$/i) ? 1 : 0;
  $SimilaritySearchInfo{SortComparisonValues} = (($OptionsInfo{GroupFusionRule} =~ /^(Max|Min)$/i) || $SimilaritySearchInfo{UsekNN}) ? 1 : 0;
}
624
# Collect information about reference fingerprints file and store it under the Reference
# key in %FingerprintsFilesInfo: file type, input delimiter, parameters for
# FingerprintsFileIO, and fingerprints string mode, type, and description values...
#
sub RetrieveReferenceFingerprintsFileInfo {
  my $ReferenceFile = $FingerprintsFilesInfo{Reference}{FileName};

  my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($ReferenceFile);
  $FingerprintsFilesInfo{Reference}{FileType} = $FileType;
  $FingerprintsFilesInfo{Reference}{InDelim} = $InDelim;

  # Parameters used later on to set up FingerprintsFileIO for the reference file...
  %{$FingerprintsFilesInfo{Reference}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Reference', $FileType, $ReferenceFile);

  # Fingerprints string information, stored in the order it's returned...
  my @StringInfoKeys = qw(FingerprintsStringMode FingerprintsBitVectorStringMode FingerprintsVectorStringMode FirstFingerprintsStringType FirstFingerprintsStringDescription);
  my @StringInfoValues = RetrieveFingerprintsFileFingerprintsStringInfo('Reference', $ReferenceFile);

  @{$FingerprintsFilesInfo{Reference}}{@StringInfoKeys} = @StringInfoValues[0 .. $#StringInfoKeys];

}
648
# Collect information about database fingerprints file and store it under the Database
# key in %FingerprintsFilesInfo: file type, input delimiter, parameters for
# FingerprintsFileIO, and fingerprints string mode, type, and description values...
#
# Information about data fields and data columns to output, flags indicating collection
# of compound strings or data lines, and maximum number of similar molecules to retrieve
# is set up as well.
#
sub RetrieveDatabaseFingerprintsFileInfo {
  my $DatabaseFile = $FingerprintsFilesInfo{Database}{FileName};

  my($FileType, $InDelim) = RetrieveFingerprintsFileInfo($DatabaseFile);
  $FingerprintsFilesInfo{Database}{FileType} = $FileType;
  $FingerprintsFilesInfo{Database}{InDelim} = $InDelim;

  # Parameters used later on to set up FingerprintsFileIO for the database file...
  %{$FingerprintsFilesInfo{Database}{FingerprintsFileIOParameters}} = RetrieveFingerprintsFileIOParameters('Database', $FileType, $DatabaseFile);

  # Fingerprints string information, stored in the order it's returned...
  my @StringInfoKeys = qw(FingerprintsStringMode FingerprintsBitVectorStringMode FingerprintsVectorStringMode FirstFingerprintsStringType FirstFingerprintsStringDescription);
  my @StringInfoValues = RetrieveFingerprintsFileFingerprintsStringInfo('Database', $DatabaseFile);

  @{$FingerprintsFilesInfo{Database}}{@StringInfoKeys} = @StringInfoValues[0 .. $#StringInfoKeys];

  # Data fields, for SD file, and data columns, for text file, going to output files...
  RetrieveDatabaseFingerprintsDataFieldsInfo($DatabaseFile, $FileType, $InDelim);
  RetrieveDatabaseFingerprintsDataColsInfo($DatabaseFile, $FileType, $InDelim);

  # Any need to collect database compound string or data line for generation of results files...
  my $CollectCmpdStringData = ($FileType =~ /^SD$/i) ? 1 : 0;
  my $CollectDataLine = ($FileType =~ /^Text$/i && $OptionsInfo{DatabaseDataColsMode} =~ /^(All|Specify)$/i) ? 1 : 0;

  $FingerprintsFilesInfo{Database}{CollectCmpdStringData} = $CollectCmpdStringData;
  $FingerprintsFilesInfo{Database}{CollectDataLine} = $CollectDataLine;
  $FingerprintsFilesInfo{Database}{CollectInputFileData} = ($CollectCmpdStringData || $CollectDataLine) ? 1 : 0;

  # Maximum number of similar compounds to find for an individual reference or a set of
  # multiple reference compounds...
  SetMaximumSimilarMoleculesToRetrieve($DatabaseFile, $FileType, $InDelim);
}
689
# Retrieve database fingerprints data field information...
#
# Sets up the following keys under Database in %FingerprintsFilesInfo: OutputDataFields,
# DataFieldsToOutput, OutputCurrentDataFields, AllDataFields, CommonDataFields, and
# SpecifiedDatabaseDataFields. All of these stay at their initial empty or zero values for
# any database file type other than SD.
#
# The database SD file is opened for reading using three-argument open and a lexical
# file handle; the file name is always treated as a plain file name. Dies when the
# file can't be opened.
#
sub RetrieveDatabaseFingerprintsDataFieldsInfo {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($CollectDataFields, $CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef, @DataFieldsToOutput);

  $FingerprintsFilesInfo{Database}{OutputDataFields} = 0;
  @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}} = ();

  $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 0;

  @{$FingerprintsFilesInfo{Database}{AllDataFields}} = ();
  @{$FingerprintsFilesInfo{Database}{CommonDataFields}} = ();
  @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}} = ();

  if ($FileType !~ /^SD$/i) {
    return;
  }

  # No need to go over SD file and collect data fields for SD file during All DatabaseDataFieldsMode as
  # they would be retrieved from database SD file compound string during generation of output files...
  #
  $CollectDataFields = (($OptionsInfo{TextOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^(All|Common)$/i) || ($OptionsInfo{SDOutput} && $OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i)) ? 1 : 0;

  ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = (undef) x 3;

  if ($CollectDataFields) {
    open my $SDFileRef, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";
    ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileRef);
    close $SDFileRef;
  }

  @DataFieldsToOutput = ();
  if ($OptionsInfo{DatabaseDataFieldsMode} =~ /^All$/i) {
    if (defined $AllDataFieldsRef) {
      push @DataFieldsToOutput, @{$AllDataFieldsRef};
      push @{$FingerprintsFilesInfo{Database}{AllDataFields}}, @{$AllDataFieldsRef};
    }
    else {
      # Retrieve and output data fields and values dynamically...
      $FingerprintsFilesInfo{Database}{OutputCurrentDataFields} = 1;
    }
  }
  elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Common$/i) {
    if (defined $CommonDataFieldsRef) {
      push @DataFieldsToOutput, @{$CommonDataFieldsRef};
      push @{$FingerprintsFilesInfo{Database}{CommonDataFields}}, @{$CommonDataFieldsRef};
    }
  }
  elsif ($OptionsInfo{DatabaseDataFieldsMode} =~ /^Specify$/i) {
    push @DataFieldsToOutput, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
    push @{$FingerprintsFilesInfo{Database}{SpecifiedDatabaseDataFields}}, @{$OptionsInfo{SpecifiedDatabaseDataFields}};
  }

  # Data fields are written out for every mode except CompoundID...
  if ($OptionsInfo{DatabaseDataFieldsMode} !~ /^CompoundID$/i) {
    $FingerprintsFilesInfo{Database}{OutputDataFields} = 1;
  }

  push @{$FingerprintsFilesInfo{Database}{DataFieldsToOutput}}, @DataFieldsToOutput;

}
751
# Retrieve database fingerprints data columns information...
#
# Initializes the text file data column entries in %{$FingerprintsFilesInfo{Database}}
# and, for Text database files, fills them in using the column label line and the
# DatabaseDataColsMode option:
#
#   DataColLabels - All column labels from the first line.
#   DataColLabelToNumMap, DataColNumToLabelMap - Label <-> zero based column number maps.
#   DataColNumsToOutput - Zero based column numbers selected for output.
#   DataColLabelsToOutput - Labels of the selected columns, in the same order.
#   OutputDataCols - 1 when at least one column is selected for output.
#
# Parameters:
#   $FingerprintsFile - Name of database fingerprints file.
#   $FileType - Fingerprints file type; only initialization is performed unless it's Text.
#   $InDelim - Input delimiter name: Tab, semicolon or anything else for comma.
#
# Dies when the file can't be opened or a specified column number/label is not valid.
#
sub RetrieveDatabaseFingerprintsDataColsInfo {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($Line, $ColNum, $ColLabel, $NumOfCols, $TextFile, @DataColLabels, @DataColLabelsToOutput, @DataColNumsToOutput, %DataColLabelToNumMap, %DataColNumToLabelMap);

  $FingerprintsFilesInfo{Database}{OutputDataCols} = 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = ();
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = ();

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = ();
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = ();

  if ($FileType !~ /^Text$/i) {
    return;
  }

  @DataColLabels = ();
  @DataColLabelsToOutput = ();
  @DataColNumsToOutput = ();

  %DataColLabelToNumMap = ();
  %DataColNumToLabelMap = ();

  # Get column label line. Three-argument open keeps characters such as '>' or '|' in
  # the file name from being interpreted as an open mode...
  open $TextFile, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";
  $Line = TextUtil::GetTextLine($TextFile);
  close $TextFile;

  $InDelim = ($InDelim =~ /^Tab$/i) ? "\t" : ($InDelim =~ /semicolon/i ? "\;" : "\,");

  @DataColLabels = TextUtil::SplitWords($Line, $InDelim);
  $NumOfCols = scalar @DataColLabels;

  for $ColNum (0 .. $#DataColLabels) {
    $ColLabel = $DataColLabels[$ColNum];
    $DataColLabelToNumMap{$ColLabel} = $ColNum;
    $DataColNumToLabelMap{$ColNum} = $ColLabel;
  }

  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i) {
    if ($OptionsInfo{DatabaseColMode} =~ /^ColNum$/i) {
      # Specified column numbers start from 1; they are stored zero based...
      for $ColNum (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if ($ColNum > $NumOfCols) {
          die "Error: Column number, $ColNum, specified using \"--DatabaseDataCols\" is not valid: It must be <= $NumOfCols\n";
        }
        push @DataColNumsToOutput, ($ColNum - 1);
      }
    }
    elsif ($OptionsInfo{DatabaseColMode} =~ /^ColLabel$/i) {
      for $ColLabel (@{$OptionsInfo{SpecifiedDatabaseDataCols}}) {
        if (!exists $DataColLabelToNumMap{$ColLabel}) {
          die "Error: Column label, $ColLabel, specified using \"--DatabaseDataCols\" is not valid: It doesn't exist\n";
        }
        push @DataColNumsToOutput, $DataColLabelToNumMap{$ColLabel};
      }
    }
  }
  elsif ($OptionsInfo{DatabaseDataColsMode} =~ /^All$/i) {
    @DataColNumsToOutput = (0 .. $#DataColLabels);
  }

  # Setup data column labels to output. Labels are looked up by the selected column
  # numbers themselves, not by their positions in the selection list, so that labels
  # stay aligned with the columns chosen in Specify mode...
  if (scalar @DataColNumsToOutput) {
    @DataColLabelsToOutput = map { $DataColNumToLabelMap{$_} } @DataColNumsToOutput;
  }

  $FingerprintsFilesInfo{Database}{OutputDataCols} = scalar @DataColNumsToOutput ? 1 : 0;

  @{$FingerprintsFilesInfo{Database}{DataColLabels}} = @DataColLabels;
  %{$FingerprintsFilesInfo{Database}{DataColLabelToNumMap}} = %DataColLabelToNumMap;
  %{$FingerprintsFilesInfo{Database}{DataColNumToLabelMap}} = %DataColNumToLabelMap;

  @{$FingerprintsFilesInfo{Database}{DataColNumsToOutput}} = @DataColNumsToOutput;
  @{$FingerprintsFilesInfo{Database}{DataColLabelsToOutput}} = @DataColLabelsToOutput;
}
830
# Set maximum number of similar compounds to find for individual reference or set of multiple
# reference compounds...
#
# Nothing is done unless SimilarCountMode is PercentSimilar. Otherwise, the number of
# database molecules is counted, unless a count for a SD file is already available in
# $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules}, and
# $OptionsInfo{MaxSimilarMolecules} is set to PercentSimilarMolecules percent of it,
# rounded down with a minimum value of 1.
#
# Parameters:
#   $FingerprintsFile - Name of database fingerprints file.
#   $FileType - Fingerprints file type: SD, Text or FP.
#   $InDelim - Not used by this function.
#
# Dies when the database fingerprints file can't be opened for counting.
#
sub SetMaximumSimilarMoleculesToRetrieve {
  my($FingerprintsFile, $FileType, $InDelim) = @_;
  my($MaxSimilarMolecules, $NumOfDatabaseMolecules, $PercentSimilarMolecules, $Line, $FingerprintsFileHandle);

  if ($OptionsInfo{SimilarCountMode} !~ /^PercentSimilar$/i) {
    return;
  }

  $PercentSimilarMolecules = $OptionsInfo{PercentSimilarMolecules};

  # Count database entries to figure out MaxSimilarMolecules using PercentSimilarMolecules
  # value...
  $NumOfDatabaseMolecules = 0;
  if ($FileType =~ /^SD$/i && exists($FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules})) {
    # It might already be counted for SD file...
    $NumOfDatabaseMolecules = $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules};
  }
  else {
    print "Calculating maximum number of similar molecules to retrieve for \"PercentSimilar\" value of \"--SimilarCountMode\" option by counting number of molecules in database fingerprints file...\n";

    # Three-argument open keeps characters such as '>' or '|' in the file name from
    # being interpreted as an open mode...
    open $FingerprintsFileHandle, '<', $FingerprintsFile or die "Error: Couldn't open $FingerprintsFile: $! \n";

    if ($FileType =~ /^SD$/i) {
      # Each compound record is terminated by a $$$$ line...
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        if ($Line =~ /^\$\$\$\$/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    elsif ($FileType =~ /^Text$/i) {
      # Ignore column label line...
      $Line = TextUtil::GetTextLine($FingerprintsFileHandle);
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        $NumOfDatabaseMolecules++;
      }
    }
    elsif ($FileType =~ /^FP$/i) {
      # Lines starting with # are not counted as fingerprints data...
      while ($Line = TextUtil::GetTextLine($FingerprintsFileHandle)) {
        if ($Line !~ /^#/) {
          $NumOfDatabaseMolecules++;
        }
      }
    }
    else {
      # Unknown file type: nothing to count...
      $NumOfDatabaseMolecules = 0;
    }

    close $FingerprintsFileHandle;
    $FingerprintsFilesInfo{Database}{NumOfDatabaseMolecules} = $NumOfDatabaseMolecules;
  }

  $MaxSimilarMolecules = int (($NumOfDatabaseMolecules * $PercentSimilarMolecules)/100);
  if ($MaxSimilarMolecules < 1) {
    $MaxSimilarMolecules = 1;
  }

  $OptionsInfo{MaxSimilarMolecules} = $MaxSimilarMolecules;
}
892
# Figure out type and input delimiter for a fingerprints file...
#
# Parameters:
#   $FingerprintsFile - Name of fingerprints file.
#
# Returns ($FileType, $InDelim). $InDelim is an empty string unless the file type is
# Text; for Text files, it's 'Tab' for a tsv file extension and the value of InDelim
# option otherwise.
#
# Dies when the file doesn't exist or its type can't be determined.
#
sub RetrieveFingerprintsFileInfo {
  my($FingerprintsFile) = @_;

  unless (-e $FingerprintsFile) {
    die "Error: Input fingerprints file, $FingerprintsFile, doesn't exist.\n";
  }

  my $Type = Fingerprints::FingerprintsFileUtil::GetFingerprintsFileType($FingerprintsFile);
  if (IsEmpty($Type)) {
    die "Error: Input file, $FingerprintsFile, is not a fingerprints file.\n";
  }

  # Only text files have an input delimiter...
  my $Delim = '';
  if ($Type =~ /^Text$/i) {
    my($Dir, $Name, $Ext) = ParseFileName($FingerprintsFile);
    if ($Ext =~ /^tsv$/i) {
      $Delim = 'Tab';
    }
    else {
      $Delim = $OptionsInfo{InDelim};
    }
  }

  return ($Type, $Delim);
}
917
# Assemble name and value pairs used to create a fingerprints file IO object...
#
# Parameters:
#   $FingerprintsFileMode - Reference or Database; used as prefix to pick up mode
#       specific option values.
#   $FileType - Fingerprints file type: SD, FP or Text.
#   $FingerprintsFile - Name of fingerprints file.
#
# Returns a hash containing IO parameters for the file type.
#
# Dies for an unknown fingerprints file mode or file type.
#
sub RetrieveFingerprintsFileIOParameters {
  my($FingerprintsFileMode, $FileType, $FingerprintsFile) = @_;

  if ($FingerprintsFileMode !~ /^(Reference|Database)$/) {
    die "Error: Unknown fingerprints file mode: $FingerprintsFileMode. Supported values: Reference or Database\n";
  }

  # Parameters common to all file types...
  my %SharedParams = (
    'Name' => $FingerprintsFile,
    'Mode' => 'Read',
    'FingerprintsStringMode' => $OptionsInfo{FingerprintsMode},
    'ValidateData' => $OptionsInfo{ValidateData},
    'DetailLevel' => $OptionsInfo{Detail},
  );

  my %FingerprintsFileIOParams = ();

  if ($FileType =~ /^SD$/i) {
    %FingerprintsFileIOParams = (
      %SharedParams,
      'FingerprintsFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsField"},
      'CompoundIDMode' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDMode"},
      'CompoundIDFieldLabel' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDField"},
      'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"},
    );
  }
  elsif ($FileType =~ /^FP$/i) {
    %FingerprintsFileIOParams = %SharedParams;
  }
  elsif ($FileType =~ /^Text$/i) {
    %FingerprintsFileIOParams = (
      %SharedParams,
      'FingerprintsCol' => $OptionsInfo{"${FingerprintsFileMode}FingerprintsCol"},
      'ColMode' => $OptionsInfo{"${FingerprintsFileMode}ColMode"},
      'CompoundIDCol' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDCol"},
      'CompoundIDPrefix' => $OptionsInfo{"${FingerprintsFileMode}CompoundIDPrefix"},
      'InDelim' => $FingerprintsFilesInfo{$FingerprintsFileMode}{InDelim},
    );
  }
  else {
    die "Error: Fingerprints file type, $FileType, is not valid. Supported file types: SD, FP or Text\n";
  }

  return %FingerprintsFileIOParams;
}
948
# Make sure fingerprints data file contains valid data and retrieve fingerprints string mode information...
#
# A fingerprints file IO object is created using IO parameters already stored in
# $FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}, queried
# and closed.
#
# Parameters:
#   $FingerprintsFileMode - Key into %FingerprintsFilesInfo for the file being checked;
#       it's also used to name the file in error messages.
#   $FingerprintsFile - Name of fingerprints file; used only in error messages.
#
# Returns ($FingerprintsStringMode, $FingerprintsBitVectorStringMode,
# $FingerprintsVectorStringMode, $FirstFingerprintsStringType,
# $FirstFingerprintsStringDescription).
#
# Dies when the IO object can't be created or reports invalid fingerprints data.
#
sub RetrieveFingerprintsFileFingerprintsStringInfo {
  my($FingerprintsFileMode, $FingerprintsFile) = @_;
  my($FingerprintsFileIO, $FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);

  # Error messages name the mode passed in instead of always claiming a problem with
  # the reference file...
  $FingerprintsFileIO = Fingerprints::FingerprintsFileUtil::NewFingerprintsFileIO(%{$FingerprintsFilesInfo{$FingerprintsFileMode}{FingerprintsFileIOParameters}});
  if (!$FingerprintsFileIO) {
    die "Error: $FingerprintsFileMode fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
  }
  if (!$FingerprintsFileIO->IsFingerprintsFileDataValid()) {
    die "Error: $FingerprintsFileMode fingerprints file, $FingerprintsFile, contains invalid fingerprints data.\n";
  }

  $FingerprintsStringMode = $FingerprintsFileIO->GetFingerprintsStringMode();
  $FingerprintsBitVectorStringMode = $FingerprintsFileIO->GetFingerprintsBitVectorStringMode();
  $FingerprintsVectorStringMode = $FingerprintsFileIO->GetFingerprintsVectorStringMode();

  $FirstFingerprintsStringType = $FingerprintsFileIO->GetFirstFingerprintsStringType();
  $FirstFingerprintsStringDescription = $FingerprintsFileIO->GetFirstFingerprintsStringDescription();

  $FingerprintsFileIO->Close();

  return ($FingerprintsStringMode, $FingerprintsBitVectorStringMode, $FingerprintsVectorStringMode, $FirstFingerprintsStringType, $FirstFingerprintsStringDescription);
}
974
# Retrieve output files names using reference fingerprints file name...
#
# The output file root is taken from OutFileRoot option, with any file extension
# dropped, or defaults to <ReferenceFileName>SimilaritySearching. SD output files get
# sdf extension; text output files get tsv or csv extension based on the output
# delimiter.
#
# Results are stored in %OutputFilesInfo under OutFileRoot, SDOutFileName and
# TextOutFileName keys.
#
# Dies when a requested output file name is the same, ignoring case, as the reference or
# database input file name, or when it already exists and overwriting is not allowed.
#
sub RetrieveOutputFilesInfo {
  my($FingerprintsFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $SDOutFileName, $TextOutFileName, $SDOutFileExt, $TextOutFileExt, $ReferenceFileName, $DatabaseFileName);

  $OutputFilesInfo{OutFileRoot} = '';
  $OutputFilesInfo{SDOutFileName} = '';
  $OutputFilesInfo{TextOutFileName} = '';

  $FingerprintsFile = $FingerprintsFilesInfo{Reference}{FileName};

  $FileDir = ""; $FileName = ""; $FileExt = "";
  ($FileDir, $FileName, $FileExt) = ParseFileName($FingerprintsFile);

  $SDOutFileExt = "sdf";
  $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";

  if ($OptionsInfo{OutFileRoot}) {
    my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
    if ($RootFileName && $RootFileExt) {
      $FileName = $RootFileName;
    }
    else {
      $FileName = $OptionsInfo{OutFileRoot};
    }
    $OutFileRoot = $FileName;
  }
  else {
    $OutFileRoot = "${FileName}SimilaritySearching";
  }

  $SDOutFileName = "${OutFileRoot}.${SDOutFileExt}";
  $TextOutFileName = "${OutFileRoot}.${TextOutFileExt}";

  $ReferenceFileName = $FingerprintsFilesInfo{Reference}{FileName};
  $DatabaseFileName = $FingerprintsFilesInfo{Database}{FileName};

  # File names are compared as case insensitive strings rather than being interpolated
  # into a pattern: characters such as '.', '+' or '(' in a file name would otherwise be
  # treated as regular expression syntax and lead to false matches or pattern errors...
  #
  if ($OptionsInfo{SDOutput}) {
    if (lc($SDOutFileName) eq lc($ReferenceFileName)) {
      die "Error: Same output, $SDOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
    if (lc($SDOutFileName) eq lc($DatabaseFileName)) {
      die "Error: Same output, $SDOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
  }

  if ($OptionsInfo{TextOutput}) {
    if (lc($TextOutFileName) eq lc($ReferenceFileName)) {
      die "Error: Same output, $TextOutFileName, and reference input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
    if (lc($TextOutFileName) eq lc($DatabaseFileName)) {
      die "Error: Same output, $TextOutFileName, and database input file names.\nSpecify a different name using \"-r --root\" option or use default name.\n";
    }
  }

  if (!$OptionsInfo{OverwriteFiles}) {
    if ($OptionsInfo{SDOutput}) {
      if (-e $SDOutFileName) {
        die "Error: The output file $SDOutFileName already exists.\n";
      }
    }
    if ($OptionsInfo{TextOutput}) {
      if (-e $TextOutFileName) {
        die "Error: The output file $TextOutFileName already exists.\n";
      }
    }
  }

  $OutputFilesInfo{OutFileRoot} = $OutFileRoot;
  $OutputFilesInfo{SDOutFileName} = $SDOutFileName;
  $OutputFilesInfo{TextOutFileName} = $TextOutFileName;

}
1048
# Collect reference and database fingerprints file names from command line...
#
# Exactly two command line arguments are expected: reference fingerprints file name
# followed by database fingerprints file name. They are stored in
# @FingerprintsFilesList in that order.
#
# Dies with script usage text for any other number of arguments.
#
sub ProcessFingerprintsFileNames {
  @FingerprintsFilesList = ();

  unless (@ARGV == 2) {
    die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  }

  # Reference file name first, database file name second...
  push @FingerprintsFilesList, @ARGV[0, 1];
}
1065
# Process option values...
#
# Resets %OptionsInfo and fills it in using command line option values available in
# %Options. Comparison and fingerprints data options are handled by the corresponding
# Process*Options functions.
#
# Dies when the value specified for kNN option is neither All nor a positive integer.
#
sub ProcessOptions {
  %OptionsInfo = ();

  @OptionsInfo{qw(Mode FingerprintsMode SearchMode)} = @Options{qw(mode fingerprintsmode searchmode)};

  ProcessBitVectorComparisonOptions();
  ProcessVectorComparisonOptions();

  $OptionsInfo{GroupFusionRule} = $Options{groupfusionrule};
  if ($Options{groupfusionapplycutoff} =~ /^Yes$/i) {
    $OptionsInfo{GroupFusionApplyCutoff} = 1;
  }
  else {
    $OptionsInfo{GroupFusionApplyCutoff} = 0;
  }

  @OptionsInfo{qw(SimilarCountMode NumOfSimilarMolecules PercentSimilarMolecules)} = @Options{qw(similarcountmode numofsimilarmolecules percentsimilarmolecules)};

  # MaxSimilarMolecules starts out as NumOfSimilarMolecules. For PercentSimilar value of
  # SimilarCountMode, it's overwritten later using number of entries in database
  # fingerprints file and value of PercentSimilarMolecules...
  #
  $OptionsInfo{MaxSimilarMolecules} = $OptionsInfo{NumOfSimilarMolecules};

  @OptionsInfo{qw(SimilarityCutoff DistanceCutoff kNN)} = @Options{qw(similaritycutoff distancecutoff knn)};

  if ($Options{knn} !~ /^All$/i && !IsPositiveInteger($Options{knn})) {
    die "Error: The value specified, $Options{knn}, for option \"-k, --KNN\" is not valid. Allowed values: > 0 \n";
  }

  ProcessReferenceFingerprintsDataOptions();
  ProcessDatabaseFingerprintsDataOptions();

  $OptionsInfo{Detail} = $Options{detail};

  $OptionsInfo{InDelim} = $Options{indelim};

  # Map output delimiter name to the delimiter character; comma is the default...
  if ($Options{outdelim} =~ /tab/i ) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  # Output could be SD, Text or Both...
  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Data validation is turned off during fast mode...
  $OptionsInfo{Fast} = $Options{fast} ? 1 : 0;
  $OptionsInfo{ValidateData} = $Options{fast} ? 0 : 1;

  $OptionsInfo{Precision} = $Options{precision};
}
1121
# Process options related to comparison of bit vector strings...
#
# Validates the value of BitVectorComparisonMode option against similarity coefficients
# supported by Fingerprints::FingerprintsBitVector and sets up the following
# %OptionsInfo entries: BitVectorComparisonMode, SpecifiedBitVectorComparisonMeasure,
# SpecifiedBitVectorComparisonMeasureName, SpecifiedBitVectorComparisonMeasureMethod,
# Alpha and Beta.
#
# Dies for an unsupported comparison measure or a missing/invalid alpha or beta value
# for measures requiring them.
#
sub ProcessBitVectorComparisonOptions {
  my(@SupportedMeasures, %MeasureNameMap, %MeasureMethodMap);

  @SupportedMeasures = ();
  %MeasureNameMap = ();
  %MeasureMethodMap = ();

  # Method names end with "Coefficient"; measure names used on the command line don't.
  # Map lower case measure names to measure and method names...
  for my $MethodName (Fingerprints::FingerprintsBitVector::GetSupportedSimilarityCoefficients()) {
    my $MeasureName = $MethodName;
    $MeasureName =~ s/Coefficient$//;

    push @SupportedMeasures, $MeasureName;
    $MeasureNameMap{lc($MeasureName)} = $MeasureName;
    $MeasureMethodMap{lc($MeasureName)} = $MethodName;
  }

  # Validate specified comparison measure...
  my $SpecifiedMeasure = $Options{bitvectorcomparisonmode};

  if (! exists $MeasureMethodMap{lc($SpecifiedMeasure)} ) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-b --BitVectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedMeasures, ", ", 0), "\n";
  }

  $OptionsInfo{BitVectorComparisonMode} = $Options{bitvectorcomparisonmode};

  $OptionsInfo{SpecifiedBitVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureName} = $MeasureNameMap{lc($SpecifiedMeasure)};
  $OptionsInfo{SpecifiedBitVectorComparisonMeasureMethod} = $MeasureMethodMap{lc($SpecifiedMeasure)};

  # Alpha is needed for Tversky calculations and must be in the range from 0 to 1...
  $OptionsInfo{Alpha} = '';
  if ($SpecifiedMeasure =~ /^(TverskySimilarity|WeightedTverskySimilarity)$/i) {
    if (IsEmpty($Options{alpha})) {
      die "Error: You must specify a value for \"-a, --alpha\" option in \"TverskySimilarity or WeightedTverskySimilarity\" \"-m --mode\". \n";
    }
    my $AlphaValue = $Options{alpha};
    unless (IsFloat($AlphaValue) && $AlphaValue >=0 && $AlphaValue <= 1) {
      die "Error: The value specified, $Options{alpha}, for option \"-a, --alpha\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Alpha} = $AlphaValue;
  }

  # Beta is needed for WeightedTanimoto and WeightedTversky calculations and must be
  # in the range from 0 to 1...
  $OptionsInfo{Beta} = '';
  if ($SpecifiedMeasure =~ /^(WeightedTverskySimilarity|WeightedTanimotoSimilarity)$/i) {
    if (IsEmpty($Options{beta})) {
      die "Error: You must specify a value for \"-b, --beta\" option in \"WeightedTverskySimilarity or WeightedTanimotoSimilarity\" \"-m --mode\". \n";
    }
    my $BetaValue = $Options{beta};
    unless (IsFloat($BetaValue) && $BetaValue >=0 && $BetaValue <= 1) {
      die "Error: The value specified, $Options{beta}, for option \"-b, --beta\" is not valid. Allowed values: >= 0 and <= 1\n";
    }
    $OptionsInfo{Beta} = $BetaValue;
  }
}
1193
# Process options related to comparison of vector strings...
#
# Validates the values of VectorComparisonMode and VectorComparisonFormulism options,
# with any spaces removed, and sets up the following %OptionsInfo entries:
# VectorComparisonMode, SpecifiedVectorComparisonMeasure,
# SpecifiedVectorComparisonMeasuresName, SpecifiedVectorComparisonMeasuresMethod,
# VectorComparisonFormulism and SpecifiedVectorComparisonMode.
#
# Dies for an unsupported comparison measure or formulism.
#
sub ProcessVectorComparisonOptions {
  my(@SupportedMeasures, %MeasureNameMap, %MeasureMethodMap);

  @SupportedMeasures = ();
  %MeasureNameMap = ();
  %MeasureMethodMap = ();

  # Similarity and distance method names may end with "Coefficient"; measure names used
  # on the command line don't. Map lower case measure names to measure and method
  # names...
  for my $MethodName (Fingerprints::FingerprintsVector::GetSupportedDistanceAndSimilarityCoefficients()) {
    my $MeasureName = $MethodName;
    $MeasureName =~ s/Coefficient$//i;

    push @SupportedMeasures, $MeasureName;
    $MeasureNameMap{lc($MeasureName)} = $MeasureName;
    $MeasureMethodMap{lc($MeasureName)} = $MethodName;
  }

  # Validate specified comparison measure...
  my $SpecifiedMeasure = $Options{vectorcomparisonmode};
  $SpecifiedMeasure =~ s/ //g;

  unless (exists($MeasureMethodMap{lc($SpecifiedMeasure)})) {
    die "Error: The value specified, $SpecifiedMeasure, for option \"-v --VectorComparisonMode\" is not valid.\nAllowed values:", JoinWords(\@SupportedMeasures, ", ", 0), "\n";
  }

  $OptionsInfo{VectorComparisonMode} = $Options{vectorcomparisonmode};

  $OptionsInfo{SpecifiedVectorComparisonMeasure} = $SpecifiedMeasure;
  $OptionsInfo{SpecifiedVectorComparisonMeasuresName} = $MeasureNameMap{lc($SpecifiedMeasure)};
  $OptionsInfo{SpecifiedVectorComparisonMeasuresMethod} = $MeasureMethodMap{lc($SpecifiedMeasure)};

  # Validate specified vector comparison formulism...
  my $SpecifiedFormulism = $Options{vectorcomparisonformulism};
  $SpecifiedFormulism =~ s/ //g;

  unless ($SpecifiedFormulism =~ /^(AlgebraicForm|BinaryForm|SetTheoreticForm)$/i) {
    die "Error: The value specified, $SpecifiedFormulism, for option \"--VectorComparisonFormulism\" is not valid. Allowed values: AlgebraicForm, BinaryForm or SetTheoreticForm\n";
  }

  $OptionsInfo{VectorComparisonFormulism} = $Options{vectorcomparisonformulism};
  $OptionsInfo{SpecifiedVectorComparisonMode} = $SpecifiedFormulism;

}
1250
# Process options related to data retrieval from reference fingerprints SD and CSV/TSV
# text files...
#
# Sets up the following %OptionsInfo entries: ReferenceCompoundIDPrefix,
# ReferenceColMode, ReferenceCompoundIDCol, ReferenceFingerprintsCol,
# ReferenceCompoundIDMode, ReferenceCompoundIDField and ReferenceFingerprintsField.
# Column and fingerprints field values default to AutoDetect.
#
# Dies for invalid column numbers, for identical compound ID and fingerprints
# columns/fields, and for a missing compound ID field in DataField mode.
#
sub ProcessReferenceFingerprintsDataOptions {
  my($CompoundIDCol, $FingerprintsCol, $CompoundIDField, $FingerprintsField);

  $OptionsInfo{ReferenceCompoundIDPrefix} = $Options{referencecompoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{ReferenceColMode} = $Options{referencecolmode};

  $CompoundIDCol = $Options{referencecompoundidcol};
  $FingerprintsCol = $Options{referencefingerprintscol};

  if (IsNotEmpty($CompoundIDCol)) {
    # Column numbers must be positive integers...
    if ($Options{referencecolmode} =~ /^ColNum$/i && !IsPositiveInteger($CompoundIDCol)) {
      die "Error: Column value, $Options{referencecompoundidcol}, specified using \"--ReferenceCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{ReferenceCompoundIDCol} = $CompoundIDCol;
  }
  else {
    $OptionsInfo{ReferenceCompoundIDCol} = 'AutoDetect';
  }

  if (IsNotEmpty($FingerprintsCol)) {
    if ($Options{referencecolmode} =~ /^ColNum$/i && !IsPositiveInteger($FingerprintsCol)) {
      die "Error: Column value, $Options{referencefingerprintscol}, specified using \"--ReferenceFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{ReferenceFingerprintsCol} = $FingerprintsCol;
  }
  else {
    $OptionsInfo{ReferenceFingerprintsCol} = 'AutoDetect';
  }

  # Compound ID and fingerprints columns must be different: compare numerically when
  # both are positive integers and as strings otherwise...
  if (IsNotEmpty($CompoundIDCol) && IsNotEmpty($FingerprintsCol)) {
    my $SameCol = (IsPositiveInteger($CompoundIDCol) && IsPositiveInteger($FingerprintsCol)) ? ($CompoundIDCol == $FingerprintsCol) : ($CompoundIDCol eq $FingerprintsCol);
    if ($SameCol) {
      die "Error: Values specified using \"--ReferenceCompoundIDCol\" and \"--ReferenceFingerprintsCol\", $Options{referencecompoundidcol}, must be different.\n";
    }
  }

  # Compound ID and fingerprints field options for SD files...

  $CompoundIDField = $Options{referencecompoundidfield};
  $FingerprintsField = $Options{referencefingerprintsfield};

  $OptionsInfo{ReferenceCompoundIDMode} = $Options{referencecompoundidmode};
  $OptionsInfo{ReferenceCompoundIDField} = '';

  if ($Options{referencecompoundidmode} =~ /^DataField$/i && !$CompoundIDField) {
    die "Error: You must specify a value for \"--ReferenceCompoundIDField\" option in \"DataField\" \"--ReferenceCompoundIDMode\". \n";
  }
  if ($CompoundIDField) {
    $OptionsInfo{ReferenceCompoundIDField} = $CompoundIDField;
  }

  $OptionsInfo{ReferenceFingerprintsField} = IsNotEmpty($FingerprintsField) ? $FingerprintsField : 'AutoDetect';

  # Compound ID and fingerprints fields must be different...
  if ($CompoundIDField && IsNotEmpty($FingerprintsField) && $CompoundIDField eq $FingerprintsField) {
    die "Error: Values specified using \"--ReferenceCompoundIDField\" and \"--ReferenceFingerprintsfield\", $Options{referencecompoundidfield}, must be different.\n";
  }

}
1325
# Process options related to data retrieval from database fingerprints SD and CSV/TSV
# text files...
#
# Sets up the following %OptionsInfo entries: DatabaseCompoundIDPrefix, DatabaseColMode,
# DatabaseCompoundIDCol, DatabaseFingerprintsCol, DatabaseDataColsMode, DatabaseDataCols,
# SpecifiedDatabaseDataCols, DatabaseCompoundIDMode, DatabaseCompoundIDField,
# DatabaseFingerprintsField, DatabaseDataFieldsMode, DatabaseDataFields and
# SpecifiedDatabaseDataFields. Column and fingerprints field values default to
# AutoDetect.
#
# Dies for invalid column numbers, for identical compound ID and fingerprints
# columns/fields, and for missing values required by Specify or DataField modes.
#
sub ProcessDatabaseFingerprintsDataOptions {
  my($CompoundIDCol, $FingerprintsCol, $CompoundIDField, $FingerprintsField);

  $OptionsInfo{DatabaseCompoundIDPrefix} = $Options{databasecompoundidprefix} || 'Cmpd';

  # Compound ID and fingerprints column options for text files...

  $OptionsInfo{DatabaseColMode} = $Options{databasecolmode};

  $CompoundIDCol = $Options{databasecompoundidcol};
  $FingerprintsCol = $Options{databasefingerprintscol};

  if (IsNotEmpty($CompoundIDCol)) {
    # Column numbers must be positive integers...
    if ($Options{databasecolmode} =~ /^ColNum$/i && !IsPositiveInteger($CompoundIDCol)) {
      die "Error: Column value, $Options{databasecompoundidcol}, specified using \"--DatabaseCompoundIDCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{DatabaseCompoundIDCol} = $CompoundIDCol;
  }
  else {
    $OptionsInfo{DatabaseCompoundIDCol} = 'AutoDetect';
  }

  if (IsNotEmpty($FingerprintsCol)) {
    if ($Options{databasecolmode} =~ /^ColNum$/i && !IsPositiveInteger($FingerprintsCol)) {
      die "Error: Column value, $Options{databasefingerprintscol}, specified using \"--DatabaseFingerprintsCol\" is not valid: Allowed integer values: > 0\n";
    }
    $OptionsInfo{DatabaseFingerprintsCol} = $FingerprintsCol;
  }
  else {
    $OptionsInfo{DatabaseFingerprintsCol} = 'AutoDetect';
  }

  # Compound ID and fingerprints columns must be different: compare numerically when
  # both are positive integers and as strings otherwise...
  if (IsNotEmpty($CompoundIDCol) && IsNotEmpty($FingerprintsCol)) {
    my $SameCol = (IsPositiveInteger($CompoundIDCol) && IsPositiveInteger($FingerprintsCol)) ? ($CompoundIDCol == $FingerprintsCol) : ($CompoundIDCol eq $FingerprintsCol);
    if ($SameCol) {
      die "Error: Values specified using \"--DatabaseCompoundIDCol\" and \"--DatabaseFingerprintsCol\", $Options{databasecompoundidcol}, must be different.\n";
    }
  }

  # Database data column options for text files...

  $OptionsInfo{DatabaseDataColsMode} = $Options{databasedatacolsmode};
  $OptionsInfo{DatabaseDataCols} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataCols}} = ();

  if ($Options{databasedatacolsmode} =~ /^Specify$/i) {
    if (!$Options{databasedatacols}) {
      die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
    }
    my $DataCols = $Options{databasedatacols};

    # Spaces are removed and values are validated only for column numbers; column
    # labels are used as specified...
    my $ColNumMode = ($Options{databasecolmode} =~ /^ColNum$/i) ? 1 : 0;
    if ($ColNumMode) {
      $DataCols =~ s/ //g;
    }

    my @DataColsList = split /\,/, $DataCols;
    if ($ColNumMode) {
      for my $DataColNum (@DataColsList) {
        if (!IsPositiveInteger($DataColNum)) {
          die "Error: Column value, $DataColNum, specified using \"--DatabaseDataCols\" is not valid: Allowed integer values: > 0\n";
        }
      }
    }

    $OptionsInfo{DatabaseDataCols} = $DataCols;
    push @{$OptionsInfo{SpecifiedDatabaseDataCols}}, @DataColsList;
  }
  elsif ($Options{databasedatacolsmode} =~ /^All$/i) {
    $OptionsInfo{DatabaseDataCols} = 'All';
  }

  # Catch a value which became empty after removal of spaces...
  if ($OptionsInfo{DatabaseDataColsMode} =~ /^Specify$/i && !$OptionsInfo{DatabaseDataCols}) {
    die "Error: You must specify a value for \"--DatabaseDataCols\" option in \"Specify\" \"--DatabaseDataColsMode\". \n";
  }

  # Compound ID and fingerprints field options for SD files...

  $CompoundIDField = $Options{databasecompoundidfield};
  $FingerprintsField = $Options{databasefingerprintsfield};

  $OptionsInfo{DatabaseCompoundIDMode} = $Options{databasecompoundidmode};
  $OptionsInfo{DatabaseCompoundIDField} = $CompoundIDField || '';

  if ($Options{databasecompoundidmode} =~ /^DataField$/i && !$CompoundIDField) {
    die "Error: You must specify a value for \"--DatabaseCompoundIDField\" option in \"DataField\" \"--DatabaseCompoundIDMode\". \n";
  }

  $OptionsInfo{DatabaseFingerprintsField} = IsNotEmpty($FingerprintsField) ? $FingerprintsField : 'AutoDetect';

  # Compound ID and fingerprints fields must be different...
  if ($CompoundIDField && IsNotEmpty($FingerprintsField) && $CompoundIDField eq $FingerprintsField) {
    die "Error: Values specified using \"--DatabaseCompoundIDField\" and \"--DatabaseFingerprintsfield\", $Options{databasecompoundidfield}, must be different.\n";
  }

  # Database data field options for SD files...

  $OptionsInfo{DatabaseDataFieldsMode} = $Options{databasedatafieldsmode};
  $OptionsInfo{DatabaseDataFields} = '';
  @{$OptionsInfo{SpecifiedDatabaseDataFields}} = ();

  if ($Options{databasedatafieldsmode} =~ /^Specify$/i && !$Options{databasedatafields}) {
    die "Error: You must specify a value for \"--DatabaseDataFields\" option in \"Specify\" \"--DatabaseDataFieldsMode\". \n";
  }
  if ($Options{databasedatafields}) {
    $OptionsInfo{DatabaseDataFields} = $Options{databasedatafields};
    push @{$OptionsInfo{SpecifiedDatabaseDataFields}}, split(/\,/, $Options{databasedatafields});
  }
}
1454
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash, seeds it with default values, overlays
# the values supplied on the command line via GetOptions, changes into the
# directory named by -w/--workingdir when one is given, and finally validates
# the option values which have a fixed set or range of allowed values.
#
# Takes no arguments and returns nothing useful; dies with an error message at
# the first problem found (unparsable command line, bad working directory, or
# an invalid option value).
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  # Default values; anything specified on the command line replaces these...
  $Options{alpha} = 0.5;
  $Options{beta} = 1;

  $Options{bitvectorcomparisonmode} = "TanimotoSimilarity";

  $Options{databasecolmode} = 'colnum';

  $Options{databasecompoundidprefix} = 'Cmpd';
  $Options{databasecompoundidmode} = 'LabelPrefix';

  $Options{databasedatacolsmode} = 'CompoundID';
  $Options{databasedatafieldsmode} = 'CompoundID';

  $Options{distancecutoff} = 10;

  $Options{referencecolmode} = 'colnum';

  $Options{referencecompoundidprefix} = 'Cmpd';
  $Options{referencecompoundidmode} = 'LabelPrefix';

  $Options{detail} = 1;

  $Options{fingerprintsmode} = 'AutoDetect';
  $Options{groupfusionrule} = 'Max';
  $Options{groupfusionapplycutoff} = 'Yes';

  $Options{knn} = 'All';

  $Options{mode} = 'MultipleReferences';

  $Options{numofsimilarmolecules} = 10;
  $Options{percentsimilarmolecules} = 1;

  $Options{indelim} = 'comma';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{output} = 'text';

  $Options{precision} = 2;

  $Options{searchmode} = 'SimilaritySearch';

  $Options{similarcountmode} = 'NumOfSimilar';

  $Options{similaritycutoff} = 0.75;

  $Options{vectorcomparisonmode} = 'TanimotoSimilarity';
  $Options{vectorcomparisonformulism} = 'AlgebraicForm';

  # Option specifications handed to GetOptions, one per line for readability...
  my(@OptionSpecs) = (
    "alpha=f",
    "beta=f",
    "bitvectorcomparisonmode|b=s",
    "databasecolmode=s",
    "databasecompoundidcol=s",
    "databasecompoundidprefix=s",
    "databasecompoundidfield=s",
    "databasecompoundidmode=s",
    "databasedatacols=s",
    "databasedatacolsmode=s",
    "databasedatafields=s",
    "databasedatafieldsmode=s",
    "databasefingerprintscol=s",
    "databasefingerprintsfield=s",
    "distancecutoff=f",
    "detail|d=i",
    "fast|f",
    "fingerprintsmode=s",
    "groupfusionrule|g=s",
    "groupfusionapplycutoff=s",
    "help|h",
    "indelim=s",
    "knn|k=s",
    "mode|m=s",
    "numofsimilarmolecules|n=i",
    "outdelim=s",
    "output=s",
    "overwrite|o",
    "percentsimilarmolecules|p=f",
    "precision=s",
    "quote|q=s",
    "referencecolmode=s",
    "referencecompoundidcol=s",
    "referencecompoundidprefix=s",
    "referencecompoundidfield=s",
    "referencecompoundidmode=s",
    "referencefingerprintscol=s",
    "referencefingerprintsfield=s",
    "root|r=s",
    "searchmode|s=s",
    "similarcountmode=s",
    "similaritycutoff=f",
    "vectorcomparisonmode|v=s",
    "vectorcomparisonformulism=s",
    "workingdir|w=s",
  );

  if (!GetOptions(\%Options, @OptionSpecs)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before validating the remaining options...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; the first invalid value terminates the script...
  if ($Options{databasecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{databasecolmode}, for option \"--DatabaseColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{databasecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{databasecompoundidmode}, for option \"--DatabaseCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{databasedatacolsmode} !~ /^(All|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatacolsmode}, for option \"--DatabaseDataColsMode\" is not valid. Allowed values: All, Specify, or CompoundID\n";
  }
  if ($Options{databasedatafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{databasedatafieldsmode}, for option \"--DatabaseDataFieldsMode\" is not valid. Allowed values: All, Common, Specify, or CompoundID\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d, --detail\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{fingerprintsmode} !~ /^(AutoDetect|FingerprintsBitVectorString|FingerprintsVectorString)$/i) {
    die "Error: The value specified, $Options{fingerprintsmode}, for option \"--FingerprintsMode\" is not valid. Allowed values: AutoDetect, FingerprintsBitVectorString or FingerprintsVectorString \n";
  }
  if ($Options{groupfusionrule} !~ /^(Max|Min|Mean|Median|Sum|Euclidean)$/i) {
    die "Error: The value specified, $Options{groupfusionrule}, for option \"-g, --GroupFusionRule\" is not valid. Allowed values: Max, Min, Mean, Median, Sum, Euclidean\n";
  }
  if ($Options{groupfusionapplycutoff} !~ /^(Yes|No)$/i) {
    # Report the offending --GroupFusionApplyCutoff value itself, not the value of --quote...
    die "Error: The value specified, $Options{groupfusionapplycutoff}, for option \"--GroupFusionApplyCutoff\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--InDelim\" is not valid. Allowed values: comma, or semicolon\n";
  }
  if ($Options{mode} !~ /^(IndividualReference|MultipleReferences)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: IndividualReference, MultipleReferences\n";
  }
  if (!IsPositiveInteger($Options{numofsimilarmolecules})) {
    die "Error: The value specified, $Options{numofsimilarmolecules}, for option \"-n, --NumOfSimilarMolecules\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--OutDelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if (!(IsFloat($Options{percentsimilarmolecules}) && $Options{percentsimilarmolecules} > 0 && $Options{percentsimilarmolecules} <= 100)) {
    die "Error: The value specified, $Options{percentsimilarmolecules}, for option \"-p, --PercentSimilarMolecules\" is not valid. Allowed values: > 0 and <= 100 \n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{precision})) {
    die "Error: The value specified, $Options{precision}, for option \"--precision\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{referencecolmode} !~ /^(ColNum|ColLabel)$/i) {
    die "Error: The value specified, $Options{referencecolmode}, for option \"--ReferenceColMode\" is not valid. Allowed values: ColNum, or ColLabel\n";
  }
  if ($Options{referencecompoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{referencecompoundidmode}, for option \"--ReferenceCompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{searchmode} !~ /^(SimilaritySearch|DissimilaritySearch)$/i) {
    die "Error: The value specified, $Options{searchmode}, for option \"-s, --SearchMode\" is not valid. Allowed values: SimilaritySearch, DissimilaritySearch \n";
  }
  if ($Options{similarcountmode} !~ /^(NumOfSimilar|PercentSimilar)$/i) {
    die "Error: The value specified, $Options{similarcountmode}, for option \"--SimilarCountMode\" is not valid. Allowed values: NumOfSimilar, PercentSimilar \n";
  }
}
1581
1582 __END__
1583
1584 =head1 NAME
1585
1586 SimilaritySearchingFingerprints.pl - Perform similarity search using fingerprints strings data in SD, FP and CSV/TSV text file(s)
1587
1588 =head1 SYNOPSIS
1589
1590 SimilaritySearchingFingerprints.pl ReferenceFPFile DatabaseFPFile
1591
1592 SimilaritySearchingFingerprints.pl [B<--alpha> I<number>] [B<--beta> I<number>]
1593 [B<-b, --BitVectorComparisonMode> I<TanimotoSimilarity | TverskySimilarity | ...>]
1594 [B<--DatabaseColMode> I<ColNum | ColLabel>] [B<--DatabaseCompoundIDCol> I<col number | col name>]
1595 [B<--DatabaseCompoundIDPrefix> I<text>] [B<--DatabaseCompoundIDField> I<DataFieldName>]
1596 [B<--DatabaseCompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
1597 [B<--DatabaseDataCols> I<"DataColNum1, DataColNum2,... " | "DataColLabel1, DataColLabel2,... ">]
1598 [B<--DatabaseDataColsMode> I<All | Specify | CompoundID>] [B<--DatabaseDataFields> I<"FieldLabel1, FieldLabel2,... ">]
1599 [B<--DatabaseDataFieldsMode> I<All | Common | Specify | CompoundID>]
1600 [B<--DatabaseFingerprintsCol> I<col number | col name>] [B<--DatabaseFingerprintsField> I<FieldLabel>]
1601 [B<--DistanceCutoff> I<number>] [B<-d, --detail> I<InfoLevel>] [B<-f, --fast>]
1602 [B<--FingerprintsMode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>]
1603 [B<-g, --GroupFusionRule> I<Max, Mean, Median, Min, Sum, Euclidean>] [B<--GroupFusionApplyCutoff> I<Yes | No>]
1604 [B<-h, --help>] [B<--InDelim> I<comma | semicolon>] [B<-k, --KNN> I<all | number>]
1605 [B<-m, --mode> I<IndividualReference | MultipleReferences>]
1606 [B<-n, --NumOfSimilarMolecules> I<number>] [B<--OutDelim> I<comma | tab | semicolon>]
1607 [B<--output> I<SD | text | both>] [B<-o, --overwrite>]
1608 [B<-p, --PercentSimilarMolecules> I<number>] [B<--precision> I<number>] [B<-q, --quote> I<Yes | No>]
1609 [B<--ReferenceColMode> I<ColNum | ColLabel>] [B<--ReferenceCompoundIDCol> I<col number | col name>]
1610 [B<--ReferenceCompoundIDPrefix> I<text>] [B<--ReferenceCompoundIDField> I<DataFieldName>]
1611 [B<--ReferenceCompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
1612 [B<--ReferenceFingerprintsCol> I<col number | col name>] [B<--ReferenceFingerprintsField> I<FieldLabel>]
1613 [B<-r, --root> I<RootName>] [B<-s, --SearchMode> I<SimilaritySearch | DissimilaritySearch>]
1614 [B<--SimilarCountMode> I<NumOfSimilar | PercentSimilar>] [B<--SimilarityCutoff> I<number>]
1615 [B<-v, --VectorComparisonMode> I<TanimotoSimilarity | ... | ManhattanDistance | ...>]
1616 [B<--VectorComparisonFormulism> I<AlgebraicForm | BinaryForm | SetTheoreticForm>]
1617 [B<-w, --WorkingDir> dirname] ReferenceFingerprintsFile DatabaseFingerprintsFile
1618
1619 =head1 DESCRIPTION
1620
1621 Perform molecular similarity search [ Ref 94-113 ] using fingerprint bit-vector or vector strings
1622 data in I<SD, FP, or CSV/TSV text> files corresponding to I<ReferenceFingerprintsFile> and
1623 I<DatabaseFingerprintsFile>, and generate SD and CSV/TSV text file(s) containing database
1624 molecules which are similar to reference molecule(s). The reference molecules are also referred
1625 to as query or seed molecules and database molecules as target molecules in the literature.
1626
1627 The current release of MayaChemTools supports two types of similarity search modes:
1628 I<IndividualReference or MultipleReferences>. For default value of I<MultipleReferences> for B<-m, --mode>
1629 option, reference molecules are considered as a set and B<-g, --GroupFusionRule> is used to calculate
1630 similarity of a database molecule against reference molecules set. The group fusion rule is also
1631 referred to as data fusion or consensus scoring in the literature. However, for I<IndividualReference>
1632 value of B<-m, --mode> option, reference molecules are treated as individual molecules and each reference
1633 molecule is compared against a database molecule by itself to identify similar molecules.
1634
1635 The molecular dissimilarity search can also be performed using I<DissimilaritySearch> value for
1636 B<-s, --SearchMode> option. During dissimilarity search or usage of distance comparison coefficient
1637 in similarity search, the meaning of fingerprints comparison value is automatically reversed
1638 as shown below:
1639
1640 SearchMode ComparisonCoefficient ResultsSort ComparisonValues
1641
1642 Similarity SimilarityCoefficient Descending Higher value implies
1643 high similarity
1644 Similarity DistanceCoefficient Ascending Lower value implies
1645 high similarity
1646
1647 Dissimilarity SimilarityCoefficient Ascending Lower value implies
1648 high dissimilarity
1649 Dissimilarity DistanceCoefficient Descending Higher value implies
1650 high dissimilarity
1651
1652 During I<IndividualReference> value of B<-m, --Mode> option for similarity search, fingerprints bit-vector
1653 or vector string of each reference molecule is compared with database molecules using specified
1654 similarity or distance coefficients to identify most similar molecules for each reference molecule.
1655 Based on value of B<--SimilarCountMode>, up to B<-n, --NumOfSimilarMolecules> or B<-p,
1656 --PercentSimilarMolecules> at specified B<--SimilarityCutoff> or B<--DistanceCutoff> are
1657 identified for each reference molecule.
1658
1659 During I<MultipleReferences> value of B<-m, --mode> option for similarity search, all reference molecules
1660 are considered as a set and B<-g, --GroupFusionRule> is used to calculate similarity of a database
1661 molecule against reference molecules set either using all reference molecules or number of k-nearest
1662 neighbors (k-NN) to a database molecule specified using B<-k, --kNN>. The fingerprints bit-vector
1663 or vector string of each reference molecule in a set is compared with a database molecule using
1664 a similarity or distance coefficient specified via B<-b, --BitVectorComparisonMode> or B<-v,
1665 --VectorComparisonMode>. The reference molecules whose comparison values with a database
1666 molecule fall outside specified B<--SimilarityCutoff> or B<--DistanceCutoff> are ignored during I<Yes>
1667 value of B<--GroupFusionApplyCutoff>. The specified B<-g, --GroupFusionRule> is applied to
1668 B<-k, --kNN> reference molecules to calculate final similarity value between a database molecule
1669 and reference molecules set.
1670
1671 The input fingerprints I<SD, FP, or Text (CSV/TSV)> files for I<ReferenceFingerprintsFile> and
1672 I<DatabaseFingerprintsFile> must contain valid fingerprint bit-vector or vector strings data corresponding to
1673 same type of fingerprints.
1674
1675 The valid fingerprints I<SDFile> extensions are I<.sdf> and I<.sd>. The valid fingerprints I<FPFile>
1676 extensions are I<.fpf> and I<.fp>. The valid fingerprints I<TextFile (CSV/TSV)> extensions are
1677 I<.csv> and I<.tsv> for comma/semicolon and tab delimited text files respectively. The B<--indelim>
1678 option determines the format of I<TextFile>. Any file which doesn't correspond to the format indicated
1679 by B<--indelim> option is ignored.
1680
1681 Example of I<FP> file containing fingerprints bit-vector string data:
1682
1683 #
1684 # Package = MayaChemTools 7.4
1685 # ReleaseDate = Oct 21, 2010
1686 #
1687 # TimeStamp = Mon Mar 7 15:14:01 2011
1688 #
1689 # FingerprintsStringType = FingerprintsBitVector
1690 #
1691 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
1692 # Size = 1024
1693 # BitStringFormat = HexadecimalString
1694 # BitsOrder = Ascending
1695 #
1696 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
1697 Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
1698 ... ...
1699 ... ..
1700
1701 Example of I<FP> file containing fingerprints vector string data:
1702
1703 #
1704 # Package = MayaChemTools 7.4
1705 # ReleaseDate = Oct 21, 2010
1706 #
1707 # TimeStamp = Mon Mar 7 15:14:01 2011
1708 #
1709 # FingerprintsStringType = FingerprintsVector
1710 #
1711 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
1712 # VectorStringFormat = IDsAndValuesString
1713 # VectorValuesType = NumericalValues
1714 #
1715 Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C:
1716 N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...;
1717 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2
1718 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ...
1719 Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C
1720 O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...;
1721 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2
1722 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ...
1723 ... ...
1724 ... ...
1725
1726 Example of I<SD> file containing fingerprints bit-vector string data:
1727
1728 ... ...
1729 ... ...
1730 $$$$
1731 ... ...
1732 ... ...
1733 ... ...
1734 41 44 0 0 0 0 0 0 0 0999 V2000
1735 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1736 ... ...
1737 2 3 1 0 0 0 0
1738 ... ...
1739 M END
1740 > <CmpdID>
1741 Cmpd1
1742
1743 > <PathLengthFingerprints>
1744 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
1745 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
1746 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
1747 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
1748 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
1749 aa0660a11014a011d46
1750
1751 $$$$
1752 ... ...
1753 ... ...
1754
1755 Example of CSV I<TextFile> containing fingerprints bit-vector string data:
1756
1757 "CompoundID","PathLengthFingerprints"
1758 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
1759 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
1760 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
1761 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
1762 ... ...
1763 ... ...
1764
1765 The current release of MayaChemTools supports the following types of fingerprint
1766 bit-vector and vector strings:
1767
1768 FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi
1769 us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT
1770 C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X
1771 1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A
1772 TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2
1773 -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B...
1774
1775 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
1776 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
1777 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
1778 O.X1.BO2;2 4 14 3 10 1 1 1 3 2
1779
1780 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
1781 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
1782 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
1783
1784 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
1785 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
1786 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
1787 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
1788 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
1789 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
1790
1791 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
1792 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
1793 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
1794 .024 -2.270
1795
1796 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
1797 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435
1798 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
1799 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1800 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1801
1802 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
1803 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
1804 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
1805 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
1806 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
1807 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
1808
1809 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
1810 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
1811 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
1812 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
1813 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
1814 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
1815
1816 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
1817 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
1818 0000000001010000000110000011000000000000100000000000000000000000100001
1819 1000000110000000000000000000000000010011000000000000000000000000010000
1820 0000000000000000000000000010000000000000000001000000000000000000000000
1821 0000000000010000100001000000000000101000000000000000100000000000000...
1822
1823 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
1824 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
1825 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
1826 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
1827 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
1828 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
1829
1830 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
1831 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
1832 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
1833 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
1834 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
1835 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
1836
1837 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
1838 0000000000000000000000000000000001001000010010000000010010000000011100
1839 0100101010111100011011000100110110000011011110100110111111111111011111
1840 11111111111110111000
1841
1842 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
1843 1110011111100101111111000111101100110000000000000011100010000000000000
1844 0000000000000000000000000000000000000000000000101000000000000000000000
1845 0000000000000000000000000000000000000000000000000000000000000000000000
1846 0000000000000000000000000000000000000011000000000000000000000000000000
1847 0000000000000000000000000000000000000000
1848
1849 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
1850 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1851 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
1852 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
1853 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
1854 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
1855
1856 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
1857 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
1858 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
1859 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1860 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
1861 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1862
1863 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1864 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
1865 0100010101011000101001011100110001000010001001101000001001001001001000
1866 0010110100000111001001000001001010100100100000000011000000101001011100
1867 0010000001000101010100000100111100110111011011011000000010110111001101
1868 0101100011000000010001000011000010100011101100001000001000100000000...
1869
1870 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
1871 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
1872 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
1873 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
1874 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
1875 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
1876
1877 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
1878 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
1879 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
1880 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
1881 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
1882 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
1883
1884 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
1885 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
1886 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
1887 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
1888 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
1889 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
1890
1891 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
1892 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar
1893 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H
1894 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...;
1895 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
1896 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
1897
1898 FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3
1899 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-
1900 C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO
1901 2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...;
1902 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1
1903
1904 FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica
1905 lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC
1906 H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC-
1907 ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...;
1908 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
1909
1910 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
1911 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
1912 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
1913 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
1914 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
1915 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
1916 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
1917
1918 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
1919 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C
1920 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3-
1921 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2
1922 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C.
1923 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7...
1924
1925 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
1926 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
1927 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
1928 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
1929 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
1930 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
1931 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
1932
1933 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
1934 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
1935 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
1936 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
1937 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
1938 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
1939
1940 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize:
1941 MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1-
1942 Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1
1943 -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1-
1944 HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...;
1945 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23
1946 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1
1947 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ...
1948
1949 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD
1950 istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106
1951 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0
1952 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26
1953 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0
1954 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ...
1955
1956 =head1 OPTIONS
1957
1958 =over 4
1959
1960 =item B<--alpha> I<number>
1961
1962 Value of alpha parameter for calculating I<Tversky> similarity coefficient specified for
1963 B<-b, --BitVectorComparisonMode> option. It corresponds to weights assigned for bits set
1964 to "1" in a pair of fingerprint bit-vectors during the calculation of similarity coefficient. Possible
1965 values: I<0 to 1>. Default value: I<0.5>.
1966
1967 =item B<--beta> I<number>
1968
1969 Value of beta parameter for calculating I<WeightedTanimoto> and I<WeightedTversky>
1970 similarity coefficients specified for B<-b, --BitVectorComparisonMode> option. It is used to
1971 weight the contributions of bits set to "0" during the calculation of similarity coefficients. Possible
1972 values: I<0 to 1>. Default value of I<1> makes I<WeightedTanimoto> and I<WeightedTversky>
1973 equivalent to I<Tanimoto> and I<Tversky>.
1974
1975 =item B<-b, --BitVectorComparisonMode> I<TanimotoSimilarity | TverskySimilarity | ...>
1976
1977 Specify what similarity coefficient to use for calculating similarity between fingerprints bit-vector
1978 string data values in I<ReferenceFingerprintsFile> and I<DatabaseFingerprintsFile> during similarity
1979 search. Possible values: I<TanimotoSimilarity | TverskySimilarity | ...>. Default: I<TanimotoSimilarity>
1980
1981 The current release supports the following similarity coefficients: I<BaroniUrbaniSimilarity, BuserSimilarity,
1982 CosineSimilarity, DiceSimilarity, DennisSimilarity, ForbesSimilarity, FossumSimilarity, HamannSimilarity, JacardSimilarity,
1983 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity, McConnaugheySimilarity, OchiaiSimilarity,
1984 PearsonSimilarity, RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity, SkoalSneath1Similarity,
1985 SkoalSneath2Similarity, SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity, YuleSimilarity,
1986 WeightedTanimotoSimilarity, WeightedTverskySimilarity>. These similarity coefficients are described below.
1987
1988 For two fingerprint bit-vectors A and B of same size, let:
1989
1990 Na = Number of bits set to "1" in A
1991 Nb = Number of bits set to "1" in B
1992 Nc = Number of bits set to "1" in both A and B
1993 Nd = Number of bits set to "0" in both A and B
1994
1995 Nt = Number of bits set to "1" or "0" in A or B (Size of A or B)
1996 Nt = Na + Nb - Nc + Nd
1997
1998 Na - Nc = Number of bits set to "1" in A but not in B
1999 Nb - Nc = Number of bits set to "1" in B but not in A
2000
2001 Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of bit-vectors A and B are
2002 defined as follows:
2003
2004 I<BaroniUrbaniSimilarity>: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser )
2005
2006 I<BuserSimilarity>: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani )
2007
2008 I<CosineSimilarity>: Nc / SQRT ( Na * Nb ) (same as Ochiai)
2009
2010 I<DiceSimilarity>: (2 * Nc) / ( Na + Nb )
2011
2012 I<DennisSimilarity>: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Nt * Na * Nb)
2013
2014 I<ForbesSimilarity>: ( Nt * Nc ) / ( Na * Nb )
2015
2016 I<FossumSimilarity>: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb )
2017
2018 I<HamannSimilarity>: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt
2019
2020 I<JaccardSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Tanimoto)
2021
2022 I<Kulczynski1Similarity>: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / ( Na + Nb - 2Nc )
2023
2024 I<Kulczynski2Similarity>: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + ( Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb )
2025
2026 I<MatchingSimilarity>: ( Nc + Nd ) / Nt
2027
2028 I<McConnaugheySimilarity>: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / ( Na * Nb )
2029
2030 I<OchiaiSimilarity>: Nc / SQRT ( Na * Nb ) (same as Cosine)
2031
2032 I<PearsonSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) )
2033
2034 I<RogersTanimotoSimilarity>: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt)
2035
2036 I<RussellRaoSimilarity>: Nc / Nt
2037
2038 I<SimpsonSimilarity>: Nc / MIN ( Na, Nb)
2039
2040 I<SkoalSneath1Similarity>: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb - Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc )
2041
2042 I<SkoalSneath2Similarity>: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
2043
2044 I<SkoalSneath3Similarity>: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc ) ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc )
2045
2046 I<TanimotoSimilarity>: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc ) (same as Jaccard)
2047
2048 I<TverskySimilarity>: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * ( Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb)
2049
2050 I<YuleSimilarity>: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) )
2051
2052 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which
2053 are set to "1" in both A and B. In order to take into account all bit positions, modified versions
2054 of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
2055
2056 Let:
2057
2058 Na' = Number of bits set to "0" in A
2059 Nb' = Number of bits set to "0" in B
2060 Nc' = Number of bits set to "0" in both A and B
2061
2062 Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / ( Na' + Nb' - Nc' )
2063
2064 Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb')
2065
2066 Then:
2067
2068 I<WeightedTanimotoSimilarity> = beta * Tanimoto + (1 - beta) * Tanimoto'
2069
2070 I<WeightedTverskySimilarity> = beta * Tversky + (1 - beta) * Tversky'
2071
2072 =item B<--DatabaseColMode> I<ColNum | ColLabel>
2073
2074 Specify how columns are identified in database fingerprints I<TextFile>: using column
2075 number or column label. Possible values: I<ColNum or ColLabel>. Default value: I<ColNum>.
2076
2077 =item B<--DatabaseCompoundIDCol> I<col number | col name>
2078
2079 This value is B<--DatabaseColMode> mode specific. It specifies column to use for retrieving compound
2080 ID from database fingerprints I<TextFile> during similarity and dissimilarity search for output SD and
2081 CSV/TSV text files. Possible values: I<col number or col label>. Default value: I<first column containing
2082 the word compoundID in its column label or sequentially generated IDs>.
2083
2084 This is only used for I<CompoundID> value of B<--DatabaseDataColsMode> option.
2085
2086 =item B<--DatabaseCompoundIDPrefix> I<text>
2087
2088 Specify compound ID prefix to use during sequential generation of compound IDs for database fingerprints
2089 I<SDFile> and I<TextFile>. Default value: I<Cmpd>. The default value generates compound IDs which look
2090 like Cmpd<Number>.
2091
2092 For database fingerprints I<SDFile>, this value is only used during I<LabelPrefix | MolNameOrLabelPrefix>
2093 values of B<--DatabaseCompoundIDMode> option; otherwise, it's ignored.
2094
2095 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--DatabaseCompoundIDMode>:
2096
2097 Compound
2098
2099 The value specified above generates compound IDs which correspond to Compound<Number>
2100 instead of default value of Cmpd<Number>.
2101
2102 =item B<--DatabaseCompoundIDField> I<DataFieldName>
2103
2104 Specify database fingerprints I<SDFile> datafield label for generating compound IDs. This value is
2105 only used during I<DataField> value of B<--DatabaseCompoundIDMode> option.
2106
2107 Examples for I<DataField> value of B<--DatabaseCompoundIDMode>:
2108
2109 MolID
2110 ExtReg
2111
2112 =item B<--DatabaseCompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
2113
2114 Specify how to generate compound IDs from database fingerprints I<SDFile> during similarity and
2115 dissimilarity search for output SD and CSV/TSV text files: use a I<SDFile> datafield value; use
2116 molname line from I<SDFile>; generate a sequential ID with specific prefix; use combination of both
2117 MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
2118
2119 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
2120 Default: I<LabelPrefix>.
2121
2122 For I<MolNameOrLabelPrefix> value of B<--DatabaseCompoundIDMode>, molname line in I<SDFile> takes
2123 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
2124 values are replaced with sequential compound IDs.
2125
2126 This is only used for I<CompoundID> value of B<--DatabaseDataFieldsMode> option.
2127
2128 =item B<--DatabaseDataCols> I<"DataColNum1,DataColNum2,... " | "DataColLabel1,DataColLabel2,... ">
2129
2130 This value is B<--DatabaseColMode> mode specific. It is a comma delimited list of database fingerprints
2131 I<TextFile> data column numbers or labels to extract and write to SD and CSV/TSV text files along with
2132 other information for I<SD | text | both> values of B<--output> option.
2133
2134 This is only used for I<Specify> value of B<--DatabaseDataColsMode> option.
2135
2136 Examples:
2137
2138 1,2,3
2139 CompoundName,MolWt
2140
2141 =item B<--DatabaseDataColsMode> I<All | Specify | CompoundID>
2142
2143 Specify how data columns from database fingerprints I<TextFile> are transferred to output SD and
2144 CSV/TSV text files along with other information for I<SD | text | both> values of B<--output> option:
2145 transfer all data columns; extract specified data columns; generate a compound ID using the
2146 compound ID column or a compound prefix. Possible values: I<All | Specify | CompoundID>. Default value: I<CompoundID>.
2147
2148 =item B<--DatabaseDataFields> I<"FieldLabel1,FieldLabel2,... ">
2149
2150 Comma delimited list of database fingerprints I<SDFile> data fields to extract and write to SD
2151 and CSV/TSV text files along with other information for I<SD | text | both> values of
2152 B<--output> option.
2153
2154 This is only used for I<Specify> value of B<--DatabaseDataFieldsMode> option.
2155
2156 Examples:
2157
2158 Extreg
2159 MolID,CompoundName
2160
2161 =item B<--DatabaseDataFieldsMode> I<All | Common | Specify | CompoundID>
2162
2163 Specify how data fields from database fingerprints I<SDFile> are transferred to output SD and
2164 CSV/TSV text files along with other information for I<SD | text | both> values of B<--output>
2165 option: transfer all SD data fields; transfer SD data fields common to all compounds; extract
2166 specified data fields; generate a compound ID using molname line, a compound prefix, or a
2167 combination of both. Possible values: I<All | Common | Specify | CompoundID>. Default value:
2168 I<CompoundID>.
2169
2170 =item B<--DatabaseFingerprintsCol> I<col number | col name>
2171
2172 This value is B<--DatabaseColMode> specific. It specifies fingerprints column to use during similarity
2173 and dissimilarity search for database fingerprints I<TextFile>. Possible values: I<col number or col label>.
2174 Default value: I<first column containing the word Fingerprints in its column label>.
2175
2176 =item B<--DatabaseFingerprintsField> I<FieldLabel>
2177
2178 Fingerprints field label to use during similarity and dissimilarity search for database fingerprints I<SDFile>.
2179 Default value: I<first data field label containing the word Fingerprints in its label>
2180
2181 =item B<--DistanceCutoff> I<number>
2182
2183 Distance cutoff value to use during comparison of distance value between a pair of database
2184 and reference molecule calculated by distance comparison methods for fingerprints vector
2185 string data values. Possible values: I<Any valid number>. Default value: I<10>.
2186
2187 The comparison value between a pair of database and reference molecule must meet the cutoff
2188 criterion as shown below:
2189
2190 SearchMode CutoffCriterion ComparisonValues
2191
2192 Similarity <= Lower value implies high similarity
2193 Dissimilarity >= Higher value implies high dissimilarity
2194
2195 This option is only used during distance coefficients values of B<-v, --VectorComparisonMode>
2196 option.
2197
2198 This option is ignored during I<No> value of B<--GroupFusionApplyCutoff> for I<MultipleReferences>
2199 B<-m, --mode>.
2200
2201 =item B<-d, --detail> I<InfoLevel>
2202
2203 Level of information to print about lines being ignored. Default: I<1>. Possible values:
2204 I<1, 2 or 3>.
2205
2206 =item B<-f, --fast>
2207
2208 In this mode, fingerprints columns specified using B<--FingerprintsCol> for reference and database
2209 fingerprints I<TextFile(s)>, and B<--FingerprintsField> for reference and database fingerprints I<SDFile(s)>
2210 are assumed to contain valid fingerprints data and no checking is performed before performing similarity
2211 and dissimilarity search. By default, fingerprints data is validated before computing pairwise similarity and
2212 distance coefficients.
2213
2214 =item B<--FingerprintsMode> I<AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString>
2215
2216 Format of fingerprint strings data in reference and database fingerprints I<SD, FP, or Text (CSV/TSV)>
2217 files: automatically detect format of fingerprints string created by MayaChemTools fingerprints
2218 generation scripts or explicitly specify its format. Possible values: I<AutoDetect | FingerprintsBitVectorString |
2219 FingerprintsVectorString>. Default value: I<AutoDetect>.
2220
2221 =item B<-g, --GroupFusionRule> I<Max, Min, Mean, Median, Sum, Euclidean>
2222
2223 Specify what group fusion [ Ref 94-97, Ref 100, Ref 105 ] rule to use for calculating similarity of
2224 a database molecule against a set of reference molecules during I<MultipleReferences> value of
2225 similarity search B<-m, --mode>. Possible values: I<Max, Min, Mean, Median, Sum, Euclidean>. Default
2226 value: I<Max>. I<Mean> value corresponds to average or arithmetic mean. The group fusion rule is
2227 also referred to as data fusion or consensus scoring in the literature.
2228
2229 For a reference molecules set and a database molecule, let:
2230
2231 N = Number of reference molecules in a set
2232
2233 i = ith reference molecule in a set
2234 n = Nth reference molecule in a set
2235
2236 d = dth database molecule
2237
2238 Cid = Fingerprints comparison value between ith reference and dth database
2239 molecule - similarity/dissimilarity comparison using similarity or
2240 distance coefficient
2241
2242 Then, various group fusion rules to calculate fused similarity between a database molecule and
2243 reference molecules set are defined as follows:
2244
2245 B<Max>: MAX ( C1d, C2d, ..., Cid, ..., Cnd )
2246
2247 B<Min>: MIN ( C1d, C2d, ..., Cid, ..., Cnd )
2248
2249 B<Mean>: SUM ( C1d, C2d, ..., Cid, ..., Cnd ) / N
2250
2251 B<Median>: MEDIAN ( C1d, C2d, ..., Cid, ..., Cnd )
2252
2253 B<Sum>: SUM ( C1d, C2d, ..., Cid, ..., Cnd )
2254
2255 B<Euclidean>: SQRT( SUM( C1d ** 2, C2d ** 2, ..., Cid ** 2, ..., Cnd ** 2) )
2256
2257 The fingerprints bit-vector or vector string of each reference molecule in a set is compared
2258 with a database molecule using a similarity or distance coefficient specified via B<-b,
2259 --BitVectorComparisonMode> or B<-v, --VectorComparisonMode>. The reference molecules
2260 whose comparison values with a database molecule fall outside specified B<--SimilarityCutoff>
2261 or B<--DistanceCutoff> are ignored during I<Yes> value of B<--GroupFusionApplyCutoff>. The
2262 specified B<-g, --GroupFusionRule> is applied to B<-k, --kNN> reference molecules to calculate
2263 final fused similarity value between a database molecule and reference molecules set.
2264
2265 During dissimilarity search or usage of distance comparison coefficient in similarity search,
2266 the meaning of fingerprints comparison value is automatically reversed as shown below:
2267
2268 SearchMode ComparisonCoefficient ComparisonValues
2269
2270 Similarity SimilarityCoefficient Higher value implies high similarity
2271 Similarity DistanceCoefficient Lower value implies high similarity
2272
2273 Dissimilarity SimilarityCoefficient Lower value implies high
2274 dissimilarity
2275 Dissimilarity DistanceCoefficient Higher value implies high
2276 dissimilarity
2277
2278 Consequently, I<Max> implies highest and lowest comparison value for usage of similarity and
2279 distance coefficient respectively during similarity search. And it corresponds to lowest and highest
2280 comparison value for usage of similarity and distance coefficient respectively during dissimilarity
2281 search. During I<Min> fusion rule, the highest and lowest comparison values are appropriately
2282 reversed.
2283
2284 =item B<--GroupFusionApplyCutoff> I<Yes | No>
2285
2286 Specify whether to apply B<--SimilarityCutoff> or B<--DistanceCutoff> values during application
2287 of B<-g, --GroupFusionRule> to reference molecules set. Possible values: I<Yes or No>. Default
2288 value: I<Yes>.
2289
2290 During I<Yes> value of B<--GroupFusionApplyCutoff>, the reference molecules whose comparison
2291 values with a database molecule fall outside specified B<--SimilarityCutoff> or B<--DistanceCutoff>
2292 are not used to calculate final fused similarity value between a database molecule and reference
2293 molecules set.
2294
2295 =item B<-h, --help>
2296
2297 Print this help message.
2298
2299 =item B<--InDelim> I<comma | semicolon>
2300
2301 Input delimiter for reference and database fingerprints CSV I<TextFile(s)>. Possible values:
2302 I<comma or semicolon>. Default value: I<comma>. For TSV files, this option is ignored
2303 and I<tab> is used as a delimiter.
2304
2305 =item B<-k, --kNN> I<all | number>
2306
2307 Number of k-nearest neighbors (k-NN) reference molecules to use during B<-g, --GroupFusionRule>
2308 for calculating similarity of a database molecule against a set of reference molecules. Possible values:
2309 I<all | positive integers>. Default: I<all>.
2310
2311 After ranking similarity values between a database molecule and reference molecules during
2312 I<MultipleReferences> value of similarity search B<-m, --mode> option, the top B<-k, --kNN> reference
2313 molecules are selected and used during B<-g, --GroupFusionRule>.
2314
2315 This option is B<-s, --SearchMode> dependent: It corresponds to dissimilar molecules during
2316 I<DissimilaritySearch> value of B<-s, --SearchMode> option.
2317
2318 =item B<-m, --mode> I<IndividualReference | MultipleReferences>
2319
2320 Specify how to treat reference molecules in I<ReferenceFingerprintsFile> during similarity search:
2321 Treat each reference molecule individually during similarity search or perform similarity
2322 search by treating multiple reference molecules as a set. Possible values: I<IndividualReference
2323 | MultipleReferences>. Default value: I<MultipleReferences>.
2324
2325 During I<IndividualReference> value of B<-m, --Mode> for similarity search, fingerprints bit-vector
2326 or vector string of each reference molecule is compared with database molecules using specified
2327 similarity or distance coefficients to identify most similar molecules for each reference molecule.
2328 Based on value of B<--SimilarCountMode>, up to B<-n, --NumOfSimilarMolecules> or B<-p,
2329 --PercentSimilarMolecules> at specified B<--SimilarityCutoff> or B<--DistanceCutoff> are
2330 identified for each reference molecule.
2331
2332 During I<MultipleReferences> value of B<-m, --mode> for similarity search, all reference molecules
2333 are considered as a set and B<-g, --GroupFusionRule> is used to calculate similarity of a database
2334 molecule against reference molecules set either using all reference molecules or number of k-nearest
2335 neighbors (k-NN) to a database molecule specified using B<-k, --kNN>. The fingerprints bit-vector
2336 or vector string of each reference molecule in a set is compared with a database molecule using
2337 a similarity or distance coefficient specified via B<-b, --BitVectorComparisonMode> or B<-v,
2338 --VectorComparisonMode>. The reference molecules whose comparison values with a database
2339 molecule fall outside specified B<--SimilarityCutoff> or B<--DistanceCutoff> are ignored. The
2340 specified B<-g, --GroupFusionRule> is applied to rest of B<-k, --kNN> reference molecules to calculate
2341 final similarity value between a database molecule and reference molecules set.
2342
2343 The meaning of similarity and distance is automatically reversed during I<DissimilaritySearch> value
2344 of B<-s, --SearchMode> along with appropriate handling of B<--SimilarityCutoff> or
2345 B<--DistanceCutoff> values.
2346
2347 =item B<-n, --NumOfSimilarMolecules> I<number>
2348
2349 Maximum number of most similar database molecules to find for each reference molecule or set of
2350 reference molecules based on I<IndividualReference> or I<MultipleReferences> value of similarity
2351 search B<-m, --mode> option. Default: I<10>. Valid values: positive integers.
2352
2353 This option is ignored during I<PercentSimilar> value of B<--SimilarCountMode> option.
2354
2355 This option is B<-s, --SearchMode> dependent: It corresponds to dissimilar molecules during
2356 I<DissimilaritySearch> value of B<-s, --SearchMode> option.
2357
2358 =item B<--OutDelim> I<comma | tab | semicolon>
2359
2360 Delimiter for output CSV/TSV text file. Possible values: I<comma, tab, or semicolon>.
2361 Default value: I<comma>.
2362
2363 =item B<--output> I<SD | text | both>
2364
2365 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<text>.
2366
2367 =item B<-o, --overwrite>
2368
2369 Overwrite existing files
2370
2371 =item B<-p, --PercentSimilarMolecules> I<number>
2372
2373 Maximum percent of most similar database molecules to find for each reference molecule or set of
2374 reference molecules based on I<IndividualReference> or I<MultipleReferences> value of similarity
2375 search B<-m, --mode> option. Default: I<1> percent of database molecules. Valid values: non-zero values
2376 in between I<0 to 100>.
2377
2378 This option is ignored during I<NumOfSimilar> value of B<--SimilarCountMode> option.
2379
2380 During I<PercentSimilar> value of B<--SimilarCountMode> option, the number of molecules
2381 in I<DatabaseFingerprintsFile> is counted and number of similar molecules correspond to
2382 B<--PercentSimilarMolecules> of the total number of database molecules.
2383
2384 This option is B<-s, --SearchMode> dependent: It corresponds to dissimilar molecules during
2385 I<DissimilaritySearch> value of B<-s, --SearchMode> option.
2386
2387 =item B<--precision> I<number>
2388
2389 Precision of calculated similarity values for comparison and generating output files. Default: up to I<2>
2390 decimal places. Valid values: positive integers.
2391
2392 =item B<-q, --quote> I<Yes | No>
2393
2394 Put quote around column values in output CSV/TSV text file. Possible values:
2395 I<Yes or No>. Default value: I<Yes>.
2396
2397 =item B<--ReferenceColMode> I<ColNum | ColLabel>
2398
2399 Specify how columns are identified in reference fingerprints I<TextFile>: using column
2400 number or column label. Possible values: I<ColNum or ColLabel>. Default value: I<ColNum>.
2401
2402 =item B<--ReferenceCompoundIDCol> I<col number | col name>
2403
2404 This value is B<--ReferenceColMode> mode specific. It specifies column to use for retrieving compound
2405 ID from reference fingerprints I<TextFile> during similarity and dissimilarity search for output SD and CSV/TSV
2406 text files. Possible values: I<col number or col label>. Default value: I<first column containing the word compoundID
2407 in its column label or sequentially generated IDs>.
2408
2409 =item B<--ReferenceCompoundIDPrefix> I<text>
2410
2411 Specify compound ID prefix to use during sequential generation of compound IDs for reference fingerprints
2412 I<SDFile> and I<TextFile>. Default value: I<Cmpd>. The default value generates compound IDs which look
2413 like Cmpd<Number>.
2414
2415 For reference fingerprints I<SDFile>, this value is only used during I<LabelPrefix | MolNameOrLabelPrefix>
2416 values of B<--ReferenceCompoundIDMode> option; otherwise, it's ignored.
2417
2418 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--ReferenceCompoundIDMode>:
2419
2420 Compound
2421
2422 The value specified above generates compound IDs which correspond to Compound<Number>
2423 instead of default value of Cmpd<Number>.
2424
2425 =item B<--ReferenceCompoundIDField> I<DataFieldName>
2426
2427 Specify reference fingerprints I<SDFile> datafield label for generating compound IDs.
2428 This value is only used during I<DataField> value of B<--ReferenceCompoundIDMode> option.
2429
2430 Examples for I<DataField> value of B<--ReferenceCompoundIDMode>:
2431
2432 MolID
2433 ExtReg
2434
2435 =item B<--ReferenceCompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
2436
2437 Specify how to generate compound IDs from reference fingerprints I<SDFile> during similarity and
2438 dissimilarity search for output SD and CSV/TSV text files: use a I<SDFile> datafield value; use
2439 molname line from I<SDFile>; generate a sequential ID with specific prefix; use combination of both
2440 MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
2441
2442 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
2443 Default: I<LabelPrefix>.
2444
2445 For I<MolNameOrLabelPrefix> value of B<--ReferenceCompoundIDMode>, molname line in I<SDFile>
2446 takes precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
2447 values are replaced with sequential compound IDs.
2448
2449 =item B<--ReferenceFingerprintsCol> I<col number | col name>
2450
2451 This value is B<--ReferenceColMode> specific. It specifies fingerprints column to use during similarity
2452 and dissimilarity search for reference fingerprints I<TextFile>. Possible values: I<col number or col label>.
2453 Default value: I<first column containing the word Fingerprints in its column label>.
2454
2455 =item B<--ReferenceFingerprintsField> I<FieldLabel>
2456
2457 Fingerprints field label to use during similarity and dissimilarity search for reference fingerprints I<SDFile>.
2458 Default value: I<first data field label containing the word Fingerprints in its label>
2459
2460 =item B<-r, --root> I<RootName>
2461
2462 New file name is generated using the root: <Root>.<Ext>. Default for new file name:
2463 <ReferenceFileName>SimilaritySearching.<Ext>. The output file type determines <Ext>
2464 value. The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab delimited
2465 text files respectively.
2466
2467 =item B<-s, --SearchMode> I<SimilaritySearch | DissimilaritySearch>
2468
2469 Specify how to find molecules from database molecules for individual reference molecules or
2470 set of reference molecules: Find similar molecules or dissimilar molecules from database molecules.
2471 Possible values: I<SimilaritySearch | DissimilaritySearch>. Default value: I<SimilaritySearch>.
2472
2473 During I<DissimilaritySearch> value of B<-s, --SearchMode> option, the meaning of the following
2474 options is switched and they correspond to dissimilar molecules instead of similar molecules:
2475 B<--SimilarCountMode>, B<-n, --NumOfSimilarMolecules>, B<--PercentSimilarMolecules>,
2476 B<-k, --kNN>.
2477
2478 =item B<--SimilarCountMode> I<NumOfSimilar | PercentSimilar>
2479
2480 Specify method used to count similar molecules found from database molecules for individual
2481 reference molecules or set of reference molecules: Find number of similar molecules or percent
2482 of similar molecules from database molecules. Possible values: I<NumOfSimilar | PercentSimilar>.
2483 Default value: I<NumOfSimilar>.
2484
2485 The values for number of similar molecules and percent similar molecules are specified
2486 using options B<-n, --NumOfSimilarMolecules> and B<-p, --PercentSimilarMolecules>.
2487
2488 This option is B<-s, --SearchMode> dependent: It corresponds to dissimilar molecules during
2489 I<DissimilaritySearch> value of B<-s, --SearchMode> option.
2490
2491 =item B<--SimilarityCutoff> I<number>
2492
2493 Similarity cutoff value to use during comparison of similarity value between a pair of database
2494 and reference molecules calculated by similarity comparison methods for fingerprints bit-vector
2495 or vector strings data values. Possible values: I<Any valid number>. Default value: I<0.75>.
2496
2497 The comparison value between a pair of database and reference molecule must meet the cutoff
2498 criterion as shown below:
2499
2500 SearchMode CutoffCriterion ComparisonValues
2501
2502 Similarity >= Higher value implies high similarity
2503 Dissimilarity <= Lower value implies high dissimilarity
2504
2505 This option is ignored during I<No> value of B<--GroupFusionApplyCutoff> for I<MultipleReferences>
2506 B<-m, --mode>.
2507
2508 This option is B<-s, --SearchMode> dependent: It corresponds to dissimilar molecules during
2509 I<DissimilaritySearch> value of B<-s, --SearchMode> option.
2510
2511 =item B<-v, --VectorComparisonMode> I<SupportedSimilarityName | SupportedDistanceName>
2512
2513 Specify what similarity or distance coefficient to use for calculating similarity between fingerprint
2514 vector strings data values in I<ReferenceFingerprintsFile> and I<DatabaseFingerprintsFile> during
2515 similarity search. Possible values: I<TanimotoSimilarity | ... | ManhattanDistance | ...>. Default
2516 value: I<TanimotoSimilarity>.
2517
2518 The value of B<-v, --VectorComparisonMode>, in conjunction with B<--VectorComparisonFormulism>,
2519 decides which type of similarity and distance coefficient formulism gets used.
2520
2521 The current release supports the following similarity and distance coefficients: I<CosineSimilarity,
2522 CzekanowskiSimilarity, DiceSimilarity, OchiaiSimilarity, JaccardSimilarity, SorensonSimilarity, TanimotoSimilarity,
2523 CityBlockDistance, EuclideanDistance, HammingDistance, ManhattanDistance, SoergelDistance>. These
2524 similarity and distance coefficients are described below.
2525
2526 B<FingerprintsVector.pm> module, used to calculate similarity and distance coefficients,
2527 provides support to perform comparison between vectors containing three different types of
2528 values:
2529
2530 Type I: OrderedNumericalValues
2531
2532 . Size of two vectors are same
2533 . Vectors contain real values in a specific order. For example: MACCS keys
2534 count, Topological pharmacophore atom pairs and so on.
2535
2536 Type II: UnorderedNumericalValues
2537
2538 . Size of two vectors might not be same
2539 . Vectors contain unordered real value identified by value IDs. For example:
2540 Topological atom pairs, Topological atom torsions and so on
2541
2542 Type III: AlphaNumericalValues
2543
2544 . Size of two vectors might not be same
2545 . Vectors contain unordered alphanumerical values. For example: Extended
2546 connectivity fingerprints, atom neighborhood fingerprints.
2547
2548 Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues
2549 or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues
2550 using value IDs for UnorderedNumericalValues and values themselves for AlphaNumericalValues.
2551
2552 Three forms of similarity and distance calculation between two vectors, specified using B<--VectorComparisonFormulism>
2553 option, are supported: I<AlgebraicForm, BinaryForm or SetTheoreticForm>.
2554
2555 For I<BinaryForm>, the ordered list of processed final vector values containing the value or
2556 count of each unique value type is simply converted into a binary vector containing 1s and 0s
2557 corresponding to presence or absence of values before calculating similarity or distance between
2558 two vectors.
2559
2560 For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let:
2561
2562 N = Number of values in A or B
2563
2564 Xa = Values of vector A
2565 Xb = Values of vector B
2566
2567 Xai = Value of ith element in A
2568 Xbi = Value of ith element in B
2569
2570 SUM = Sum of i over N values
2571
2572 For SetTheoreticForm of calculation between two vectors, let:
2573
2574 SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
2575 SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) )
2576
2577 For BinaryForm of calculation between two vectors, let:
2578
2579 Na = Number of bits set to "1" in A = SUM ( Xai )
2580 Nb = Number of bits set to "1" in B = SUM ( Xbi )
2581 Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
2582 Nd = Number of bits set to "0" in both A and B
2583 = SUM ( 1 - Xai - Xbi + Xai * Xbi)
2584
2585 N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
2586
2587 Additionally, for BinaryForm various values also correspond to:
2588
2589 Na = | Xa |
2590 Nb = | Xb |
2591 Nc = | SetIntersectionXaXb |
2592 Nd = N - | SetDifferenceXaXb |
2593
2594 | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc
2595 = | Xa | + | Xb | - | SetIntersectionXaXb |
2596
2597 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B
2598 in I<AlgebraicForm, BinaryForm and SetTheoreticForm> are defined as follows:
2599
2600 B<CityBlockDistance>: ( same as HammingDistance and ManhattanDistance)
2601
2602 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
2603
2604 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
2605
2606 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
2607
2608 B<CosineSimilarity>: ( same as OchiaiSimilarity)
2609
2610 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
2611
2612 I<BinaryForm>: Nc / SQRT ( Na * Nb)
2613
2614 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
2615
2616 B<CzekanowskiSimilarity>: ( same as DiceSimilarity and SorensonSimilarity)
2617
2618 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
2619
2620 I<BinaryForm>: 2 * Nc / ( Na + Nb )
2621
2622 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
2623
2624 B<DiceSimilarity>: ( same as CzekanowskiSimilarity and SorensonSimilarity)
2625
2626 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
2627
2628 I<BinaryForm>: 2 * Nc / ( Na + Nb )
2629
2630 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
2631
2632 B<EuclideanDistance>:
2633
2634 I<AlgebraicForm>: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) )
2635
2636 I<BinaryForm>: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb - 2 * Nc )
2637
2638 I<SetTheoreticForm>: SQRT ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) )
2639
2640 B<HammingDistance>: ( same as CityBlockDistance and ManhattanDistance)
2641
2642 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
2643
2644 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
2645
2646 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
2647
2648 B<JaccardSimilarity>: ( same as TanimotoSimilarity)
2649
2650 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
2651
2652 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
2653
2654 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
2655
2656 B<ManhattanDistance>: ( same as CityBlockDistance and HammingDistance)
2657
2658 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) )
2659
2660 I<BinaryForm>: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
2661
2662 I<SetTheoreticForm>: | SetDifferenceXaXb | - | SetIntersectionXaXb | = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
2663
2664 B<OchiaiSimilarity>: ( same as CosineSimilarity)
2665
2666 I<AlgebraicForm>: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM ( Xbi ** 2) )
2667
2668 I<BinaryForm>: Nc / SQRT ( Na * Nb)
2669
2670 I<SetTheoreticForm>: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) = SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
2671
2672 B<SorensonSimilarity>: ( same as CzekanowskiSimilarity and DiceSimilarity)
2673
2674 I<AlgebraicForm>: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) + SUM ( Xbi **2 ) )
2675
2676 I<BinaryForm>: 2 * Nc / ( Na + Nb )
2677
2678 I<SetTheoreticForm>: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
2679
2680 B<SoergelDistance>:
2681
2682 I<AlgebraicForm>: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi ) )
2683
2684 I<BinaryForm>: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / ( Na + Nb - Nc )
2685
2686 I<SetTheoreticForm>: ( | SetDifferenceXaXb | - | SetIntersectionXaXb | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
2687
2688 B<TanimotoSimilarity>: ( same as JaccardSimilarity)
2689
2690 I<AlgebraicForm>: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi ** 2 ) - SUM ( Xai * Xbi ) )
2691
2692 I<BinaryForm>: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na + Nb - Nc )
2693
2694 I<SetTheoreticForm>: | SetIntersectionXaXb | / | SetDifferenceXaXb | = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) ) )
2695
2696 =item B<--VectorComparisonFormulism> I<AlgebraicForm | BinaryForm | SetTheoreticForm>
2697
2698 Specify fingerprints vector comparison formulism to use for calculating similarity and distance
2699 coefficients during B<-v, --VectorComparisonMode>. Possible values: I<AlgebraicForm | BinaryForm |
2700 SetTheoreticForm>. Default value: I<AlgebraicForm>.
2701
2702 For fingerprint vector strings containing B<AlphaNumericalValues> data values - B<ExtendedConnectivityFingerprints>,
2703 B<AtomNeighborhoodsFingerprints> and so on - all three formulisms result in the same value during similarity and distance
2704 calculations.
2705
2706 =item B<-w, --WorkingDir> I<DirName>
2707
2708 Location of working directory. Default: current directory.
2709
2710 =back
2711
2712 =head1 EXAMPLES
2713
2714 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
2715 to find 10 most similar database molecules with application of Max group fusion rule and similarity
2716 cutoff to supported fingerprints strings data in SD fingerprints files present in data fields with
2717 Fingerprint substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing
2718 sequentially generated database compound IDs with Cmpd prefix, type:
2719
2720 % SimilaritySearchingFingerprints.pl -o ReferenceSampleFPHex.sdf
2721 DatabaseSampleFPHex.sdf
2722
2723 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
2724 to find 10 most similar database molecules with application of Max group fusion rule and similarity
2725 cutoff to supported fingerprints strings data in FP fingerprints files, and create a
2726 SimilaritySearchResults.csv file containing database compound IDs retrieved from FP file, type:
2727
2728 % SimilaritySearchingFingerprints.pl -r SimilaritySearchResults -o
2729 ReferenceSampleFPBin.fpf DatabaseSampleFPBin.fpf
2730
2731 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
2732 to find 10 most similar database molecules with application of Max group fusion rule and
2733 similarity cutoff to supported fingerprints strings data in text fingerprints files present in column
2734 names containing Fingerprint substring in their names, and create a ReferenceFPHexSimilaritySearching.csv
2735 file containing database compound IDs retrieved from column name containing CompoundID substring or
2736 sequentially generated compound IDs, type:
2737
2738 % SimilaritySearchingFingerprints.pl -o ReferenceSampleFPCount.csv
2739 DatabaseSampleFPCount.csv
2740
2741 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2742 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
2743 supported fingerprints strings data in SD fingerprints files present in data fields with Fingerprint substring
2744 in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing sequentially generated
2745 reference and database compound IDs with Cmpd prefix, type:
2746
2747 % SimilaritySearchingFingerprints.pl -mode IndividualReference -o
2748 ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
2749
2750 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2751 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
2752 supported fingerprints strings data in FP fingerprints files, and create a ReferenceFPHexSimilaritySearching.csv
2753 file containing reference and database compound IDs retrieved from FP file, type:
2754
2755 % SimilaritySearchingFingerprints.pl -mode IndividualReference -o
2756 ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
2757
2758 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2759 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
2760 supported fingerprints strings data in text fingerprints files present in column names containing Fingerprint
2761 substring in their names, and create a ReferenceFPHexSimilaritySearching.csv file containing reference and
2762 database compound IDs retrieved from column name containing CompoundID substring or sequentially generated
2763 compound IDs, type:
2764
2765 % SimilaritySearchingFingerprints.pl -mode IndividualReference -o
2766 ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
2767
2768 To perform dissimilarity search using Tanimoto coefficient by treating all reference molecules as a set
2769 to find 10 most dissimilar database molecules with application of Max group fusion rule and similarity
2770 cutoff to supported fingerprints strings data in SD fingerprints files present in data fields with
2771 Fingerprint substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing
2772 sequentially generated database compound IDs with Cmpd prefix, type:
2773
2774 % SimilaritySearchingFingerprints.pl --mode MultipleReferences --SearchMode
2775 DissimilaritySearch -o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
2776
2777 To perform similarity search using CityBlock distance by treating reference molecules as individual molecules
2778 to find 10 most similar database molecules for each reference molecule with application of distance cutoff
2779 to supported vector fingerprints strings data in SD fingerprints files present in data fields with Fingerprint
2780 substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing sequentially generated
2781 reference and database compound IDs with Cmpd prefix, type:
2782
2783 % SimilaritySearchingFingerprints.pl -mode IndividualReference
2784 --VectorComparisonMode CityBlockDistance --VectorComparisonFormulism
2785 AlgebraicForm --DistanceCutoff 10 -o
2786 ReferenceSampleFPCount.sdf DatabaseSampleFPCount.sdf
2787
2788 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
2789 to find 100 most similar database molecules with application of Mean group fusion rule to top 10
2790 reference molecules within similarity cutoff of 0.75 to supported fingerprints strings data in FP fingerprints
2791 files, and create a ReferenceFPHexSimilaritySearching.csv file containing database compound IDs retrieved
2792 from FP file, type:
2793
2794 % SimilaritySearchingFingerprints.pl --mode MultipleReferences --SearchMode
2795 SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
2796 --GroupFusionRule Mean --GroupFusionApplyCutoff Yes --kNN 10
2797 --SimilarityCutoff 0.75 --SimilarCountMode NumOfSimilar
2798 --NumOfSimilarMolecules 100 -o
2799 ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
2800
2801 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2802 to find 2 percent of most similar database molecules for each reference molecule with application of similarity
2803 cutoff of 0.85 to supported fingerprints strings data in text fingerprints files present in specific columns and
2804 create a ReferenceFPHexSimilaritySearching.csv file containing reference and database compoundIDs retrieved
2805 from specific columns, type:
2806
2807 % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
2808 SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
2809 --ReferenceColMode ColLabel --ReferenceFingerprintsCol Fingerprints
2810 --ReferenceCompoundIDCol CompoundID --DatabaseColMode ColLabel
2811 --DatabaseCompoundIDCol CompoundID --DatabaseFingerprintsCol
2812 Fingerprints --SimilarityCutoff 0.85 --SimilarCountMode PercentSimilar
2813 --PercentSimilarMolecules 2 -o
2814 ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
2815
2816 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2817 to find top 50 most similar database molecules for each reference molecule with application of similarity
2818 cutoff of 0.85 to supported fingerprints strings data in SD fingerprints files present in specific data fields and
2819 create both ReferenceFPHexSimilaritySearching.csv and ReferenceFPHexSimilaritySearching.sdf files containing
2820 reference and database compoundIDs retrieved from specific data fields, type:
2821
2822 % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
2823 SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
2824 --ReferenceFingerprintsField Fingerprints
2825 --DatabaseFingerprintsField Fingerprints
2826 --ReferenceCompoundIDMode DataField --ReferenceCompoundIDField CmpdID
2827 --DatabaseCompoundIDMode DataField --DatabaseCompoundIDField CmpdID
2828 --SimilarityCutoff 0.85 --SimilarCountMode NumOfSimilar
2829 --NumOfSimilarMolecules 50 --output both -o
2830 ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
2831
2832 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
2833 to find 1 percent of most similar database molecules for each reference molecule with application of similarity
2834 cutoff to supported fingerprints strings data in SD fingerprints files present in specific data field labels, and create
2835 both ReferenceFPHexSimilaritySearching.csv and ReferenceFPHexSimilaritySearching.sdf files containing reference and
2836 database compound IDs retrieved from specific data field labels along with other specific data for database
2837 molecules, type:
2838
2839 % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
2840 SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
2841 --ReferenceFingerprintsField Fingerprints
2842 --DatabaseFingerprintsField Fingerprints
2843 --ReferenceCompoundIDMode DataField --ReferenceCompoundIDField CmpdID
2844 --DatabaseCompoundIDMode DataField --DatabaseCompoundIDField CmpdID
2845 --DatabaseDataFieldsMode Specify --DatabaseDataFields "TPSA,SLogP"
2846 --SimilarityCutoff 0.75 --SimilarCountMode PercentSimilar
2847 --PercentSimilarMolecules 1 --output both --OutDelim comma --quote Yes
2848 --precision 3 -o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
2849
2850 =head1 AUTHOR
2851
2852 Manish Sud <msud@san.rr.com>
2853
2854 =head1 SEE ALSO
2855
2856 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
2857 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
2858 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
2859 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
2860
2861 =head1 COPYRIGHT
2862
2863 Copyright (C) 2015 Manish Sud. All rights reserved.
2864
2865 This file is part of MayaChemTools.
2866
2867 MayaChemTools is free software; you can redistribute it and/or modify it under
2868 the terms of the GNU Lesser General Public License as published by the Free
2869 Software Foundation; either version 3 of the License, or (at your option)
2870 any later version.
2871
2872 =cut