| 
0
 | 
     1 NAME
 | 
| 
 | 
     2     SimilaritySearchingFingerprints.pl - Perform similarity search using
 | 
| 
 | 
     3     fingerprints strings data in SD, FP and CSV/TSV text file(s)
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 SYNOPSIS
 | 
| 
 | 
     6     SimilaritySearchingFingerprints.pl ReferenceFPFile DatabaseFPFile
 | 
| 
 | 
     7 
 | 
| 
 | 
     8     SimilaritySearchingFingerprints.pl [--alpha *number*] [--beta *number*]
 | 
| 
 | 
     9     [-b, --BitVectorComparisonMode *TanimotoSimilarity | TverskySimilarity |
 | 
| 
 | 
    10     ...*] [--DatabaseColMode *ColNum | ColLabel*] [--DatabaseCompoundIDCol
 | 
| 
 | 
    11     *col number | col name*] [--DatabaseCompoundIDPrefix *text*]
 | 
| 
 | 
    12     [--DatabaseCompoundIDField *DataFieldName*] [--DatabaseCompoundIDMode
 | 
| 
 | 
    13     *DataField | MolName | LabelPrefix | MolNameOrLabelPrefix*]
 | 
| 
 | 
    14     [--DatabaseDataCols *"DataColNum1, DataColNum2,... " | "DataColLabel1,
 | 
| 
 | 
    15     DataColLabel2,... "*] [--DatabaseDataColsMode *All | Specify |
 | 
| 
 | 
    16     CompoundID*] [--DatabaseDataFields *"FieldLabel1, FieldLabel2,... "*]
 | 
| 
 | 
    17     [--DatabaseDataFieldsMode *All | Common | Specify | CompoundID*]
 | 
| 
 | 
    18     [--DatabaseFingerprintsCol *col number | col name*]
 | 
| 
 | 
    19     [--DatabaseFingerprintsField *FieldLabel*] [--DistanceCutoff *number*]
 | 
| 
 | 
    20     [-d, --detail *InfoLevel*] [-f, --fast] [--FingerprintsMode *AutoDetect
 | 
| 
 | 
    21     | FingerprintsBitVectorString | FingerprintsVectorString*] [-g,
 | 
| 
 | 
    22     --GroupFusionRule *Max, Mean, Median, Min, Sum, Euclidean*]
 | 
| 
 | 
    23     [--GroupFusionApplyCutoff *Yes | No*] [-h, --help] [--InDelim *comma |
 | 
| 
 | 
    24     semicolon*] [-k, --KNN *all | number*] [-m, --mode *IndividualReference
 | 
| 
 | 
    25     | MultipleReferences*] [-n, --NumOfSimilarMolecules *number*]
 | 
| 
 | 
    26     [--OutDelim *comma | tab | semicolon*] [--output *SD | text | both*]
 | 
| 
 | 
    27     [-o, --overwrite] [-p, --PercentSimilarMolecules *number*] [--precision
 | 
| 
 | 
    28     *number*] [-q, --quote *Yes | No*] [--ReferenceColMode *ColNum |
 | 
| 
 | 
    29     ColLabel*] [--ReferenceCompoundIDCol *col number | col name*]
 | 
| 
 | 
    30     [--ReferenceCompoundIDPrefix *text*] [--ReferenceCompoundIDField
 | 
| 
 | 
    31     *DataFieldName*] [--ReferenceCompoundIDMode *DataField | MolName |
 | 
| 
 | 
    32     LabelPrefix | MolNameOrLabelPrefix*] [--ReferenceFingerprintsCol *col
 | 
| 
 | 
    33     number | col name*] [--ReferenceFingerprintsField *FieldLabel*] [-r,
 | 
| 
 | 
    34     --root *RootName*] [-s, --SearchMode *SimilaritySearch |
 | 
| 
 | 
    35     DissimilaritySearch*] [--SimilarCountMode *NumOfSimilar |
 | 
| 
 | 
    36     PercentSimilar*] [--SimilarityCutoff *number*] [-v,
 | 
| 
 | 
    37     --VectorComparisonMode *TanimotoSimilarity | ... | ManhattanDistance |
 | 
| 
 | 
    38     ...*] [--VectorComparisonFormulism *AlgebraicForm | BinaryForm |
 | 
| 
 | 
    39     SetTheoreticForm*] [-w, --WorkingDir dirname] ReferenceFingerprintsFile
 | 
| 
 | 
    40     DatabaseFingerprintsFile
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 DESCRIPTION
 | 
| 
 | 
    43     Perform molecular similarity search [ Ref 94-113 ] using fingerprint
 | 
| 
 | 
    44     bit-vector or vector strings data in *SD, FP, or CSV/TSV text* files
 | 
| 
 | 
    45     corresponding to *ReferenceFingerprintsFile* and
 | 
| 
 | 
    46     *DatabaseFingerprintsFile*, and generate SD and CSV/TSV text file(s)
 | 
| 
 | 
    47     containing database molecules which are similar to reference
 | 
| 
 | 
    48     molecule(s). The reference molecules are also referred to as query or
 | 
| 
 | 
    49     seed molecules and database molecules as target molecules in the
 | 
| 
 | 
    50     literature.
 | 
| 
 | 
    51 
 | 
| 
 | 
    52     The current release of MayaChemTools supports two types of similarity
 | 
| 
 | 
    53     search modes: *IndividualReference or MultipleReferences*. For default
 | 
| 
 | 
    54     value of *MultipleReferences* for -m, --mode option, reference molecules
 | 
| 
 | 
    55     are considered as a set and -g, --GroupFusionRule is used to calculate
 | 
| 
 | 
    56     similarity of a database molecule against reference molecules set. The
 | 
| 
 | 
    57     group fusion rule is also referred to as data fusion or consensus
 | 
| 
 | 
    58     scoring in the literature. However, for *IndividualReference* value of
 | 
| 
 | 
    59     -m, --mode option, reference molecules are treated as individual
 | 
| 
 | 
    60     molecules and each reference molecule is compared against a database
 | 
| 
 | 
    61     molecule by itself to identify similar molecules.
 | 
| 
 | 
    62 
 | 
| 
 | 
    63     The molecular dissimilarity search can also be performed using
 | 
| 
 | 
    64     *DissimilaritySearch* value for -s, --SearchMode option. During
 | 
| 
 | 
    65     dissimilarity search or usage of distance comparison coefficient in
 | 
| 
 | 
    66     similarity search, the meaning of fingerprints comparison
 | 
| 
 | 
    67     value is automatically reversed as shown below:
 | 
| 
 | 
    68 
 | 
| 
 | 
    69         SearchMode     ComparisonCoefficient  ResultsSort   ComparisonValues
 | 
| 
 | 
    70 
 | 
| 
 | 
    71         Similarity     SimilarityCoefficient  Descending    Higher value implies
 | 
| 
 | 
    72                                                             high similarity
 | 
| 
 | 
    73         Similarity     DistanceCoefficient    Ascending     Lower value implies
 | 
| 
 | 
    74                                                             high similarity
 | 
| 
 | 
    75 
 | 
| 
 | 
    76         Dissimilarity  SimilarityCoefficient  Ascending     Lower value implies
 | 
| 
 | 
    77                                                             high dissimilarity
 | 
| 
 | 
    78         Dissimilarity  DistanceCoefficient    Descending    Higher value implies
 | 
| 
 | 
    79                                                             high dissimilarity
 | 
| 
 | 
    80 
 | 
| 
 | 
    81     During *IndividualReference* value of -m, --Mode option for similarity
 | 
| 
 | 
    82     search, fingerprints bit-vector or vector string of each reference
 | 
| 
 | 
    83     molecule is compared with database molecules using specified similarity
 | 
| 
 | 
    84     or distance coefficients to identify most similar molecules for each
 | 
| 
 | 
    85     reference molecule. Based on value of --SimilarCountMode, up to -n,
 | 
| 
 | 
    86     --NumOfSimilarMolecules or -p, --PercentSimilarMolecules at specified
 | 
| 
 | 
    87     --SimilarityCutoff or --DistanceCutoff are identified for each reference
 | 
| 
 | 
    88     molecule.
 | 
| 
 | 
    89 
 | 
| 
 | 
    90     During *MultipleReferences* value of -m, --mode option for similarity
 | 
| 
 | 
    91     search, all reference molecules are considered as a set and -g,
 | 
| 
 | 
    92     --GroupFusionRule is used to calculate similarity of a database molecule
 | 
| 
 | 
    93     against reference molecules set either using all reference molecules or
 | 
| 
 | 
    94     number of k-nearest neighbors (k-NN) to a database molecule specified
 | 
| 
 | 
    95     using -k, --kNN. The fingerprints bit-vector or vector string of each
 | 
| 
 | 
    96     reference molecule in a set is compared with a database molecule using a
 | 
| 
 | 
    97     similarity or distance coefficient specified via -b,
 | 
| 
 | 
    98     --BitVectorComparisonMode or -v, --VectorComparisonMode. The reference
 | 
| 
 | 
    99     molecules whose comparison values with a database molecule fall outside
 | 
| 
 | 
   100     specified --SimilarityCutoff or --DistanceCutoff are ignored during
 | 
| 
 | 
   101     *Yes* value of --GroupFusionApplyCutoff. The specified -g,
 | 
| 
 | 
   102     --GroupFusionRule is applied to -k, --kNN reference molecules to
 | 
| 
 | 
   103     calculate final similarity value between a database molecule and
 | 
| 
 | 
   104     reference molecules set.
 | 
| 
 | 
   105 
 | 
| 
 | 
   106     The input fingerprints *SD, FP, or Text (CSV/TSV)* files for
 | 
| 
 | 
   107     *ReferenceFingerprintsFile* and *DatabaseFingerprintsFile* must contain valid
 | 
| 
 | 
   108     fingerprint bit-vector or vector strings data corresponding to same type
 | 
| 
 | 
   109     of fingerprints.
 | 
| 
 | 
   110 
 | 
| 
 | 
   111     The valid fingerprints *SDFile* extensions are *.sdf* and *.sd*. The
 | 
| 
 | 
   112     valid fingerprints *FPFile* extensions are *.fpf* and *.fp*. The valid
 | 
| 
 | 
   113     fingerprints *TextFile (CSV/TSV)* extensions are *.csv* and *.tsv* for
 | 
| 
 | 
   114     comma/semicolon and tab delimited text files respectively. The --indelim
 | 
| 
 | 
   115     option determines the format of *TextFile*. Any file which doesn't
 | 
| 
 | 
   116     correspond to the format indicated by --indelim option is ignored.
 | 
| 
 | 
   117 
 | 
| 
 | 
   118     Example of *FP* file containing fingerprints bit-vector string data:
 | 
| 
 | 
   119 
 | 
| 
 | 
   120         #
 | 
| 
 | 
   121         # Package = MayaChemTools 7.4
 | 
| 
 | 
   122         # ReleaseDate = Oct 21, 2010
 | 
| 
 | 
   123         #
 | 
| 
 | 
   124         # TimeStamp =  Mon Mar 7 15:14:01 2011
 | 
| 
 | 
   125         #
 | 
| 
 | 
   126         # FingerprintsStringType = FingerprintsBitVector
 | 
| 
 | 
   127         #
 | 
| 
 | 
   128         # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
 | 
| 
 | 
   129         # Size = 1024
 | 
| 
 | 
   130         # BitStringFormat = HexadecimalString
 | 
| 
 | 
   131         # BitsOrder = Ascending
 | 
| 
 | 
   132         #
 | 
| 
 | 
   133         Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
 | 
| 
 | 
   134         Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
 | 
| 
 | 
   135         ... ...
 | 
| 
 | 
   136         ... ...
 | 
| 
 | 
   137 
 | 
| 
 | 
   138     Example of *FP* file containing fingerprints vector string data:
 | 
| 
 | 
   139 
 | 
| 
 | 
   140         #
 | 
| 
 | 
   141         # Package = MayaChemTools 7.4
 | 
| 
 | 
   142         # ReleaseDate = Oct 21, 2010
 | 
| 
 | 
   143         #
 | 
| 
 | 
   144         # TimeStamp =  Mon Mar 7 15:14:01 2011
 | 
| 
 | 
   145         #
 | 
| 
 | 
   146         # FingerprintsStringType = FingerprintsVector
 | 
| 
 | 
   147         #
 | 
| 
 | 
   148         # Description = PathLengthCount:AtomicInvariantsAtomTypes:MinLength1:...
 | 
| 
 | 
   149         # VectorStringFormat = IDsAndValuesString
 | 
| 
 | 
   150         # VectorValuesType = NumericalValues
 | 
| 
 | 
   151         #
 | 
| 
 | 
   152         Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C:
 | 
| 
 | 
   153         N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...;
 | 
| 
 | 
   154         33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2
 | 
| 
 | 
   155         6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ...
 | 
| 
 | 
   156         Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C
 | 
| 
 | 
   157         O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...;
 | 
| 
 | 
   158         15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2
 | 
| 
 | 
   159         1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ...
 | 
| 
 | 
   160         ... ...
 | 
| 
 | 
   161         ... ...
 | 
| 
 | 
   162 
 | 
| 
 | 
   163     Example of *SD* file containing fingerprints bit-vector string data:
 | 
| 
 | 
   164 
 | 
| 
 | 
   165         ... ...
 | 
| 
 | 
   166         ... ...
 | 
| 
 | 
   167         $$$$
 | 
| 
 | 
   168         ... ...
 | 
| 
 | 
   169         ... ...
 | 
| 
 | 
   170         ... ...
 | 
| 
 | 
   171         41 44  0  0  0  0  0  0  0  0999 V2000
 | 
| 
 | 
   172          -3.3652    1.4499    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
 | 
| 
 | 
   173         ... ...
 | 
| 
 | 
   174         2  3  1  0  0  0  0
 | 
| 
 | 
   175         ... ...
 | 
| 
 | 
   176         M  END
 | 
| 
 | 
   177         >  <CmpdID>
 | 
| 
 | 
   178         Cmpd1
 | 
| 
 | 
   179 
 | 
| 
 | 
   180         >  <PathLengthFingerprints>
 | 
| 
 | 
   181         FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
 | 
| 
 | 
   182         h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
 | 
| 
 | 
   183         03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
 | 
| 
 | 
   184         00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
 | 
| 
 | 
   185         08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
 | 
| 
 | 
   186         aa0660a11014a011d46
 | 
| 
 | 
   187 
 | 
| 
 | 
   188         $$$$
 | 
| 
 | 
   189         ... ...
 | 
| 
 | 
   190         ... ...
 | 
| 
 | 
   191 
 | 
| 
 | 
   192     Example of CSV *TextFile* containing fingerprints bit-vector string
 | 
| 
 | 
   193     data:
 | 
| 
 | 
   194 
 | 
| 
 | 
   195         "CompoundID","PathLengthFingerprints"
 | 
| 
 | 
   196         "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
 | 
| 
 | 
   197         :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
 | 
| 
 | 
   198         9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
 | 
| 
 | 
   199         8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
 | 
| 
 | 
   200         ... ...
 | 
| 
 | 
   201         ... ...
 | 
| 
 | 
   202 
 | 
| 
 | 
   203     The current release of MayaChemTools supports the following types of
 | 
| 
 | 
   204     fingerprint bit-vector and vector strings:
 | 
| 
 | 
   205 
 | 
| 
 | 
   206         FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi
 | 
| 
 | 
   207         us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT
 | 
| 
 | 
   208         C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X
 | 
| 
 | 
   209         1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A
 | 
| 
 | 
   210         TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2
 | 
| 
 | 
   211         -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B...
 | 
| 
 | 
   212 
 | 
| 
 | 
   213         FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
 | 
| 
 | 
   214         ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
 | 
| 
 | 
   215         .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
 | 
| 
 | 
   216         O.X1.BO2;2 4 14 3 10 1 1 1 3 2
 | 
| 
 | 
   217 
 | 
| 
 | 
   218         FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
 | 
| 
 | 
   219         ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
 | 
| 
 | 
   220         N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
 | 
| 
 | 
   221 
 | 
| 
 | 
   222         FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
 | 
| 
 | 
   223         umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
 | 
| 
 | 
   224         12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
 | 
| 
 | 
   225         2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
 | 
| 
 | 
   226         O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
 | 
| 
 | 
   227         1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
 | 
| 
 | 
   228 
 | 
| 
 | 
   229         FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
 | 
| 
 | 
   230         AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
 | 
| 
 | 
   231         H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
 | 
| 
 | 
   232         .024 -2.270
 | 
| 
 | 
   233 
 | 
| 
 | 
   234         FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
 | 
| 
 | 
   235         ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435
 | 
| 
 | 
   236         4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
 | 
| 
 | 
   237         4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 | 
| 
 | 
   238         0 0 0 0 0 0 0 0 0 0 0 0 0 0
 | 
| 
 | 
   239 
 | 
| 
 | 
   240         FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
 | 
| 
 | 
   241         us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
 | 
| 
 | 
   242         666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
 | 
| 
 | 
   243         08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
 | 
| 
 | 
   244         5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
 | 
| 
 | 
   245         532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
 | 
| 
 | 
   246 
 | 
| 
 | 
   247         FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
 | 
| 
 | 
   248         :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
 | 
| 
 | 
   249         13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
 | 
| 
 | 
   250         2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
 | 
| 
 | 
   251         3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
 | 
| 
 | 
   252         1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
 | 
| 
 | 
   253 
 | 
| 
 | 
   254         FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
 | 
| 
 | 
   255         es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
 | 
| 
 | 
   256         0000000001010000000110000011000000000000100000000000000000000000100001
 | 
| 
 | 
   257         1000000110000000000000000000000000010011000000000000000000000000010000
 | 
| 
 | 
   258         0000000000000000000000000010000000000000000001000000000000000000000000
 | 
| 
 | 
   259         0000000000010000100001000000000000101000000000000000100000000000000...
 | 
| 
 | 
   260 
 | 
| 
 | 
   261         FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
 | 
| 
 | 
   262         s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
 | 
| 
 | 
   263         62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
 | 
| 
 | 
   264         1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
 | 
| 
 | 
   265         7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
 | 
| 
 | 
   266         632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
 | 
| 
 | 
   267 
 | 
| 
 | 
   268         FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
 | 
| 
 | 
   269         haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
 | 
| 
 | 
   270         926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
 | 
| 
 | 
   271         44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
 | 
| 
 | 
   272         1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
 | 
| 
 | 
   273         1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
 | 
| 
 | 
   274 
 | 
| 
 | 
   275         FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
 | 
| 
 | 
   276         0000000000000000000000000000000001001000010010000000010010000000011100
 | 
| 
 | 
   277         0100101010111100011011000100110110000011011110100110111111111111011111
 | 
| 
 | 
   278         11111111111110111000
 | 
| 
 | 
   279 
 | 
| 
 | 
   280         FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
 | 
| 
 | 
   281         1110011111100101111111000111101100110000000000000011100010000000000000
 | 
| 
 | 
   282         0000000000000000000000000000000000000000000000101000000000000000000000
 | 
| 
 | 
   283         0000000000000000000000000000000000000000000000000000000000000000000000
 | 
| 
 | 
   284         0000000000000000000000000000000000000011000000000000000000000000000000
 | 
| 
 | 
   285         0000000000000000000000000000000000000000
 | 
| 
 | 
   286 
 | 
| 
 | 
   287         FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
 | 
| 
 | 
   288         ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 | 
| 
 | 
   289         0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
 | 
| 
 | 
   290         0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
 | 
| 
 | 
   291         5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
 | 
| 
 | 
   292         3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
 | 
| 
 | 
   293 
 | 
| 
 | 
   294         FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
 | 
| 
 | 
   295         ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
 | 
| 
 | 
   296         0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
 | 
| 
 | 
   297         0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 | 
| 
 | 
   298         0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
 | 
| 
 | 
   299         0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
 | 
| 
 | 
   300 
 | 
| 
 | 
   301         FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
 | 
| 
 | 
   302         th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
 | 
| 
 | 
   303         0100010101011000101001011100110001000010001001101000001001001001001000
 | 
| 
 | 
   304         0010110100000111001001000001001010100100100000000011000000101001011100
 | 
| 
 | 
   305         0010000001000101010100000100111100110111011011011000000010110111001101
 | 
| 
 | 
   306         0101100011000000010001000011000010100011101100001000001000100000000...
 | 
| 
 | 
   307 
 | 
| 
 | 
   308         FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
 | 
| 
 | 
   309         1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
 | 
| 
 | 
   310         C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
 | 
| 
 | 
   311         2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
 | 
| 
 | 
   312         2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
 | 
| 
 | 
   313         4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
 | 
| 
 | 
   314 
 | 
| 
 | 
   315         FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
 | 
| 
 | 
   316         h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
 | 
| 
 | 
   317         8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
 | 
| 
 | 
   318         5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
 | 
| 
 | 
   319         CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
 | 
| 
 | 
   320         OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
 | 
| 
 | 
   321 
 | 
| 
 | 
   322         FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
 | 
| 
 | 
   323         istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
 | 
| 
 | 
   324         .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
 | 
| 
 | 
   325         H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
 | 
| 
 | 
   326         2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
 | 
| 
 | 
   327         1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
 | 
| 
 | 
   328 
 | 
| 
 | 
   329         FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
 | 
| 
 | 
   330         stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar
 | 
| 
 | 
   331         Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H
 | 
| 
 | 
   332         BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...;
 | 
| 
 | 
   333         23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
 | 
| 
 | 
   334         1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
 | 
| 
 | 
   335 
 | 
| 
 | 
   336         FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3
 | 
| 
 | 
   337         3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-
 | 
| 
 | 
   338         C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO
 | 
| 
 | 
   339         2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...;
 | 
| 
 | 
   340         2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1
 | 
| 
 | 
   341 
 | 
| 
 | 
   342         FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica
 | 
| 
 | 
   343         lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC
 | 
| 
 | 
   344         H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC-
 | 
| 
 | 
   345         ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...;
 | 
| 
 | 
   346         4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
 | 
| 
 | 
   347 
 | 
| 
 | 
   348         FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
 | 
| 
 | 
   349         inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
 | 
| 
 | 
   350         .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
 | 
| 
 | 
   351         0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
 | 
| 
 | 
   352         -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
 | 
| 
 | 
   353         1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
 | 
| 
 | 
   354         2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
 | 
| 
 | 
   355 
 | 
| 
 | 
   356         FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
 | 
| 
 | 
   357         :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C
 | 
| 
 | 
   358         .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3-
 | 
| 
 | 
   359         D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2
 | 
| 
 | 
   360         -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C.
 | 
| 
 | 
   361         3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7...
 | 
| 
 | 
   362 
 | 
| 
 | 
   363         FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
 | 
| 
 | 
   364         Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
 | 
| 
 | 
   365         -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
 | 
| 
 | 
   366         HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
 | 
| 
 | 
   367         BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
 | 
| 
 | 
   368         18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
 | 
| 
 | 
   369         3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
 | 
| 
 | 
   370 
 | 
| 
 | 
   371         FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
 | 
| 
 | 
   372         ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
 | 
| 
 | 
   373         0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
 | 
| 
 | 
   374         0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
 | 
| 
 | 
   375         0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
 | 
| 
 | 
   376         0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
 | 
| 
 | 
   377 
 | 
| 
 | 
   378         FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize:
 | 
| 
 | 
   379         MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1-
 | 
| 
 | 
   380         Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1
 | 
| 
 | 
   381         -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1-
 | 
| 
 | 
   382         HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...;
 | 
| 
 | 
   383         46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23
 | 
| 
 | 
   384         28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1
 | 
| 
 | 
   385         119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ...
 | 
| 
 | 
   386 
 | 
| 
 | 
   387         FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD
 | 
| 
 | 
   388         istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106
 | 
| 
 | 
   389         8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0
 | 
| 
 | 
   390         0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26
 | 
| 
 | 
   391         14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0
 | 
| 
 | 
   392         0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ...
 | 
| 
 | 
   393 
 | 
| 
 | 
   394 OPTIONS
 | 
| 
 | 
   395     --alpha *number*
 | 
| 
 | 
   396         Value of alpha parameter for calculating *Tversky* similarity
 | 
| 
 | 
   397         coefficient specified for -b, --BitVectorComparisonMode option. It
 | 
| 
 | 
   398         corresponds to weights assigned for bits set to "1" in a pair of
 | 
| 
 | 
   399         fingerprint bit-vectors during the calculation of similarity
 | 
| 
 | 
   400         coefficient. Possible values: *0 to 1*. Default value: <0.5>.
 | 
| 
 | 
   401 
 | 
| 
 | 
   402     --beta *number*
 | 
| 
 | 
   403         Value of beta parameter for calculating *WeightedTanimoto* and
 | 
| 
 | 
   404         *WeightedTversky* similarity coefficients specified for -b,
 | 
| 
 | 
   405         --BitVectorComparisonMode option. It is used to weight the
 | 
| 
 | 
   406         contributions of bits set to "0" during the calculation of
 | 
| 
 | 
   407         similarity coefficients. Possible values: *0 to 1*. Default value of
 | 
| 
 | 
   408         <1> makes *WeightedTanimoto* and *WeightedTversky* equivalent to
 | 
| 
 | 
   409         *Tanimoto* and *Tversky*.
 | 
| 
 | 
   410 
 | 
| 
 | 
   411     -b, --BitVectorComparisonMode *TanimotoSimilarity | TverskySimilarity |
 | 
| 
 | 
   412     ...*
 | 
| 
 | 
   413         Specify what similarity coefficient to use for calculating
 | 
| 
 | 
   414         similarity between fingerprints bit-vector string data values in
 | 
| 
 | 
   415         *ReferenceFingerprintsFile* and *DatabaseFingerprintsFile* during
 | 
| 
 | 
   416         similarity search. Possible values: *TanimotoSimilarity |
 | 
| 
 | 
   417         TverskySimilarity | ...*. Default: *TanimotoSimilarity*
 | 
| 
 | 
   418 
 | 
| 
 | 
   419         The current release supports the following similarity coefficients:
 | 
| 
 | 
   420         *BaroniUrbaniSimilarity, BuserSimilarity, CosineSimilarity,
 | 
| 
 | 
   421         DiceSimilarity, DennisSimilarity, ForbesSimilarity,
 | 
| 
 | 
   422         FossumSimilarity, HamannSimilarity, JacardSimilarity,
 | 
| 
 | 
   423         Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity,
 | 
| 
 | 
   424         McConnaugheySimilarity, OchiaiSimilarity, PearsonSimilarity,
 | 
| 
 | 
   425         RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity,
 | 
| 
 | 
   426         SkoalSneath1Similarity, SkoalSneath2Similarity,
 | 
| 
 | 
   427         SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity,
 | 
| 
 | 
   428         YuleSimilarity, WeightedTanimotoSimilarity,
 | 
| 
 | 
   429         WeightedTverskySimilarity*. These similarity coefficients are
 | 
| 
 | 
   430         described below.
 | 
| 
 | 
   431 
 | 
| 
 | 
   432         For two fingerprint bit-vectors A and B of same size, let:
 | 
| 
 | 
   433 
 | 
| 
 | 
   434             Na = Number of bits set to "1" in A
 | 
| 
 | 
   435             Nb = Number of bits set to "1" in B
 | 
| 
 | 
   436             Nc = Number of bits set to "1" in both A and B
 | 
| 
 | 
   437             Nd = Number of bits set to "0" in both A and B
 | 
| 
 | 
   438 
 | 
| 
 | 
   439             Nt = Number of bits set to "1" or "0" in A or B (Size of A or B)
 | 
| 
 | 
   440             Nt = Na + Nb - Nc + Nd
 | 
| 
 | 
   441 
 | 
| 
 | 
   442             Na - Nc = Number of bits set to "1" in A but not in B
 | 
| 
 | 
   443             Nb - Nc = Number of bits set to "1" in B but not in A
 | 
| 
 | 
   444 
 | 
| 
 | 
   445         Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of
 | 
| 
 | 
   446         bit-vectors A and B are defined as follows:
 | 
| 
 | 
   447 
 | 
| 
 | 
   448         *BaroniUrbaniSimilarity*: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc *
 | 
| 
 | 
   449         Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser )
 | 
| 
 | 
   450 
 | 
| 
 | 
   451         *BuserSimilarity*: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) +
 | 
| 
 | 
   452         Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani )
 | 
| 
 | 
   453 
 | 
| 
 | 
   454         *CosineSimilarity*: Nc / SQRT ( Na * Nb ) (same as Ochiai)
 | 
| 
 | 
   455 
 | 
| 
 | 
   456         *DiceSimilarity*: (2 * Nc) / ( Na + Nb )
 | 
| 
 | 
   457 
 | 
| 
 | 
   458         *DennisSimilarity*: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
 | 
| 
 | 
   459         SQRT ( Nt * Na * Nb)
 | 
| 
 | 
   460 
 | 
| 
 | 
   461         *ForbesSimilarity*: ( Nt * Nc ) / ( Na * Nb )
 | 
| 
 | 
   462 
 | 
| 
 | 
   463         *FossumSimilarity*: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb )
 | 
| 
 | 
   464 
 | 
| 
 | 
   465         *HamannSimilarity*: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt
 | 
| 
 | 
   466 
 | 
| 
 | 
   467         *JaccardSimilarity*: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / (
 | 
| 
 | 
   468         Na + Nb - Nc ) (same as Tanimoto)
 | 
| 
 | 
   469 
 | 
| 
 | 
   470         *Kulczynski1Similarity*: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / (
 | 
| 
 | 
   471         Na + Nb - 2Nc )
 | 
| 
 | 
   472 
 | 
| 
 | 
   473         *Kulczynski2Similarity*: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + (
 | 
| 
 | 
   474         Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5
 | 
| 
 | 
   475         * ( Nc / Na + Nc / Nb )
 | 
| 
 | 
   476 
 | 
| 
 | 
   477         *MatchingSimilarity*: ( Nc + Nd ) / Nt
 | 
| 
 | 
   478 
 | 
| 
 | 
   479         *McConnaugheySimilarity*: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / (
 | 
| 
 | 
   480         Na * Nb )
 | 
| 
 | 
   481 
 | 
| 
 | 
   482         *OchiaiSimilarity*: Nc / SQRT ( Na * Nb ) (same as Cosine)
 | 
| 
 | 
   483 
 | 
| 
 | 
   484         *PearsonSimilarity*: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
 | 
| 
 | 
   485         SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) )
 | 
| 
 | 
   486 
 | 
| 
 | 
   487         *RogersTanimotoSimilarity*: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc)
 | 
| 
 | 
   488         + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt)
 | 
| 
 | 
   489 
 | 
| 
 | 
   490         *RussellRaoSimilarity*: Nc / Nt
 | 
| 
 | 
   491 
 | 
| 
 | 
   492         *SimpsonSimilarity*: Nc / MIN ( Na, Nb)
 | 
| 
 | 
   493 
 | 
| 
 | 
   494         *SkoalSneath1Similarity*: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb -
 | 
| 
 | 
   495         Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc )
 | 
| 
 | 
   496 
 | 
| 
 | 
   497         *SkoalSneath2Similarity*: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
 | 
| 
 | 
   498 
 | 
| 
 | 
   499         *SkoalSneath3Similarity*: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc )
 | 
| 
 | 
   500         ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc )
 | 
| 
 | 
   501 
 | 
| 
 | 
   502         *TanimotoSimilarity*: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc /
 | 
| 
 | 
   503         ( Na + Nb - Nc ) (same as Jaccard)
 | 
| 
 | 
   504 
 | 
| 
 | 
   505         *TverskySimilarity*: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * (
 | 
| 
 | 
   506         Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb)
 | 
| 
 | 
   507 
 | 
| 
 | 
   508         *YuleSimilarity*: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
 | 
| 
 | 
   509         ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) )
 | 
| 
 | 
   510 
 | 
| 
 | 
   511         Values of Tanimoto/Jaccard and Tversky coefficients are dependent on
 | 
| 
 | 
   512         only those bits which are set to "1" in both A and B. In order to
 | 
| 
 | 
   513         take into account all bit positions, modified versions of Tanimoto [
 | 
| 
 | 
   514         Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
 | 
| 
 | 
   515 
 | 
| 
 | 
   516         Let:
 | 
| 
 | 
   517 
 | 
| 
 | 
   518             Na' = Number of bits set to "0" in A
 | 
| 
 | 
   519             Nb' = Number of bits set to "0" in B
 | 
| 
 | 
   520             Nc' = Number of bits set to "0" in both A and B
 | 
| 
 | 
   521 
 | 
| 
 | 
   522         Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / (
 | 
| 
 | 
   523         Na' + Nb' - Nc' )
 | 
| 
 | 
   524 
 | 
| 
 | 
   525         Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc'
 | 
| 
 | 
   526         ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb')
 | 
| 
 | 
   527 
 | 
| 
 | 
   528         Then:
 | 
| 
 | 
   529 
 | 
| 
 | 
   530         *WeightedTanimotoSimilarity* = beta * Tanimoto + (1 - beta) *
 | 
| 
 | 
   531         Tanimoto'
 | 
| 
 | 
   532 
 | 
| 
 | 
   533         *WeightedTverskySimilarity* = beta * Tversky + (1 - beta) * Tversky'
 | 
| 
 | 
   534 
 | 
| 
 | 
   535     --DatabaseColMode *ColNum | ColLabel*
 | 
| 
 | 
   536         Specify how columns are identified in database fingerprints
 | 
| 
 | 
   537         *TextFile*: using column number or column label. Possible values:
 | 
| 
 | 
   538         *ColNum or ColLabel*. Default value: *ColNum*.
 | 
| 
 | 
   539 
 | 
| 
 | 
   540     --DatabaseCompoundIDCol *col number | col name*
 | 
| 
 | 
   541         This value is --DatabaseColMode mode specific. It specifies column
 | 
| 
 | 
   542         to use for retrieving compound ID from database fingerprints
 | 
| 
 | 
   543         *TextFile* during similarity and dissimilarity search for output SD
 | 
| 
 | 
   544         and CSV/TSV text files. Possible values: *col number or col label*.
 | 
| 
 | 
   545         Default value: *first column containing the word compoundID in its
 | 
| 
 | 
   546         column label or sequentially generated IDs*.
 | 
| 
 | 
   547 
 | 
| 
 | 
   548         This is only used for *CompoundID* value of --DatabaseDataColsMode
 | 
| 
 | 
   549         option.
 | 
| 
 | 
   550 
 | 
| 
 | 
   551     --DatabaseCompoundIDPrefix *text*
 | 
| 
 | 
   552         Specify compound ID prefix to use during sequential generation of
 | 
| 
 | 
   553         compound IDs for database fingerprints *SDFile* and *TextFile*.
 | 
| 
 | 
   554         Default value: *Cmpd*. The default value generates compound IDs
 | 
| 
 | 
   555         which look like Cmpd<Number>.
 | 
| 
 | 
   556 
 | 
| 
 | 
   557         For database fingerprints *SDFile*, this value is only used during
 | 
| 
 | 
   558         *LabelPrefix | MolNameOrLabelPrefix* values of
 | 
| 
 | 
   559         --DatabaseCompoundIDMode option; otherwise, it's ignored.
 | 
| 
 | 
   560 
 | 
| 
 | 
   561         Examples for *LabelPrefix* or *MolNameOrLabelPrefix* value of
 | 
| 
 | 
   562         --DatabaseCompoundIDMode:
 | 
| 
 | 
   563 
 | 
| 
 | 
   564             Compound
 | 
| 
 | 
   565 
 | 
| 
 | 
   566         The value specified above generates compound IDs which correspond
 | 
| 
 | 
   567         to Compound<Number> instead of default value of Cmpd<Number>.
 | 
| 
 | 
   568 
 | 
| 
 | 
   569     --DatabaseCompoundIDField *DataFieldName*
 | 
| 
 | 
   570         Specify database fingerprints *SDFile* datafield label for
 | 
| 
 | 
   571         generating compound IDs. This value is only used during *DataField*
 | 
| 
 | 
   572         value of --DatabaseCompoundIDMode option.
 | 
| 
 | 
   573 
 | 
| 
 | 
   574         Examples for *DataField* value of --DatabaseCompoundIDMode:
 | 
| 
 | 
   575 
 | 
| 
 | 
   576             MolID
 | 
| 
 | 
   577             ExtReg
 | 
| 
 | 
   578 
 | 
| 
 | 
   579     --DatabaseCompoundIDMode *DataField | MolName | LabelPrefix |
 | 
| 
 | 
   580     MolNameOrLabelPrefix*
 | 
| 
 | 
   581         Specify how to generate compound IDs from database fingerprints
 | 
| 
 | 
   582         *SDFile* during similarity and dissimilarity search for output SD
 | 
| 
 | 
   583         and CSV/TSV text files: use a *SDFile* datafield value; use molname
 | 
| 
 | 
   584         line from *SDFile*; generate a sequential ID with specific prefix;
 | 
| 
 | 
   585         use combination of both MolName and LabelPrefix with usage of
 | 
| 
 | 
   586         LabelPrefix values for empty molname lines.
 | 
| 
 | 
   587 
 | 
| 
 | 
   588         Possible values: *DataField | MolName | LabelPrefix |
 | 
| 
 | 
   589         MolNameOrLabelPrefix*. Default: *LabelPrefix*.
 | 
| 
 | 
   590 
 | 
| 
 | 
   591         For *MolNameOrLabelPrefix* value of --DatabaseCompoundIDMode,
 | 
| 
 | 
   592         molname line in *SDFile* takes precedence over sequential compound
 | 
| 
 | 
   593         IDs generated using *LabelPrefix* and only empty molname values are
 | 
| 
 | 
   594         replaced with sequential compound IDs.
 | 
| 
 | 
   595 
 | 
| 
 | 
   596         This is only used for *CompoundID* value of --DatabaseDataFieldsMode
 | 
| 
 | 
   597         option.
 | 
| 
 | 
   598 
 | 
| 
 | 
   599     --DatabaseDataCols *"DataColNum1,DataColNum2,... " |
 | 
| 
 | 
   600     "DataColLabel1,DataColLabel2,... "*
 | 
| 
 | 
   601         This value is --DatabaseColMode mode specific. It is a comma
 | 
| 
 | 
   602         delimited list of database fingerprints *TextFile* data column
 | 
| 
 | 
   603         numbers or labels to extract and write to SD and CSV/TSV text files
 | 
| 
 | 
   604         along with other information for *SD | text | both* values of
 | 
| 
 | 
   605         --output option.
 | 
| 
 | 
   606 
 | 
| 
 | 
   607         This is only used for *Specify* value of --DatabaseDataColsMode
 | 
| 
 | 
   608         option.
 | 
| 
 | 
   609 
 | 
| 
 | 
   610         Examples:
 | 
| 
 | 
   611 
 | 
| 
 | 
   612             1,2,3
 | 
| 
 | 
   613             CompoundName,MolWt
 | 
| 
 | 
   614 
 | 
| 
 | 
   615     --DatabaseDataColsMode *All | Specify | CompoundID*
 | 
| 
 | 
   616         Specify how data columns from database fingerprints *TextFile* are
 | 
| 
 | 
   617         transferred to output SD and CSV/TSV text files along with other
 | 
| 
 | 
   618         information for *SD | text | both* values of --output option:
 | 
| 
 | 
   619         transfer all data columns; extract specified data columns; generate
 | 
| 
 | 
   620         a compound ID using database compound prefix. Possible values: *All |
 | 
| 
 | 
   621         Specify | CompoundID*. Default value: *CompoundID*.
 | 
| 
 | 
   622 
 | 
| 
 | 
   623     --DatabaseDataFields *"FieldLabel1,FieldLabel2,... "*
 | 
| 
 | 
   624         Comma delimited list of database fingerprints *SDFile* data fields
 | 
| 
 | 
   625         to extract and write to SD and CSV/TSV text files along with other
 | 
| 
 | 
   626         information for *SD | text | both* values of --output option.
 | 
| 
 | 
   627 
 | 
| 
 | 
   628         This is only used for *Specify* value of --DatabaseDataFieldsMode
 | 
| 
 | 
   629         option.
 | 
| 
 | 
   630 
 | 
| 
 | 
   631         Examples:
 | 
| 
 | 
   632 
 | 
| 
 | 
   633             Extreg
 | 
| 
 | 
   634             MolID,CompoundName
 | 
| 
 | 
   635 
 | 
| 
 | 
   636     --DatabaseDataFieldsMode *All | Common | Specify | CompoundID*
 | 
| 
 | 
   637         Specify how data fields from database fingerprints *SDFile* are
 | 
| 
 | 
   638         transferred to output SD and CSV/TSV text files along with other
 | 
| 
 | 
   639         information for *SD | text | both* values of --output option:
 | 
| 
 | 
   640         transfer all SD data fields; transfer SD data fields common to all
 | 
| 
 | 
   641         compounds; extract specified data fields; generate a compound ID
 | 
| 
 | 
   642         using molname line, a compound prefix, or a combination of both.
 | 
| 
 | 
   643         Possible values: *All | Common | Specify | CompoundID*. Default
 | 
| 
 | 
   644         value: *CompoundID*.
 | 
| 
 | 
   645 
 | 
| 
 | 
   646     --DatabaseFingerprintsCol *col number | col name*
 | 
| 
 | 
   647         This value is --DatabaseColMode specific. It specifies fingerprints
 | 
| 
 | 
   648         column to use during similarity and dissimilarity search for
 | 
| 
 | 
   649         database fingerprints *TextFile*. Possible values: *col number or
 | 
| 
 | 
   650         col label*. Default value: *first column containing the word
 | 
| 
 | 
   651         Fingerprints in its column label*.
 | 
| 
 | 
   652 
 | 
| 
 | 
   653     --DatabaseFingerprintsField *FieldLabel*
 | 
| 
 | 
   654         Fingerprints field label to use during similarity and dissimilarity
 | 
| 
 | 
   655         search for database fingerprints *SDFile*. Default value: *first
 | 
| 
 | 
   656         data field label containing the word Fingerprints in its label*
 | 
| 
 | 
   657 
 | 
| 
 | 
   658     --DistanceCutoff *number*
 | 
| 
 | 
   659         Distance cutoff value to use during comparison of distance value
 | 
| 
 | 
   660         between a pair of database and reference molecule calculated by
 | 
| 
 | 
   661         distance comparison methods for fingerprints vector string data
 | 
| 
 | 
   662         values. Possible values: *Any valid number*. Default value: *10*.
 | 
| 
 | 
   663 
 | 
| 
 | 
   664         The comparison value between a pair of database and reference
 | 
| 
 | 
   665         molecule must meet the cutoff criterion as shown below:
 | 
| 
 | 
   666 
 | 
| 
 | 
   667             SearchMode     CutoffCriterion  ComparisonValues
 | 
| 
 | 
   668 
 | 
| 
 | 
   669             Similarity     <=               Lower value implies high similarity
 | 
| 
 | 
   670             Dissimilarity  >=               Higher value implies high dissimilarity
 | 
| 
 | 
   671 
 | 
| 
 | 
   672         This option is only used during distance coefficients values of -v,
 | 
| 
 | 
   673         --VectorComparisonMode option.
 | 
| 
 | 
   674 
 | 
| 
 | 
   675         This option is ignored during *No* value of --GroupFusionApplyCutoff
 | 
| 
 | 
   676         for *MultipleReferences* -m, --mode.
 | 
| 
 | 
   677 
 | 
| 
 | 
   678     -d, --detail *InfoLevel*
 | 
| 
 | 
   679         Level of information to print about lines being ignored. Default:
 | 
| 
 | 
   680         *1*. Possible values: *1, 2 or 3*.
 | 
| 
 | 
   681 
 | 
| 
 | 
   682     -f, --fast
 | 
| 
 | 
   683         In this mode, fingerprints columns specified using --FingerprintsCol
 | 
| 
 | 
   684         for reference and database fingerprints *TextFile(s)*, and
 | 
| 
 | 
   685         --FingerprintsField for reference and database fingerprints
 | 
| 
 | 
   686         *SDFile(s)* are assumed to contain valid fingerprints data and no
 | 
| 
 | 
   687         checking is performed before performing similarity and dissimilarity
 | 
| 
 | 
   688         search. By default, fingerprints data is validated before computing
 | 
| 
 | 
   689         pairwise similarity and distance coefficients.
 | 
| 
 | 
   690 
 | 
| 
 | 
   691     --FingerprintsMode *AutoDetect | FingerprintsBitVectorString |
 | 
| 
 | 
   692     FingerprintsVectorString*
 | 
| 
 | 
   693         Format of fingerprint strings data in reference and database
 | 
| 
 | 
   694         fingerprints *SD, FP, or Text (CSV/TSV)* files: automatically detect
 | 
| 
 | 
   695         format of fingerprints string created by MayaChemTools fingerprints
 | 
| 
 | 
   696         generation scripts or explicitly specify its format. Possible
 | 
| 
 | 
   697         values: *AutoDetect | FingerprintsBitVectorString |
 | 
| 
 | 
   698         FingerprintsVectorString*. Default value: *AutoDetect*.
 | 
| 
 | 
   699 
 | 
| 
 | 
   700     -g, --GroupFusionRule *Max, Min, Mean, Median, Sum, Euclidean*
 | 
| 
 | 
   701         Specify what group fusion [ Ref 94-97, Ref 100, Ref 105 ] rule to
 | 
| 
 | 
   702         use for calculating similarity of a database molecule against a set
 | 
| 
 | 
   703         of reference molecules during *MultipleReferences* value of
 | 
| 
 | 
   704         similarity search -m, --mode. Possible values: *Max, Min, Mean,
 | 
| 
 | 
   705         Median, Sum, Euclidean*. Default value: *Max*. *Mean* value
 | 
| 
 | 
   706         corresponds to average or arithmetic mean. The group fusion rule is
 | 
| 
 | 
   707         also referred to as data fusion or consensus scoring in the
 | 
| 
 | 
   708         literature.
 | 
| 
 | 
   709 
 | 
| 
 | 
   710         For a reference molecules set and a database molecule, let:
 | 
| 
 | 
   711 
 | 
| 
 | 
   712             N = Number of reference molecules in a set
 | 
| 
 | 
   713 
 | 
| 
 | 
   714             i = ith reference molecule in a set
 | 
| 
 | 
   715             n = Nth reference molecule in a set
 | 
| 
 | 
   716 
 | 
| 
 | 
   717             d = dth database molecule
 | 
| 
 | 
   718 
 | 
| 
 | 
   719             Crd = Fingerprints comparison value between rth reference and dth database
 | 
| 
 | 
   720                   molecule - similarity/dissimilarity comparison using similarity or
 | 
| 
 | 
   721                   distance coefficient
 | 
| 
 | 
   722 
 | 
| 
 | 
   723         Then, various group fusion rules to calculate fused similarity
 | 
| 
 | 
   724         between a database molecule and reference molecules set are defined
 | 
| 
 | 
   725         as follows:
 | 
| 
 | 
   726 
 | 
| 
 | 
   727         Max: MAX ( C1d, C2d, ..., Cid, ..., Cnd )
 | 
| 
 | 
   728 
 | 
| 
 | 
   729         Min: MIN ( C1d, C2d, ..., Cid, ..., Cnd )
 | 
| 
 | 
   730 
 | 
| 
 | 
   731         Mean: SUM ( C1d, C2d, ..., Cid, ..., Cnd ) / N
 | 
| 
 | 
   732 
 | 
| 
 | 
   733         Median: MEDIAN ( C1d, C2d, ..., Cid, ..., Cnd )
 | 
| 
 | 
   734 
 | 
| 
 | 
   735         Sum: SUM ( C1d, C2d, ..., Cid, ..., Cnd )
 | 
| 
 | 
   736 
 | 
| 
 | 
   737         Euclidean: SQRT( SUM( C1d ** 2, C2d ** 2, ..., Cid ** 2, ..., Cnd
 | 
| 
 | 
   738         ** 2 ) )
 | 
| 
 | 
   739 
 | 
| 
 | 
   740         The fingerprints bit-vector or vector string of each reference
 | 
| 
 | 
   741         molecule in a set is compared with a database molecule using a
 | 
| 
 | 
   742         similarity or distance coefficient specified via -b,
 | 
| 
 | 
   743         --BitVectorComparisonMode or -v, --VectorComparisonMode. The
 | 
| 
 | 
   744         reference molecules whose comparison values with a database molecule
 | 
| 
 | 
   745         fall outside specified --SimilarityCutoff or --DistanceCutoff are
 | 
| 
 | 
   746         ignored during *Yes* value of --GroupFusionApplyCutoff. The
 | 
| 
 | 
   747         specified -g, --GroupFusionRule is applied to -k, --kNN reference
 | 
| 
 | 
   748         molecules to calculate final fused similarity value between a
 | 
| 
 | 
   749         database molecule and reference molecules set.
 | 
| 
 | 
   750 
 | 
| 
 | 
   751         During dissimilarity search or usage of distance comparison
 | 
| 
 | 
   752         coefficient in similarity search, the meaning of fingerprints
 | 
| 
 | 
   753         comparison value is automatically reversed as shown below:
 | 
| 
 | 
   754 
 | 
| 
 | 
   755             SearchMode     ComparisonCoefficient  ComparisonValues
 | 
| 
 | 
   756 
 | 
| 
 | 
   757             Similarity     SimilarityCoefficient  Higher value implies high similarity
 | 
| 
 | 
   758             Similarity     DistanceCoefficient    Lower value implies high similarity
 | 
| 
 | 
   759 
 | 
| 
 | 
   760             Dissimilarity  SimilarityCoefficient  Lower value implies high
 | 
| 
 | 
   761                                                   dissimilarity
 | 
| 
 | 
   762             Dissimilarity  DistanceCoefficient    Higher value implies high
 | 
| 
 | 
   763                                                   dissimilarity
 | 
| 
 | 
   764 
 | 
| 
 | 
   765         Consequently, *Max* implies highest and lowest comparison value for
 | 
| 
 | 
   766         usage of similarity and distance coefficient respectively during
 | 
| 
 | 
   767         similarity search. And it corresponds to lowest and highest
 | 
| 
 | 
   768         comparison value for usage of similarity and distance coefficient
 | 
| 
 | 
   769         respectively during dissimilarity search. During *Min* fusion rule,
 | 
| 
 | 
   770         the highest and lowest comparison values are appropriately reversed.
 | 
| 
 | 
   771 
 | 
| 
 | 
   772     --GroupFusionApplyCutoff *Yes | No*
 | 
| 
 | 
   773         Specify whether to apply --SimilarityCutoff or --DistanceCutoff
 | 
| 
 | 
   774         values during application of -g, --GroupFusionRule to reference
 | 
| 
 | 
   775         molecules set. Possible values: *Yes or No*. Default value: *Yes*.
 | 
| 
 | 
   776 
 | 
| 
 | 
   777         During *Yes* value of --GroupFusionApplyCutoff, the reference
 | 
| 
 | 
   778         molecules whose comparison values with a database molecule fall
 | 
| 
 | 
   779         outside specified --SimilarityCutoff or --DistanceCutoff are not
 | 
| 
 | 
   780         used to calculate final fused similarity value between a database
 | 
| 
 | 
   781         molecule and reference molecules set.
 | 
| 
 | 
   782 
 | 
| 
 | 
   783     -h, --help
 | 
| 
 | 
   784         Print this help message.
 | 
| 
 | 
   785 
 | 
| 
 | 
   786     --InDelim *comma | semicolon*
 | 
| 
 | 
   787         Input delimiter for reference and database fingerprints CSV
 | 
| 
 | 
   788         *TextFile(s)*. Possible values: *comma or semicolon*. Default value:
 | 
| 
 | 
   789         *comma*. For TSV files, this option is ignored and *tab* is used as
 | 
| 
 | 
   790         a delimiter.
 | 
| 
 | 
   791 
 | 
| 
 | 
   792     -k, --kNN *all | number*
 | 
| 
 | 
   793         Number of k-nearest neighbors (k-NN) reference molecules to use
 | 
| 
 | 
   794         during -g, --GroupFusionRule for calculating similarity of a
 | 
| 
 | 
   795         database molecule against a set of reference molecules. Possible
 | 
| 
 | 
   796         values: *all | positive integers*. Default: *all*.
 | 
| 
 | 
   797 
 | 
| 
 | 
   798         After ranking similarity values between a database molecule and
 | 
| 
 | 
   799         reference molecules during *MultipleReferences* value of similarity
 | 
| 
 | 
   800         search -m, --mode option, the top -k, --kNN reference molecules are
 | 
| 
 | 
   801         selected and used during -g, --GroupFusionRule.
 | 
| 
 | 
   802 
 | 
| 
 | 
   803         This option is -s, --SearchMode dependent: It corresponds to
 | 
| 
 | 
   804         dissimilar molecules during *DissimilaritySearch* value of -s,
 | 
| 
 | 
   805         --SearchMode option.
 | 
| 
 | 
   806 
 | 
| 
 | 
   807     -m, --mode *IndividualReference | MultipleReferences*
 | 
| 
 | 
   808         Specify how to treat reference molecules in
 | 
| 
 | 
   809         *ReferenceFingerprintsFile* during similarity search: Treat each
 | 
| 
 | 
   810         reference molecule individually during similarity search or perform
 | 
| 
 | 
   811         similarity search by treating multiple reference molecules as a set.
 | 
| 
 | 
   812         Possible values: *IndividualReference | MultipleReferences*. Default
 | 
| 
 | 
   813         value: *MultipleReferences*.
 | 
| 
 | 
   814 
 | 
| 
 | 
   815         During *IndividualReference* value of -m, --Mode for similarity
 | 
| 
 | 
   816         search, fingerprints bit-vector or vector string of each reference
 | 
| 
 | 
   817         molecule is compared with database molecules using specified
 | 
| 
 | 
   818         similarity or distance coefficients to identify most similar
 | 
| 
 | 
   819         molecules for each reference molecule. Based on value of
 | 
| 
 | 
   820         --SimilarCountMode, up to -n, --NumOfSimilarMolecules or -p,
 | 
| 
 | 
   821         --PercentSimilarMolecules at specified --SimilarityCutoff or
 | 
| 
 | 
   822         --DistanceCutoff are identified for each reference molecule.
 | 
| 
 | 
   823 
 | 
| 
 | 
   824         During *MultipleReferences* value of -m, --mode for similarity search,
 | 
| 
 | 
   825         all reference molecules are considered as a set and -g,
 | 
| 
 | 
   826         --GroupFusionRule is used to calculate similarity of a database
 | 
| 
 | 
   827         molecule against reference molecules set either using all reference
 | 
| 
 | 
   828         molecules or number of k-nearest neighbors (k-NN) to a database
 | 
| 
 | 
   829         molecule specified using -k, --kNN. The fingerprints bit-vector or
 | 
| 
 | 
   830         vector string of each reference molecule in a set is compared with a
 | 
| 
 | 
   831         database molecule using a similarity or distance coefficient
 | 
| 
 | 
   832         specified via -b, --BitVectorComparisonMode or -v,
 | 
| 
 | 
   833         --VectorComparisonMode. The reference molecules whose comparison
 | 
| 
 | 
   834         values with a database molecule fall outside specified
 | 
| 
 | 
   835         --SimilarityCutoff or --DistanceCutoff are ignored. The specified
 | 
| 
 | 
   836         -g, --GroupFusionRule is applied to rest of -k, --kNN reference
 | 
| 
 | 
   837         molecules to calculate final similarity value between a database
 | 
| 
 | 
   838         molecule and reference molecules set.
 | 
| 
 | 
   839 
 | 
| 
 | 
   840         The meaning of similarity and distance is automatically reversed
 | 
| 
 | 
   841         during *DissimilaritySearch* value of -s, --SearchMode along with
 | 
| 
 | 
   842         appropriate handling of --SimilarityCutoff or --DistanceCutoff
 | 
| 
 | 
   843         values.
 | 
| 
 | 
   844 
 | 
| 
 | 
   845     -n, --NumOfSimilarMolecules *number*
 | 
| 
 | 
   846         Maximum number of most similar database molecules to find for each
 | 
| 
 | 
   847         reference molecule or set of reference molecules based on
 | 
| 
 | 
   848         *IndividualReference* or *MultipleReferences* value of similarity
 | 
| 
 | 
   849         search -m, --mode option. Default: *10*. Valid values: positive
 | 
| 
 | 
   850         integers.
 | 
| 
 | 
   851 
 | 
| 
 | 
   852         This option is ignored during *PercentSimilar* value of
 | 
| 
 | 
   853         --SimilarCountMode option.
 | 
| 
 | 
   854 
 | 
| 
 | 
   855         This option is -s, --SearchMode dependent: It corresponds to
 | 
| 
 | 
   856         dissimilar molecules during *DissimilaritySearch* value of -s,
 | 
| 
 | 
   857         --SearchMode option.
 | 
| 
 | 
   858 
 | 
| 
 | 
   859     --OutDelim *comma | tab | semicolon*
 | 
| 
 | 
   860         Delimiter for output CSV/TSV text file. Possible values: *comma,
 | 
| 
 | 
   861         tab, or semicolon*. Default value: *comma*.
 | 
| 
 | 
   862 
 | 
| 
 | 
   863     --output *SD | text | both*
 | 
| 
 | 
   864         Type of output files to generate. Possible values: *SD, text, or
 | 
| 
 | 
   865         both*. Default value: *text*.
 | 
| 
 | 
   866 
 | 
| 
 | 
   867     -o, --overwrite
 | 
| 
 | 
   868         Overwrite existing files
 | 
| 
 | 
   869 
 | 
| 
 | 
   870     -p, --PercentSimilarMolecules *number*
 | 
| 
 | 
   871         Maximum percent of most similar database molecules to find for each
 | 
| 
 | 
   872         reference molecule or set of reference molecules based on
 | 
| 
 | 
   873         *IndividualReference* or *MultipleReferences* value of similarity
 | 
| 
 | 
   874         search -m, --mode option. Default: *1* percent of database
 | 
| 
 | 
   875         molecules. Valid values: non-zero values in between *0 to 100*.
 | 
| 
 | 
   876 
 | 
| 
 | 
   877         This option is ignored during *NumOfSimilar* value of
 | 
| 
 | 
   878         --SimilarCountMode option.
 | 
| 
 | 
   879 
 | 
| 
 | 
   880         During *PercentSimilar* value of --SimilarCountMode option, the
 | 
| 
 | 
   881         number of molecules in *DatabaseFingerprintsFile* is counted and
 | 
| 
 | 
   882         number of similar molecules corresponds to --PercentSimilarMolecules
 | 
| 
 | 
   883         of the total number of database molecules.
 | 
| 
 | 
   884 
 | 
| 
 | 
   885         This option is -s, --SearchMode dependent: It corresponds to
 | 
| 
 | 
   886         dissimilar molecules during *DissimilaritySearch* value of -s,
 | 
| 
 | 
   887         --SearchMode option.
 | 
| 
 | 
   888 
 | 
| 
 | 
   889     --precision *number*
 | 
| 
 | 
   890         Precision of calculated similarity values for comparison and
 | 
| 
 | 
   891         generating output files. Default: up to *2* decimal places. Valid
 | 
| 
 | 
   892         values: positive integers.
 | 
| 
 | 
   893 
 | 
| 
 | 
   894     -q, --quote *Yes | No*
 | 
| 
 | 
   895         Put quote around column values in output CSV/TSV text file. Possible
 | 
| 
 | 
   896         values: *Yes or No*. Default value: *Yes*.
 | 
| 
 | 
   897 
 | 
| 
 | 
   898     --ReferenceColMode *ColNum | ColLabel*
 | 
| 
 | 
   899         Specify how columns are identified in reference fingerprints
 | 
| 
 | 
   900         *TextFile*: using column number or column label. Possible values:
 | 
| 
 | 
   901         *ColNum or ColLabel*. Default value: *ColNum*.
 | 
| 
 | 
   902 
 | 
| 
 | 
   903     --ReferenceCompoundIDCol *col number | col name*
 | 
| 
 | 
   904         This value is --ReferenceColMode mode specific. It specifies column
 | 
| 
 | 
   905         to use for retrieving compound ID from reference fingerprints
 | 
| 
 | 
   906         *TextFile* during similarity and dissimilarity search for output SD
 | 
| 
 | 
   907         and CSV/TSV text files. Possible values: *col number or col label*.
 | 
| 
 | 
   908         Default value: *first column containing the word compoundID in its
 | 
| 
 | 
   909         column label or sequentially generated IDs*.
 | 
| 
 | 
   910 
 | 
| 
 | 
   911     --ReferenceCompoundIDPrefix *text*
 | 
| 
 | 
   912         Specify compound ID prefix to use during sequential generation of
 | 
| 
 | 
   913         compound IDs for reference fingerprints *SDFile* and *TextFile*.
 | 
| 
 | 
   914         Default value: *Cmpd*. The default value generates compound IDs
 | 
| 
 | 
   915         which look like Cmpd<Number>.
 | 
| 
 | 
   916 
 | 
| 
 | 
   917         For reference fingerprints *SDFile*, this value is only used during
 | 
| 
 | 
   918         *LabelPrefix | MolNameOrLabelPrefix* values of
 | 
| 
 | 
   919         --ReferenceCompoundIDMode option; otherwise, it's ignored.
 | 
| 
 | 
   920 
 | 
| 
 | 
   921         Examples for *LabelPrefix* or *MolNameOrLabelPrefix* value of
 | 
| 
 | 
   922         --ReferenceCompoundIDMode:
 | 
| 
 | 
   923 
 | 
| 
 | 
   924             Compound
 | 
| 
 | 
   925 
 | 
| 
 | 
   926         The value specified above generates compound IDs which correspond
 | 
| 
 | 
   927         to Compound<Number> instead of default value of Cmpd<Number>.
 | 
| 
 | 
   928 
 | 
| 
 | 
   929     --ReferenceCompoundIDField *DataFieldName*
 | 
| 
 | 
   930         Specify reference fingerprints *SDFile* datafield label for
 | 
| 
 | 
   931         generating compound IDs. This value is only used during *DataField*
 | 
| 
 | 
   932         value of --ReferenceCompoundIDMode option.
 | 
| 
 | 
   933 
 | 
| 
 | 
   934         Examples for *DataField* value of --ReferenceCompoundIDMode:
 | 
| 
 | 
   935 
 | 
| 
 | 
   936             MolID
 | 
| 
 | 
   937             ExtReg
 | 
| 
 | 
   938 
 | 
| 
 | 
   939     --ReferenceCompoundIDMode *DataField | MolName | LabelPrefix |
 | 
| 
 | 
   940     MolNameOrLabelPrefix*
 | 
| 
 | 
   941         Specify how to generate compound IDs from reference fingerprints
 | 
| 
 | 
   942         *SDFile* during similarity and dissimilarity search for output SD
 | 
| 
 | 
   943         and CSV/TSV text files: use a *SDFile* datafield value; use molname
 | 
| 
 | 
   944         line from *SDFile*; generate a sequential ID with specific prefix;
 | 
| 
 | 
   945         use combination of both MolName and LabelPrefix with usage of
 | 
| 
 | 
   946         LabelPrefix values for empty molname lines.
 | 
| 
 | 
   947 
 | 
| 
 | 
   948         Possible values: *DataField | MolName | LabelPrefix |
 | 
| 
 | 
   949         MolNameOrLabelPrefix*. Default: *LabelPrefix*.
 | 
| 
 | 
   950 
 | 
| 
 | 
   951         For *MolNameOrLabelPrefix* value of --ReferenceCompoundIDMode,
 | 
| 
 | 
   952         molname line in *SDFiles* takes precedence over sequential compound
 | 
| 
 | 
   953         IDs generated using *LabelPrefix* and only empty molname values are
 | 
| 
 | 
   954         replaced with sequential compound IDs.
 | 
| 
 | 
   955 
 | 
| 
 | 
   956     --ReferenceFingerprintsCol *col number | col name*
 | 
| 
 | 
   957         This value is --ReferenceColMode specific. It specifies fingerprints
 | 
| 
 | 
   958         column to use during similarity and dissimilarity search for
 | 
| 
 | 
   959         reference fingerprints *TextFile*. Possible values: *col number or
 | 
| 
 | 
   960         col label*. Default value: *first column containing the word
 | 
| 
 | 
   961         Fingerprints in its column label*.
 | 
| 
 | 
   962 
 | 
| 
 | 
   963     --ReferenceFingerprintsField *FieldLabel*
 | 
| 
 | 
   964         Fingerprints field label to use during similarity and dissimilarity
 | 
| 
 | 
   965         search for reference fingerprints *SDFile*. Default value: *first
 | 
| 
 | 
   966         data field label containing the word Fingerprints in its label*
 | 
| 
 | 
   967 
 | 
| 
 | 
   968     -r, --root *RootName*
 | 
| 
 | 
   969         New file name is generated using the root: <Root>.<Ext>. Default for
 | 
| 
 | 
   970         new file name: <ReferenceFileName>SimilaritySearching.<Ext>. The
 | 
| 
 | 
   971         output file type determines <Ext> value. The sdf, csv, and tsv <Ext>
 | 
| 
 | 
   972         values are used for SD, comma/semicolon, and tab delimited text
 | 
| 
 | 
   973         files respectively.
 | 
| 
 | 
   974 
 | 
| 
 | 
   975     -s, --SearchMode *SimilaritySearch | DissimilaritySearch*
 | 
| 
 | 
   976         Specify how to find molecules from database molecules for individual
 | 
| 
 | 
   977         reference molecules or set of reference molecules: Find similar
 | 
| 
 | 
   978         molecules or dissimilar molecules from database molecules. Possible
 | 
| 
 | 
   979         values: *SimilaritySearch | DissimilaritySearch*. Default value:
 | 
| 
 | 
   980         *SimilaritySearch*.
 | 
| 
 | 
   981 
 | 
| 
 | 
   982         During *DissimilaritySearch* value of -s, --SearchMode option, the
 | 
| 
 | 
   983         meaning of the following options is switched and they correspond to
 | 
| 
 | 
   984         dissimilar molecules instead of similar molecules:
 | 
| 
 | 
   985         --SimilarCountMode, -n, --NumOfSimilarMolecules,
 | 
| 
 | 
   986         --PercentSimilarMolecules, -k, --kNN.
 | 
| 
 | 
   987 
 | 
| 
 | 
   988     --SimilarCountMode *NumOfSimilar | PercentSimilar*
 | 
| 
 | 
   989         Specify method used to count similar molecules found from database
 | 
| 
 | 
   990         molecules for individual reference molecules or set of reference
 | 
| 
 | 
   991         molecules: Find number of similar molecules or percent of similar
 | 
| 
 | 
   992         molecules from database molecules. Possible values: *NumOfSimilar |
 | 
| 
 | 
   993         PercentSimilar*. Default value: *NumOfSimilar*.
 | 
| 
 | 
   994 
 | 
| 
 | 
   995         The values for number of similar molecules and percent similar
 | 
| 
 | 
   996         molecules are specified using options -n, --NumOfSimilarMolecules and
 | 
| 
 | 
   997         --PercentSimilarMolecules.
 | 
| 
 | 
   998 
 | 
| 
 | 
   999         This option is -s, --SearchMode dependent: It corresponds to
 | 
| 
 | 
  1000         dissimilar molecules during *DissimilaritySearch* value of -s,
 | 
| 
 | 
  1001         --SearchMode option.
 | 
| 
 | 
  1002 
 | 
| 
 | 
  1003     --SimilarityCutoff *number*
 | 
| 
 | 
  1004         Similarity cutoff value to use during comparison of similarity value
 | 
| 
 | 
  1005         between a pair of database and reference molecules calculated by
 | 
| 
 | 
  1006         similarity comparison methods for fingerprints bit-vector vector
 | 
| 
 | 
  1007         strings data values. Possible values: *Any valid number*. Default
 | 
| 
 | 
  1008         value: *0.75*.
 | 
| 
 | 
  1009 
 | 
| 
 | 
  1010         The comparison value between a pair of database and reference
 | 
| 
 | 
  1011         molecule must meet the cutoff criterion as shown below:
 | 
| 
 | 
  1012 
 | 
| 
 | 
  1013             SearchMode     CutoffCriterion  ComparisonValues
 | 
| 
 | 
  1014 
 | 
| 
 | 
  1015             Similarity     >=               Higher value implies high similarity
 | 
| 
 | 
  1016             Dissimilarity  <=               Lower value implies high dissimilarity
 | 
| 
 | 
  1017 
 | 
| 
 | 
  1018         This option is ignored during *No* value of --GroupFusionApplyCutoff
 | 
| 
 | 
  1019         for *MultipleReferences* -m, --mode.
 | 
| 
 | 
  1020 
 | 
| 
 | 
  1021         This option is -s, --SearchMode dependent: It corresponds to
 | 
| 
 | 
  1022         dissimilar molecules during *DissimilaritySearch* value of -s,
 | 
| 
 | 
  1023         --SearchMode option.
 | 
| 
 | 
  1024 
 | 
| 
 | 
  1025     -v, --VectorComparisonMode *SupportedSimilarityName |
 | 
| 
 | 
  1026     SupportedDistanceName*
 | 
| 
 | 
  1027         Specify what similarity or distance coefficient to use for
 | 
| 
 | 
  1028         calculating similarity between fingerprint vector strings data
 | 
| 
 | 
  1029         values in *ReferenceFingerprintsFile* and *DatabaseFingerprintsFile*
 | 
| 
 | 
  1030         during similarity search. Possible values: *TanimotoSimilarity | ...
 | 
| 
 | 
  1031         | ManhattanDistance | ...*. Default value: *TanimotoSimilarity*.
 | 
| 
 | 
  1032 
 | 
| 
 | 
  1033         The value of -v, --VectorComparisonMode, in conjunction with
 | 
| 
 | 
  1034         --VectorComparisonFormulism, decides which type of similarity and
 | 
| 
 | 
  1035         distance coefficient formulism gets used.
 | 
| 
 | 
  1036 
 | 
| 
 | 
  1037         The current release supports the following similarity and distance
 | 
| 
 | 
  1038         coefficients: *CosineSimilarity, CzekanowskiSimilarity,
 | 
| 
 | 
  1039         DiceSimilarity, OchiaiSimilarity, JaccardSimilarity,
 | 
| 
 | 
  1040         SorensonSimilarity, TanimotoSimilarity, CityBlockDistance,
 | 
| 
 | 
  1041         EuclideanDistance, HammingDistance, ManhattanDistance,
 | 
| 
 | 
  1042         SoergelDistance*. These similarity and distance coefficients are
 | 
| 
 | 
  1043         described below.
 | 
| 
 | 
  1044 
 | 
| 
 | 
  1045         FingerprintsVector.pm module, used to calculate similarity and
 | 
| 
 | 
  1046         distance coefficients, provides support to perform comparison
 | 
| 
 | 
  1047         between vectors containing three different types of values:
 | 
| 
 | 
  1048 
 | 
| 
 | 
  1049         Type I: OrderedNumericalValues
 | 
| 
 | 
  1050 
 | 
| 
 | 
  1051             . Size of two vectors are same
 | 
| 
 | 
  1052             . Vectors contain real values in a specific order. For example: MACCS keys
 | 
| 
 | 
  1053               count, Topological pharmacophore atom pairs and so on.
 | 
| 
 | 
  1054 
 | 
| 
 | 
  1055         Type II: UnorderedNumericalValues
 | 
| 
 | 
  1056 
 | 
| 
 | 
  1057             . Size of two vectors might not be same
 | 
| 
 | 
  1058             . Vectors contain unordered real value identified by value IDs. For example:
 | 
| 
 | 
  1059               Topological atom pairs, Topological atom torsions and so on
 | 
| 
 | 
  1060 
 | 
| 
 | 
  1061         Type III: AlphaNumericalValues
 | 
| 
 | 
  1062 
 | 
| 
 | 
  1063             . Size of two vectors might not be same
 | 
| 
 | 
  1064             . Vectors contain unordered alphanumerical values. For example: Extended
 | 
| 
 | 
  1065               connectivity fingerprints, atom neighborhood fingerprints.
 | 
| 
 | 
  1066 
 | 
| 
 | 
  1067         Before performing similarity or distance calculations between
 | 
| 
 | 
  1068         vectors containing UnorderedNumericalValues or AlphaNumericalValues,
 | 
| 
 | 
  1069         the vectors are transformed into vectors containing unique
 | 
| 
 | 
  1070         OrderedNumericalValues using value IDs for UnorderedNumericalValues
 | 
| 
 | 
  1071         and values itself for AlphaNumericalValues.
 | 
| 
 | 
  1072 
 | 
| 
 | 
  1073         Three forms of similarity and distance calculation between two
 | 
| 
 | 
  1074         vectors, specified using --VectorComparisonFormulism option, are
 | 
| 
 | 
  1075         supported: *AlgebraicForm, BinaryForm or SetTheoreticForm*.
 | 
| 
 | 
  1076 
 | 
| 
 | 
  1077         For *BinaryForm*, the ordered list of processed final vector values
 | 
| 
 | 
  1078         containing the value or count of each unique value type is simply
 | 
| 
 | 
  1079         converted into a binary vector containing 1s and 0s corresponding to
 | 
| 
 | 
  1080         presence or absence of values before calculating similarity or
 | 
| 
 | 
  1081         distance between two vectors.
 | 
| 
 | 
  1082 
 | 
| 
 | 
  1083         For two fingerprint vectors A and B of same size containing
 | 
| 
 | 
  1084         OrderedNumericalValues, let:
 | 
| 
 | 
  1085 
 | 
| 
 | 
  1086             N = Number of values in A or B
 | 
| 
 | 
  1087 
 | 
| 
 | 
  1088             Xa = Values of vector A
 | 
| 
 | 
  1089             Xb = Values of vector B
 | 
| 
 | 
  1090 
 | 
| 
 | 
  1091             Xai = Value of ith element in A
 | 
| 
 | 
  1092             Xbi = Value of ith element in B
 | 
| 
 | 
  1093 
 | 
| 
 | 
  1094            SUM = Sum of i over N values
 | 
| 
 | 
  1095 
 | 
| 
 | 
  1096         For SetTheoreticForm of calculation between two vectors, let:
 | 
| 
 | 
  1097 
 | 
| 
 | 
  1098             SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
 | 
| 
 | 
  1099             SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) )
 | 
| 
 | 
  1100 
 | 
| 
 | 
  1101         For BinaryForm of calculation between two vectors, let:
 | 
| 
 | 
  1102 
 | 
| 
 | 
  1103             Na = Number of bits set to "1" in A = SUM ( Xai )
 | 
| 
 | 
  1104             Nb = Number of bits set to "1" in B = SUM ( Xbi )
 | 
| 
 | 
  1105             Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
 | 
| 
 | 
  1106             Nd = Number of bits set to "0" in both A and B
 | 
| 
 | 
  1107                = SUM ( 1 - Xai - Xbi + Xai * Xbi)
 | 
| 
 | 
  1108 
 | 
| 
 | 
  1109             N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
 | 
| 
 | 
  1110 
 | 
| 
 | 
  1111         Additionally, for BinaryForm various values also correspond to:
 | 
| 
 | 
  1112 
 | 
| 
 | 
  1113             Na = | Xa |
 | 
| 
 | 
  1114             Nb = | Xb |
 | 
| 
 | 
  1115             Nc = | SetIntersectionXaXb |
 | 
| 
 | 
  1116             Nd = N - | SetDifferenceXaXb |
 | 
| 
 | 
  1117 
 | 
| 
 | 
  1118             | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc
 | 
| 
 | 
  1119                                   =  | Xa | + | Xb | - | SetIntersectionXaXb |
 | 
| 
 | 
  1120 
 | 
| 
 | 
  1121         Various similarity and distance coefficients [ Ref 40, Ref 62, Ref
 | 
| 
 | 
  1122         64 ] for a pair of vectors A and B in *AlgebraicForm, BinaryForm and
 | 
| 
 | 
  1123         SetTheoreticForm* are defined as follows:
 | 
| 
 | 
  1124 
 | 
| 
 | 
  1125         CityBlockDistance: ( same as HammingDistance and ManhattanDistance)
 | 
| 
 | 
  1126 
 | 
| 
 | 
  1127         *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
 | 
| 
 | 
  1128 
 | 
| 
 | 
  1129         *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
 | 
| 
 | 
  1130 
 | 
| 
 | 
  1131         *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
 | 
| 
 | 
  1132         = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
 | 
| 
 | 
  1133 
 | 
| 
 | 
  1134         CosineSimilarity: ( same as OchiaiSimilarity)
 | 
| 
 | 
  1135 
 | 
| 
 | 
  1136         *AlgebraicForm*: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM (
 | 
| 
 | 
  1137         Xbi ** 2) )
 | 
| 
 | 
  1138 
 | 
| 
 | 
  1139         *BinaryForm*: Nc / SQRT ( Na * Nb)
 | 
| 
 | 
  1140 
 | 
| 
 | 
  1141         *SetTheoreticForm*: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) =
 | 
| 
 | 
  1142         SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
 | 
| 
 | 
  1143 
 | 
| 
 | 
  1144         CzekanowskiSimilarity: ( same as DiceSimilarity and
 | 
| 
 | 
  1145         SorensonSimilarity)
 | 
| 
 | 
  1146 
 | 
| 
 | 
  1147         *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
 | 
| 
 | 
  1148         SUM ( Xbi **2 ) )
 | 
| 
 | 
  1149 
 | 
| 
 | 
  1150         *BinaryForm*: 2 * Nc / ( Na + Nb )
 | 
| 
 | 
  1151 
 | 
| 
 | 
  1152         *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
 | 
| 
 | 
  1153         2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
 | 
| 
 | 
  1154 
 | 
| 
 | 
  1155         DiceSimilarity: ( same as CzekanowskiSimilarity and
 | 
| 
 | 
  1156         SorensonSimilarity)
 | 
| 
 | 
  1157 
 | 
| 
 | 
  1158         *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
 | 
| 
 | 
  1159         SUM ( Xbi **2 ) )
 | 
| 
 | 
  1160 
 | 
| 
 | 
  1161         *BinaryForm*: 2 * Nc / ( Na + Nb )
 | 
| 
 | 
  1162 
 | 
| 
 | 
  1163         *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
 | 
| 
 | 
  1164         2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
 | 
| 
 | 
  1165 
 | 
| 
 | 
  1166         EuclideanDistance:
 | 
| 
 | 
  1167 
 | 
| 
 | 
  1168         *AlgebraicForm*: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) )
 | 
| 
 | 
  1169 
 | 
| 
 | 
  1170         *BinaryForm*: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb -
 | 
| 
 | 
  1171         2 * Nc )
 | 
| 
 | 
  1172 
 | 
| 
 | 
  1173         *SetTheoreticForm*: SQRT ( | SetDifferenceXaXb | - |
 | 
| 
 | 
  1174         SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * (
 | 
| 
 | 
  1175         SUM ( MIN ( Xai, Xbi ) ) ) )
 | 
| 
 | 
  1176 
 | 
| 
 | 
  1177         HammingDistance: ( same as CityBlockDistance and ManhattanDistance)
 | 
| 
 | 
  1178 
 | 
| 
 | 
  1179         *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
 | 
| 
 | 
  1180 
 | 
| 
 | 
  1181         *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
 | 
| 
 | 
  1182 
 | 
| 
 | 
  1183         *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
 | 
| 
 | 
  1184         = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
 | 
| 
 | 
  1185 
 | 
| 
 | 
  1186         JaccardSimilarity: ( same as TanimotoSimilarity)
 | 
| 
 | 
  1187 
 | 
| 
 | 
  1188         *AlgebraicForm*: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi
 | 
| 
 | 
  1189         ** 2 ) - SUM ( Xai * Xbi ) )
 | 
| 
 | 
  1190 
 | 
| 
 | 
  1191         *BinaryForm*: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na +
 | 
| 
 | 
  1192         Nb - Nc )
 | 
| 
 | 
  1193 
 | 
| 
 | 
  1194         *SetTheoreticForm*: | SetIntersectionXaXb | / | SetDifferenceXaXb |
 | 
| 
 | 
  1195         = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN
 | 
| 
 | 
  1196         ( Xai, Xbi ) ) )
 | 
| 
 | 
  1197 
 | 
| 
 | 
  1198         ManhattanDistance: ( same as CityBlockDistance and HammingDistance)
 | 
| 
 | 
  1199 
 | 
| 
 | 
  1200         *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
 | 
| 
 | 
  1201 
 | 
| 
 | 
  1202         *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
 | 
| 
 | 
  1203 
 | 
| 
 | 
  1204         *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
 | 
| 
 | 
  1205         = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
 | 
| 
 | 
  1206 
 | 
| 
 | 
  1207         OchiaiSimilarity: ( same as CosineSimilarity)
 | 
| 
 | 
  1208 
 | 
| 
 | 
  1209         *AlgebraicForm*: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM (
 | 
| 
 | 
  1210         Xbi ** 2) )
 | 
| 
 | 
  1211 
 | 
| 
 | 
  1212         *BinaryForm*: Nc / SQRT ( Na * Nb)
 | 
| 
 | 
  1213 
 | 
| 
 | 
  1214         *SetTheoreticForm*: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) =
 | 
| 
 | 
  1215         SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
 | 
| 
 | 
  1216 
 | 
| 
 | 
  1217         SorensonSimilarity: ( same as CzekanowskiSimilarity and
 | 
| 
 | 
  1218         DiceSimilarity)
 | 
| 
 | 
  1219 
 | 
| 
 | 
  1220         *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
 | 
| 
 | 
  1221         SUM ( Xbi **2 ) )
 | 
| 
 | 
  1222 
 | 
| 
 | 
  1223         *BinaryForm*: 2 * Nc / ( Na + Nb )
 | 
| 
 | 
  1224 
 | 
| 
 | 
  1225         *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
 | 
| 
 | 
  1226         2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
 | 
| 
 | 
  1227 
 | 
| 
 | 
  1228         SoergelDistance:
 | 
| 
 | 
  1229 
 | 
| 
 | 
  1230         *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi )
 | 
| 
 | 
  1231         )
 | 
| 
 | 
  1232 
 | 
| 
 | 
  1233         *BinaryForm*: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / (
 | 
| 
 | 
  1234         Na + Nb - Nc )
 | 
| 
 | 
  1235 
 | 
| 
 | 
  1236         *SetTheoreticForm*: ( | SetDifferenceXaXb | - | SetIntersectionXaXb
 | 
| 
 | 
  1237         | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * (
 | 
| 
 | 
  1238         SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM (
 | 
| 
 | 
  1239         MIN ( Xai, Xbi ) ) )
 | 
| 
 | 
  1240 
 | 
| 
 | 
  1241         TanimotoSimilarity: ( same as JaccardSimilarity)
 | 
| 
 | 
  1242 
 | 
| 
 | 
  1243         *AlgebraicForm*: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi
 | 
| 
 | 
  1244         ** 2 ) - SUM ( Xai * Xbi ) )
 | 
| 
 | 
  1245 
 | 
| 
 | 
  1246         *BinaryForm*: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na +
 | 
| 
 | 
  1247         Nb - Nc )
 | 
| 
 | 
  1248 
 | 
| 
 | 
  1249         *SetTheoreticForm*: | SetIntersectionXaXb | / | SetDifferenceXaXb |
 | 
| 
 | 
  1250         = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN
 | 
| 
 | 
  1251         ( Xai, Xbi ) ) )
 | 
| 
 | 
  1252 
 | 
| 
 | 
  1253     --VectorComparisonFormulism *AlgebraicForm | BinaryForm |
 | 
| 
 | 
  1254     SetTheoreticForm*
 | 
| 
 | 
  1255         Specify fingerprints vector comparison formulism to use for
 | 
| 
 | 
  1256         calculating similarity and distance coefficients during -v,
 | 
| 
 | 
  1257         --VectorComparisonMode. Possible values: *AlgebraicForm | BinaryForm
 | 
| 
 | 
  1258         | SetTheoreticForm*. Default value: *AlgebraicForm*.
 | 
| 
 | 
  1259 
 | 
| 
 | 
  1260         For fingerprint vector strings containing AlphaNumericalValues data
 | 
| 
 | 
  1261         values - ExtendedConnectivityFingerprints,
 | 
| 
 | 
  1262         AtomNeighborhoodsFingerprints and so on - all three formulisms result
 | 
| 
 | 
  1263         in same value during similarity and distance calculations.
 | 
| 
 | 
  1264 
 | 
| 
 | 
  1265     -w, --WorkingDir *DirName*
 | 
| 
 | 
  1266         Location of working directory. Default: current directory.
 | 
| 
 | 
  1267 
 | 
| 
 | 
  1268 EXAMPLES
 | 
| 
 | 
  1269     To perform similarity search using Tanimoto coefficient by treating all
 | 
| 
 | 
  1270     reference molecules as a set to find 10 most similar database molecules
 | 
| 
 | 
  1271     with application of Max group fusion rule and similarity cutoff to
 | 
| 
 | 
  1272     supported fingerprints strings data in SD fingerprints files present in
 | 
| 
 | 
  1273     a data fields with Fingerprint substring in their labels, and create a
 | 
| 
 | 
  1274     ReferenceFPHexSimilaritySearching.csv file containing sequentially
 | 
| 
 | 
  1275     generated database compound IDs with Cmpd prefix, type:
 | 
| 
 | 
  1276 
 | 
| 
 | 
  1277         % SimilaritySearchingFingerprints.pl -o ReferenceSampleFPHex.sdf
 | 
| 
 | 
  1278           DatabaseSampleFPHex.sdf
 | 
| 
 | 
  1279 
 | 
| 
 | 
  1280     To perform similarity search using Tanimoto coefficient by treating all
 | 
| 
 | 
  1281     reference molecules as a set to find 10 most similar database molecules
 | 
| 
 | 
  1282     with application of Max group fusion rule and similarity cutoff to
 | 
| 
 | 
  1283     supported fingerprints strings data in FP fingerprints files, and create
 | 
| 
 | 
  1284     a SimilaritySearchResults.csv file containing database compound IDs
 | 
| 
 | 
  1285     retrieved from FP file, type:
 | 
| 
 | 
  1286 
 | 
| 
 | 
  1287         % SimilaritySearchingFingerprints.pl -r SimilaritySearchResults -o
 | 
| 
 | 
  1288           ReferenceSampleFPBin.fpf DatabaseSampleFPBin.fpf
 | 
| 
 | 
  1289 
 | 
| 
 | 
  1290     To perform similarity search using Tanimoto coefficient by treating all
 | 
| 
 | 
  1291     reference molecules as a set to find 10 most similar database
 | 
| 
 | 
  1292     molecules with application of Max group fusion rule and similarity
 | 
| 
 | 
  1293     cutoff to supported fingerprints strings data in text fingerprints files
 | 
| 
 | 
  1294     present in a column names containing Fingerprint substring in their
 | 
| 
 | 
  1295     names, and create a ReferenceFPHexSimilaritySearching.csv file
 | 
| 
 | 
  1296     containing database compound IDs retrieved from column name containing
 | 
| 
 | 
  1297     CompoundID substring or sequentially generated compound IDs, type:
 | 
| 
 | 
  1298 
 | 
| 
 | 
  1299         % SimilaritySearchingFingerprints.pl -o ReferenceSampleFPCount.csv
 | 
| 
 | 
  1300           DatabaseSampleFPCount.csv
 | 
| 
 | 
  1301 
 | 
| 
 | 
  1302     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1303     reference molecules as individual molecules to find 10 most similar
 | 
| 
 | 
  1304     database molecules for each reference molecule with application of
 | 
| 
 | 
  1305     similarity cutoff to supported fingerprints strings data in SD
 | 
| 
 | 
  1306     fingerprints files present in a data fields with Fingerprint substring
 | 
| 
 | 
  1307     in their labels, and create a ReferenceFPHexSimilaritySearching.csv file
 | 
| 
 | 
  1308     containing sequentially generated reference and database compound IDs
 | 
| 
 | 
  1309     with Cmpd prefix, type:
 | 
| 
 | 
  1310 
 | 
| 
 | 
  1311         % SimilaritySearchingFingerprints.pl --mode IndividualReference -o
 | 
| 
 | 
  1312           ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
 | 
| 
 | 
  1313 
 | 
| 
 | 
  1314     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1315     reference molecules as individual molecules to find 10 most similar
 | 
| 
 | 
  1316     database molecules for each reference molecule with application of
 | 
| 
 | 
  1317     similarity cutoff to supported fingerprints strings data in FP
 | 
| 
 | 
  1318     fingerprints files, and create a ReferenceFPHexSimilaritySearching.csv
 | 
| 
 | 
  1319     file containing references and database compound IDs retrieved from FP
 | 
| 
 | 
  1320     file, type:
 | 
| 
 | 
  1321 
 | 
| 
 | 
  1322         % SimilaritySearchingFingerprints.pl --mode IndividualReference -o
 | 
| 
 | 
  1323           ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
 | 
| 
 | 
  1324 
 | 
| 
 | 
  1325     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1326     reference molecules as individual molecules to find 10 most similar
 | 
| 
 | 
  1327     database molecules for each reference molecule with application of
 | 
| 
 | 
  1328     similarity cutoff to supported fingerprints strings data in text
 | 
| 
 | 
  1329     fingerprints files present in a column names containing Fingerprint
 | 
| 
 | 
  1330     substring in their names, and create a
 | 
| 
 | 
  1331     ReferenceFPHexSimilaritySearching.csv file containing reference and
 | 
| 
 | 
  1332     database compound IDs retrieved from column name containing CompoundID
 | 
| 
 | 
  1333     substring or sequentially generated compound IDs, type:
 | 
| 
 | 
  1334 
 | 
| 
 | 
  1335         % SimilaritySearchingFingerprints.pl --mode IndividualReference -o
 | 
| 
 | 
  1336           ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
 | 
| 
 | 
  1337 
 | 
| 
 | 
  1338     To perform dissimilarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1339     all reference molecules as a set to find 10 most dissimilar database
 | 
| 
 | 
  1340     molecules with application of Max group fusion rule and similarity
 | 
| 
 | 
  1341     cutoff to supported fingerprints strings data in SD fingerprints files
 | 
| 
 | 
  1342     present in a data fields with Fingerprint substring in their labels, and
 | 
| 
 | 
  1343     create a ReferenceFPHexSimilaritySearching.csv file containing
 | 
| 
 | 
  1344     sequentially generated database compound IDs with Cmpd prefix, type:
 | 
| 
 | 
  1345 
 | 
| 
 | 
  1346         % SimilaritySearchingFingerprints.pl --mode MultipleReferences --SearchMode
 | 
| 
 | 
  1347           DissimilaritySearch -o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
 | 
| 
 | 
  1348 
 | 
| 
 | 
  1349     To perform similarity search using CityBlock distance by treating
 | 
| 
 | 
  1350     reference molecules as individual molecules to find 10 most similar
 | 
| 
 | 
  1351     database molecules for each reference molecule with application of
 | 
| 
 | 
  1352     distance cutoff to supported vector fingerprints strings data in SD
 | 
| 
 | 
  1353     fingerprints files present in a data fields with Fingerprint substring
 | 
| 
 | 
  1354     in their labels, and create a ReferenceFPHexSimilaritySearching.csv file
 | 
| 
 | 
  1355     containing sequentially generated reference and database compound IDs
 | 
| 
 | 
  1356     with Cmpd prefix, type:
 | 
| 
 | 
  1357 
 | 
| 
 | 
  1358         % SimilaritySearchingFingerprints.pl --mode IndividualReference
 | 
| 
 | 
  1359           --VectorComparisonMode CityBlockDistance --VectorComparisonFormulism
 | 
| 
 | 
  1360           AlgebraicForm --DistanceCutoff 10 -o
 | 
| 
 | 
  1361           ReferenceSampleFPCount.sdf DatabaseSampleFPCount.sdf
 | 
| 
 | 
  1362 
 | 
| 
 | 
  1363     To perform similarity search using Tanimoto coefficient by treating all
 | 
| 
 | 
  1364     reference molecules as a set to find 100 most similar database molecules
 | 
| 
 | 
  1365     with application of Mean group fusion rule to top 10 reference
 | 
| 
 | 
  1366     molecules within similarity cutoff of 0.75 to supported fingerprints
 | 
| 
 | 
  1367     strings data in FP fingerprints files, and create a
 | 
| 
 | 
  1368     ReferenceFPHexSimilaritySearching.csv file containing database compound
 | 
| 
 | 
  1369     IDs retrieved from FP file, type:
 | 
| 
 | 
  1370 
 | 
| 
 | 
  1371         % SimilaritySearchingFingerprints.pl --mode MultipleReferences --SearchMode
 | 
| 
 | 
  1372           SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
 | 
| 
 | 
  1373           --GroupFusionRule Mean --GroupFusionApplyCutoff Yes --kNN 10
 | 
| 
 | 
  1374           --SimilarityCutoff 0.75 --SimilarCountMode NumOfSimilar
 | 
| 
 | 
  1375           --NumOfSimilarMolecules 100 -o
 | 
| 
 | 
  1376           ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
 | 
| 
 | 
  1377 
 | 
| 
 | 
  1378     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1379     reference molecules as individual molecules to find 2 percent of most
 | 
| 
 | 
  1380     similar database molecules for each reference molecule with application
 | 
| 
 | 
  1381     of similarity cutoff of 0.85 to supported fingerprints strings data in
 | 
| 
 | 
  1382     text fingerprints files present in specific columns and create a
 | 
| 
 | 
  1383     ReferenceFPHexSimilaritySearching.csv file containing reference and
 | 
| 
 | 
  1384     database compoundIDs retrieved from specific columns, type:
 | 
| 
 | 
  1385 
 | 
| 
 | 
  1386         % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
 | 
| 
 | 
  1387           SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
 | 
| 
 | 
  1388           --ReferenceColMode ColLabel --ReferenceFingerprintsCol Fingerprints
 | 
| 
 | 
  1389           --ReferenceCompoundIDCol CompoundID --DatabaseColMode ColLabel
 | 
| 
 | 
  1390           --DatabaseCompoundIDCol CompoundID --DatabaseFingerprintsCol
 | 
| 
 | 
  1391           Fingerprints --SimilarityCutoff 0.85 --SimilarCountMode PercentSimilar
 | 
| 
 | 
  1392           --PercentSimilarMolecules 2 -o
 | 
| 
 | 
  1393           ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
 | 
| 
 | 
  1394 
 | 
| 
 | 
  1395     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1396     reference molecules as individual molecules to find top 50 most similar
 | 
| 
 | 
  1397     database molecules for each reference molecule with application of
 | 
| 
 | 
  1398     similarity cutoff of 0.85 to supported fingerprints strings data in SD
 | 
| 
 | 
  1399     fingerprints files present in specific data fields and create both
 | 
| 
 | 
  1400     ReferenceFPHexSimilaritySearching.csv and
 | 
| 
 | 
  1401     ReferenceFPHexSimilaritySearching.sdf files containing reference and
 | 
| 
 | 
  1402     database compoundIDs retrieved from specific data fields, type:
 | 
| 
 | 
  1403 
 | 
| 
 | 
  1404         % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
 | 
| 
 | 
  1405           SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
 | 
| 
 | 
  1406           --ReferenceFingerprintsField Fingerprints
 | 
| 
 | 
  1407           --DatabaseFingerprintsField Fingerprints
 | 
| 
 | 
  1408           --ReferenceCompoundIDMode DataField --ReferenceCompoundIDField CmpdID
 | 
| 
 | 
  1409           --DatabaseCompoundIDMode DataField --DatabaseCompoundIDField CmpdID
 | 
| 
 | 
  1410           --SimilarityCutoff 0.85 --SimilarCountMode NumOfSimilar
 | 
| 
 | 
  1411           --NumOfSimilarMolecules 50 --output both -o
 | 
| 
 | 
  1412           ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
 | 
| 
 | 
  1413 
 | 
| 
 | 
  1414     To perform similarity search using Tanimoto coefficient by treating
 | 
| 
 | 
  1415     reference molecules as individual molecules to find 1 percent of most
 | 
| 
 | 
  1416     similar database molecules for each reference molecule with application
 | 
| 
 | 
  1417     of similarity cutoff of 0.75 to supported fingerprints strings data in SD
 | 
| 
 | 
  1418     fingerprints files present in specific data field labels, and create
 | 
| 
 | 
  1419     both ReferenceFPHexSimilaritySearching.csv and
 | 
| 
 | 
  1420     ReferenceFPHexSimilaritySearching.sdf files containing reference and
 | 
| 
 | 
  1421     database compound IDs retrieved from specific data field labels along
 | 
| 
 | 
  1422     with other specific data for database molecules, type:
 | 
| 
 | 
  1423 
 | 
| 
 | 
  1424         % SimilaritySearchingFingerprints.pl --mode IndividualReference --SearchMode
 | 
| 
 | 
  1425           SimilaritySearch --BitVectorComparisonMode TanimotoSimilarity
 | 
| 
 | 
  1426           --ReferenceFingerprintsField Fingerprints
 | 
| 
 | 
  1427           --DatabaseFingerprintsField Fingerprints
 | 
| 
 | 
  1428           --ReferenceCompoundIDMode DataField --ReferenceCompoundIDField CmpdID
 | 
| 
 | 
  1429           --DatabaseCompoundIDMode DataField --DatabaseCompoundIDField CmpdID
 | 
| 
 | 
  1430           --DatabaseDataFieldsMode Specify --DatabaseDataFields "TPSA,SLogP"
 | 
| 
 | 
  1431           --SimilarityCutoff 0.75 --SimilarCountMode PercentSimilar
 | 
| 
 | 
  1432           --PercentSimilarMolecules 1 --output both --OutDelim comma --quote Yes
 | 
| 
 | 
  1433           --precision 3 -o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
 | 
| 
 | 
  1434 
 | 
| 
 | 
  1435 AUTHOR
 | 
| 
 | 
  1436     Manish Sud <msud@san.rr.com>
 | 
| 
 | 
  1437 
 | 
| 
 | 
  1438 SEE ALSO
 | 
| 
 | 
  1439     InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl,
 | 
| 
 | 
  1440     AtomNeighborhoodsFingerprints.pl, ExtendedConnectivityFingerprints.pl,
 | 
| 
 | 
  1441     MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
 | 
| 
 | 
  1442     TopologicalAtomPairsFingerprints.pl,
 | 
| 
 | 
  1443     TopologicalAtomTorsionsFingerprints.pl,
 | 
| 
 | 
  1444     TopologicalPharmacophoreAtomPairsFingerprints.pl,
 | 
| 
 | 
  1445     TopologicalPharmacophoreAtomTripletsFingerprints.pl
 | 
| 
 | 
  1446 
 | 
| 
 | 
  1447 COPYRIGHT
 | 
| 
 | 
  1448     Copyright (C) 2015 Manish Sud. All rights reserved.
 | 
| 
 | 
  1449 
 | 
| 
 | 
  1450     This file is part of MayaChemTools.
 | 
| 
 | 
  1451 
 | 
| 
 | 
  1452     MayaChemTools is free software; you can redistribute it and/or modify it
 | 
| 
 | 
  1453     under the terms of the GNU Lesser General Public License as published by
 | 
| 
 | 
  1454     the Free Software Foundation; either version 3 of the License, or (at
 | 
| 
 | 
  1455     your option) any later version.
 | 
| 
 | 
  1456 
 |