comparison docs/scripts/man1/SimilaritySearchingFingerprints.1 @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 .\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.22)
2 .\"
3 .\" Standard preamble:
4 .\" ========================================================================
5 .de Sp \" Vertical space (when we can't use .PP)
6 .if t .sp .5v
7 .if n .sp
8 ..
9 .de Vb \" Begin verbatim text
10 .ft CW
11 .nf
12 .ne \\$1
13 ..
14 .de Ve \" End verbatim text
15 .ft R
16 .fi
17 ..
18 .\" Set up some character translations and predefined strings. \*(-- will
19 .\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
20 .\" double quote, and \*(R" will give a right double quote. \*(C+ will
21 .\" give a nicer C++. Capital omega is used to do unbreakable dashes and
22 .\" therefore won't be available. \*(C` and \*(C' expand to `' in nroff,
23 .\" nothing in troff, for use with C<>.
24 .tr \(*W-
25 .ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
26 .ie n \{\
27 . ds -- \(*W-
28 . ds PI pi
29 . if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
30 . if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\" diablo 12 pitch
31 . ds L" ""
32 . ds R" ""
33 . ds C` ""
34 . ds C' ""
35 'br\}
36 .el\{\
37 . ds -- \|\(em\|
38 . ds PI \(*p
39 . ds L" ``
40 . ds R" ''
41 'br\}
42 .\"
43 .\" Escape single quotes in literal strings from groff's Unicode transform.
44 .ie \n(.g .ds Aq \(aq
45 .el .ds Aq '
46 .\"
47 .\" If the F register is turned on, we'll generate index entries on stderr for
48 .\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
49 .\" entries marked with X<> in POD. Of course, you'll have to process the
50 .\" output yourself in some meaningful fashion.
51 .ie \nF \{\
52 . de IX
53 . tm Index:\\$1\t\\n%\t"\\$2"
54 ..
55 . nr % 0
56 . rr F
57 .\}
58 .el \{\
59 . de IX
60 ..
61 .\}
62 .\"
63 .\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
64 .\" Fear. Run. Save yourself. No user-serviceable parts.
65 . \" fudge factors for nroff and troff
66 .if n \{\
67 . ds #H 0
68 . ds #V .8m
69 . ds #F .3m
70 . ds #[ \f1
71 . ds #] \fP
72 .\}
73 .if t \{\
74 . ds #H ((1u-(\\\\n(.fu%2u))*.13m)
75 . ds #V .6m
76 . ds #F 0
77 . ds #[ \&
78 . ds #] \&
79 .\}
80 . \" simple accents for nroff and troff
81 .if n \{\
82 . ds ' \&
83 . ds ` \&
84 . ds ^ \&
85 . ds , \&
86 . ds ~ ~
87 . ds /
88 .\}
89 .if t \{\
90 . ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
91 . ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
92 . ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
93 . ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
94 . ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
95 . ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
96 .\}
97 . \" troff and (daisy-wheel) nroff accents
98 .ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
99 .ds 8 \h'\*(#H'\(*b\h'-\*(#H'
100 .ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
101 .ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
102 .ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
103 .ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
104 .ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
105 .ds ae a\h'-(\w'a'u*4/10)'e
106 .ds Ae A\h'-(\w'A'u*4/10)'E
107 . \" corrections for vroff
108 .if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
109 .if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
110 . \" for low resolution devices (crt and lpr)
111 .if \n(.H>23 .if \n(.V>19 \
112 \{\
113 . ds : e
114 . ds 8 ss
115 . ds o a
116 . ds d- d\h'-1'\(ga
117 . ds D- D\h'-1'\(hy
118 . ds th \o'bp'
119 . ds Th \o'LP'
120 . ds ae ae
121 . ds Ae AE
122 .\}
123 .rm #[ #] #H #V #F C
124 .\" ========================================================================
125 .\"
126 .IX Title "SIMILARITYSEARCHINGFINGERPRINTS 1"
127 .TH SIMILARITYSEARCHINGFINGERPRINTS 1 "2015-03-29" "perl v5.14.2" "MayaChemTools"
128 .\" For nroff, turn off justification. Always turn off hyphenation; it makes
129 .\" way too many mistakes in technical documents.
130 .if n .ad l
131 .nh
132 .SH "NAME"
133 SimilaritySearchingFingerprints.pl \- Perform similarity search using fingerprints strings data in SD, FP and CSV/TSV text file(s)
134 .SH "SYNOPSIS"
135 .IX Header "SYNOPSIS"
136 SimilaritySearchingFingerprints.pl ReferenceFPFile DatabaseFPFile
137 .PP
138 SimilaritySearchingFingerprints.pl [\fB\-\-alpha\fR \fInumber\fR] [\fB\-\-beta\fR \fInumber\fR]
139 [\fB\-b, \-\-BitVectorComparisonMode\fR \fITanimotoSimilarity | TverskySimilarity | ...\fR]
140 [\fB\-\-DatabaseColMode\fR \fIColNum | ColLabel\fR] [\fB\-\-DatabaseCompoundIDCol\fR \fIcol number | col name\fR]
141 [\fB\-\-DatabaseCompoundIDPrefix\fR \fItext\fR] [\fB\-\-DatabaseCompoundIDField\fR \fIDataFieldName\fR]
142 [\fB\-\-DatabaseCompoundIDMode\fR \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR]
143 [\fB\-\-DatabaseDataCols\fR \fI\*(L"DataColNum1, DataColNum2,... \*(R" | \*(L"DataColLabel1, DataColLabel2,... \*(R"\fR]
144 [\fB\-\-DatabaseDataColsMode\fR \fIAll | Specify | CompoundID\fR] [\fB\-\-DatabaseDataFields\fR \fI\*(L"FieldLabel1, FieldLabel2,... \*(R"\fR]
145 [\fB\-\-DatabaseDataFieldsMode\fR \fIAll | Common | Specify | CompoundID\fR]
146 [\fB\-\-DatabaseFingerprintsCol\fR \fIcol number | col name\fR] [\fB\-\-DatabaseFingerprintsField\fR \fIFieldLabel\fR]
147 [\fB\-\-DistanceCutoff\fR \fInumber\fR] [\fB\-d, \-\-detail\fR \fIInfoLevel\fR] [\fB\-f, \-\-fast\fR]
148 [\fB\-\-FingerprintsMode\fR \fIAutoDetect | FingerprintsBitVectorString | FingerprintsVectorString\fR]
149 [\fB\-g, \-\-GroupFusionRule\fR \fIMax, Mean, Median, Min, Sum, Euclidean\fR] [\fB\-\-GroupFusionApplyCutoff\fR \fIYes | No\fR]
150 [\fB\-h, \-\-help\fR] [\fB\-\-InDelim\fR \fIcomma | semicolon\fR] [\fB\-k, \-\-KNN\fR \fIall | number\fR]
151 [\fB\-m, \-\-mode\fR \fIIndividualReference | MultipleReferences\fR]
152 [\fB\-n, \-\-NumOfSimilarMolecules\fR \fInumber\fR] [\fB\-\-OutDelim\fR \fIcomma | tab | semicolon\fR]
153 [\fB\-\-output\fR \fI\s-1SD\s0 | text | both\fR] [\fB\-o, \-\-overwrite\fR]
154 [\fB\-p, \-\-PercentSimilarMolecules\fR \fInumber\fR] [\fB\-\-precision\fR \fInumber\fR] [\fB\-q, \-\-quote\fR \fIYes | No\fR]
155 [\fB\-\-ReferenceColMode\fR \fIColNum | ColLabel\fR] [\fB\-\-ReferenceCompoundIDCol\fR \fIcol number | col name\fR]
156 [\fB\-\-ReferenceCompoundIDPrefix\fR \fItext\fR] [\fB\-\-ReferenceCompoundIDField\fR \fIDataFieldName\fR]
157 [\fB\-\-ReferenceCompoundIDMode\fR \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR]
158 [\fB\-\-ReferenceFingerprintsCol\fR \fIcol number | col name\fR] [\fB\-\-ReferenceFingerprintsField\fR \fIFieldLabel\fR]
159 [\fB\-r, \-\-root\fR \fIRootName\fR] [\fB\-s, \-\-SearchMode\fR \fISimilaritySearch | DissimilaritySearch\fR]
160 [\fB\-\-SimilarCountMode\fR \fINumOfSimilar | PercentSimilar\fR] [\fB\-\-SimilarityCutoff\fR \fInumber\fR]
161 [\fB\-v, \-\-VectorComparisonMode\fR \fITanimotoSimilarity | ... | ManhattanDistance | ...\fR]
162 [\fB\-\-VectorComparisonFormulism\fR \fIAlgebraicForm | BinaryForm | SetTheoreticForm\fR]
163 [\fB\-w, \-\-WorkingDir\fR dirname] ReferenceFingerprintsFile DatabaseFingerprintsFile
164 .SH "DESCRIPTION"
165 .IX Header "DESCRIPTION"
166 Perform molecular similarity search [ Ref 94\-113 ] using fingerprint bit-vector or vector strings
167 data in \fI\s-1SD\s0, \s-1FP\s0, or \s-1CSV/TSV\s0 text\fR files corresponding to \fIReferenceFingerprintsFile\fR and
168 \&\fIDatabaseFingerprintsFile\fR, and generate \s-1SD\s0 and \s-1CSV/TSV\s0 text file(s) containing database
169 molecules which are similar to reference molecule(s). The reference molecules are also referred
170 to as query or seed molecules and database molecules as target molecules in the literature.
171 .PP
172 The current release of MayaChemTools supports two types of similarity search modes:
173 \&\fIIndividualReference or MultipleReferences\fR. For default value of \fIMultipleReferences\fR for \fB\-m, \-\-mode\fR
174 option, reference molecules are considered as a set and \fB\-g, \-\-GroupFusionRule\fR is used to calculate
175 similarity of a database molecule against reference molecules set. The group fusion rule is also
176 referred to as data fusion or consensus scoring in the literature. However, for \fIIndividualReference\fR
177 value of \fB\-m, \-\-mode\fR option, reference molecules are treated as individual molecules and each reference
178 molecule is compared against a database molecule by itself to identify similar molecules.
179 .PP
180 The molecular dissimilarity search can also be performed using \fIDissimilaritySearch\fR value for
181 \&\fB\-s, \-\-SearchMode\fR option. During dissimilarity search or usage of distance comparison coefficient
182 in similarity search, the meaning of fingerprints comparison value is automatically reversed
183 as shown below:
184 .PP
185 .Vb 1
186 \& SearchMode ComparisonCoefficient ResultsSort ComparisonValues
187 \&
188 \& Similarity SimilarityCoefficient Descending Higher value implies
189 \& high similarity
190 \& Similarity DistanceCoefficient Ascending Lower value implies
191 \& high similarity
192 \&
193 \& Dissimilarity SimilarityCoefficient Ascending Lower value implies
194 \& high dissimilarity
195 \& Dissimilarity DistanceCoefficient Descending Higher value implies
196 \& high dissimilarity
197 .Ve
198 .PP
199 During \fIIndividualReference\fR value of \fB\-m, \-\-Mode\fR option for similarity search, fingerprints bit-vector
200 or vector string of each reference molecule is compared with database molecules using specified
201 similarity or distance coefficients to identify most similar molecules for each reference molecule.
202 Based on value of \fB\-\-SimilarCountMode\fR, up to \fB\-n, \-\-NumOfSimilarMolecules\fR or \fB\-p,
203 \&\-\-PercentSimilarMolecules\fR at specified \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR are
204 identified for each reference molecule.
205 .PP
206 During \fIMultipleReferences\fR value of \fB\-m, \-\-mode\fR option for similarity search, all reference molecules
207 are considered as a set and \fB\-g, \-\-GroupFusionRule\fR is used to calculate similarity of a database
208 molecule against reference molecules set either using all reference molecules or number of k\-nearest
209 neighbors (k\-NN) to a database molecule specified using \fB\-k, \-\-kNN\fR. The fingerprints bit-vector
210 or vector string of each reference molecule in a set is compared with a database molecule using
211 a similarity or distance coefficient specified via \fB\-b, \-\-BitVectorComparisonMode\fR or \fB\-v,
212 \&\-\-VectorComparisonMode\fR. The reference molecules whose comparison values with a database
213 molecule fall outside specified \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR are ignored during \fIYes\fR
214 value of \fB\-\-GroupFusionApplyCutoff\fR. The specified \fB\-g, \-\-GroupFusionRule\fR is applied to
215 \&\fB\-k, \-\-kNN\fR reference molecules to calculate final similarity value between a database molecule
216 and reference molecules set.
217 .PP
218 The input fingerprints \fI\s-1SD\s0, \s-1FP\s0, or Text (\s-1CSV/TSV\s0)\fR files for \fIReferenceFingerprintsFile\fR and
219 \&\fIDatabaseFingerprintsFile\fR must contain valid fingerprint bit-vector or vector strings data corresponding to
220 same type of fingerprints.
221 .PP
222 The valid fingerprints \fISDFile\fR extensions are \fI.sdf\fR and \fI.sd\fR. The valid fingerprints \fIFPFile\fR
223 extensions are \fI.fpf\fR and \fI.fp\fR. The valid fingerprints \fITextFile (\s-1CSV/TSV\s0)\fR extensions are
224 \&\fI.csv\fR and \fI.tsv\fR for comma/semicolon and tab delimited text files respectively. The \fB\-\-indelim\fR
225 option determines the format of \fITextFile\fR. Any file which doesn't correspond to the format indicated
226 by \fB\-\-indelim\fR option is ignored.
227 .PP
228 Example of \fI\s-1FP\s0\fR file containing fingerprints bit-vector string data:
229 .PP
230 .Vb 10
231 \& #
232 \& # Package = MayaChemTools 7.4
233 \& # ReleaseDate = Oct 21, 2010
234 \& #
235 \& # TimeStamp = Mon Mar 7 15:14:01 2011
236 \& #
237 \& # FingerprintsStringType = FingerprintsBitVector
238 \& #
239 \& # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
240 \& # Size = 1024
241 \& # BitStringFormat = HexadecimalString
242 \& # BitsOrder = Ascending
243 \& #
244 \& Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
245 \& Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
246 \& ... ...
247 \& ... ..
248 .Ve
249 .PP
250 Example of \fI\s-1FP\s0\fR file containing fingerprints vector string data:
251 .PP
252 .Vb 10
253 \& #
254 \& # Package = MayaChemTools 7.4
255 \& # ReleaseDate = Oct 21, 2010
256 \& #
257 \& # TimeStamp = Mon Mar 7 15:14:01 2011
258 \& #
259 \& # FingerprintsStringType = FingerprintsVector
260 \& #
261 \& # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
262 \& # VectorStringFormat = IDsAndValuesString
263 \& # VectorValuesType = NumericalValues
264 \& #
265 \& Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C:
266 \& N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...;
267 \& 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2
268 \& 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ...
269 \& Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C
270 \& O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...;
271 \& 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2
272 \& 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ...
273 \& ... ...
274 \& ... ...
275 .Ve
276 .PP
277 Example of \fI\s-1SD\s0\fR file containing fingerprints bit-vector string data:
278 .PP
279 .Vb 10
280 \& ... ...
281 \& ... ...
282 \& $$$$
283 \& ... ...
284 \& ... ...
285 \& ... ...
286 \& 41 44 0 0 0 0 0 0 0 0999 V2000
287 \& \-3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
288 \& ... ...
289 \& 2 3 1 0 0 0 0
290 \& ... ...
291 \& M END
292 \& > <CmpdID>
293 \& Cmpd1
294 \&
295 \& > <PathLengthFingerprints>
296 \& FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
297 \& h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
298 \& 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
299 \& 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
300 \& 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
301 \& aa0660a11014a011d46
302 \&
303 \& $$$$
304 \& ... ...
305 \& ... ...
306 .Ve
307 .PP
308 Example of \s-1CSV\s0 \fITextFile\fR containing fingerprints bit-vector string data:
309 .PP
310 .Vb 7
311 \& "CompoundID","PathLengthFingerprints"
312 \& "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
313 \& :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
314 \& 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
315 \& 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
316 \& ... ...
317 \& ... ...
318 .Ve
319 .PP
320 The current release of MayaChemTools supports the following types of fingerprint
321 bit-vector and vector strings:
322 .PP
323 .Vb 6
324 \& FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi
325 \& us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0\-C.X1.BO1.H3\-AT
326 \& C1:NR1\-C.X3.BO3.H1\-ATC1:NR2\-C.X1.BO1.H3\-ATC1:NR2\-C.X3.BO4\-ATC1 NR0\-C.X
327 \& 1.BO1.H3\-ATC1:NR1\-C.X3.BO3.H1\-ATC1:NR2\-C.X1.BO1.H3\-ATC1:NR2\-C.X3.BO4\-A
328 \& TC1 NR0\-C.X2.BO2.H2\-ATC1:NR1\-C.X2.BO2.H2\-ATC1:NR1\-C.X3.BO3.H1\-ATC1:NR2
329 \& \-C.X2.BO2.H2\-ATC1:NR2\-N.X3.BO3\-ATC1:NR2\-O.X1.BO1.H1\-ATC1 NR0\-C.X2.B...
330 \&
331 \& FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
332 \& ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
333 \& .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
334 \& O.X1.BO2;2 4 14 3 10 1 1 1 3 2
335 \&
336 \& FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
337 \& ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
338 \& N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
339 \&
340 \& FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
341 \& umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
342 \& 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
343 \& 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
344 \& O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
345 \& 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
346 \&
347 \& FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
348 \& AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
349 \& H SsssCH;24.778 4.387 1.993 25.023 \-1.435 3.975 14.006 29.759 \-0.073 3
350 \& .024 \-2.270
351 \&
352 \& FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
353 \& ValuesString;0 0 0 0 0 0 0 3.975 0 \-0.073 0 0 24.778 \-2.270 0 0 \-1.435
354 \& 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
355 \& 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
356 \& 0 0 0 0 0 0 0 0 0 0 0 0 0 0
357 \&
358 \& FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
359 \& us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
360 \& 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
361 \& 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
362 \& 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
363 \& 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
364 \&
365 \& FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
366 \& :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
367 \& 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
368 \& 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
369 \& 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
370 \& 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
371 \&
372 \& FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
373 \& es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
374 \& 0000000001010000000110000011000000000000100000000000000000000000100001
375 \& 1000000110000000000000000000000000010011000000000000000000000000010000
376 \& 0000000000000000000000000010000000000000000001000000000000000000000000
377 \& 0000000000010000100001000000000000101000000000000000100000000000000...
378 \&
379 \& FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
380 \& s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
381 \& 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
382 \& 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
383 \& 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
384 \& 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
385 \&
386 \& FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
387 \& haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
388 \& 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
389 \& 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
390 \& 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
391 \& 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
392 \&
393 \& FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
394 \& 0000000000000000000000000000000001001000010010000000010010000000011100
395 \& 0100101010111100011011000100110110000011011110100110111111111111011111
396 \& 11111111111110111000
397 \&
398 \& FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
399 \& 1110011111100101111111000111101100110000000000000011100010000000000000
400 \& 0000000000000000000000000000000000000000000000101000000000000000000000
401 \& 0000000000000000000000000000000000000000000000000000000000000000000000
402 \& 0000000000000000000000000000000000000011000000000000000000000000000000
403 \& 0000000000000000000000000000000000000000
404 \&
405 \& FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
406 \& ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
407 \& 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
408 \& 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
409 \& 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
410 \& 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
411 \&
412 \& FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
413 \& ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
414 \& 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
415 \& 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
416 \& 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
417 \& 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
418 \&
419 \& FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
420 \& th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
421 \& 0100010101011000101001011100110001000010001001101000001001001001001000
422 \& 0010110100000111001001000001001010100100100000000011000000101001011100
423 \& 0010000001000101010100000100111100110111011011011000000010110111001101
424 \& 0101100011000000010001000011000010100011101100001000001000100000000...
425 \&
426 \& FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
427 \& 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
428 \& C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
429 \& 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
430 \& 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
431 \& 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
432 \&
433 \& FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
434 \& h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
435 \& 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
436 \& 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
437 \& CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
438 \& OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
439 \&
440 \& FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
441 \& istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
442 \& .H3\-D1\-C.X3.BO3.H1 C.X2.BO2.H2\-D1\-C.X2.BO2.H2 C.X2.BO2.H2\-D1\-C.X3.BO3.
443 \& H1 C.X2.BO2.H2\-D1\-C.X3.BO4 C.X2.BO2.H2\-D1\-N.X3.BO3 C.X2.BO3.H1\-D1\-...;
444 \& 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
445 \& 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
446 \&
447 \& FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
448 \& stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar\-D1\-Ar
449 \& Ar\-D1\-Ar.HBA Ar\-D1\-HBD Ar\-D1\-Hal Ar\-D1\-None Ar.HBA\-D1\-None HBA\-D1\-NI H
450 \& BA\-D1\-None HBA.HBD\-D1\-NI HBA.HBD\-D1\-None HBD\-D1\-None NI\-D1\-None No...;
451 \& 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
452 \& 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
453 \&
454 \& FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3
455 \& 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3\-C.X3.BO3.H1\-C.X3.BO4\-
456 \& C.X3.BO4 C.X1.BO1.H3\-C.X3.BO3.H1\-C.X3.BO4\-N.X3.BO3 C.X2.BO2.H2\-C.X2.BO
457 \& 2.H2\-C.X3.BO3.H1\-C.X2.BO2.H2 C.X2.BO2.H2\-C.X2.BO2.H2\-C.X3.BO3.H1\-O...;
458 \& 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1
459 \&
460 \& FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica
461 \& lValues;IDsAndValuesString;aaCH\-aaCH\-aaCH\-aaCH aaCH\-aaCH\-aaCH\-aasC aaC
462 \& H\-aaCH\-aasC\-aaCH aaCH\-aaCH\-aasC\-aasC aaCH\-aaCH\-aasC\-sF aaCH\-aaCH\-aasC\-
463 \& ssNH aaCH\-aasC\-aasC\-aasC aaCH\-aasC\-aasC\-aasN aaCH\-aasC\-ssNH\-dssC a...;
464 \& 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
465 \&
466 \& FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
467 \& inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
468 \& .BO1.H3\-D1\-C.X1.BO1.H3\-D1\-C.X3.BO3.H1\-D2 C.X1.BO1.H3\-D1\-C.X2.BO2.H2\-D1
469 \& 0\-C.X3.BO4\-D9 C.X1.BO1.H3\-D1\-C.X2.BO2.H2\-D3\-N.X3.BO3\-D4 C.X1.BO1.H3\-D1
470 \& \-C.X2.BO2.H2\-D4\-C.X2.BO2.H2\-D5 C.X1.BO1.H3\-D1\-C.X2.BO2.H2\-D6\-C.X3....;
471 \& 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
472 \& 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
473 \&
474 \& FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
475 \& :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2\-D1\-C.2\-D9\-C
476 \& .3\-D10 C.2\-D1\-C.2\-D9\-C.ar\-D10 C.2\-D1\-C.3\-D1\-C.3\-D2 C.2\-D1\-C.3\-D10\-C.3\-
477 \& D9 C.2\-D1\-C.3\-D2\-C.3\-D3 C.2\-D1\-C.3\-D2\-C.ar\-D3 C.2\-D1\-C.3\-D3\-C.3\-D4 C.2
478 \& \-D1\-C.3\-D3\-N.ar\-D4 C.2\-D1\-C.3\-D3\-O.3\-D2 C.2\-D1\-C.3\-D4\-C.3\-D5 C.2\-D1\-C.
479 \& 3\-D5\-C.3\-D6 C.2\-D1\-C.3\-D5\-O.3\-D4 C.2\-D1\-C.3\-D6\-C.3\-D7 C.2\-D1\-C.3\-D7...
480 \&
481 \& FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
482 \& Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H\-D1\-H H
483 \& \-D1\-NI HBA\-D1\-NI HBD\-D1\-NI H\-D2\-H H\-D2\-HBA H\-D2\-HBD HBA\-D2\-HBA HBA\-D2\-
484 \& HBD H\-D3\-H H\-D3\-HBA H\-D3\-HBD H\-D3\-NI HBA\-D3\-NI HBD\-D3\-NI H\-D4\-H H\-D4\-H
485 \& BA H\-D4\-HBD HBA\-D4\-HBA HBA\-D4\-HBD HBD\-D4\-HBD H\-D5\-H H\-D5\-HBA H\-D5\-...;
486 \& 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
487 \& 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
488 \&
489 \& FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
490 \& ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
491 \& 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
492 \& 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
493 \& 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
494 \& 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
495 \&
496 \& FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize:
497 \& MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1\-
498 \& Ar1\-Ar1 Ar1\-Ar1\-H1 Ar1\-Ar1\-HBA1 Ar1\-Ar1\-HBD1 Ar1\-H1\-H1 Ar1\-H1\-HBA1 Ar1
499 \& \-H1\-HBD1 Ar1\-HBA1\-HBD1 H1\-H1\-H1 H1\-H1\-HBA1 H1\-H1\-HBD1 H1\-HBA1\-HBA1 H1\-
500 \& HBA1\-HBD1 H1\-HBA1\-NI1 H1\-HBD1\-NI1 HBA1\-HBA1\-NI1 HBA1\-HBD1\-NI1 Ar1\-...;
501 \& 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23
502 \& 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1
503 \& 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ...
504 \&
505 \& FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD
506 \& istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106
507 \& 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0
508 \& 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26
509 \& 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0
510 \& 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ...
511 .Ve
512 .SH "OPTIONS"
513 .IX Header "OPTIONS"
514 .IP "\fB\-\-alpha\fR \fInumber\fR" 4
515 .IX Item "--alpha number"
516 Value of alpha parameter for calculating \fITversky\fR similarity coefficient specified for
517 \&\fB\-b, \-\-BitVectorComparisonMode\fR option. It corresponds to weights assigned for bits set
518 to \*(L"1\*(R" in a pair of fingerprint bit-vectors during the calculation of similarity coefficient. Possible
519 values: \fI0 to 1\fR. Default value: <0.5>.
520 .IP "\fB\-\-beta\fR \fInumber\fR" 4
521 .IX Item "--beta number"
522 Value of beta parameter for calculating \fIWeightedTanimoto\fR and \fIWeightedTversky\fR
523 similarity coefficients specified for \fB\-b, \-\-BitVectorComparisonMode\fR option. It is used to
524 weight the contributions of bits set to \*(L"0\*(R" during the calculation of similarity coefficients. Possible
525 values: \fI0 to 1\fR. Default value of <1> makes \fIWeightedTanimoto\fR and \fIWeightedTversky\fR
526 equivalent to \fITanimoto\fR and \fITversky\fR.
527 .IP "\fB\-b, \-\-BitVectorComparisonMode\fR \fITanimotoSimilarity | TverskySimilarity | ...\fR" 4
528 .IX Item "-b, --BitVectorComparisonMode TanimotoSimilarity | TverskySimilarity | ..."
529 Specify what similarity coefficient to use for calculating similarity between fingerprints bit-vector
530 string data values in \fIReferenceFingerprintsFile\fR and \fIDatabaseFingerprintsFile\fR during similarity
531 search. Possible values: \fITanimotoSimilarity | TverskySimilarity | ...\fR. Default: \fITanimotoSimilarity\fR
532 .Sp
533 The current release supports the following similarity coefficients: \fIBaroniUrbaniSimilarity, BuserSimilarity,
534 CosineSimilarity, DiceSimilarity, DennisSimilarity, ForbesSimilarity, FossumSimilarity, HamannSimilarity, JacardSimilarity,
535 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity, McConnaugheySimilarity, OchiaiSimilarity,
536 PearsonSimilarity, RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity, SkoalSneath1Similarity,
537 SkoalSneath2Similarity, SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity, YuleSimilarity,
538 WeightedTanimotoSimilarity, WeightedTverskySimilarity\fR. These similarity coefficients are described below.
539 .Sp
540 For two fingerprint bit-vectors A and B of same size, let:
541 .Sp
542 .Vb 4
543 \& Na = Number of bits set to "1" in A
544 \& Nb = Number of bits set to "1" in B
545 \& Nc = Number of bits set to "1" in both A and B
546 \& Nd = Number of bits set to "0" in both A and B
547 \&
548 \& Nt = Number of bits set to "1" or "0" in A or B (Size of A or B)
549 \& Nt = Na + Nb \- Nc + Nd
550 \&
551 \& Na \- Nc = Number of bits set to "1" in A but not in B
552 \& Nb \- Nc = Number of bits set to "1" in B but not in A
553 .Ve
554 .Sp
555 Then, various similarity coefficients [ Ref. 40 \- 42 ] for a pair of bit-vectors A and B are
556 defined as follows:
557 .Sp
558 \&\fIBaroniUrbaniSimilarity\fR: ( \s-1SQRT\s0( Nc * Nd ) + Nc ) / ( \s-1SQRT\s0 ( Nc * Nd ) + Nc + ( Na \- Nc ) + ( Nb \- Nc ) ) ( same as Buser )
559 .Sp
560 \&\fIBuserSimilarity\fR: ( \s-1SQRT\s0 ( Nc * Nd ) + Nc ) / ( \s-1SQRT\s0 ( Nc * Nd ) + Nc + ( Na \- Nc ) + ( Nb \- Nc ) ) ( same as BaroniUrbani )
561 .Sp
562 \&\fICosineSimilarity\fR: Nc / \s-1SQRT\s0 ( Na * Nb ) (same as Ochiai)
563 .Sp
564 \&\fIDiceSimilarity\fR: (2 * Nc) / ( Na + Nb )
565 .Sp
566 \&\fIDennisSimilarity\fR: ( Nc * Nd \- ( ( Na \- Nc ) * ( Nb \- Nc ) ) ) / \s-1SQRT\s0 ( Nt * Na * Nb)
567 .Sp
568 \&\fIForbesSimilarity\fR: ( Nt * Nc ) / ( Na * Nb )
569 .Sp
570 \&\fIFossumSimilarity\fR: ( Nt * ( ( Nc \- 1/2 ) ** 2 ) ) / ( Na * Nb )
571 .Sp
572 \&\fIHamannSimilarity\fR: ( ( Nc + Nd ) \- ( Na \- Nc ) \- ( Nb \- Nc ) ) / Nt
573 .Sp
574 \&\fIJaccardSimilarity\fR: Nc / ( ( Na \- Nc) + ( Nb \- Nc ) + Nc ) = Nc / ( Na + Nb \- Nc ) (same as Tanimoto)
575 .Sp
576 \&\fIKulczynski1Similarity\fR: Nc / ( ( Na \- Nc ) + ( Nb \- Nc) ) = Nc / ( Na + Nb \- 2Nc )
577 .Sp
578 \&\fIKulczynski2Similarity\fR: ( ( Nc / 2 ) * ( 2 * Nc + ( Na \- Nc ) + ( Nb \- Nc) ) ) / ( ( Nc + ( Na \- Nc ) ) * ( Nc + ( Nb \- Nc ) ) ) = 0.5 * ( Nc / Na + Nc / Nb )
579 .Sp
580 \&\fIMatchingSimilarity\fR: ( Nc + Nd ) / Nt
581 .Sp
582 \&\fIMcConnaugheySimilarity\fR: ( Nc ** 2 \- ( Na \- Nc ) * ( Nb \- Nc) ) / ( Na * Nb )
583 .Sp
584 \&\fIOchiaiSimilarity\fR: Nc / \s-1SQRT\s0 ( Na * Nb ) (same as Cosine)
585 .Sp
586 \&\fIPearsonSimilarity\fR: ( ( Nc * Nd ) \- ( ( Na \- Nc ) * ( Nb \- Nc ) ) ) / \s-1SQRT\s0 ( Na * Nb * ( Na \- Nc + Nd ) * ( Nb \- Nc + Nd ) )
587 .Sp
588 \&\fIRogersTanimotoSimilarity\fR: ( Nc + Nd ) / ( ( Na \- Nc) + ( Nb \- Nc) + Nt) = ( Nc + Nd ) / ( Na + Nb \- 2Nc + Nt)
589 .Sp
590 \&\fIRussellRaoSimilarity\fR: Nc / Nt
591 .Sp
592 \&\fISimpsonSimilarity\fR: Nc / \s-1MIN\s0 ( Na, Nb)
593 .Sp
594 \&\fISkoalSneath1Similarity\fR: Nc / ( Nc + 2 * ( Na \- Nc) + 2 * ( Nb \- Nc) ) = Nc / ( 2 * Na + 2 * Nb \- 3 * Nc )
595 .Sp
596 \&\fISkoalSneath2Similarity\fR: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
597 .Sp
598 \&\fISkoalSneath3Similarity\fR: ( Nc + Nd ) / ( ( Na \- Nc ) + ( Nb \- Nc ) ) = ( Nc + Nd ) / ( Na + Nb \- 2 * Nc )
599 .Sp
600 \&\fITanimotoSimilarity\fR: Nc / ( ( Na \- Nc) + ( Nb \- Nc ) + Nc ) = Nc / ( Na + Nb \- Nc ) (same as Jaccard)
601 .Sp
602 \&\fITverskySimilarity\fR: Nc / ( alpha * ( Na \- Nc ) + ( 1 \- alpha) * ( Nb \- Nc) + Nc ) = Nc / ( alpha * ( Na \- Nb ) + Nb)
603 .Sp
604 \&\fIYuleSimilarity\fR: ( ( Nc * Nd ) \- ( ( Na \- Nc ) * ( Nb \- Nc ) ) ) / ( ( Nc * Nd ) + ( ( Na \- Nc ) * ( Nb \- Nc ) ) )
605 .Sp
606 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on only those bits which
607 are set to \*(L"1\*(R" in both A and B. In order to take into account all bit positions, modified versions
608 of Tanimoto [ Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
609 .Sp
610 Let:
611 .Sp
612 .Vb 3
613 \& Na\*(Aq = Number of bits set to "0" in A
614 \& Nb\*(Aq = Number of bits set to "0" in B
615 \& Nc\*(Aq = Number of bits set to "0" in both A and B
616 .Ve
617 .Sp
618 Tanimoto': Nc' / ( ( Na' \- Nc') + ( Nb' \- Nc' ) + Nc' ) = Nc' / ( Na' + Nb' \- Nc' )
619 .Sp
620 Tversky': Nc' / ( alpha * ( Na' \- Nc' ) + ( 1 \- alpha) * ( Nb' \- Nc' ) + Nc' ) = Nc' / ( alpha * ( Na' \- Nb' ) + Nb')
621 .Sp
622 Then:
623 .Sp
624 \&\fIWeightedTanimotoSimilarity\fR = beta * Tanimoto + (1 \- beta) * Tanimoto'
625 .Sp
626 \&\fIWeightedTverskySimilarity\fR = beta * Tversky + (1 \- beta) * Tversky'
627 .IP "\fB\-\-DatabaseColMode\fR \fIColNum | ColLabel\fR" 4
628 .IX Item "--DatabaseColMode ColNum | ColLabel"
629 Specify how columns are identified in database fingerprints \fITextFile\fR: using column
630 number or column label. Possible values: \fIColNum or ColLabel\fR. Default value: \fIColNum\fR.
631 .IP "\fB\-\-DatabaseCompoundIDCol\fR \fIcol number | col name\fR" 4
632 .IX Item "--DatabaseCompoundIDCol col number | col name"
633 This value is \fB\-\-DatabaseColMode\fR mode specific. It specifies column to use for retrieving compound
634 \&\s-1ID\s0 from database fingerprints \fITextFile\fR during similarity and dissimilarity search for output \s-1SD\s0 and
635 \&\s-1CSV/TSV\s0 text files. Possible values: \fIcol number or col label\fR. Default value: \fIfirst column containing
636 the word compoundID in its column label or sequentially generated IDs\fR.
637 .Sp
638 This is only used for \fICompoundID\fR value of \fB\-\-DatabaseDataColsMode\fR option.
639 .IP "\fB\-\-DatabaseCompoundIDPrefix\fR \fItext\fR" 4
640 .IX Item "--DatabaseCompoundIDPrefix text"
641 Specify compound \s-1ID\s0 prefix to use during sequential generation of compound IDs for database fingerprints
642 \&\fISDFile\fR and \fITextFile\fR. Default value: \fICmpd\fR. The default value generates compound IDs which look
643 like Cmpd<Number>.
644 .Sp
645 For database fingerprints \fISDFile\fR, this value is only used during \fILabelPrefix | MolNameOrLabelPrefix\fR
646 values of \fB\-\-DatabaseCompoundIDMode\fR option; otherwise, it's ignored.
647 .Sp
648 Examples for \fILabelPrefix\fR or \fIMolNameOrLabelPrefix\fR value of \fB\-\-DatabaseCompoundIDMode\fR:
649 .Sp
650 .Vb 1
651 \& Compound
652 .Ve
653 .Sp
654 The value specified above generates compound IDs which correspond to Compound<Number>
655 instead of default value of Cmpd<Number>.
656 .IP "\fB\-\-DatabaseCompoundIDField\fR \fIDataFieldName\fR" 4
657 .IX Item "--DatabaseCompoundIDField DataFieldName"
658 Specify database fingerprints \fISDFile\fR datafield label for generating compound IDs. This value is
659 only used during \fIDataField\fR value of \fB\-\-DatabaseCompoundIDMode\fR option.
660 .Sp
661 Examples for \fIDataField\fR value of \fB\-\-DatabaseCompoundIDMode\fR:
662 .Sp
663 .Vb 2
664 \& MolID
665 \& ExtReg
666 .Ve
667 .IP "\fB\-\-DatabaseCompoundIDMode\fR \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR" 4
668 .IX Item "--DatabaseCompoundIDMode DataField | MolName | LabelPrefix | MolNameOrLabelPrefix"
669 Specify how to generate compound IDs from database fingerprints \fISDFile\fR during similarity and
670 dissimilarity search for output \s-1SD\s0 and \s-1CSV/TSV\s0 text files: use a \fISDFile\fR datafield value; use
671 molname line from \fISDFile\fR; generate a sequential \s-1ID\s0 with specific prefix; use combination of both
672 MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
673 .Sp
674 Possible values: \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR.
675 Default: \fILabelPrefix\fR.
676 .Sp
677 For \fIMolNameOrLabelPrefix\fR value of \fB\-\-DatabaseCompoundIDMode\fR, molname line in \fISDFile\fR takes
678 precedence over sequential compound IDs generated using \fILabelPrefix\fR and only empty molname
679 values are replaced with sequential compound IDs.
680 .Sp
681 This is only used for \fICompoundID\fR value of \fB\-\-DatabaseDataFieldsMode\fR option.
682 .ie n .IP "\fB\-\-DatabaseDataCols\fR \fI""DataColNum1,DataColNum2,... "" | DataColLabel1,DataColLabel2,... ""\fR" 4
683 .el .IP "\fB\-\-DatabaseDataCols\fR \fI``DataColNum1,DataColNum2,... '' | DataColLabel1,DataColLabel2,... ""\fR" 4
684 .IX Item "--DatabaseDataCols DataColNum1,DataColNum2,... | DataColLabel1,DataColLabel2,... """
685 This value is \fB\-\-DatabaseColMode\fR mode specific. It is a comma delimited list of database fingerprints
686 \&\fITextFile\fR data column numbers or labels to extract and write to \s-1SD\s0 and \s-1CSV/TSV\s0 text files along with
687 other information for \fI\s-1SD\s0 | text | both\fR values of \fB\-\-output\fR option.
688 .Sp
689 This is only used for \fISpecify\fR value of \fB\-\-DatabaseDataColsMode\fR option.
690 .Sp
691 Examples:
692 .Sp
693 .Vb 2
694 \& 1,2,3
695 \& CompoundName,MolWt
696 .Ve
697 .IP "\fB\-\-DatabaseDataColsMode\fR \fIAll | Specify | CompoundID\fR" 4
698 .IX Item "--DatabaseDataColsMode All | Specify | CompoundID"
699 Specify how data columns from database fingerprints \fITextFile\fR are transferred to output \s-1SD\s0 and
700 \&\s-1CSV/TSV\s0 text files along with other information for \fI\s-1SD\s0 | text | both\fR values of \fB\-\-output\fR option:
701 transfer all data columns; extract specified data columns; generate a compound \s-1ID\s0 database compound
702 prefix. Possible values: \fIAll | Specify | CompoundID\fR. Default value: \fICompoundID\fR.
703 .ie n .IP "\fB\-\-DatabaseDataFields\fR \fI""FieldLabel1,FieldLabel2,... ""\fR" 4
704 .el .IP "\fB\-\-DatabaseDataFields\fR \fI``FieldLabel1,FieldLabel2,... ''\fR" 4
705 .IX Item "--DatabaseDataFields FieldLabel1,FieldLabel2,... "
706 Comma delimited list of database fingerprints \fISDFile\fR data fields to extract and write to \s-1SD\s0
707 and \s-1CSV/TSV\s0 text files along with other information for \fI\s-1SD\s0 | text | both\fR values of
708 \&\fB\-\-output\fR option.
709 .Sp
710 This is only used for \fISpecify\fR value of \fB\-\-DatabaseDataFieldsMode\fR option.
711 .Sp
712 Examples:
713 .Sp
714 .Vb 2
715 \& Extreg
716 \& MolID,CompoundName
717 .Ve
718 .IP "\fB\-\-DatabaseDataFieldsMode\fR \fIAll | Common | Specify | CompoundID\fR" 4
719 .IX Item "--DatabaseDataFieldsMode All | Common | Specify | CompoundID"
720 Specify how data fields from database fingerprints \fISDFile\fR are transferred to output \s-1SD\s0 and
721 \&\s-1CSV/TSV\s0 text files along with other information for \fI\s-1SD\s0 | text | both\fR values of \fB\-\-output\fR
722 option: transfer all \s-1SD\s0 data fields; transfer \s-1SD\s0 data fields common to all compounds; extract
723 specified data fields; generate a compound \s-1ID\s0 using molname line, a compound prefix, or a
724 combination of both. Possible values: \fIAll | Common | Specify | CompoundID\fR. Default value:
725 \&\fICompoundID\fR.
726 .IP "\fB\-\-DatabaseFingerprintsCol\fR \fIcol number | col name\fR" 4
727 .IX Item "--DatabaseFingerprintsCol col number | col name"
728 This value is \fB\-\-DatabaseColMode\fR specific. It specifies fingerprints column to use during similarity
729 and dissimilarity search for database fingerprints \fITextFile\fR. Possible values: \fIcol number or col label\fR.
730 Default value: \fIfirst column containing the word Fingerprints in its column label\fR.
731 .IP "\fB\-\-DatabaseFingerprintsField\fR \fIFieldLabel\fR" 4
732 .IX Item "--DatabaseFingerprintsField FieldLabel"
733 Fingerprints field label to use during similarity and dissimilarity search for database fingerprints \fISDFile\fR.
734 Default value: \fIfirst data field label containing the word Fingerprints in its label\fR
735 .IP "\fB\-\-DistanceCutoff\fR \fInumber\fR" 4
736 .IX Item "--DistanceCutoff number"
737 Distance cutoff value to use during comparison of distance value between a pair of database
738 and reference molecule calculated by distance comparison methods for fingerprints vector
739 string data values. Possible values: \fIAny valid number\fR. Default value: \fI10\fR.
740 .Sp
741 The comparison value between a pair of database and reference molecule must meet the cutoff
742 criterion as shown below:
743 .Sp
744 .Vb 1
745 \& SearchMode CutoffCriterion ComparisonValues
746 \&
747 \& Similarity <= Lower value implies high similarity
748 \& Dissimilarity >= Higher value implies high dissimilarity
749 .Ve
750 .Sp
751 This option is only used during distance coefficients values of \fB\-v, \-\-VectorComparisonMode\fR
752 option.
753 .Sp
754 This option is ignored during \fINo\fR value of \fB\-\-GroupFusionApplyCutoff\fR for \fIMultipleReferences\fR
755 \&\fB\-m, \-\-mode\fR.
756 .IP "\fB\-d, \-\-detail\fR \fIInfoLevel\fR" 4
757 .IX Item "-d, --detail InfoLevel"
758 Level of information to print about lines being ignored. Default: \fI1\fR. Possible values:
759 \&\fI1, 2 or 3\fR.
760 .IP "\fB\-f, \-\-fast\fR" 4
761 .IX Item "-f, --fast"
762 In this mode, fingerprints columns specified using \fB\-\-FingerprintsCol\fR for reference and database
763 fingerprints \fITextFile(s)\fR, and \fB\-\-FingerprintsField\fR for reference and database fingerprints \fISDFile(s)\fR
764 are assumed to contain valid fingerprints data and no checking is performed before performing similarity
765 and dissimilarity search. By default, fingerprints data is validated before computing pairwise similarity and
766 distance coefficients.
767 .IP "\fB\-\-FingerprintsMode\fR \fIAutoDetect | FingerprintsBitVectorString | FingerprintsVectorString\fR" 4
768 .IX Item "--FingerprintsMode AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString"
769 Format of fingerprint strings data in reference and database fingerprints \fI\s-1SD\s0, \s-1FP\s0, or Text (\s-1CSV/TSV\s0)\fR
770 files: automatically detect format of fingerprints string created by MayaChemTools fingerprints
771 generation scripts or explicitly specify its format. Possible values: \fIAutoDetect | FingerprintsBitVectorString |
772 FingerprintsVectorString\fR. Default value: \fIAutoDetect\fR.
773 .IP "\fB\-g, \-\-GroupFusionRule\fR \fIMax, Min, Mean, Median, Sum, Euclidean\fR" 4
774 .IX Item "-g, --GroupFusionRule Max, Min, Mean, Median, Sum, Euclidean"
775 Specify what group fusion [ Ref 94\-97, Ref 100, Ref 105 ] rule to use for calculating similarity of
776 a database molecule against a set of reference molecules during \fIMultipleReferences\fR value of
777 similarity search \fB\-m, \-\-mode\fR. Possible values: \fIMax, Min, Mean, Median, Sum, Euclidean\fR. Default
778 value: \fIMax\fR. \fIMean\fR value corresponds to average or arithmetic mean. The group fusion rule is
779 also referred to as data fusion or consensus scoring in the literature.
780 .Sp
781 For a reference molecules set and a database molecule, let:
782 .Sp
783 .Vb 1
784 \& N = Number of reference molecules in a set
785 \&
786 \& i = ith reference molecule in a set
787 \& n = Nth reference molecule in a set
788 \&
789 \& d = dth database molecule
790 \&
791 \& Crd = Fingerprints comparison value between rth reference and dth database
792 \& molecule \- similarity/dissimilarity comparison using similarity or
793 \& distance coefficient
794 .Ve
795 .Sp
796 Then, various group fusion rules to calculate fused similarity between a database molecule and
797 reference molecules set are defined as follows:
798 .Sp
799 \&\fBMax\fR: \s-1MAX\s0 ( C1d, C2d, ..., Cid, ..., Cnd )
800 .Sp
801 \&\fBMin\fR: \s-1MIN\s0 ( C1d, C2d, ..., Cid, ..., Cnd )
802 .Sp
803 \&\fBMean\fR: \s-1SUM\s0 ( C1d, C2d, ..., Cid, ..., Cnd ) / N
804 .Sp
805 \&\fBMedian\fR: \s-1MEDIAN\s0 ( C1d, C2d, ..., Cid, ..., Cnd )
806 .Sp
807 \&\fBSum\fR: \s-1SUM\s0 ( C1d, C2d, ..., Cid, ..., Cnd )
808 .Sp
809 \&\fBEuclidean\fR: \s-1SQRT\s0( \s-1SUM\s0( C1d ** 2, C2d ** 2, ..., Cid ** 2, ..., Cnd ** 2) )
810 .Sp
811 The fingerprints bit-vector or vector string of each reference molecule in a set is compared
812 with a database molecule using a similarity or distance coefficient specified via \fB\-b,
813 \&\-\-BitVectorComparisonMode\fR or \fB\-v, \-\-VectorComparisonMode\fR. The reference molecules
814 whose comparison values with a database molecule fall outside specified \fB\-\-SimilarityCutoff\fR
815 or \fB\-\-DistanceCutoff\fR are ignored during \fIYes\fR value of \fB\-\-GroupFusionApplyCutoff\fR. The
816 specified \fB\-g, \-\-GroupFusionRule\fR is applied to \fB\-k, \-\-kNN\fR reference molecules to calculate
817 final fused similarity value between a database molecule and reference molecules set.
818 .Sp
819 During dissimilarity search or usage of distance comparison coefficient in similarity search,
820 the meaning of fingerprints comparison value is automatically reversed as shown below:
821 .Sp
822 .Vb 1
823 \& SearchMode ComparisonCoefficient ComparisonValues
824 \&
825 \& Similarity SimilarityCoefficient Higher value implies high similarity
826 \& Similarity DistanceCoefficient Lower value implies high similarity
827 \&
828 \& Dissimilarity SimilarityCoefficient Lower value implies high
829 \& dissimilarity
830 \& Dissimilarity DistanceCoefficient Higher value implies high
831 \& dissimilarity
832 .Ve
833 .Sp
834 Consequently, \fIMax\fR implies highest and lowest comparison value for usage of similarity and
835 distance coefficient respectively during similarity search. And it corresponds to lowest and highest
836 comparison value for usage of similarity and distance coefficient respectively during dissimilarity
837 search. During \fIMin\fR fusion rule, the highest and lowest comparison values are appropriately
838 reversed.
839 .IP "\fB\-\-GroupFusionApplyCutoff\fR \fIYes | No\fR" 4
840 .IX Item "--GroupFusionApplyCutoff Yes | No"
841 Specify whether to apply \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR values during application
842 of \fB\-g, \-\-GroupFusionRule\fR to reference molecules set. Possible values: \fIYes or No\fR. Default
843 value: \fIYes\fR.
844 .Sp
845 During \fIYes\fR value of \fB\-\-GroupFusionApplyCutoff\fR, the reference molecules whose comparison
846 values with a database molecule fall outside specified \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR
847 are not used to calculate final fused similarity value between a database molecule and reference
848 molecules set.
849 .IP "\fB\-h, \-\-help\fR" 4
850 .IX Item "-h, --help"
851 Print this help message.
852 .IP "\fB\-\-InDelim\fR \fIcomma | semicolon\fR" 4
853 .IX Item "--InDelim comma | semicolon"
854 Input delimiter for reference and database fingerprints \s-1CSV\s0 \fITextFile(s)\fR. Possible values:
855 \&\fIcomma or semicolon\fR. Default value: \fIcomma\fR. For \s-1TSV\s0 files, this option is ignored
856 and \fItab\fR is used as a delimiter.
857 .IP "\fB\-k, \-\-kNN\fR \fIall | number\fR" 4
858 .IX Item "-k, --kNN all | number"
859 Number of k\-nearest neighbors (k\-NN) reference molecules to use during \fB\-g, \-\-GroupFusionRule\fR
860 for calculating similarity of a database molecule against a set of reference molecules. Possible values:
861 \&\fIall | positive integers\fR. Default: \fIall\fR.
862 .Sp
863 After ranking similarity values between a database molecule and reference molecules during
864 \&\fIMultipleReferences\fR value of similarity search \fB\-m, \-\-mode\fR option, the top \fB\-k, \-\-kNN\fR reference
865 molecules are selected and used during \fB\-g, \-\-GroupFusionRule\fR.
866 .Sp
867 This option is \fB\-s, \-\-SearchMode\fR dependent: It corresponds to dissimilar molecules during
868 \&\fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option.
869 .IP "\fB\-m, \-\-mode\fR \fIIndividualReference | MultipleReferences\fR" 4
870 .IX Item "-m, --mode IndividualReference | MultipleReferences"
871 Specify how to treat reference molecules in \fIReferenceFingerprintsFile\fR during similarity search:
872 Treat each reference molecule individually during similarity search or perform similarity
873 search by treating multiple reference molecules as a set. Possible values: \fIIndividualReference
874 | MultipleReferences\fR. Default value: \fIMultipleReferences\fR.
875 .Sp
876 During \fIIndividualReference\fR value of \fB\-m, \-\-Mode\fR for similarity search, fingerprints bit-vector
877 or vector string of each reference molecule is compared with database molecules using specified
878 similarity or distance coefficients to identify most similar molecules for each reference molecule.
879 Based on value of \fB\-\-SimilarCountMode\fR, up to \fB\-n, \-\-NumOfSimilarMolecules\fR or \fB\-p,
880 \&\-\-PercentSimilarMolecules\fR at specified \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR are
881 identified for each reference molecule.
882 .Sp
883 During \fIMultipleReferences\fR value of \fB\-m, \-\-mode\fR for similarity search, all reference molecules
884 are considered as a set and \fB\-g, \-\-GroupFusionRule\fR is used to calculate similarity of a database
885 molecule against reference molecules set either using all reference molecules or number of k\-nearest
886 neighbors (k\-NN) to a database molecule specified using \fB\-k, \-\-kNN\fR. The fingerprints bit-vector
887 or vector string of each reference molecule in a set is compared with a database molecule using
888 a similarity or distance coefficient specified via \fB\-b, \-\-BitVectorComparisonMode\fR or \fB\-v,
889 \&\-\-VectorComparisonMode\fR. The reference molecules whose comparison values with a database
890 molecule fall outside specified \fB\-\-SimilarityCutoff\fR or \fB\-\-DistanceCutoff\fR are ignored. The
891 specified \fB\-g, \-\-GroupFusionRule\fR is applied to rest of \fB\-k, \-\-kNN\fR reference molecules to calculate
892 final similarity value between a database molecule and reference molecules set.
893 .Sp
894 The meaning of similarity and distance is automatically reversed during \fIDissimilaritySearch\fR value
895 of \fB\-s, \-\-SearchMode\fR along with appropriate handling of \fB\-\-SimilarityCutoff\fR or
896 \&\fB\-\-DistanceCutoff\fR values.
897 .IP "\fB\-n, \-\-NumOfSimilarMolecules\fR \fInumber\fR" 4
898 .IX Item "-n, --NumOfSimilarMolecules number"
899 Maximum number of most similar database molecules to find for each reference molecule or set of
900 reference molecules based on \fIIndividualReference\fR or \fIMultipleReferences\fR value of similarity
901 search \fB\-m, \-\-mode\fR option. Default: \fI10\fR. Valid values: positive integers.
902 .Sp
903 This option is ignored during \fIPercentSimilar\fR value of \fB\-\-SimilarCountMode\fR option.
904 .Sp
905 This option is \fB\-s, \-\-SearchMode\fR dependent: It corresponds to dissimilar molecules during
906 \&\fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option.
907 .IP "\fB\-\-OutDelim\fR \fIcomma | tab | semicolon\fR" 4
908 .IX Item "--OutDelim comma | tab | semicolon"
909 Delimiter for output \s-1CSV/TSV\s0 text file. Possible values: \fIcomma, tab, or semicolon\fR.
910 Default value: \fIcomma\fR.
911 .IP "\fB\-\-output\fR \fI\s-1SD\s0 | text | both\fR" 4
912 .IX Item "--output SD | text | both"
913 Type of output files to generate. Possible values: \fI\s-1SD\s0, text, or both\fR. Default value: \fItext\fR.
914 .IP "\fB\-o, \-\-overwrite\fR" 4
915 .IX Item "-o, --overwrite"
916 Overwrite existing files
917 .IP "\fB\-p, \-\-PercentSimilarMolecules\fR \fInumber\fR" 4
918 .IX Item "-p, --PercentSimilarMolecules number"
919 Maximum percent of most similar database molecules to find for each reference molecule or set of
920 reference molecules based on \fIIndividualReference\fR or \fIMultipleReferences\fR value of similarity
921 search \fB\-m, \-\-mode\fR option. Default: \fI1\fR percent of database molecules. Valid values: non-zero values
922 in between \fI0 to 100\fR.
923 .Sp
924 This option is ignored during \fINumOfSimilar\fR value of \fB\-\-SimilarCountMode\fR option.
925 .Sp
926 During \fIPercentSimilar\fR value of \fB\-\-SimilarCountMode\fR option, the number of molecules
927 in \fIDatabaseFingerprintsFile\fR is counted and number of similar molecules correspond to
928 \&\fB\-\-PercentSimilarMolecules\fR of the total number of database molecules.
929 .Sp
930 This option is \fB\-s, \-\-SearchMode\fR dependent: It corresponds to dissimilar molecules during
931 \&\fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option.
932 .IP "\fB\-\-precision\fR \fInumber\fR" 4
933 .IX Item "--precision number"
934 Precision of calculated similarity values for comparison and generating output files. Default: up to \fI2\fR
935 decimal places. Valid values: positive integers.
936 .IP "\fB\-q, \-\-quote\fR \fIYes | No\fR" 4
937 .IX Item "-q, --quote Yes | No"
938 Put quote around column values in output \s-1CSV/TSV\s0 text file. Possible values:
939 \&\fIYes or No\fR. Default value: \fIYes\fR.
940 .IP "\fB\-\-ReferenceColMode\fR \fIColNum | ColLabel\fR" 4
941 .IX Item "--ReferenceColMode ColNum | ColLabel"
942 Specify how columns are identified in reference fingerprints \fITextFile\fR: using column
943 number or column label. Possible values: \fIColNum or ColLabel\fR. Default value: \fIColNum\fR.
944 .IP "\fB\-\-ReferenceCompoundIDCol\fR \fIcol number | col name\fR" 4
945 .IX Item "--ReferenceCompoundIDCol col number | col name"
946 This value is \fB\-\-ReferenceColMode\fR mode specific. It specifies column to use for retrieving compound
947 \&\s-1ID\s0 from reference fingerprints \fITextFile\fR during similarity and dissimilarity search for output \s-1SD\s0 and \s-1CSV/TSV\s0
948 text files. Possible values: \fIcol number or col label\fR. Default value: \fIfirst column containing the word compoundID
949 in its column label or sequentially generated IDs\fR.
950 .IP "\fB\-\-ReferenceCompoundIDPrefix\fR \fItext\fR" 4
951 .IX Item "--ReferenceCompoundIDPrefix text"
952 Specify compound \s-1ID\s0 prefix to use during sequential generation of compound IDs for reference fingerprints
953 \&\fISDFile\fR and \fITextFile\fR. Default value: \fICmpd\fR. The default value generates compound IDs which look
954 like Cmpd<Number>.
955 .Sp
956 For reference fingerprints \fISDFile\fR, this value is only used during \fILabelPrefix | MolNameOrLabelPrefix\fR
957 values of \fB\-\-ReferenceCompoundIDMode\fR option; otherwise, it's ignored.
958 .Sp
959 Examples for \fILabelPrefix\fR or \fIMolNameOrLabelPrefix\fR value of \fB\-\-ReferenceCompoundIDMode\fR:
960 .Sp
961 .Vb 1
962 \& Compound
963 .Ve
964 .Sp
965 The value specified above generates compound IDs which correspond to Compound<Number>
966 instead of default value of Cmpd<Number>.
967 .IP "\fB\-\-ReferenceCompoundIDField\fR \fIDataFieldName\fR" 4
968 .IX Item "--ReferenceCompoundIDField DataFieldName"
969 Specify reference fingerprints \fISDFile\fR datafield label for generating compound IDs.
970 This value is only used during \fIDataField\fR value of \fB\-\-ReferenceCompoundIDMode\fR option.
971 .Sp
972 Examples for \fIDataField\fR value of \fB\-\-ReferenceCompoundIDMode\fR:
973 .Sp
974 .Vb 2
975 \& MolID
976 \& ExtReg
977 .Ve
978 .IP "\fB\-\-ReferenceCompoundIDMode\fR \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR" 4
979 .IX Item "--ReferenceCompoundIDMode DataField | MolName | LabelPrefix | MolNameOrLabelPrefix"
980 Specify how to generate compound IDs from reference fingerprints \fISDFile\fR during similarity and
981 dissimilarity search for output \s-1SD\s0 and \s-1CSV/TSV\s0 text files: use a \fISDFile\fR datafield value; use
982 molname line from \fISDFile\fR; generate a sequential \s-1ID\s0 with specific prefix; use combination of both
983 MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
984 .Sp
985 Possible values: \fIDataField | MolName | LabelPrefix | MolNameOrLabelPrefix\fR.
986 Default: \fILabelPrefix\fR.
987 .Sp
988 For \fIMolNameOrLabelPrefix\fR value of \fB\-\-ReferenceCompoundIDMode\fR, molname line in \fISDFiles\fR
989 takes precedence over sequential compound IDs generated using \fILabelPrefix\fR and only empty molname
990 values are replaced with sequential compound IDs.
991 .IP "\fB\-\-ReferenceFingerprintsCol\fR \fIcol number | col name\fR" 4
992 .IX Item "--ReferenceFingerprintsCol col number | col name"
993 This value is \fB\-\-ReferenceColMode\fR specific. It specifies fingerprints column to use during similarity
994 and dissimilarity search for reference fingerprints \fITextFile\fR. Possible values: \fIcol number or col label\fR.
995 Default value: \fIfirst column containing the word Fingerprints in its column label\fR.
996 .IP "\fB\-\-ReferenceFingerprintsField\fR \fIFieldLabel\fR" 4
997 .IX Item "--ReferenceFingerprintsField FieldLabel"
998 Fingerprints field label to use during similarity and dissimilarity search for reference fingerprints \fISDFile\fR.
999 Default value: \fIfirst data field label containing the word Fingerprints in its label\fR
1000 .IP "\fB\-r, \-\-root\fR \fIRootName\fR" 4
1001 .IX Item "-r, --root RootName"
1002 New file name is generated using the root: <Root>.<Ext>. Default for new file name:
1003 <ReferenceFileName>SimilaritySearching.<Ext>. The output file type determines <Ext>
1004 value. The sdf, csv, and tsv <Ext> values are used for \s-1SD\s0, comma/semicolon, and tab delimited
1005 text files respectively.
1006 .IP "\fB\-s, \-\-SearchMode\fR \fISimilaritySearch | DissimilaritySearch\fR" 4
1007 .IX Item "-s, --SearchMode SimilaritySearch | DissimilaritySearch"
1008 Specify how to find molecules from database molecules for individual reference molecules or
1009 set of reference molecules: Find similar molecules or dissimilar molecules from database molecules.
1010 Possible values: \fISimilaritySearch | DissimilaritySearch\fR. Default value: \fISimilaritySearch\fR.
1011 .Sp
1012 During \fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option, the meaning of the following
1013 options is switched and they correspond to dissimilar molecules instead of similar molecules:
1014 \&\fB\-\-SimilarCountMode\fR, \fB\-n, \-\-NumOfSimilarMolecules\fR, \fB\-\-PercentSimilarMolecules\fR,
1015 \&\fB\-k, \-\-kNN\fR.
1016 .IP "\fB\-\-SimilarCountMode\fR \fINumOfSimilar | PercentSimilar\fR" 4
1017 .IX Item "--SimilarCountMode NumOfSimilar | PercentSimilar"
1018 Specify method used to count similar molecules found from database molecules for individual
1019 reference molecules or set of reference molecules: Find number of similar molecules or percent
1020 of similar molecules from database molecules. Possible values: \fINumOfSimilar | PercentSimilar\fR.
1021 Default value: \fINumOfSimilar\fR.
1022 .Sp
1023 The values for number of similar molecules and percent similar molecules are specified
1024 using options \fB\-n, \-\-NumOfSimilarMolecules\fR and \fB\-p, \-\-PercentSimilarMolecules\fR.
1025 .Sp
1026 This option is \fB\-s, \-\-SearchMode\fR dependent: It corresponds to dissimilar molecules during
1027 \&\fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option.
1028 .IP "\fB\-\-SimilarityCutoff\fR \fInumber\fR" 4
1029 .IX Item "--SimilarityCutoff number"
1030 Similarity cutoff value to use during comparison of similarity value between a pair of database
1031 and reference molecules calculated by similarity comparison methods for fingerprints bit-vector
1032 or vector strings data values. Possible values: \fIAny valid number\fR. Default value: \fI0.75\fR.
1033 .Sp
1034 The comparison value between a pair of database and reference molecule must meet the cutoff
1035 criterion as shown below:
1036 .Sp
1037 .Vb 1
1038 \& SearchMode CutoffCriterion ComparisonValues
1039 \&
1040 \& Similarity >= Higher value implies high similarity
1041 \& Dissimilarity <= Lower value implies high dissimilarity
1042 .Ve
1043 .Sp
1044 This option is ignored during \fINo\fR value of \fB\-\-GroupFusionApplyCutoff\fR for \fIMultipleReferences\fR
1045 \&\fB\-m, \-\-mode\fR.
1046 .Sp
1047 This option is \fB\-s, \-\-SearchMode\fR dependent: It corresponds to dissimilar molecules during
1048 \&\fIDissimilaritySearch\fR value of \fB\-s, \-\-SearchMode\fR option.
1049 .IP "\fB\-v, \-\-VectorComparisonMode\fR \fISupportedSimilarityName | SupportedDistanceName\fR" 4
1050 .IX Item "-v, --VectorComparisonMode SupportedSimilarityName | SupportedDistanceName"
1051 Specify what similarity or distance coefficient to use for calculating similarity between fingerprint
1052 vector strings data values in \fIReferenceFingerprintsFile\fR and \fIDatabaseFingerprintsFile\fR during
1053 similarity search. Possible values: \fITanimotoSimilarity | ... | ManhattanDistance | ...\fR. Default
1054 value: \fITanimotoSimilarity\fR.
1055 .Sp
1056 The value of \fB\-v, \-\-VectorComparisonMode\fR, in conjunction with \fB\-\-VectorComparisonFormulism\fR,
1057 decides which type of similarity and distance coefficient formulism gets used.
1058 .Sp
1059 The current release supports the following similarity and distance coefficients: \fICosineSimilarity,
1060 CzekanowskiSimilarity, DiceSimilarity, OchiaiSimilarity, JaccardSimilarity, SorensonSimilarity, TanimotoSimilarity,
1061 CityBlockDistance, EuclideanDistance, HammingDistance, ManhattanDistance, SoergelDistance\fR. These
1062 similarity and distance coefficients are described below.
1063 .Sp
1064 \&\fBFingerprintsVector.pm\fR module, used to calculate similarity and distance coefficients,
1065 provides support to perform comparison between vectors containing three different types of
1066 values:
1067 .Sp
1068 Type I: OrderedNumericalValues
1069 .Sp
1070 .Vb 3
1071 \& . Size of two vectors are same
1072 \& . Vectors contain real values in a specific order. For example: MACCS keys
1073 \& count, Topological pharmacophore atom pairs and so on.
1074 .Ve
1075 .Sp
1076 Type \s-1II:\s0 UnorderedNumericalValues
1077 .Sp
1078 .Vb 3
1079 \& . Size of two vectors might not be same
1080 \& . Vectors contain unordered real value identified by value IDs. For example:
1081 \& Topological atom pairs, Topological atom torsions and so on
1082 .Ve
1083 .Sp
1084 Type \s-1III:\s0 AlphaNumericalValues
1085 .Sp
1086 .Vb 3
1087 \& . Size of two vectors might not be same
1088 \& . Vectors contain unordered alphanumerical values. For example: Extended
1089 \& connectivity fingerprints, atom neighborhood fingerprints.
1090 .Ve
1091 .Sp
1092 Before performing similarity or distance calculations between vectors containing UnorderedNumericalValues
1093 or AlphaNumericalValues, the vectors are transformed into vectors containing unique OrderedNumericalValues
1094 using value IDs for UnorderedNumericalValues and the values themselves for AlphaNumericalValues.
1095 .Sp
1096 Three forms of similarity and distance calculation between two vectors, specified using \fB\-\-VectorComparisonFormulism\fR
1097 option, are supported: \fIAlgebraicForm, BinaryForm or SetTheoreticForm\fR.
1098 .Sp
1099 For \fIBinaryForm\fR, the ordered list of processed final vector values containing the value or
1100 count of each unique value type is simply converted into a binary vector containing 1s and 0s
1101 corresponding to presence or absence of values before calculating similarity or distance between
1102 two vectors.
1103 .Sp
1104 For two fingerprint vectors A and B of same size containing OrderedNumericalValues, let:
1105 .Sp
1106 .Vb 1
1107 \& N = Number of values in A or B
1108 \&
1109 \& Xa = Values of vector A
1110 \& Xb = Values of vector B
1111 \&
1112 \& Xai = Value of ith element in A
1113 \& Xbi = Value of ith element in B
1114 \&
1115 \& SUM = Sum of i over N values
1116 .Ve
1117 .Sp
1118 For SetTheoreticForm of calculation between two vectors, let:
1119 .Sp
1120 .Vb 2
1121 \& SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
1122 \& SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) \- SUM ( MIN ( Xai, Xbi ) )
1123 .Ve
1124 .Sp
1125 For BinaryForm of calculation between two vectors, let:
1126 .Sp
1127 .Vb 5
1128 \& Na = Number of bits set to "1" in A = SUM ( Xai )
1129 \& Nb = Number of bits set to "1" in B = SUM ( Xbi )
1130 \& Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
1131 \& Nd = Number of bits set to "0" in both A and B
1132 \& = SUM ( 1 \- Xai \- Xbi + Xai * Xbi)
1133 \&
1134 \& N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb \- Nc + Nd
1135 .Ve
1136 .Sp
1137 Additionally, for BinaryForm various values also correspond to:
1138 .Sp
1139 .Vb 4
1140 \& Na = | Xa |
1141 \& Nb = | Xb |
1142 \& Nc = | SetIntersectionXaXb |
1143 \& Nd = N \- | SetDifferenceXaXb |
1144 \&
1145 \& | SetDifferenceXaXb | = N \- Nd = Na + Nb \- Nc + Nd \- Nd = Na + Nb \- Nc
1146 \& = | Xa | + | Xb | \- | SetIntersectionXaXb |
1147 .Ve
1148 .Sp
1149 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref 64 ] for a pair of vectors A and B
1150 in \fIAlgebraicForm, BinaryForm and SetTheoreticForm\fR are defined as follows:
1151 .Sp
1152 \&\fBCityBlockDistance\fR: ( same as HammingDistance and ManhattanDistance)
1153 .Sp
1154 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( \s-1ABS\s0 ( Xai \- Xbi ) )
1155 .Sp
1156 \&\fIBinaryForm\fR: ( Na \- Nc ) + ( Nb \- Nc ) = Na + Nb \- 2 * Nc
1157 .Sp
1158 \&\fISetTheoreticForm\fR: | SetDifferenceXaXb | \- | SetIntersectionXaXb | = \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1159 .Sp
1160 \&\fBCosineSimilarity\fR: ( same as OchiaiSimilarity)
1161 .Sp
1162 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( Xai * Xbi ) / \s-1SQRT\s0 ( \s-1SUM\s0 ( Xai ** 2) * \s-1SUM\s0 ( Xbi ** 2) )
1163 .Sp
1164 \&\fIBinaryForm\fR: Nc / \s-1SQRT\s0 ( Na * Nb)
1165 .Sp
1166 \&\fISetTheoreticForm\fR: | SetIntersectionXaXb | / \s-1SQRT\s0 ( |Xa| * |Xb| ) = \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) / \s-1SQRT\s0 ( \s-1SUM\s0 ( Xai ) * \s-1SUM\s0 ( Xbi ) )
1167 .Sp
1168 \&\fBCzekanowskiSimilarity\fR: ( same as DiceSimilarity and SorensonSimilarity)
1169 .Sp
1170 \&\fIAlgebraicForm\fR: ( 2 * ( \s-1SUM\s0 ( Xai * Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ** 2) + \s-1SUM\s0 ( Xbi **2 ) )
1171 .Sp
1172 \&\fIBinaryForm\fR: 2 * Nc / ( Na + Nb )
1173 .Sp
1174 \&\fISetTheoreticForm\fR: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) )
1175 .Sp
1176 \&\fBDiceSimilarity\fR: ( same as CzekanowskiSimilarity and SorensonSimilarity)
1177 .Sp
1178 \&\fIAlgebraicForm\fR: ( 2 * ( \s-1SUM\s0 ( Xai * Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ** 2) + \s-1SUM\s0 ( Xbi **2 ) )
1179 .Sp
1180 \&\fIBinaryForm\fR: 2 * Nc / ( Na + Nb )
1181 .Sp
1182 \&\fISetTheoreticForm\fR: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) )
1183 .Sp
1184 \&\fBEuclideanDistance\fR:
1185 .Sp
1186 \&\fIAlgebraicForm\fR: \s-1SQRT\s0 ( \s-1SUM\s0 ( ( ( Xai \- Xbi ) ** 2 ) ) )
1187 .Sp
1188 \&\fIBinaryForm\fR: \s-1SQRT\s0 ( ( Na \- Nc ) + ( Nb \- Nc ) ) = \s-1SQRT\s0 ( Na + Nb \- 2 * Nc )
1189 .Sp
1190 \&\fISetTheoreticForm\fR: \s-1SQRT\s0 ( | SetDifferenceXaXb | \- | SetIntersectionXaXb | ) = \s-1SQRT\s0 ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) ) )
1191 .Sp
1192 \&\fBHammingDistance\fR: ( same as CityBlockDistance and ManhattanDistance)
1193 .Sp
1194 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( \s-1ABS\s0 ( Xai \- Xbi ) )
1195 .Sp
1196 \&\fIBinaryForm\fR: ( Na \- Nc ) + ( Nb \- Nc ) = Na + Nb \- 2 * Nc
1197 .Sp
1198 \&\fISetTheoreticForm\fR: | SetDifferenceXaXb | \- | SetIntersectionXaXb | = \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1199 .Sp
1200 \&\fBJaccardSimilarity\fR: ( same as TanimotoSimilarity)
1201 .Sp
1202 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( Xai * Xbi ) / ( \s-1SUM\s0 ( Xai ** 2 ) + \s-1SUM\s0 ( Xbi ** 2 ) \- \s-1SUM\s0 ( Xai * Xbi ) )
1203 .Sp
1204 \&\fIBinaryForm\fR: Nc / ( ( Na \- Nc ) + ( Nb \- Nc ) + Nc ) = Nc / ( Na + Nb \- Nc )
1205 .Sp
1206 \&\fISetTheoreticForm\fR: | SetIntersectionXaXb | / | SetDifferenceXaXb | = \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1207 .Sp
1208 \&\fBManhattanDistance\fR: ( same as CityBlockDistance and HammingDistance)
1209 .Sp
1210 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( \s-1ABS\s0 ( Xai \- Xbi ) )
1211 .Sp
1212 \&\fIBinaryForm\fR: ( Na \- Nc ) + ( Nb \- Nc ) = Na + Nb \- 2 * Nc
1213 .Sp
1214 \&\fISetTheoreticForm\fR: | SetDifferenceXaXb | \- | SetIntersectionXaXb | = \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1215 .Sp
1216 \&\fBOchiaiSimilarity\fR: ( same as CosineSimilarity)
1217 .Sp
1218 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( Xai * Xbi ) / \s-1SQRT\s0 ( \s-1SUM\s0 ( Xai ** 2) * \s-1SUM\s0 ( Xbi ** 2) )
1219 .Sp
1220 \&\fIBinaryForm\fR: Nc / \s-1SQRT\s0 ( Na * Nb)
1221 .Sp
1222 \&\fISetTheoreticForm\fR: | SetIntersectionXaXb | / \s-1SQRT\s0 ( |Xa| * |Xb| ) = \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) / \s-1SQRT\s0 ( \s-1SUM\s0 ( Xai ) * \s-1SUM\s0 ( Xbi ) )
1223 .Sp
1224 \&\fBSorensonSimilarity\fR: ( same as CzekanowskiSimilarity and DiceSimilarity)
1225 .Sp
1226 \&\fIAlgebraicForm\fR: ( 2 * ( \s-1SUM\s0 ( Xai * Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ** 2) + \s-1SUM\s0 ( Xbi **2 ) )
1227 .Sp
1228 \&\fIBinaryForm\fR: 2 * Nc / ( Na + Nb )
1229 .Sp
1230 \&\fISetTheoreticForm\fR: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) = 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) )
1231 .Sp
1232 \&\fBSoergelDistance\fR:
1233 .Sp
1234 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( \s-1ABS\s0 ( Xai \- Xbi ) ) / \s-1SUM\s0 ( \s-1MAX\s0 ( Xai, Xbi ) )
1235 .Sp
1236 \&\fIBinaryForm\fR: 1 \- Nc / ( Na + Nb \- Nc ) = ( Na + Nb \- 2 * Nc ) / ( Na + Nb \- Nc )
1237 .Sp
1238 \&\fISetTheoreticForm\fR: ( | SetDifferenceXaXb | \- | SetIntersectionXaXb | ) / | SetDifferenceXaXb | = ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- 2 * ( \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1239 .Sp
1240 \&\fBTanimotoSimilarity\fR: ( same as JaccardSimilarity)
1241 .Sp
1242 \&\fIAlgebraicForm\fR: \s-1SUM\s0 ( Xai * Xbi ) / ( \s-1SUM\s0 ( Xai ** 2 ) + \s-1SUM\s0 ( Xbi ** 2 ) \- \s-1SUM\s0 ( Xai * Xbi ) )
1243 .Sp
1244 \&\fIBinaryForm\fR: Nc / ( ( Na \- Nc ) + ( Nb \- Nc ) + Nc ) = Nc / ( Na + Nb \- Nc )
1245 .Sp
1246 \&\fISetTheoreticForm\fR: | SetIntersectionXaXb | / | SetDifferenceXaXb | = \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) / ( \s-1SUM\s0 ( Xai ) + \s-1SUM\s0 ( Xbi ) \- \s-1SUM\s0 ( \s-1MIN\s0 ( Xai, Xbi ) ) )
1247 .IP "\fB\-\-VectorComparisonFormulism\fR \fIAlgebraicForm | BinaryForm | SetTheoreticForm\fR" 4
1248 .IX Item "--VectorComparisonFormulism AlgebraicForm | BinaryForm | SetTheoreticForm"
1249 Specify fingerprints vector comparison formulism to use for calculating similarity and distance
1250 coefficients during \fB\-v, \-\-VectorComparisonMode\fR. Possible values: \fIAlgebraicForm | BinaryForm |
1251 SetTheoreticForm\fR. Default value: \fIAlgebraicForm\fR.
1252 .Sp
1253 For fingerprint vector strings containing \fBAlphaNumericalValues\fR data values \- \fBExtendedConnectivityFingerprints\fR,
1254 \&\fBAtomNeighborhoodsFingerprints\fR and so on \- all three formulisms result in the same value during similarity and distance
1255 calculations.
1256 .IP "\fB\-w, \-\-WorkingDir\fR \fIDirName\fR" 4
1257 .IX Item "-w, --WorkingDir DirName"
1258 Location of working directory. Default: current directory.
1259 .SH "EXAMPLES"
1260 .IX Header "EXAMPLES"
1261 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
1262 to find 10 most similar database molecules with application of Max group fusion rule and similarity
1263 cutoff to supported fingerprints strings data in \s-1SD\s0 fingerprints files present in data fields with
1264 Fingerprint substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing
1265 sequentially generated database compound IDs with Cmpd prefix, type:
1266 .PP
1267 .Vb 2
1268 \& % SimilaritySearchingFingerprints.pl \-o ReferenceSampleFPHex.sdf
1269 \& DatabaseSampleFPHex.sdf
1270 .Ve
1271 .PP
1272 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
1273 to find 10 most similar database molecules with application of Max group fusion rule and similarity
1274 cutoff to supported fingerprints strings data in \s-1FP\s0 fingerprints files, and create a
1275 SimilaritySearchResults.csv file containing database compound IDs retrieved from \s-1FP\s0 file, type:
1276 .PP
1277 .Vb 2
1278 \& % SimilaritySearchingFingerprints.pl \-r SimilaritySearchResults \-o
1279 \& ReferenceSampleFPBin.fpf DatabaseSampleFPBin.fpf
1280 .Ve
1281 .PP
1282 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
1283 to find 10 most similar database molecules with application of Max group fusion rule and
1284 similarity cutoff to supported fingerprints strings data in text fingerprints files present in column
1285 names containing Fingerprint substring in their names, and create a ReferenceFPHexSimilaritySearching.csv
1286 file containing database compound IDs retrieved from column name containing CompoundID substring or
1287 sequentially generated compound IDs, type:
1288 .PP
1289 .Vb 2
1290 \& % SimilaritySearchingFingerprints.pl \-o ReferenceSampleFPCount.csv
1291 \& DatabaseSampleFPCount.csv
1292 .Ve
1293 .PP
1294 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1295 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
1296 supported fingerprints strings data in \s-1SD\s0 fingerprints files present in data fields with Fingerprint substring
1297 in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing sequentially generated
1298 reference and database compound IDs with Cmpd prefix, type:
1299 .PP
1300 .Vb 2
1301 \& % SimilaritySearchingFingerprints.pl \-mode IndividualReference \-o
1302 \& ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
1303 .Ve
1304 .PP
1305 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1306 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
1307 supported fingerprints strings data in \s-1FP\s0 fingerprints files, and create a ReferenceFPHexSimilaritySearching.csv
1308 file containing reference and database compound IDs retrieved from \s-1FP\s0 file, type:
1309 .PP
1310 .Vb 2
1311 \& % SimilaritySearchingFingerprints.pl \-mode IndividualReference \-o
1312 \& ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
1313 .Ve
1314 .PP
1315 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1316 to find 10 most similar database molecules for each reference molecule with application of similarity cutoff to
1317 supported fingerprints strings data in text fingerprints files present in column names containing Fingerprint
1318 substring in their names, and create a ReferenceFPHexSimilaritySearching.csv file containing reference and
1319 database compound IDs retrieved from column name containing CompoundID substring or sequentially generated
1320 compound IDs, type:
1321 .PP
1322 .Vb 2
1323 \& % SimilaritySearchingFingerprints.pl \-mode IndividualReference \-o
1324 \& ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
1325 .Ve
1326 .PP
1327 To perform dissimilarity search using Tanimoto coefficient by treating all reference molecules as a set
1328 to find 10 most dissimilar database molecules with application of Max group fusion rule and similarity
1329 cutoff to supported fingerprints strings data in \s-1SD\s0 fingerprints files present in data fields with
1330 Fingerprint substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing
1331 sequentially generated database compound IDs with Cmpd prefix, type:
1332 .PP
1333 .Vb 2
1334 \& % SimilaritySearchingFingerprints.pl \-\-mode MultipleReferences \-\-SearchMode
1335 \& DissimilaritySearch \-o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
1336 .Ve
1337 .PP
1338 To perform similarity search using CityBlock distance by treating reference molecules as individual molecules
1339 to find 10 most similar database molecules for each reference molecule with application of distance cutoff
1340 to supported vector fingerprints strings data in \s-1SD\s0 fingerprints files present in data fields with Fingerprint
1341 substring in their labels, and create a ReferenceFPHexSimilaritySearching.csv file containing sequentially generated
1342 reference and database compound IDs with Cmpd prefix, type:
1343 .PP
1344 .Vb 4
1345 \& % SimilaritySearchingFingerprints.pl \-mode IndividualReference
1346 \& \-\-VectorComparisonMode CityBlockDistance \-\-VectorComparisonFormulism
1347 \& AlgebraicForm \-\-DistanceCutoff 10 \-o
1348 \& ReferenceSampleFPCount.sdf DatabaseSampleFPCount.sdf
1349 .Ve
1350 .PP
1351 To perform similarity search using Tanimoto coefficient by treating all reference molecules as a set
1352 to find 100 most similar database molecules with application of Mean group fusion rule to top 10
1353 reference molecules within similarity cutoff of 0.75 to supported fingerprints strings data in \s-1FP\s0 fingerprints
1354 files, and create a ReferenceFPHexSimilaritySearching.csv file containing database compound IDs retrieved
1355 from \s-1FP\s0 file, type:
1356 .PP
1357 .Vb 6
1358 \& % SimilaritySearchingFingerprints.pl \-\-mode MultipleReferences \-\-SearchMode
1359 \& SimilaritySearch \-\-BitVectorComparisonMode TanimotoSimilarity
1360 \& \-\-GroupFusionRule Mean \-\-GroupFusionApplyCutoff Yes \-\-kNN 10
1361 \& \-\-SimilarityCutoff 0.75 \-\-SimilarCountMode NumOfSimilar
1362 \& \-\-NumOfSimilarMolecules 100 \-o
1363 \& ReferenceSampleFPHex.fpf DatabaseSampleFPHex.fpf
1364 .Ve
1365 .PP
1366 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1367 to find 2 percent of most similar database molecules for each reference molecule with application of similarity
1368 cutoff of 0.85 to supported fingerprints strings data in text fingerprints files present in specific columns and
1369 create a ReferenceFPHexSimilaritySearching.csv file containing reference and database compoundIDs retrieved
1370 from specific columns, type:
1371 .PP
1372 .Vb 8
1373 \& % SimilaritySearchingFingerprints.pl \-\-mode IndividualReference \-\-SearchMode
1374 \& SimilaritySearch \-\-BitVectorComparisonMode TanimotoSimilarity
1375 \& \-\-ReferenceColMode ColLabel \-\-ReferenceFingerprintsCol Fingerprints
1376 \& \-\-ReferenceCompoundIDCol CompoundID \-\-DatabaseColMode ColLabel
1377 \& \-\-DatabaseCompoundIDCol CompoundID \-\-DatabaseFingerprintsCol
1378 \& Fingerprints \-\-SimilarityCutoff 0.85 \-\-SimilarCountMode PercentSimilar
1379 \& \-\-PercentSimilarMolecules 2 \-o
1380 \& ReferenceSampleFPHex.csv DatabaseSampleFPHex.csv
1381 .Ve
1382 .PP
1383 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1384 to find top 50 most similar database molecules for each reference molecule with application of similarity
1385 cutoff of 0.85 to supported fingerprints strings data in \s-1SD\s0 fingerprints files present in specific data fields and
1386 create both ReferenceFPHexSimilaritySearching.csv and ReferenceFPHexSimilaritySearching.sdf files containing
1387 reference and database compoundIDs retrieved from specific data fields, type:
1388 .PP
1389 .Vb 9
1390 \& % SimilaritySearchingFingerprints.pl \-\-mode IndividualReference \-\-SearchMode
1391 \& SimilaritySearch \-\-BitVectorComparisonMode TanimotoSimilarity
1392 \& \-\-ReferenceFingerprintsField Fingerprints
1393 \& \-\-DatabaseFingerprintsField Fingerprints
1394 \& \-\-ReferenceCompoundIDMode DataField \-\-ReferenceCompoundIDField CmpdID
1395 \& \-\-DatabaseCompoundIDMode DataField \-\-DatabaseCompoundIDField CmpdID
1396 \& \-\-SimilarityCutoff 0.85 \-\-SimilarCountMode NumOfSimilar
1397 \& \-\-NumOfSimilarMolecules 50 \-\-output both \-o
1398 \& ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
1399 .Ve
1400 .PP
1401 To perform similarity search using Tanimoto coefficient by treating reference molecules as individual molecules
1402 to find 1 percent of most similar database molecules for each reference molecule with application of similarity
1403 cutoff to supported fingerprints strings data in \s-1SD\s0 fingerprints files present in specific data field labels, and create
1404 both ReferenceFPHexSimilaritySearching.csv and ReferenceFPHexSimilaritySearching.sdf files containing reference and
1405 database compound IDs retrieved from specific data field labels along with other specific data for database
1406 molecules, type:
1407 .PP
1408 .Vb 10
1409 \& % SimilaritySearchingFingerprints.pl \-\-mode IndividualReference \-\-SearchMode
1410 \& SimilaritySearch \-\-BitVectorComparisonMode TanimotoSimilarity
1411 \& \-\-ReferenceFingerprintsField Fingerprints
1412 \& \-\-DatabaseFingerprintsField Fingerprints
1413 \& \-\-ReferenceCompoundIDMode DataField \-\-ReferenceCompoundIDField CmpdID
1414 \& \-\-DatabaseCompoundIDMode DataField \-\-DatabaseCompoundIDField CmpdID
1415 \& \-\-DatabaseDataFieldsMode Specify \-\-DatabaseDataFields "TPSA,SLogP"
1416 \& \-\-SimilarityCutoff 0.75 \-\-SimilarCountMode PercentSimilar
1417 \& \-\-PercentSimilarMolecules 1 \-\-output both \-\-OutDelim comma \-\-quote Yes
1418 \& \-\-precision 3 \-o ReferenceSampleFPHex.sdf DatabaseSampleFPHex.sdf
1419 .Ve
1420 .SH "AUTHOR"
1421 .IX Header "AUTHOR"
1422 Manish Sud <msud@san.rr.com>
1423 .SH "SEE ALSO"
1424 .IX Header "SEE ALSO"
1425 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1426 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
1427 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1428 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1429 .SH "COPYRIGHT"
1430 .IX Header "COPYRIGHT"
1431 Copyright (C) 2015 Manish Sud. All rights reserved.
1432 .PP
1433 This file is part of MayaChemTools.
1434 .PP
1435 MayaChemTools is free software; you can redistribute it and/or modify it under
1436 the terms of the \s-1GNU\s0 Lesser General Public License as published by the Free
1437 Software Foundation; either version 3 of the License, or (at your option)
1438 any later version.