0
|
1 NAME
|
|
2 SimilarityMatricesFingerprints.pl - Calculate similarity matrices using
|
|
3 fingerprints strings data in SD, FP and CSV/TSV text file(s)
|
|
4
|
|
5 SYNOPSIS
|
|
6 SimilarityMatricesFingerprints.pl SDFile(s) FPFile(s) TextFile(s)...
|
|
7
|
|
8 SimilarityMatricesFingerprints.pl [--alpha *number*] [--beta *number*]
|
|
9 [-b, --BitVectorComparisonMode *All | "TanimotoSimilarity,[
|
|
10 TverskySimilarity, ... ]"*] [-c, --ColMode *ColNum | ColLabel*]
|
|
11 [--CompoundIDCol *col number | col name*] [--CompoundIDPrefix *text*]
|
|
12 [--CompoundIDField *DataFieldName*] [--CompoundIDMode *DataField |
|
|
13 MolName | LabelPrefix | MolNameOrLabelPrefix*] [-d, --detail
|
|
14 *InfoLevel*] [-f, --fast] [--FingerprintsCol *col number | col name*]
|
|
15 [--FingerprintsField *FieldLabel*] [-h, --help] [--InDelim *comma |
|
|
16 semicolon*] [--InputDataMode *LoadInMemory | ScanFile*] [-m, --mode
|
|
17 *AutoDetect | FingerprintsBitVectorString | FingerprintsVectorString*]
|
|
18 [--OutDelim *comma | tab | semicolon*] [--OutMatrixFormat
|
|
19 *RowsAndColumns | IDPairsAndValue*] [--OutMatrixType *FullMatrix |
|
|
20 UpperTriangularMatrix | LowerTriangularMatrix*] [-o, --overwrite] [-p,
|
|
21 --precision *number*] [-q, --quote *Yes | No*] [-r, --root *RootName*]
|
|
22 [-v, --VectorComparisonMode *All | "TanimotoSimilarity, [
|
|
23 ManhattanDistance, ...]"*] [--VectorComparisonFormulism *All |
|
|
24 "AlgebraicForm, [BinaryForm, SetTheoreticForm]"*] [-w, --WorkingDir
|
|
25 dirname] SDFile(s) FPFile(s) TextFile(s)...
|
|
26
|
|
27 DESCRIPTION
|
|
28 Calculate similarity matrices using fingerprint bit-vector or vector
|
|
29 strings data in *SD, FP and CSV/TSV* text file(s) and generate CSV/TSV
|
|
30 text file(s) containing values for specified similarity and distance
|
|
31 coefficients.
|
|
32
|
|
33 The scripts SimilarityMatrixSDFiles.pl and SimilarityMatrixTextFiles.pl
|
|
34 have been removed from the current release of MayaChemTools and their
|
|
35 functionality merged with this script.
|
|
36
|
|
37 The valid *SDFile* extensions are *.sdf* and *.sd*. All SD files in a
|
|
38 current directory can be specified either by **.sdf* or the current
|
|
39 directory name.
|
|
40
|
|
41 The valid *FPFile* extensions are *.fpf* and *.fp*. All FP files in a
|
|
42 current directory can be specified either by **.fpf* or the current
|
|
43 directory name.
|
|
44
|
|
45 The valid *TextFile* extensions are *.csv* and *.tsv* for
|
|
46 comma/semicolon and tab delimited text files respectively. All other
|
|
47 file names are ignored. All text files in a current directory can be
|
|
48 specified by **.csv*, **.tsv*, or the current directory name. The
|
|
49 --indelim option determines the format of *TextFile(s)*. Any file which
|
|
50 doesn't correspond to the format indicated by --indelim option is
|
|
51 ignored.
|
|
52
|
|
53 Example of *FP* file containing fingerprints bit-vector string data:
|
|
54
|
|
55 #
|
|
56 # Package = MayaChemTools 7.4
|
|
57 # ReleaseDate = Oct 21, 2010
|
|
58 #
|
|
59 # TimeStamp = Mon Mar 7 15:14:01 2011
|
|
60 #
|
|
61 # FingerprintsStringType = FingerprintsBitVector
|
|
62 #
|
|
63 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
|
|
64 # Size = 1024
|
|
65 # BitStringFormat = HexadecimalString
|
|
66 # BitsOrder = Ascending
|
|
67 #
|
|
68 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
|
|
69 Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
|
|
70 ... ...
|
|
71 ... ..
|
|
72
|
|
73 Example of *FP* file containing fingerprints vector string data:
|
|
74
|
|
75 #
|
|
76 # Package = MayaChemTools 7.4
|
|
77 # ReleaseDate = Oct 21, 2010
|
|
78 #
|
|
79 # TimeStamp = Mon Mar 7 15:14:01 2011
|
|
80 #
|
|
81 # FingerprintsStringType = FingerprintsVector
|
|
82 #
|
|
83 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
|
|
84 # VectorStringFormat = IDsAndValuesString
|
|
85 # VectorValuesType = NumericalValues
|
|
86 #
|
|
87 Cmpd1 338;C F N O C:C C:N C=O CC CF CN CO C:C:C C:C:N C:CC C:CF C:CN C:
|
|
88 N:C C:NC CC:N CC=O CCC CCN CCO CNC NC=O O=CO C:C:C:C C:C:C:N C:C:CC...;
|
|
89 33 1 2 5 21 2 2 12 1 3 3 20 2 10 2 2 1 2 2 2 8 2 5 1 1 1 19 2 8 2 2 2 2
|
|
90 6 2 2 2 2 2 2 2 2 3 2 2 1 4 1 5 1 1 18 6 2 2 1 2 10 2 1 2 1 2 2 2 2 ...
|
|
91 Cmpd2 103;C N O C=N C=O CC CN CO CC=O CCC CCN CCO CNC N=CN NC=O NCN O=C
|
|
92 O C CC=O CCCC CCCN CCCO CCNC CNC=N CNC=O CNCN CCCC=O CCCCC CCCCN CC...;
|
|
93 15 4 4 1 2 13 5 2 2 15 5 3 2 2 1 1 1 2 17 7 6 5 1 1 1 2 15 8 5 7 2 2 2 2
|
|
94 1 2 1 1 3 15 7 6 8 3 4 4 3 2 2 1 2 3 14 2 4 7 4 4 4 4 1 1 1 2 1 1 1 ...
|
|
95 ... ...
|
|
96 ... ...
|
|
97
|
|
98 Example of *SD* file containing fingerprints bit-vector string data:
|
|
99
|
|
100 ... ...
|
|
101 ... ...
|
|
102 $$$$
|
|
103 ... ...
|
|
104 ... ...
|
|
105 ... ...
|
|
106 41 44 0 0 0 0 0 0 0 0999 V2000
|
|
107 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
108 ... ...
|
|
109 2 3 1 0 0 0 0
|
|
110 ... ...
|
|
111 M END
|
|
112 > <CmpdID>
|
|
113 Cmpd1
|
|
114
|
|
115 > <PathLengthFingerprints>
|
|
116 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
|
|
117 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
|
|
118 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
|
|
119 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
|
|
120 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
|
|
121 aa0660a11014a011d46
|
|
122
|
|
123 $$$$
|
|
124 ... ...
|
|
125 ... ...
|
|
126
|
|
127 Example of CSV *Text* file containing fingerprints bit-vector string
|
|
128 data:
|
|
129
|
|
130 "CompoundID","PathLengthFingerprints"
|
|
131 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
|
|
132 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
|
|
133 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
|
|
134 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
|
|
135 ... ...
|
|
136 ... ...
|
|
137
|
|
138 The current release of MayaChemTools supports the following types of
|
|
139 fingerprint bit-vector and vector strings:
|
|
140
|
|
141 FingerprintsVector;AtomNeighborhoods:AtomicInvariantsAtomTypes:MinRadi
|
|
142 us0:MaxRadius2;41;AlphaNumericalValues;ValuesString;NR0-C.X1.BO1.H3-AT
|
|
143 C1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-ATC1 NR0-C.X
|
|
144 1.BO1.H3-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2-C.X1.BO1.H3-ATC1:NR2-C.X3.BO4-A
|
|
145 TC1 NR0-C.X2.BO2.H2-ATC1:NR1-C.X2.BO2.H2-ATC1:NR1-C.X3.BO3.H1-ATC1:NR2
|
|
146 -C.X2.BO2.H2-ATC1:NR2-N.X3.BO3-ATC1:NR2-O.X1.BO1.H1-ATC1 NR0-C.X2.B...
|
|
147
|
|
148 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
|
|
149 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
|
|
150 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
|
|
151 O.X1.BO2;2 4 14 3 10 1 1 1 3 2
|
|
152
|
|
153 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
|
|
154 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
|
|
155 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
|
|
156
|
|
157 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
|
|
158 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
|
|
159 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
|
|
160 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
|
|
161 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
|
|
162 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
|
|
163
|
|
164 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
|
|
165 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
|
|
166 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
|
|
167 .024 -2.270
|
|
168
|
|
169 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
|
|
170 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435
|
|
171 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
|
|
172 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
173 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
174
|
|
175 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
|
|
176 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
|
|
177 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
|
|
178 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
|
|
179 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
|
|
180 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
|
|
181
|
|
182 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
|
|
183 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
|
|
184 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
|
|
185 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
|
|
186 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
|
|
187 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
|
|
188
|
|
189 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
|
|
190 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
|
|
191 0000000001010000000110000011000000000000100000000000000000000000100001
|
|
192 1000000110000000000000000000000000010011000000000000000000000000010000
|
|
193 0000000000000000000000000010000000000000000001000000000000000000000000
|
|
194 0000000000010000100001000000000000101000000000000000100000000000000...
|
|
195
|
|
196 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
|
|
197 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
|
|
198 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
|
|
199 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
|
|
200 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
|
|
201 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
|
|
202
|
|
203 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
|
|
204 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
|
|
205 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
|
|
206 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
|
|
207 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
|
|
208 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
|
|
209
|
|
210 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
|
|
211 0000000000000000000000000000000001001000010010000000010010000000011100
|
|
212 0100101010111100011011000100110110000011011110100110111111111111011111
|
|
213 11111111111110111000
|
|
214
|
|
215 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
|
|
216 1110011111100101111111000111101100110000000000000011100010000000000000
|
|
217 0000000000000000000000000000000000000000000000101000000000000000000000
|
|
218 0000000000000000000000000000000000000000000000000000000000000000000000
|
|
219 0000000000000000000000000000000000000011000000000000000000000000000000
|
|
220 0000000000000000000000000000000000000000
|
|
221
|
|
222 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
|
|
223 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
224 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
|
|
225 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
|
|
226 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
|
|
227 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
|
|
228
|
|
229 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
|
|
230 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
|
|
231 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
|
|
232 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
|
|
233 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
|
|
234 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
|
|
235
|
|
236 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
|
|
237 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
|
|
238 0100010101011000101001011100110001000010001001101000001001001001001000
|
|
239 0010110100000111001001000001001010100100100000000011000000101001011100
|
|
240 0010000001000101010100000100111100110111011011011000000010110111001101
|
|
241 0101100011000000010001000011000010100011101100001000001000100000000...
|
|
242
|
|
243 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
|
|
244 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
|
|
245 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
|
|
246 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
|
|
247 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
|
|
248 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
|
|
249
|
|
250 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
|
|
251 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
|
|
252 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
|
|
253 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
|
|
254 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
|
|
255 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
|
|
256
|
|
257 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
|
|
258 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
|
|
259 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
|
|
260 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
|
|
261 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
|
|
262 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
|
|
263
|
|
264 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
|
|
265 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar
|
|
266 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H
|
|
267 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...;
|
|
268 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
|
|
269 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
|
|
270
|
|
271 FingerprintsVector;TopologicalAtomTorsions:AtomicInvariantsAtomTypes;3
|
|
272 3;NumericalValues;IDsAndValuesString;C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-
|
|
273 C.X3.BO4 C.X1.BO1.H3-C.X3.BO3.H1-C.X3.BO4-N.X3.BO3 C.X2.BO2.H2-C.X2.BO
|
|
274 2.H2-C.X3.BO3.H1-C.X2.BO2.H2 C.X2.BO2.H2-C.X2.BO2.H2-C.X3.BO3.H1-O...;
|
|
275 2 2 1 1 2 2 1 1 3 4 4 8 4 2 2 6 2 2 1 2 1 1 2 1 1 2 6 2 4 2 1 3 1
|
|
276
|
|
277 FingerprintsVector;TopologicalAtomTorsions:EStateAtomTypes;36;Numerica
|
|
278 lValues;IDsAndValuesString;aaCH-aaCH-aaCH-aaCH aaCH-aaCH-aaCH-aasC aaC
|
|
279 H-aaCH-aasC-aaCH aaCH-aaCH-aasC-aasC aaCH-aaCH-aasC-sF aaCH-aaCH-aasC-
|
|
280 ssNH aaCH-aasC-aasC-aasC aaCH-aasC-aasC-aasN aaCH-aasC-ssNH-dssC a...;
|
|
281 4 4 8 4 2 2 6 2 2 2 4 3 2 1 3 3 2 2 2 1 2 1 1 1 2 1 1 1 1 1 1 1 2 1 1 2
|
|
282
|
|
283 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
|
|
284 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
|
|
285 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
|
|
286 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
|
|
287 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
|
|
288 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
|
|
289 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
|
|
290
|
|
291 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
|
|
292 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C
|
|
293 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3-
|
|
294 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2
|
|
295 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C.
|
|
296 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7...
|
|
297
|
|
298 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
|
|
299 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
|
|
300 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
|
|
301 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
|
|
302 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
|
|
303 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
|
|
304 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
|
|
305
|
|
306 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
|
|
307 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
|
|
308 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
|
|
309 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
|
|
310 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
|
|
311 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
|
|
312
|
|
313 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:ArbitrarySize:
|
|
314 MinDistance1:MaxDistance10;696;NumericalValues;IDsAndValuesString;Ar1-
|
|
315 Ar1-Ar1 Ar1-Ar1-H1 Ar1-Ar1-HBA1 Ar1-Ar1-HBD1 Ar1-H1-H1 Ar1-H1-HBA1 Ar1
|
|
316 -H1-HBD1 Ar1-HBA1-HBD1 H1-H1-H1 H1-H1-HBA1 H1-H1-HBD1 H1-HBA1-HBA1 H1-
|
|
317 HBA1-HBD1 H1-HBA1-NI1 H1-HBD1-NI1 HBA1-HBA1-NI1 HBA1-HBD1-NI1 Ar1-...;
|
|
318 46 106 8 3 83 11 4 1 21 5 3 1 2 2 1 1 1 100 101 18 11 145 132 26 14 23
|
|
319 28 3 3 5 4 61 45 10 4 16 20 7 5 1 3 4 5 3 1 1 1 1 5 4 2 1 2 2 2 1 1 1
|
|
320 119 123 24 15 185 202 41 25 22 17 3 5 85 95 18 11 23 17 3 1 1 6 4 ...
|
|
321
|
|
322 FingerprintsVector;TopologicalPharmacophoreAtomTriplets:FixedSize:MinD
|
|
323 istance1:MaxDistance10;2692;OrderedNumericalValues;ValuesString;46 106
|
|
324 8 3 0 0 83 11 4 0 0 0 1 0 0 0 0 0 0 0 0 21 5 3 0 0 1 2 2 0 0 1 0 0 0
|
|
325 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 100 101 18 11 0 0 145 132 26
|
|
326 14 0 0 23 28 3 3 0 0 5 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 61 45 10 4 0
|
|
327 0 16 20 7 5 1 0 3 4 5 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 5 ...
|
|
328
|
|
329 OPTIONS
|
|
330 --alpha *number*
|
|
331 Value of alpha parameter for calculating *Tversky* similarity
|
|
332 coefficient specified for -b, --BitVectorComparisonMode option. It
|
|
333 corresponds to weights assigned for bits set to "1" in a pair of
|
|
334 fingerprint bit-vectors during the calculation of similarity
|
|
335 coefficient. Possible values: *0 to 1*. Default value: <0.5>.
|
|
336
|
|
337 --beta *number*
|
|
338 Value of beta parameter for calculating *WeightedTanimoto* and
|
|
339 *WeightedTversky* similarity coefficients specified for -b,
|
|
340 --BitVectorComparisonMode option. It is used to weight the
|
|
341 contributions of bits set to "0" during the calculation of
|
|
342 similarity coefficients. Possible values: *0 to 1*. Default value of
|
|
343 <1> makes *WeightedTanimoto* and *WeightedTversky* equivalent to
|
|
344 *Tanimoto* and *Tversky*.
|
|
345
|
|
346 -b, --BitVectorComparisonMode *All |
|
|
347 "TanimotoSimilarity,[TverskySimilarity,...]"*
|
|
348 Specify what similarity coefficients to use for calculating
|
|
349 similarity matrices for fingerprints bit-vector strings data values
|
|
350 in *TextFile(s)*: calculate similarity matrices for all supported
|
|
351 similarity coefficients or specify a comma delimited list of
|
|
352 similarity coefficients. Possible values: *All |
|
|
353 "TanimotoSimilarity,[TverskySimilarity,...]"*. Default:
|
|
354 *TanimotoSimilarity*
|
|
355
|
|
356 *All* uses complete list of supported similarity coefficients:
|
|
357 *BaroniUrbaniSimilarity, BuserSimilarity, CosineSimilarity,
|
|
358 DiceSimilarity, DennisSimilarity, ForbesSimilarity,
|
|
359 FossumSimilarity, HamannSimilarity, JaccardSimilarity,
|
|
360 Kulczynski1Similarity, Kulczynski2Similarity, MatchingSimilarity,
|
|
361 McConnaugheySimilarity, OchiaiSimilarity, PearsonSimilarity,
|
|
362 RogersTanimotoSimilarity, RussellRaoSimilarity, SimpsonSimilarity,
|
|
363 SkoalSneath1Similarity, SkoalSneath2Similarity,
|
|
364 SkoalSneath3Similarity, TanimotoSimilarity, TverskySimilarity,
|
|
365 YuleSimilarity, WeightedTanimotoSimilarity,
|
|
366 WeightedTverskySimilarity*. These similarity coefficients are
|
|
367 described below.
|
|
368
|
|
369 For two fingerprint bit-vectors A and B of same size, let:
|
|
370
|
|
371 Na = Number of bits set to "1" in A
|
|
372 Nb = Number of bits set to "1" in B
|
|
373 Nc = Number of bits set to "1" in both A and B
|
|
374 Nd = Number of bits set to "0" in both A and B
|
|
375
|
|
376 Nt = Number of bits set to "1" or "0" in A or B (Size of A or B)
|
|
377 Nt = Na + Nb - Nc + Nd
|
|
378
|
|
379 Na - Nc = Number of bits set to "1" in A but not in B
|
|
380 Nb - Nc = Number of bits set to "1" in B but not in A
|
|
381
|
|
382 Then, various similarity coefficients [ Ref. 40 - 42 ] for a pair of
|
|
383 bit-vectors A and B are defined as follows:
|
|
384
|
|
385 *BaroniUrbaniSimilarity*: ( SQRT( Nc * Nd ) + Nc ) / ( SQRT ( Nc *
|
|
386 Nd ) + Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as Buser )
|
|
387
|
|
388 *BuserSimilarity*: ( SQRT ( Nc * Nd ) + Nc ) / ( SQRT ( Nc * Nd ) +
|
|
389 Nc + ( Na - Nc ) + ( Nb - Nc ) ) ( same as BaroniUrbani )
|
|
390
|
|
391 *CosineSimilarity*: Nc / SQRT ( Na * Nb ) (same as Ochiai)
|
|
392
|
|
393 *DiceSimilarity*: (2 * Nc) / ( Na + Nb )
|
|
394
|
|
395 *DennisSimilarity*: ( Nc * Nd - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
|
|
396 SQRT ( Nt * Na * Nb)
|
|
397
|
|
398 *ForbesSimilarity*: ( Nt * Nc ) / ( Na * Nb )
|
|
399
|
|
400 *FossumSimilarity*: ( Nt * ( ( Nc - 1/2 ) ** 2 ) ) / ( Na * Nb )
|
|
401
|
|
402 *HamannSimilarity*: ( ( Nc + Nd ) - ( Na - Nc ) - ( Nb - Nc ) ) / Nt
|
|
403
|
|
404 *JaccardSimilarity*: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc / (
|
|
405 Na + Nb - Nc ) (same as Tanimoto)
|
|
406
|
|
407 *Kulczynski1Similarity*: Nc / ( ( Na - Nc ) + ( Nb - Nc) ) = Nc / (
|
|
408 Na + Nb - 2Nc )
|
|
409
|
|
410 *Kulczynski2Similarity*: ( ( Nc / 2 ) * ( 2 * Nc + ( Na - Nc ) + (
|
|
411 Nb - Nc) ) ) / ( ( Nc + ( Na - Nc ) ) * ( Nc + ( Nb - Nc ) ) ) = 0.5
|
|
412 * ( Nc / Na + Nc / Nb )
|
|
413
|
|
414 *MatchingSimilarity*: ( Nc + Nd ) / Nt
|
|
415
|
|
416 *McConnaugheySimilarity*: ( Nc ** 2 - ( Na - Nc ) * ( Nb - Nc) ) / (
|
|
417 Na * Nb )
|
|
418
|
|
419 *OchiaiSimilarity*: Nc / SQRT ( Na * Nb ) (same as Cosine)
|
|
420
|
|
421 *PearsonSimilarity*: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
|
|
422 SQRT ( Na * Nb * ( Na - Nc + Nd ) * ( Nb - Nc + Nd ) )
|
|
423
|
|
424 *RogersTanimotoSimilarity*: ( Nc + Nd ) / ( ( Na - Nc) + ( Nb - Nc)
|
|
425 + Nt) = ( Nc + Nd ) / ( Na + Nb - 2Nc + Nt)
|
|
426
|
|
427 *RussellRaoSimilarity*: Nc / Nt
|
|
428
|
|
429 *SimpsonSimilarity*: Nc / MIN ( Na, Nb)
|
|
430
|
|
431 *SkoalSneath1Similarity*: Nc / ( Nc + 2 * ( Na - Nc) + 2 * ( Nb -
|
|
432 Nc) ) = Nc / ( 2 * Na + 2 * Nb - 3 * Nc )
|
|
433
|
|
434 *SkoalSneath2Similarity*: ( 2 * Nc + 2 * Nd ) / ( Nc + Nd + Nt )
|
|
435
|
|
436 *SkoalSneath3Similarity*: ( Nc + Nd ) / ( ( Na - Nc ) + ( Nb - Nc )
|
|
437 ) = ( Nc + Nd ) / ( Na + Nb - 2 * Nc )
|
|
438
|
|
439 *TanimotoSimilarity*: Nc / ( ( Na - Nc) + ( Nb - Nc ) + Nc ) = Nc /
|
|
440 ( Na + Nb - Nc ) (same as Jaccard)
|
|
441
|
|
442 *TverskySimilarity*: Nc / ( alpha * ( Na - Nc ) + ( 1 - alpha) * (
|
|
443 Nb - Nc) + Nc ) = Nc / ( alpha * ( Na - Nb ) + Nb)
|
|
444
|
|
445 *YuleSimilarity*: ( ( Nc * Nd ) - ( ( Na - Nc ) * ( Nb - Nc ) ) ) /
|
|
446 ( ( Nc * Nd ) + ( ( Na - Nc ) * ( Nb - Nc ) ) )
|
|
447
|
|
448 Values of Tanimoto/Jaccard and Tversky coefficients are dependent on
|
|
449 only those bits which are set to "1" in both A and B. In order to
|
|
450 take into account all bit positions, modified versions of Tanimoto [
|
|
451 Ref. 42 ] and Tversky [ Ref. 43 ] have been developed.
|
|
452
|
|
453 Let:
|
|
454
|
|
455 Na' = Number of bits set to "0" in A
|
|
456 Nb' = Number of bits set to "0" in B
|
|
457 Nc' = Number of bits set to "0" in both A and B
|
|
458
|
|
459 Tanimoto': Nc' / ( ( Na' - Nc') + ( Nb' - Nc' ) + Nc' ) = Nc' / (
|
|
460 Na' + Nb' - Nc' )
|
|
461
|
|
462 Tversky': Nc' / ( alpha * ( Na' - Nc' ) + ( 1 - alpha) * ( Nb' - Nc'
|
|
463 ) + Nc' ) = Nc' / ( alpha * ( Na' - Nb' ) + Nb')
|
|
464
|
|
465 Then:
|
|
466
|
|
467 *WeightedTanimotoSimilarity* = beta * Tanimoto + (1 - beta) *
|
|
468 Tanimoto'
|
|
469
|
|
470 *WeightedTverskySimilarity* = beta * Tversky + (1 - beta) * Tversky'
|
|
471
|
|
472 -c, --ColMode *ColNum | ColLabel*
|
|
473 Specify how columns are identified in *TextFile(s)*: using column
|
|
474 number or column label. Possible values: *ColNum or ColLabel*.
|
|
475 Default value: *ColNum*.
|
|
476
|
|
477 --CompoundIDCol *col number | col name*
|
|
478 This value is -c, --ColMode mode specific. It specifies input
|
|
479 *TextFile(s)* column to use for generating compound ID for
|
|
480 similarity matrices in output *TextFile(s)*. Possible values: *col
|
|
481 number or col label*. Default value: *first column containing the
|
|
482 word compoundID in its column label or sequentially generated IDs*.
|
|
483
|
|
484 --CompoundIDPrefix *text*
|
|
485 Specify compound ID prefix to use during sequential generation of
|
|
486 compound IDs for input *SDFile(s)* and *TextFile(s)*. Default value:
|
|
487 *Cmpd*. The default value generates compound IDs which look like
|
|
488 Cmpd<Number>.
|
|
489
|
|
490 For input *SDFile(s)*, this value is only used during *LabelPrefix |
|
|
491 MolNameOrLabelPrefix* values of --CompoundIDMode option; otherwise,
|
|
492 it's ignored.
|
|
493
|
|
494 Examples for *LabelPrefix* or *MolNameOrLabelPrefix* value of
|
|
495 --CompoundIDMode:
|
|
496
|
|
497 Compound
|
|
498
|
|
499 The value specified above generates compound IDs which correspond
|
|
500 to Compound<Number> instead of default value of Cmpd<Number>.
|
|
501
|
|
502 --CompoundIDField *DataFieldName*
|
|
503 Specify input *SDFile(s)* datafield label for generating compound
|
|
504 IDs. This value is only used during *DataField* value of
|
|
505 --CompoundIDMode option.
|
|
506
|
|
507 Examples for *DataField* value of --CompoundIDMode:
|
|
508
|
|
509 MolID
|
|
510 ExtReg
|
|
511
|
|
512 --CompoundIDMode *DataField | MolName | LabelPrefix |
|
|
513 MolNameOrLabelPrefix*
|
|
514 Specify how to generate compound IDs from input *SDFile(s)* for
|
|
515 similarity matrix CSV/TSV text file(s): use a *SDFile(s)* datafield
|
|
516 value; use molname line from *SDFile(s)*; generate a sequential ID
|
|
517 with specific prefix; use combination of both MolName and
|
|
518 LabelPrefix with usage of LabelPrefix values for empty molname
|
|
519 lines.
|
|
520
|
|
521 Possible values: *DataField | MolName | LabelPrefix |
|
|
522 MolNameOrLabelPrefix*. Default: *LabelPrefix*.
|
|
523
|
|
524 For *MolNameOrLabelPrefix* value of --CompoundIDMode, molname line
|
|
525 in *SDFile(s)* takes precedence over sequential compound IDs
|
|
526 generated using *LabelPrefix* and only empty molname values are
|
|
527 replaced with sequential compound IDs.
|
|
528
|
|
529 -d, --detail *InfoLevel*
|
|
530 Level of information to print about lines being ignored. Default:
|
|
531 *1*. Possible values: *1, 2 or 3*.
|
|
532
|
|
533 -f, --fast
|
|
534 In this mode, fingerprints columns specified using --FingerprintsCol
|
|
535 for *TextFile(s)* and --FingerprintsField for *SDFile(s)* are
|
|
536 assumed to contain valid fingerprints data and no checking is
|
|
537 performed before calculating similarity matrices. By default,
|
|
538 fingerprints data is validated before computing pairwise similarity
|
|
539 and distance coefficients.
|
|
540
|
|
541 --FingerprintsCol *col number | col name*
|
|
542 This value is -c, --colmode specific. It specifies fingerprints
|
|
543 column to use during calculation of similarity matrices for
|
|
544 *TextFile(s)*. Possible values: *col number or col label*. Default
|
|
545 value: *first column containing the word Fingerprints in its column
|
|
546 label*.
|
|
547
|
|
548 --FingerprintsField *FieldLabel*
|
|
549 Fingerprints field label to use during calculation of similarity
|
|
550 matrices for *SDFile(s)*. Default value: *first data field label
|
|
551 containing the word Fingerprints in its label*
|
|
552
|
|
553 -h, --help
|
|
554 Print this help message.
|
|
555
|
|
556 --InDelim *comma | semicolon*
|
|
557 Input delimiter for CSV *TextFile(s)*. Possible values: *comma or
|
|
558 semicolon*. Default value: *comma*. For TSV files, this option is
|
|
559 ignored and *tab* is used as a delimiter.
|
|
560
|
|
561 --InputDataMode *LoadInMemory | ScanFile*
|
|
562 Specify how fingerprints bit-vector or vector strings data from *SD,
|
|
563 FP and CSV/TSV* fingerprint file(s) is processed: Retrieve, process
|
|
564 and load all available fingerprints data in memory; Retrieve and
|
|
565 process data for fingerprints one at a time. Possible values :
|
|
566 *LoadInMemory | ScanFile*. Default: *LoadInMemory*.
|
|
567
|
|
568 During *LoadInMemory* value of --InputDataMode, fingerprints
|
|
569 bit-vector or vector strings data from input file is retrieved,
|
|
570 processed, and loaded into memory all at once as fingerprints
|
|
571 objects for generation of similarity matrices.
|
|
572
|
|
573 During *ScanFile* value of --InputDataMode, multiple passes over the
|
|
574 input fingerprints file are performed to retrieve and process
|
|
575 fingerprints bit-vector or vector strings data one at a time to
|
|
576 generate fingerprints objects used during generation of similarity
|
|
577 matrices. A temporary copy of the input fingerprints file is made at
|
|
578 the start and deleted after generating the matrices.
|
|
579
|
|
580 *ScanFile* value of --InputDataMode allows processing of arbitrarily
|
|
581 large fingerprints files without any additional memory requirement.
|
|
582
|
|
583 -m, --mode *AutoDetect | FingerprintsBitVectorString |
|
|
584 FingerprintsVectorString*
|
|
585 Format of fingerprint strings data in *TextFile(s)*: automatically
|
|
586 detect format of fingerprints string created by MayaChemTools
|
|
587 fingerprints generation scripts or explicitly specify its format.
|
|
588 Possible values: *AutoDetect | FingerprintsBitVectorString |
|
|
589 FingerprintsVectorString*. Default value: *AutoDetect*.
|
|
590
|
|
591 --OutDelim *comma | tab | semicolon*
|
|
592 Delimiter for output CSV/TSV text file(s). Possible values: *comma,
|
|
593 tab, or semicolon*. Default value: *comma*.
|
|
594
|
|
595 --OutMatrixFormat *RowsAndColumns | IDPairsAndValue*
|
|
596 Specify how similarity or distance values calculated for
|
|
597 fingerprints vector and bit-vector strings are written to the output
|
|
598 CSV/TSV text file(s): Generate text files containing rows and
|
|
599 columns with their labels corresponding to compound IDs and each
|
|
600 matrix element value corresponding to similarity or distance between
|
|
601 corresponding compounds; Generate text files containing rows
|
|
602 containing compoundIDs for two compounds followed by similarity or
|
|
603 distance value between these compounds.
|
|
604
|
|
605 Possible values: *RowsAndColumns, or IDPairsAndValue*. Default
|
|
606 value: *RowsAndColumns*.
|
|
607
|
|
608 The value of --OutMatrixFormat in conjunction with --OutMatrixType
|
|
609 determines type of data written to output files and allows
|
|
610 generation of up to 6 different output data formats:
|
|
611
|
|
612 OutMatrixFormat OutMatrixType
|
|
613
|
|
614 RowsAndColumns FullMatrix [ DEFAULT ]
|
|
615 RowsAndColumns UpperTriangularMatrix
|
|
616 RowsAndColumns LowerTriangularMatrix
|
|
617
|
|
618 IDPairsAndValue FullMatrix
|
|
619 IDPairsAndValue UpperTriangularMatrix
|
|
620 IDPairsAndValue LowerTriangularMatrix
|
|
621
|
|
622 Example of data in output file for *RowsAndColumns*
|
|
623 --OutMatrixFormat value for *FullMatrix* value of --OutMatrixType:
|
|
624
|
|
625 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
|
|
626 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ...
|
|
627 "Cmpd2","0.04","1","0.06","0.05","0.19","0.07",... ...
|
|
628 "Cmpd3","0.25","0.06","1","0.12","0.22","0.25",... ...
|
|
629 "Cmpd4","0.13","0.05","0.12","1","0.11","0.13",... ...
|
|
630 "Cmpd5","0.11","0.19","0.22","0.11","1","0.17",... ...
|
|
631 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1",... ...
|
|
632 ... ... ..
|
|
633 ... ... ..
|
|
634 ... ... ..
|
|
635
|
|
636 Example of data in output file for *RowsAndColumns*
|
|
637 --OutMatrixFormat value for *UpperTriangularMatrix* value of
|
|
638 --OutMatrixType:
|
|
639
|
|
640 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
|
|
641 "Cmpd1","1","0.04","0.25","0.13","0.11","0.2",... ...
|
|
642 "Cmpd2","1","0.06","0.05","0.19","0.07",... ...
|
|
643 "Cmpd3","1","0.12","0.22","0.25",... ...
|
|
644 "Cmpd4","1","0.11","0.13",... ...
|
|
645 "Cmpd5","1","0.17",... ...
|
|
646 "Cmpd6","1",... ...
|
|
647 ... ... ..
|
|
648 ... ... ..
|
|
649 ... ... ..
|
|
650
|
|
651 Example of data in output file for *RowsAndColumns*
|
|
652 --OutMatrixFormat value for *LowerTriangularMatrix* value of
|
|
653 --OutMatrixType:
|
|
654
|
|
655 "","Cmpd1","Cmpd2","Cmpd3","Cmpd4","Cmpd5","Cmpd6",... ...
|
|
656 "Cmpd1","1"
|
|
657 "Cmpd2","0.04","1"
|
|
658 "Cmpd3","0.25","0.06","1"
|
|
659 "Cmpd4","0.13","0.05","0.12","1"
|
|
660 "Cmpd5","0.11","0.19","0.22","0.11","1"
|
|
661 "Cmpd6","0.2","0.07","0.25","0.13","0.17","1"
|
|
662 ... ... ..
|
|
663 ... ... ..
|
|
664 ... ... ..
|
|
665
|
|
666 Example of data in output file for *IDPairsAndValue*
|
|
667 --OutMatrixFormat value for *FullMatrix* value of --OutMatrixType:
|
|
668
|
|
669 "CmpdID1","CmpdID2","Coefficient Value"
|
|
670 "Cmpd1","Cmpd1","1"
|
|
671 "Cmpd1","Cmpd2","0.04"
|
|
672 "Cmpd1","Cmpd3","0.25"
|
|
673 "Cmpd1","Cmpd4","0.13"
|
|
674 ... ... ...
|
|
675 ... ... ...
|
|
676 ... ... ...
|
|
677 "Cmpd2","Cmpd1","0.04"
|
|
678 "Cmpd2","Cmpd2","1"
|
|
679 "Cmpd2","Cmpd3","0.06"
|
|
680 "Cmpd2","Cmpd4","0.05"
|
|
681 ... ... ...
|
|
682 ... ... ...
|
|
683 ... ... ...
|
|
684 "Cmpd3","Cmpd1","0.25"
|
|
685 "Cmpd3","Cmpd2","0.06"
|
|
686 "Cmpd3","Cmpd3","1"
|
|
687 "Cmpd3","Cmpd4","0.12"
|
|
688 ... ... ...
|
|
689 ... ... ...
|
|
690 ... ... ...
|
|
691
|
|
692 Example of data in output file for *IDPairsAndValue*
|
|
693 --OutMatrixFormat value for *UpperTriangularMatrix* value of
|
|
694 --OutMatrixType:
|
|
695
|
|
696 "CmpdID1","CmpdID2","Coefficient Value"
|
|
697 "Cmpd1","Cmpd1","1"
|
|
698 "Cmpd1","Cmpd2","0.04"
|
|
699 "Cmpd1","Cmpd3","0.25"
|
|
700 "Cmpd1","Cmpd4","0.13"
|
|
701 ... ... ...
|
|
702 ... ... ...
|
|
703 ... ... ...
|
|
704 "Cmpd2","Cmpd2","1"
|
|
705 "Cmpd2","Cmpd3","0.06"
|
|
706 "Cmpd2","Cmpd4","0.05"
|
|
707 ... ... ...
|
|
708 ... ... ...
|
|
709 ... ... ...
|
|
710 "Cmpd3","Cmpd3","1"
|
|
711 "Cmpd3","Cmpd4","0.12"
|
|
712 ... ... ...
|
|
713 ... ... ...
|
|
714 ... ... ...
|
|
715
|
|
716 Example of data in output file for *IDPairsAndValue*
|
|
717 --OutMatrixFormat value for *LowerTriangularMatrix* value of
|
|
718 --OutMatrixType:
|
|
719
|
|
720 "CmpdID1","CmpdID2","Coefficient Value"
|
|
721 "Cmpd1","Cmpd1","1"
|
|
722 "Cmpd2","Cmpd1","0.04"
|
|
723 "Cmpd2","Cmpd2","1"
|
|
724 "Cmpd3","Cmpd1","0.25"
|
|
725 "Cmpd3","Cmpd2","0.06"
|
|
726 "Cmpd3","Cmpd3","1"
|
|
727 "Cmpd4","Cmpd1","0.13"
|
|
728 "Cmpd4","Cmpd2","0.05"
|
|
729 "Cmpd4","Cmpd3","0.12"
|
|
730 "Cmpd4","Cmpd4","1"
|
|
731 ... ... ...
|
|
732 ... ... ...
|
|
733 ... ... ...
|
|
734
|
|
735 --OutMatrixType *FullMatrix | UpperTriangularMatrix |
|
|
736 LowerTriangularMatrix*
|
|
737 Type of similarity or distance matrix to calculate for fingerprints
|
|
738 vector and bit-vector strings: Calculate full matrix; Calculate
|
|
739 lower triangular matrix including diagonal; Calculate upper
|
|
740 triangular matrix including diagonal.
|
|
741
|
|
742 Possible values: *FullMatrix, UpperTriangularMatrix, or
|
|
743 LowerTriangularMatrix*. Default value: *FullMatrix*.
|
|
744
|
|
745 The value of --OutMatrixType in conjunction with --OutMatrixFormat
|
|
746 determines type of data written to output files.
|
|
747
|
|
748 -o, --overwrite
|
|
749 Overwrite existing files
|
|
750
|
|
751 -p, --precision *number*
|
|
752 Precision of calculated values in the output file. Default: up to
|
|
753 *2* decimal places. Valid values: positive integers.
|
|
754
|
|
755 -q, --quote *Yes | No*
|
|
756 Put quote around column values in output CSV/TSV text file(s).
|
|
757 Possible values: *Yes or No*. Default value: *Yes*.
|
|
758
|
|
759 -r, --root *RootName*
|
|
760 New file name is generated using the root:
|
|
761 <Root><BitVectorComparisonMode>.<Ext> or
|
|
762 <Root><VectorComparisonMode><VectorComparisonFormulism>.<Ext>. The
|
|
763 csv, and tsv <Ext> values are used for comma/semicolon, and tab
|
|
764 delimited text files respectively. This option is ignored for
|
|
765 multiple input files.
|
|
766
|
|
767 -v, --VectorComparisonMode *All |
|
|
768 "TanimotoSimilarity,[ManhattanDistance,...]"*
|
|
769 Specify what similarity or distance coefficients to use for
|
|
770 calculating similarity matrices for fingerprint vector strings data
|
|
771 values in *TextFile(s)*: calculate similarity matrices for all
|
|
772 supported similarity and distance coefficients or specify a comma
|
|
773 delimited list of similarity and distance coefficients. Possible
|
|
774 values: *All | "TanimotoSimilarity,[ManhattanDistance,...]"*. Default:
|
|
775 *TanimotoSimilarity*.
|
|
776
|
|
777 The value of -v, --VectorComparisonMode, in conjunction with
|
|
778 --VectorComparisonFormulism, decides which type of similarity and
|
|
779 distance coefficient formulism gets used.
|
|
780
|
|
781 *All* uses complete list of supported similarity and distance
|
|
782 coefficients: *CosineSimilarity, CzekanowskiSimilarity,
|
|
783 DiceSimilarity, OchiaiSimilarity, JaccardSimilarity,
|
|
784 SorensonSimilarity, TanimotoSimilarity, CityBlockDistance,
|
|
785 EuclideanDistance, HammingDistance, ManhattanDistance,
|
|
786 SoergelDistance*. These similarity and distance coefficients are
|
|
787 described below.
|
|
788
|
|
789 FingerprintsVector.pm module, used to calculate similarity and
|
|
790 distance coefficients, provides support to perform comparison
|
|
791 between vectors containing three different types of values:
|
|
792
|
|
793 Type I: OrderedNumericalValues
|
|
794
|
|
795 . Size of two vectors are same
|
|
796 . Vectors contain real values in a specific order. For example: MACCS keys
|
|
797 count, Topological pharmacophore atom pairs and so on.
|
|
798
|
|
799 Type II: UnorderedNumericalValues
|
|
800
|
|
801 . Size of two vectors might not be same
|
|
802 . Vectors contain unordered real value identified by value IDs. For example:
|
|
803 Topological atom pairs, Topological atom torsions and so on
|
|
804
|
|
805 Type III: AlphaNumericalValues
|
|
806
|
|
807 . Size of two vectors might not be same
|
|
808 . Vectors contain unordered alphanumerical values. For example: Extended
|
|
809 connectivity fingerprints, atom neighborhood fingerprints.
|
|
810
|
|
811 Before performing similarity or distance calculations between
|
|
812 vectors containing UnorderedNumericalValues or AlphaNumericalValues,
|
|
813 the vectors are transformed into vectors containing unique
|
|
814 OrderedNumericalValues using value IDs for UnorderedNumericalValues
|
|
815 and the values themselves for AlphaNumericalValues.
|
|
816
|
|
817 Three forms of similarity and distance calculation between two
|
|
818 vectors, specified using --VectorComparisonFormulism option, are
|
|
819 supported: *AlgebraicForm, BinaryForm or SetTheoreticForm*.
|
|
820
|
|
821 For *BinaryForm*, the ordered list of processed final vector values
|
|
822 containing the value or count of each unique value type is simply
|
|
823 converted into a binary vector containing 1s and 0s corresponding to
|
|
824 presence or absence of values before calculating similarity or
|
|
825 distance between two vectors.
|
|
826
|
|
827 For two fingerprint vectors A and B of same size containing
|
|
828 OrderedNumericalValues, let:
|
|
829
|
|
830 N = Number of values in A or B
|
|
831
|
|
832 Xa = Values of vector A
|
|
833 Xb = Values of vector B
|
|
834
|
|
835 Xai = Value of ith element in A
|
|
836 Xbi = Value of ith element in B
|
|
837
|
|
838 SUM = Sum of i over N values
|
|
839
|
|
840 For SetTheoreticForm of calculation between two vectors, let:
|
|
841
|
|
842 SetIntersectionXaXb = SUM ( MIN ( Xai, Xbi ) )
|
|
843 SetDifferenceXaXb = SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN ( Xai, Xbi ) )
|
|
844
|
|
845 For BinaryForm of calculation between two vectors, let:
|
|
846
|
|
847 Na = Number of bits set to "1" in A = SUM ( Xai )
|
|
848 Nb = Number of bits set to "1" in B = SUM ( Xbi )
|
|
849 Nc = Number of bits set to "1" in both A and B = SUM ( Xai * Xbi )
|
|
850 Nd = Number of bits set to "0" in both A and B
|
|
851 = SUM ( 1 - Xai - Xbi + Xai * Xbi)
|
|
852
|
|
853 N = Number of bits set to "1" or "0" in A or B = Size of A or B = Na + Nb - Nc + Nd
|
|
854
|
|
855 Additionally, for BinaryForm various values also correspond to:
|
|
856
|
|
857 Na = | Xa |
|
|
858 Nb = | Xb |
|
|
859 Nc = | SetIntersectionXaXb |
|
|
860 Nd = N - | SetDifferenceXaXb |
|
|
861
|
|
862 | SetDifferenceXaXb | = N - Nd = Na + Nb - Nc + Nd - Nd = Na + Nb - Nc
|
|
863 = | Xa | + | Xb | - | SetIntersectionXaXb |
|
|
864
|
|
865 Various similarity and distance coefficients [ Ref 40, Ref 62, Ref
|
|
866 64 ] for a pair of vectors A and B in *AlgebraicForm, BinaryForm and
|
|
867 SetTheoreticForm* are defined as follows:
|
|
868
|
|
869 CityBlockDistance: ( same as HammingDistance and ManhattanDistance)
|
|
870
|
|
871 *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
|
|
872
|
|
873 *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
|
|
874
|
|
875 *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
|
|
876 = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
|
|
877
|
|
878 CosineSimilarity: ( same as OchiaiSimilarity)
|
|
879
|
|
880 *AlgebraicForm*: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM (
|
|
881 Xbi ** 2) )
|
|
882
|
|
883 *BinaryForm*: Nc / SQRT ( Na * Nb)
|
|
884
|
|
885 *SetTheoreticForm*: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) =
|
|
886 SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
|
|
887
|
|
888 CzekanowskiSimilarity: ( same as DiceSimilarity and
|
|
889 SorensonSimilarity)
|
|
890
|
|
891 *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
|
|
892 SUM ( Xbi **2 ) )
|
|
893
|
|
894 *BinaryForm*: 2 * Nc / ( Na + Nb )
|
|
895
|
|
896 *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
|
|
897 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
|
|
898
|
|
899 DiceSimilarity: ( same as CzekanowskiSimilarity and
|
|
900 SorensonSimilarity)
|
|
901
|
|
902 *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
|
|
903 SUM ( Xbi **2 ) )
|
|
904
|
|
905 *BinaryForm*: 2 * Nc / ( Na + Nb )
|
|
906
|
|
907 *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
|
|
908 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
|
|
909
|
|
910 EuclideanDistance:
|
|
911
|
|
912 *AlgebraicForm*: SQRT ( SUM ( ( ( Xai - Xbi ) ** 2 ) ) )
|
|
913
|
|
914 *BinaryForm*: SQRT ( ( Na - Nc ) + ( Nb - Nc ) ) = SQRT ( Na + Nb -
|
|
915 2 * Nc )
|
|
916
|
|
917 *SetTheoreticForm*: SQRT ( | SetDifferenceXaXb | - |
|
|
918 SetIntersectionXaXb | ) = SQRT ( SUM ( Xai ) + SUM ( Xbi ) - 2 * (
|
|
919 SUM ( MIN ( Xai, Xbi ) ) ) )
|
|
920
|
|
921 HammingDistance: ( same as CityBlockDistance and ManhattanDistance)
|
|
922
|
|
923 *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
|
|
924
|
|
925 *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
|
|
926
|
|
927 *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
|
|
928 = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
|
|
929
|
|
930 JaccardSimilarity: ( same as TanimotoSimilarity)
|
|
931
|
|
932 *AlgebraicForm*: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi
|
|
933 ** 2 ) - SUM ( Xai * Xbi ) )
|
|
934
|
|
935 *BinaryForm*: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na +
|
|
936 Nb - Nc )
|
|
937
|
|
938 *SetTheoreticForm*: | SetIntersectionXaXb | / | SetDifferenceXaXb |
|
|
939 = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN
|
|
940 ( Xai, Xbi ) ) )
|
|
941
|
|
942 ManhattanDistance: ( same as CityBlockDistance and HammingDistance)
|
|
943
|
|
944 *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) )
|
|
945
|
|
946 *BinaryForm*: ( Na - Nc ) + ( Nb - Nc ) = Na + Nb - 2 * Nc
|
|
947
|
|
948 *SetTheoreticForm*: | SetDifferenceXaXb | - | SetIntersectionXaXb |
|
|
949 = SUM ( Xai ) + SUM ( Xbi ) - 2 * ( SUM ( MIN ( Xai, Xbi ) ) )
|
|
950
|
|
951 OchiaiSimilarity: ( same as CosineSimilarity)
|
|
952
|
|
953 *AlgebraicForm*: SUM ( Xai * Xbi ) / SQRT ( SUM ( Xai ** 2) * SUM (
|
|
954 Xbi ** 2) )
|
|
955
|
|
956 *BinaryForm*: Nc / SQRT ( Na * Nb)
|
|
957
|
|
958 *SetTheoreticForm*: | SetIntersectionXaXb | / SQRT ( |Xa| * |Xb| ) =
|
|
959 SUM ( MIN ( Xai, Xbi ) ) / SQRT ( SUM ( Xai ) * SUM ( Xbi ) )
|
|
960
|
|
961 SorensonSimilarity: ( same as CzekanowskiSimilarity and
|
|
962 DiceSimilarity)
|
|
963
|
|
964 *AlgebraicForm*: ( 2 * ( SUM ( Xai * Xbi ) ) ) / ( SUM ( Xai ** 2) +
|
|
965 SUM ( Xbi **2 ) )
|
|
966
|
|
967 *BinaryForm*: 2 * Nc / ( Na + Nb )
|
|
968
|
|
969 *SetTheoreticForm*: 2 * | SetIntersectionXaXb | / ( |Xa| + |Xb| ) =
|
|
970 2 * ( SUM ( MIN ( Xai, Xbi ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) )
|
|
971
|
|
972 SoergelDistance:
|
|
973
|
|
974 *AlgebraicForm*: SUM ( ABS ( Xai - Xbi ) ) / SUM ( MAX ( Xai, Xbi )
|
|
975 )
|
|
976
|
|
977 *BinaryForm*: 1 - Nc / ( Na + Nb - Nc ) = ( Na + Nb - 2 * Nc ) / (
|
|
978 Na + Nb - Nc )
|
|
979
|
|
980 *SetTheoreticForm*: ( | SetDifferenceXaXb | - | SetIntersectionXaXb
|
|
981 | ) / | SetDifferenceXaXb | = ( SUM ( Xai ) + SUM ( Xbi ) - 2 * (
|
|
982 SUM ( MIN ( Xai, Xbi ) ) ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM (
|
|
983 MIN ( Xai, Xbi ) ) )
|
|
984
|
|
985 TanimotoSimilarity: ( same as JaccardSimilarity)
|
|
986
|
|
987 *AlgebraicForm*: SUM ( Xai * Xbi ) / ( SUM ( Xai ** 2 ) + SUM ( Xbi
|
|
988 ** 2 ) - SUM ( Xai * Xbi ) )
|
|
989
|
|
990 *BinaryForm*: Nc / ( ( Na - Nc ) + ( Nb - Nc ) + Nc ) = Nc / ( Na +
|
|
991 Nb - Nc )
|
|
992
|
|
993 *SetTheoreticForm*: | SetIntersectionXaXb | / | SetDifferenceXaXb |
|
|
994 = SUM ( MIN ( Xai, Xbi ) ) / ( SUM ( Xai ) + SUM ( Xbi ) - SUM ( MIN
|
|
995 ( Xai, Xbi ) ) )
|
|
996
|
|
997 --VectorComparisonFormulism *All |
|
|
998 "AlgebraicForm,[BinaryForm,SetTheoreticForm]"*
|
|
999 Specify fingerprints vector comparison formulism to use for
|
|
1000 calculating similarity and distance coefficients during -v,
|
|
1001 --VectorComparisonMode: use all supported comparison formulisms or
|
|
1002 specify a comma delimited list. Possible values: *All |
|
|
1003 "AlgebraicForm,[BinaryForm,SetTheoreticForm]"*. Default value:
|
|
1004 *AlgebraicForm*.
|
|
1005
|
|
1006 *All* uses all three forms of supported vector comparison formulism
|
|
1007 for values of -v, --VectorComparisonMode option.
|
|
1008
|
|
1009 For fingerprint vector strings containing AlphaNumericalValues data
|
|
1010 values - ExtendedConnectivityFingerprints,
|
|
1011 AtomNeighborhoodsFingerprints and so on - all three formulisms result
|
|
1012 in same value during similarity and distance calculations.
|
|
1013
|
|
1014 -w, --WorkingDir *DirName*
|
|
1015 Location of working directory. Default: current directory.
|
|
1016
|
|
1017 EXAMPLES
|
|
1018 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1019 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1020 supported fingerprints in text file present in a column name containing
|
|
1021 Fingerprint substring by loading all fingerprints data into memory and
|
|
1022 create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs
|
|
1023 retrieved from column name containing CompoundID substring, type:
|
|
1024
|
|
1025 % SimilarityMatricesFingerprints.pl -o SampleFPHex.csv
|
|
1026
|
|
1027 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1028 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1029 supported fingerprints in SD File present in a data field with
|
|
1030 Fingerprint substring in its label by loading all fingerprints data into
|
|
1031 memory and create a SampleFPHexTanimotoSimilarity.csv file containing
|
|
1032 sequentially generated compound IDs with Cmpd prefix, type:
|
|
1033
|
|
1034 % SimilarityMatricesFingerprints.pl -o SampleFPHex.sdf
|
|
1035
|
|
1036 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1037 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1038 supported fingerprints in FP file by loading all fingerprints data into
|
|
1039 memory and create a SampleFPHexTanimotoSimilarity.csv file along with
|
|
1040 compound IDs retrieved from FP file, type:
|
|
1041
|
|
1042 % SimilarityMatricesFingerprints.pl -o SampleFPHex.fpf
|
|
1043
|
|
1044 To generate a lower triangular similarity matrix corresponding to
|
|
1045 Tanimoto similarity coefficient for fingerprints bit-vector strings data
|
|
1046 corresponding to supported fingerprints in text file present in a column
|
|
1047 name containing Fingerprint substring by loading all fingerprints data
|
|
1048 into memory and create a SampleFPHexTanimotoSimilarity.csv file
|
|
1049 containing compound IDs retrieved from column name containing CompoundID
|
|
1050 substring, type:
|
|
1051
|
|
1052 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory
|
|
1053 --OutMatrixFormat RowsAndColumns --OutMatrixType LowerTriangularMatrix
|
|
1054 SampleFPHex.csv
|
|
1055
|
|
1056 To generate an upper triangular similarity matrix corresponding to
|
|
1057 Tanimoto similarity coefficient for fingerprints bit-vector strings data
|
|
1058 corresponding to supported fingerprints in text file present in a column
|
|
1059 name containing Fingerprint substring by loading all fingerprints data
|
|
1060 into memory and create a SampleFPHexTanimotoSimilarity.csv file in
|
|
1061 IDPairsAndValue format containing compound IDs retrieved from column
|
|
1062 name containing CompoundID substring, type:
|
|
1063
|
|
1064 % SimilarityMatricesFingerprints.pl -o --InputDataMode LoadInMemory
|
|
1065 --OutMatrixFormat IDPairsAndValue --OutMatrixType UpperTriangularMatrix
|
|
1066 SampleFPHex.csv
|
|
1067
|
|
1068 To generate a full similarity matrix corresponding to Tanimoto
|
|
1069 similarity coefficient for fingerprints bit-vector strings data
|
|
1070 corresponding to supported fingerprints in text file present in a column
|
|
1071 name containing Fingerprint substring by scanning file without loading
|
|
1072 all fingerprints data into memory and create a
|
|
1073 SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved
|
|
1074 from column name containing CompoundID substring, type:
|
|
1075
|
|
1076 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile
|
|
1077 --OutMatrixFormat RowsAndColumns --OutMatrixType FullMatrix
|
|
1078 SampleFPHex.csv
|
|
1079
|
|
1080 To generate a lower triangular similarity matrix corresponding to
|
|
1081 Tanimoto similarity coefficient for fingerprints bit-vector strings data
|
|
1082 corresponding to supported fingerprints in text file present in a column
|
|
1083 name containing Fingerprint substring by scanning file without loading
|
|
1084 all fingerprints data into memory and create a
|
|
1085 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format
|
|
1086 containing compound IDs retrieved from column name containing CompoundID
|
|
1087 substring, type:
|
|
1088
|
|
1089 % SimilarityMatricesFingerprints.pl -o --InputDataMode ScanFile
|
|
1090 --OutMatrixFormat IDPairsAndValue --OutMatrixType LowerTriangularMatrix
|
|
1091 SampleFPHex.csv
|
|
1092
|
|
1093 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1094 coefficient using algebraic formulism for fingerprints vector strings
|
|
1095 data corresponding to supported fingerprints in text file present in a
|
|
1096 column name containing Fingerprint substring and create a
|
|
1097 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file containing
|
|
1098 compound IDs retrieved from column name containing CompoundID substring,
|
|
1099 type:
|
|
1100
|
|
1101 % SimilarityMatricesFingerprints.pl -o SampleFPCount.csv
|
|
1102
|
|
1103 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1104 coefficient using algebraic formulism for fingerprints vector strings
|
|
1105 data corresponding to supported fingerprints in SD file present in a
|
|
1106 data field with Fingerprint substring in its label and create a
|
|
1107 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file containing
|
|
1108 sequentially generated compound IDs with Cmpd prefix, type:
|
|
1109
|
|
1110 % SimilarityMatricesFingerprints.pl -o SampleFPCount.sdf
|
|
1111
|
|
1112 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1113 coefficient using algebraic formulism for fingerprints vector strings data corresponding
|
|
1114 to supported fingerprints in FP file and create a
|
|
1115 SampleFPCountTanimotoSimilarityAlgebraicForm.csv file along with
|
|
1116 compound IDs retrieved from FP file, type:
|
|
1117
|
|
1118 % SimilarityMatricesFingerprints.pl -o SampleFPCount.fpf
|
|
1119
|
|
1120 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1121 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1122 supported fingerprints in text file present in a column name containing
|
|
1123 Fingerprint substring and create a SampleFPHexTanimotoSimilarity.csv
|
|
1124 file in IDPairsAndValue format containing compound IDs retrieved from
|
|
1125 column name containing CompoundID substring, type:
|
|
1126
|
|
1127 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
|
|
1128 SampleFPHex.csv
|
|
1129
|
|
1130 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1131 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1132 supported fingerprints in SD file present in a data field with
|
|
1133 Fingerprint substring in its label and create a
|
|
1134 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format
|
|
1135 containing sequentially generated compound IDs with Cmpd prefix, type:
|
|
1136
|
|
1137 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
|
|
1138 SampleFPHex.sdf
|
|
1139
|
|
1140 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1141 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1142 supported fingerprints in FP file and create a
|
|
1143 SampleFPHexTanimotoSimilarity.csv file in IDPairsAndValue format along
|
|
1144 with compound IDs retrieved from FP file, type:
|
|
1145
|
|
1146 % SimilarityMatricesFingerprints.pl --OutMatrixFormat IDPairsAndValue -o
|
|
1147 SampleFPHex.fpf
|
|
1148
|
|
1149 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1150 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1151 supported fingerprints in SD file present in a data field with
|
|
1152 Fingerprint substring in its label and create a
|
|
1153 SampleFPHexTanimotoSimilarity.csv file containing compound IDs from mol
|
|
1154 name line, type:
|
|
1155
|
|
1156 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolName -o
|
|
1157 SampleFPHex.sdf
|
|
1158
|
|
1159 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1160 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1161 supported fingerprints present in a data field with Fingerprint
|
|
1162 substring in its label and create a SampleFPHexTanimotoSimilarity.csv
|
|
1163 file containing compound IDs from data field name Mol_ID, type:
|
|
1164
|
|
1165 % SimilarityMatricesFingerprints.pl --CompoundIDMode DataField
|
|
1166 --CompoundIDField Mol_ID -o SampleFPHex.sdf
|
|
1167
|
|
1168 To generate similarity matrices corresponding to Buser, Dice and
|
|
1169 Tanimoto similarity coefficient for fingerprints bit-vector strings data
|
|
1170 corresponding to supported fingerprints present in a column name
|
|
1171 containing Fingerprint substring and create
|
|
1172 SampleFPBin[CoefficientName]Similarity.csv files containing compound IDs
|
|
1173 retrieved from column name containing CompoundID substring, type:
|
|
1174
|
|
1175 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity,
|
|
1176 TanimotoSimilarity" -o SampleFPBin.csv
|
|
1177
|
|
1178 To generate similarity matrices corresponding to Buser, Dice and
|
|
1179 Tanimoto similarity coefficient for fingerprints bit-vector strings data
|
|
1180 corresponding to supported fingerprints present in a data field with
|
|
1181 Fingerprint substring in its label and create
|
|
1182 SampleFPBin[CoefficientName]Similarity.csv files containing sequentially
|
|
1183 generated compound IDs with Cmpd prefix, type:
|
|
1184
|
|
1185 % SimilarityMatricesFingerprints.pl -b "BuserSimilarity,DiceSimilarity,
|
|
1186 TanimotoSimilarity" -o SampleFPBin.sdf
|
|
1187
|
|
1188 To generate similarity matrices corresponding to CityBlock distance and
|
|
1189 Tanimoto similarity coefficients using algebraic formulism for
|
|
1190 fingerprints vector strings data corresponding to supported fingerprints
|
|
1191 present in a column name containing Fingerprint substring and create
|
|
1192 SampleFPCount[CoefficientName]AlgebraicForm.csv files containing
|
|
1193 compound IDs retrieved from column name containing CompoundID substring,
|
|
1194 type:
|
|
1195
|
|
1196 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
|
|
1197 TanimotoSimilarity" -o SampleFPCount.csv
|
|
1198
|
|
1199 To generate similarity matrices corresponding to CityBlock distance and
|
|
1200 Tanimoto similarity coefficients using algebraic formulism for
|
|
1201 fingerprints vector strings data corresponding to supported fingerprints
|
|
1202 present in a data field with Fingerprint substring in its label and
|
|
1203 create SampleFPCount[CoefficientName]AlgebraicForm.csv files containing
|
|
1204 sequentially generated compound IDs with Cmpd prefix, type:
|
|
1205
|
|
1206 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
|
|
1207 TanimotoSimilarity" -o SampleFPCount.sdf
|
|
1208
|
|
1209 To generate similarity matrices corresponding to CityBlock distance and
|
|
1210 Tanimoto similarity coefficients using binary formulism for fingerprints
|
|
1211 vector strings data corresponding to supported fingerprints present in a
|
|
1212 column name containing Fingerprint substring and create
|
|
1213 SampleFPCount[CoefficientName]BinaryForm.csv files containing compound IDs
|
|
1214 retrieved from column name containing CompoundID substring, type:
|
|
1215
|
|
1216 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
|
|
1217 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o
|
|
1218 SampleFPCount.csv
|
|
1219
|
|
1220 To generate similarity matrices corresponding to CityBlock distance and
|
|
1221 Tanimoto similarity coefficients using binary formulism for fingerprints
|
|
1222 vector strings data corresponding to supported fingerprints present in a
|
|
1223 data field with Fingerprint substring in its label and create
|
|
1224 SampleFPCount[CoefficientName]BinaryForm.csv files containing sequentially
|
|
1225 generated compound IDs with Cmpd prefix, type:
|
|
1226
|
|
1227 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
|
|
1228 TanimotoSimilarity" --VectorComparisonFormulism BinaryForm -o
|
|
1229 SampleFPCount.sdf
|
|
1230
|
|
1231 To generate similarity matrices corresponding to CityBlock distance and
|
|
1232 Tanimoto similarity coefficients using all supported comparison
|
|
1233 formulisms for fingerprints vector strings data corresponding to
|
|
1234 supported fingerprints present in a column name containing Fingerprint
|
|
1235 substring and create SampleFPCount[CoefficientName][FormulismName].csv
|
|
1236 files containing compound IDs retrieved from column name containing
|
|
1237 CompoundID substring, type:
|
|
1238
|
|
1239 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,
|
|
1240 TanimotoSimilarity" --VectorComparisonFormulism All -o SampleFPCount.csv
|
|
1241
|
|
1242 To generate similarity matrices corresponding to CityBlock distance and
|
|
1243 Tanimoto similarity coefficients using all supported comparison
|
|
1244 formulisms for fingerprints vector strings data corresponding to
|
|
1245 supported fingerprints present in a data field with Fingerprint
|
|
1246 substring in its label and create
|
|
1247 SampleFPCount[CoefficientName][FormulismName].csv files containing
|
|
1248 sequentially generated compound IDs with Cmpd prefix, type:
|
|
1249
|
|
1250 % SimilarityMatricesFingerprints.pl -v "CityBlockDistance,TanimotoSimilarity"
|
|
1251 --VectorComparisonFormulism All -o SampleFPCount.sdf
|
|
1252
|
|
1253 To generate similarity matrices corresponding to all available
|
|
1254 similarity coefficient for fingerprints bit-vector strings data
|
|
1255 corresponding to supported fingerprints present in a column name
|
|
1256 containing Fingerprint substring and create
|
|
1257 SampleFPHex[CoefficientName].csv files containing compound IDs retrieved
|
|
1258 from column name containing CompoundID substring, type:
|
|
1259
|
|
1260 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode
|
|
1261 All --alpha 0.5 --beta 0.5 -o SampleFPHex.csv
|
|
1262
|
|
1263 To generate similarity matrices corresponding to all available
|
|
1264 similarity coefficient for fingerprints bit-vector strings data
|
|
1265 corresponding to supported fingerprints present in a data field with
|
|
1266 Fingerprint substring in its label and create
|
|
1267 SampleFPHex[CoefficientName].csv files containing sequentially generated
|
|
1268 compound IDs with Cmpd prefix, type:
|
|
1269
|
|
1270 % SimilarityMatricesFingerprints.pl -m AutoDetect --BitVectorComparisonMode
|
|
1271 All --alpha 0.5 --beta 0.5 -o SampleFPHex.sdf
|
|
1272
|
|
1273 To generate similarity matrices corresponding to all available
|
|
1274 similarity and distance coefficients using all comparison formulism for
|
|
1275 fingerprints vector strings data corresponding to supported fingerprints
|
|
1276 present in a column name containing Fingerprint substring and create
|
|
1277 SampleFPCount[CoefficientName][FormulismName].csv files containing
|
|
1278 compound IDs retrieved from column name containing CompoundID substring,
|
|
1279 type:
|
|
1280
|
|
1281 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode
|
|
1282 All --VectorComparisonFormulism All -o SampleFPCount.csv
|
|
1283
|
|
1284 To generate similarity matrices corresponding to all available
|
|
1285 similarity and distance coefficients using all comparison formulism for
|
|
1286 fingerprints vector strings data corresponding to supported fingerprints
|
|
1287 present in a data field with Fingerprint substring in its label and
|
|
1288 create SampleFPCount[CoefficientName][FormulismName].csv files
|
|
1289 containing sequentially generated compound IDs with Cmpd prefix, type:
|
|
1290
|
|
1291 % SimilarityMatricesFingerprints.pl -m AutoDetect --VectorComparisonMode
|
|
1292 All --VectorComparisonFormulism All -o SampleFPCount.sdf
|
|
1293
|
|
1294 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1295 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1296 supported fingerprints present in column number 2 and create a
|
|
1297 SampleFPHexTanimotoSimilarity.csv file containing compound IDs retrieved
|
|
1298 from column number 1, type:
|
|
1299
|
|
1300 % SimilarityMatricesFingerprints.pl --ColMode ColNum --CompoundIDCol 1
|
|
1301 --FingerprintsCol 2 -o SampleFPHex.csv
|
|
1302
|
|
1303 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1304 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1305 supported fingerprints present in a data field named Fingerprints and
|
|
1306 create a SampleFPHexTanimotoSimilarity.csv file containing compound IDs
|
|
1307 present in data field named Mol_ID, type:
|
|
1308
|
|
1309 % SimilarityMatricesFingerprints.pl --FingerprintsField Fingerprints
|
|
1310 --CompoundIDMode DataField --CompoundIDField Mol_ID -o SampleFPHex.sdf
|
|
1311
|
|
1312 To generate a similarity matrix corresponding to Tversky similarity
|
|
1313 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1314 supported fingerprints present in a column named Fingerprints and create
|
|
1315 a SampleFPHexTverskySimilarity.tsv file containing compound IDs
|
|
1316 retrieved from column named CompoundID, type:
|
|
1317
|
|
1318 % SimilarityMatricesFingerprints.pl --BitVectorComparisonMode
|
|
1319 TverskySimilarity --alpha 0.5 --ColMode ColLabel --CompoundIDCol
|
|
1320 CompoundID --FingerprintsCol Fingerprints --OutDelim Tab --quote No
|
|
1321 -o SampleFPHex.csv
|
|
1322
|
|
1323 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1324 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1325 supported fingerprints present in a data field with Fingerprint
|
|
1326 substring in its label and create a SampleFPHexTanimotoSimilarity.csv
|
|
1327 file containing compound IDs from molname line or sequentially generated
|
|
1328 compound IDs with Mol prefix, type:
|
|
1329
|
|
1330 % SimilarityMatricesFingerprints.pl --CompoundIDMode MolNameOrLabelPrefix
|
|
1331 --CompoundIDPrefix Mol -o SampleFPHex.sdf
|
|
1332
|
|
1333 To generate a similarity matrix corresponding to Tanimoto similarity
|
|
1334 coefficient for fingerprints bit-vector strings data corresponding to
|
|
1335 supported fingerprints present in a data field with Fingerprint
|
|
1336 substring in its label and create a SampleFPHexTanimotoSimilarity.tsv
|
|
1337 file containing sequentially generated compound IDs with Cmpd prefix,
|
|
1338 type:
|
|
1339
|
|
1340 % SimilarityMatricesFingerprints.pl --OutDelim Tab --quote No -o SampleFPHex.sdf
|
|
1341
|
|
1342 AUTHOR
|
|
1343 Manish Sud <msud@san.rr.com>
|
|
1344
|
|
1345 SEE ALSO
|
|
1346 InfoFingerprintsFiles.pl, SimilaritySearchingFingerprints.pl,
|
|
1347 AtomNeighborhoodsFingerprints.pl, ExtendedConnectivityFingerprints.pl,
|
|
1348 MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
|
|
1349 TopologicalAtomPairsFingerprints.pl,
|
|
1350 TopologicalAtomTorsionsFingerprints.pl,
|
|
1351 TopologicalPharmacophoreAtomPairsFingerprints.pl,
|
|
1352 TopologicalPharmacophoreAtomTripletsFingerprints.pl
|
|
1353
|
|
1354 COPYRIGHT
|
|
1355 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1356
|
|
1357 This file is part of MayaChemTools.
|
|
1358
|
|
1359 MayaChemTools is free software; you can redistribute it and/or modify it
|
|
1360 under the terms of the GNU Lesser General Public License as published by
|
|
1361 the Free Software Foundation; either version 3 of the License, or (at
|
|
1362 your option) any later version.
|
|
1363
|