Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/bin/ExtendedConnectivityFingerprints.pl @ 0:73ae111cf86f draft
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 11:55:01 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:73ae111cf86f |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: ExtendedConnectivityFingerprints.pl,v $ | |
4 # $Date: 2015/02/28 20:46:19 $ | |
5 # $Revision: 1.37 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability or fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use FileUtil; | |
36 use TextUtil; | |
37 use SDFileUtil; | |
38 use MoleculeFileIO; | |
39 use FileIO::FingerprintsSDFileIO; | |
40 use FileIO::FingerprintsTextFileIO; | |
41 use FileIO::FingerprintsFPFileIO; | |
42 use AtomTypes::AtomicInvariantsAtomTypes; | |
43 use AtomTypes::FunctionalClassAtomTypes; | |
44 use Fingerprints::ExtendedConnectivityFingerprints; | |
45 | |
# File-level state shared with the subroutines defined further down...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
my(@SDFilesList, %OptionsInfo, %SDFilesInfo, $FileIndex);

# Autoflush STDOUT
$| = 1;

# Announce start and begin timing...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Retrieve command line options; show usage and stop when help is requested
# or no input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files, skipping the ones not marked okay...
print "\nProcessing SD files...\n" if @SDFilesList > 1;

FILE: for $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateExtendedConnectivityFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91 | |
92 ############################################################################### | |
93 | |
# Generate fingerprints for all compounds in a SD file...
#
# $FileIndex is the position of the input file in @SDFilesList. Output files
# are opened first, each compound is read, optionally filtered, fingerprinted
# and written out, and summary counts are printed at the end.
#
sub GenerateExtendedConnectivityFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  # Setup output files...
  my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

  my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
  $Reader->Open();

  my $TotalCount = 0;
  my $SkippedCount = 0;

  COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $SkippedCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $SkippedCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close whichever output writers were opened: SD, text and then FP...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}
146 | |
# Warn about a compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the warning text: ContainsNonElementalData or
# ContainsNoElementalData; any other value, including
# FingerprintsGenerationFailed, reports that fingerprints generation didn't
# succeed. $CmpdCount and the compound ID are included in the warning.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # FingerprintsGenerationFailed and any unrecognized mode share one message...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}
174 | |
# Check and filter compounds....
#
# Returns 1, after issuing a warning, when the compound contains any
# non-element atoms or no element atoms at all; otherwise returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumElements, $NumNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data is checked first, then missing elemental data...
  my $IgnoreReason = $NumNonElements ? 'ContainsNonElementalData'
                   : !$NumElements   ? 'ContainsNoElementalData'
                   :                   '';

  return 0 unless $IgnoreReason;

  ProcessIgnoredCompound($IgnoreReason, $CmpdCount, $Molecule);
  return 1;
}
195 | |
# Write out compounds fingerprints generation summary statistics...
#
# Prints the total number of compounds, the number processed successfully
# (total minus ignored) and the number ignored.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $SuccessfulCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n",
    "Number of compounds processed successfully during fingerprints generation: $SuccessfulCmpdCount\n",
    "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208 | |
# Open output files...
#
# Creates and opens a fingerprints writer for each output type enabled in
# %OptionsInfo (SD, FP and text, in that order) for the input file at
# $FileIndex. Returns the list (SD writer, text writer, FP writer); a writer
# which is not enabled is returned as undef.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;

  # Setup common parameters for fingerprints file IO objects; they depend on
  # whether a fingerprints vector or bit vector is generated...
  my %CommonParams = ();
  if ($OptionsInfo{Mode} =~ /^(ExtendedConnectivity|ExtendedConnectivityCount)$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
  }
  elsif ($OptionsInfo{Mode} =~ /^ExtendedConnectivityBits$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
  }

  my($SDWriter, $TextWriter, $FPWriter);

  if ($OptionsInfo{SDOutput}) {
    my $OutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $OutFile...\n";
    $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $OutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $OutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $OutFile...\n";
    $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $OutFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $OutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $OutFile...\n";
    $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $OutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}
254 | |
# Write fingerprints and other data to appropriate output files...
#
# Each of $NewFPSDFileIO, $NewFPTextFileIO and $NewFPFileIO is either a writer
# object or a false value; fingerprints are written only to the writers
# which are set.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $ExtendedConnectivityFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data fields are needed only by text and FP output...
  my $FieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($NewFPSDFileIO) {
    my $InputCmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $InputCmpdString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldsRef);
    $NewFPTextFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);
    $NewFPFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $ID);
  }
}
287 | |
# Generate appropriate column labels for FPText output file...
#
# Returns a reference to an array holding the data column labels selected by
# $OptionsInfo{DataFieldsMode} for the file at $FileIndex, followed by the
# fingerprints label as the last column.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};

  my @DataLabels = ();
  if ($FieldsMode =~ /^All$/i) {
    @DataLabels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @DataLabels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @DataLabels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($FieldsMode =~ /^CompoundID$/i) {
    @DataLabels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label always goes last...
  return [@DataLabels, $OptionsInfo{FingerprintsLabel}];
}
312 | |
# Generate column values for FPText output file...
#
# Returns a reference to an array of data column values for one compound.
# In CompoundID mode it holds just the compound ID; in All, Common and
# Specify modes it holds one value per label, with an empty string for any
# label missing from the compound's data fields.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};

  if ($FieldsMode =~ /^CompoundID$/i) {
    return [SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef)];
  }

  # Pick the list of labels whose values are to be written out...
  my($LabelsRef, $LabelsSelected);
  $LabelsSelected = 1;
  if ($FieldsMode =~ /^All$/i) {
    $LabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    $LabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    $LabelsRef = $OptionsInfo{SpecifiedDataFields};
  }
  else {
    $LabelsSelected = 0;
  }

  my @RowValues = ();
  if ($LabelsSelected) {
    @RowValues = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : ''} @{$LabelsRef};
  }

  return \@RowValues;
}
335 | |
# Generate compound ID for FP and FPText output files..
#
# The ID depends on $OptionsInfo{CompoundIDMode}:
#   MolNameOrLabelPrefix - molecule name, or label prefix plus compound count
#                          when the name is false
#   LabelPrefix          - label prefix plus compound count
#   DataField            - value of the specified data field, or empty string
#                          when the compound doesn't have it
#   MolName              - molecule name
# An empty string is returned for any other mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName if $MolName;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    my $FieldValue = exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
    return $FieldValue;
  }

  if ($IDMode =~ /^MolName$/i) {
    # Fetch the name into a scalar first so the call is made in scalar context...
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}
361 | |
# Generate fingerprints for molecule...
#
# Prepares the molecule (optionally keeping only its largest component,
# detecting rings and aromaticity), creates a fingerprints object for the
# specified mode and generates fingerprints. Returns the fingerprints object,
# or undef when ring detection or fingerprints generation doesn't succeed.
# Dies on an unsupported mode.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  if ($OptionsInfo{KeepLargestComponent}) {
    $Molecule->KeepLargestComponent();
  }
  if (!$Molecule->DetectRings()) {
    return undef;
  }
  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Parameters common to all modes...
  my @FingerprintsParams = ('Type' => $OptionsInfo{Mode}, 'Molecule' => $Molecule, 'NeighborhoodRadius' => $OptionsInfo{NeighborhoodRadius}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});

  if ($OptionsInfo{Mode} =~ /^ExtendedConnectivityBits$/i) {
    # Bits mode additionally needs size and random number generator choice...
    push @FingerprintsParams, 'Size' => $OptionsInfo{Size}, 'UsePerlCoreRandom' => $OptionsInfo{UsePerlCoreRandom};
  }
  elsif ($OptionsInfo{Mode} !~ /^(ExtendedConnectivity|ExtendedConnectivityCount)$/i ) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: ExtendedConnectivity, ExtendedConnectivityCount or ExtendedConnectivityBits\n";
  }

  my $Fingerprints = Fingerprints::ExtendedConnectivityFingerprints->new(@FingerprintsParams);
  SetAtomIdentifierTypeValuesToUse($Fingerprints);

  # Generate fingerprints...
  $Fingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  if (!$Fingerprints->IsFingerprintsGenerationSuccessful()) {
    return undef;
  }

  return $Fingerprints;
}
399 | |
# Set atom identifier type values to use for generating fingerprints...
#
# Passes the atomic invariants or functional classes collected in
# %OptionsInfo on to the fingerprints object. The other supported atom
# identifier types need no extra setup; an unsupported type is fatal.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($ExtendedConnectivityFingerprints) = @_;

  my $IdentifierType = $OptionsInfo{AtomIdentifierType};

  if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    $ExtendedConnectivityFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
  }
  elsif ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    $ExtendedConnectivityFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
  }
  elsif ($IdentifierType !~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}
418 | |
# Retrieve information about SD files...
#
# Checks every file in @SDFilesList and fills %SDFilesInfo with per-file
# arrays indexed like @SDFilesList: FileOkay, OutFileRoot, SDOutFileNames,
# FPOutFileNames, TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef.
#
# A file is ignored, with a warning and FileOkay left at 0, when: it doesn't
# exist; it isn't a SD file; the data field to be used as compound ID is
# missing from its first compound; its SD output name matches its own name;
# or an output file already exists and overwriting is not enabled.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  # Data field check and data fields collection are needed only for text output...
  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all checks pass...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file..
      my($SDFileHandle, $CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open with a lexical handle: the file name is never
      # interpreted as an open mode and the handle can't clash with other code...
      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      # Guard against a missing first compound, e.g. an empty file...
      @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($SDFileHandle, $CmpdCount);

      open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}ExtendedConnectivityFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # \Q...\E makes the output name match literally; without it "." and
      # other regex metacharacters in a file name would be treated as a pattern...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
551 | |
# Process option values...
#
# Resets %OptionsInfo and fills it from the raw command line values in
# %Options: Yes/No options become 1/0 flags, the output type is split into
# SDOutput, FPOutput and TextOutput flags, and defaults are applied for the
# compound ID prefix, fingerprints label and vector string format.
#
# Dies when a required companion option is missing (--CompoundID in
# DataField mode, --DataFields in Specify mode) or when --size is not a power
# of 2 within the allowed range.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{BitsOrder} = $Options{bitsorder};
  $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      # Label prefix defaults to Cmpd...
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'ExtendedConnectivityFingerprints';

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{NeighborhoodRadius} = $Options{neighborhoodradius};

  $OptionsInfo{UsePerlCoreRandom} = ($Options{useperlcorerandom} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Size must be a power of 2 between the minimum and maximum sizes...
  my($Size, $MinSize, $MaxSize);
  $MinSize = 32;
  $MaxSize = 2**32;
  $Size = $Options{size};
  if (!(IsPositiveInteger($Size) && $Size >= $MinSize && $Size <= $MaxSize && IsNumberPowerOfNumber($Size, 2))) {
    die "Error: Invalid size value, $Size, for \"-s, --size\" option. Allowed values: power of 2, >= minimum size of $MinSize, and <= maximum size of $MaxSize.\n";
  }
  $OptionsInfo{Size} = $Size;

  # Setup default vector string format...
  #
  my($VectorStringFormat);
  $VectorStringFormat = '';
  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};
  }
  else {
    # Mode is matched ignoring case, like the other mode checks in this script,
    # so that "-m extendedconnectivity" gets the same default as
    # "-m ExtendedConnectivity"...
    $VectorStringFormat = ($Options{mode} =~ /^ExtendedConnectivity$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}
636 | |
# Process atom identifier type and related options...
#
# Records the atom identifier type in %OptionsInfo and processes the
# additional option needed by AtomicInvariantsAtomTypes and
# FunctionalClassAtomTypes. The other supported types need nothing more; an
# unsupported type is fatal.
#
sub ProcessAtomIdentifierTypeOptions {
  my $IdentifierType = $Options{atomidentifiertype};

  $OptionsInfo{AtomIdentifierType} = $IdentifierType;

  if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
  }
  elsif ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
  }
  elsif ($IdentifierType !~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}
656 | |
# Process specified atomic invariants to use...
#
# Splits the comma delimited --AtomicInvariantsToUse value and stores the
# invariants in $OptionsInfo{AtomicInvariantsToUse}. Dies when the value is
# empty, when an invariant is not available, or when neither AS nor
# AtomSymbol is among the specified invariants.
#
sub ProcessAtomicInvariantsToUseOption {
  $OptionsInfo{AtomicInvariantsToUse} = [];

  if (IsEmpty($Options{atomicinvariantstouse})) {
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
  }

  my $SawAtomSymbol = 0;
  for my $Invariant (split /\,/, $Options{atomicinvariantstouse}) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($Invariant)) {
      die "Error: Atomic invariant specified, $Invariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
    }
    $SawAtomSymbol = 1 if $Invariant =~ /^(AS|AtomSymbol)$/i;

    push @{$OptionsInfo{AtomicInvariantsToUse}}, $Invariant;
  }

  if (!$SawAtomSymbol) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
  }
}
681 | |
# Process specified functional classes to use...
#
# Splits the comma delimited --FunctionalClassesToUse value and stores the
# classes in $OptionsInfo{FunctionalClassesToUse}. Dies when the value is
# empty or when a functional class is not available.
#
sub ProcessFunctionalClassesToUse {
  $OptionsInfo{FunctionalClassesToUse} = [];

  if (IsEmpty($Options{functionalclassestouse})) {
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
  }

  for my $ClassName (split /\,/, $Options{functionalclassestouse}) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ClassName)) {
      die "Error: Functional class specified, $ClassName, using \"--FunctionalClassesToUse\" option is not valid...\n ";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $ClassName;
  }
}
699 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Clears the %Options hash (declared outside this sub), seeds it with the default
# value for each option, lets Getopt::Long overwrite those defaults with values
# specified on the command line, and then validates the option values one by one.
#
# Takes no arguments and returns nothing useful; it dies with a descriptive error
# message at the first option whose value is not valid. When a working directory
# is specified, the current directory is changed to it before the remaining
# options are validated.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  # Default values; GetOptions below overwrites the ones specified by the user...
  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC,MN';
  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{bitsorder} = 'Ascending';
  $Options{bitstringformat} = 'HexadecimalString';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mode} = 'ExtendedConnectivity';

  $Options{neighborhoodradius} = 2;

  $Options{useperlcorerandom} = 'yes';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{size} = 1024;

  # Empty string means no vector string format has been specified; it's skipped
  # during validation below...
  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options,
                  "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s",
                  "functionalclassestouse=s", "bitsorder=s", "bitstringformat|b=s",
                  "compoundid=s", "compoundidlabel=s", "compoundidmode=s",
                  "datafields=s", "datafieldsmode|d=s", "filter|f=s",
                  "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",
                  "mode|m=s", "neighborhoodradius|n=s", "outdelim=s", "output=s",
                  "overwrite|o", "quote|q=s", "root|r=s", "size|s=i",
                  "useperlcorerandom=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory first, before any other option is validated...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values; all string comparisons are case insensitive...
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
    die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
  }
  if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
    die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{mode} !~ /^(ExtendedConnectivity|ExtendedConnectivityCount|ExtendedConnectivityBits)$/i) {
    # The values listed in the message must match the ones accepted by the pattern above...
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: ExtendedConnectivity, ExtendedConnectivityCount, or ExtendedConnectivityBits\n";
  }
  if (!(IsInteger($Options{neighborhoodradius}) && ($Options{neighborhoodradius} >= 0))) {
    die "Error: The value specified, $Options{neighborhoodradius}, for option \"-n, --NeighborhoodRadius\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{size})) {
    die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: > 0 \n";
  }
  # Semicolon delimiter and unquoted output can't be combined...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{useperlcorerandom} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{useperlcorerandom}, for option \"--UsePerlCoreRandom\" is not valid. Allowed values: Yes or No\n";
  }
  # Vector string format is validated only when a non-empty value is present...
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
799 | |
800 __END__ | |
801 | |
802 =head1 NAME | |
803 | |
804 ExtendedConnectivityFingerprints.pl - Generate extended connectivity fingerprints for SD files | |
805 | |
806 =head1 SYNOPSIS | |
807 | |
808 ExtendedConnectivityFingerprints.pl SDFile(s)... | |
809 | |
810 ExtendedConnectivityFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>] | |
811 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes>] | |
812 [B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">] | |
813 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">] | |
814 [B<--BitsOrder> I<Ascending | Descending>] [B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>] | |
815 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>] | |
816 [B<--CompoundIDMode>] [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">] | |
817 [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] [B<-f, --Filter> I<Yes | No>] | |
818 [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>] | |
819 [B<-m, --mode> I<ExtendedConnectivity | ExtendedConnectivityCount | ExtendedConnectivityBits>] | |
820 [B<-n, --NeighborhoodRadius> I<number>] [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>] | |
821 [B<-o, --overwrite>] [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<-s, --size> I<number>] | |
822 [B<--UsePerlCoreRandom> I<Yes | No>] | |
823 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>] | |
824 [B<-w, --WorkingDir> dirname] SDFile(s)... | |
825 | |
826 =head1 DESCRIPTION | |
827 | |
828 Generate extended connectivity fingerprints [ Ref 48, Ref 52 ] for I<SDFile(s)> and create appropriate | |
829 SD, FP or CSV/TSV text file(s) containing fingerprints vector strings corresponding to molecular fingerprints. | |
830 | |
831 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf> | |
832 and I<.sd>. All other file names are ignored. All the SD files in a current directory | |
833 can be specified either by I<*.sdf> or the current directory name. | |
834 | |
835 The current release of MayaChemTools supports generation of extended connectivity fingerprints | |
836 corresponding to following B<-a, --AtomIdentifierTypes>: | |
837 | |
838 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, | |
839 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, | |
840 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes | |
841 | |
842 Based on values specified for B<-a, --AtomIdentifierType>, B<--AtomicInvariantsToUse> | |
843 and B<--FunctionalClassesToUse>, initial atom types are assigned to all non-hydrogen atoms in | |
844 a molecule and these atom types strings are converted into initial atom identifier integers using | |
845 B<TextUtil::HashCode> function. The duplicate atom identifiers are removed. | |
846 | |
847 For B<-n, --NeighborhoodRadius> value of I<0>, the initial set of unique atom identifiers comprises | |
848 the molecule fingerprints. Otherwise, atom neighborhoods are generated for each non-hydrogen | |
849 atom up to specified B<-n, --NeighborhoodRadius> value. For each non-hydrogen central atom | |
850 at a specific radius, its neighbors at next radius level along with their bond orders and previously | |
851 calculated atom identifiers are collected which in turn are used to generate a new integer | |
852 atom identifier; the bond orders and atom identifier pairs list is first sorted by bond order | |
853 followed by atom identifiers to make these values graph invariant. | |
854 | |
855 After integer atom identifiers have been generated for all non-hydrogen atoms at all specified | |
856 neighborhood radii, the duplicate integer atom identifiers corresponding to same hash code | |
857 value generated using B<TextUtil::HashCode> are tracked by keeping the atom identifiers at | |
858 lower radius. Additionally, all structurally duplicate integer atom identifiers at each specified | |
859 radius are also tracked by identifying equivalent atoms and bonds corresponding to substructures | |
860 used for generating atom identifier and keeping integer atom identifier with lowest value. | |
861 | |
862 For I<ExtendedConnectivity> value of fingerprints B<-m, --mode>, the duplicate identifiers are | |
863 removed from the list and the unique atom identifiers constitute the extended connectivity | |
864 fingerprints of a molecule. | |
865 | |
866 For I<ExtendedConnectivityCount> value of fingerprints B<-m, --mode>, the occurrence of each | |
867 unique atom identifier is counted and the unique atom identifiers along with their | |
868 counts constitute the extended connectivity fingerprints of a molecule. | |
869 | |
870 For I<ExtendedConnectivityBits> value of fingerprints B<-m, --mode>, the unique atom identifiers | |
871 are used as a random number seed to generate a random integer value between 0 and B<--Size> which | |
872 in turn is used to set corresponding bits in the fingerprint bit-vector string. | |
873 | |
874 Example of I<SD> file containing extended connectivity fingerprints string data: | |
875 | |
876 ... ... | |
877 ... ... | |
878 $$$$ | |
879 ... ... | |
880 ... ... | |
881 ... ... | |
882 41 44 0 0 0 0 0 0 0 0999 V2000 | |
883 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0 | |
884 ... ... | |
885 2 3 1 0 0 0 0 | |
886 ... ... | |
887 M END | |
888 > <CmpdID> | |
889 Cmpd1 | |
890 | |
891 > <ExtendedConnectivityFingerprints> | |
892 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radiu | |
893 s2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 66 | |
894 6191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414087 | |
895 99 49532520 64643108 79385615 96062769 273726379 564565671 855141035 90 | |
896 6706094 988546669 1018231313 1032696425 1197507444 1331250018 133853... | |
897 | |
898 $$$$ | |
899 ... ... | |
900 ... ... | |
901 | |
902 Example of I<FP> file containing extended connectivity fingerprints string data: | |
903 | |
904 # | |
905 # Package = MayaChemTools 7.4 | |
906 # Release Date = Oct 21, 2010 | |
907 # | |
908 # TimeStamp = Fri Mar 11 14:43:57 2011 | |
909 # | |
910 # FingerprintsStringType = FingerprintsVector | |
911 # | |
912 # Description = ExtendedConnectivity:AtomicInvariantsAtomTypes:Radius2 | |
913 # VectorStringFormat = ValuesString | |
914 # VectorValuesType = AlphaNumericalValues | |
915 # | |
916 Cmpd1 60;73555770 333564680 352413391 666191900 1001270906 137167432... | |
917 Cmpd2 41;73555770 333564680 666191900 1142173602 1363635752 14814699... | |
918 ... ... | |
919 ... .. | |
920 | |
921 Example of CSV I<Text> file containing extended connectivity fingerprints string data: | |
922 | |
923 "CompoundID","ExtendedConnectivityFingerprints" | |
924 "Cmpd1","FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTy | |
925 pes:Radius2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352 | |
926 413391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 | |
927 2141408799 49532520 64643108 79385615 96062769 273726379 564565671 8551 | |
928 41035 906706094 988546669 1018231313 1032696425 1197507444 13312500..." | |
929 ... ... | |
930 ... ... | |
931 | |
932 The current release of MayaChemTools generates the following types of extended connectivity | |
933 fingerprints vector strings: | |
934 | |
935 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi | |
936 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 | |
937 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414 | |
938 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103 | |
939 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338 | |
940 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303... | |
941 | |
942 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes | |
943 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524 | |
944 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 | |
945 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...; | |
946 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2 | |
947 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 | |
948 | |
949 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp | |
950 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100 | |
951 0000000001010000000110000011000000000000100000000000000000000000100001 | |
952 1000000110000000000000000000000000010011000000000000000000000000010000 | |
953 0000000000000000000000000010000000000000000001000000000000000000000000 | |
954 0000000000010000100001000000000000101000000000000000100000000000000... | |
955 | |
956 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp | |
957 es:Radius2;1024;HexadecimalString;Ascending;000000010050c0600800000803 | |
958 0300000091000004000000020000100000000124008200020000000040020000000000 | |
959 2080000000820040010020000000008040000000000080001000000000400000000000 | |
960 4040000090000061010000000800200000000000001400000000020080000000000020 | |
961 00008020200000408000 | |
962 | |
963 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu | |
964 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8 | |
965 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567 | |
966 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012 | |
967 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455 | |
968 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404... | |
969 | |
970 FingerprintsVector;ExtendedConnectivityCount:FunctionalClassAtomTypes: | |
971 Radius2;57;NumericalValues;IDsAndValuesString;24769214 508787397 85039 | |
972 3286 862102353 981185303 1231636850 1649386610 1941540674 263599683 32 | |
973 9205671 571109041 639579325 683993318 723853089 810600886 885767127...; | |
974 1 1 1 10 2 22 3 1 3 3 1 1 1 3 2 2 1 2 2 2 3 1 1 1 1 1 14 1 1 1 1 1 1 2 | |
975 1 2 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1 | |
976 | |
977 FingerprintsBitVector;ExtendedConnectivityBits:FunctionalClassAtomType | |
978 s:Radius2;1024;BinaryString;Ascending;00000000000000000000100000000000 | |
979 0000000001000100000000001000000000000000000000000000000000101000000010 | |
980 0000001000000000010000000000000000000000000000000000000000000000000100 | |
981 0000000000001000000000000001000000000001001000000000000000000000000000 | |
982 0000000000000000100000000000001000000000000000000000000000000000000... | |
983 | |
984 FingerprintsVector;ExtendedConnectivity:DREIDINGAtomTypes:Radius2;56;A | |
985 lphaNumericalValues;ValuesString;280305427 357928343 721790579 1151822 | |
986 898 1207111054 1380963747 1568213839 1603445250 4559268 55012922 18094 | |
987 0813 335715751 534801009 684609658 829361048 972945982 999881534 10076 | |
988 55741 1213692591 1222032501 1224517934 1235687794 1244268533 152812070 | |
989 0 1629595024 1856308891 1978806036 2001865095 2096549435 172675415 ... | |
990 | |
991 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp | |
992 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184 | |
993 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450 | |
994 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430 | |
995 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134 | |
996 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566... | |
997 | |
998 FingerprintsVector;ExtendedConnectivity:MMFF94AtomTypes:Radius2;64;Alp | |
999 haNumericalValues;ValuesString;224051550 746527773 998750766 103704190 | |
1000 2 1239701709 1248384926 1259447756 1521678386 1631549126 1909437580 20 | |
1001 37095052 2104274756 2117729376 8770364 31445800 81450228 314289324 344 | |
1002 041929 581773587 638555787 692022098 811840536 929651561 936421792 988 | |
1003 636432 1048624296 1054288509 1369487579 1454058929 1519352190 17271... | |
1004 | |
1005 FingerprintsVector;ExtendedConnectivity:SLogPAtomTypes:Radius2;71;Alph | |
1006 aNumericalValues;ValuesString;78989290 116507218 489454042 888737940 1 | |
1007 162561799 1241797255 1251494264 1263717127 1471206899 1538061784 17654 | |
1008 07295 1795036542 1809833874 2020454493 2055310842 2117729376 11868981 | |
1009 56731842 149505242 184525155 196984339 288181334 481409282 556716568 6 | |
1010 41915747 679881756 721736571 794256218 908276640 992898760 10987549... | |
1011 | |
1012 FingerprintsVector;ExtendedConnectivity:SYBYLAtomTypes:Radius2;58;Alph | |
1013 aNumericalValues;ValuesString;199957044 313356892 455463968 465982819 | |
1014 1225318176 1678585943 1883366064 1963811677 2117729376 113784599 19153 | |
1015 8837 196629033 263865277 416380653 477036669 681527491 730724924 90906 | |
1016 5537 1021959189 1133014972 1174311016 1359441203 1573452838 1661585138 | |
1017 1668649038 1684198062 1812312554 1859266290 1891651106 2072549404 ... | |
1018 | |
1019 FingerprintsVector;ExtendedConnectivity:TPSAAtomTypes:Radius2;47;Alpha | |
1020 NumericalValues;ValuesString;20818206 259344053 862102353 1331904542 1 | |
1021 700688206 265614156 363161397 681332588 810600886 885767127 950172500 | |
1022 951454814 1059668746 1247054493 1382302230 1399502637 1805025917 19189 | |
1023 39561 2114677228 2126402271 8130483 17645742 32278373 149975755 160327 | |
1024 654 256360355 279492740 291251259 317592700 333763396 972105960 101... | |
1025 | |
1026 FingerprintsVector;ExtendedConnectivity:UFFAtomTypes:Radius2;56;AlphaN | |
1027 umericalValues;ValuesString;280305427 357928343 721790579 1151822898 1 | |
1028 207111054 1380963747 1568213839 1603445250 4559268 55012922 180940813 | |
1029 335715751 534801009 684609658 829361048 972945982 999881534 1007655741 | |
1030 1213692591 1222032501 1224517934 1235687794 1244268533 1528120700 162 | |
1031 9595024 1856308891 1978806036 2001865095 2096549435 172675415 18344... | |
1032 | |
1033 =head1 OPTIONS | |
1034 | |
1035 =over 4 | |
1036 | |
1037 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel> | |
1038 | |
1039 Specify aromaticity model to use during detection of aromaticity. Possible values in the current | |
1040 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel, | |
1041 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel | |
1042 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>. | |
1043 | |
1044 The supported aromaticity model names along with model specific control parameters | |
1045 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release | |
1046 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from | |
1047 this file during class instantiation and makes it available to method B<DetectAromaticity> | |
1048 for detecting aromaticity corresponding to a specific model. | |
1049 | |
1050 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | FunctionalClassAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes> | |
1051 | |
1052 Specify atom identifier type to use for assignment of initial atom identifier to non-hydrogen | |
1053 atoms during calculation of extended connectivity fingerprints [ Ref 48, Ref 52]. Possible values | |
1054 in the current release are: I<AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, | |
1055 DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, | |
1056 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>. | |
1057 | |
1058 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant..."> | |
1059 | |
1060 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType> | |
1061 option. It's a list of comma separated valid atomic invariant atom types. | |
1062 | |
1063 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB, | |
1064 H, Ar, RA, FC, MN, SM>. Default value [ Ref 24 ]: I<AS,X,BO,H,FC,MN>. | |
1065 | |
1066 The atomic invariants abbreviations correspond to: | |
1067 | |
1068 AS = Atom symbol corresponding to element symbol | |
1069 | |
1070 X<n> = Number of non-hydrogen atom neighbors or heavy atoms | |
1071 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms | |
1072 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms | |
1073 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms | |
1074 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms | |
1075 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms | |
1076 H<n> = Number of implicit and explicit hydrogens for atom | |
1077 Ar = Aromatic annotation indicating whether atom is aromatic | |
1078 RA = Ring atom annotation indicating whether atom is in a ring | |
1079 FC<+n/-n> = Formal charge assigned to atom | |
1080 MN<n> = Mass number indicating isotope other than most abundant isotope | |
1081 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or | |
1082 3 (triplet) | |
1083 | |
1084 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to: | |
1085 | |
1086 AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n> | |
1087 | |
1088 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are | |
1089 optional. Atom type specification doesn't include atomic invariants with zero or undefined values. | |
1090 | |
1091 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words | |
1092 are also allowed: | |
1093 | |
1094 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors | |
1095 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms | |
1096 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms | |
1097 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms | |
1098 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms | |
1099 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms | |
1100 H : NumOfImplicitAndExplicitHydrogens | |
1101 Ar : Aromatic | |
1102 RA : RingAtom | |
1103 FC : FormalCharge | |
1104 MN : MassNumber | |
1105 SM : SpinMultiplicity | |
1106 | |
1107 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant | |
1108 atom types. | |
1109 | |
1110 =item B<--BitsOrder> I<Ascending | Descending> | |
1111 | |
1112 Bits order to use during generation of fingerprints bit-vector string for I<ExtendedConnectivityBits> | |
1113 value of B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>. | |
1114 | |
1115 I<Ascending> bit order corresponds to the first bit in each byte being the lowest bit as | |
1116 opposed to the highest bit. | |
1117 | |
1118 Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless | |
1119 of machine order, big-endian or little-endian, vec function always considers first | |
1120 string byte as the lowest byte and first bit within each byte as the lowest bit. | |
1121 | |
1122 =item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString> | |
1123 | |
1124 Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by | |
1125 B<--output> used during I<ExtendedConnectivityBits> value of B<-m, --mode> option. Possible | |
1126 values: I<BinaryString, HexadecimalString>. Default value: I<HexadecimalString>. | |
1127 | |
1128 I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString> | |
1129 contains bit values in ASCII hexadecimal format. | |
1130 | |
1131 Examples: | |
1132 | |
1133 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp | |
1134 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100 | |
1135 0000000001010000000110000011000000000000100000000000000000000000100001 | |
1136 1000000110000000000000000000000000010011000000000000000000000000010000 | |
1137 0000000000000000000000000010000000000000000001000000000000000000000000 | |
1138 0000000000010000100001000000000000101000000000000000100000000000000... | |
1139 | |
1140 FingerprintsBitVector;ExtendedConnectivityBits:FunctionalClassAtomType | |
1141 s:Radius2;1024;BinaryString;Ascending;00000000000000000000100000000000 | |
1142 0000000001000100000000001000000000000000000000000000000000101000000010 | |
1143 0000001000000000010000000000000000000000000000000000000000000000000100 | |
1144 0000000000001000000000000001000000000001001000000000000000000000000000 | |
1145 0000000000000000100000000000001000000000000000000000000000000000000... | |
1146 | |
1147 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2..."> | |
1148 | |
1149 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType> | |
1150 option. It's a list of comma separated valid functional classes. | |
1151 | |
1152 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>. | |
1153 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>. | |
1154 | |
1155 The functional class abbreviations correspond to: | |
1156 | |
1157 HBD: HydrogenBondDonor | |
1158 HBA: HydrogenBondAcceptor | |
1159 PI : PositivelyIonizable | |
1160 NI : NegativelyIonizable | |
1161 Ar : Aromatic | |
1162 Hal : Halogen | |
1163 H : Hydrophobic | |
1164 RA : RingAtom | |
1165 CA : ChainAtom | |
1166 | |
1167 Functional class atom type specification for an atom corresponds to: | |
1168 | |
1169 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA | |
1170 | |
1171 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom | |
1172 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]: | |
1173 | |
1174 HydrogenBondDonor: NH, NH2, OH | |
1175 HydrogenBondAcceptor: N[!H], O | |
1176 PositivelyIonizable: +, NH2 | |
1177 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH | |
1178 | |
1179 =item B<--CompoundID> I<DataFieldName or LabelPrefixString> | |
1180 | |
1181 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated. | |
1182 | |
1183 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name | |
1184 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound | |
1185 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which | |
1186 look like Cmpd<Number>. | |
1187 | |
1188 Examples for I<DataField> value of B<--CompoundIDMode>: | |
1189 | |
1190 MolID | |
1191 ExtReg | |
1192 | |
1193 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>: | |
1194 | |
1195 Compound | |
1196 | |
1197 The value specified above generates compound IDs which correspond to Compound<Number> | |
1198 instead of default value of Cmpd<Number>. | |
1199 | |
1200 =item B<--CompoundIDLabel> I<text> | |
1201 | |
1202 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value | |
1203 of B<--DataFieldsMode> option. Default: I<CompoundID>. | |
1204 | |
1205 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix> | |
1206 | |
1207 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated | |
1208 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value; | |
1209 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination | |
1210 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines. | |
1211 | |
1212 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>. | |
1213 Default: I<LabelPrefix>. | |
1214 | |
1215 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes | |
1216 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname | |
1217 values are replaced with sequential compound IDs. | |
1218 | |
1219 This is only used for I<CompoundID> value of B<--DataFieldsMode> option. | |
1220 | |
1221 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,..."> | |
1222 | |
1223 Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along | |
1224 with generated fingerprints for I<text | all> values of B<--output> option. | |
1225 | |
1226 This is only used for I<Specify> value of B<--DataFieldsMode> option. | |
1227 | |
1228 Examples: | |
1229 | |
1230 Extreg | |
1231 MolID,CompoundName | |
1232 | |
1233 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID> | |
1234 | |
1235 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along | |
1236 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD | |
1237 data fields; transfer SD data fields common to all compounds; extract specified data fields; | |
1238 generate a compound ID using molname line, a compound prefix, or a combination of both. | |
1239 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>. | |
1240 | |
1241 =item B<-f, --Filter> I<Yes | No> | |
1242 | |
1243 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>. | |
1244 Default value: I<Yes>. | |
1245 | |
1246 By default, compound data is checked before calculating fingerprints and compounds containing | |
1247 atom data corresponding to non-element symbols or no atom data are ignored. | |
1248 | |
1249 =item B<--FingerprintsLabel> I<text> | |
1250 | |
1251 SD data label or text file column label to use for fingerprints string in output SD or | |
1252 CSV/TSV text file(s) specified by B<--output>. Default value: I<ExtendedConnectivityFingerprints>. | |
1253 | |
1254 =item B<-h, --help> | |
1255 | |
1256 Print this help message. | |
1257 | |
1258 =item B<-k, --KeepLargestComponent> I<Yes | No> | |
1259 | |
1260 Generate fingerprints for only the largest component in molecule. Possible values: | |
1261 I<Yes or No>. Default value: I<Yes>. | |
1262 | |
1263 For molecules containing multiple connected components, fingerprints can be generated | |
1264 in two different ways: use all connected components or just the largest connected | |
1265 component. By default, all atoms except for the largest connected component are | |
1266 deleted before generation of fingerprints. | |
1267 | |
1268 =item B<-m, --mode> I<ExtendedConnectivity | ExtendedConnectivityCount | ExtendedConnectivityBits> | |
1269 | |
1270 Specify type of extended connectivity fingerprints to generate for molecules in I<SDFile(s)>. | |
1271 Possible values: I<ExtendedConnectivity, ExtendedConnectivityCount or | |
1272 ExtendedConnectivityBits>. Default value: I<ExtendedConnectivity>. | |
1273 | |
1274 For I<ExtendedConnectivity> value of fingerprints B<-m, --mode>, a fingerprint vector | |
1275 containing unique atom identifiers constitutes the extended connectivity fingerprints | |
1276 of a molecule. | |
1277 | |
1278 For I<ExtendedConnectivityCount> value of fingerprints B<-m, --mode>, a fingerprint vector | |
1279 containing unique atom identifiers along with their count constitutes the extended connectivity | |
1280 fingerprints of a molecule. | |
1281 | |
1282 For I<ExtendedConnectivityBits> value of fingerprints B<-m, --mode>, a fingerprint bit vector | |
1283 indicating presence/absence of structurally unique atom identifiers constitutes the extended | |
1284 connectivity fingerprints of a molecule. | |
1285 | |
1286 =item B<-n, --NeighborhoodRadius> I<number> | |
1287 | |
1288 Atomic neighborhood radius for generating extended connectivity neighborhoods. Default | |
1289 value: I<2>. Valid values: >= 0. Neighborhood radius of zero corresponds to just the list | |
1290 of non-hydrogen atoms. | |
1291 | |
1292 Default value of I<2> for atomic neighborhood radius generates extended connectivity | |
1293 fingerprints corresponding to path length or diameter value of I<4> [ Ref 52b ]. | |
1294 | |
1295 =item B<--OutDelim> I<comma | tab | semicolon> | |
1296 | |
1297 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>. | |
1298 Default value: I<comma>. | |
1299 | |
1300 =item B<--output> I<SD | FP | text | all> | |
1301 | |
1302 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>. | |
1303 | |
1304 =item B<-o, --overwrite> | |
1305 | |
1306 Overwrite existing files. | |
1307 | |
1308 =item B<-q, --quote> I<Yes | No> | |
1309 | |
1310 Put quote around column values in output CSV/TSV text file(s). Possible values: | |
1311 I<Yes or No>. Default value: I<Yes>. | |
1312 | |
1313 =item B<-r, --root> I<RootName> | |
1314 | |
1315 New file name is generated using the root: <Root>.<Ext>. Default for new file names: | |
1316 <SDFileName><ExtendedConnectivityFP>.<Ext>. The file type determines <Ext> | |
1317 value. The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab | |
1318 delimited text files, respectively. This option is ignored for multiple input files. | |
1319 | |
1320 =item B<-s, --size> I<number> | |
1321 | |
1322 Size of bit-vector to use during generation of fingerprints bit-vector string for | |
1323 I<ExtendedConnectivityBits> value of B<-m, --mode>. Default value: I<1024>. | |
1324 Valid values correspond to any positive integer which satisfies the following criteria: | |
1325 power of 2, >= 32 and <= 2 ** 32. | |
1326 | |
1327 Examples: | |
1328 | |
1329 512 | |
1330 1024 | |
1331 2048 | |
1332 | |
1333 =item B<--UsePerlCoreRandom> I<Yes | No> | |
1334 | |
1335 Specify whether to use Perl CORE::rand or MayaChemTools MathUtil::random function | |
1336 during random number generation for setting bits in fingerprints bit-vector strings. Possible | |
1337 values: I<Yes or No>. Default value: I<Yes>. | |
1338 | |
1339 I<No> value option for B<--UsePerlCoreRandom> allows the generation of fingerprints | |
1340 bit-vector strings which are same across different platforms. | |
1341 | |
1342 The random number generator implemented in MayaChemTools is a variant of | |
1343 linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ]. | |
1344 It is also referred to as Lehmer random number generator or Park-Miller | |
1345 random number generator. | |
1346 | |
1347 Unlike Perl's core random number generator function rand, the random number | |
1348 generator implemented in MayaChemTools, MathUtil::random, generates consistent | |
1349 random values across different platforms for a specific random seed and leads | |
1350 to generation of portable fingerprints bit-vector strings. | |
1351 | |
1352 =item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString> | |
1353 | |
1354 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by | |
1355 B<--output> used during I<ExtendedConnectivityCount> value of B<-m, --mode> option. Possible | |
1356 values: I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | | |
1357 ValuesAndIDsPairsString>. | |
1358 | |
1359 Default value during I<ExtendedConnectivityCount> value of B<-m, --mode> option: | |
1360 I<IDsAndValuesString>. | |
1361 | |
1362 Default value during I<ExtendedConnectivity> value of B<-m, --mode> option: I<ValuesString>. | |
1363 | |
1364 Examples: | |
1365 | |
1366 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi | |
1367 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 | |
1368 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414 | |
1369 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103 | |
1370 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338 | |
1371 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303... | |
1372 | |
1373 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes | |
1374 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524 | |
1375 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 | |
1376 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...; | |
1377 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2 | |
1378 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1 | |
1379 | |
1380 =item B<-w, --WorkingDir> I<DirName> | |
1381 | |
1382 Location of working directory. Default: current directory. | |
1383 | |
1384 =back | |
1385 | |
1386 =head1 EXAMPLES | |
1387 | |
1388 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1389 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1390 file containing sequential compound IDs along with fingerprints vector strings data, type: | |
1391 | |
1392 % ExtendedConnectivityFingerprints.pl -r SampleECAIFP -o Sample.sdf | |
1393 | |
1394 To generate extended connectivity count fingerprints corresponding to neighborhood radius up to | |
1395 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1396 file containing sequential compound IDs along with fingerprints vector strings data, type: | |
1397 | |
1398 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityCount | |
1399 -r SampleECAIFP -o Sample.sdf | |
1400 | |
1401 To generate extended connectivity bits fingerprints as hexadecimal bit-string corresponding to | |
1402 neighborhood radius up to 2 using atomic invariants atom types in vector string format and | |
1403 create a SampleECAIFP.csv file containing sequential compound IDs along with fingerprints | |
1404 vector strings data, type: | |
1405 | |
1406 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityBits | |
1407 -r SampleECAIFP -o Sample.sdf | |
1408 | |
1409 To generate extended connectivity bits fingerprints as binary bit-string corresponding to | |
1410 neighborhood radius up to 2 using atomic invariants atom types in vector string format and | |
1411 create a SampleECAIFP.csv file containing sequential compound IDs along with fingerprints | |
1412 vector strings data, type: | |
1413 | |
1414 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityBits | |
1415 --BitStringFormat BinaryString -r SampleECAIFP -o Sample.sdf | |
1416 | |
1417 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1418 2 using atomic invariants atom types in vector string format and create SampleECAIFP.sdf, SampleECAIFP.fpf | |
1419 and SampleECAIFP.csv files containing sequential compound IDs in CSV file along with fingerprints | |
1420 vector strings data, type: | |
1421 | |
1422 % ExtendedConnectivityFingerprints.pl --output all -r SampleECAIFP | |
1423 -o Sample.sdf | |
1424 | |
1425 To generate extended connectivity count fingerprints corresponding to neighborhood radius up to | |
1426 2 using atomic invariants atom types in vector string format and create SampleECAIFP.sdf, SampleECAIFP.fpf | |
1427 and SampleECAIFP.csv files containing sequential compound IDs in CSV file along with fingerprints | |
1428 vector strings data, type: | |
1429 | |
1430 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityCount | |
1431 --output all -r SampleECAIFP -o Sample.sdf | |
1432 | |
1433 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1434 2 using functional class atom types in vector string format and create a SampleECFCFP.csv file | |
1435 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1436 | |
1437 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes | |
1438 -r SampleECFCFP -o Sample.sdf | |
1439 | |
1440 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1441 2 using DREIDING atom types in vector string format and create a SampleECFP.csv file | |
1442 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1443 | |
1444 % ExtendedConnectivityFingerprints.pl -a DREIDINGAtomTypes | |
1445 -r SampleECFP -o Sample.sdf | |
1446 | |
1447 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1448 2 using E-state atom types in vector string format and create a SampleECFP.csv file | |
1449 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1450 | |
1451 % ExtendedConnectivityFingerprints.pl -a EStateAtomTypes | |
1452 -r SampleECFP -o Sample.sdf | |
1453 | |
1454 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1455 2 using MMFF94 atom types in vector string format and create a SampleECFP.csv file | |
1456 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1457 | |
1458 % ExtendedConnectivityFingerprints.pl -a MMFF94AtomTypes | |
1459 -r SampleECFP -o Sample.sdf | |
1460 | |
1461 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1462 2 using SLogP atom types in vector string format and create a SampleECFP.csv file | |
1463 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1464 | |
1465 % ExtendedConnectivityFingerprints.pl -a SLogPAtomTypes | |
1466 -r SampleECFP -o Sample.sdf | |
1467 | |
1468 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1469 2 using SYBYL atom types in vector string format and create a SampleECFP.csv file | |
1470 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1471 | |
1472 % ExtendedConnectivityFingerprints.pl -a SYBYLAtomTypes | |
1473 -r SampleECFP -o Sample.sdf | |
1474 | |
1475 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1476 2 using TPSA atom types in vector string format and create a SampleECFP.csv file | |
1477 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1478 | |
1479 % ExtendedConnectivityFingerprints.pl -a TPSAAtomTypes | |
1480 -r SampleECFP -o Sample.sdf | |
1481 | |
1482 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1483 2 using UFF atom types in vector string format and create a SampleECFP.csv file | |
1484 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1485 | |
1486 % ExtendedConnectivityFingerprints.pl -a UFFAtomTypes | |
1487 -r SampleECFP -o Sample.sdf | |
1488 | |
1489 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1490 3 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1491 file containing sequential compound IDs along with fingerprints vector strings data, type: | |
1492 | |
1493 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes -n 3 | |
1494 -r SampleECAIFP -o Sample.sdf | |
1495 | |
1496 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1497 3 using functional class atom types in vector string format and create a SampleECFCFP.csv file | |
1498 containing sequential compound IDs along with fingerprints vector strings data, type: | |
1499 | |
1500 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes -n 3 | |
1501 -r SampleECFCFP -o Sample.sdf | |
1502 | |
1503 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1504 2 using only AS,X atomic invariants atom types in vector string format and create a | |
1505 SampleECAIFP.csv file containing sequential compound IDs along with fingerprints vector | |
1506 strings data, type: | |
1507 | |
1508 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes | |
1509 --AtomicInvariantsToUse "AS,X" -r SampleECAIFP -o Sample.sdf | |
1510 | |
1511 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1512 2 using only HBD,HBA functional class atom types in vector string format and create a | |
1513 SampleECFCFP.csv file containing sequential compound IDs along with fingerprints vector | |
1514 strings data, type: | |
1515 | |
1516 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes | |
1517 --FunctionalClassesToUse "HBD,HBA" -r SampleECFCFP -o Sample.sdf | |
1518 | |
1519 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1520 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1521 file containing compound ID from molecule name line along with fingerprints vector strings | |
1522 data, type: | |
1523 | |
1524 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes | |
1525 --DataFieldsMode CompoundID --CompoundIDMode MolName | |
1526 -r SampleECAIFP -o Sample.sdf | |
1527 | |
1528 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1529 2 using functional class atom types in vector string format and create a SampleECFCFP.csv | |
1530 file containing compound IDs using specified data field along with fingerprints vector strings | |
1531 data, type: | |
1532 | |
1533 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes | |
1534 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID Mol_ID | |
1535 -r SampleECFCFP -o Sample.sdf | |
1536 | |
1537 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1538 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1539 file containing compound ID using combination of molecule name line and an explicit compound | |
1540 prefix along with fingerprints vector strings data, type: | |
1541 | |
1542 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes | |
1543 --DataFieldsMode CompoundID --CompoundIDMode MolnameOrLabelPrefix | |
1544 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleECAIFP -o Sample.sdf | |
1545 | |
1546 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1547 2 using functional class atom types in vector string format and create a SampleECFCFP.csv | |
1548 file containing specific data fields columns along with fingerprints vector strings | |
1549 data, type: | |
1550 | |
1551 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes | |
1552 --DataFieldsMode Specify --DataFields Mol_ID -r SampleECFCFP | |
1553 -o Sample.sdf | |
1554 | |
1555 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1556 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv | |
1557 file containing common data fields columns along with fingerprints vector strings data, type: | |
1558 | |
1559 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes | |
1560 --DataFieldsMode Common -r SampleECAIFP -o Sample.sdf | |
1561 | |
1562 To generate extended connectivity fingerprints corresponding to neighborhood radius up to | |
1563 2 using functional class atom types in vector string format and create SampleECFCFP.sdf, SampleECFCFP.fpf | |
1564 and SampleECFCFP.csv files containing all data fields columns in CSV file along with fingerprints | |
1565 vector strings data, type: | |
1566 | |
1567 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes | |
1568 --DataFieldsMode All --output all -r SampleECFCFP | |
1569 -o Sample.sdf | |
1570 | |
1571 =head1 AUTHOR | |
1572 | |
1573 Manish Sud <msud@san.rr.com> | |
1574 | |
1575 =head1 SEE ALSO | |
1576 | |
1577 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl, | |
1578 MACCSKeysFingerprints.pl, PathLengthFingerprints.pl, | |
1579 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl, | |
1580 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl | |
1581 | |
1582 =head1 COPYRIGHT | |
1583 | |
1584 Copyright (C) 2015 Manish Sud. All rights reserved. | |
1585 | |
1586 This file is part of MayaChemTools. | |
1587 | |
1588 MayaChemTools is free software; you can redistribute it and/or modify it under | |
1589 the terms of the GNU Lesser General Public License as published by the Free | |
1590 Software Foundation; either version 3 of the License, or (at your option) | |
1591 any later version. | |
1592 | |
1593 =cut |