comparison bin/AtomTypesFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: AtomTypesFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.25 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::AtomicInvariantsAtomTypes;
43 use AtomTypes::FunctionalClassAtomTypes;
44 use Fingerprints::AtomTypesFingerprints;
45
# File-scoped state shared with the subroutines defined below: command line
# options (%Options), processed option values (%OptionsInfo), the expanded
# list of input SD files (@SDFilesList) and per-file information (%SDFilesInfo).
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
my(@SDFilesList, %OptionsInfo, %SDFilesInfo, $FileIndex);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage when help is requested or no
# input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || !@ARGV);

@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files, skipping the ones which didn't pass the checks...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

FILE: for $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateAtomTypesFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91
92 ###############################################################################
93
# Generate atom types fingerprints for all compounds in one input SD file and
# write them to the output file(s) set up for that input file...
#
# Parameters:
#   $FileIndex - Index of the input file in @SDFilesList.
#
# Compounds rejected by the optional filter, and compounds for which
# fingerprints generation doesn't succeed, are counted as ignored. Summary
# statistics are written out after all compounds have been read.
#
sub GenerateAtomTypesFingerprints {
  my($FileIndex) = @_;

  my $SDFile = $SDFilesList[$FileIndex];

  # Setup output files; an entry stays undefined for output types not in use...
  my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

  my $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeFileIO->Open();

  my($CmpdCount, $IgnoredCmpdCount) = (0, 0);

  COMPOUND: while (my $Molecule = $MoleculeFileIO->ReadMolecule()) {
    $CmpdCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
      $IgnoredCmpdCount++;
      next COMPOUND;
    }

    my $AtomTypesFingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($AtomTypesFingerprints) {
      $IgnoredCmpdCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $AtomTypesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
  }
  $MoleculeFileIO->Close();

  # Close whichever output files were opened...
  for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
    $OutputFileIO->Close() if $OutputFileIO;
  }

  WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}
146
# Warn about a compound being ignored due to problems in fingerprints
# generation...
#
# Parameters:
#   $Mode      - Reason for ignoring the compound: ContainsNonElementalData,
#                ContainsNoElementalData or FingerprintsGenerationFailed.
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

  my $Reason;
  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)...";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Reason = "Compound contains no atom data...";
  }
  else {
    # FingerprintsGenerationFailed and any other mode value share this message...
    $Reason = "Fingerprints generation didn't succeed...";
  }

  warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}
174
# Check whether a compound needs to be filtered out before fingerprints
# generation...
#
# Parameters:
#   $CmpdCount - Record number of the compound in the input file.
#   $Molecule  - Molecule object for the compound.
#
# Returns 1, after issuing a warning via ProcessIgnoredCompound, for a compound
# which has any non-element count or which has no element count; otherwise,
# returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($ElementCount, $NonElementCount) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data takes precedence over missing elemental data...
  my $IgnoreMode = $NonElementCount ? 'ContainsNonElementalData'
    : !$ElementCount ? 'ContainsNoElementalData'
    : '';

  return 0 unless $IgnoreMode;

  ProcessIgnoredCompound($IgnoreMode, $CmpdCount, $Molecule);
  return 1;
}
195
# Print summary statistics for fingerprints generation to STDOUT...
#
# Parameters:
#   $CmpdCount        - Total number of compounds read.
#   $IgnoredCmpdCount - Number of compounds ignored.
#
# The number of successfully processed compounds is reported as the
# difference between the two counts.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n",
    "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
    "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208
# Create and open fingerprints file IO objects for the output types in use...
#
# Parameters:
#   $FileIndex - Index of the input file whose output file names, stored in
#                %SDFilesInfo, are to be used.
#
# Returns a list containing SD, text and FP fingerprints file IO objects, in
# that order; the entry for an output type not in use is undefined.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

  # Setup parameters common to all fingerprints file IO objects; these stay
  # empty for a mode which is neither AtomTypesBits nor AtomTypesCount...
  my %ModeSpecificParams = ();
  if ($OptionsInfo{Mode} =~ /^AtomTypesBits$/i) {
    %ModeSpecificParams = ('FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
  }
  elsif ($OptionsInfo{Mode} =~ /^AtomTypesCount$/i) {
    %ModeSpecificParams = ('FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
  }

  my %FingerprintsFileIOParams = ();
  if (%ModeSpecificParams) {
    %FingerprintsFileIOParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, %ModeSpecificParams);
  }

  if ($OptionsInfo{SDOutput}) {
    my $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];

    print "Generating SD file $NewFPSDFile...\n";
    $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %FingerprintsFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $NewFPSDFileIO->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];

    print "Generating FP file $NewFPFile...\n";
    $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %FingerprintsFileIOParams);
    $NewFPFileIO->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $NewFPTextFile...\n";
    $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %FingerprintsFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $NewFPTextFileIO->Open();
  }

  return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}
254
# Write fingerprints and other data for a compound to the open output files...
#
# Parameters:
#   $FileIndex             - Index of the input file being processed.
#   $CmpdCount             - Record number of the compound in the input file.
#   $Molecule              - Molecule object for the compound.
#   $AtomTypesFingerprints - Fingerprints object generated for the compound.
#   $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO
#                          - SD, text and FP fingerprints file IO objects;
#                            an undefined entry is skipped.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $AtomTypesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data fields are retrieved only for text and FP output...
  my $DataFieldLabelAndValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($NewFPSDFileIO) {
    my $CmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($AtomTypesFingerprints, $CmpdString);
  }

  if ($NewFPTextFileIO) {
    my $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPTextFileIO->WriteFingerprints($AtomTypesFingerprints, $ColValuesRef);
  }

  if ($NewFPFileIO) {
    my $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    $NewFPFileIO->WriteFingerprints($AtomTypesFingerprints, $CompoundID);
  }
}
287
# Generate appropriate column labels for FPText output file...
#
# Parameters:
#   $FileIndex - Index of the input file whose collected data field labels,
#                stored in %SDFilesInfo, are used in All and Common modes.
#
# Returns a reference to an array containing data column labels selected by
# the data fields mode followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;

  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

  my @ColLabels =
      ($DataFieldsMode =~ /^All$/i)        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
    : ($DataFieldsMode =~ /^Common$/i)     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
    : ($DataFieldsMode =~ /^Specify$/i)    ? @{$OptionsInfo{SpecifiedDataFields}}
    : ($DataFieldsMode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
    :                                        ();

  # Fingerprints label is always the last column...
  return [@ColLabels, $OptionsInfo{FingerprintsLabel}];
}
312
# Generate data column values for a compound in FPText output file...
#
# Parameters:
#   $FileIndex                  - Index of the input file being processed.
#   $CmpdCount                  - Record number of the compound.
#   $Molecule                   - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash mapping data field
#                                 labels to values for the compound.
#
# Returns a reference to an array of column values: compound ID in CompoundID
# mode; otherwise, one value for each data field label selected by the data
# fields mode, using an empty string for a label missing from the compound.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

  if ($DataFieldsMode =~ /^CompoundID$/i) {
    return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
  }

  # Pick the list of data field labels to report...
  my $DataFieldsRef;
  if ($DataFieldsMode =~ /^All$/i) {
    $DataFieldsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Common$/i) {
    $DataFieldsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
  }
  elsif ($DataFieldsMode =~ /^Specify$/i) {
    $DataFieldsRef = $OptionsInfo{SpecifiedDataFields};
  }
  else {
    return [];
  }

  return [ map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$DataFieldsRef} ];
}
335
# Generate compound ID for FP and FPText output files...
#
# Parameters:
#   $CmpdCount                  - Record number of the compound.
#   $Molecule                   - Molecule object for the compound.
#   $DataFieldLabelAndValuesRef - Reference to a hash mapping data field
#                                 labels to values for the compound.
#
# Returns compound ID according to the compound ID mode:
#   MolNameOrLabelPrefix - Molecule name or, for an empty name, label prefix
#                          followed by compound record number.
#   LabelPrefix          - Label prefix followed by compound record number.
#   DataField            - Value of the specified data field, or empty string
#                          for a compound without that data field.
#   MolName              - Molecule name.
# An empty string is returned for any other mode value.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $CompoundIDMode = $OptionsInfo{CompoundIDMode};

  if ($CompoundIDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName if $MolName;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($CompoundIDMode =~ /^DataField$/i) {
    my $SpecifiedDataField = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$SpecifiedDataField} ? $DataFieldLabelAndValuesRef->{$SpecifiedDataField} : '';
  }

  if ($CompoundIDMode =~ /^MolName$/i) {
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}
361
# Generate atom types fingerprints for a molecule...
#
# Parameters:
#   $Molecule - Molecule object; it's modified in place by keeping its largest
#               component (when requested), ring detection and aromaticity
#               detection.
#
# Returns fingerprints object, or undef when ring detection or fingerprints
# generation doesn't succeed. Dies for a mode which is neither AtomTypesCount
# nor AtomTypesBits.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Pick fingerprints type along with atom types set; AtomTypesBits always
  # uses fixed size atom types set...
  my($Type, $AtomTypesSetToUse);
  if ($OptionsInfo{Mode} =~ /^AtomTypesCount$/i) {
    ($Type, $AtomTypesSetToUse) = ('AtomTypesCount', $OptionsInfo{AtomTypesSetToUse});
  }
  elsif ($OptionsInfo{Mode} =~ /^AtomTypesBits$/i) {
    ($Type, $AtomTypesSetToUse) = ('AtomTypesBits', 'FixedSize');
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AtomTypesCount or AtomTypesBits\n";
  }

  my $AtomTypesFingerprints = Fingerprints::AtomTypesFingerprints->new('Molecule' => $Molecule, 'Type' => $Type, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType}, 'AtomTypesSetToUse' => $AtomTypesSetToUse, 'IgnoreHydrogens' => $OptionsInfo{IgnoreHydrogens});

  SetAtomIdentifierTypeValuesToUse($AtomTypesFingerprints);

  # Generate atom types fingerprints and make sure generation is successful...
  $AtomTypesFingerprints->GenerateFingerprints();

  return undef unless $AtomTypesFingerprints->IsFingerprintsGenerationSuccessful();

  return $AtomTypesFingerprints;
}
401
# Pass atom identifier type specific values on to fingerprints object...
#
# Parameters:
#   $AtomTypesFingerprints - Fingerprints object to configure.
#
# Atomic invariants or functional classes to use are set for the
# corresponding atom identifier types; nothing is set for the other supported
# types. Dies for an unsupported atom identifier type.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($AtomTypesFingerprints) = @_;

  my $AtomIdentifierType = $OptionsInfo{AtomIdentifierType};

  if ($AtomIdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    $AtomTypesFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
  }
  elsif ($AtomIdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    $AtomTypesFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
  }
  elsif ($AtomIdentifierType !~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Any type outside the list above is not supported...
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}
420
# Retrieve information about input SD files and set up output file names...
#
# For each file in @SDFilesList, entries at the same index are set in the
# arrays stored in %SDFilesInfo: FileOkay, OutFileRoot, SDOutFileNames,
# FPOutFileNames, TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef.
#
# A file is marked okay only after all of the following checks pass; a
# warning is issued and the file is skipped otherwise:
#   . File exists and has a SD file extension.
#   . Data field used for compound ID, when needed for text output, is
#     present in the compound string read from the start of the file.
#   . Name of the SD output file doesn't match the input file name.
#   . None of the output files exist, unless overwriting has been requested.
#
# Dies when an existing input file can't be opened.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out by marking the file as not okay...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open with a lexical file handle keeps the file name
      # from being interpreted as an open mode or a command...
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);

      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # Specified root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName . 'AtomTypesFP';
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Output file name is quoted so that characters such as '.', '+' or '('
      # in it are matched literally instead of being treated as a pattern...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
553
# Process option values: check interdependent values specified in %Options
# and store processed values in %OptionsInfo for use by rest of the script...
#
# Dies with an error message for an invalid combination of option values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};
  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  ProcessAtomIdentifierTypeOptions();

  # Atom types set: AtomTypesBits mode accepts only FixedSize; the other mode
  # uses specified value after checking it against atom identifier type...
  if ($Options{mode} =~ /^AtomTypesBits$/i) {
    if ($Options{atomtypessettouse} && $Options{atomtypessettouse} !~ /^FixedSize$/) {
      die "Error: The value specified, $Options{atomtypessettouse}, for option \"-e, --AtomTypesSetToUse\" is not valid. Allowed values for AtomTypesBits of \"-m, --mode\" option: FixedSize\n";
    }
    $OptionsInfo{AtomTypesSetToUse} = 'FixedSize';
  }
  else {
    if ($Options{atomidentifiertype} =~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes)$/i && $Options{atomtypessettouse} =~ /^FixedSize$/) {
      die "Error: The value specified, $Options{atomtypessettouse}, for option \"-e, --AtomTypesSetToUse\" is not valid during \"AtomicInvariantsAtomTypes or FunctionalClassAtomTypes\" value of \"-a, --AtomIdentifierType\". Allowed values: ArbitrarySize\n";
    }
    if ($Options{atomidentifiertype} =~ /^TPSAAtomTypes$/i && $Options{atomtypessettouse} =~ /^ArbitrarySize$/) {
      die "Error: The value specified, $Options{atomtypessettouse}, for option \"-e, --AtomTypesSetToUse\" is not valid during \"TPSAAtomTypes\" value of \"-a, --AtomIdentifierType\". Allowed values: FixedSize\n";
    }
    $OptionsInfo{AtomTypesSetToUse} = $Options{atomtypessettouse} || 'ArbitrarySize';
  }

  # Option values used as specified...
  @OptionsInfo{qw(BitsOrder BitStringFormat CompoundIDMode CompoundIDLabel DataFieldsMode FingerprintsLabelMode Output OutDelim)}
    = @Options{qw(bitsorder bitstringformat compoundidmode compoundidlabel datafieldsmode fingerprintslabelmode output outdelim)};

  # Yes/No option values are stored as 1/0 flags...
  for my $YesNoOptionRef (['IgnoreHydrogens', 'ignorehydrogens'], ['Filter', 'filter'], ['KeepLargestComponent', 'keeplargestcomponent'], ['OutQuote', 'quote']) {
    my($InfoKey, $OptionName) = @{$YesNoOptionRef};
    $OptionsInfo{$InfoKey} = ($Options{$OptionName} =~ /^Yes$/i) ? 1 : 0;
  }

  # Compound ID and specified data fields...
  #
  # NOTE(review): CompoundID is set only during CompoundID value of data fields
  # mode, although SetupCmpdIDForOutputFiles also uses it for FP output; confirm
  # that an empty value is intended for the other data fields modes.
  $OptionsInfo{SpecifiedDataFields} = [];
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n" unless $Options{compoundid};
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      $OptionsInfo{CompoundID} = $Options{compoundid} || 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n" unless $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, split(/,/, $Options{datafields});
  }

  # Fingerprints label; atom types are appended to it only for fixed size
  # atom types count fingerprints with IDs requested in the label...
  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} || 'AtomTypesFingerprints';

  if ($Options{fingerprintslabelmode} =~ /^FingerprintsLabelWithIDs$/ && $Options{mode} =~ /^(AtomTypesCount)$/i && $Options{atomtypessettouse} =~ /^FixedSize$/i) {
    my $FixedSizeAtomTypesSetRef = GetFixedSizeAtomTypesSet();
    $OptionsInfo{FingerprintsLabel} .= "; AtomTypes: " . TextUtil::JoinWords($FixedSizeAtomTypesSetRef, " ", 0);
  }

  # Output types...
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;

  # Vector string format: specified value or a default value picked using
  # specified atom types set...
  if ($Options{vectorstringformat}) {
    $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
  }
  else {
    $OptionsInfo{VectorStringFormat} = ($Options{atomtypessettouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
  }
}
655
# Process atom identifier type and related options...
#
# Stores specified atom identifier type in %OptionsInfo and processes atomic
# invariants or functional classes options for the corresponding types. Dies
# for an unsupported atom identifier type.
#
sub ProcessAtomIdentifierTypeOptions {
  my $AtomIdentifierType = $Options{atomidentifiertype};

  $OptionsInfo{AtomIdentifierType} = $AtomIdentifierType;

  if ($AtomIdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
  }
  elsif ($AtomIdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
  }
  elsif ($AtomIdentifierType !~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing else needs to be processed for the types listed above; any
    # other type is not supported...
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
}
675
# Process specified atomic invariants to use...
#
# Splits comma delimited value of "--AtomicInvariantsToUse" option, checks
# each atomic invariant and stores the list in
# $OptionsInfo{AtomicInvariantsToUse}.
#
# Dies for an empty option value, for an atomic invariant which is not
# available, or for a list without AS or AtomSymbol. Error messages end with a
# newline so that, like all other option errors in this script, no source
# file name and line number is appended to them.
#
sub ProcessAtomicInvariantsToUseOption {
  @{$OptionsInfo{AtomicInvariantsToUse}} = ();

  if (IsEmpty($Options{atomicinvariantstouse})) {
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
  }

  my $AtomSymbolSpecified = 0;

  for my $AtomicInvariant (split /\,/, $Options{atomicinvariantstouse}) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
      die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n";
    }
    if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
      $AtomSymbolSpecified = 1;
    }
    push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
  }

  # Atom symbol is a required atomic invariant...
  if (!$AtomSymbolSpecified) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n";
  }
}
700
# Process specified functional classes to use...
#
# Splits comma delimited value of "--FunctionalClassesToUse" option, checks
# each functional class and stores the list in
# $OptionsInfo{FunctionalClassesToUse}.
#
# Dies for an empty option value or for a functional class which is not
# available. Error messages end with a newline so that, like all other option
# errors in this script, no source file name and line number is appended to
# them.
#
sub ProcessFunctionalClassesToUse {
  @{$OptionsInfo{FunctionalClassesToUse}} = ();

  if (IsEmpty($Options{functionalclassestouse})) {
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
  }

  for my $FunctionalClass (split /\,/, $Options{functionalclassestouse}) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
      die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
  }
}
718
# Get fixed size atom types set for the atom identifier type in use...
#
# Returns the value provided by the atom types package corresponding to
# $OptionsInfo{AtomIdentifierType}; non-hydrogen atom types are requested when
# hydrogens are being ignored, except for TPSAAtomTypes where all possible
# atom types are always requested. Dies for an atom identifier type without a
# fixed size atom types set.
#
# NOTE(review): The package names used below don't have the "AtomTypes::"
# prefix used for AtomTypes::AtomicInvariantsAtomTypes elsewhere in this
# script and no matching "use" statements are visible in it; confirm that
# these functions are available at run time.
#
sub GetFixedSizeAtomTypesSet {
  my $AtomIdentifierType = $OptionsInfo{AtomIdentifierType};
  my $IgnoreHydrogens = $OptionsInfo{IgnoreHydrogens};

  my $AtomTypesRef = undef;

  if ($AtomIdentifierType =~ /^DREIDINGAtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? DREIDINGAtomTypes::GetAllPossibleDREIDINGNonHydrogenAtomTypes() : DREIDINGAtomTypes::GetAllPossibleDREIDINGAtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^EStateAtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? EStateAtomTypes::GetAllPossibleEStateNonHydrogenAtomTypes() : EStateAtomTypes::GetAllPossibleEStateAtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^MMFF94AtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? MMFF94AtomTypes::GetAllPossibleMMFF94NonHydrogenAtomTypes() : MMFF94AtomTypes::GetAllPossibleMMFF94AtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^SLogPAtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? SLogPAtomTypes::GetAllPossibleSLogPNonHydrogenAtomTypes() : SLogPAtomTypes::GetAllPossibleSLogPAtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^SYBYLAtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? SYBYLAtomTypes::GetAllPossibleSYBYLNonHydrogenAtomTypes() : SYBYLAtomTypes::GetAllPossibleSYBYLAtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^TPSAAtomTypes$/i) {
    $AtomTypesRef = TPSAAtomTypes::GetAllPossibleTPSAAtomTypes();
  }
  elsif ($AtomIdentifierType =~ /^UFFAtomTypes$/i) {
    $AtomTypesRef = $IgnoreHydrogens ? UFFAtomTypes::GetAllPossibleUFFNonHydrogenAtomTypes() : UFFAtomTypes::GetAllPossibleUFFAtomTypes();
  }
  else {
    die "Error: GetFixedSizeAtomTypesSet: Atom types set for atom indentifier type, $OptionsInfo{AtomIdentifierType}, is not available...";
  }

  return $AtomTypesRef;
}
766
# Set default values for the command line options, parse the command line into
# the file-scoped %Options hash, and validate the option values.
#
# Dies with a message naming the offending option at the first invalid value.
# When -w, --workingdir is specified, the current directory is changed to it
# before the remaining option values are validated.
#
sub SetupScriptUsage {
  my($YesOrNoPattern, @OptionSpecs);

  # Default values; anything specified on the command line overrides these...
  %Options = (
    aromaticitymodel => 'MayaChemToolsAromaticityModel',

    atomidentifiertype => 'AtomicInvariantsAtomTypes',
    atomicinvariantstouse => 'AS,X,BO,H,FC',
    functionalclassestouse => 'HBD,HBA,PI,NI,Ar,Hal',

    atomtypessettouse => 'ArbitrarySize',

    bitsorder => 'Ascending',
    bitstringformat => 'BinaryString',

    compoundidmode => 'LabelPrefix',
    compoundidlabel => 'CompoundID',
    datafieldsmode => 'CompoundID',

    filter => 'Yes',

    fingerprintslabelmode => 'FingerprintsLabelOnly',
    keeplargestcomponent => 'Yes',

    mode => 'AtomTypesCount',

    ignorehydrogens => 'Yes',

    output => 'text',
    outdelim => 'comma',
    quote => 'yes',

    vectorstringformat => '',
  );

  # Getopt::Long specifications for all supported options...
  @OptionSpecs = qw(
    aromaticitymodel=s
    atomidentifiertype|a=s
    atomicinvariantstouse=s
    functionalclassestouse=s
    atomtypessettouse|e=s
    bitsorder=s
    bitstringformat|b=s
    compoundid=s
    compoundidlabel=s
    compoundidmode=s
    datafields=s
    datafieldsmode|d=s
    filter|f=s
    fingerprintslabelmode=s
    fingerprintslabel=s
    help|h
    ignorehydrogens|i=s
    keeplargestcomponent|k=s
    mode|m=s
    outdelim=s
    output=s
    overwrite|o
    quote|q=s
    root|r=s
    vectorstringformat|v=s
    workingdir|w=s
  );

  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory first...
  if (my $WorkingDir = $Options{workingdir}) {
    -d $WorkingDir
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }

  $Options{atomidentifiertype} =~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i
    or die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";

  # NOTE(review): unlike the other value checks, this match is case sensitive;
  # confirm whether that is intentional before relaxing it.
  if ($Options{atomtypessettouse}) {
    $Options{atomtypessettouse} =~ /^(ArbitrarySize|FixedSize)$/
      or die "Error: The value specified, $Options{atomtypessettouse}, for option \"--AtomTypesSetToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
  }

  $Options{bitsorder} =~ /^(Ascending|Descending)$/i
    or die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";

  $Options{bitstringformat} =~ /^(BinaryString|HexadecimalString)$/i
    or die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";

  $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
    or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

  $Options{datafieldsmode} =~ /^(All|Common|Specify|CompoundID)$/i
    or die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";

  # Several options take a plain Yes or No value...
  $YesOrNoPattern = qr/^(Yes|No)$/i;

  $Options{filter} =~ $YesOrNoPattern
    or die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";

  $Options{fingerprintslabelmode} =~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i
    or die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";

  $Options{ignorehydrogens} =~ $YesOrNoPattern
    or die "Error: The value specified, $Options{ignorehydrogens}, for option \"-i, --IgnoreHydrogens\" is not valid. Allowed values: Yes or No\n";

  $Options{keeplargestcomponent} =~ $YesOrNoPattern
    or die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";

  $Options{mode} =~ /^(AtomTypesCount|AtomTypesBits)$/i
    or die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: AtomTypesCount, or AtomTypesBits\n";

  $Options{output} =~ /^(SD|FP|text|all)$/i
    or die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{quote} =~ $YesOrNoPattern
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

  # Semicolon delimited output is rejected unless column values are quoted...
  die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n"
    if $Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i;

  # An empty vector string format is accepted as is; only a specified value is checked...
  die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n"
    if $Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i;
}
867
868 __END__
869
870 =head1 NAME
871
872 AtomTypesFingerprints.pl - Generate atom types fingerprints for SD files
873
874 =head1 SYNOPSIS
875
876 AtomTypesFingerprints.pl SDFile(s)...
877
878 AtomTypesFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
879 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes |
880 DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>]
881 [B<--AtomicInvariantsToUse> I<"AtomicInvariant, AtomicInvariant...">]
882 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">]
883 [B<-e, --AtomTypesSetToUse> I<ArbitrarySize | FixedSize>]
884 [B<--BitsOrder> I<Ascending | Descending>] [B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>]
885 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
886 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
887 [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">] [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>]
888 [B<-f, --Filter> I<Yes | No>] [B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>] [B<--FingerprintsLabel> I<text>]
889 [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
890 [B<-m, --mode> I<AtomTypesCount | AtomTypesBits>] [B<-i, --IgnoreHydrogens> I<Yes | No>]
891 [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD |FP | text | all>] [B<-o, --overwrite>]
892 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>]
893 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
894 [B<-w, --WorkingDir> I<DirName>]
895
896 =head1 DESCRIPTION
897
898 Generate atom types fingerprints for I<SDFile(s)> and create appropriate SD, FP or
899 CSV/TSV text file(s) containing fingerprints bit-vector or vector strings corresponding to
900 molecular fingerprints.
901
902 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
903 and I<.sd>. All other file names are ignored. All the SD files in a current directory
904 can be specified either by I<*.sdf> or the current directory name.
905
906 The current release of MayaChemTools supports generation of atom types fingerprints
907 corresponding to following B<-a, --AtomIdentifierTypes>:
908
909 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
910 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
911 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
912
913 Based on the values specified for B<-a, --AtomIdentifierType> along with other specified
914 parameters such as B<--AtomicInvariantsToUse> and B<--FunctionalClassesToUse>, initial
915 atom types are assigned to all non-hydrogen atoms or all atoms in a molecule.
916
917 Using the assigned atom types and specified B<-m, --Mode>, one of the following types of
918 fingerprints is generated:
919
920 AtomTypesCount - A vector containing count of atom types
921 AtomTypesBits - A bit vector indicating presence/absence of atom types
922
923 For I<AtomTypesCount> fingerprints, two types of atom types set size are allowed as
924 value of B<--AtomTypesSetToUse> option:
925
926 ArbitrarySize - Corresponds to only atom types detected in molecule
927 FixedSize - Corresponds to fixed number of atom types previously defined
928
929 For I<AtomTypesBits> fingerprints, only I<FixedSize> atom type set is allowed.
930
931 I<ArbitrarySize> corresponds to atom types detected in a molecule whereas I<FixedSize> implies
932 a fixed number of all possible atom types previously defined for a specific B<-a, --AtomIdentifierType>.
933
934 The fixed numbers of all possible atom types for supported I<AtomIdentifierTypes> in the current release
935 of MayaChemTools are:
936
937 AtomIdentifier Total TotalWithoutHydrogens
938
939 DREIDINGAtomTypes 37 34
940 EStateAtomTypes 109 87
941 MMFF94AtomTypes 212 171
942 SLogPAtomTypes 72 67
943 SYBYLAtomTypes 45 44
944 TPSAAtomTypes 47 47
945 UFFAtomTypes 126 124
946
947 The current release of MayaChemTools generates the following atom types fingerprints
948 bit-vector and vector strings:
949
950 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitraryS
951 ize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2
952 .BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1
953 O.X1.BO2;2 4 14 3 10 1 1 1 3 2
954
955 FingerprintsVector;AtomTypesCount:DREIDINGAtomTypes:ArbitrarySize;8;Nu
956 mericalValues;IDsAndValuesString;C_2 C_3 C_R F_ N_3 N_R O_2 O_3;2 9 22
957 1 1 1 2 3
958
959 FingerprintsVector;AtomTypesCount:DREIDINGAtomTypes:FixedSize;34;Order
960 edNumericalValues;IDsAndValuesString;B_3 B_2 C_3 C_R C_2 C_1 N_3 N_R N
961 _2 N_1 O_3 O_R O_2 O_1 F_ Al3 Si3 P_3 S_3 Cl Ga3 Ge3 As3 Se3 Br In3 Sn
962 3 Sb3 Te3 I_ Na Ca Fe Zn;0 0 9 22 2 0 1 1 0 0 3 0 2 0 1 0 0 0 0 0 0 0
963 0 0 0 0 0 0 0 0 0 0 0 0
964
965 FingerprintsBitVector;AtomTypesBits:DREIDINGAtomTypes:FixedSize;34;Bin
966 aryString;Ascending;0011101100101010000000000000000000000000
967
968 FingerprintsVector;AtomTypesCount:EStateAtomTypes:ArbitrarySize;11;Num
969 ericalValues;IDsAndValuesString;aaCH aasC aasN dO dssC sCH3 sF sOH ssC
970 H2 ssNH sssCH;14 8 1 2 2 2 1 3 4 1 3
971
972 FingerprintsVector;AtomTypesCount:EStateAtomTypes:FixedSize;87;Ordered
973 NumericalValues;IDsAndValuesString;sLi ssBe ssssBem sBH2 ssBH sssB sss
974 sBm sCH3 dCH2 ssCH2 tCH dsCH aaCH sssCH ddC tsC dssC aasC aaaC ssssC s
975 NH3p sNH2 ssNH2p dNH ssNH aaNH tN sssNHp dsN aaN sssN ddsN aasN ss...;
976 0 0 0 0 0 0 0 2 0 4 0 0 14 3 0 0 2 8 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 3 2 0 0
977 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
978
979 FingerprintsBitVector;AtomTypesBits:EStateAtomTypes:FixedSize;87;Binar
980 yString;Ascending;0000000101001100110000001000000010110000100000000000
981 000000000000000000000000000000000000
982
983 FingerprintsVector;AtomTypesCount:FunctionalClassAtomTypes:ArbitrarySi
984 ze;8;NumericalValues;IDsAndValuesString;Ar Ar.HBA HBA HBA.HBD HBD Hal
985 NI None;22 1 2 3 1 1 1 10
986
987 FingerprintsVector;AtomTypesCount:MMFF94AtomTypes:ArbitrarySize;13;Num
988 ericalValues;IDsAndValuesString;C5A C5B C=ON CB COO CR F N5 NC=O O=CN
989 O=CO OC=O OR;2 2 1 18 1 9 1 1 1 1 1 1 2
990
991 FingerprintsVector;AtomTypesCount:MMFF94AtomTypes:FixedSize;171;Ordere
992 dNumericalValues;IDsAndValuesString;CR C=C CSP2 C=O C=N CGD C=OR C=ON
993 CONN COO COON COOO C=OS C=S C=SN CSO2 CS=O CSS C=P CSP =C= OR OC=O OC=
994 C OC=N OC=S ONO2 ON=O OSO3 OSO2 OSO OS=O -OS OPO3 OPO2 OPO -OP -O-...;
995 9 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0
996 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
997 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 18 0 0 0 0 0 0 0 0 ...
998
999 FingerprintsBitVector;AtomTypesBits:MMFF94AtomTypes:FixedSize;171;Bina
1000 ryString;Ascending;100000010100000000000110000000000000000101000000100
1001 0100000000000000000000000000000000000000000100000000000000000000000000
1002 0000000011000000000000000001000000000000000000000000000
1003
1004 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:ArbitrarySize;16;Nume
1005 ricalValues;IDsAndValuesString;C1 C10 C11 C14 C18 C20 C21 C22 C5 CS F
1006 N11 N4 O10 O2 O9;5 1 1 1 14 4 2 1 2 2 1 1 1 1 3 1
1007
1008 FingerprintsVector;AtomTypesCount:SLogPAtomTypes:FixedSize;67;OrderedN
1009 umericalValues;IDsAndValuesString;C1 C2 C3 C4 C5 C6 C7 C8 C9 C10 C11 C
1010 12 C13 C14 C15 C16 C17 C18 C19 C20 C21 C22 C23 C24 C25 C26 C27 CS N1 N
1011 2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N12 N13 N14 NS O1 O2 O3 O4 O5 O6 O7 O8
1012 O9 O10 O11 O12 OS F Cl Br I Hal P S1 S2 S3 Me1 Me2;5 0 0 0 2 0 0 0 0 1
1013 1 0 0 1 0 0 0 14 0 4 2 1 0 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0...
1014
1015 FingerprintsBitVector;AtomTypesBits:SLogPAtomTypes:FixedSize;67;Binary
1016 String;Ascending;10001000011001000101110000010001000000100000100000011
1017 0001000000000000000
1018
1019 FingerprintsVector;AtomTypesCount:SYBYLAtomTypes:ArbitrarySize;9;Numer
1020 icalValues;IDsAndValuesString;C.2 C.3 C.ar F N.am N.ar O.2 O.3 O.co2;2
1021 9 22 1 1 1 1 2 2
1022
1023 FingerprintsVector;AtomTypesCount:SYBYLAtomTypes:FixedSize;44;OrderedN
1024 umericalValues;IDsAndValuesString;C.3 C.2 C.1 C.ar C.cat N.3 N.2 N.1 N
1025 .ar N.am N.pl3 N.4 O.3 O.2 O.co2 S.3 S.2 S.o S.o2 P.3 F Cl Br I ANY HA
1026 L HET Li Na Mg Al Si K Ca Cr.th Cr.oh Mn Fe Co.oh Cu Zn Se Mo Sn;9 2 0
1027 22 0 0 0 0 1 1 0 0 2 1 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1028 0 0 0 0 0 0 0
1029
1030 FingerprintsBitVector;AtomTypesBits:SYBYLAtomTypes:FixedSize;44;Binary
1031 String;Ascending;110100001100111000001000000000000000000000000000
1032
1033 FingerprintsVector;AtomTypesCount:TPSAAtomTypes:FixedSize;47;OrderedNu
1034 mericalValues;IDsAndValuesString;N1 N2 N3 N4 N5 N6 N7 N8 N9 N10 N11 N1
1035 2 N13 N14 N15 N16 N17 N18 N19 N20 N21 N22 N23 N24 N25 N26 N O1 O2 O3 O
1036 4 O5 O6 O S1 S2 S3 S4 S5 S6 S7 S P1 P2 P3 P4 P;0 0 0 0 0 0 1 0 0 0 0 0
1037 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 2 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1038
1039 FingerprintsBitVector;AtomTypesBits:TPSAAtomTypes:FixedSize;47;BinaryS
1040 tring;Ascending;000000100000000000001000000001100000000000000000
1041
1042 FingerprintsVector;AtomTypesCount:UFFAtomTypes:ArbitrarySize;8;Numeric
1043 alValues;IDsAndValuesString;C_2 C_3 C_R F_ N_3 N_R O_2 O_3;2 9 22 1 1
1044 1 2 3
1045
1046 FingerprintsVector;AtomTypesCount:UFFAtomTypes;124;OrderedNumerical
1047 Values;IDsAndValuesString;He4+4 Li Be3+2 B_3 B_2 C_3 C_R C_2 C_1 N_3 N_
1048 R N_2 N_1 O_3 O_3_z O_R O_2 O_1 F_ Ne4+4 Na Mg3+2 Al3 Si3 P_3+3 P_3+5 P
1049 _3+q S_3+2 S_3+4 S_3+6 S_R S_2 Cl Ar4+4 K_ Ca6+2 Sc3+3 Ti3+4 Ti6+4 V_3+
1050 ;0 0 0 0 0 12 0 3 0 3 0 1 0 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1051 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1052
1053 FingerprintsVector;AtomTypesCount:UFFAtomTypes:FixedSize;124;OrderedNu
1054 mericalValues;IDsAndValuesString;He4+4 Li Be3+2 B_3 B_2 C_3 C_R C_2 C_
1055 1 N_3 N_R N_2 N_1 O_3 O_3_z O_R O_2 O_1 F_ Ne4+4 Na Mg3+2 Al3 Si3 P_3+
1056 3 P_3+5 P_3+q S_3+2 S_3+4 S_3+6 S_R S_2 Cl Ar4+4 K_ Ca6+2 Sc3+3 Ti...;
1057 0 0 0 0 0 9 22 2 0 1 1 0 0 3 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1058 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
1060
1061 FingerprintsBitVector;AtomTypesBits:UFFAtomTypes:FixedSize;124;BinaryS
1062 tring;Ascending;000001110110010010100000000000000000000000000000000000
1063 0000000000000000000000000000000000000000000000000000000000000000000000
1064
1065 =head1 OPTIONS
1066
1067 =over 4
1068
1069 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
1070
1071 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
1072 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
1073 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
1074 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
1075
1076 The supported aromaticity model names along with model specific control parameters
1077 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
1078 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
1079 this file during class instantiation and makes it available to method B<DetectAromaticity>
1080 for detecting aromaticity corresponding to a specific model.
1081
1082 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
1083
1084 Specify atom identifier type to use for assignment of atom types to hydrogen and/or
1085 non-hydrogen atoms during calculation of atom types fingerprints. Possible values in the
1086 current release are: I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
1087 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
1088 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
1089
1090 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">
1091
1092 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType>
1093 option. It's a list of comma separated valid atomic invariant atom types.
1094
1095 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
1096 H, Ar, RA, FC, MN, SM>. Default value: I<AS,X,BO,H,FC>.
1097
1098 The atomic invariants abbreviations correspond to:
1099
1100 AS = Atom symbol corresponding to element symbol
1101
1102 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
1103 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
1104 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
1105 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
1106 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
1107 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
1108 H<n> = Number of implicit and explicit hydrogens for atom
1109 Ar = Aromatic annotation indicating whether atom is aromatic
1110 RA = Ring atom annotation indicating whether atom is in a ring
1111 FC<+n/-n> = Formal charge assigned to atom
1112 MN<n> = Mass number indicating isotope other than most abundant isotope
1113 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
1114 3 (triplet)
1115
1116 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1117
1118 AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1119
1120 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1121 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1122
1123 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1124 are also allowed:
1125
1126 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1127 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1128 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1129 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1130 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1131 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1132 H : NumOfImplicitAndExplicitHydrogens
1133 Ar : Aromatic
1134 RA : RingAtom
1135 FC : FormalCharge
1136 MN : MassNumber
1137 SM : SpinMultiplicity
1138
1139 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1140 atom types.
1141
1142 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">
1143
1144 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType>
1145 option. It's a list of comma separated valid functional classes.
1146
1147 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1148 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1149
1150 The functional class abbreviations correspond to:
1151
1152 HBD: HydrogenBondDonor
1153 HBA: HydrogenBondAcceptor
1154 PI : PositivelyIonizable
1155 NI : NegativelyIonizable
1156 Ar : Aromatic
1157 Hal : Halogen
1158 H : Hydrophobic
1159 RA : RingAtom
1160 CA : ChainAtom
1161
1162 Functional class atom type specification for an atom corresponds to:
1163
1164 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1165
1166 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1167 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1168
1169 HydrogenBondDonor: NH, NH2, OH
1170 HydrogenBondAcceptor: N[!H], O
1171 PositivelyIonizable: +, NH2
1172 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1173
1174
1175 =item B<-e, --AtomTypesSetToUse> I<ArbitrarySize | FixedSize>
1176
1177 Atom types set size to use during generation of atom types fingerprints.
1178
1179 Possible values for I<AtomTypesCount> values of B<-m, --mode> option: I<ArbitrarySize |
1180 FixedSize>; Default value: I<ArbitrarySize>.
1181
1182 Possible values for I<AtomTypesBits> value of B<-m, --mode> option: I<FixedSize>;
1183 Default value: I<FixedSize>.
1184
1185 I<FixedSize> value is not supported for I<AtomicInvariantsAtomTypes> value of
1186 B<-a, --AtomIdentifierType> option.
1187
1188 I<ArbitrarySize> corresponds to only atom types detected in molecule; I<FixedSize> corresponds
1189 to fixed number of previously defined atom types for specified B<-a, --AtomIdentifierType>.
1190
1191 =item B<--BitsOrder> I<Ascending | Descending>
1192
1193 Bits order to use during generation of fingerprints bit-vector string for I<AtomTypesBits> value of
1197 B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>.
1198
1199 I<Ascending> bit order corresponds to the first bit in each byte being the lowest bit as
1200 opposed to the highest bit.
1201
1202 Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless
1203 of machine order, big-endian or little-endian, vec function always considers first
1204 string byte as the lowest byte and first bit within each byte as the lowest bit.
1205
1206 =item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>
1207
1208 Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by
1209 B<--output> used during I<AtomTypesBits> value of B<-m, --mode> option. Possible
1210 values: I<BinaryString, HexadecimalString>. Default value: I<BinaryString>.
1211
1212 I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString>
1213 contains bit values in ASCII hexadecimal format.
1214
1215 Examples:
1216
1217 FingerprintsBitVector;AtomTypesBits:DREIDINGAtomTypes;34;BinaryString;
1218 Ascending;0010101010101000000000000000000000000000
1219
1220 FingerprintsBitVector;AtomTypesBits:MMFF94AtomTypes;171;BinaryString;
1221 Ascending;1000010101000000000001100000000000000001010000101000000000000
1222 00000000000000000000000000000000000001000000000000000000000000000000000
1223 0000000000000000000000000000000000000000000
1224
1225 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1226
1227 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1228
1229 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1230 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1231 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1232 look like Cmpd<Number>.
1233
1234 Examples for I<DataField> value of B<--CompoundIDMode>:
1235
1236 MolID
1237 ExtReg
1238
1239 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1240
1241 Compound
1242
1243 The value specified above generates compound IDs which correspond to Compound<Number>
1244 instead of default value of Cmpd<Number>.
1245
1246 =item B<--CompoundIDLabel> I<text>
1247
1248 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value
1249 of B<--DataFieldsMode> option. Default: I<CompoundID>.
1250
1251 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1252
1253 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1254 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1255 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1256 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1257
1258 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1259 Default: I<LabelPrefix>.
1260
1261 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1262 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1263 values are replaced with sequential compound IDs.
1264
1265 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1266
1267 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1268
1269 Comma delimited list of I<SDFile(s)> data fields to extract and write to CSV/TSV text file(s) along
1270 with generated fingerprints for I<text | all> values of B<--output> option.
1271
1272 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1273
1274 Examples:
1275
1276 Extreg
1277 MolID,CompoundName
1278
1279 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1280
1281 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1282 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1283 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1284 generate a compound ID using molname line, a compound prefix, or a combination of both.
1285 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1286
1287 =item B<-f, --Filter> I<Yes | No>
1288
1289 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1290 Default value: I<Yes>.
1291
1292 By default, compound data is checked before calculating fingerprints and compounds containing
1293 atom data corresponding to non-element symbols or no atom data are ignored.
1294
1295 =item B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>
1296
1297 Specify how fingerprints label is generated in conjunction with B<--FingerprintsLabel> option value:
1298 use fingerprints label generated only by B<--FingerprintsLabel> option value or append atom type
1299 value IDs to B<--FingerprintsLabel> option value.
1300
1301 Possible values: I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>. Default value:
1302 I<FingerprintsLabelOnly>.
1303
1304 This option is only used for I<FixedSize> value of B<-e, --AtomTypesSetToUse> option during
1305 generation of I<AtomTypesCount> fingerprints and ignored for I<AtomTypesBits>.
1306
1307 Atom type IDs appended to B<--FingerprintsLabel> value during I<FingerprintsLabelWithIDs>
1308 values of B<--FingerprintsLabelMode> correspond to fixed number of previously defined
1309 atom types.
1310
1311 =item B<--FingerprintsLabel> I<text>
1312
1313 SD data label or text file column label to use for fingerprints string in output SD or
1314 CSV/TSV text file(s) specified by B<--output>. Default value: I<AtomTypesFingerprints>.
1315
1316 =item B<-h, --help>
1317
1318 Print this help message.
1319
1320 =item B<-i, --IgnoreHydrogens> I<Yes | No>
1321
1322 Ignore hydrogens during fingerprints generation. Possible values: I<Yes or No>.
1323 Default value: I<Yes>.
1324
1325 For I<No> value of B<-i, --IgnoreHydrogens>, any explicit hydrogens are also used for
1326 generation of atom type fingerprints; implicit hydrogens are still ignored.
1327
1328 =item B<-k, --KeepLargestComponent> I<Yes | No>
1329
1330 Generate fingerprints for only the largest component in molecule. Possible values:
1331 I<Yes or No>. Default value: I<Yes>.
1332
1333 For molecules containing multiple connected components, fingerprints can be generated
1334 in two different ways: use all connected components or just the largest connected
1335 component. By default, all atoms except for the largest connected component are
1336 deleted before generation of fingerprints.
1337
1338 =item B<-m, --mode> I<AtomTypesCount | AtomTypesBits>
1339
1340 Specify type of atom types fingerprints to generate for molecules in I<SDFile(s)>.
1341 Possible values: I<AtomTypesCount or AtomTypesBits>. Default value: I<AtomTypesCount>.
1342
1343 For I<AtomTypesCount> values of B<-m, --mode> option, a fingerprint vector string is generated.
1344 The vector string corresponding to I<AtomTypesCount> contains count of atom types.
1345
1346 For I<AtomTypesBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing
1347 zeros and ones indicating presence or absence of atom types is generated.
1348
1349 For I<AtomTypesCount> atom types fingerprints, two types of atom types set size can be specified
1350 using B<-e, --AtomTypesSetToUse> option: I<ArbitrarySize or FixedSize>. I<ArbitrarySize> corresponds
1351 to only atom types detected in molecule; I<FixedSize> corresponds to fixed number of atom types
1352 previously defined.
1353
1354 For I<AtomTypesBits> atom types fingerprints, only I<FixedSize> is allowed.
1355
1356 Combination of B<-m, --Mode> and B<--AtomTypesSetToUse> along with B<-a, --AtomIdentifierType>
1357 allows generation of following different atom types fingerprints:
1358
1359 Mode AtomIdentifierType AtomTypesSetToUse
1360
1361 AtomTypesCount AtomicInvariantsAtomTypes ArbitrarySize [ Default ]
1362
1363 AtomTypesCount DREIDINGAtomTypes ArbitrarySize
1364 AtomTypesCount DREIDINGAtomTypes FixedSize
1365 AtomTypesBits DREIDINGAtomTypes FixedSize
1366
1367 AtomTypesCount EStateAtomTypes ArbitrarySize
1368 AtomTypesCount EStateAtomTypes FixedSize
1369 AtomTypesBits EStateAtomTypes FixedSize
1370
1371 AtomTypesCount FunctionalClassAtomTypes ArbitrarySize
1372
1373 AtomTypesCount MMFF94AtomTypes ArbitrarySize
1374 AtomTypesCount MMFF94AtomTypes FixedSize
1375 AtomTypesBits MMFF94AtomTypes FixedSize
1376
1377 AtomTypesCount SLogPAtomTypes ArbitrarySize
1378 AtomTypesCount SLogPAtomTypes FixedSize
1379 AtomTypesBits SLogPAtomTypes FixedSize
1380
1381 AtomTypesCount SYBYLAtomTypes ArbitrarySize
1382 AtomTypesCount SYBYLAtomTypes FixedSize
1383 AtomTypesBits SYBYLAtomTypes FixedSize
1384
1385 AtomTypesCount TPSAAtomTypes FixedSize
1386 AtomTypesBits TPSAAtomTypes FixedSize
1387
1388 AtomTypesCount UFFAtomTypes ArbitrarySize
1389 AtomTypesCount UFFAtomTypes FixedSize
1390 AtomTypesBits UFFAtomTypes FixedSize
1391
1392 The default is to generate I<AtomicInvariantsAtomTypes> fingerprints corresponding to I<ArbitrarySize> as
1393 value of B<--AtomTypesSetToUse> option.
1394
1395 =item B<--OutDelim> I<comma | tab | semicolon>
1396
1397 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1398 Default value: I<comma>.
1399
1400 =item B<--output> I<SD | FP | text | all>
1401
1402 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1403
1404 =item B<-o, --overwrite>
1405
1406 Overwrite existing files.
1407
1408 =item B<-q, --quote> I<Yes | No>
1409
1410 Put quotes around column values in output CSV/TSV text file(s). Possible values:
1411 I<Yes or No>. Default value: I<Yes>.
1412
1413 =item B<-r, --root> I<RootName>
1414
1415 New file name is generated using the root: <Root>.<Ext>. Default for new file
1416 names: <SDFileName><AtomTypesFP>.<Ext>. The file type determines <Ext> value.
1417 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1418 delimited text files, respectively. This option is ignored for multiple input files.
1419
1420 =item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1421
1422 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1423 B<--output> used during I<AtomTypesCount> value of B<-m, --mode> option. Possible values:
1424 I<ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString, or
1425 ValuesAndIDsPairsString>.
1426
1427 Default value during I<ArbitrarySize> value of B<-e, --AtomTypesSetToUse>
1428 option: I<IDsAndValuesString>. Default value during I<FixedSize> value of
1429 B<-e, --AtomTypesSetToUse> option: I<ValuesString>.
1430
1431 Example of I<SD> file containing atom types fingerprints string data:
1432
1433 ... ...
1434 ... ...
1435 $$$$
1436 ... ...
1437 ... ...
1438 ... ...
1439 41 44 0 0 0 0 0 0 0 0999 V2000
1440 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1441 ... ...
1442 2 3 1 0 0 0 0
1443 ... ...
1444 M END
1445 > <CmpdID>
1446 Cmpd1
1447
1448 > <AtomTypesFingerprints>
1449 FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:ArbitrarySi
1450 ze;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.H2 C.X2.B
1451 O3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.BO1.H1 O.
1452 X1.BO2;2 4 14 3 10 1 1 1 3 2
1453
1454 $$$$
1455 ... ...
1456 ... ...
1457
1458 Example of I<FP> file containing atom types fingerprints string data:
1459
1460 #
1461 # Package = MayaChemTools 7.4
1462 # Release Date = Oct 21, 2010
1463 #
1464 # TimeStamp = Fri Mar 11 14:28:07 2011
1465 #
1466 # FingerprintsStringType = FingerprintsVector
1467 #
1468 # Description = AtomTypesCount:AtomicInvariantsAtomTypes:ArbitrarySize
1469 # VectorStringFormat = IDsAndValuesString
1470 # VectorValuesType = NumericalValues
1471 #
1472 Cmpd1 10;C.X1.BO1.H3 C.X2.BO2.H2 C.X2.BO3.H1 C.X3.BO3.H1 C.X3.BO4 F...
1473 Cmpd2 9;C.X1.BO1.H3 C.X2.BO2.H2 C.X3.BO3.H1 C.X3.BO4 N.X1.BO1.H2 N....
1474 ... ...
1475 ... ..
1476
1477 Example of CSV I<Text> file containing atom types fingerprints string data:
1478
1479 "CompoundID","AtomTypesFingerprints"
1480 "Cmpd1","FingerprintsVector;AtomTypesCount:AtomicInvariantsAtomTypes:Ar
1481 bitrarySize;10;NumericalValues;IDsAndValuesString;C.X1.BO1.H3 C.X2.BO2.
1482 H2 C.X2.BO3.H1 C.X3.BO3.H1 C.X3.BO4 F.X1.BO1 N.X2.BO2.H1 N.X3.BO3 O.X1.
1483 BO1.H1 O.X1.BO2;2 4 14 3 10 1 1 1 3 2"
1484 O.X1.BO2;3 3 6 3 1 1 2 2 2"
1485 ... ...
1486 ... ...
1487
1488 Examples:
1489
1490 FingerprintsVector;AtomTypesCount:EStateAtomTypes:ArbitrarySize;11;Num
1491 ericalValues;IDsAndValuesString;aaCH aasC aasN dO dssC sCH3 sF sOH ssC
1492 H2 ssNH sssCH;14 8 1 2 2 2 1 3 4 1 3
1493
1494 FingerprintsVector;AtomTypesCount:SYBYLAtomTypes:ArbitrarySize;9;Numer
1495 icalValues;IDsAndValuesString;C.2 C.3 C.ar F N.am N.ar O.2 O.3 O.co2;2
1496 9 22 1 1 1 1 2 2
1497
1498 FingerprintsVector;AtomTypesCount:SYBYLAtomTypes:FixedSize;44;OrderedN
1499 umericalValues;IDsAndValuesString;C.3 C.2 C.1 C.ar C.cat N.3 N.2 N.1 N
1500 .ar N.am N.pl3 N.4 O.3 O.2 O.co2 S.3 S.2 S.o S.o2 P.3 F Cl Br I ANY HA
1501 L HET Li Na Mg Al Si K Ca Cr.th Cr.oh Mn Fe Co.oh Cu Zn Se Mo Sn;9 2 0
1502 22 0 0 0 0 1 1 0 0 2 1 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1503 0 0 0 0 0 0 0
1504
1505 =item B<-w, --WorkingDir> I<DirName>
1506
1507 Location of working directory. Default: current directory.
1508
1509 =back
1510
1511 =head1 EXAMPLES
1512
1513 To generate atomic invariants atom types count fingerprints of arbitrary size in vector
1514 string format and create a SampleATFP.csv file containing sequential compound IDs along
1515 with fingerprints vector strings data, type:
1516
1517 % AtomTypesFingerprints.pl -r SampleATFP -o Sample.sdf
1518
1519 To generate functional class atom types count fingerprints of arbitrary size in vector
1520 string format and create a SampleATFP.csv file containing sequential compound IDs along
1521 with fingerprints vector strings data, type:
1522
1523 % AtomTypesFingerprints.pl -m AtomTypesCount -a FunctionalClassAtomTypes
1524 -r SampleATFP -o Sample.sdf
1525
1526 To generate E-state atom types count fingerprints of arbitrary size in vector string
1527 format and create a SampleATFP.csv file containing sequential compound IDs along
1528 with fingerprints vector strings data, type:
1529
1530 % AtomTypesFingerprints.pl -m AtomTypesCount -a EStateAtomTypes
1531 --AtomTypesSetToUse ArbitrarySize -r SampleATFP -o Sample.sdf
1532
1533 To generate E-state atom types count fingerprints of fixed size in vector string
1534 with IDsAndValues format and create a SampleATFP.csv file containing sequential
1535 compound IDs along with fingerprints vector strings data, type:
1536
1537 % AtomTypesFingerprints.pl -m AtomTypesCount -a EStateAtomTypes
1538 --AtomTypesSetToUse FixedSize -v IDsAndValuesString
1539 -r SampleATFP -o Sample.sdf
1540
1541 To generate E-state atom types bits fingerprints of fixed size in bit-vector string
1542 format and create a SampleATFP.csv file containing sequential compound IDs along
1543 with fingerprints vector strings data, type:
1544
1545 % AtomTypesFingerprints.pl -m AtomTypesBits -a EStateAtomTypes
1546 --AtomTypesSetToUse FixedSize -r SampleATFP -o Sample.sdf
1547
1548 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1549 format and create a SampleATFP.csv file containing sequential compound IDs along
1550 with fingerprints vector strings data, type:
1551
1552 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1553 --AtomTypesSetToUse ArbitrarySize -r SampleATFP -o Sample.sdf
1554
1555 To generate MMFF94 atom types count fingerprints of fixed size in vector string
1556 format and create a SampleATFP.csv file containing sequential compound IDs along
1557 with fingerprints vector strings data, type:
1558
1559 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1560 --AtomTypesSetToUse FixedSize -r SampleATFP -o Sample.sdf
1561
1562 To generate MMFF94 atom types count fingerprints of fixed size in vector string
1563 with IDsAndValues format and create a SampleATFP.csv file containing sequential
1564 compound IDs along with fingerprints vector strings data, type:
1565
1566 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1567 --AtomTypesSetToUse FixedSize -v IDsAndValuesString
1568 -r SampleATFP -o Sample.sdf
1569
1570 To generate MMFF94 atom types bits fingerprints of fixed size in bit-vector string
1571 format and create a SampleATFP.csv file containing sequential compound IDs along
1572 with fingerprints vector strings data, type:
1573
1574 % AtomTypesFingerprints.pl -m AtomTypesBits -a MMFF94AtomTypes
1575 --AtomTypesSetToUse FixedSize -r SampleATFP -o Sample.sdf
1576
1577 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1578 format and create a SampleATFP.csv file containing compound ID from molecule
1579 name line along with fingerprints vector strings data, type:
1580
1581 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1582 --DataFieldsMode CompoundID --CompoundIDMode MolName
1583 -r SampleATFP -o Sample.sdf
1584
1585 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1586 format and create a SampleATFP.csv file containing compound IDs using specified
1587 data field along with fingerprints vector strings data, type:
1588
1589 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1590 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID
1591 Mol_ID -r SampleATFP -o Sample.sdf
1592
1593 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1594 format and create a SampleATFP.csv file containing compound ID using combination
1595 of molecule name line and an explicit compound prefix along with fingerprints vector
1596 strings data, type:
1597
1598 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1599 --DataFieldsMode CompoundID --CompoundIDMode MolnameOrLabelPrefix
1600 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleATFP -o Sample.sdf
1601
1602 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1603 format and create a SampleATFP.csv file containing specific data fields columns along
1604 with fingerprints vector strings data, type:
1605
1606 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1607 --DataFieldsMode Specify --DataFields Mol_ID -r SampleATFP
1608 -o Sample.sdf
1609
1610 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1611 format and create a SampleATFP.csv file containing common data fields columns along
1612 with fingerprints vector strings data, type:
1613
1614 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1615 --DataFieldsMode Common -r SampleATFP -o Sample.sdf
1616
1617 To generate MMFF94 atom types count fingerprints of arbitrary size in vector string
1618 format and create SampleATFP.sdf, SampleATFP.fpf and SampleATFP.csv files containing
1619 all data fields columns in CSV file along with fingerprints vector strings data, type:
1620
1621 % AtomTypesFingerprints.pl -m AtomTypesCount -a MMFF94AtomTypes
1622 --DataFieldsMode All --output all -r SampleATFP -o Sample.sdf
1623
1624 =head1 AUTHOR
1625
1626 Manish Sud <msud@san.rr.com>
1627
1628 =head1 SEE ALSO
1629
1630 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1631 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
1632 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1633 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1634
1635 =head1 COPYRIGHT
1636
1637 Copyright (C) 2015 Manish Sud. All rights reserved.
1638
1639 This file is part of MayaChemTools.
1640
1641 MayaChemTools is free software; you can redistribute it and/or modify it under
1642 the terms of the GNU Lesser General Public License as published by the Free
1643 Software Foundation; either version 3 of the License, or (at your option)
1644 any later version.
1645
1646 =cut