comparison bin/TopologicalAtomTripletsFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: TopologicalAtomTripletsFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:23 $
5 # $Revision: 1.21 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::AtomicInvariantsAtomTypes;
43 use AtomTypes::FunctionalClassAtomTypes;
44 use Fingerprints::TopologicalAtomTripletsFingerprints;
45
# File-level state shared with the subroutines defined further below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new();

# Get the options and setup script; show usage when help is requested or
# no input file names are left on the command line...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files, skipping the ones flagged as not okay...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

SDFILE: for my $FileIndex (0 .. $#SDFilesList) {
    next SDFILE unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateTopologicalAtomTripletsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new();
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91
92 ###############################################################################
93
# Generate fingerprints for a SD file...
#
# $FileIndex is the index of the input file in @SDFilesList. Each molecule read
# from the file is optionally filtered, fingerprinted and written to the output
# files opened by SetupAndOpenOutputFiles; a summary is printed at the end.
#
sub GenerateTopologicalAtomTripletsFingerprints {
    my($FileIndex) = @_;

    my $SDFile = $SDFilesList[$FileIndex];

    # Setup output files...
    my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = SetupAndOpenOutputFiles($FileIndex);

    my $MoleculeFileIO = MoleculeFileIO->new('Name' => $SDFile);
    $MoleculeFileIO->Open();

    my($CmpdCount, $IgnoredCmpdCount) = (0, 0);

    COMPOUND: while (my $Molecule = $MoleculeFileIO->ReadMolecule()) {
        $CmpdCount++;

        # Filter compound data before calculating fingerprints...
        if ($OptionsInfo{Filter} && CheckAndFilterCompound($CmpdCount, $Molecule)) {
            $IgnoredCmpdCount++;
            next COMPOUND;
        }

        my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
        unless ($Fingerprints) {
            $IgnoredCmpdCount++;
            ProcessIgnoredCompound('FingerprintsGenerationFailed', $CmpdCount, $Molecule);
            next COMPOUND;
        }

        WriteDataToOutputFiles($FileIndex, $CmpdCount, $Molecule, $Fingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
    }
    $MoleculeFileIO->Close();

    # Close whichever output files were opened: SD, text and then FP...
    for my $OutputFileIO ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) {
        $OutputFileIO->Close() if $OutputFileIO;
    }

    WriteFingerprintsGenerationSummaryStatistics($CmpdCount, $IgnoredCmpdCount);
}
146
# Process compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the warning text: ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed. Any other value gets the same warning as
# FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
    my($Mode, $CmpdCount, $Molecule) = @_;

    my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
    my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);

    if ($Mode =~ /^ContainsNonElementalData$/i) {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
    }
    elsif ($Mode =~ /^ContainsNoElementalData$/i) {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
    }
    else {
        # Covers FingerprintsGenerationFailed as well as any unrecognized mode...
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
    }
}
174
# Check and filter compounds....
#
# Returns 1, after reporting the compound via ProcessIgnoredCompound, when the
# molecule has any non-element atoms or has no element atoms at all; otherwise
# returns 0.
#
sub CheckAndFilterCompound {
    my($CmpdCount, $Molecule) = @_;

    my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

    # Non-elemental data takes precedence over missing elemental data...
    my $ReasonToIgnore = $NumOfNonElements ? 'ContainsNonElementalData'
                       : !$NumOfElements   ? 'ContainsNoElementalData'
                       :                     '';

    return 0 unless $ReasonToIgnore;

    ProcessIgnoredCompound($ReasonToIgnore, $CmpdCount, $Molecule);
    return 1;
}
195
# Write out compounds fingerprints generation summary statistics...
#
# Prints the total, successfully processed (total minus ignored) and ignored
# compound counts to the currently selected output handle.
#
sub WriteFingerprintsGenerationSummaryStatistics {
    my($CmpdCount, $IgnoredCmpdCount) = @_;

    my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

    print "\nNumber of compounds: $CmpdCount\n",
        "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
        "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208
# Open output files...
#
# Creates and opens a fingerprints file IO object for each output type enabled
# in %OptionsInfo, using the output file names stored in %SDFilesInfo for
# $FileIndex. Returns (SD, text, FP) file IO objects; disabled ones are undef.
#
sub SetupAndOpenOutputFiles {
    my($FileIndex) = @_;
    my($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);

    # Setup common parameters for fingerprints file IO objects...
    my %CommonFileIOParams = (
        'Mode' => 'Write',
        'Overwrite' => $OptionsInfo{OverwriteFiles},
        'FingerprintsStringMode' => 'FingerprintsVectorString',
        'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
    );

    if ($OptionsInfo{SDOutput}) {
        my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];

        print "Generating SD file $SDOutFile...\n";
        $NewFPSDFileIO = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonFileIOParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
        $NewFPSDFileIO->Open();
    }

    if ($OptionsInfo{FPOutput}) {
        my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];

        print "Generating FP file $FPOutFile...\n";
        $NewFPFileIO = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonFileIOParams);
        $NewFPFileIO->Open();
    }

    if ($OptionsInfo{TextOutput}) {
        my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
        my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

        print "Generating text file $TextOutFile...\n";
        $NewFPTextFileIO = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonFileIOParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
        $NewFPTextFileIO->Open();
    }

    return ($NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO);
}
248
# Write fingerprints and other data to appropriate output files...
#
# Each of the three file IO arguments may be undef, in which case that output
# type is skipped. Data field values are retrieved from the molecule only when
# the text or FP output needs them.
#
sub WriteDataToOutputFiles {
    my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomTripletsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

    my $DataFieldLabelAndValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

    # SD output: fingerprints are written along with the input molecule string...
    if ($NewFPSDFileIO) {
        my $InputMoleculeString = $Molecule->GetInputMoleculeString();
        $NewFPSDFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $InputMoleculeString);
    }

    # Text output: fingerprints are written along with the data column values...
    if ($NewFPTextFileIO) {
        my $TextColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
        $NewFPTextFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $TextColValuesRef);
    }

    # FP output: fingerprints are written along with the compound ID...
    if ($NewFPFileIO) {
        my $FPCmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
        $NewFPFileIO->WriteFingerprints($TopologicalAtomTripletsFingerprints, $FPCmpdID);
    }
}
281
# Generate appropriate column labels for FPText output file...
#
# Returns a reference to a list holding the data column labels selected by
# $OptionsInfo{DataFieldsMode}, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
    my($FileIndex) = @_;

    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
    my @Labels = ();

    if ($DataFieldsMode =~ /^All$/i) {
        @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
    }
    elsif ($DataFieldsMode =~ /^Common$/i) {
        @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
    }
    elsif ($DataFieldsMode =~ /^Specify$/i) {
        @Labels = @{$OptionsInfo{SpecifiedDataFields}};
    }
    elsif ($DataFieldsMode =~ /^CompoundID$/i) {
        @Labels = ($OptionsInfo{CompoundIDLabel});
    }

    # Fingerprints label is always the last column...
    return [@Labels, $OptionsInfo{FingerprintsLabel}];
}
306
# Generate column values for FPText output file..
#
# Returns a reference to a list of data column values for one compound. In
# CompoundID mode the list holds just the compound ID; in All, Common and
# Specify modes it holds the value of each selected data field, with an empty
# string for fields missing from $DataFieldLabelAndValuesRef.
#
sub SetupFPTextFileCoulmnValues {
    my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

    if ($DataFieldsMode =~ /^CompoundID$/i) {
        return [SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef)];
    }

    # Pick the list of data field labels driving the output columns...
    my $FieldLabelsRef;
    if ($DataFieldsMode =~ /^All$/i) {
        $FieldLabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
    }
    elsif ($DataFieldsMode =~ /^Common$/i) {
        $FieldLabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
    }
    elsif ($DataFieldsMode =~ /^Specify$/i) {
        $FieldLabelsRef = $OptionsInfo{SpecifiedDataFields};
    }
    else {
        return [];
    }

    my @Values = ();
    for my $FieldLabel (@{$FieldLabelsRef}) {
        push @Values, exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
    }

    return \@Values;
}
329
# Generate compound ID for FP and FPText output files..
#
# The ID depends on $OptionsInfo{CompoundIDMode}:
#   MolNameOrLabelPrefix - molecule name, or prefix plus record number when the name is empty
#   LabelPrefix          - $OptionsInfo{CompoundID} prefix plus record number
#   DataField            - value of the data field named by $OptionsInfo{CompoundID}, or ''
#   MolName              - molecule name
# Any other mode yields an empty string.
#
sub SetupCmpdIDForOutputFiles {
    my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $IDMode = $OptionsInfo{CompoundIDMode};

    if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
        my $Name = $Molecule->GetName();
        return $Name if $Name;
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^LabelPrefix$/i) {
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^DataField$/i) {
        my $FieldLabel = $OptionsInfo{CompoundID};
        return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
    }

    if ($IDMode =~ /^MolName$/i) {
        # Fetch the name in scalar context before returning it...
        my $Name = $Molecule->GetName();
        return $Name;
    }

    return '';
}
355
# Generate fingerprints for molecule...
#
# Returns the fingerprints object on success. Returns undef when ring detection
# fails or when the fingerprints object reports that generation did not succeed.
#
sub GenerateMoleculeFingerprints {
    my($Molecule) = @_;

    $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

    return undef unless $Molecule->DetectRings();

    $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
    $Molecule->DetectAromaticity();

    # Constructor parameters are kept in a list to preserve their order...
    my @FingerprintsParams = (
        'Molecule' => $Molecule,
        'MinDistance' => $OptionsInfo{MinDistance},
        'MaxDistance' => $OptionsInfo{MaxDistance},
        'UseTriangleInequality' => $OptionsInfo{UseTriangleInequality},
        'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType},
    );
    my $Fingerprints = Fingerprints::TopologicalAtomTripletsFingerprints->new(@FingerprintsParams);
    SetAtomIdentifierTypeValuesToUse($Fingerprints);

    # Generate fingerprints...
    $Fingerprints->GenerateFingerprints();

    # Make sure fingerprints generation is successful...
    return undef unless $Fingerprints->IsFingerprintsGenerationSuccessful();

    return $Fingerprints;
}
384
# Set atom identifier type to use for generating fingerprints...
#
# Passes the atomic invariants or functional classes collected in %OptionsInfo
# to the fingerprints object. The other supported atom identifier types need no
# extra setup; an unsupported type is a fatal error.
#
sub SetAtomIdentifierTypeValuesToUse {
    my($TopologicalAtomTripletsFingerprints) = @_;

    my $IdentifierType = $OptionsInfo{AtomIdentifierType};

    if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
        $TopologicalAtomTripletsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
        return;
    }

    if ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
        $TopologicalAtomTripletsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
        return;
    }

    # Nothing to do for now for the remaining supported types...
    return if ($IdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i);

    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
403
# Retrieve information about SD files...
#
# Fills %SDFilesInfo with one entry per file in @SDFilesList:
#
#   FileOkay            - 1 when the file passed all checks, otherwise 0
#   OutFileRoot         - root name used for the output files
#   SDOutFileNames      - name of the SD output file
#   FPOutFileNames      - name of the FP output file
#   TextOutFileNames    - name of the CSV/TSV output file
#   AllDataFieldsRef    - all data field labels (collected for text output in All/Common mode)
#   CommonDataFieldsRef - common data field labels (collected for text output in All/Common mode)
#
# A file is skipped with a warning, and its FileOkay stays 0, when it doesn't
# exist, isn't a SD file, lacks the data field requested as compound ID, would
# be overwritten by its own SD output, or has an already existing output file
# and overwriting was not requested.
#
sub RetrieveSDFilesInfo {
    my($CheckDataField, $CollectDataFields);

    %SDFilesInfo = ();
    for my $InfoKey (qw(FileOkay OutFileRoot SDOutFileNames FPOutFileNames TextOutFileNames AllDataFieldsRef CommonDataFieldsRef)) {
        @{$SDFilesInfo{$InfoKey}} = ();
    }

    $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
    $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

    FILELIST: for my $Index (0 .. $#SDFilesList) {
        my $SDFile = $SDFilesList[$Index];

        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{OutFileRoot}[$Index] = '';
        $SDFilesInfo{SDOutFileNames}[$Index] = '';
        $SDFilesInfo{FPOutFileNames}[$Index] = '';
        $SDFilesInfo{TextOutFileNames}[$Index] = '';

        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sd sdf")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        if ($CheckDataField) {
            # Make sure data field exists in the first compound of SD file..
            my $SpecifiedDataField = $OptionsInfo{CompoundID};

            # Three-argument open with a lexical handle: the file name is never
            # interpreted as an open mode or a pipe...
            open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            my $CmpdString = ReadCmpdString($SDFileHandle);
            close $SDFileHandle;

            my @CmpdLines = split "\n", $CmpdString;
            my %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
            if (!exists $DataFieldValues{$SpecifiedDataField}) {
                warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
                next FILELIST;
            }
        }

        my $AllDataFieldsRef = '';
        my $CommonDataFieldsRef = '';
        if ($CollectDataFields) {
            my($CmpdCount);

            open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
            close $SDFileHandle;
        }

        # Setup output file names...
        my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

        my $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
        my $SDOutFileExt = $FileExt;
        my $FPOutFileExt = "fpf";

        my $OutFileRoot;
        if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
            # An explicit root name is honored only for a single input file; any
            # extension specified along with it is dropped...
            my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
            $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
        }
        else {
            $OutFileRoot = "${FileName}TopologicalAtomTripletsFP";
        }

        my $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
        my $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
        my $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

        if ($OptionsInfo{SDOutput}) {
            # Compare the output name literally: without \Q...\E the "." in the
            # name matches any character and other regex metacharacters in a
            # file name could mis-match or abort the script...
            if ($SDFile =~ /\Q$NewSDFileName\E/i) {
                warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
                print "Specify a different name using \"-r --root\" option or use default name.\n";
                next FILELIST;
            }
        }

        if (!$OptionsInfo{OverwriteFiles}) {
            # Check SD, FP and text output files...
            if ($OptionsInfo{SDOutput} && -e $NewSDFileName) {
                warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
                next FILELIST;
            }
            if ($OptionsInfo{FPOutput} && -e $NewFPFileName) {
                warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
                next FILELIST;
            }
            if ($OptionsInfo{TextOutput} && -e $NewTextFileName) {
                warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
                next FILELIST;
            }
        }

        $SDFilesInfo{FileOkay}[$Index] = 1;

        $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
        $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
        $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
        $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

        $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
        $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
    }
}
536
# Process option values...
#
# Rebuilds %OptionsInfo from the command line values in %Options: Yes/No values
# become 1/0 flags, the output type is expanded into SD/FP/text flags and the
# compound ID and data field settings are resolved. Dies when a required
# companion option is missing.
#
sub ProcessOptions {
    %OptionsInfo = ();

    ProcessAtomIdentifierTypeOptions();

    $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

    $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
    $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
    $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

    $OptionsInfo{SpecifiedDataFields} = [];
    $OptionsInfo{CompoundID} = '';

    if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
        if ($Options{compoundidmode} =~ /^DataField$/i) {
            $Options{compoundid}
                or die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
            $OptionsInfo{CompoundID} = $Options{compoundid};
        }
        elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
            $OptionsInfo{CompoundID} = $Options{compoundid} || 'Cmpd';
        }
    }
    elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
        $Options{datafields}
            or die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
        push @{$OptionsInfo{SpecifiedDataFields}}, split(/\,/, $Options{datafields});
    }

    # Map Yes/No option values onto 1/0 flags...
    my $YesNoToFlag = sub {
        my($Value) = @_;
        return ($Value =~ /^Yes$/i) ? 1 : 0;
    };

    $OptionsInfo{Filter} = $YesNoToFlag->($Options{filter});

    $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} || 'TopologicalAtomTripletsFingerprints';

    $OptionsInfo{KeepLargestComponent} = $YesNoToFlag->($Options{keeplargestcomponent});

    $OptionsInfo{MinDistance} = $Options{mindistance};
    $OptionsInfo{MaxDistance} = $Options{maxdistance};

    # "All" turns on every output type...
    my $OutputType = $Options{output};
    $OptionsInfo{Output} = $OutputType;
    $OptionsInfo{SDOutput} = ($OutputType =~ /^(SD|All)$/i) ? 1 : 0;
    $OptionsInfo{FPOutput} = ($OutputType =~ /^(FP|All)$/i) ? 1 : 0;
    $OptionsInfo{TextOutput} = ($OutputType =~ /^(Text|All)$/i) ? 1 : 0;

    $OptionsInfo{OutDelim} = $Options{outdelim};
    $OptionsInfo{OutQuote} = $YesNoToFlag->($Options{quote});

    $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
    $OptionsInfo{OutFileRoot} = $Options{root} || 0;

    $OptionsInfo{UseTriangleInequality} = $YesNoToFlag->($Options{usetriangleinequality});

    $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}
598
# Process atom identifier type and related options...
#
# Stores the atom identifier type in %OptionsInfo and, for the atomic invariants
# and functional class types, processes the companion option listing the values
# to use. An unsupported type is a fatal error.
#
sub ProcessAtomIdentifierTypeOptions {
    my $IdentifierType = $Options{atomidentifiertype};

    $OptionsInfo{AtomIdentifierType} = $IdentifierType;

    if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
        ProcessAtomicInvariantsToUseOption();
        return;
    }

    if ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
        ProcessFunctionalClassesToUse();
        return;
    }

    # Nothing to do for now for the remaining supported types...
    return if ($OptionsInfo{AtomIdentifierType} =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i);

    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
618
# Process specified atomic invariants to use...
#
# Splits the comma delimited "--AtomicInvariantsToUse" value, checks each
# entry and stores the list in $OptionsInfo{AtomicInvariantsToUse}. Dies when
# the value is empty, an entry is not an available atomic invariant, or neither
# AS nor AtomSymbol is among the entries.
#
sub ProcessAtomicInvariantsToUseOption {
    @{$OptionsInfo{AtomicInvariantsToUse}} = ();

    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n"
        if IsEmpty($Options{atomicinvariantstouse});

    my @SpecifiedInvariants = split /\,/, $Options{atomicinvariantstouse};

    for my $Invariant (@SpecifiedInvariants) {
        AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($Invariant)
            or die "Error: Atomic invariant specified, $Invariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
        push @{$OptionsInfo{AtomicInvariantsToUse}}, $Invariant;
    }

    # Atom symbol must be one of the invariants...
    my $AtomSymbolCount = grep { /^(AS|AtomSymbol)$/i } @SpecifiedInvariants;
    if (!$AtomSymbolCount) {
        die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
    }
}
643
# Process specified functional classes invariants to use...
#
# Splits the comma delimited "--FunctionalClassesToUse" value, checks each
# entry and stores the list in $OptionsInfo{FunctionalClassesToUse}. Dies when
# the value is empty or an entry is not an available functional class.
#
sub ProcessFunctionalClassesToUse {
    @{$OptionsInfo{FunctionalClassesToUse}} = ();

    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n"
        if IsEmpty($Options{functionalclassestouse});

    for my $ClassName (split /\,/, $Options{functionalclassestouse}) {
        AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ClassName)
            or die "Error: Functional class specified, $ClassName, using \"--FunctionalClassesToUse\" option is not valid...\n ";
        push @{$OptionsInfo{FunctionalClassesToUse}}, $ClassName;
    }
}
661
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, overlays the values parsed from the
# command line by GetOptions, changes to the working directory when one is
# specified and validates the option values. Any invalid value is a fatal
# error with a message listing the allowed values.
#
sub SetupScriptUsage {

    # Default values for all the options...
    %Options = (
        aromaticitymodel => 'MayaChemToolsAromaticityModel',

        atomidentifiertype => 'AtomicInvariantsAtomTypes',
        atomicinvariantstouse => 'AS,X,BO,H,FC',

        functionalclassestouse => 'HBD,HBA,PI,NI,Ar,Hal',

        compoundidmode => 'LabelPrefix',
        compoundidlabel => 'CompoundID',
        datafieldsmode => 'CompoundID',

        filter => 'Yes',

        keeplargestcomponent => 'Yes',

        mindistance => 1,
        maxdistance => 10,

        output => 'text',
        outdelim => 'comma',
        quote => 'yes',

        usetriangleinequality => 'No',

        vectorstringformat => 'IDsAndValuesString',
    );

    # Retrieve all the options...
    my @OptionSpecs = (
        "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s",
        "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s",
        "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",
        "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o",
        "quote|q=s", "root|r=s", "usetriangleinequality|u=s", "vectorstringformat|v=s", "workingdir|w=s",
    );
    if (!GetOptions(\%Options, @OptionSpecs)) {
        die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
    }

    # Switch to the working directory before any file names are examined...
    if ($Options{workingdir}) {
        if (! -d $Options{workingdir}) {
            die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        }
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    # Validate option values...
    if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
        my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
        die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
    }
    if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
        die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
    }
    if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
        die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
    }
    if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
        die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
    }
    if ($Options{filter} !~ /^(Yes|No)$/i) {
        die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
    }
    if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
        die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
    }
    if (!IsPositiveInteger($Options{mindistance})) {
        die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
    }
    if (!IsPositiveInteger($Options{maxdistance})) {
        die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
    }
    # Equal minimum and maximum distances are allowed...
    if ($Options{mindistance} > $Options{maxdistance}) {
        die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than or equal to the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
    }
    if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
        die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
    }
    if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
        die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
    }
    if ($Options{quote} !~ /^(Yes|No)$/i) {
        die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
    }
    if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
        die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
    }
    if ($Options{usetriangleinequality} !~ /^(Yes|No)$/i) {
        die "Error: The value specified, $Options{usetriangleinequality}, for option \"-u, --UseTriangleInequality\" is not valid. Allowed values: Yes or No\n";
    }
    if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
        die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
}
750
751 __END__
752
753 =head1 NAME
754
755 TopologicalAtomTripletsFingerprints.pl - Generate topological atom triplets fingerprints for SD files
756
757 =head1 SYNOPSIS
758
759 TopologicalAtomTripletsFingerprints.pl SDFile(s)...
760
761 TopologicalAtomTripletsFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
762 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes>]
763 [B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">]
764 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">]
765 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
766 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">]
767 [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] [B<-f, --Filter> I<Yes | No>]
768 [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
769 [B<--MinDistance> I<number>] [B<--MaxDistance> I<number>]
770 [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>]
771 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<-u, --UseTriangleInequality> I<Yes | No>]
772 [B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
773 [B<-w, --WorkingDir> I<DirName>] SDFile(s)...
774
775 =head1 DESCRIPTION
776
777 Generate topological atom triplets fingerprints for I<SDFile(s)> and create
778 appropriate SD, FP or CSV/TSV text file(s) containing fingerprints vector strings corresponding to
779 molecular fingerprints.
780
781 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
782 and I<.sd>. All other file names are ignored. All the SD files in a current directory
783 can be specified either by I<*.sdf> or the current directory name.
784
785 The current release of MayaChemTools supports generation of topological atom triplets
786 fingerprints corresponding to the following B<-a, --AtomIdentifierType> values:
787
788 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
789 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
790 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
791
792 Based on the values specified for B<-a, --AtomIdentifierType> and B<--AtomicInvariantsToUse>,
793 initial atom types are assigned to all non-hydrogen atoms in a molecule. Using the distance
794 matrix for the molecule and initial atom types assigned to non-hydrogen atoms, all unique atom
795 triplets with atom pair distances within B<--MinDistance> and B<--MaxDistance> are identified and counted. An atom triplet
796 identifier is generated for each unique atom triplet; the format of the atom triplet identifier is:
797
798 <ATx>-Dyz-<ATy>-Dxz-<ATz>-Dxy
799
800 ATx, ATy, ATz: Atom types assigned to atom x, atom y, and atom z
801 Dxy: Distance between atom x and atom y
802 Dxz: Distance between atom x and atom z
803 Dyz: Distance between atom y and atom z
804
805 where <ATx>-Dyz <= <ATy>-Dxz <= <ATz>-Dxy
806
807 The atom triplet identifiers for all unique atom triplets corresponding to non-hydrogen atoms constitute
808 topological atom triplets fingerprints of the molecule.
809
810 Example of I<SD> file containing topological atom triplets fingerprints string data:
811
812 ... ...
813 ... ...
814 $$$$
815 ... ...
816 ... ...
817 ... ...
818 41 44 0 0 0 0 0 0 0 0999 V2000
819 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
820 ... ...
821 2 3 1 0 0 0 0
822 ... ...
823 M END
824 > <CmpdID>
825 Cmpd1
826
827 > <TopologicalAtomTripletsFingerprints>
828 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:Mi
829 nDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1.B
830 O1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D10-C
831 .X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1...;
832 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2 2
833 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8 8 ...
834
835 $$$$
836 ... ...
837 ... ...
838
839 Example of I<FP> file containing topological atom triplets fingerprints string data:
840
841 #
842 # Package = MayaChemTools 7.4
843 # Release Date = Oct 21, 2010
844 #
845 # TimeStamp = Fri Mar 11 15:24:01 2011
846 #
847 # FingerprintsStringType = FingerprintsVector
848 #
849 # Description = TopologicalAtomTriplets:AtomicInvariantsAtomTypes:Mi...
850 # VectorStringFormat = IDsAndValuesString
851 # VectorValuesType = NumericalValues
852 #
853 Cmpd1 3096;C.X1.BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2...;1 2 2 2 2...
854 Cmpd2 1093;C.X1.BO1.H3-D1-C.X1.BO1.H3-D3-C.X2.BO2.H2-D4...;2 2 2 2 2...
855 ... ...
856 ... ..
857
858 Example of CSV I<Text> file containing topological atom triplets fingerprints string data:
859
860 "CompoundID","TopologicalAtomTripletsFingerprints"
861 "Cmpd1","FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAto
862 mTypes:MinDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesStri
863 ng;C.X1.BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2
864 .H2-D10-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1....;
865 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2 2
866 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8 8 ...
867 ... ...
868 ... ...
869
870 The current release of MayaChemTools generates the following types of topological atom triplets
871 fingerprints vector strings:
872
873 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
874 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
875 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
876 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
877 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
878 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
879 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
880
881 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
882 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesPairsString
883 ;C.X1.BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 1 C.X1.BO1.H3-D1-C.X2.BO
884 2.H2-D10-C.X3.BO4-D9 2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 2 C.X
885 1.BO1.H3-D1-C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 2 C.X1.BO1.H3-D1-C.X2.BO2.H2
886 -D6-C.X3.BO3.H1-D5 2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3.BO3.H1-D7 2...
887
888 FingerprintsVector;TopologicalAtomTriplets:DREIDINGAtomTypes:MinDistan
889 ce1:MaxDistance10;2377;NumericalValues;IDsAndValuesString;C_2-D1-C_2-D
890 9-C_3-D10 C_2-D1-C_2-D9-C_R-D10 C_2-D1-C_3-D1-C_3-D2 C_2-D1-C_3-D10-C_
891 3-D9 C_2-D1-C_3-D2-C_3-D3 C_2-D1-C_3-D2-C_R-D3 C_2-D1-C_3-D3-C_3-D4 C_
892 2-D1-C_3-D3-N_R-D4 C_2-D1-C_3-D3-O_3-D2 C_2-D1-C_3-D4-C_3-D5 C_2-D...;
893 1 1 1 2 1 1 3 1 1 2 2 1 1 1 1 1 1 1 1 2 1 3 4 5 1 1 6 4 2 2 3 1 1 1 2
894 2 1 2 1 1 2 2 2 1 2 1 2 1 1 3 3 2 6 4 2 1 1 1 2 2 1 1 1 1 1 1 1 1 1...
895
896 FingerprintsVector;TopologicalAtomTriplets:EStateAtomTypes:MinDistance
897 1:MaxDistance10;3298;NumericalValues;IDsAndValuesString;aaCH-D1-aaCH-D
898 1-aaCH-D2 aaCH-D1-aaCH-D1-aasC-D2 aaCH-D1-aaCH-D10-aaCH-D9 aaCH-D1-aaC
899 H-D10-aasC-D9 aaCH-D1-aaCH-D2-aaCH-D3 aaCH-D1-aaCH-D2-aasC-D1 aaCH-D1-
900 aaCH-D2-aasC-D3 aaCH-D1-aaCH-D3-aasC-D2 aaCH-D1-aaCH-D4-aasC-D5 aa...;
901 6 4 24 4 16 8 8 4 8 8 8 12 10 14 4 16 24 4 12 2 2 4 1 10 2 2 15 2 2 2
902 2 2 2 14 4 2 2 2 2 1 2 10 2 2 4 1 2 4 8 3 3 3 4 6 4 2 2 3 3 1 1 1 2 1
903 2 2 4 2 3 2 1 2 4 5 3 2 2 1 2 4 3 2 8 12 6 2 2 4 4 7 1 4 2 4 2 2 2 ...
904
905 FingerprintsVector;TopologicalAtomTriplets:FunctionalClassAtomTypes:Mi
906 nDistance1:MaxDistance10;2182;NumericalValues;IDsAndValuesString;Ar-D1
907 -Ar-D1-Ar-D2 Ar-D1-Ar-D1-Ar.HBA-D2 Ar-D1-Ar-D10-Ar-D9 Ar-D1-Ar-D10-Hal
908 -D9 Ar-D1-Ar-D2-Ar-D2 Ar-D1-Ar-D2-Ar-D3 Ar-D1-Ar-D2-Ar.HBA-D1 Ar-D1-Ar
909 -D2-Ar.HBA-D2 Ar-D1-Ar-D2-Ar.HBA-D3 Ar-D1-Ar-D2-HBD-D1 Ar-D1-Ar-D2...;
910 27 1 32 2 2 63 3 2 1 2 1 2 3 1 1 40 3 1 2 2 2 2 4 2 2 47 4 2 2 1 2 1 5
911 2 2 51 4 3 1 3 1 9 1 1 50 3 3 4 1 9 50 2 2 3 3 5 45 1 1 1 2 1 2 2 3 3
912 4 4 3 2 1 1 3 4 5 5 3 1 2 3 2 3 5 7 2 7 3 7 1 1 2 2 2 2 3 1 4 3 1 2...
913
914 FingerprintsVector;TopologicalAtomTriplets:MMFF94AtomTypes:MinDistance
915 1:MaxDistance10;2966;NumericalValues;IDsAndValuesString;C5A-D1-C5A-D1-
916 N5-D2 C5A-D1-C5A-D2-C5B-D2 C5A-D1-C5A-D3-CB-D2 C5A-D1-C5A-D3-CR-D2 C5A
917 -D1-C5B-D1-C5B-D2 C5A-D1-C5B-D2-C=ON-D1 C5A-D1-C5B-D2-CB-D1 C5A-D1-C5B
918 -D3-C=ON-D2 C5A-D1-C5B-D3-CB-D2 C5A-D1-C=ON-D3-NC=O-D2 C5A-D1-C=ON-D3-
919 O=CN-D2 C5A-D1-C=ON-D4-NC=O-D3 C5A-D1-C=ON-D4-O=CN-D3 C5A-D1-CB-D1-...
920
921 FingerprintsVector;TopologicalAtomTriplets:SLogPAtomTypes:MinDistance1
922 :MaxDistance10;3710;NumericalValues;IDsAndValuesString;C1-D1-C1-D1-C11
923 -D2 C1-D1-C1-D1-CS-D2 C1-D1-C1-D10-C5-D9 C1-D1-C1-D3-C10-D2 C1-D1-C1-D
924 3-C5-D2 C1-D1-C1-D3-CS-D2 C1-D1-C1-D3-CS-D4 C1-D1-C1-D4-C10-D5 C1-D1-C
925 1-D4-C11-D5 C1-D1-C1-D5-C10-D4 C1-D1-C1-D5-C5-D4 C1-D1-C1-D6-C11-D7 C1
926 -D1-C1-D6-CS-D5 C1-D1-C1-D6-CS-D7 C1-D1-C1-D8-C11-D9 C1-D1-C1-D8-CS...
927
928 FingerprintsVector;TopologicalAtomTriplets:SYBYLAtomTypes:MinDistance1
929 :MaxDistance10;2332;NumericalValues;IDsAndValuesString;C.2-D1-C.2-D9-C
930 .3-D10 C.2-D1-C.2-D9-C.ar-D10 C.2-D1-C.3-D1-C.3-D2 C.2-D1-C.3-D10-C.3-
931 D9 C.2-D1-C.3-D2-C.3-D3 C.2-D1-C.3-D2-C.ar-D3 C.2-D1-C.3-D3-C.3-D4 C.2
932 -D1-C.3-D3-N.ar-D4 C.2-D1-C.3-D3-O.3-D2 C.2-D1-C.3-D4-C.3-D5 C.2-D1-C.
933 3-D5-C.3-D6 C.2-D1-C.3-D5-O.3-D4 C.2-D1-C.3-D6-C.3-D7 C.2-D1-C.3-D7...
934
935 FingerprintsVector;TopologicalAtomTriplets:TPSAAtomTypes:MinDistance1:
936 MaxDistance10;1007;NumericalValues;IDsAndValuesString;N21-D1-N7-D3-Non
937 e-D4 N21-D1-N7-D5-None-D4 N21-D1-None-D1-None-D2 N21-D1-None-D2-None-D
938 2 N21-D1-None-D2-None-D3 N21-D1-None-D3-None-D4 N21-D1-None-D4-None-D5
939 N21-D1-None-D4-O3-D3 N21-D1-None-D4-O4-D3 N21-D1-None-D5-None-D6 N21-
940 D1-None-D6-None-D7 N21-D1-None-D6-O4-D5 N21-D1-None-D7-None-D8 N21-...
941
942 FingerprintsVector;TopologicalAtomTriplets:UFFAtomTypes:MinDistance1:M
943 axDistance10;2377;NumericalValues;IDsAndValuesString;C_2-D1-C_2-D9-C_3
944 -D10 C_2-D1-C_2-D9-C_R-D10 C_2-D1-C_3-D1-C_3-D2 C_2-D1-C_3-D10-C_3-D9
945 C_2-D1-C_3-D2-C_3-D3 C_2-D1-C_3-D2-C_R-D3 C_2-D1-C_3-D3-C_3-D4 C_2-D1-
946 C_3-D3-N_R-D4 C_2-D1-C_3-D3-O_3-D2 C_2-D1-C_3-D4-C_3-D5 C_2-D1-C_3-D5-
947 C_3-D6 C_2-D1-C_3-D5-O_3-D4 C_2-D1-C_3-D6-C_3-D7 C_2-D1-C_3-D7-C_3-...
948
949 =head1 OPTIONS
950
951 =over 4
952
953 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
954
955 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
956 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
957 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
958 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
959
960 The supported aromaticity model names along with model specific control parameters
961 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
962 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
963 this file during class instantiation and makes it available to method B<DetectAromaticity>
964 for detecting aromaticity corresponding to a specific model.
965
966 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
967
968 Specify atom identifier type to use for assignment of initial atom identifier to non-hydrogen
969 atoms during calculation of topological atom triplets fingerprints. Possible values in the current
970 release are: I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
971 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
972 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
973
974 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">
975
976 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType>
977 option. It's a list of comma separated valid atomic invariant atom types.
978
979 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
980 H, Ar, RA, FC, MN, SM>. Default value: I<AS,X,BO,H,FC>.
981
982 The atomic invariants abbreviations correspond to:
983
984 AS = Atom symbol corresponding to element symbol
985
986 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
987 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
988 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
989 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
990 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
991 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
992 H<n> = Number of implicit and explicit hydrogens for atom
993 Ar = Aromatic annotation indicating whether atom is aromatic
994 RA = Ring atom annotation indicating whether atom is in a ring
995 FC<+n/-n> = Formal charge assigned to atom
996 MN<n> = Mass number indicating isotope other than most abundant isotope
997 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
998 3 (triplet)
999
1000 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1001
1002 AS.X<n>.BO<n>.LBO<n>.SB<n>.DB<n>.TB<n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1003
1004 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1005 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1006
1007 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1008 are also allowed:
1009
1010 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1011 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1012 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1013 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1014 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1015 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1016 H : NumOfImplicitAndExplicitHydrogens
1017 Ar : Aromatic
1018 RA : RingAtom
1019 FC : FormalCharge
1020 MN : MassNumber
1021 SM : SpinMultiplicity
1022
1023 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1024 atom types.
1025
1026 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">
1027
1028 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType>
1029 option. It's a list of comma separated valid functional classes.
1030
1031 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1032 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1033
1034 The functional class abbreviations correspond to:
1035
1036 HBD: HydrogenBondDonor
1037 HBA: HydrogenBondAcceptor
1038 PI : PositivelyIonizable
1039 NI : NegativelyIonizable
1040 Ar : Aromatic
1041 Hal : Halogen
1042 H : Hydrophobic
1043 RA : RingAtom
1044 CA : ChainAtom
1045
1046 Functional class atom type specification for an atom corresponds to:
1047
1048 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1049
1050 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1051 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1052
1053 HydrogenBondDonor: NH, NH2, OH
1054 HydrogenBondAcceptor: N[!H], O
1055 PositivelyIonizable: +, NH2
1056 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1057
1058 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1059
1060 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1061
1062 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1063 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1064 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1065 look like Cmpd<Number>.
1066
1067 Examples for I<DataField> value of B<--CompoundIDMode>:
1068
1069 MolID
1070 ExtReg
1071
1072 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1073
1074 Compound
1075
1076 The value specified above generates compound IDs which correspond to Compound<Number>
1077 instead of default value of Cmpd<Number>.
1078
1079 =item B<--CompoundIDLabel> I<text>
1080
1081 Specify compound ID column label for CSV/TSV text file(s) used during I<CompoundID> value
1082 of B<--DataFieldsMode> option. Default value: I<CompoundID>.
1083
1084 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1085
1086 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1087 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1088 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1089 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1090
1091 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1092 Default value: I<LabelPrefix>.
1093
1094 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1095 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1096 values are replaced with sequential compound IDs.
1097
1098 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1099
1100 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1101
1102 Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along
1103 with generated fingerprints for I<text | all> values of B<--output> option.
1104
1105 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1106
1107 Examples:
1108
1109 Extreg
1110 MolID,CompoundName
1111
1112 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1113
1114 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1115 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1116 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1117 generate a compound ID using molname line, a compound prefix, or a combination of both.
1118 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1119
1120 =item B<-f, --Filter> I<Yes | No>
1121
1122 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1123 Default value: I<Yes>.
1124
1125 By default, compound data is checked before calculating fingerprints and compounds containing
1126 atom data corresponding to non-element symbols or no atom data are ignored.
1127
1128 =item B<--FingerprintsLabel> I<text>
1129
1130 SD data label or text file column label to use for fingerprints string in output SD or
1131 CSV/TSV text file(s) specified by B<--output>. Default value: I<TopologicalAtomTripletsFingerprints>.
1132
1133 =item B<-h, --help>
1134
1135 Print this help message.
1136
1137 =item B<-k, --KeepLargestComponent> I<Yes | No>
1138
1139 Generate fingerprints for only the largest component in molecule. Possible values:
1140 I<Yes or No>. Default value: I<Yes>.
1141
1142 For molecules containing multiple connected components, fingerprints can be generated
1143 in two different ways: use all connected components or just the largest connected
1144 component. By default, all atoms except for the largest connected component are
1145 deleted before generation of fingerprints.
1146
1147 =item B<--MinDistance> I<number>
1148
1149 Minimum bond distance between atom triplets for generating topological atom triplets. Default value:
1150 I<1>. Valid values: positive integers and less than B<--MaxDistance>.
1151
1152 =item B<--MaxDistance> I<number>
1153
1154 Maximum bond distance between atom triplets for generating topological atom triplets. Default value:
1155 I<10>. Valid values: positive integers and greater than B<--MinDistance>.
1156
1157 =item B<--OutDelim> I<comma | tab | semicolon>
1158
1159 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1160 Default value: I<comma>.
1161
1162 =item B<--output> I<SD | FP | text | all>
1163
1164 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1165
1166 =item B<-o, --overwrite>
1167
1168 Overwrite existing files.
1169
1170 =item B<-q, --quote> I<Yes | No>
1171
1172 Put quotes around column values in output CSV/TSV text file(s). Possible values:
1173 I<Yes or No>. Default value: I<Yes>.
1174
1175 =item B<-r, --root> I<RootName>
1176
1177 New file name is generated using the root: <Root>.<Ext>. Default for new file names:
1178 <SDFileName><TopologicalAtomTripletsFP>.<Ext>. The file type determines <Ext> value.
1179 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1180 delimited text files, respectively. This option is ignored for multiple input files.
1181
1182 =item B<-u, --UseTriangleInequality> I<Yes | No>
1183
1184 Specify whether to apply triangle distance inequality test to distances between atom pairs in
1185 atom triplets during generation of atom triplets. Possible values: I<Yes or No>.
1186 Default value: I<No>.
1187
1188 The triangle distance inequality test requires that the distance or binned distance between any
1189 atom pair in an atom triplet be less than the sum of the distances or binned distances between the
1190 other two atom pairs and greater than the difference of their distances.
1191
1192 For atom triplet ATx-Dyz-ATy-Dxz-ATz-Dxy to satisfy triangle inequality:
1193
1194 Dyz > |Dxz - Dxy| and Dyz < Dxz + Dxy
1195 Dxz > |Dyz - Dxy| and Dxz < Dyz + Dxy
1196 Dxy > |Dyz - Dxz| and Dxy < Dyz + Dxz
1197
1198 =item B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1199
1200 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1201 B<--output> option. Possible values: I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString |
1202 ValuesAndIDsPairsString>. Default value: I<IDsAndValuesString>.
1203
1204 Examples:
1205
1206
1207 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
1208 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesString;C.X1
1209 .BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D1
1210 0-C.X3.BO4-D9 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 C.X1.BO1.H3-D1
1211 -C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3....;
1212 1 2 2 2 2 2 2 2 8 8 4 8 4 4 2 2 2 2 4 2 2 2 4 2 2 2 2 1 2 2 4 4 4 2 2
1213 2 4 4 4 8 4 4 2 4 4 4 2 4 4 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 8...
1214
1215 FingerprintsVector;TopologicalAtomTriplets:AtomicInvariantsAtomTypes:M
1216 inDistance1:MaxDistance10;3096;NumericalValues;IDsAndValuesPairsString
1217 ;C.X1.BO1.H3-D1-C.X1.BO1.H3-D1-C.X3.BO3.H1-D2 1 C.X1.BO1.H3-D1-C.X2.BO
1218 2.H2-D10-C.X3.BO4-D9 2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D3-N.X3.BO3-D4 2 C.X
1219 1.BO1.H3-D1-C.X2.BO2.H2-D4-C.X2.BO2.H2-D5 2 C.X1.BO1.H3-D1-C.X2.BO2.H2
1220 -D6-C.X3.BO3.H1-D5 2 C.X1.BO1.H3-D1-C.X2.BO2.H2-D6-C.X3.BO3.H1-D7 2...
1221
1222 =item B<-w, --WorkingDir> I<DirName>
1223
1224 Location of working directory. Default value: current directory.
1225
1226 =back
1227
1228 =head1 EXAMPLES
1229
1230 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1231 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1232 file containing sequential compound IDs along with fingerprints vector strings data, type:
1233
1234 % TopologicalAtomTripletsFingerprints.pl -r SampleTATFP -o Sample.sdf
1235
1236 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1237 10 using atomic invariants atom types in IDsAndValuesString format and create SampleTATFP.sdf,
1238 SampleTATFP.fpf and SampleTATFP.csv files containing sequential compound IDs in CSV file along
1239 with fingerprints vector strings data, type:
1240
1241 % TopologicalAtomTripletsFingerprints.pl --output all -r SampleTATFP
1242 -o Sample.sdf
1243
1244 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1245 10 using atomic invariants atom types in IDsAndValuesPairsString format and create a SampleTATFP.csv
1246 file containing sequential compound IDs along with fingerprints vector strings data, type:
1247
1248 % TopologicalAtomTripletsFingerprints.pl --VectorStringFormat
1249 IDsAndValuesPairsString -r SampleTATFP -o Sample.sdf
1250
1251 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1252 10 using DREIDING atom types in IDsAndValuesString format and create a SampleTATFP.csv
1253 file containing sequential compound IDs along with fingerprints vector strings data, type:
1254
1255 % TopologicalAtomTripletsFingerprints.pl -a DREIDINGAtomTypes
1256 -r SampleTATFP -o Sample.sdf
1257
1258 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1259 10 using E-state atom types in IDsAndValuesString format and create a SampleTATFP.csv
1260 file containing sequential compound IDs along with fingerprints vector strings data, type:
1261
1262 % TopologicalAtomTripletsFingerprints.pl -a EStateAtomTypes
1263 -r SampleTATFP -o Sample.sdf
1264
1265 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1266 10 using functional class atom types in IDsAndValuesString format and create a SampleTATFP.csv
1267 file containing sequential compound IDs along with fingerprints vector strings data, type:
1268
1269 % TopologicalAtomTripletsFingerprints.pl -a FunctionalClassAtomTypes
1270 -r SampleTATFP -o Sample.sdf
1271
1272 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1273 10 using DREIDING atom types in IDsAndValuesString format and create a SampleTATFP.csv
1274 file containing sequential compound IDs along with fingerprints vector strings data, type:
1275
1276 % TopologicalAtomTripletsFingerprints.pl -a DREIDINGAtomTypes
1277 -r SampleTATFP -o Sample.sdf
1278
1279 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1280 10 using MMFF94 atom types in IDsAndValuesString format and create a SampleTATFP.csv
1281 file containing sequential compound IDs along with fingerprints vector strings data, type:
1282
1283 % TopologicalAtomTripletsFingerprints.pl -a MMFF94AtomTypes
1284 -r SampleTATFP -o Sample.sdf
1285
1286 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1287 10 using SLogP atom types in IDsAndValuesString format and create a SampleTATFP.csv
1288 file containing sequential compound IDs along with fingerprints vector strings data, type:
1289
1290 % TopologicalAtomTripletsFingerprints.pl -a SLogPAtomTypes
1291 -r SampleTATFP -o Sample.sdf
1292
1293 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1294 10 using SYBYL atom types in IDsAndValuesString format and create a SampleTATFP.csv
1295 file containing sequential compound IDs along with fingerprints vector strings data, type:
1296
1297 % TopologicalAtomTripletsFingerprints.pl -a SYBYLAtomTypes
1298 -r SampleTATFP -o Sample.sdf
1299
1300 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1301 10 using TPSA atom types in IDsAndValuesString format and create a SampleTATFP.csv
1302 file containing sequential compound IDs along with fingerprints vector strings data, type:
1303
1304 % TopologicalAtomTripletsFingerprints.pl -a TPSAAtomTypes
1305 -r SampleTATFP -o Sample.sdf
1306
1307 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1308 10 using UFF atom types in IDsAndValuesString format and create a SampleTATFP.csv
1309 file containing sequential compound IDs along with fingerprints vector strings data, type:
1310
1311 % TopologicalAtomTripletsFingerprints.pl -a UFFAtomTypes
1312 -r SampleTATFP -o Sample.sdf
1313
1314 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1315 6 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1316 file containing sequential compound IDs along with fingerprints vector strings data, type:
1317
1318 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1319 --MinDistance 1 --MaxDistance 6 -r SampleTATFP -o Sample.sdf
1320
1321 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1322 6 using only AS,X atomic invariants atom types in IDsAndValuesString format and create a
1323 SampleTATFP.csv file containing sequential compound IDs along with fingerprints vector strings
1324 data, type:
1325
1326 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1327 --AtomicInvariantsToUse "AS,X" --MinDistance 1 --MaxDistance 6
1328 -r SampleTATFP -o Sample.sdf
1329
1330 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1331 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1332 file containing compound ID from molecule name line along with fingerprints vector strings
1333 data, type:
1334
1335 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1336 --DataFieldsMode CompoundID --CompoundIDMode MolName
1337 -r SampleTATFP -o Sample.sdf
1338
1339 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1340 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1341 file containing compound IDs using specified data field along with fingerprints vector strings
1342 data, type:
1343
1344 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1345 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID
1346 Mol_ID -r SampleTATFP -o Sample.sdf
1347
1348 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1349 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1350 file containing compound ID using combination of molecule name line and an explicit compound
1351 prefix along with fingerprints vector strings data, type:
1352
1353 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1354 --DataFieldsMode CompoundID --CompoundIDMode MolNameOrLabelPrefix
1355 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleTATFP -o Sample.sdf
1356
1357 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1358 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1359 file containing specific data fields columns along with fingerprints vector strings
1360 data, type:
1361
1362 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1363 --DataFieldsMode Specify --DataFields Mol_ID -r SampleTATFP
1364 -o Sample.sdf
1365
1366 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1367 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTATFP.csv
1368 file containing common data fields columns along with fingerprints vector strings
1369 data, type:
1370
1371 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1372 --DataFieldsMode Common -r SampleTATFP -o Sample.sdf
1373
1374 To generate topological atom triplets fingerprints corresponding to bond distances from 1 through
1375 10 using atomic invariants atom types in IDsAndValuesString format and create SampleTATFP.sdf,
1376 SampleTATFP.fpf and SampleTATFP.csv files containing all data fields columns in CSV file along with
1377 fingerprints data, type:
1378
1379 % TopologicalAtomTripletsFingerprints.pl -a AtomicInvariantsAtomTypes
1380 --DataFieldsMode All --output all -r SampleTATFP
1381 -o Sample.sdf
1382
1383 =head1 AUTHOR
1384
1385 Manish Sud <msud@san.rr.com>
1386
1387 =head1 SEE ALSO
1388
1389 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1390 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl,
1391 PathLengthFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1392 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1393
1394 =head1 COPYRIGHT
1395
1396 Copyright (C) 2015 Manish Sud. All rights reserved.
1397
1398 This file is part of MayaChemTools.
1399
1400 MayaChemTools is free software; you can redistribute it and/or modify it under
1401 the terms of the GNU Lesser General Public License as published by the Free
1402 Software Foundation; either version 3 of the License, or (at your option)
1403 any later version.
1404
1405 =cut