comparison bin/TopologicalAtomPairsFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: TopologicalAtomPairsFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:22 $
5 # $Revision: 1.34 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::AtomicInvariantsAtomTypes;
43 use AtomTypes::FunctionalClassAtomTypes;
44 use Fingerprints::TopologicalAtomPairsFingerprints;
45
# File-scoped state shared with the subroutines defined below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages are not held back in a buffer...
$| = 1;

# Announce the start of the run and note the starting time...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Retrieve command line options; print usage and quit when help is requested
# or no input file name is left on the command line...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Turn raw option values into %OptionsInfo...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Gather per-file information into %SDFilesInfo...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate fingerprints for each input file flagged as okay...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalAtomPairsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91
92 ###############################################################################
93
# Generate fingerprints for all compounds in one input SD file...
#
# $FileIndex is the position of the SD file in @SDFilesList. Compounds rejected
# by the optional filter, or for which fingerprints generation fails, are counted
# as ignored and are not written out. A summary is printed at the end.
#
sub GenerateTopologicalAtomPairsFingerprints {
  my($FileIndex) = @_;

  my $SDFile = $SDFilesList[$FileIndex];

  # Writers for the requested output formats; a writer is undef when its format
  # has not been requested...
  my($FPSDWriter, $FPTextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

  my $MoleculeReader = MoleculeFileIO->new('Name' => $SDFile);
  $MoleculeReader->Open();

  my $TotalCount = 0;
  my $IgnoredCount = 0;

  COMPOUND: while (my $Molecule = $MoleculeReader->ReadMolecule()) {
    $TotalCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $IgnoredCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    if (!$Fingerprints) {
      $IgnoredCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next COMPOUND;
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $FPSDWriter, $FPTextWriter, $FPWriter);
  }
  $MoleculeReader->Close();

  # Close whichever output files were opened: SD, text and then FP...
  for my $Writer ($FPSDWriter, $FPTextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $IgnoredCount);
}
146
# Warn about a compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the explanation: ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed. Any other value gets the same message as
# FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $FieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers FingerprintsGenerationFailed along with any unrecognized mode...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}
174
# Decide whether a compound must be skipped before fingerprints generation...
#
# Returns 1, after issuing a warning, when the molecule reports any non-element
# atoms or reports no element atoms at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data is checked first and takes precedence...
  my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
              : !$NumOfElements   ? 'ContainsNoElementalData'
              :                     '';

  return 0 if !$Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}
195
# Print the total, processed and ignored compound counts for an input file...
#
# The processed count is the total count minus the ignored count.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $SuccessCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n"
    . "Number of compounds processed successfully during fingerprints generation: $SuccessCount\n"
    . "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208
# Create and open the fingerprints output files requested for an input file...
#
# Returns the list (SD writer, text writer, FP writer); an entry is undef when
# the corresponding output format has not been requested. Output file names come
# from %SDFilesInfo at $FileIndex.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($SDWriter, $TextWriter, $FPWriter);

  # Parameters common to all fingerprints file IO objects...
  my %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});

  if ($OptionsInfo{SDOutput}) {
    my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $SDOutFile...\n";
    $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $FPOutFile...\n";
    $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $TextOutFile...\n";
    $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}
248
# Write fingerprints and accompanying compound data to the open output files...
#
# Any of the three writers may be undef, in which case that output is skipped.
# Data field values are retrieved from the molecule only when the text or FP
# writer is present.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $TopologicalAtomPairsFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  my $FieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  # SD output: fingerprints go along with the input molecule string...
  if ($NewFPSDFileIO) {
    my $MoleculeString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $MoleculeString);
  }

  # Text output: fingerprints go along with the column values...
  if ($NewFPTextFileIO) {
    my $ValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldsRef);
    $NewFPTextFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $ValuesRef);
  }

  # FP output: fingerprints go along with the compound ID...
  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);
    $NewFPFileIO->WriteFingerprints($TopologicalAtomPairsFingerprints, $ID);
  }
}
281
# Generate appropriate column labels for the text output file...
#
# The labels depend on the data fields mode: all data fields or common data
# fields collected for the input file, the data fields specified by the user, or
# the compound ID label. The fingerprints label is always the last column.
# Returns a reference to the list of labels.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my @Labels = ();

  my $Mode = $OptionsInfo{DataFieldsMode};
  if ($Mode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($Mode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  return [@Labels, $OptionsInfo{FingerprintsLabel}];
}
306
# Generate column values for one compound in the text output file...
#
# In CompoundID mode the single value is the compound ID. In All, Common and
# Specify modes there is one value for each data field label in the corresponding
# list; a label missing from the compound's data fields gives an empty string.
# Returns a reference to the list of values.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my @Values = ();

  # Value of a data field for this compound, or '' when it's not present...
  my $ValueFor = sub {
    my($Label) = @_;
    return exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
  };

  my $Mode = $OptionsInfo{DataFieldsMode};
  if ($Mode =~ /^CompoundID$/i) {
    push @Values, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($Mode =~ /^All$/i) {
    @Values = map { $ValueFor->($_) } @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Common$/i) {
    @Values = map { $ValueFor->($_) } @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($Mode =~ /^Specify$/i) {
    @Values = map { $ValueFor->($_) } @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@Values;
}
329
# Generate compound ID for FP and text output files...
#
# The ID depends on the compound ID mode:
#   MolNameOrLabelPrefix - molecule name when it's true; otherwise prefix + count
#   LabelPrefix          - prefix + compound count
#   DataField            - value of the specified data field, or '' when missing
#   MolName              - molecule name
# Any other mode gives an empty string.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $Mode = $OptionsInfo{CompoundIDMode};
  my $ID = '';

  if ($Mode =~ /^MolNameOrLabelPrefix$/i) {
    my $Name = $Molecule->GetName();
    $ID = $Name ? $Name : "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($Mode =~ /^LabelPrefix$/i) {
    $ID = "$OptionsInfo{CompoundID}${CmpdCount}";
  }
  elsif ($Mode =~ /^DataField$/i) {
    my $FieldName = $OptionsInfo{CompoundID};
    $ID = exists $DataFieldLabelAndValuesRef->{$FieldName} ? $DataFieldLabelAndValuesRef->{$FieldName} : '';
  }
  elsif ($Mode =~ /^MolName$/i) {
    $ID = $Molecule->GetName();
  }

  return $ID;
}
355
# Generate topological atom pairs fingerprints for a molecule...
#
# Optionally keeps only the largest component, then detects rings and aromaticity
# before generating fingerprints. Returns the fingerprints object, or undef when
# ring detection or fingerprints generation doesn't succeed.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  my $Fingerprints = Fingerprints::TopologicalAtomPairsFingerprints->new('Molecule' => $Molecule, 'MinDistance' => $OptionsInfo{MinDistance}, 'MaxDistance' => $OptionsInfo{MaxDistance}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});
  SetAtomIdentifierTypeValuesToUse($Fingerprints);

  $Fingerprints->GenerateFingerprints();

  # Hand back fingerprints only after a successful generation...
  return undef unless $Fingerprints->IsFingerprintsGenerationSuccessful();

  return $Fingerprints;
}
384
# Pass atom identifier type specific values on to the fingerprints object...
#
# Atomic invariants and functional classes atom types each take a list of values
# to use; the other supported atom types need nothing. Dies for an atom identifier
# type that is not supported.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($TopologicalAtomPairsFingerprints) = @_;

  my $Type = $OptionsInfo{AtomIdentifierType};

  if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
    return;
  }
  if ($Type =~ /^FunctionalClassAtomTypes$/i) {
    $TopologicalAtomPairsFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
    return;
  }
  if ($Type =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
    return;
  }

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
403
# Retrieve information about SD files...
#
# Resets %SDFilesInfo and fills in, for each entry in @SDFilesList at the same
# index:
#
#   FileOkay            - 1 when the file passed all checks; 0 otherwise
#   OutFileRoot         - root used for naming output files
#   SDOutFileNames      - SD output file name
#   FPOutFileNames      - FP output file name
#   TextOutFileNames    - CSV/TSV output file name
#   AllDataFieldsRef    - data field labels returned by
#   CommonDataFieldsRef   GetAllAndCommonCmpdDataHeaderLabels (All/Common
#                         data fields modes with text output); '' otherwise
#
# A file is ignored, with a warning and FileOkay left at 0, when it doesn't
# exist, isn't a SD file, lacks the data field needed for compound IDs, would be
# overwritten by its own SD output, or has an output file that already exists
# while overwriting is off. Dies when a SD file can't be opened.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  for my $Key (qw(FileOkay OutFileRoot SDOutFileNames FPOutFileNames TextOutFileNames AllDataFieldsRef CommonDataFieldsRef)) {
    @{$SDFilesInfo{$Key}} = ();
  }

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out pessimistic: the entry is marked okay only after all checks...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in the first compound of SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open with a lexical handle: the file name can't be
      # misread as an open mode and no global handle is left around...
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);

      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    # Use the processed option value, like the rest of the script...
    $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      # A user supplied root applies only when there is a single input file;
      # drop its extension when it has one...
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}TopologicalAtomPairsFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # \Q...\E makes the file name match literally: an unquoted "." or other
      # regex metacharacter in the name would otherwise match loosely or make
      # the pattern invalid...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
536
# Process option values...
#
# Resets %OptionsInfo and fills it in from the values in %Options. Yes/No options
# and the output type are turned into 1/0 flags. Dies when a value required by
# the chosen compound ID mode or data fields mode is missing.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  # Options copied over as specified...
  @OptionsInfo{qw(AromaticityModel CompoundIDMode CompoundIDLabel DataFieldsMode)} = @Options{qw(aromaticitymodel compoundidmode compoundidlabel datafieldsmode)};

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      $Options{compoundid}
        or die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      # Label prefix for compound IDs; 'Cmpd' unless specified...
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    $Options{datafields}
      or die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    push @{$OptionsInfo{SpecifiedDataFields}}, split(/\,/, $Options{datafields});
  }

  # Turn a Yes/No option value into 1/0...
  my $YesNoToFlag = sub { return ($_[0] =~ /^Yes$/i) ? 1 : 0; };

  $OptionsInfo{Filter} = $YesNoToFlag->($Options{filter});

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalAtomPairsFingerprints';

  $OptionsInfo{KeepLargestComponent} = $YesNoToFlag->($Options{keeplargestcomponent});

  @OptionsInfo{qw(MinDistance MaxDistance)} = @Options{qw(mindistance maxdistance)};

  # "All" turns on each of the three output types...
  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = $YesNoToFlag->($Options{quote});

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}
596
# Record the atom identifier type and process the options that go with it...
#
# Atomic invariants and functional classes atom types have a list of values to
# validate; the other supported atom types need nothing more. Dies for an atom
# identifier type that is not supported.
#
sub ProcessAtomIdentifierTypeOptions {
  my $Type = $Options{atomidentifiertype};

  $OptionsInfo{AtomIdentifierType} = $Type;

  if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
    return;
  }
  if ($Type =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
    return;
  }
  if ($Type =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    # Nothing to do for now...
    return;
  }

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
616
# Validate and record the atomic invariants to use...
#
# Splits the comma delimited option value and stores the invariants, in the order
# specified, in $OptionsInfo{AtomicInvariantsToUse}. Dies when the value is empty,
# when an invariant is not available, or when neither AS nor AtomSymbol is
# among the invariants.
#
sub ProcessAtomicInvariantsToUseOption {
  @{$OptionsInfo{AtomicInvariantsToUse}} = ();

  die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n"
    if IsEmpty($Options{atomicinvariantstouse});

  my $SawAtomSymbol = 0;
  for my $Invariant (split /\,/, $Options{atomicinvariantstouse}) {
    AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($Invariant)
      or die "Error: Atomic invariant specified, $Invariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";

    $SawAtomSymbol = 1 if ($Invariant =~ /^(AS|AtomSymbol)$/i);

    push @{$OptionsInfo{AtomicInvariantsToUse}}, $Invariant;
  }

  die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n "
    unless $SawAtomSymbol;
}
641
# Validate and record the functional classes to use...
#
# Splits the comma delimited option value and stores the classes, in the order
# specified, in $OptionsInfo{FunctionalClassesToUse}. Dies when the value is empty
# or when a functional class is not available.
#
sub ProcessFunctionalClassesToUse {
  @{$OptionsInfo{FunctionalClassesToUse}} = ();

  die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n"
    if IsEmpty($Options{functionalclassestouse});

  for my $ClassName (split /\,/, $Options{functionalclassestouse}) {
    AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ClassName)
      or die "Error: Functional class specified, $ClassName, using \"--FunctionalClassesToUse\" option is not valid...\n ";

    push @{$OptionsInfo{FunctionalClassesToUse}}, $ClassName;
  }
}
659
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options, assigns default values, lets GetOptions override them from the
# command line, changes to the working directory when one is specified, and then
# validates option values. Dies with an error message on the first problem found.
#
sub SetupScriptUsage {

  # Default values for options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC';

  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mindistance} = 1;
  $Options{maxdistance} = 10;

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{vectorstringformat} = 'IDsAndValuesString';

  # Retrieve all the options...
  if (!GetOptions(\%Options, "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Change directory before any files are checked...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{mindistance})) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxdistance})) {
    die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
  }
  # Equal minimum and maximum distances are allowed; only min > max is an error...
  if ($Options{mindistance} > $Options{maxdistance}) {
    die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" must be less than or equal to the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  # Fingerprints strings contain semicolons, so they must be quoted when the
  # semicolon is also the output delimiter...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
743
744 __END__
745
746 =head1 NAME
747
748 TopologicalAtomPairsFingerprints.pl - Generate topological atom pairs fingerprints for SD files
749
750 =head1 SYNOPSIS
751
752 TopologicalAtomPairsFingerprints.pl SDFile(s)...
753
754 TopologicalAtomPairsFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
755 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes>]
756 [B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">]
757 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">]
758 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
759 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">]
760 [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] [B<-f, --Filter> I<Yes | No>]
761 [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
762 [B<--MinDistance> I<number>] [B<--MaxDistance> I<number>]
763 [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>]
764 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>]
765 [B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
766 [B<-w, --WorkingDir> I<DirName>] SDFile(s)...
767
768 =head1 DESCRIPTION
769
770 Generate topological atom pairs fingerprints [ Ref 57, Ref 59, Ref 72 ] for I<SDFile(s)> and create
771 appropriate SD, FP or CSV/TSV text file(s) containing fingerprints vector strings corresponding to
772 molecular fingerprints.
773
774 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
775 and I<.sd>. All other file names are ignored. All the SD files in the current directory
776 can be specified either by I<*.sdf> or the current directory name.
777
778 The current release of MayaChemTools supports generation of topological atom pairs
779 corresponding to the following B<-a, --AtomIdentifierType> values:
780
781 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
782 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
783 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
784
785 Based on the values specified for B<-a, --AtomIdentifierType> and B<--AtomicInvariantsToUse>,
786 initial atom types are assigned to all non-hydrogen atoms in a molecule. Using the distance
787 matrix for the molecule and initial atom types assigned to non-hydrogen atoms, all unique atom
788 pairs within B<--MinDistance> and B<--MaxDistance> are identified and counted. An atom pair
789 identifier is generated for each unique atom pair; the format of the atom pair identifier is:
790
791 <AtomType1>-D<n>-<AtomType2>
792
793 AtomType1, AtomType2: Atom types assigned to atom1 and atom2
794 D: Distance between atom1 and atom2
795
796 where AtomType1 <= AtomType2
797
798 The atom pair identifiers for all unique atom pairs corresponding to non-hydrogen atoms constitute
799 topological atom pairs fingerprints of the molecule.
800
801 Example of I<SD> file containing topological atom pairs fingerprints string data:
802
803 ... ...
804 ... ...
805 $$$$
806 ... ...
807 ... ...
808 ... ...
809 41 44 0 0 0 0 0 0 0 0999 V2000
810 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
811 ... ...
812 2 3 1 0 0 0 0
813 ... ...
814 M END
815 > <CmpdID>
816 Cmpd1
817
818 > <TopologicalAtomPairsFingerprints>
819 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinDi
820 stance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1.H
821 3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.H1
822 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-C.X2...;
823 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
824 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1 ...
825
826 $$$$
827 ... ...
828 ... ...
829
830 Example of I<FP> file containing topological atom pairs fingerprints string data:
831
832 #
833 # Package = MayaChemTools 7.4
834 # Release Date = Oct 21, 2010
835 #
836 # TimeStamp = Fri Mar 11 15:04:36 2011
837 #
838 # FingerprintsStringType = FingerprintsVector
839 #
840 # Description = TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinDi...
841 # VectorStringFormat = IDsAndValuesString
842 # VectorValuesType = NumericalValues
843 #
844 Cmpd1 223;C.X1.BO1.H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2...;1 1...
845 Cmpd2 128;C.X1.BO1.H3-D1-C.X2.BO2.H2 C.X1.BO1.H3-D1-C.X3.BO4...;1 1...
846 ... ...
847 ... ...
848
849 Example of CSV I<Text> file containing topological atom pairs fingerprints string data:
850
851 "CompoundID","TopologicalAtomPairsFingerprints"
852 "Cmpd1","FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTy
853 pes:MinDistance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C
854 .X1.BO1.H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X
855 3.BO3.H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1...;
856 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
857 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1 ...
858 ... ...
859 ... ...
860
861 The current release of MayaChemTools generates the following types of topological atom pairs
862 fingerprints vector strings:
863
864 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
865 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
866 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
867 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
868 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
869 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
870
871 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
872 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesPairsString;C.X
873 1.BO1.H3-D1-C.X3.BO3.H1 2 C.X2.BO2.H2-D1-C.X2.BO2.H2 1 C.X2.BO2.H2-D1-
874 C.X3.BO3.H1 4 C.X2.BO2.H2-D1-C.X3.BO4 1 C.X2.BO2.H2-D1-N.X3.BO3 1 C.X2
875 .BO3.H1-D1-C.X2.BO3.H1 10 C.X2.BO3.H1-D1-C.X3.BO4 8 C.X3.BO3.H1-D1-C.X
876 3.BO4 1 C.X3.BO3.H1-D1-O.X1.BO1.H1 2 C.X3.BO4-D1-C.X3.BO4 6 C.X3.BO...
877
878 FingerprintsVector;TopologicalAtomPairs:DREIDINGAtomTypes:MinDistance1
879 :MaxDistance10;157;NumericalValues;IDsAndValuesString;C_2-D1-C_3 C_2-D
880 1-C_R C_2-D1-N_3 C_2-D1-O_2 C_2-D1-O_3 C_3-D1-C_3 C_3-D1-C_R C_3-D1-N_
881 R C_3-D1-O_3 C_R-D1-C_R C_R-D1-F_ C_R-D1-N_3 C_R-D1-N_R C_2-D2-C_3 C_2
882 1 1 1 2 1 7 1 1 2 23 1 1 2 1 3 5 5 2 1 5 28 2 3 3 1 1 1 2 4 1 1 4 9 3
883 1 4 24 2 4 3 3 4 5 5 14 1 1 2 3 22 1 3 4 4 1 1 1 1 2 2 5 1 4 21 3 1...
884
885 FingerprintsVector;TopologicalAtomPairs:EStateAtomTypes:MinDistance1:M
886 axDistance10;251;NumericalValues;IDsAndValuesString;aaCH-D1-aaCH aaCH-
887 D1-aasC aasC-D1-aasC aasC-D1-aasN aasC-D1-dssC aasC-D1-sF aasC-D1-ssNH
888 aasC-D1-sssCH aasN-D1-ssCH2 dO-D1-dssC dssC-D1-sOH dssC-D1-ssCH2 d...;
889 10 8 5 2 1 1 1 1 1 2 1 1 1 2 2 1 4 10 12 2 2 6 3 1 3 2 2 1 1 1 1 1 1 1
890 1 1 5 2 1 1 6 12 2 2 2 2 6 1 3 2 2 5 2 2 1 2 1 1 1 1 1 1 3 1 3 19 2...
891
892 FingerprintsVector;TopologicalAtomPairs:FunctionalClassAtomTypes:MinDi
893 stance1:MaxDistance10;144;NumericalValues;IDsAndValuesString;Ar-D1-Ar
894 Ar-D1-Ar.HBA Ar-D1-HBD Ar-D1-Hal Ar-D1-None Ar.HBA-D1-None HBA-D1-NI H
895 BA-D1-None HBA.HBD-D1-NI HBA.HBD-D1-None HBD-D1-None NI-D1-None No...;
896 23 2 1 1 2 1 1 1 1 2 1 1 7 28 3 1 3 2 8 2 1 1 1 5 1 5 24 3 3 4 2 13 4
897 1 1 4 1 5 22 4 4 3 1 19 1 1 1 1 1 2 2 3 1 1 8 25 4 5 2 3 1 26 1 4 1 ...
898
899 FingerprintsVector;TopologicalAtomPairs:MMFF94AtomTypes:MinDistance1:M
900 axDistance10;227;NumericalValues;IDsAndValuesPairsString;C5A-D1-C5B 2
901 C5A-D1-CB 1 C5A-D1-CR 1 C5A-D1-N5 2 C5B-D1-C5B 1 C5B-D1-C=ON 1 C5B-D1-
902 CB 1 C=ON-D1-NC=O 1 C=ON-D1-O=CN 1 CB-D1-CB 18 CB-D1-F 1 CB-D1-NC=O 1
903 COO-D1-CR 1 COO-D1-O=CO 1 COO-D1-OC=O 1 CR-D1-CR 7 CR-D1-N5 1 CR-D1-OR
904 2 C5A-D2-C5A 1 C5A-D2-C5B 2 C5A-D2-C=ON 1 C5A-D2-CB 3 C5A-D2-CR 4 ...
905
906 FingerprintsVector;TopologicalAtomPairs:SLogPAtomTypes:MinDistance1:Ma
907 xDistance10;329;NumericalValues;IDsAndValuesPairsString;C1-D1-C10 1 C1
908 -D1-C11 2 C1-D1-C5 1 C1-D1-CS 4 C10-D1-N11 1 C11-D1-C21 1 C14-D1-C18 2
909 C14-D1-F 1 C18-D1-C18 10 C18-D1-C20 4 C18-D1-C22 2 C20-D1-C20 3 C20-D
910 1-C21 1 C20-D1-N11 1 C21-D1-C21 1 C21-D1-C5 1 C21-D1-N11 1 C22-D1-N4 1
911 C5-D1-N4 1 C5-D1-O10 1 C5-D1-O2 1 C5-D1-O9 1 CS-D1-O2 2 C1-D2-C1 3...
912
913 FingerprintsVector;TopologicalAtomPairs:SYBYLAtomTypes:MinDistance1:Ma
914 xDistance10;159;NumericalValues;IDsAndValuesPairsString;C.2-D1-C.3 1 C
915 .2-D1-C.ar 1 C.2-D1-N.am 1 C.2-D1-O.2 1 C.2-D1-O.co2 2 C.3-D1-C.3 7 C.
916 3-D1-C.ar 1 C.3-D1-N.ar 1 C.3-D1-O.3 2 C.ar-D1-C.ar 23 C.ar-D1-F 1 C.a
917 r-D1-N.am 1 C.ar-D1-N.ar 2 C.2-D2-C.3 1 C.2-D2-C.ar 3 C.3-D2-C.3 5 C.3
918 -D2-C.ar 5 C.3-D2-N.ar 2 C.3-D2-O.3 4 C.3-D2-O.co2 2 C.ar-D2-C.ar 2...
919
920 FingerprintsVector;TopologicalAtomPairs:TPSAAtomTypes:MinDistance1:Max
921 Distance10;64;NumericalValues;IDsAndValuesPairsString;N21-D1-None 3 N7
922 -D1-None 2 None-D1-None 34 None-D1-O3 2 None-D1-O4 3 N21-D2-None 5 N7-
923 D2-None 3 N7-D2-O3 1 None-D2-None 44 None-D2-O3 2 None-D2-O4 5 O3-D2-O
924 4 1 N21-D3-None 7 N7-D3-None 4 None-D3-None 45 None-D3-O3 4 None-D3-O4
925 5 N21-D4-N7 1 N21-D4-None 5 N21-D4-O3 1 N21-D4-O4 1 N7-D4-None 4 N...
926
927 FingerprintsVector;TopologicalAtomPairs:UFFAtomTypes:MinDistance1:MaxD
928 istance10;157;NumericalValues;IDsAndValuesPairsString;C_2-D1-C_3 1 C_2
929 -D1-C_R 1 C_2-D1-N_3 1 C_2-D1-O_2 2 C_2-D1-O_3 1 C_3-D1-C_3 7 C_3-D1-C
930 _R 1 C_3-D1-N_R 1 C_3-D1-O_3 2 C_R-D1-C_R 23 C_R-D1-F_ 1 C_R-D1-N_3 1
931 C_R-D1-N_R 2 C_2-D2-C_3 1 C_2-D2-C_R 3 C_3-D2-C_3 5 C_3-D2-C_R 5 C_3-D
932 2-N_R 2 C_3-D2-O_2 1 C_3-D2-O_3 5 C_R-D2-C_R 28 C_R-D2-F_ 2 C_R-D2-...
933
934 =head1 OPTIONS
935
936 =over 4
937
938 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
939
940 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
941 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
942 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
943 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
944
945 The supported aromaticity model names along with model specific control parameters
946 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
947 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
948 this file during class instantiation and makes it available to method B<DetectAromaticity>
949 for detecting aromaticity corresponding to a specific model.
950
951 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
952
953 Specify atom identifier type to use for assignment of initial atom identifier to non-hydrogen
954 atoms during calculation of topological atom pairs fingerprints. Possible values in the current
955 release are: I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
956 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
957 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
958
959 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">
960
961 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType>
962 option. It's a list of comma separated valid atomic invariant atom types.
963
964 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
965 H, Ar, RA, FC, MN, SM>. Default value: I<AS,X,BO,H,FC>.
966
967 The atomic invariants abbreviations correspond to:
968
969 AS = Atom symbol corresponding to element symbol
970
971 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
972 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
973 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
974 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
975 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
976 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
977 H<n> = Number of implicit and explicit hydrogens for atom
978 Ar = Aromatic annotation indicating whether atom is aromatic
979 RA = Ring atom annotation indicating whether atom is in a ring
980 FC<+n/-n> = Formal charge assigned to atom
981 MN<n> = Mass number indicating isotope other than most abundant isotope
982 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
983 3 (triplet)
984
985 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
986
987 AS.X<n>.BO<n>.LBO<n>.SB<n>.DB<n>.TB<n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
988
989 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
990 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
991
992 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
993 are also allowed:
994
995 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
996 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
997 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
998 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
999 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1000 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1001 H : NumOfImplicitAndExplicitHydrogens
1002 Ar : Aromatic
1003 RA : RingAtom
1004 FC : FormalCharge
1005 MN : MassNumber
1006 SM : SpinMultiplicity
1007
1008 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1009 atom types.
1010
1011 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">
1012
1013 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType>
1014 option. It's a list of comma separated valid functional classes.
1015
1016 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1017 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1018
1019 The functional class abbreviations correspond to:
1020
1021 HBD: HydrogenBondDonor
1022 HBA: HydrogenBondAcceptor
1023 PI : PositivelyIonizable
1024 NI : NegativelyIonizable
1025 Ar : Aromatic
1026 Hal : Halogen
1027 H : Hydrophobic
1028 RA : RingAtom
1029 CA : ChainAtom
1030
1031 Functional class atom type specification for an atom corresponds to:
1032
1033 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1034
1035 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1036 types. It uses the following definitions [ Ref 60-61, Ref 65-66 ]:
1037
1038 HydrogenBondDonor: NH, NH2, OH
1039 HydrogenBondAcceptor: N[!H], O
1040 PositivelyIonizable: +, NH2
1041 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1042
1043 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1044
1045 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1046
1047 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1048 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1049 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1050 look like Cmpd<Number>.
1051
1052 Examples for I<DataField> value of B<--CompoundIDMode>:
1053
1054 MolID
1055 ExtReg
1056
1057 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1058
1059 Compound
1060
1061 The value specified above generates compound IDs which correspond to Compound<Number>
1062 instead of default value of Cmpd<Number>.
1063
1064 =item B<--CompoundIDLabel> I<text>
1065
1066 Specify compound ID column label for CSV/TSV text file(s) used during I<CompoundID> value
1067 of B<--DataFieldsMode> option. Default value: I<CompoundID>.
1068
1069 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1070
1071 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1072 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1073 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1074 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1075
1076 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1077 Default value: I<LabelPrefix>.
1078
1079 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1080 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1081 values are replaced with sequential compound IDs.
1082
1083 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1084
1085 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1086
1087 Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along
1088 with generated fingerprints for I<text | all> values of B<--output> option.
1089
1090 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1091
1092 Examples:
1093
1094 Extreg
1095 MolID,CompoundName
1096
1097 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1098
1099 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1100 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1101 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1102 generate a compound ID using molname line, a compound prefix, or a combination of both.
1103 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1104
1105 =item B<-f, --Filter> I<Yes | No>
1106
1107 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1108 Default value: I<Yes>.
1109
1110 By default, compound data is checked before calculating fingerprints and compounds containing
1111 atom data corresponding to non-element symbols or no atom data are ignored.
1112
1113 =item B<--FingerprintsLabel> I<text>
1114
1115 SD data label or text file column label to use for fingerprints string in output SD or
1116 CSV/TSV text file(s) specified by B<--output>. Default value: I<TopologicalAtomPairsFingerprints>.
1117
1118 =item B<-h, --help>
1119
1120 Print this help message.
1121
1122 =item B<-k, --KeepLargestComponent> I<Yes | No>
1123
1124 Generate fingerprints for only the largest component in molecule. Possible values:
1125 I<Yes or No>. Default value: I<Yes>.
1126
1127 For molecules containing multiple connected components, fingerprints can be generated
1128 in two different ways: use all connected components or just the largest connected
1129 component. By default, all atoms except for the largest connected component are
1130 deleted before generation of fingerprints.
1131
1132 =item B<--MinDistance> I<number>
1133
1134 Minimum bond distance between atom pairs for generating topological atom pairs. Default value:
1135 I<1>. Valid values: positive integers and less than B<--MaxDistance>.
1136
1137 =item B<--MaxDistance> I<number>
1138
1139 Maximum bond distance between atom pairs for generating topological atom pairs. Default value:
1140 I<10>. Valid values: positive integers and greater than B<--MinDistance>.
1141
1142 =item B<--OutDelim> I<comma | tab | semicolon>
1143
1144 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1145 Default value: I<comma>.
1146
1147 =item B<--output> I<SD | FP | text | all>
1148
1149 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1150
1151 =item B<-o, --overwrite>
1152
1153 Overwrite existing files.
1154
1155 =item B<-q, --quote> I<Yes | No>
1156
1157 Put quotes around column values in output CSV/TSV text file(s). Possible values:
1158 I<Yes or No>. Default value: I<Yes>.
1159
1160 =item B<-r, --root> I<RootName>
1161
1162 New file name is generated using the root: <Root>.<Ext>. Default for new file names:
1163 <SDFileName><TopologicalAtomPairsFP>.<Ext>. The file type determines <Ext> value.
1164 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1165 delimited text files, respectively. This option is ignored for multiple input files.
1166
1167 =item B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1168
1169 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1170 B<--output> option. Possible values: I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString |
1171 ValuesAndIDsPairsString>. Default value: I<IDsAndValuesString>.
1172
1173 Examples:
1174
1175 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
1176 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesString;C.X1.BO1
1177 .H3-D1-C.X3.BO3.H1 C.X2.BO2.H2-D1-C.X2.BO2.H2 C.X2.BO2.H2-D1-C.X3.BO3.
1178 H1 C.X2.BO2.H2-D1-C.X3.BO4 C.X2.BO2.H2-D1-N.X3.BO3 C.X2.BO3.H1-D1-...;
1179 2 1 4 1 1 10 8 1 2 6 1 2 2 1 2 1 2 2 1 2 1 5 1 10 12 2 2 1 2 1 9 1 3 1
1180 1 1 2 2 1 3 6 1 6 14 2 2 2 3 1 3 1 8 2 2 1 3 2 6 1 2 2 5 1 3 1 23 1...
1181
1182 FingerprintsVector;TopologicalAtomPairs:AtomicInvariantsAtomTypes:MinD
1183 istance1:MaxDistance10;223;NumericalValues;IDsAndValuesPairsString;C.X
1184 1.BO1.H3-D1-C.X3.BO3.H1 2 C.X2.BO2.H2-D1-C.X2.BO2.H2 1 C.X2.BO2.H2-D1-
1185 C.X3.BO3.H1 4 C.X2.BO2.H2-D1-C.X3.BO4 1 C.X2.BO2.H2-D1-N.X3.BO3 1 C.X2
1186 .BO3.H1-D1-C.X2.BO3.H1 10 C.X2.BO3.H1-D1-C.X3.BO4 8 C.X3.BO3.H1-D1-C.X
1187 3.BO4 1 C.X3.BO3.H1-D1-O.X1.BO1.H1 2 C.X3.BO4-D1-C.X3.BO4 6 C.X3.BO...
1188
1189
1190 =item B<-w, --WorkingDir> I<DirName>
1191
1192 Location of working directory. Default value: current directory.
1193
1194 =back
1195
1196 =head1 EXAMPLES
1197
1198 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1199 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1200 file containing sequential compound IDs along with fingerprints vector strings data, type:
1201
1202 % TopologicalAtomPairsFingerprints.pl -r SampleTAPFP -o Sample.sdf
1203
1204 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1205 10 using atomic invariants atom types in IDsAndValuesString format and create SampleTAPFP.sdf,
1206 SampleTAPFP.fpf and SampleTAPFP.csv files containing sequential compound IDs in CSV file along
1207 with fingerprints vector strings data, type:
1208
1209 % TopologicalAtomPairsFingerprints.pl --output all -r SampleTAPFP
1210 -o Sample.sdf
1211
1212 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1213 10 using DREIDING atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1214 file containing sequential compound IDs along with fingerprints vector strings data, type:
1215
1216 % TopologicalAtomPairsFingerprints.pl -a DREIDINGAtomTypes
1217 -r SampleTAPFP -o Sample.sdf
1218
1219 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1220 10 using E-state atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1221 file containing sequential compound IDs along with fingerprints vector strings data, type:
1222
1223 % TopologicalAtomPairsFingerprints.pl -a EStateAtomTypes
1224 -r SampleTAPFP -o Sample.sdf
1225
1226 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1227 10 using DREIDING atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1228 file containing sequential compound IDs along with fingerprints vector strings data, type:
1229
1230 % TopologicalAtomPairsFingerprints.pl -a DREIDINGAtomTypes
1231 -r SampleTAPFP -o Sample.sdf
1232
1233 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1234 10 using functional class atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1235 file containing sequential compound IDs along with fingerprints vector strings data, type:
1236
1237 % TopologicalAtomPairsFingerprints.pl -a FunctionalClassAtomTypes
1238 -r SampleTAPFP -o Sample.sdf
1239
1240 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1241 10 using MMFF94 atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1242 file containing sequential compound IDs along with fingerprints vector strings data, type:
1243
1244 % TopologicalAtomPairsFingerprints.pl -a MMFF94AtomTypes
1245 -r SampleTAPFP -o Sample.sdf
1246
1247 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1248 10 using SLogP atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1249 file containing sequential compound IDs along with fingerprints vector strings data, type:
1250
1251 % TopologicalAtomPairsFingerprints.pl -a SLogPAtomTypes
1252 -r SampleTAPFP -o Sample.sdf
1253
1254 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1255 10 using SYBYL atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1256 file containing sequential compound IDs along with fingerprints vector strings data, type:
1257
1258 % TopologicalAtomPairsFingerprints.pl -a SYBYLAtomTypes
1259 -r SampleTAPFP -o Sample.sdf
1260
1261 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1262 10 using TPSA atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1263 file containing sequential compound IDs along with fingerprints vector strings data, type:
1264
1265 % TopologicalAtomPairsFingerprints.pl -a TPSAAtomTypes
1266 -r SampleTAPFP -o Sample.sdf
1267
1268 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1269 10 using UFF atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1270 file containing sequential compound IDs along with fingerprints vector strings data, type:
1271
1272 % TopologicalAtomPairsFingerprints.pl -a UFFAtomTypes
1273 -r SampleTAPFP -o Sample.sdf
1274
1275 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1276 10 using atomic invariants atom types in IDsAndValuesPairsString format and create a SampleTAPFP.csv
1277 file containing sequential compound IDs along with fingerprints vector strings data, type:
1278
1279 % TopologicalAtomPairsFingerprints.pl --VectorStringFormat
1280 IDsAndValuesPairsString -r SampleTAPFP -o Sample.sdf
1281
1282 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1283 6 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1284 file containing sequential compound IDs along with fingerprints vector strings data, type:
1285
1286 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1287 --MinDistance 1 --MaxDistance 6 -r SampleTAPFP -o Sample.sdf
1288
1289 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1290 6 using only AS,X atomic invariants atom types in IDsAndValuesString format and create a
1291 SampleTAPFP.csv file containing sequential compound IDs along with fingerprints vector strings
1292 data, type:
1293
1294 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1295 --AtomicInvariantsToUse "AS,X" --MinDistance 1 --MaxDistance 6
1296 -r SampleTAPFP -o Sample.sdf
1297
1298 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1299 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1300 file containing compound ID from molecule name line along with fingerprints vector strings
1301 data, type:
1302
1303 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1304 --DataFieldsMode CompoundID --CompoundIDMode MolName
1305 -r SampleTAPFP -o Sample.sdf
1306
1307 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1308 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1309 file containing compound IDs using specified data field along with fingerprints vector strings
1310 data, type:
1311
1312 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1313 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID
1314 Mol_ID -r SampleTAPFP -o Sample.sdf
1315
1316 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1317 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1318 file containing compound ID using combination of molecule name line and an explicit compound
1319 prefix along with fingerprints vector strings data, type:
1320
1321 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1322 --DataFieldsMode CompoundID --CompoundIDMode MolNameOrLabelPrefix
1323 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleTAPFP -o Sample.sdf
1324
1325 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1326 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1327 file containing specific data fields columns along with fingerprints vector strings
1328 data, type:
1329
1330 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1331 --DataFieldsMode Specify --DataFields Mol_ID -r SampleTAPFP
1332 -o Sample.sdf
1333
1334 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1335 10 using atomic invariants atom types in IDsAndValuesString format and create a SampleTAPFP.csv
1336 file containing common data fields columns along with fingerprints vector strings
1337 data, type:
1338
1339 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1340 --DataFieldsMode Common -r SampleTAPFP -o Sample.sdf
1341
1342 To generate topological atom pairs fingerprints corresponding to bond distances from 1 through
1343 10 using atomic invariants atom types in IDsAndValuesString format and create SampleTAPFP.sdf,
1344 SampleTAPFP.fpf and SampleTAPFP.csv files containing all data fields columns in CSV file along
1345 with fingerprints data, type:
1346
1347 % TopologicalAtomPairsFingerprints.pl -a AtomicInvariantsAtomTypes
1348 --DataFieldsMode All --output all -r SampleTAPFP
1349 -o Sample.sdf
1350
1351 =head1 AUTHOR
1352
1353 Manish Sud <msud@san.rr.com>
1354
1355 =head1 SEE ALSO
1356
1357 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1358 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl,
1359 PathLengthFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1360 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1361
1362 =head1 COPYRIGHT
1363
1364 Copyright (C) 2015 Manish Sud. All rights reserved.
1365
1366 This file is part of MayaChemTools.
1367
1368 MayaChemTools is free software; you can redistribute it and/or modify it under
1369 the terms of the GNU Lesser General Public License as published by the Free
1370 Software Foundation; either version 3 of the License, or (at your option)
1371 any later version.
1372
1373 =cut