comparison bin/TopologicalPharmacophoreAtomPairsFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: TopologicalPharmacophoreAtomPairsFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:23 $
5 # $Revision: 1.36 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::FunctionalClassAtomTypes;
43 use Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints;
44
# File-scope state used by the main flow and by the subroutines defined below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn on autoflush for STDOUT...
$| = 1;

# Announce the start of the run and start the timer...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Retrieve command line options; show usage when help is requested or no
# input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Convert option values into the form used by the rest of the script...
print "Processing options...\n";
my %OptionsInfo;
ProcessOptions();

# Collect information about input files and their output file names...
print "Checking input SD file(s)...\n";
my %SDFilesInfo;
RetrieveSDFilesInfo();

# Generate fingerprints for each input file marked as okay...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);
for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateTopologicalPharmacophoreAtomPairsFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
90
91 ###############################################################################
92
# Generate fingerprints for all compounds in one input SD file...
#
# The only argument is the index of the file in @SDFilesList. Output files are
# set up and opened after fingerprints have been generated successfully for
# the first compound; compounds which are filtered out or for which
# fingerprints generation fails are counted as ignored. Summary statistics are
# written out at the end.
#
sub GenerateTopologicalPharmacophoreAtomPairsFingerprints {
  my($FileIndex) = @_;

  my $InputFile = $SDFilesList[$FileIndex];

  my($SDWriter, $TextWriter, $FPWriter);
  my $OutputFilesOpened = 0;

  my $Reader = MoleculeFileIO->new('Name' => $InputFile);
  $Reader->Open();

  my($TotalCount, $SkippedCount) = (0, 0);

  COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Filter compound data before calculating fingerprints...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $SkippedCount++;
      next COMPOUND;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $SkippedCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next COMPOUND;
    }

    # First usable compound: finalize the label and open output files...
    unless ($OutputFilesOpened) {
      $OutputFilesOpened = 1;
      SetupFingerprintsLabelValueIDs($Fingerprints);
      ($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close whichever output files were opened...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}
150
# Warn about a compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the explanation: ContainsNonElementalData,
# ContainsNoElementalData or FingerprintsGenerationFailed. Any other value is
# reported with the same text as FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $FieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers FingerprintsGenerationFailed along with any unrecognized mode...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
}
178
# Decide whether a compound must be left out of fingerprints generation...
#
# Returns 1, after reporting the compound through ProcessIgnoredCompound, when
# the molecule contains non-elemental data or no elemental data at all;
# otherwise returns 0.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data takes precedence over missing elemental data...
  my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
              : !$NumOfElements   ? 'ContainsNoElementalData'
              :                     '';

  return 0 unless $Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}
199
# Print summary statistics for fingerprints generation...
#
# Takes the total number of compounds read and the number of compounds
# ignored; the number processed successfully is their difference.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $SuccessCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n",
    "Number of compounds processed successfully during fingerprints generation: $SuccessCount\n",
    "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
212
# Append atom pair value IDs to fingerprints label...
#
# Nothing is done for ArbitrarySize atom pairs set or when the label mode is
# not FingerprintsLabelWithIDs. Otherwise the value IDs string of the supplied
# fingerprints object is appended to $OptionsInfo{FingerprintsLabel}.
#
# The label is shared by all input files and this function is invoked once
# for each input file, so the IDs are appended only during the first
# invocation; appending again would repeat the "; Value IDs: ..." suffix in
# the label used for every file after the first one.
#
sub SetupFingerprintsLabelValueIDs {
  my($TopologicalPharmacophoreAtomPairsFingerprints) = @_;

  if ($OptionsInfo{AtomPairsSetSizeToUse} =~ /^ArbitrarySize$/i ||
    $OptionsInfo{FingerprintsLabelMode} !~ /^FingerprintsLabelWithIDs$/i) {
    return;
  }

  # NOTE(review): assumes value IDs for the fixed size set are the same for
  # all molecules, as already implied by taking them from the first molecule
  # of a file - confirm.
  if ($OptionsInfo{FingerprintsLabelValueIDsAppended}) {
    return;
  }

  $OptionsInfo{FingerprintsLabel} .= "; Value IDs: " . $TopologicalPharmacophoreAtomPairsFingerprints->GetFingerprintsVector->GetValueIDsString();
  $OptionsInfo{FingerprintsLabelValueIDsAppended} = 1;
}
225
# Create and open fingerprints output files for an input file...
#
# Depending on the SDOutput, FPOutput and TextOutput settings in %OptionsInfo,
# a fingerprints file IO object is created and opened for each requested
# format using the file names stored in %SDFilesInfo for $FileIndex. Returns
# the list (SD IO, text IO, FP IO); entries for formats which were not
# requested are undefined.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($SDWriter, $TextWriter, $FPWriter);

  # Parameters common to all fingerprints file IO objects...
  my %CommonParams = (
    'Mode' => 'Write',
    'Overwrite' => $OptionsInfo{OverwriteFiles},
    'FingerprintsStringMode' => 'FingerprintsVectorString',
    'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
  );

  if ($OptionsInfo{SDOutput}) {
    my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $SDOutFile...\n";
    $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $FPOutFile...\n";
    $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $LabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $TextOutFile...\n";
    $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonParams, 'DataColLabels' => $LabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}
265
# Write fingerprints and other data to appropriate output files...
#
# Each of the three trailing arguments is either an open fingerprints file IO
# object or a false value; data is written only to the objects supplied:
#
#   SD file:   fingerprints along with the input molecule string
#   Text file: fingerprints along with the data column values
#   FP file:   fingerprints along with the compound ID
#
# Data field labels and values are retrieved from the molecule only when a
# text or FP file is being written.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter) = @_;

  my $FieldsRef = ($TextWriter || $FPWriter) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($SDWriter) {
    my $MoleculeString = $Molecule->GetInputMoleculeString();
    $SDWriter->WriteFingerprints($Fingerprints, $MoleculeString);
  }

  if ($TextWriter) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldsRef);
    $TextWriter->WriteFingerprints($Fingerprints, $RowValuesRef);
  }

  if ($FPWriter) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldsRef);
    $FPWriter->WriteFingerprints($Fingerprints, $ID);
  }
}
298
# Generate appropriate column labels for fingerprints text output file...
#
# Data column labels are chosen according to $OptionsInfo{DataFieldsMode}: all
# or common data fields of the input file at $FileIndex, data fields specified
# by the user, or the compound ID label. The fingerprints label is always the
# last column. Returns a reference to the list of labels.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;
  my $Mode = $OptionsInfo{DataFieldsMode};

  my @Labels = ($Mode =~ /^All$/i)        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
             : ($Mode =~ /^Common$/i)     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
             : ($Mode =~ /^Specify$/i)    ? @{$OptionsInfo{SpecifiedDataFields}}
             : ($Mode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
             :                              ();

  # Add fingerprints label...
  push @Labels, $OptionsInfo{FingerprintsLabel};

  return \@Labels;
}
323
# Generate data column values for a row in fingerprints text output file...
#
# In CompoundID mode the only value is the compound ID. In All, Common and
# Specify modes there is one value for each corresponding data field label,
# with an empty string for labels missing from the compound data. Returns a
# reference to the list of values.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my $Mode = $OptionsInfo{DataFieldsMode};

  if ($Mode =~ /^CompoundID$/i) {
    return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
  }

  # Pick the list of data field labels to look up...
  my $LabelsRef;
  if ($Mode =~ /^All$/i) {
    $LabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
  }
  elsif ($Mode =~ /^Common$/i) {
    $LabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
  }
  elsif ($Mode =~ /^Specify$/i) {
    $LabelsRef = $OptionsInfo{SpecifiedDataFields};
  }
  else {
    return [];
  }

  my @Values = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$LabelsRef};

  return \@Values;
}
346
# Generate compound ID for FP and text output files...
#
# The ID depends on $OptionsInfo{CompoundIDMode}:
#
#   MolNameOrLabelPrefix: molecule name or, for a molecule without a name,
#                         $OptionsInfo{CompoundID} followed by compound number
#   LabelPrefix:          $OptionsInfo{CompoundID} followed by compound number
#   DataField:            value of data field named by $OptionsInfo{CompoundID}
#                         or empty string for a missing data field
#   MolName:              molecule name
#
# An empty string is returned for any other mode.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
  my $Mode = $OptionsInfo{CompoundIDMode};

  if ($Mode =~ /^MolNameOrLabelPrefix$/i) {
    my $Name = $Molecule->GetName();
    return $Name ? $Name : "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($Mode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($Mode =~ /^DataField$/i) {
    my $Field = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$Field} ? $DataFieldLabelAndValuesRef->{$Field} : '';
  }

  if ($Mode =~ /^MolName$/i) {
    # Retrieve name in scalar context irrespective of the caller's context...
    my $Name = $Molecule->GetName();
    return $Name;
  }

  return '';
}
372
# Generate fingerprints for a molecule...
#
# The molecule is prepared first: optionally reduced to its largest component,
# followed by detection of rings and aromaticity using the specified
# aromaticity model. Returns the fingerprints object, or undef when ring
# detection fails or fingerprints generation is not successful.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Parameters used with or without fuzzification...
  my @Params = (
    'Molecule' => $Molecule,
    'AtomPairsSetSizeToUse' => $OptionsInfo{AtomPairsSetSizeToUse},
    'MinDistance' => $OptionsInfo{MinDistance},
    'MaxDistance' => $OptionsInfo{MaxDistance},
    'AtomTypesToUse' => \@{$OptionsInfo{AtomTypesToUse}},
    'NormalizationMethodology' => $OptionsInfo{NormalizationMethodology},
    'ValuesPrecision' => $OptionsInfo{ValuesPrecision},
  );

  # Fuzzification parameters are passed only when fuzzification is on...
  if ($OptionsInfo{FuzzifyAtomPairsCount}) {
    push @Params, (
      'FuzzifyAtomPairsCount' => $OptionsInfo{FuzzifyAtomPairsCount},
      'FuzzificationMode' => $OptionsInfo{FuzzificationMode},
      'FuzzificationMethodology' => $OptionsInfo{FuzzificationMethodology},
      'FuzzFactor' => $OptionsInfo{FuzzFactor},
    );
  }

  my $Fingerprints = Fingerprints::TopologicalPharmacophoreAtomPairsFingerprints->new(@Params);

  # Set atom types weights...
  if ($OptionsInfo{UseAtomTypesWeight}) {
    $Fingerprints->SetAtomTypesWeight(%{$OptionsInfo{AtomTypesWeight}});
  }

  $Fingerprints->GenerateFingerprints();

  # Make sure fingerprints generation is successful...
  return $Fingerprints->IsFingerprintsGenerationSuccessful() ? $Fingerprints : undef;
}
410
# Retrieve information about input SD files...
#
# For each file in @SDFilesList, the following parallel lists in %SDFilesInfo
# are filled in: FileOkay, OutFileRoot, SDOutFileNames, FPOutFileNames,
# TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef.
#
# A file is ignored with a warning, leaving its FileOkay value at 0 and its
# output names empty, when:
#
#   . It doesn't exist or is not a SD file
#   . The data field to be used as compound ID is missing from its first
#     compound (checked only for text output in CompoundID data fields mode
#     with DataField compound ID mode)
#   . The SD output file name matches the input file name
#   . An output file already exists and overwriting has not been requested
#
# The script dies when an input file can't be opened.
#
sub RetrieveSDFilesInfo {
  my($CheckDataField, $CollectDataFields);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Assume the worst until all checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open with a lexical handle: the file name comes from
      # the command line and must never be interpreted as an open mode or a
      # command. The lexical handle is a glob reference just like \*SDFILE.
      open(my $SDFileHandle, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    my $AllDataFieldsRef = '';
    my $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);

      open(my $SDFileHandle, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    my $TextOutFileExt = ($Options{outdelim} =~ /^tab$/i) ? "tsv" : "csv";
    my $SDOutFileExt = $FileExt;
    my $FPOutFileExt = "fpf";

    # A user specified root is used only for a single input file...
    my $OutFileRoot;
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    else {
      $OutFileRoot = "${FileName}TopologicalPharmacophoreAtomPairsFP";
    }

    my $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    my $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    my $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Match the output name literally: an unquoted name would have its
      # "." and any other regex metacharacters treated as pattern syntax.
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
543
# Process option values...
#
# Transfers command line option values from %Options into %OptionsInfo, the
# hash consulted by the rest of the script: Yes/No values are converted into
# 1/0 flags, the output mode is expanded into SDOutput, FPOutput and
# TextOutput flags, and defaults are set up for compound ID prefix,
# fingerprints label and vector string format.
#
# The script dies when a compound ID data field or data fields required by
# the specified mode are missing, or the specified vector string format is
# not valid for ArbitrarySize atom pairs set.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomTypesToUseOption();
  ProcessAtomTypesWeightOption();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{AtomPairsSetSizeToUse} = $Options{atompairssetsizetouse};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      # Fall back to the default compound ID prefix...
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};
  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'TopologicalPharmacophoreAtomPairsFingerprints';

  $OptionsInfo{FuzzifyAtomPairsCount} = ($Options{fuzzifyatompairscount} =~ /^Yes$/i) ? 1 : 0;
  $OptionsInfo{FuzzificationMode} = $Options{fuzzificationmode};
  $OptionsInfo{FuzzificationMethodology} = $Options{fuzzificationmethodology};
  $OptionsInfo{FuzzFactor} = $Options{fuzzfactor};

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{MinDistance} = $Options{mindistance};
  $OptionsInfo{MaxDistance} = $Options{maxdistance};

  $OptionsInfo{NormalizationMethodology} = $Options{normalizationmethodology};

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

  # Setup default vector string format...
  my($VectorStringFormat);
  $VectorStringFormat = '';

  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};

    if ($Options{atompairssetsizetouse} =~ /^ArbitrarySize$/i && $VectorStringFormat =~ /^ValuesString$/i) {
      die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid for $Options{atompairssetsizetouse} value of \"--AtomPairsSetSizeToUse\" option. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
  }
  else {
    # Option values are accepted irrespective of their case everywhere else,
    # so the match must ignore case here as well; otherwise a value such as
    # "fixedsize" would get the default meant for ArbitrarySize.
    $VectorStringFormat = ($Options{atompairssetsizetouse} =~ /^FixedSize$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}
630
# Process atom types to use option...
#
# Splits the comma delimited value of "-a, --AtomTypesToUse" option, after
# removing all spaces, and stores the atom types in the specified order in
# @{$OptionsInfo{AtomTypesToUse}}. The script dies for an empty value or an
# atom type which is not an available functional class.
#
sub ProcessAtomTypesToUseOption {
  @{$OptionsInfo{AtomTypesToUse}} = ();

  if (IsEmpty($Options{atomtypestouse})) {
    die "Error: Atom types value specified using \"-a, --AtomTypesToUse\" option is empty\n";
  }

  (my $AtomTypesSpec = $Options{atomtypestouse}) =~ s/ //g;

  for my $Type (split /\,/, $AtomTypesSpec) {
    AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($Type)
      or die "Error: Atomic type specified, $Type, using \"-a, --AtomTypesToUse\" option is not valid...\n ";

    push @{$OptionsInfo{AtomTypesToUse}}, $Type;
  }
}
652
# Process atom types weight option...
#
# Sets $OptionsInfo{UseAtomTypesWeight} to 0 for the value None of
# "--AtomTypesWeight" option and to 1 otherwise. In the latter case, the
# value is treated as a comma delimited list of atom type and weight pairs,
# which are validated and stored in %{$OptionsInfo{AtomTypesWeight}}. The
# script dies for an empty value, an odd number of values, an atom type which
# is not an available functional class, or a weight which is not a
# real number >= 0.
#
sub ProcessAtomTypesWeightOption {
  %{$OptionsInfo{AtomTypesWeight}} = ();

  if (IsEmpty($Options{atomtypesweight})) {
    die "Error: Atom types weight value specified using \"--AtomTypesWeight\" option is empty\n";
  }

  $OptionsInfo{UseAtomTypesWeight} = ($Options{atomtypesweight} =~ /^None$/i) ? 0 : 1;
  return unless $OptionsInfo{UseAtomTypesWeight};

  # Process specified atom type/weight pairs...
  (my $WeightsSpec = $Options{atomtypesweight}) =~ s/ //g;
  my @TypesAndWeights = split /\,/, $WeightsSpec;

  if (@TypesAndWeights % 2) {
    die "Error: Invalid number of values specified using \"--AtomTypesWeight\" option: It must contain even number of values.\n";
  }

  # Consume the list two values at a time...
  while (my($Type, $Weight) = splice(@TypesAndWeights, 0, 2)) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($Type)) {
      die "Error: Atom type specified, $Type, using \"--AtomTypesWeight\" option is not valid\n ";
    }
    if (!(IsFloat($Weight) && $Weight >= 0)) {
      die "Error: Atom type weight specified, $Weight, using option \"--AtomTypesWeight\" is not valid. Allowed values: real numbers >= 0 \n";
    }
    $OptionsInfo{AtomTypesWeight}{$Type} = $Weight;
  }
}
688
689 # Setup script usage and retrieve command line arguments specified using various options...
690 sub SetupScriptUsage {
691
692 # Retrieve all the options...
693 %Options = ();
694
695 $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';
696
697 $Options{atompairssetsizetouse} = 'ArbitrarySize';
698
699 $Options{atomtypestouse} = 'HBD,HBA,PI,NI,H';
700 $Options{atomtypesweight} = 'None';
701
702 $Options{compoundidmode} = 'LabelPrefix';
703 $Options{compoundidlabel} = 'CompoundID';
704 $Options{datafieldsmode} = 'CompoundID';
705
706 $Options{filter} = 'Yes';
707
708 $Options{fingerprintslabelmode} = 'FingerprintsLabelOnly';
709
710 $Options{fuzzifyatompairscount} = 'No';
711 $Options{fuzzificationmode} = 'AfterNormalization';
712 $Options{fuzzificationmethodology} = 'FuzzyBinning';
713 $Options{fuzzfactor} = 0.15;
714
715 $Options{keeplargestcomponent} = 'Yes';
716
717 $Options{mindistance} = 1;
718 $Options{maxdistance} = 10;
719
720 $Options{normalizationmethodology} = 'None';
721
722 $Options{output} = 'text';
723 $Options{outdelim} = 'comma';
724 $Options{quote} = 'yes';
725
726 $Options{valuesprecision} = 2;
727
728 $Options{vectorstringformat} = '';
729
730 if (!GetOptions(\%Options, "aromaticitymodel=s", "atompairssetsizetouse=s", "atomtypestouse|a=s", "atomtypesweight=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabelmode=s", "fingerprintslabel=s", "fuzzifyatompairscount=s", "fuzzificationmode=s", "fuzzificationmethodology=s", "fuzzfactor=s", "help|h", "keeplargestcomponent|k=s", "mindistance=s", "maxdistance=s", "normalizationmethodology|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s")) {
731 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
732 }
733 if ($Options{workingdir}) {
734 if (! -d $Options{workingdir}) {
735 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
736 }
737 chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
738 }
739 if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
740 my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
741 die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
742 }
743 if ($Options{atompairssetsizetouse} !~ /^(ArbitrarySize|FixedSize)$/i) {
744 die "Error: The value specified, $Options{atompairssetsizetouse}, for option \"--AtomPairsSetSizeToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n";
745 }
746 if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
747 die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
748 }
749 if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
750 die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
751 }
752 if ($Options{filter} !~ /^(Yes|No)$/i) {
753 die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
754 }
755 if ($Options{fingerprintslabelmode} !~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i) {
756 die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";
757 }
758 if ($Options{fuzzifyatompairscount} !~ /^(Yes|No)$/i) {
759 die "Error: The value specified, $Options{fuzzifyatompairscount}, for option \"--FuzzifyAtomPairsCount\" is not valid. Allowed values: Yes or No\n";
760 }
761 if ($Options{fuzzificationmode} !~ /^(BeforeNormalization|AfterNormalization)$/i) {
762 die "Error: The value specified, $Options{fuzzificationmode}, for option \"--FuzzificationMode\" is not valid. Allowed values: BeforeNormalization or AfterNormalization\n";
763 }
764 if ($Options{fuzzificationmethodology} !~ /^(FuzzyBinning|FuzzyBinSmoothing)$/i) {
765 die "Error: The value specified, $Options{fuzzificationmethodology}, for option \"--FuzzificationMethodology\" is not valid. Allowed values: FuzzyBinning or FuzzyBinSmoothing\n";
766 }
767 if (!IsFloat($Options{fuzzfactor})) {
768 die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" is not valid. Allowed values: real numbers >= 0 \n";
769 }
770 if ($Options{fuzzificationmethodology} !~ /^FuzzyBinning$/i) {
771 if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 1.0)) {
772 die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinning \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 1 \n";
773 }
774 }
775 elsif ($Options{fuzzificationmethodology} !~ /^FuzzyBinSmoothing$/i) {
776 if (!($Options{fuzzfactor} >=0 && $Options{fuzzfactor} <= 0.5)) {
777 die "Error: The value specified, $Options{fuzzfactor}, for option \"--FuzzFactor\" during FuzzyBinSmoothing \"--FuzzificationMethodology\" is not valid. Allowed values: >= 0 and <= 0.5 \n";
778 }
779 }
780 if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
781 die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
782 }
783 if (!IsInteger($Options{mindistance})) {
784 die "Error: The value specified, $Options{mindistance}, for option \"--MinDistance\" is not valid. Allowed values: >= 0 \n";
785 }
786 if (!IsPositiveInteger($Options{maxdistance})) {
787 die "Error: The value specified, $Options{maxdistance}, for option \"--MaxDistance\" is not valid. Allowed values: > 0 \n";
788 }
789 if ($Options{mindistance} > $Options{maxdistance}) {
790 die "Error: The value specified, specified, $Options{mindistance}, for option \"--MinDistance\" must be less than the value specified, $Options{maxdistance}, for option \"--MaxDistance\" \n";
791 }
792 if ($Options{normalizationmethodology} !~ /^(None|ByHeavyAtomsCount|ByAtomTypesCount)$/i) {
793 die "Error: The value specified, $Options{normalizationmethodology}, for option \"--NormalizationMethodology\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByAtomTypesCount\n";
794 }
795 if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
796 die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
797 }
798 if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
799 die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
800 }
801 if ($Options{quote} !~ /^(Yes|No)$/i) {
802 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
803 }
804 if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
805 die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
806 }
807 if (!IsPositiveInteger($Options{valuesprecision})) {
808 die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";
809 }
810 if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
811 die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
812 }
813 }
814
815 __END__
816
817 =head1 NAME
818
819 TopologicalPharmacophoreAtomPairsFingerprints.pl - Generate topological pharmacophore atom pairs fingerprints for SD files
820
821 =head1 SYNOPSIS
822
823 TopologicalPharmacophoreAtomPairsFingerprints.pl SDFile(s)...
824
825 TopologicalPharmacophoreAtomPairsFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
826 [B<--AtomPairsSetSizeToUse> I<ArbitrarySize | FixedSize>]
827 [B<-a, --AtomTypesToUse> I<"AtomType1, AtomType2...">]
828 [B<--AtomTypesWeight> I<"AtomType1, Weight1, AtomType2, Weight2...">]
829 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
830 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] [B<--DataFields> I<"FieldLabel1, FieldLabel2,...">]
831 [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] [B<-f, --Filter> I<Yes | No>]
832 [B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>] [B<--FingerprintsLabel> I<text>]
833 [B<--FuzzifyAtomPairsCount> I<Yes | No>] [B<--FuzzificationMode> I<BeforeNormalization | AfterNormalization>]
834 [B<--FuzzificationMethodology> I<FuzzyBinning | FuzzyBinSmoothing>] [B<--FuzzFactor> I<number>]
835 [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>] [B<--MinDistance> I<number>]
836 [B<--MaxDistance> I<number>] [B<-n, --NormalizationMethodology> I<None | ByHeavyAtomsCount | ByAtomTypesCount>]
837 [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>]
838 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<--ValuesPrecision> I<number>]
839 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
840 [B<-w, --WorkingDir> dirname] SDFile(s)...
841
842 =head1 DESCRIPTION
843
844 Generate topological pharmacophore atom pairs fingerprints [ Ref 60-62, Ref 65, Ref 68 ] for
845 I<SDFile(s)> and create appropriate SD, FP or CSV/TSV text file(s) containing fingerprints vector
846 strings corresponding to molecular fingerprints.
847
848 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
849 and I<.sd>. All other file names are ignored. All the SD files in a current directory
850 can be specified either by I<*.sdf> or the current directory name.
851
852 Based on the values specified for B<--AtomTypesToUse>, pharmacophore atom types are
853 assigned to all non-hydrogen atoms in a molecule and a distance matrix is generated.
854 A pharmacophore atom pairs basis set is initialized for all unique possible pairs within
855 B<--MinDistance> and B<--MaxDistance> range.
856
857 Let:
858
859 P = Valid pharmacophore atom type
860
861 Px = Pharmacophore atom type x
862 Py = Pharmacophore atom type y
863
864 Dmin = Minimum distance corresponding to number of bonds between
865 two atoms
866 Dmax = Maximum distance corresponding to number of bonds between
867 two atoms
868 D = Distance corresponding to number of bonds between two atoms
869
870 Px-Dn-Py = Pharmacophore atom pair ID for atom types Px and Py at
871 distance Dn
872
873 P = Number of pharmacophore atom types to consider
874 PPDn = Number of possible unique pharmacophore atom pairs at a distance Dn
875
876 PPT = Total number of possible pharmacophore atom pairs at all distances
877 between Dmin and Dmax
878
879 Then:
880
881 PPD = (P * (P - 1))/2 + P
882
883 PPT = ((Dmax - Dmin) + 1) * ((P * (P - 1))/2 + P)
884 = ((Dmax - Dmin) + 1) * PPD
885
886 So for default values of Dmin = 1, Dmax = 10 and P = 5,
887
888 PPD = (5 * (5 - 1))/2 + 5 = 15
889 PPT = ((10 - 1) + 1) * 15 = 150
890
891 The pharmacophore atom pairs basis set includes 150 values.
892
893 The atom pair IDs correspond to:
894
895 Px-Dn-Py = Pharmacophore atom pair ID for atom types Px and Py at
896 distance Dn
897
898 For example: H-D1-H, H-D2-HBA, PI-D5-PI and so on
899
900 Using distance matrix and pharmacophore atom types, occurrence of unique pharmacophore atom
901 pairs is counted. The contribution of each atom type to atom pair interaction is optionally
902 weighted by specified B<--AtomTypesWeight> before assigning its count to appropriate distance
903 bin. Based on B<--NormalizationMethodology> option, pharmacophore atom pairs count is optionally
904 normalized. Additionally, pharmacophore atom pairs count is optionally fuzzified before or after
905 the normalization controlled by values of B<--FuzzifyAtomPairsCount>, B<--FuzzificationMode>,
906 B<--FuzzificationMethodology> and B<--FuzzFactor> options.
907
908 The final pharmacophore atom pairs count along with atom pair identifiers involving all non-hydrogen
909 atoms, with optional normalization and fuzzification, constitute pharmacophore topological atom pairs
910 fingerprints of the molecule.
911
912 For I<ArbitrarySize> value of B<--AtomPairsSetSizeToUse> option, the fingerprint vector corresponds to
913 only those topological pharmacophore atom pairs which are present and have non-zero count. However,
914 for I<FixedSize> value of B<--AtomPairsSetSizeToUse> option, the fingerprint vector contains all possible
915 valid topological pharmacophore atom pairs with both zero and non-zero count values.
916
917 Example of I<SD> file containing topological pharmacophore atom pairs fingerprints string data:
918
919 ... ...
920 ... ...
921 $$$$
922 ... ...
923 ... ...
924 ... ...
925 41 44 0 0 0 0 0 0 0 0999 V2000
926 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
927 ... ...
928 2 3 1 0 0 0 0
929 ... ...
930 M END
931 > <CmpdID>
932 Cmpd1
933
934 > <TopologicalPharmacophoreAtomPairsFingerprints>
935 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
936 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
937 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
938 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D...;
939 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10 3
940 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
941
942 $$$$
943 ... ...
944 ... ...
945
946 Example of I<FP> file containing topological pharmacophore atom pairs fingerprints string data:
947
948 #
949 # Package = MayaChemTools 7.4
950 # Release Date = Oct 21, 2010
951 #
952 # TimeStamp = Fri Mar 11 15:32:48 2011
953 #
954 # FingerprintsStringType = FingerprintsVector
955 #
956 # Description = TopologicalPharmacophoreAtomPairs:ArbitrarySize:MinDistance1:MaxDistance10
957 # VectorStringFormat = IDsAndValuesString
958 # VectorValuesType = NumericalValues
959 #
960 Cmpd1 54;H-D1-H H-D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA...;18 1 2...
961 Cmpd2 61;H-D1-H H-D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA...;5 1 2 ...
962 ... ...
963 ... ..
964
965 Example of CSV I<Text> file containing topological pharmacophore atom pairs fingerprints string data:
966
967 "CompoundID","TopologicalPharmacophoreAtomPairsFingerprints"
968 "Cmpd1","FingerprintsVector;TopologicalPharmacophoreAtomPairs:Arbitrary
969 Size:MinDistance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H
970 -D1-H H-D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA H
971 BA-D2-HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4...;
972 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10 3
973 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1"
974 ... ...
975 ... ...
976
977 The current release of MayaChemTools generates the following types of topological pharmacophore
978 atom pairs fingerprints vector strings:
979
980 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
981 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
982 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
983 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
984 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
985 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
986 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
987
988 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
989 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
990 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
991 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
992 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
993 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
994
995 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
996 ance1:MaxDistance10;150;OrderedNumericalValues;IDsAndValuesString;H-D1
997 -H H-D1-HBA H-D1-HBD H-D1-NI H-D1-PI HBA-D1-HBA HBA-D1-HBD HBA-D1-NI H
998 BA-D1-PI HBD-D1-HBD HBD-D1-NI HBD-D1-PI NI-D1-NI NI-D1-PI PI-D1-PI H-D
999 2-H H-D2-HBA H-D2-HBD H-D2-NI H-D2-PI HBA-D2-HBA HBA-D2-HBD HBA-D2...;
1000 18 0 0 1 0 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3
1001 1 0 0 0 1 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0
1002 1 0 0 1 0 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0
1003
1004
1005 =head1 OPTIONS
1006
1007 =over 4
1008
1009 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
1010
1011 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
1012 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
1013 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
1014 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
1015
1016 The supported aromaticity model names along with model specific control parameters
1017 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
1018 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
1019 this file during class instantiation and makes it available to method B<DetectAromaticity>
1020 for detecting aromaticity corresponding to a specific model.
1021
1022 =item B<--AtomPairsSetSizeToUse> I<ArbitrarySize | FixedSize>
1023
1024 Atom pairs set size to use during generation of topological pharmacophore atom pairs
1025 fingerprints.
1026
1027 Possible values: I<ArbitrarySize | FixedSize>; Default value: I<ArbitrarySize>.
1028
1029 For I<ArbitrarySize> value of B<--AtomPairsSetSizeToUse> option, the fingerprint vector
1030 corresponds to only those topological pharmacophore atom pairs which are present and
1031 have non-zero count. However, for I<FixedSize> value of B<--AtomPairsSetSizeToUse>
1032 option, the fingerprint vector contains all possible valid topological pharmacophore atom
1033 pairs with both zero and non-zero count values.
1034
1035 =item B<-a, --AtomTypesToUse> I<"AtomType1,AtomType2,...">
1036
1037 Pharmacophore atom types to use during generation of topological pharmacophore
1038 atom pairs. It's a list of comma separated valid pharmacophore atom types.
1039
1040 Possible values for pharmacophore atom types are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1041 Default value [ Ref 60-62 ] : I<HBD,HBA,PI,NI,H>.
1042
1043 The pharmacophore atom types abbreviations correspond to:
1044
1045 HBD: HydrogenBondDonor
1046 HBA: HydrogenBondAcceptor
1047 PI : PositivelyIonizable
1048 NI : NegativelyIonizable
1049 Ar : Aromatic
1050 Hal : Halogen
1051 H : Hydrophobic
1052 RA : RingAtom
1053 CA : ChainAtom
1054
1055 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign pharmacophore atom
1056 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1057
1058 HydrogenBondDonor: NH, NH2, OH
1059 HydrogenBondAcceptor: N[!H], O
1060 PositivelyIonizable: +, NH2
1061 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1062
1063 =item B<--AtomTypesWeight> I<"AtomType1,Weight1,AtomType2,Weight2...">
1064
1065 Weights of specified pharmacophore atom types to use during calculation of their contribution
1066 to atom pair count. Default value: I<None>. Valid values: real numbers greater than 0. In general
1067 it's comma delimited list of valid atom type and its weight.
1068
1069 The weight values make it possible to increase the importance of a specific pharmacophore atom type
1070 in the generated fingerprints. A weight value of 0 for an atom type eliminates its contribution to
1071 atom pair count, whereas a weight value of 2 doubles its contribution.
1072
1073 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1074
1075 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1076
1077 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1078 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1079 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1080 look like Cmpd<Number>.
1081
1082 Examples for I<DataField> value of B<--CompoundIDMode>:
1083
1084 MolID
1085 ExtReg
1086
1087 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1088
1089 Compound
1090
1091 The value specified above generates compound IDs which correspond to Compound<Number>
1092 instead of default value of Cmpd<Number>.
1093
1094 =item B<--CompoundIDLabel> I<text>
1095
1096 Specify compound ID column label for CSV/TSV text file(s) used during I<CompoundID> value
1097 of B<--DataFieldsMode> option. Default value: I<CompoundID>.
1098
1099 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1100
1101 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1102 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1103 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1104 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1105
1106 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1107 Default value: I<LabelPrefix>.
1108
1109 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1110 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1111 values are replaced with sequential compound IDs.
1112
1113 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1114
1115 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1116
1117 Comma delimited list of I<SDFile(s)> data fields to extract and write to CSV/TSV text file(s) along
1118 with generated fingerprints for I<text | all> values of B<--output> option.
1119
1120 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1121
1122 Examples:
1123
1124 Extreg
1125 MolID,CompoundName
1126
1127 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1128
1129 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1130 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1131 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1132 generate a compound ID using molname line, a compound prefix, or a combination of both.
1133 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1134
1135 =item B<-f, --Filter> I<Yes | No>
1136
1137 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1138 Default value: I<Yes>.
1139
1140 By default, compound data is checked before calculating fingerprints and compounds containing
1141 atom data corresponding to non-element symbols or no atom data are ignored.
1142
1143 =item B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>
1144
1145 Specify how fingerprints label is generated in conjunction with B<--FingerprintsLabel> option value:
1146 use fingerprints label generated only by B<--FingerprintsLabel> option value or append topological
1147 atom pair count value IDs to B<--FingerprintsLabel> option value.
1148
1149 Possible values: I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>. Default value:
1150 I<FingerprintsLabelOnly>.
1151
1152 Topological atom pairs IDs appended to B<--FingerprintsLabel> value during I<FingerprintsLabelWithIDs>
1153 values of B<--FingerprintsLabelMode> correspond to atom pair count values in fingerprint vector string.
1154
1155 I<FingerprintsLabelWithIDs> value of B<--FingerprintsLabelMode> is ignored during I<ArbitrarySize> value
1156 of B<--AtomPairsSetSizeToUse> option and topological atom pairs IDs are not appended to the label.
1157
1158 =item B<--FingerprintsLabel> I<text>
1159
1160 SD data label or text file column label to use for fingerprints string in output SD or
1161 CSV/TSV text file(s) specified by B<--output>. Default value: I<TopologicalPharmacophoreAtomPairsFingerprints>.
1162
1163 =item B<--FuzzifyAtomPairsCount> I<Yes | No>
1164
1165 To fuzzify or not to fuzzify atom pairs count. Possible values: I<Yes or No>. Default value:
1166 I<No>.
1167
1168 =item B<--FuzzificationMode> I<BeforeNormalization | AfterNormalization>
1169
1170 When to fuzzify atom pairs count. Possible values: I<BeforeNormalization | AfterNormalization>.
1171 Default value: I<AfterNormalization>.
1172
1173 =item B<--FuzzificationMethodology> I<FuzzyBinning | FuzzyBinSmoothing>
1174
1175 How to fuzzify atom pairs count. Possible values: I<FuzzyBinning | FuzzyBinSmoothing>.
1176 Default value: I<FuzzyBinning>.
1177
1178 In conjunction with values for options B<--FuzzifyAtomPairsCount>, B<--FuzzificationMode> and
1179 B<--FuzzFactor>, B<--FuzzificationMethodology> option is used to fuzzify pharmacophore atom
1180 pairs count.
1181
1182 Let:
1183
1184 Px = Pharmacophore atom type x
1185 Py = Pharmacophore atom type y
1186 PPxy = Pharmacophore atom pair between atom type Px and Py
1187
1188 PPxyDn = Pharmacophore atom pairs count between atom type Px and Py
1189 at distance Dn
1190 PPxyDn-1 = Pharmacophore atom pairs count between atom type Px and Py
1191 at distance Dn - 1
1192 PPxyDn+1 = Pharmacophore atom pairs count between atom type Px and Py
1193 at distance Dn + 1
1194
1195 FF = FuzzFactor for FuzzyBinning and FuzzyBinSmoothing
1196
1197 Then:
1198
1199 For I<FuzzyBinning>:
1200
1201 PPxyDn = PPxyDn (Unchanged)
1202
1203 PPxyDn-1 = PPxyDn-1 + PPxyDn * FF
1204 PPxyDn+1 = PPxyDn+1 + PPxyDn * FF
1205
1206 For I<FuzzyBinSmoothing>:
1207
1208 PPxyDn = PPxyDn - PPxyDn * 2FF for Dmin < Dn < Dmax
1209 PPxyDn = PPxyDn - PPxyDn * FF for Dn = Dmin or Dmax
1210
1211 PPxyDn-1 = PPxyDn-1 + PPxyDn * FF
1212 PPxyDn+1 = PPxyDn+1 + PPxyDn * FF
1213
1214 In both fuzzification schemes, a value of 0 for FF implies no fuzzification of occurrence counts.
1215 A value of 1 during I<FuzzyBinning> corresponds to maximum fuzzification of occurrence counts;
1216 however, a value of 0.5 during I<FuzzyBinSmoothing> ends up completely distributing the value over
1217 the previous and next distance bins.
1218
1219 So for default value of B<--FuzzFactor> (FF) 0.15, the occurrence count of pharmacohore atom pairs
1220 at distance Dn during FuzzyBinning is left unchanged and the counts at distances Dn -1 and Dn + 1
1221 are incremented by PPxyDn * 0.15.
1222
1223 And during I<FuzzyBinSmoothing> the occurrence count at distance Dn is scaled back using a multiplicative
1224 factor of (1 - 2*0.15) and the occurrence counts at distances Dn -1 and Dn + 1 are incremented by
1225 PPxyDn * 0.15. In other words, occurrence bin count is smoothed out by distributing it over the
1226 previous and next distance value.
1227
1228 =item B<--FuzzFactor> I<number>
1229
1230 Specify by how much to fuzzify atom pairs count. Default value: I<0.15>. Valid values: For
1231 I<FuzzyBinning> value of B<--FuzzificationMethodology> option: I<between 0 and 1.0>; For
1232 I<FuzzyBinSmoothing> value of B<--FuzzificationMethodology> option: I<between 0 and 0.5>.
1233
1234 =item B<-h, --help>
1235
1236 Print this help message.
1237
1238 =item B<-k, --KeepLargestComponent> I<Yes | No>
1239
1240 Generate fingerprints for only the largest component in molecule. Possible values:
1241 I<Yes or No>. Default value: I<Yes>.
1242
1243 For molecules containing multiple connected components, fingerprints can be generated
1244 in two different ways: use all connected components or just the largest connected
1245 component. By default, all atoms except for the largest connected component are
1246 deleted before generation of fingerprints.
1247
1248 =item B<--MinDistance> I<number>
1249
1250 Minimum bond distance between atom pairs for generating topological pharmacophore atom
1251 pairs. Default value: I<1>. Valid values: positive integers including 0 and less than B<--MaxDistance>.
1252
1253 =item B<--MaxDistance> I<number>
1254
1255 Maximum bond distance between atom pairs for generating topological pharmacophore atom
1256 pairs. Default value: I<10>. Valid values: positive integers and greater than B<--MinDistance>.
1257
1258 =item B<-n, --NormalizationMethodology> I<None | ByHeavyAtomsCount | ByAtomTypesCount>
1259
1260 Normalization methodology to use for scaling the occurrence count of pharmacophore atom
1261 pairs within specified distance range. Possible values: I<None, ByHeavyAtomsCount or
1262 ByAtomTypesCount>. Default value: I<None>.
1263
1264 =item B<--OutDelim> I<comma | tab | semicolon>
1265
1266 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>
1267 Default value: I<comma>.
1268
1269 =item B<--output> I<SD | FP | text | all>
1270
1271 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1272
1273 =item B<-o, --overwrite>
1274
1275 Overwrite existing files.
1276
1277 =item B<-q, --quote> I<Yes | No>
1278
1279 Put quote around column values in output CSV/TSV text file(s). Possible values:
1280 I<Yes or No>. Default value: I<Yes>
1281
1282 =item B<-r, --root> I<RootName>
1283
1284 New file name is generated using the root: <Root>.<Ext>. Default for new file names:
1285 <SDFileName><TopologicalPharmacophoreAtomPairsFP>.<Ext>. The file type determines <Ext> value.
1286 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1287 delimited text files, respectively. This option is ignored for multiple input files.
1288
1289 =item B<--ValuesPrecision> I<number>
1290
1291 Precision of atom pairs count real values which might be generated after normalization
1292 or fuzzification. Default value: up to I<2> decimal places. Valid values: positive integers.
1293
1294 =item B<-v, --VectorStringFormat> I<ValuesString, IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1295
1296 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1297 B<--output> option. Possible values: I<ValuesString, IDsAndValuesString | IDsAndValuesPairsString
1298 | ValuesAndIDsString | ValuesAndIDsPairsString>.
1299
1300 Default value during I<FixedSize> value of B<--AtomPairsSetSizeToUse> option: I<ValuesString>. Default
1301 value during I<ArbitrarySize> value of B<--AtomPairsSetSizeToUse> option: I<IDsAndValuesString>.
1302
1303 I<ValuesString> option value is not allowed for I<ArbitrarySize> value of B<--AtomPairsSetSizeToUse>
1304 option.
1305
1306 Examples:
1307
1308 FingerprintsVector;TopologicalPharmacophoreAtomPairs:ArbitrarySize:Min
1309 Distance1:MaxDistance10;54;NumericalValues;IDsAndValuesString;H-D1-H H
1310 -D1-NI HBA-D1-NI HBD-D1-NI H-D2-H H-D2-HBA H-D2-HBD HBA-D2-HBA HBA-D2-
1311 HBD H-D3-H H-D3-HBA H-D3-HBD H-D3-NI HBA-D3-NI HBD-D3-NI H-D4-H H-D4-H
1312 BA H-D4-HBD HBA-D4-HBA HBA-D4-HBD HBD-D4-HBD H-D5-H H-D5-HBA H-D5-...;
1313 18 1 2 1 22 12 8 1 2 18 6 3 1 1 1 22 13 6 5 7 2 28 9 5 1 1 1 36 16 10
1314 3 4 1 37 10 8 1 35 10 9 3 3 1 28 7 7 4 18 16 12 5 1 2 1
1315
1316 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
1317 ance1:MaxDistance10;150;OrderedNumericalValues;ValuesString;18 0 0 1 0
1318 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3 1 0 0 0 1
1319 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0 1 0 0 1 0
1320 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0 0 37 10 8 0 0 0 0 1 0 0 0 0 0 0
1321 0 35 10 9 0 0 3 3 0 0 1 0 0 0 0 0 28 7 7 4 0 0 0 0 0 0 0 0 0 0 0 18...
1322
1323 FingerprintsVector;TopologicalPharmacophoreAtomPairs:FixedSize:MinDist
1324 ance1:MaxDistance10;150;OrderedNumericalValues;IDsAndValuesString;H-D1
1325 -H H-D1-HBA H-D1-HBD H-D1-NI H-D1-PI HBA-D1-HBA HBA-D1-HBD HBA-D1-NI H
1326 BA-D1-PI HBD-D1-HBD HBD-D1-NI HBD-D1-PI NI-D1-NI NI-D1-PI PI-D1-PI H-D
1327 2-H H-D2-HBA H-D2-HBD H-D2-NI H-D2-PI HBA-D2-HBA HBA-D2-HBD HBA-D2...;
1328 18 0 0 1 0 0 0 2 0 0 1 0 0 0 0 22 12 8 0 0 1 2 0 0 0 0 0 0 0 0 18 6 3
1329 1 0 0 0 1 0 0 1 0 0 0 0 22 13 6 0 0 5 7 0 0 2 0 0 0 0 0 28 9 5 1 0 0 0
1330 1 0 0 1 0 0 0 0 36 16 10 0 0 3 4 0 0 1 0 0 0 0
1331
1332 =item B<-w, --WorkingDir> I<DirName>
1333
1334 Location of working directory. Default value: current directory.
1335
1336 =back
1337
1338 =head1 EXAMPLES
1339
1340 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to distances
1341 from 1 through 10 using default atom types with no weighting, normalization, and fuzzification
1342 of atom pairs count and create a SampleTPAPFP.csv file containing sequential compound IDs along
1343 with fingerprints vector strings data in IDsAndValuesString format, type:
1344
1345 % TopologicalPharmacophoreAtomPairsFingerprints.pl -r SampleTPAPFP
1346 -o Sample.sdf
1347
1348 To generate topological pharmacophore atom pairs fingerprints of fixed size corresponding to distances
1349 from 1 through 10 using default atom types with no weighting, normalization, and fuzzification
1350 of atom pairs count and create a SampleTPAPFP.csv file containing sequential compound IDs along
1351 with fingerprints vector strings data in ValuesString format, type:
1352
1353 % TopologicalPharmacophoreAtomPairsFingerprints.pl
1354 --AtomPairsSetSizeToUse FixedSize -r SampleTPAPFP -o Sample.sdf
1355
1356 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to distances
1357 from 1 through 10 using default atom types with no weighting, normalization, and fuzzification
1358 of atom pairs count and create SampleTPAPFP.sdf, SampleTPAPFP.fpf and SampleTPAPFP.csv files containing
1359 sequential compound IDs in CSV file along with fingerprints vector strings data in IDsAndValuesString
1360 format, type:
1361
1362 % TopologicalPharmacophoreAtomPairsFingerprints.pl --output all
1363 -r SampleTPAPFP -o Sample.sdf
1364
1365 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to distances
1366 from 1 through 10 using default atom types with no weighting, normalization, and fuzzification
1367 of atom pairs count and create a SampleTPAPFP.csv file containing sequential compound IDs along
1368 with fingerprints vector strings data in IDsAndValuesPairsString format, type:
1369
1370 % TopologicalPharmacophoreAtomPairsFingerprints.pl --VectorStringFormat
1371 IDsAndValuesPairsString -r SampleTPAPFP -o Sample.sdf
1372
1373 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to distances
1374 from 1 through 6 using default atom types with no weighting, normalization, and fuzzification
1375 of atom pairs count and create a SampleTPAPFP.csv file containing sequential compound IDs along
1376 with fingerprints vector strings data in IDsAndValuesString format, type:
1377
1378 % TopologicalPharmacophoreAtomPairsFingerprints.pl --MinDistance 1
1379 -MaxDistance 6 -r SampleTPAPFP -o Sample.sdf
1380
1381 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to distances
1382 from 1 through 10 using "HBD,HBA,PI,NI" atom types with double the weighting for "HBD,HBA" and
1383 normalization by HeavyAtomCount but no fuzzification of atom pairs count and create a
1384 SampleTPAPFP.csv file containing sequential compound IDs along with fingerprints vector strings
1385 data in IDsAndValuesString format, type:
1386
1387 % TopologicalPharmacophoreAtomPairsFingerprints.pl --MinDistance 1
1388 -MaxDistance 10 --AtomTypesToUse "HBD,HBA,PI, NI" --AtomTypesWeight
1389 "HBD,2,HBA,2,PI,1,NI,1" --NormalizationMethodology ByHeavyAtomsCount
1390 --FuzzifyAtomPairsCount No -r SampleTPAPFP -o Sample.sdf
1391
1392 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to
1393 distances from 1 through 10 using "HBD,HBA,PI,NI,H" atom types with no weighting of atom types and
1394 normalization but with fuzzification of atom pairs count using FuzzyBinning methodology
1395 with FuzzFactor value 0.5 and create a SampleTPAPFP.csv file containing sequential compound
1396 IDs along with fingerprints vector strings data in IDsAndValuesString format, type:
1397
1398 % TopologicalPharmacophoreAtomPairsFingerprints.pl --MinDistance 1
1399 --MaxDistance 10 --AtomTypesToUse "HBD,HBA,PI,NI,H" --AtomTypesWeight
1400 "HBD,1,HBA,1,PI,1,NI,1,H,1" --NormalizationMethodology None
1401 --FuzzifyAtomPairsCount Yes --FuzzificationMethodology FuzzyBinning
1402 --FuzzFactor 0.5 -r SampleTPAPFP -o Sample.sdf
1403
1404 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding to
1405 distances from 1 through 10 using default atom types with no weighting,
1406 normalization, and fuzzification of atom pairs count and create a SampleTPAPFP.csv
1407 file containing compound ID from molecule name line along with fingerprints vector strings
1408 data, type:
1409
1410 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1411 CompoundID --CompoundIDMode MolName -r SampleTPAPFP -o Sample.sdf
1412
1413 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding
1414 to distances from 1 through 10 using default atom types with no weighting,
1415 normalization, and fuzzification of atom pairs count and create a SampleTPAPFP.csv
1416 file containing compound IDs using specified data field along with fingerprints vector strings
1417 data, type:
1418
1419 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1420 CompoundID --CompoundIDMode DataField --CompoundID Mol_ID
1421 -r SampleTPAPFP -o Sample.sdf
1422
1423 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding
1424 to distances from 1 through 10 using default atom types with no weighting,
1425 normalization, and fuzzification of atom pairs count and create a SampleTPAPFP.csv
1426 file containing compound ID using combination of molecule name line and an explicit compound
1427 prefix along with fingerprints vector strings data, type:
1428
1429 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1430 CompoundID --CompoundIDMode MolnameOrLabelPrefix
1431 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleTPAPFP -o Sample.sdf
1432
1433 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding
1434 to distances from 1 through 10 using default atom types with no weighting,
1435 normalization, and fuzzification of atom pairs count and create a SampleTPAPFP.csv
1436 file containing specific data fields columns along with fingerprints vector strings
1437 data, type:
1438
1439 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1440 Specify --DataFields Mol_ID -r SampleTPAPFP -o Sample.sdf
1441
1442 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding
1443 to distances from 1 through 10 using default atom types with no weighting,
1444 normalization, and fuzzification of atom pairs count and create a SampleTPAPFP.csv
1445 file containing common data fields columns along with fingerprints vector strings
1446 data, type:
1447
1448 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1449 Common -r SampleTPAPFP -o Sample.sdf
1450
1451 To generate topological pharmacophore atom pairs fingerprints of arbitrary size corresponding
1452 to distances from 1 through 10 using default atom types with no weighting,
1453 normalization, and fuzzification of atom pairs count and create SampleTPAPFP.sdf, SampleTPAPFP.fpf,
1454 and SampleTPAPFP.csv files containing all data fields columns in CSV file along with fingerprints
1455 data, type:
1456
1457 % TopologicalPharmacophoreAtomPairsFingerprints.pl --DataFieldsMode
1458 All --output all -r SampleTPAPFP -o Sample.sdf
1459
1460
1461 =head1 AUTHOR
1462
1463 Manish Sud <msud@san.rr.com>
1464
1465 =head1 SEE ALSO
1466
1467 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1468 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
1469 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1470 TopologicalPharmacophoreAtomTripletsFingerprints.pl
1471
1472 =head1 COPYRIGHT
1473
1474 Copyright (C) 2015 Manish Sud. All rights reserved.
1475
1476 This file is part of MayaChemTools.
1477
1478 MayaChemTools is free software; you can redistribute it and/or modify it under
1479 the terms of the GNU Lesser General Public License as published by the Free
1480 Software Foundation; either version 3 of the License, or (at your option)
1481 any later version.
1482
1483 =cut