comparison mayachemtool/mayachemtools/bin/PathLengthFingerprints.pl @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:68300206e90d
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: PathLengthFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.50 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::AtomicInvariantsAtomTypes;
43 use AtomTypes::FunctionalClassAtomTypes;
44 use Fingerprints::PathLengthFingerprints;
45
# Script-wide state. The option hashes, the list of input files and the
# per-file information are file-scoped lexicals shared with the subroutines
# defined further down in this file...
my($ScriptName, %Options, %OptionsInfo, @SDFilesList, %SDFilesInfo);
my($StartTime, $EndTime, $TotalTime, $FileIndex);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new();

# Retrieve command line options; show usage on request or when no input
# file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Validate and store option values...
print "Processing options...\n";
ProcessOptions();

# Collect information about input files and names of output files...
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Generate fingerprints for each input file which passed the checks...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);
FILE: for $FileIndex (0 .. $#SDFilesList) {
    next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GeneratePathLengthFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new();
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91
92 ###############################################################################
93
# Generate path length fingerprints for all compounds in the input SD file
# at the specified index and write them out to the requested output file(s).
# A summary of processed and ignored compounds is printed at the end...
#
sub GeneratePathLengthFingerprints {
    my($FileIndex) = @_;

    my $InputSDFile = $SDFilesList[$FileIndex];

    # Output writers; a writer stays undefined when its output type is not
    # requested...
    my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

    my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
    $Reader->Open();

    my $TotalCount = 0;
    my $SkippedCount = 0;

    COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
        $TotalCount++;

        # Filter compound data before calculating fingerprints...
        if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
            $SkippedCount++;
            next COMPOUND;
        }

        my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
        unless ($Fingerprints) {
            $SkippedCount++;
            ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
            next COMPOUND;
        }

        WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
    }
    $Reader->Close();

    # Close whichever output files were opened...
    for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
        $Writer->Close() if $Writer;
    }

    WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}
146
# Emit a warning for a compound being ignored due to problems during
# fingerprints generation. The mode selects the reason reported; any
# unrecognized mode is reported as a fingerprints generation failure...
#
sub ProcessIgnoredCompound {
    my($Mode, $CmpdCount, $Molecule) = @_;

    my $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
    my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);

    if ($Mode =~ /^ContainsNonElementalData$/i) {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
    }
    elsif ($Mode =~ /^ContainsNoElementalData$/i) {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
    }
    elsif ($Mode =~ /^FingerprintsGenerationFailed$/i) {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
    }
    else {
        warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
    }
}
174
# Check whether a compound must be filtered out before fingerprints
# generation. Returns 1, after reporting the compound as ignored, when it
# contains non-elemental atom data or no elemental atom data at all;
# otherwise returns 0...
#
sub CheckAndFilterCompound {
    my($CmpdCount, $Molecule) = @_;

    my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

    # Non-elemental data takes precedence over missing elemental data...
    my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
                : !$NumOfElements   ? 'ContainsNoElementalData'
                :                     '';

    return 0 unless $Problem;

    ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
    return 1;
}
195
# Print a summary of fingerprints generation: total number of compounds
# read, number processed successfully, and number ignored...
#
sub WriteFingerprintsGenerationSummaryStatistics {
    my($CmpdCount, $IgnoredCmpdCount) = @_;

    my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

    print "\nNumber of compounds: $CmpdCount\n",
        "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
        "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208
# Create and open fingerprints output file objects for the input file at the
# specified index. Returns a list of SD, text and FP file IO objects in that
# order; an entry is undefined when the corresponding output isn't requested...
#
sub SetupAndOpenOutputFiles {
    my($FileIndex) = @_;
    my($SDWriter, $TextWriter, $FPWriter);

    # Parameters common to all fingerprints file IO objects; these depend on
    # the type of fingerprints being generated...
    my %CommonParams = ();
    if ($OptionsInfo{Mode} =~ /^PathLengthBits$/i) {
        %CommonParams = (
            'Mode' => 'Write',
            'Overwrite' => $OptionsInfo{OverwriteFiles},
            'FingerprintsStringMode' => 'FingerprintsBitVectorString',
            'BitStringFormat' => $OptionsInfo{BitStringFormat},
            'BitsOrder' => $OptionsInfo{BitsOrder},
        );
    }
    elsif ($OptionsInfo{Mode} =~ /^PathLengthCount$/i) {
        %CommonParams = (
            'Mode' => 'Write',
            'Overwrite' => $OptionsInfo{OverwriteFiles},
            'FingerprintsStringMode' => 'FingerprintsVectorString',
            'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
        );
    }

    if ($OptionsInfo{SDOutput}) {
        my $NewFPSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
        print "Generating SD file $NewFPSDFile...\n";
        $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $NewFPSDFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
        $SDWriter->Open();
    }

    if ($OptionsInfo{FPOutput}) {
        my $NewFPFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
        print "Generating FP file $NewFPFile...\n";
        $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $NewFPFile, %CommonParams);
        $FPWriter->Open();
    }

    if ($OptionsInfo{TextOutput}) {
        my $NewFPTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
        my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

        print "Generating text file $NewFPTextFile...\n";
        $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $NewFPTextFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
        $TextWriter->Open();
    }

    return ($SDWriter, $TextWriter, $FPWriter);
}
254
# Write fingerprints and other compound data to each output file for which
# a file IO object is supplied...
#
sub WriteDataToOutputFiles {
    my($FileIndex, $CmpdCount, $Molecule, $PathLengthFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

    # Data field values are needed only by text and FP outputs...
    my $DataFieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

    # SD output: fingerprints are written along with the input compound string...
    if ($NewFPSDFileIO) {
        my $MoleculeString = $Molecule->GetInputMoleculeString();
        $NewFPSDFileIO->WriteFingerprints($PathLengthFingerprints, $MoleculeString);
    }

    # Text output: fingerprints are written along with data column values...
    if ($NewFPTextFileIO) {
        my $ValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
        $NewFPTextFileIO->WriteFingerprints($PathLengthFingerprints, $ValuesRef);
    }

    # FP output: fingerprints are written along with compound ID...
    if ($NewFPFileIO) {
        my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
        $NewFPFileIO->WriteFingerprints($PathLengthFingerprints, $ID);
    }
}
287
# Generate appropriate column labels for FPText output file. Data column
# labels are selected by the data fields mode and are always followed by the
# fingerprints label. Returns a reference to the list of labels...
#
sub SetupFPTextFileCoulmnLabels {
    my($FileIndex) = @_;

    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

    my @Labels = $DataFieldsMode =~ /^All$/i        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
               : $DataFieldsMode =~ /^Common$/i     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
               : $DataFieldsMode =~ /^Specify$/i    ? @{$OptionsInfo{SpecifiedDataFields}}
               : $DataFieldsMode =~ /^CompoundID$/i ? ($OptionsInfo{CompoundIDLabel})
               :                                      ();

    # Add fingerprints label...
    push @Labels, $OptionsInfo{FingerprintsLabel};

    return \@Labels;
}
312
# Generate data column values for a compound in FPText output file. In
# CompoundID mode the only value is the compound ID; in other modes, one
# value is retrieved for each data field label with an empty string used
# for labels missing from the compound. Returns a reference to the list of
# values...
#
sub SetupFPTextFileCoulmnValues {
    my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
    my @Labels;

    if ($DataFieldsMode =~ /^CompoundID$/i) {
        return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
    }
    elsif ($DataFieldsMode =~ /^All$/i) {
        @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
    }
    elsif ($DataFieldsMode =~ /^Common$/i) {
        @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
    }
    elsif ($DataFieldsMode =~ /^Specify$/i) {
        @Labels = @{$OptionsInfo{SpecifiedDataFields}};
    }
    else {
        return [];
    }

    my @Values = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @Labels;

    return \@Values;
}
335
# Generate compound ID for FP and FPText output files according to the
# compound ID mode: molecule name, label prefix followed by compound count,
# or value of a specified data field. Returns an empty string for an
# unrecognized mode or a missing data field...
#
sub SetupCmpdIDForOutputFiles {
    my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $IDMode = $OptionsInfo{CompoundIDMode};

    if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
        # Fall back to label prefix when molecule name is not available...
        my $MolName = $Molecule->GetName();
        return $MolName ? $MolName : "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^LabelPrefix$/i) {
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^DataField$/i) {
        my $FieldLabel = $OptionsInfo{CompoundID};
        return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
    }

    if ($IDMode =~ /^MolName$/i) {
        return scalar $Molecule->GetName();
    }

    return '';
}
361
# Prepare a molecule and generate its fingerprints for the specified mode.
# Returns fingerprints object; undef is returned when ring detection or
# fingerprints generation doesn't succeed...
#
sub GenerateMoleculeFingerprints {
    my($Molecule) = @_;

    # Optional molecule clean up before fingerprints generation...
    $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};
    $Molecule->DeleteHydrogens() if $OptionsInfo{IgnoreHydrogens};

    if ($OptionsInfo{DetectAromaticity}) {
        return undef unless $Molecule->DetectRings();

        $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
        $Molecule->DetectAromaticity();
    }

    # Select fingerprints generator for the mode...
    my $Mode = $OptionsInfo{Mode};
    my $Generator = $Mode =~ /^PathLengthBits$/i  ? \&GeneratePathLengthBitsFingerprints
                  : $Mode =~ /^PathLengthCount$/i ? \&GeneratePathLengthCountFingerprints
                  :                                 undef;

    if (!defined $Generator) {
        die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: PathLengthBits or PathLengthCount\n";
    }

    my $Fingerprints = $Generator->($Molecule);

    return $Fingerprints;
}
396
# Generate path length bits fingerprints for a molecule and, when requested,
# fold them to the specified size. Returns fingerprints object or undef
# when fingerprints generation doesn't succeed...
#
sub GeneratePathLengthBitsFingerprints {
    my($Molecule) = @_;

    my @Params = (
        'Molecule' => $Molecule,
        'Type' => 'PathLengthBits',
        'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType},
        'NumOfBitsToSetPerPath' => $OptionsInfo{NumOfBitsToSetPerPath},
        'Size' => $OptionsInfo{Size},
        'MinLength' => $OptionsInfo{MinPathLength},
        'MaxLength' => $OptionsInfo{MaxPathLength},
        'AllowRings' => $OptionsInfo{AllowRings},
        'AllowSharedBonds' => $OptionsInfo{AllowSharedBonds},
        'UseBondSymbols' => $OptionsInfo{UseBondSymbols},
        'UseUniquePaths' => $OptionsInfo{UseUniquePaths},
        'UsePerlCoreRandom' => $OptionsInfo{UsePerlCoreRandom},
    );
    my $Fingerprints = Fingerprints::PathLengthFingerprints->new(@Params);

    # Set atom identifier type...
    SetAtomIdentifierTypeValuesToUse($Fingerprints);

    # Generate fingerprints and make sure generation is successful...
    $Fingerprints->GenerateFingerprints();
    return undef unless $Fingerprints->IsFingerprintsGenerationSuccessful();

    if ($OptionsInfo{Fold}) {
        # Folded size is passed along with a zero value for size check flag...
        my $CheckSizeValue = 0;
        $Fingerprints->FoldFingerprintsBySize($OptionsInfo{FoldedSize}, $CheckSizeValue);
    }

    return $Fingerprints;
}
423
# Generate path length count fingerprints for a molecule. Returns
# fingerprints object or undef when fingerprints generation doesn't succeed...
#
sub GeneratePathLengthCountFingerprints {
    my($Molecule) = @_;

    my @Params = (
        'Molecule' => $Molecule,
        'Type' => 'PathLengthCount',
        'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType},
        'MinLength' => $OptionsInfo{MinPathLength},
        'MaxLength' => $OptionsInfo{MaxPathLength},
        'AllowRings' => $OptionsInfo{AllowRings},
        'AllowSharedBonds' => $OptionsInfo{AllowSharedBonds},
        'UseBondSymbols' => $OptionsInfo{UseBondSymbols},
        'UseUniquePaths' => $OptionsInfo{UseUniquePaths},
    );
    my $Fingerprints = Fingerprints::PathLengthFingerprints->new(@Params);

    # Set atom identifier type...
    SetAtomIdentifierTypeValuesToUse($Fingerprints);

    # Generate fingerprints and make sure generation is successful...
    $Fingerprints->GenerateFingerprints();
    return undef unless $Fingerprints->IsFingerprintsGenerationSuccessful();

    return $Fingerprints;
}
444
# Pass the values associated with the atom identifier type on to the
# fingerprints object. Only atomic invariants and functional class atom
# types carry additional values; an unsupported type is a fatal error...
#
sub SetAtomIdentifierTypeValuesToUse {
    my($PathLengthFingerprints) = @_;

    my $Type = $OptionsInfo{AtomIdentifierType};
    my $TypesWithoutValues = qr/^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i;

    if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
        $PathLengthFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
    }
    elsif ($Type =~ /^FunctionalClassAtomTypes$/i) {
        $PathLengthFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
    }
    elsif ($Type =~ $TypesWithoutValues) {
        # Nothing to do for now...
    }
    else {
        die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
    }
}
463
# Retrieve information about SD files...
#
# For each input file in @SDFilesList, check that it exists and is a SD file,
# optionally verify or collect its data field labels, and set up the names of
# SD, FP and text output files. Results are stored in %SDFilesInfo as arrays
# indexed by file position:
#
#   FileOkay - 1 when the file passed all checks; 0 otherwise
#   OutFileRoot, SDOutFileNames, FPOutFileNames, TextOutFileNames - output names
#   AllDataFieldsRef, CommonDataFieldsRef - data field labels; collected only
#     for text output in All or Common data fields mode
#
# A file failing a check is reported using a warning and skipped. Failure to
# open an input file for reading its data fields is a fatal error.
#
sub RetrieveSDFilesInfo {
    my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

    %SDFilesInfo = ();
    @{$SDFilesInfo{FileOkay}} = ();
    @{$SDFilesInfo{OutFileRoot}} = ();
    @{$SDFilesInfo{SDOutFileNames}} = ();
    @{$SDFilesInfo{FPOutFileNames}} = ();
    @{$SDFilesInfo{TextOutFileNames}} = ();
    @{$SDFilesInfo{AllDataFieldsRef}} = ();
    @{$SDFilesInfo{CommonDataFieldsRef}} = ();

    # Data field used as compound ID must be checked only for text output in
    # CompoundID data fields mode with DataField compound ID mode...
    $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
    $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

    FILELIST: for $Index (0 .. $#SDFilesList) {
        $SDFile = $SDFilesList[$Index];

        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{OutFileRoot}[$Index] = '';
        $SDFilesInfo{SDOutFileNames}[$Index] = '';
        $SDFilesInfo{FPOutFileNames}[$Index] = '';
        $SDFilesInfo{TextOutFileNames}[$Index] = '';

        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sd sdf")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        if ($CheckDataField) {
            # Make sure data field exists in the first compound of SD file..
            my($CmpdString, $SpecifiedDataField, $SDFileHandle, @CmpdLines, %DataFieldValues);

            # Three-argument open keeps special characters in the file name
            # from being interpreted as an open mode...
            open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            $CmpdString = ReadCmpdString($SDFileHandle);
            close $SDFileHandle;

            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
            $SpecifiedDataField = $OptionsInfo{CompoundID};
            if (!exists $DataFieldValues{$SpecifiedDataField}) {
                warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
                next FILELIST;
            }
        }

        $AllDataFieldsRef = '';
        $CommonDataFieldsRef = '';
        if ($CollectDataFields) {
            my($CmpdCount, $SDFileHandle);

            open $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
            close $SDFileHandle;
        }

        # Setup output file names...
        $FileDir = ""; $FileName = ""; $FileExt = "";
        ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

        $TextOutFileExt = "csv";
        if ($Options{outdelim} =~ /^tab$/i) {
            $TextOutFileExt = "tsv";
        }
        $SDOutFileExt = $FileExt;
        $FPOutFileExt = "fpf";

        # A specified root is used only for a single input file; otherwise, the
        # root is derived from the input file name...
        if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
            my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
            if ($RootFileName && $RootFileExt) {
                $FileName = $RootFileName;
            }
            else {
                $FileName = $OptionsInfo{OutFileRoot};
            }
            $OutFileRoot = $FileName;
        }
        else {
            $OutFileRoot = "${FileName}PathLengthFP";
        }

        $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
        $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
        $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

        if ($OptionsInfo{SDOutput}) {
            # Quote the output file name so that characters such as "." or "+"
            # in it are matched literally rather than as a regular expression...
            if ($SDFile =~ /\Q$NewSDFileName\E/i) {
                warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
                print "Specify a different name using \"-r --root\" option or use default name.\n";
                next FILELIST;
            }
        }

        if (!$OptionsInfo{OverwriteFiles}) {
            # Check SD, FP and text output files...
            if ($OptionsInfo{SDOutput}) {
                if (-e $NewSDFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{FPOutput}) {
                if (-e $NewFPFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{TextOutput}) {
                if (-e $NewTextFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
                    next FILELIST;
                }
            }
        }

        $SDFilesInfo{FileOkay}[$Index] = 1;

        $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
        $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
        $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
        $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

        $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
        $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
    }
}
596
# Process option values...
#
# Validate the values in %Options and store them in %OptionsInfo under the
# keys used by the rest of the script. Yes/No style options are converted to
# 1/0 flags. An invalid value is a fatal error.
#
sub ProcessOptions {
    %OptionsInfo = ();

    $OptionsInfo{Mode} = $Options{mode};
    $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};
    $OptionsInfo{PathMode} = $Options{pathmode};

    ProcessAtomIdentifierTypeOptions();

    $OptionsInfo{BitsOrder} = $Options{bitsorder};
    $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

    $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
    $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
    $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

    my(@SpecifiedDataFields);
    @SpecifiedDataFields = ();

    @{$OptionsInfo{SpecifiedDataFields}} = ();
    $OptionsInfo{CompoundID} = '';

    if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
        if ($Options{compoundidmode} =~ /^DataField$/i) {
            if (!$Options{compoundid}) {
                die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
            }
            $OptionsInfo{CompoundID} = $Options{compoundid};
        }
        elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
            # Default label prefix for compound IDs...
            $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
        }
    }
    elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
        if (!$Options{datafields}) {
            die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
        }
        @SpecifiedDataFields = split /\,/, $Options{datafields};
        push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
    }

    # Aromaticity detection may be turned off only for atomic invariants atom types...
    if ($Options{atomidentifiertype} !~ /^AtomicInvariantsAtomTypes$/i) {
        if ($Options{detectaromaticity} =~ /^No$/i) {
            die "Error: The value specified, $Options{detectaromaticity}, for option \"--DetectAromaticity\" is not valid. No value is only allowed during AtomicInvariantsAtomTypes value for \"-a, --AtomIdentifierType\" \n";
        }
    }
    $OptionsInfo{DetectAromaticity} = ($Options{detectaromaticity} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'PathLengthFingerprints';

    # Size of fingerprints bit-string must be a power of 2 in the allowed range...
    my($Size, $MinSize, $MaxSize);
    $MinSize = 32;
    $MaxSize = 2**32;
    $Size = $Options{size};
    if (!(IsPositiveInteger($Size) && $Size >= $MinSize && $Size <= $MaxSize && IsNumberPowerOfNumber($Size, 2))) {
        die "Error: Invalid size value, $Size, for \"-s, --size\" option. Allowed values: power of 2, >= minimum size of $MinSize, and <= maximum size of $MaxSize.\n";
    }
    $OptionsInfo{Size} = $Size;

    # Folded size is validated only when folding is requested...
    $OptionsInfo{Fold} = ($Options{fold} =~ /^Yes$/i) ? 1 : 0;
    my($FoldedSize);
    $FoldedSize = $Options{foldedsize};
    if ($Options{fold} =~ /^Yes$/i) {
        if (!(IsPositiveInteger($FoldedSize) && $FoldedSize < $Size && IsNumberPowerOfNumber($FoldedSize, 2))) {
            die "Error: Invalid folded size value, $FoldedSize, for \"--FoldedSize\" option. Allowed values: power of 2, >= minimum size of $MinSize, and < size value of $Size.\n";
        }
    }
    $OptionsInfo{FoldedSize} = $FoldedSize;

    $OptionsInfo{IgnoreHydrogens} = ($Options{ignorehydrogens} =~ /^Yes$/i) ? 1 : 0;
    $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

    my($MinPathLength, $MaxPathLength);
    $MinPathLength = $Options{minpathlength};
    $MaxPathLength = $Options{maxpathlength};
    if (!IsPositiveInteger($MinPathLength)) {
        die "Error: Invalid path length value, $MinPathLength, for \"--MinPathLength\" option. Allowed values: > 0\n";
    }
    if (!IsPositiveInteger($MaxPathLength)) {
        # Report the option which actually supplied the invalid value...
        die "Error: Invalid path length value, $MaxPathLength, for \"--MaxPathLength\" option. Allowed values: > 0\n";
    }
    if ($MinPathLength >= $MaxPathLength) {
        die "Error: Invalid minimum and maximum path length values, $MinPathLength and $MaxPathLength, for \"--MinPathLength\" and \"--MaxPathLength\"options. Allowed values: minimum path length value must be smaller than maximum path length value.\n";
    }
    $OptionsInfo{MinPathLength} = $MinPathLength;
    $OptionsInfo{MaxPathLength} = $MaxPathLength;

    my($NumOfBitsToSetPerPath);
    $NumOfBitsToSetPerPath = $Options{numofbitstosetperpath};
    # Validate the number of bits itself; zero and negative values are not allowed...
    if (!IsPositiveInteger($NumOfBitsToSetPerPath)) {
        die "Error: Invalid value, $NumOfBitsToSetPerPath, for \"-n, --NumOfBitsToSetPerPath\" option. Allowed values: > 0\n";
    }
    if ($NumOfBitsToSetPerPath >= $Size) {
        die "Error: Invalid value, $NumOfBitsToSetPerPath, for \"-n, --NumOfBitsToSetPerPath\" option. Allowed values: It must be less than the size, $Size, of the fingerprint bit-string.\n";
    }
    $OptionsInfo{NumOfBitsToSetPerPath} = $NumOfBitsToSetPerPath;

    # Output type flags; All turns on SD, FP and text output...
    $OptionsInfo{Output} = $Options{output};
    $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
    $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
    $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

    $OptionsInfo{OutDelim} = $Options{outdelim};
    $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
    $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

    $OptionsInfo{UseBondSymbols} = ($Options{usebondsymbols} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{UsePerlCoreRandom} = ($Options{useperlcorerandom} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{UseUniquePaths} = ($Options{useuniquepaths} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};

    # Setup parameters used during generation of fingerprints by PathLengthFingerprints class.
    # Path mode is translated into two flags: whether paths may contain rings
    # and whether paths may share bonds...
    my($AllowRings, $AllowSharedBonds);
    $AllowRings = 1;
    $AllowSharedBonds = 1;
    MODE: {
        if ($Options{pathmode} =~ /^AtomPathsWithoutRings$/i) { $AllowSharedBonds = 0; $AllowRings = 0; last MODE;}
        if ($Options{pathmode} =~ /^AtomPathsWithRings$/i) { $AllowSharedBonds = 0; $AllowRings = 1; last MODE;}
        if ($Options{pathmode} =~ /^AllAtomPathsWithoutRings$/i) { $AllowSharedBonds = 1; $AllowRings = 0; last MODE;}
        if ($Options{pathmode} =~ /^AllAtomPathsWithRings$/i) { $AllowSharedBonds = 1; $AllowRings = 1; last MODE;}
        die "Error: ProcessOptions: mode value, $Options{pathmode}, is not supported.\n";
    }
    $OptionsInfo{AllowRings} = $AllowRings;
    $OptionsInfo{AllowSharedBonds} = $AllowSharedBonds;
}
730
# Store atom identifier type and process the option values associated with
# it. Only atomic invariants and functional class atom types have additional
# options; an unsupported type is a fatal error...
#
sub ProcessAtomIdentifierTypeOptions {
    my $Type = $Options{atomidentifiertype};
    my $TypesWithoutOptions = qr/^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i;

    $OptionsInfo{AtomIdentifierType} = $Type;

    if ($Type =~ /^AtomicInvariantsAtomTypes$/i) {
        ProcessAtomicInvariantsToUseOption();
    }
    elsif ($Type =~ /^FunctionalClassAtomTypes$/i) {
        ProcessFunctionalClassesToUse();
    }
    elsif ($Type =~ $TypesWithoutOptions) {
        # Nothing to do for now...
    }
    else {
        die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
    }
}
750
# Validate the comma delimited list of atomic invariants and store it as a
# list in options information. Each invariant must be available and the
# list must include atom symbol, AS or AtomSymbol; otherwise, it's a fatal
# error...
#
sub ProcessAtomicInvariantsToUseOption {
    my $Specification = $Options{atomicinvariantstouse};

    @{$OptionsInfo{AtomicInvariantsToUse}} = ();
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n" if IsEmpty($Specification);

    my $SeenAtomSymbol = 0;
    for my $AtomicInvariant (split /\,/, $Specification) {
        unless (AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
            die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n ";
        }
        $SeenAtomSymbol = 1 if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i);

        push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
    }

    unless ($SeenAtomSymbol) {
        die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n ";
    }
}
775
# Validate the comma delimited list of functional classes and store it as a
# list in options information. An empty list or an unavailable functional
# class is a fatal error...
#
sub ProcessFunctionalClassesToUse {
    my $Specification = $Options{functionalclassestouse};

    @{$OptionsInfo{FunctionalClassesToUse}} = ();
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n" if IsEmpty($Specification);

    for my $FunctionalClass (split /\,/, $Specification) {
        unless (AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
            die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n ";
        }
        push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
    }
}
793
# Setup script usage and retrieve command line arguments specified using various options...
#
# Processing steps:
#
#   . Assign default values to the file-level %Options hash
#   . Override the defaults with values retrieved from the command line by GetOptions
#   . Change to the directory specified using "-w, --WorkingDir" option, if any
#   . Validate option values which are restricted to a set of keywords or to positive
#     integers; the script dies with an error message at the first invalid value
#
# Options without a default value (compoundid, datafields, fingerprintslabel, help,
# overwrite, root and workingdir) are only present in %Options when specified on the
# command line. No parameters are used and no meaningful value is returned.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS';

  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{bitsorder} = 'Ascending';
  $Options{bitstringformat} = 'HexadecimalString';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';
  $Options{detectaromaticity} = 'Yes';

  $Options{filter} = 'Yes';

  $Options{fold} = 'No';
  $Options{foldedsize} = 256;

  $Options{ignorehydrogens} = 'Yes';
  $Options{keeplargestcomponent} = 'Yes';

  $Options{mode} = 'PathLengthBits';
  $Options{pathmode} = 'AllAtomPathsWithRings';

  $Options{minpathlength} = 1;
  $Options{maxpathlength} = 8;

  $Options{numofbitstosetperpath} = 1;

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{size} = 1024;

  $Options{usebondsymbols} = 'yes';
  $Options{useperlcorerandom} = 'yes';
  $Options{useuniquepaths} = 'yes';

  $Options{vectorstringformat} = 'IDsAndValuesString';

  # Values specified on the command line replace the defaults assigned above...
  if (!GetOptions(\%Options,
      "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s",
      "functionalclassestouse=s", "bitsorder=s", "bitstringformat|b=s",
      "compoundid=s", "compoundidlabel=s", "compoundidmode=s",
      "datafields=s", "datafieldsmode|d=s", "detectaromaticity=s",
      "filter|f=s", "fingerprintslabel=s", "fold=s", "foldedsize=i", "help|h",
      "ignorehydrogens|i=s", "keeplargestcomponent|k=s", "mode|m=s",
      "minpathlength=i", "maxpathlength=i", "numofbitstosetperpath|n=i",
      "outdelim=s", "output=s", "overwrite|o", "pathmode|p=s", "quote|q=s",
      "root|r=s", "size|s=i", "usebondsymbols|u=s", "useperlcorerandom=s",
      "useuniquepaths=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any further processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|FunctionalClassAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
    die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
  }
  if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
    die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{detectaromaticity} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{detectaromaticity}, for option \"--DetectAromaticity\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{fold} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{fold}, for option \"--fold\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{foldedsize})) {
    die "Error: The value specified, $Options{foldedsize}, for option \"--FoldedSize\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{ignorehydrogens} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{ignorehydrogens}, for option \"-i, --IgnoreHydrogens\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{mode} !~ /^(PathLengthBits|PathLengthCount)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: PathLengthBits or PathLengthCount\n";
  }
  if (!IsPositiveInteger($Options{minpathlength})) {
    die "Error: The value specified, $Options{minpathlength}, for option \"--MinPathLength\" is not valid. Allowed values: > 0 \n";
  }
  # GetOptions stores values under the lowercase option name; use that key to report
  # the offending value...
  if (!IsPositiveInteger($Options{numofbitstosetperpath})) {
    die "Error: The value specified, $Options{numofbitstosetperpath}, for option \"--NumOfBitsToSetPerPath\" is not valid. Allowed values: > 0 \n";
  }
  if (!IsPositiveInteger($Options{maxpathlength})) {
    die "Error: The value specified, $Options{maxpathlength}, for option \"--MaxPathLength\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  # Short name for path mode is "-p"; "-m" corresponds to "--mode" option...
  if ($Options{pathmode} !~ /^(AtomPathsWithoutRings|AtomPathsWithRings|AllAtomPathsWithoutRings|AllAtomPathsWithRings)$/i) {
    die "Error: The value specified, $Options{pathmode}, for option \"-p, --PathMode\" is not valid. Allowed values: AtomPathsWithoutRings, AtomPathsWithRings, AllAtomPathsWithoutRings or AllAtomPathsWithRings\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  # Fingerprints strings contain semicolons; unquoted output can't use it as a delimiter...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }

  if (!IsPositiveInteger($Options{size})) {
    die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: > 0 \n";
  }
  if ($Options{usebondsymbols} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{usebondsymbols}, for option \"-u, --UseBondSymbols\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{useperlcorerandom} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{useperlcorerandom}, for option \"--UsePerlCoreRandom\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{useuniquepaths} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{useuniquepaths}, for option \"--UseUniquePaths\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{vectorstringformat} !~ /^(IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
933
934 __END__
935
936 =head1 NAME
937
938 PathLengthFingerprints.pl - Generate atom path length based fingerprints for SD files
939
940 =head1 SYNOPSIS
941
942 PathLengthFingerprints.pl SDFile(s)...
943
944 PathLengthFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
945 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes>]
946 [B<--AtomicInvariantsToUse> I<"AtomicInvariant1,AtomicInvariant2...">]
947 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">]
948 [B<--BitsOrder> I<Ascending | Descending>] [B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>]
949 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
950 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
951 [B<--DataFields> I<"FieldLabel1,FieldLabel2,... ">] [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>]
952 [B<--DetectAromaticity> I<Yes | No>] [B<-f, --Filter> I<Yes | No>] [B<--FingerprintsLabel> I<text>]
953 [B<--fold> I<Yes | No>] [B<--FoldedSize> I<number>] [B<-h, --help>]
954 [B<-i, --IgnoreHydrogens> I<Yes | No>] [B<-k, --KeepLargestComponent> I<Yes | No>]
955 [B<-m, --mode> I<PathLengthBits | PathLengthCount>]
956 [B<--MinPathLength> I<number>] [B<--MaxPathLength> I<number>] [B<-n, --NumOfBitsToSetPerPath> I<number>]
957 [B<--OutDelim> I<comma | tab | semicolon>]
958 [B<--output> I<SD | FP | text | all>] [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>]
959 [B<-p, --PathMode> I<AtomPathsWithoutRings | AtomPathsWithRings | AllAtomPathsWithoutRings | AllAtomPathsWithRings>]
960 [B<-s, --size> I<number>] [B<-u, --UseBondSymbols> I<Yes | No>] [B<--UsePerlCoreRandom> I<Yes | No>]
961 [B<--UseUniquePaths> I<Yes | No>]
962 [B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
963 [B<-w, --WorkingDir> dirname] SDFile(s)...
964
965 =head1 DESCRIPTION
966
967 Generate atom path length fingerprints for I<SDFile(s)> and create appropriate SD, FP or
968 CSV/TSV text file(s) containing fingerprints bit-vector or vector strings corresponding to
969 molecular fingerprints.
970
971 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
972 and I<.sd>. All other file names are ignored. All the SD files in a current directory
973 can be specified either by I<*.sdf> or the current directory name.
974
975 The current release of MayaChemTools supports generation of path length fingerprints
976 corresponding to following B<-a, --AtomIdentifierTypes>:
977
978 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
979 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
980 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
981
982 Based on the values specified for B<-p, --PathMode>, B<--MinPathLength> and B<--MaxPathLength>,
983 all appropriate atom paths are generated for each atom in the molecule and collected in a list and
984 the list is filtered to remove any structurally duplicate paths as indicated by the value of
985 B<--UseUniquePaths> option.
986
987 For each atom path in the filtered atom paths list, an atom path string is created using value of
988 B<-a, --AtomIdentifierType> and specified values to use for a particular atom identifier type.
989 Value of B<-u, --UseBondSymbols> controls whether bond order symbols are used during generation
990 of atom path string. For each atom path, only lexicographically smaller atom path strings are kept.
991
992 For I<PathLengthBits> value of B<-m, --mode> option, each atom path is hashed to a 32 bit unsigned
993 integer key using B<TextUtil::HashCode> function. Using the hash key as a seed for a random number
994 generator, a random integer value between 0 and B<--Size> is used to set corresponding bits
995 in the fingerprint bit-vector string. Value of B<--NumOfBitsToSetPerPath> option controls the number
996 of times a random number is generated to set corresponding bits.
997
998 For I<PathLengthCount> value of B<-m, --mode> option, the number of times an atom path appears
999 is tracked and a fingerprints count-string corresponding to count of atom paths is generated.
1000
1001 Example of I<SD> file containing path length fingerprints string data:
1002
1003 ... ...
1004 ... ...
1005 $$$$
1006 ... ...
1007 ... ...
1008 ... ...
1009 41 44 0 0 0 0 0 0 0 0999 V2000
1010 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1011 ... ...
1012 2 3 1 0 0 0 0
1013 ... ...
1014 M END
1015 > <CmpdID>
1016 Cmpd1
1017
1018 > <PathLengthFingerprints>
1019 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLengt
1020 h1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a49913991a66
1021 03130b0a19e8051c89184414953800cc2151082844a201042800130860308e8204d4028
1022 00831048940e44281c00060449a5000ac80c894114e006321264401600846c050164462
1023 08190410805000304a10205b0100e04c0038ba0fad0209c0ca8b1200012268b61c0026a
1024 aa0660a11014a011d46
1025
1026 $$$$
1027 ... ...
1028 ... ...
1029
1030 Example of I<FP> file containing path length fingerprints string data:
1031
1032 #
1033 # Package = MayaChemTools 7.4
1034 # ReleaseDate = Oct 21, 2010
1035 #
1036 # TimeStamp = Mon Mar 7 15:14:01 2011
1037 #
1038 # FingerprintsStringType = FingerprintsBitVector
1039 #
1040 # Description = PathLengthBits:AtomicInvariantsAtomTypes:MinLength1:...
1041 # Size = 1024
1042 # BitStringFormat = HexadecimalString
1043 # BitsOrder = Ascending
1044 #
1045 Cmpd1 9c8460989ec8a49913991a6603130b0a19e8051c89184414953800cc21510...
1046 Cmpd2 000000249400840040100042011001001980410c000000001010088001120...
1047 ... ...
1048 ... ..
1049
1050 Example of CSV I<Text> file containing pathlength fingerprints string data:
1051
1052 "CompoundID","PathLengthFingerprints"
1053 "Cmpd1","FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes
1054 :MinLength1:MaxLength8;1024;HexadecimalString;Ascending;9c8460989ec8a4
1055 9913991a6603130b0a19e8051c89184414953800cc2151082844a20104280013086030
1056 8e8204d402800831048940e44281c00060449a5000ac80c894114e006321264401..."
1057 ... ...
1058 ... ...
1059
1060 The current release of MayaChemTools generates the following types of path length
1061 fingerprints bit-vector and vector strings:
1062
1063 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1064 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
1065 0100010101011000101001011100110001000010001001101000001001001001001000
1066 0010110100000111001001000001001010100100100000000011000000101001011100
1067 0010000001000101010100000100111100110111011011011000000010110111001101
1068 0101100011000000010001000011000010100011101100001000001000100000000...
1069
1070 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1071 th1:MaxLength8;1024;HexadecimalString;Ascending;48caa1315d82d91122b029
1072 42861c9409a4208182d12015509767bd0867653604481a8b1288000056090583603078
1073 9cedae54e26596889ab121309800900490515224208421502120a0dd9200509723ae89
1074 00024181b86c0122821d4e4880c38620dab280824b455404009f082003d52c212b4e6d
1075 6ea05280140069c780290c43
1076
1077 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
1078 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
1079 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
1080 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
1081 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
1082 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
1083
1084 FingerprintsVector;PathLengthCount:DREIDINGAtomTypes:MinLength1:MaxLen
1085 gth8;410;NumericalValues;IDsAndValuesPairsString;C_2 2 C_3 9 C_R 22 F_
1086 1 N_3 1 N_R 1 O_2 2 O_3 3 C_2=O_2 2 C_2C_3 1 C_2C_R 1 C_2N_3 1 C_2O_3
1087 1 C_3C_3 7 C_3C_R 1 C_3N_R 1 C_3O_3 2 C_R:C_R 21 C_R:N_R 2 C_RC_R 2 C
1088 _RF_ 1 C_RN_3 1 C_2C_3C_3 1 C_2C_R:C_R 2 C_2N_3C_R 1 C_3C_2=O_2 1 C_3C
1089 _2O_3 1 C_3C_3C_3 5 C_3C_3C_R 2 C_3C_3N_R 1 C_3C_3O_3 4 C_3C_R:C_R ...
1090
1091 FingerprintsVector;PathLengthCount:EStateAtomTypes:MinLength1:MaxLengt
1092 h8;454;NumericalValues;IDsAndValuesPairsString;aaCH 14 aasC 8 aasN 1 d
1093 O 2 dssC 2 sCH3 2 sF 1 sOH 3 ssCH2 4 ssNH 1 sssCH 3 aaCH:aaCH 10 aaCH:
1094 aasC 8 aasC:aasC 3 aasC:aasN 2 aasCaasC 2 aasCdssC 1 aasCsF 1 aasCssNH
1095 1 aasCsssCH 1 aasNssCH2 1 dO=dssC 2 dssCsOH 1 dssCssCH2 1 dssCssNH 1
1096 sCH3sssCH 2 sOHsssCH 2 ssCH2ssCH2 1 ssCH2sssCH 4 aaCH:aaCH:aaCH 6 a...
1097
1098 FingerprintsVector;PathLengthCount:FunctionalClassAtomTypes:MinLength1
1099 :MaxLength8;404;NumericalValues;IDsAndValuesPairsString;Ar 22 Ar.HBA 1
1100 HBA 2 HBA.HBD 3 HBD 1 Hal 1 NI 1 None 10 Ar.HBA:Ar 2 Ar.HBANone 1 Ar:
1101 Ar 21 ArAr 2 ArHBD 1 ArHal 1 ArNone 2 HBA.HBDNI 1 HBA.HBDNone 2 HBA=NI
1102 1 HBA=None 1 HBDNone 1 NINone 1 NoneNone 7 Ar.HBA:Ar:Ar 2 Ar.HBA:ArAr
1103 1 Ar.HBA:ArNone 1 Ar.HBANoneNone 1 Ar:Ar.HBA:Ar 1 Ar:Ar.HBANone 2 ...
1104
1105 FingerprintsVector;PathLengthCount:MMFF94AtomTypes:MinLength1:MaxLengt
1106 h8;463;NumericalValues;IDsAndValuesPairsString;C5A 2 C5B 2 C=ON 1 CB 1
1107 8 COO 1 CR 9 F 1 N5 1 NC=O 1 O=CN 1 O=CO 1 OC=O 1 OR 2 C5A:C5B 2 C5A:N
1108 5 2 C5ACB 1 C5ACR 1 C5B:C5B 1 C5BC=ON 1 C5BCB 1 C=ON=O=CN 1 C=ONNC=O 1
1109 CB:CB 18 CBF 1 CBNC=O 1 COO=O=CO 1 COOCR 1 COOOC=O 1 CRCR 7 CRN5 1 CR
1110 OR 2 C5A:C5B:C5B 2 C5A:C5BC=ON 1 C5A:C5BCB 1 C5A:N5:C5A 1 C5A:N5CR ...
1111
1112 FingerprintsVector;PathLengthCount:SLogPAtomTypes:MinLength1:MaxLength
1113 8;518;NumericalValues;IDsAndValuesPairsString;C1 5 C10 1 C11 1 C14 1 C
1114 18 14 C20 4 C21 2 C22 1 C5 2 CS 2 F 1 N11 1 N4 1 O10 1 O2 3 O9 1 C10C1
1115 1 C10N11 1 C11C1 2 C11C21 1 C14:C18 2 C14F 1 C18:C18 10 C18:C20 4 C18
1116 :C22 2 C1C5 1 C1CS 4 C20:C20 1 C20:C21 1 C20:N11 1 C20C20 2 C21:C21 1
1117 C21:N11 1 C21C5 1 C22N4 1 C5=O10 1 C5=O9 1 C5N4 1 C5O2 1 CSO2 2 C10...
1118
1119 FingerprintsVector;PathLengthCount:SYBYLAtomTypes:MinLength1:MaxLength
1120 8;412;NumericalValues;IDsAndValuesPairsString;C.2 2 C.3 9 C.ar 22 F 1
1121 N.am 1 N.ar 1 O.2 1 O.3 2 O.co2 2 C.2=O.2 1 C.2=O.co2 1 C.2C.3 1 C.2C.
1122 ar 1 C.2N.am 1 C.2O.co2 1 C.3C.3 7 C.3C.ar 1 C.3N.ar 1 C.3O.3 2 C.ar:C
1123 .ar 21 C.ar:N.ar 2 C.arC.ar 2 C.arF 1 C.arN.am 1 C.2C.3C.3 1 C.2C.ar:C
1124 .ar 2 C.2N.amC.ar 1 C.3C.2=O.co2 1 C.3C.2O.co2 1 C.3C.3C.3 5 C.3C.3...
1125
1126 FingerprintsVector;PathLengthCount:TPSAAtomTypes:MinLength1:MaxLength8
1127 ;331;NumericalValues;IDsAndValuesPairsString;N21 1 N7 1 None 34 O3 2 O
1128 4 3 N21:None 2 N21None 1 N7None 2 None:None 21 None=O3 2 NoneNone 13 N
1129 oneO4 3 N21:None:None 2 N21:NoneNone 2 N21NoneNone 1 N7None:None 2 N7N
1130 one=O3 1 N7NoneNone 1 None:N21:None 1 None:N21None 2 None:None:None 20
1131 None:NoneNone 12 NoneN7None 1 NoneNone=O3 2 NoneNoneNone 8 NoneNon...
1132
1133 FingerprintsVector;PathLengthCount:UFFAtomTypes:MinLength1:MaxLength8;
1134 410;NumericalValues;IDsAndValuesPairsString;C_2 2 C_3 9 C_R 22 F_ 1 N_
1135 3 1 N_R 1 O_2 2 O_3 3 C_2=O_2 2 C_2C_3 1 C_2C_R 1 C_2N_3 1 C_2O_3 1 C_
1136 3C_3 7 C_3C_R 1 C_3N_R 1 C_3O_3 2 C_R:C_R 21 C_R:N_R 2 C_RC_R 2 C_RF_
1137 1 C_RN_3 1 C_2C_3C_3 1 C_2C_R:C_R 2 C_2N_3C_R 1 C_3C_2=O_2 1 C_3C_2O_3
1138 1 C_3C_3C_3 5 C_3C_3C_R 2 C_3C_3N_R 1 C_3C_3O_3 4 C_3C_R:C_R 1 C_3...
1139
1140 =head1 OPTIONS
1141
1142 =over 4
1143
1144 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
1145
1146 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
1147 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
1148 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
1149 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
1150
1151 The supported aromaticity model names along with model specific control parameters
1152 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
1153 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
1154 this file during class instantiation and makes it available to method B<DetectAromaticity>
1155 for detecting aromaticity corresponding to a specific model.
1156
1157 This option is ignored during I<No> value of B<--DetectAromaticity> option.
1158
1159 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
1160
1161 Specify atom identifier type to use for assignment of atom types to hydrogen and/or
1162 non-hydrogen atoms during calculation of atom types fingerprints. Possible values in the
1163 current release are: I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
1164 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
1165 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
1166
1167 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | FunctionalClassAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
1168
1169 Specify atom identifier type to use during generation of atom path strings
1170 corresponding to path length fingerprints. Possible values in the current release are:
1171 I<AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
1172 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
1173 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
1174
1175 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant1,AtomicInvariant2...">
1176
1177 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType>
1178 option. It's a list of comma separated valid atomic invariant atom types.
1179
1180 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
1181 H, Ar, RA, FC, MN, SM>. Default value: I<AS>.
1182
1183 The atomic invariants abbreviations correspond to:
1184
1185 AS = Atom symbol corresponding to element symbol
1186
1187 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
1188 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
1189 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
1190 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
1191 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
1192 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
1193 H<n> = Number of implicit and explicit hydrogens for atom
1194 Ar = Aromatic annotation indicating whether atom is aromatic
1195 RA = Ring atom annotation indicating whether atom is in a ring
1196 FC<+n/-n> = Formal charge assigned to atom
1197 MN<n> = Mass number indicating isotope other than most abundant isotope
1198 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
1199 3 (triplet)
1200
1201 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1202
1203 AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1204
1205 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1206 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1207
1208 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1209 are also allowed:
1210
1211 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1212 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1213 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1214 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1215 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1216 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1217 H : NumOfImplicitAndExplicitHydrogens
1218 Ar : Aromatic
1219 RA : RingAtom
1220 FC : FormalCharge
1221 MN : MassNumber
1222 SM : SpinMultiplicity
1223
1224 Examples:
1225
1226 B<Benzene>: Using value of I<AS> for B<--AtomicInvariantsToUse>, I<Yes> for B<-u, --UseBondSymbols>,
1227 and I<AllAtomPathsWithRings> for B<-p, --PathMode>, atom path strings generated are:
1228
1229 C C:C C:C:C C:C:C:C C:C:C:C:C C:C:C:C:C:C C:C:C:C:C:C:C
1230
1231 And using I<AS,X,BO> for B<--AtomicInvariantsToUse> generates following atom path
1232 strings:
1233
1234 C.X2.BO3 C.X2.BO3:C.X2.BO3 C.X2.BO3:C.X2.BO3:C.X2.BO3
1235 C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3
1236 C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3
1237 C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3
1238 C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3:C.X2.BO3
1239
1240 B<Urea>: Using value of I<AS> for B<--AtomicInvariantsToUse>, I<Yes> for B<-u, --UseBondSymbols>,
1241 and I<AllAtomPathsWithRings> for B<-p, --PathMode>, atom path strings generated are:
1242
1243 C N O C=O CN NC=O NCN
1244
1245 And using I<AS,X,BO> for B<--AtomicInvariantsToUse> generates following atom path
1246 strings:
1247
1248 C.X3.BO4 N.X1.BO1 O.X1.BO2 C.X3.BO4=O.X1.BO2
1249 C.X3.BO4N.X1.BO1 N.X1.BO1C.X3.BO4=O.X1.BO2
1250 N.X1.BO1C.X3.BO4N.X1.BO1
1251
1252 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">
1253
1254 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType>
1255 option. It's a list of comma separated valid functional classes.
1256
1257 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1258 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1259
1260 The functional class abbreviations correspond to:
1261
1262 HBD: HydrogenBondDonor
1263 HBA: HydrogenBondAcceptor
1264 PI : PositivelyIonizable
1265 NI : NegativelyIonizable
1266 Ar : Aromatic
1267 Hal : Halogen
1268 H : Hydrophobic
1269 RA : RingAtom
1270 CA : ChainAtom
1271
1272 Functional class atom type specification for an atom corresponds to:
1273
1274 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1275
1276 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1277 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1278
1279 HydrogenBondDonor: NH, NH2, OH
1280 HydrogenBondAcceptor: N[!H], O
1281 PositivelyIonizable: +, NH2
1282 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1283
1284 =item B<--BitsOrder> I<Ascending | Descending>
1285
1286 Bits order to use during generation of fingerprints bit-vector string for I<PathLengthBits> value of
1287 B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>.
1288
1289 I<Ascending> bit order corresponds to the first bit in each byte being the lowest bit as
1290 opposed to the highest bit.
1291
1292 Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless
1293 of machine order, big-endian or little-endian, vec function always considers first
1294 string byte as the lowest byte and first bit within each byte as the lowest bit.
1295
1296 =item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>
1297
1298 Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by
1299 B<--output> used during I<PathLengthBits> value of B<-m, --mode> option. Possible
1300 values: I<BinaryString, HexadecimalString>. Default value: I<HexadecimalString>.
1301
1302 I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString>
1303 contains bit values in ASCII hexadecimal format.
1304
1305 Examples:
1306
1307 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1308 th1:MaxLength8;1024;BinaryString;Ascending;001000010011010101011000110
1309 0100010101011000101001011100110001000010001001101000001001001001001000
1310 0010110100000111001001000001001010100100100000000011000000101001011100
1311 0010000001000101010100000100111100110111011011011000000010110111001101
1312 0101100011000000010001000011000010100011101100001000001000100000000...
1313
1314 FingerprintsBitVector;PathLengthBits:AtomicInvariantsAtomTypes:MinLeng
1315 th1:MaxLength8;1024;HexadecimalString;Ascending;48caa1315d82d91122b029
1316 42861c9409a4208182d12015509767bd0867653604481a8b1288000056090583603078
1317 9cedae54e26596889ab121309800900490515224208421502120a0dd9200509723ae89
1318 00024181b86c0122821d4e4880c38620dab280824b455404009f082003d52c212b4e6d
1319 6ea05280140069c780290c43
1320
1321 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1322
1323 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1324
1325 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1326 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1327 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1328 look like Cmpd<Number>.
1329
1330 Examples for I<DataField> value of B<--CompoundIDMode>:
1331
1332 MolID
1333 ExtReg
1334
1335 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1336
1337 Compound
1338
1339 The value specified above generates compound IDs which correspond to Compound<Number>
1340 instead of default value of Cmpd<Number>.
1341
1342 =item B<--CompoundIDLabel> I<text>
1343
1344 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value
1345 of B<--DataFieldsMode> option. Default: I<CompoundID>.
1346
1347 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1348
1349 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1350 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1351 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1352 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1353
1354 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1355 Default: I<LabelPrefix>.
1356
1357 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1358 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1359 values are replaced with sequential compound IDs.
1360
1361 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1362
1363 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,... ">
1364
1365 Comma delimited list of I<SDFile(s)> data fields to extract and write to CSV/TSV text file(s) along
1366 with generated fingerprints for I<text | all> values of B<--output> option.
1367
1368 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1369
1370 Examples:
1371
1372 Extreg
1373 MolID,CompoundName
1374
1375 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1376
1377 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1378 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1379 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1380 generate a compound ID using molname line, a compound prefix, or a combination of both.
1381 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1382
1383 =item B<--DetectAromaticity> I<Yes | No>
1384
1385 Detect aromaticity before generating fingerprints. Possible values: I<Yes or No>.
1386 Default value: I<Yes>.
1387
1388 I<No> B<--DetectAromaticity> forces usage of atom and bond aromaticity values
1389 from I<SDFile(s)> and skips the step which detects and assigns aromaticity.
1390
1391 I<No> B<--DetectAromaticity> value is only allowed during I<AtomicInvariantsAtomTypes>
1392 value of B<-a, --AtomIdentifierType> option; for all other possible B<-a, --AtomIdentifierType>
1393 values, it must be I<Yes>.
1394
1395 =item B<-f, --Filter> I<Yes | No>
1396
1397 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1398 Default value: I<Yes>.
1399
1400 By default, compound data is checked before calculating fingerprints and compounds containing
1401 atom data corresponding to non-element symbols or no atom data are ignored.
1402
1403 =item B<--FingerprintsLabel> I<text>
1404
1405 SD data label or text file column label to use for fingerprints string in output SD or
1406 CSV/TSV text file(s) specified by B<--output>. Default value: I<PathLengthFingerprints>.
1407
1408 =item B<--fold> I<Yes | No>
1409
1410 Fold fingerprints to increase bit density during I<PathLengthBits> value of
1411 B<-m, --mode> option. Possible values: I<Yes or No>. Default value: I<No>.
1412
1413 =item B<--FoldedSize> I<number>
1414
1415 Size of folded fingerprint during I<PathLengthBits> value of B<-m, --mode> option. Default
1416 value: I<256>. Valid values correspond to any positive integer which is less than
1417 B<-s, --size> and meets the criteria for its value.
1418
1419 Examples:
1420
1421 128
1422 512
1423
1424 =item B<-h, --help>
1425
1426 Print this help message.
1427
1428 =item B<-i, --IgnoreHydrogens> I<Yes | No>
1429
1430 Ignore hydrogens during fingerprints generation. Possible values: I<Yes or No>.
1431 Default value: I<Yes>.
1432
1433 For I<No> value of B<-i, --IgnoreHydrogens>, any explicit hydrogens are also used for
1434 generation of atoms path lengths and fingerprints; implicit hydrogens are still ignored.
1435
1436 =item B<-k, --KeepLargestComponent> I<Yes | No>
1437
1438 Generate fingerprints for only the largest component in molecule. Possible values:
1439 I<Yes or No>. Default value: I<Yes>.
1440
1441 For molecules containing multiple connected components, fingerprints can be generated
1442 in two different ways: use all connected components or just the largest connected
1443 component. By default, all atoms except for the largest connected component are
1444 deleted before generation of fingerprints.
1445
1446 =item B<-m, --mode> I<PathLengthBits | PathLengthCount>
1447
1448 Specify type of path length fingerprints to generate for molecules in I<SDFile(s)>. Possible
1449 values: I<PathLengthBits, PathLengthCount>. Default value: I<PathLengthBits>.
1450
1451 For I<PathLengthBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing
1452 zeros and ones is generated and for I<PathLengthCount> value, a fingerprint vector string
1453 corresponding to number of atom paths is generated.
1454
1455 =item B<--MinPathLength> I<number>
1456
1457 Minimum atom path length to include in fingerprints. Default value: I<1>. Valid values:
1458 positive integers and less than B<--MaxPathLength>. Path length of 1 corresponds to
1459 a path containing only one atom.
1460
1461 =item B<--MaxPathLength> I<number>
1462
1463 Maximum atom path length to include in fingerprints. Default value: I<8>. Valid values:
1464 positive integers and greater than B<--MinPathLength>.
1465
1466 =item B<-n, --NumOfBitsToSetPerPath> I<number>
1467
1468 Number of bits to set per path during generation of fingerprints bit-vector string for I<PathLengthBits>
1469 value of B<-m, --mode> option. Default value: I<1>. Valid values: positive integers.
1470
1471 =item B<--OutDelim> I<comma | tab | semicolon>
1472
1473 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1474 Default value: I<comma>.
1475
1476 =item B<--output> I<SD | FP | text | all>
1477
1478 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1479
1480 =item B<-o, --overwrite>
1481
1482 Overwrite existing files.
1483
1484 =item B<-p, --PathMode> I<AtomPathsWithoutRings | AtomPathsWithRings | AllAtomPathsWithoutRings | AllAtomPathsWithRings>
1485
1486 Specify type of atom paths to use for generating pathlength fingerprints for molecules in
1487 I<SDFile(s)>. Possible values: I<AtomPathsWithoutRings, AtomPathsWithRings,
1488 AllAtomPathsWithoutRings, AllAtomPathsWithRings>. Default value: I<AllAtomPathsWithRings>.
1489
1490 For molecules with no rings, first two and last two options are equivalent and generate
1491 same set of atom paths starting from each atom with length between B<--MinPathLength>
1492 and B<--MaxPathLength>. However, all these four options can result in the same set of
1493 final atom paths for molecules containing fused, bridged or spiro rings.
1494
1495 For molecules containing rings, atom paths starting from each atom can be traversed in
1496 four different ways:
1497
1498 I<AtomPathsWithoutRings> - Atom paths containing no rings and without sharing of bonds
1499 in traversed paths.
1500
1501 I<AtomPathsWithRings> - Atom paths containing rings and without any sharing of bonds in
1502 traversed paths.
1503
1504 I<AllAtomPathsWithoutRings> - All possible atom paths containing no rings and without any
1505 sharing of bonds in traversed paths.
1506
1507 I<AllAtomPathsWithRings> - All possible atom paths containing rings and with sharing of
1508 bonds in traversed paths.
1509
1510 Atom path traversal is terminated at the ring atom.
1511
1512 Based on values specified for B<-p, --PathMode>, B<--MinPathLength> and
1513 B<--MaxPathLength>, all appropriate atom paths are generated for each atom in the molecule
1514 and collected in a list.
1515
1516 For each atom path in the filtered atom paths list, an atom path string is created using value of
1517 B<-a, --AtomIdentifierType> and specified values to use for a particular atom identifier type.
1518 Value of B<-u, --UseBondSymbols> controls whether bond order symbols are used during generation
1519 of atom path string. Atom symbol corresponds to element symbol and characters used to represent
1520 bond order are: I<1 - None; 2 - '='; 3 - '#'; 1.5 or aromatic - ':'; others: bond order value>. By default,
1521 bond symbols are included in atom path strings. Exclusion of bond symbols in atom path strings
1522 results in fingerprints which correspond purely to atom paths without considering bonds.
1523
1524 B<--UseUniquePaths> controls whether structurally duplicate atom path strings are removed
1525 from the list.
1526
1527 For I<PathLengthBits> value of B<-m, --mode> option, each atom path is hashed to a 32 bit unsigned
1528 integer key using B<TextUtil::HashCode> function. Using the hash key as a seed for a random number
1529 generator, a random integer value between 0 and B<--Size> is used to set corresponding bits
1530 in the fingerprint bit-vector string. Value of B<--NumOfBitsToSetPerPath> option controls the number
1531 of times a random number is generated to set corresponding bits.
1532
1533 For I<PathLengthCount> value of B<-m, --mode> option, the number of times an atom path appears
1534 is tracked and a fingerprints count-string corresponding to count of atom paths is generated.
1535
1536 For molecules containing rings, combination of B<-p, --PathMode> and B<--UseBondSymbols> allows
1537 generation of up to 8 different types of atom path length strings:
1538
1539 AllowSharedBonds AllowRings UseBondSymbols
1540
1541 0 0 1 - AtomPathsNoCyclesWithBondSymbols
1542 0 1 1 - AtomPathsWithCyclesWithBondSymbols
1543
1544 1 0 1 - AllAtomPathsNoCyclesWithBondSymbols
1545 1 1 1 - AllAtomPathsWithCyclesWithBondSymbols
1546 [ DEFAULT ]
1547
1548 0 0 0 - AtomPathsNoCyclesNoBondSymbols
1549 0 1 0 - AtomPathsWithCyclesNoBondSymbols
1550
1551 1 0 0 - AllAtomPathsNoCyclesNoBondSymbols
1552 1 1 0 - AllAtomPathsWithCyclesNoBondSymbols
1553
1554 Default atom path length fingerprints generation for molecules containing rings with
1555 I<AllAtomPathsWithRings> value for B<-p, --PathMode>, I<Yes> value for B<--UseBondSymbols>,
1556 I<1> value for B<--MinPathLength> and I<8> value for B<--MaxPathLength> is the most time
1557 consuming. Combinations of other options can substantially speed up fingerprint generation
1558 for molecules containing complex ring systems.
1559
1560 Additionally, value for option B<-a, --AtomIdentifierType> in conjunction with corresponding specified
1561 values for atom types changes the nature of atom path length strings and the fingerprints.
1562
1563 =item B<-q, --quote> I<Yes | No>
1564
1565 Put quote around column values in output CSV/TSV text file(s). Possible values:
1566 I<Yes or No>. Default value: I<Yes>.
1567
1568 =item B<-r, --root> I<RootName>
1569
1570 New file name is generated using the root: <Root>.<Ext>. Default for new file
1571 names: <SDFileName><PathLengthFP>.<Ext>. The file type determines <Ext> value.
1572 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1573 delimited text files, respectively. This option is ignored for multiple input files.
1574
1575 =item B<-s, --size> I<number>
1576
1577 Size of fingerprints. Default value: I<1024>. Valid values correspond to any positive
1578 integer which satisfies the following criteria: power of 2, >= 32 and <= 2 ** 32.
1579
1580 Examples:
1581
1582 256
1583 512
1584 2048
1585
1586 =item B<-u, --UseBondSymbols> I<Yes | No>
1587
1588 Specify whether to use bond symbols for atom paths during generation of atom path strings.
1589 Possible values: I<Yes or No>. Default value: I<Yes>.
1590
1591 I<No> value option for B<-u, --UseBondSymbols> allows the generation of fingerprints corresponding
1592 purely to atoms disregarding all bonds.
1593
1594 =item B<--UsePerlCoreRandom> I<Yes | No>
1595
1596 Specify whether to use Perl CORE::rand or MayaChemTools MathUtil::random function
1597 during random number generation for setting bits in fingerprints bit-vector strings. Possible
1598 values: I<Yes or No>. Default value: I<Yes>.
1599
1600 I<No> value option for B<--UsePerlCoreRandom> allows the generation of fingerprints
1601 bit-vector strings which are same across different platforms.
1602
1603 The random number generator implemented in MayaChemTools is a variant of
1604 linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ].
1605 It is also referred to as Lehmer random number generator or Park-Miller
1606 random number generator.
1607
1608 Unlike Perl's core random number generator function rand, the random number
1609 generator implemented in MayaChemTools, MathUtil::random, generates consistent
1610 random values across different platforms for a specific random seed and leads
1611 to generation of portable fingerprints bit-vector strings.
1612
1613 =item B<--UseUniquePaths> I<Yes | No>
1614
1615 Specify whether to use structurally unique atom paths during generation of atom path strings.
1616 Possible values: I<Yes or No>. Default value: I<Yes>.
1617
1618 I<No> value option for B<--UseUniquePaths> allows usage of all atom paths generated by
1619 B<-p, --PathMode> option value for generation of atom path strings leading to duplicate
1620 path count during I<PathLengthCount> value of B<-m, --mode> option. It doesn't change fingerprint
1621 string generated during I<PathLengthBits> value of B<-m, --mode>.
1622
1623 For example, during I<AllAtomPathsWithRings> value of B<-p, --PathMode> option, benzene has
1624 12 linear paths of length 2 and 12 cyclic paths of length 7, but only 6 linear paths of length 2 and
1625 1 cyclic path of length 7 are structurally unique.
1626
1627 =item B<-v, --VectorStringFormat> I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1628
1629 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1630 B<--output> used during I<PathLengthCount> value of B<-m, --mode> option. Possible
1631 values: I<IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString |
1632 ValuesAndIDsPairsString>. Default value: I<IDsAndValuesString>.
1633
1634 Examples:
1635
1636 FingerprintsVector;PathLengthCount:AtomicInvariantsAtomTypes:MinLength
1637 1:MaxLength8;432;NumericalValues;IDsAndValuesPairsString;C.X1.BO1.H3 2
1638 C.X2.BO2.H2 4 C.X2.BO3.H1 14 C.X3.BO3.H1 3 C.X3.BO4 10 F.X1.BO1 1 N.X
1639 2.BO2.H1 1 N.X3.BO3 1 O.X1.BO1.H1 3 O.X1.BO2 2 C.X1.BO1.H3C.X3.BO3.H1
1640 2 C.X2.BO2.H2C.X2.BO2.H2 1 C.X2.BO2.H2C.X3.BO3.H1 4 C.X2.BO2.H2C.X3.BO
1641 4 1 C.X2.BO2.H2N.X3.BO3 1 C.X2.BO3.H1:C.X2.BO3.H1 10 C.X2.BO3.H1:C....
1642
1643 FingerprintsVector;PathLengthCount:EStateAtomTypes:MinLength1:MaxLengt
1644 h8;454;NumericalValues;IDsAndValuesPairsString;aaCH 14 aasC 8 aasN 1 d
1645 O 2 dssC 2 sCH3 2 sF 1 sOH 3 ssCH2 4 ssNH 1 sssCH 3 aaCH:aaCH 10 aaCH:
1646 aasC 8 aasC:aasC 3 aasC:aasN 2 aasCaasC 2 aasCdssC 1 aasCsF 1 aasCssNH
1647 1 aasCsssCH 1 aasNssCH2 1 dO=dssC 2 dssCsOH 1 dssCssCH2 1 dssCssNH 1
1648 sCH3sssCH 2 sOHsssCH 2 ssCH2ssCH2 1 ssCH2sssCH 4 aaCH:aaCH:aaCH 6 a...
1649
1650 =item B<-w, --WorkingDir> I<DirName>
1651
1652 Location of working directory. Default: current directory.
1653
1654 =back
1655
1656 =head1 EXAMPLES
1657
1658 To generate path length fingerprints corresponding to all unique paths from length 1
1659 through 8 in hexadecimal bit-vector string format of size 1024 and create a
1660 SamplePLFPHex.csv file containing sequential compound IDs along with fingerprints
1661 bit-vector strings data, type:
1662
1663 % PathLengthFingerprints.pl -o -r SamplePLFPHex Sample.sdf
1664
1665 To generate path length fingerprints corresponding to all unique paths from length 1
1666 through 8 in hexadecimal bit-vector string format of size 1024 and create SamplePLFPHex.sdf,
1667 SamplePLFPHex.fpf, and SamplePLFPHex.csv files containing sequential compound IDs
1668 in CSV file along with fingerprints bit-vector strings data, type:
1669
1670 % PathLengthFingerprints.pl --output all -o -r SamplePLFPHex Sample.sdf
1671
1672 To generate path length fingerprints corresponding to all unique paths from length 1
1673 through 8 in binary bit-vector string format of size 2048 and create a
1674 SamplePLFPBin.csv file containing sequential compound IDs along with fingerprints
1675 bit-vector strings data, type:
1676
1677 % PathLengthFingerprints.pl --BitStringFormat BinaryString --size 2048
1678 -o -r SamplePLFPBin Sample.sdf
1679
1680 To generate path length fingerprints corresponding to count of all unique paths from
1681 length 1 through 8 in IDsAndValuesString format and create a SamplePLFPCount.csv file
1682 containing sequential compound IDs along with fingerprints vector strings data, type:
1683
1684 % PathLengthFingerprints.pl -m PathLengthCount -o -r SamplePLFPCount
1685 Sample.sdf
1686
1687 To generate path length fingerprints corresponding to count of all unique paths from
1688 length 1 through 8 in IDsAndValuesString format using E-state atom types and
1689 create a SamplePLFPCount.csv file containing sequential compound IDs along with fingerprints
1690 vector strings data, type:
1691
1692 % PathLengthFingerprints.pl -m PathLengthCount --AtomIdentifierType
1693 EStateAtomTypes -o -r SamplePLFPCount Sample.sdf
1694
1695 To generate path length fingerprints corresponding to count of all unique paths from
1696 length 1 through 8 in IDsAndValuesString format using SLogP atom types and
1697 create a SamplePLFPCount.csv file containing sequential compound IDs along with fingerprints
1698 vector strings data, type:
1699
1700 % PathLengthFingerprints.pl -m PathLengthCount --AtomIdentifierType
1701 SLogPAtomTypes -o -r SamplePLFPCount Sample.sdf
1702
1703 To generate path length fingerprints corresponding to count of all unique paths from
1704 length 1 through 8 in ValuesAndIDsPairsString format and create a SamplePLFPCount.csv file
1705 containing sequential compound IDs along with fingerprints vector strings data, type:
1706
1707 % PathLengthFingerprints.pl -m PathLengthCount --VectorStringFormat
1708 ValuesAndIDsPairsString -o -r SamplePLFPCount Sample.sdf
1709
1710 To generate path length fingerprints corresponding to count of all unique paths from
1711 length 1 through 8 in IDsAndValuesString format using AS,X,BO as atomic invariants and
1712 create a SamplePLFPCount.csv file containing sequential compound IDs along with fingerprints
1713 vector strings data, type:
1714
1715 % PathLengthFingerprints.pl -m PathLengthCount --AtomIdentifierType
1716 AtomicInvariantsAtomTypes --AtomicInvariantsToUse "AS,X,BO" -o
1717 -r SamplePLFPCount Sample.sdf
1718
1719 To generate path length fingerprints corresponding to count of all paths from
1720 length 1 through 8 in IDsAndValuesString format and create a SamplePLFPCount.csv file
1721 containing compound IDs from MolName line along with fingerprints vector strings data, type:
1722
1723 % PathLengthFingerprints.pl -m PathLengthCount --UseUniquePaths No
1724 -o --CompoundIDMode MolName -r SamplePLFPCount
1725 Sample.sdf
1726
1727 To generate path length fingerprints corresponding to all unique paths from length 1
1728 through 8 in hexadecimal bit-vector string format of size 512 after folding and create
1729 SamplePLFPHex.sdf, SamplePLFPHex.fpf, and SamplePLFPHex.csv files containing sequential
1730 compound IDs along with fingerprints bit-vector strings data, type:
1731
1732 % PathLengthFingerprints.pl --output all --Fold Yes --FoldedSize 512
1733 -o -r SamplePLFPHex Sample.sdf
1734
1735 To generate path length fingerprints corresponding to all unique paths from length 1
1736 through 8 containing no rings and without sharing of bonds in hexadecimal bit-vector
1737 string format of size 1024 and create a SamplePLFPHex.csv file containing sequential
1738 compound IDs along with fingerprints bit-vector strings data and all data fields, type:
1739
1740 % PathLengthFingerprints.pl -p AtomPathsWithoutRings --DataFieldsMode All
1741 -o -r SamplePLFPHex Sample.sdf
1742
1743 To generate path length fingerprints corresponding to all unique paths from length 1
1744 through 8 containing rings and without sharing of bonds in hexadecimal bit-vector
1745 string format of size 1024 and create a SamplePLFPHex.tsv file containing compound IDs
1746 derived from combination of molecule name line and an explicit compound prefix
1747 along with fingerprints bit-vector strings data and all data fields, type:
1748
1749 % PathLengthFingerprints.pl -p AtomPathsWithRings --DataFieldsMode
1750 CompoundID --CompoundIDMode MolnameOrLabelPrefix --CompoundID Cmpd
1751 --CompoundIDLabel MolID --FingerprintsLabel PathLengthFP --OutDelim Tab
1752 -r SamplePLFPHex -o Sample.sdf
1753
1754 To generate path length fingerprints corresponding to count of all unique paths from
1755 length 1 through 8 in IDsAndValuesString format and create a SamplePLFPCount.csv file
1756 containing sequential compound IDs along with fingerprints vector strings data using
1757 aromaticity specified in SD file, type:
1758
1759 % PathLengthFingerprints.pl -m PathLengthCount --DetectAromaticity No
1760 -o -r SamplePLFPCount Sample.sdf
1761
1762 To generate path length fingerprints corresponding to all unique paths from length 2
1763 through 6 in hexadecimal bit-vector string format of size 1024 and create a
1764 SamplePLFPHex.csv file containing sequential compound IDs along with fingerprints
1765 bit-vector strings data, type:
1766
1767 % PathLengthFingerprints.pl --MinPathLength 2 --MaxPathLength 6
1768 -o -r SamplePLFPHex Sample.sdf
1769
1770 =head1 AUTHOR
1771
1772 Manish Sud <msud@san.rr.com>
1773
1774 =head1 SEE ALSO
1775
1776 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1777 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl,
1778 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1779 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1780
1781 =head1 COPYRIGHT
1782
1783 Copyright (C) 2015 Manish Sud. All rights reserved.
1784
1785 This file is part of MayaChemTools.
1786
1787 MayaChemTools is free software; you can redistribute it and/or modify it under
1788 the terms of the GNU Lesser General Public License as published by the Free
1789 Software Foundation; either version 3 of the License, or (at your option)
1790 any later version.
1791
1792 =cut