comparison bin/EStateIndiciesFingerprints.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: EStateIndiciesFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.23 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::EStateAtomTypes;
43 use Fingerprints::EStateIndiciesFingerprints;
44
# Script driver: announce the start, parse the command line, validate the
# input SD files and generate fingerprints for every usable file.
#
# The lexicals declared in this section are file scoped on purpose: the
# subroutines defined further down read and write them directly.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage on request or when no
# input file was named...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; files flagged as not okay are skipped silently
# here because RetrieveSDFilesInfo has already warned about them...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for my $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateEStateIndiciesFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
90
91 ###############################################################################
92
# Generate fingerprints for all compounds in one SD file.
#
# Argument: index of the input file in @SDFilesList. Opens the requested
# output files, reads the input molecule by molecule, writes fingerprints
# for every compound that is neither filtered out nor fails fingerprints
# generation, and finally prints summary statistics.
#
sub GenerateEStateIndiciesFingerprints {
    my($FileIndex) = @_;

    my $InputFile = $SDFilesList[$FileIndex];

    # Setup output files...
    my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

    my $Reader = MoleculeFileIO->new('Name' => $InputFile);
    $Reader->Open();

    my($TotalCount, $SkippedCount) = (0, 0);

    COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
        $TotalCount++;

        # Filter compound data before calculating fingerprints...
        if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
            $SkippedCount++;
            next COMPOUND;
        }

        my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
        unless ($Fingerprints) {
            $SkippedCount++;
            ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
            next COMPOUND;
        }

        WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
    }
    $Reader->Close();

    # Only the writers that were actually requested are defined...
    for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
        $Writer->Close() if $Writer;
    }

    WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}
145
# Warn about a compound being ignored due to problems in fingerprints generation.
#
# Arguments: the reason mode (ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed, matched case insensitively), the compound
# record number and the molecule object.
#
sub ProcessIgnoredCompound {
    my($Mode, $CmpdCount, $Molecule) = @_;

    my $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
    my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);

    my $Reason;
    if ($Mode =~ /^ContainsNonElementalData$/i) {
        $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)...";
    }
    elsif ($Mode =~ /^ContainsNoElementalData$/i) {
        $Reason = "Compound contains no atom data...";
    }
    else {
        # FingerprintsGenerationFailed and any unrecognized mode share one message...
        $Reason = "Fingerprints generation didn't succeed...";
    }

    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}
173
# Decide whether a compound must be skipped before fingerprints generation.
#
# Returns 1, after emitting a warning, when the molecule contains any
# non-elemental atom data or no elemental atom data at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
    my($CmpdCount, $Molecule) = @_;

    my($NumElements, $NumNonElements) = $Molecule->GetNumOfElementsAndNonElements();

    # Non-elemental data takes precedence over the empty molecule check...
    my $Problem = $NumNonElements ? 'ContainsNonElementalData'
                : !$NumElements   ? 'ContainsNoElementalData'
                :                   '';
    return 0 unless $Problem;

    ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
    return 1;
}
194
# Print summary statistics for fingerprints generation to STDOUT.
#
# Arguments: total number of compounds read and number of compounds ignored;
# the number processed successfully is the difference of the two.
#
sub WriteFingerprintsGenerationSummaryStatistics {
    my($CmpdCount, $IgnoredCmpdCount) = @_;

    my $SucceededCount = $CmpdCount - $IgnoredCmpdCount;

    my @SummaryLines = (
        "\nNumber of compounds: $CmpdCount\n",
        "Number of compounds processed successfully during fingerprints generation: $SucceededCount\n",
        "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n",
    );
    print for @SummaryLines;
}
207
# Open the output files requested for one input file.
#
# Argument: index of the input file. Returns the list
# (SD writer, text writer, FP writer); the entry for an output type that
# was not requested is undef.
#
sub SetupAndOpenOutputFiles {
    my($FileIndex) = @_;
    my($SDWriter, $TextWriter, $FPWriter);

    # Parameters shared by all fingerprints file IO objects...
    my %CommonParams = (
        'Mode' => 'Write',
        'Overwrite' => $OptionsInfo{OverwriteFiles},
        'FingerprintsStringMode' => 'FingerprintsVectorString',
        'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
    );

    if ($OptionsInfo{SDOutput}) {
        my $OutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
        print "Generating SD file $OutFile...\n";
        $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $OutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
        $SDWriter->Open();
    }

    if ($OptionsInfo{FPOutput}) {
        my $OutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
        print "Generating FP file $OutFile...\n";
        $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $OutFile, %CommonParams);
        $FPWriter->Open();
    }

    if ($OptionsInfo{TextOutput}) {
        my $OutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
        my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

        print "Generating text file $OutFile...\n";
        $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $OutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
        $TextWriter->Open();
    }

    return ($SDWriter, $TextWriter, $FPWriter);
}
248
# Write fingerprints and other data for one compound to the open output files.
#
# Arguments: input file index, compound record number, molecule, fingerprints
# object and the SD, text and FP writers (each may be undef when that output
# type was not requested).
#
sub WriteDataToOutputFiles {
    my($FileIndex, $CmpdCount, $Molecule, $EStateIndiciesFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

    # Data fields are needed only by the text and FP writers...
    my $DataFieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

    if ($NewFPSDFileIO) {
        my $CmpdString = $Molecule->GetInputMoleculeString();
        $NewFPSDFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CmpdString);
    }

    if ($NewFPTextFileIO) {
        my $ColValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
        $NewFPTextFileIO->WriteFingerprints($EStateIndiciesFingerprints, $ColValuesRef);
    }

    if ($NewFPFileIO) {
        my $CompoundID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
        $NewFPFileIO->WriteFingerprints($EStateIndiciesFingerprints, $CompoundID);
    }
}
282
# Generate appropriate column labels for the FP text output file.
#
# Argument: index of the input file. Returns a reference to an array holding
# the data column labels selected by the data fields mode, followed by the
# fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
    my($FileIndex) = @_;
    my $Mode = $OptionsInfo{DataFieldsMode};

    my @DataLabels = $Mode =~ /^All$/i        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
                   : $Mode =~ /^Common$/i     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
                   : $Mode =~ /^Specify$/i    ? @{$OptionsInfo{SpecifiedDataFields}}
                   : $Mode =~ /^CompoundID$/i ? ($OptionsInfo{CompoundIDLabel})
                   :                            ();

    # The fingerprints label is always the last column...
    return [ @DataLabels, $OptionsInfo{FingerprintsLabel} ];
}
307
# Generate data column values for one compound in the FP text output file.
#
# Arguments: input file index, compound record number, molecule and a
# reference to the compound's data field label/value hash. Returns a
# reference to an array of values; a data field missing from the compound
# yields an empty string.
#
sub SetupFPTextFileCoulmnValues {
    my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
    my $Mode = $OptionsInfo{DataFieldsMode};

    if ($Mode =~ /^CompoundID$/i) {
        return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
    }

    # Pick the list of data field names whose values are to be written out...
    my $FieldNamesRef = $Mode =~ /^All$/i     ? $SDFilesInfo{AllDataFieldsRef}[$FileIndex]
                      : $Mode =~ /^Common$/i  ? $SDFilesInfo{CommonDataFieldsRef}[$FileIndex]
                      : $Mode =~ /^Specify$/i ? $OptionsInfo{SpecifiedDataFields}
                      :                         undef;
    return [] unless defined $FieldNamesRef;

    my @Values = map { exists $DataFieldLabelAndValuesRef->{$_} ? $DataFieldLabelAndValuesRef->{$_} : '' } @{$FieldNamesRef};

    return \@Values;
}
330
# Generate the compound ID used in FP and FP text output files and in warnings.
#
# Arguments: compound record number, molecule and a reference to the
# compound's data field label/value hash. The ID is built according to the
# compound ID mode; an empty string is returned when the specified data
# field is missing.
#
sub SetupCmpdIDForOutputFiles {
    my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;
    my $Mode = $OptionsInfo{CompoundIDMode};

    if ($Mode =~ /^MolNameOrLabelPrefix$/i) {
        # Fall back to prefix plus record number for unnamed molecules...
        my $MolName = $Molecule->GetName();
        return $MolName if $MolName;
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($Mode =~ /^LabelPrefix$/i) {
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($Mode =~ /^DataField$/i) {
        my $FieldName = $OptionsInfo{CompoundID};
        return '' unless exists $DataFieldLabelAndValuesRef->{$FieldName};
        return $DataFieldLabelAndValuesRef->{$FieldName};
    }

    if ($Mode =~ /^MolName$/i) {
        my $MolName = $Molecule->GetName();
        return $MolName;
    }

    return '';
}
356
# Generate E-state indicies fingerprints for one molecule.
#
# Argument: molecule object; it is modified in place (largest component kept
# when requested, rings and aromaticity detected). Returns the fingerprints
# object, or undef when ring detection or fingerprints generation fails.
#
sub GenerateMoleculeFingerprints {
    my($Molecule) = @_;

    $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

    $Molecule->DetectRings() or return undef;

    $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
    $Molecule->DetectAromaticity();

    my $Fingerprints = Fingerprints::EStateIndiciesFingerprints->new(
        'Molecule' => $Molecule,
        'EStateAtomTypesSetToUse' => $OptionsInfo{EStateAtomTypesSetToUse},
        'ValuesPrecision' => $OptionsInfo{ValuesPrecision},
    );

    # Generate E-state indicies fingerprints and make sure generation succeeded...
    $Fingerprints->GenerateFingerprints();

    return $Fingerprints->IsFingerprintsGenerationSuccessful() ? $Fingerprints : undef;
}
384
# Retrieve information about SD files...
#
# Validates every entry of @SDFilesList and fills %SDFilesInfo with parallel
# per-file arrays: FileOkay (1 when the file is to be processed), OutFileRoot,
# SDOutFileNames, FPOutFileNames, TextOutFileNames, AllDataFieldsRef and
# CommonDataFieldsRef. A file is flagged as not okay, with a warning, when it
# doesn't exist, is not a SD file, lacks the data field needed for compound
# IDs, would be overwritten by its own SD output, or when an output file
# already exists and overwriting was not requested.
#
sub RetrieveSDFilesInfo {
    my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

    %SDFilesInfo = ();
    @{$SDFilesInfo{FileOkay}} = ();
    @{$SDFilesInfo{OutFileRoot}} = ();
    @{$SDFilesInfo{SDOutFileNames}} = ();
    @{$SDFilesInfo{FPOutFileNames}} = ();
    @{$SDFilesInfo{TextOutFileNames}} = ();
    @{$SDFilesInfo{AllDataFieldsRef}} = ();
    @{$SDFilesInfo{CommonDataFieldsRef}} = ();

    # The SD file needs to be inspected only for text output using data fields...
    $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
    $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

    FILELIST: for $Index (0 .. $#SDFilesList) {
        $SDFile = $SDFilesList[$Index];

        # Assume the worst until all checks have passed...
        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{OutFileRoot}[$Index] = '';
        $SDFilesInfo{SDOutFileNames}[$Index] = '';
        $SDFilesInfo{FPOutFileNames}[$Index] = '';
        $SDFilesInfo{TextOutFileNames}[$Index] = '';

        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sd sdf")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        if ($CheckDataField) {
            # Make sure data field exists in SD file; only the first compound is examined...
            my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

            # Three argument open: characters in the file name are never
            # interpreted as an open mode or a pipe...
            open(SDFILE, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";
            $CmpdString = ReadCmpdString(\*SDFILE);
            close SDFILE;
            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
            $SpecifiedDataField = $OptionsInfo{CompoundID};
            if (!exists $DataFieldValues{$SpecifiedDataField}) {
                warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
                next FILELIST;
            }
        }

        $AllDataFieldsRef = '';
        $CommonDataFieldsRef = '';
        if ($CollectDataFields) {
            my($CmpdCount);
            open(SDFILE, '<', $SDFile) or die "Error: Couldn't open $SDFile: $! \n";
            ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels(\*SDFILE);
            close SDFILE;
        }

        # Setup output file names...
        ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

        $TextOutFileExt = ($OptionsInfo{OutDelim} =~ /^tab$/i) ? "tsv" : "csv";
        $SDOutFileExt = $FileExt;
        $FPOutFileExt = "fpf";

        if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
            # A user supplied root is honored only for a single input file;
            # any extension given as part of the root is dropped...
            my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
            if ($RootFileName && $RootFileExt) {
                $FileName = $RootFileName;
            }
            else {
                $FileName = $OptionsInfo{OutFileRoot};
            }
            $OutFileRoot = $FileName;
        }
        else {
            $OutFileRoot = "${FileName}EStateIndiciesFP";
        }

        $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
        $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
        $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

        if ($OptionsInfo{SDOutput}) {
            # Quote the file name so that characters such as '.', '[' or '+'
            # are matched literally rather than as regex metacharacters...
            if ($SDFile =~ /\Q$NewSDFileName\E/i) {
                warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
                print "Specify a different name using \"-r --root\" option or use default name.\n";
                next FILELIST;
            }
        }

        if (!$OptionsInfo{OverwriteFiles}) {
            # Check SD, FP and text output files...
            if ($OptionsInfo{SDOutput}) {
                if (-e $NewSDFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{FPOutput}) {
                if (-e $NewFPFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{TextOutput}) {
                if (-e $NewTextFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
                    next FILELIST;
                }
            }
        }

        $SDFilesInfo{FileOkay}[$Index] = 1;

        $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
        $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
        $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
        $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

        $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
        $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
    }
}
517
# Process option values...
#
# Translates the raw command line values in %Options into the %OptionsInfo
# hash consulted by the rest of the script. Takes no arguments; dies when a
# companion option required by the selected mode is missing.
#
sub ProcessOptions {
    %OptionsInfo = ();

    $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

    $OptionsInfo{EStateAtomTypesSetToUse} = $Options{estateatomtypessettouse} ? $Options{estateatomtypessettouse} : 'ArbitrarySize';

    $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
    $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
    $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

    my(@SpecifiedDataFields);
    @SpecifiedDataFields = ();

    @{$OptionsInfo{SpecifiedDataFields}} = ();
    $OptionsInfo{CompoundID} = '';

    # NOTE(review): the compound ID prefix/data field is set up only in
    # CompoundID data fields mode; for other modes it stays empty even though
    # FP output and warnings still build compound IDs from it - confirm intent.
    if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
        if ($Options{compoundidmode} =~ /^DataField$/i) {
            if (!$Options{compoundid}) {
                die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
            }
            $OptionsInfo{CompoundID} = $Options{compoundid};
        }
        elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
            $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
        }
    }
    elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
        if (!$Options{datafields}) {
            die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
        }
        @SpecifiedDataFields = split /\,/, $Options{datafields};
        push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
    }

    $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'EStateIndiciesFingerprints';

    $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

    # Match case insensitively: option validation accepts this value in any
    # case, so a case sensitive test here would silently ignore the request...
    if ($Options{fingerprintslabelmode} =~ /^FingerprintsLabelWithIDs$/i) {
        if ($Options{estateatomtypessettouse} =~ /^FixedSize$/i) {
            # Append E-state atom types for non-hydrogen atoms to the fingerprints label...
            my($AtomType, @IDs);
            @IDs = ();
            for $AtomType (@{AtomTypes::EStateAtomTypes::GetAllPossibleEStateNonHydrogenAtomTypes()}) {
                push @IDs, "S${AtomType}";
            }
            $OptionsInfo{FingerprintsLabel} .= "; EStateAtomTypes: " . TextUtil::JoinWords(\@IDs, " ", 0);
        }
    }
    $OptionsInfo{FingerprintsLabelMode} = $Options{fingerprintslabelmode};

    $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{Output} = $Options{output};
    $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
    $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
    $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

    $OptionsInfo{OutDelim} = $Options{outdelim};
    $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
    $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

    # Precision for E-state indicies...
    $OptionsInfo{ValuesPrecision} = $Options{valuesprecision};

    # Setup default vector string format: values only for the fixed size atom
    # types set, IDs and values otherwise...
    my($VectorStringFormat);
    $VectorStringFormat = '';
    if ($Options{vectorstringformat}) {
        $VectorStringFormat = $Options{vectorstringformat};
    }
    else {
        $VectorStringFormat = ($Options{estateatomtypessettouse} =~ /^FixedSize$/) ? "ValuesString" : "IDsAndValuesString";
    }
    $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}
599
# Setup script usage: seed %Options with default values, retrieve command
# line arguments specified using various options and validate them. Dies
# with an explanatory message on the first invalid option value; changes
# the current directory when a working directory is specified.
sub SetupScriptUsage {

    # Default values, in effect unless overridden on the command line...
    %Options = (
        aromaticitymodel => 'MayaChemToolsAromaticityModel',
        compoundidmode => 'LabelPrefix',
        compoundidlabel => 'CompoundID',
        datafieldsmode => 'CompoundID',
        filter => 'Yes',
        estateatomtypessettouse => 'ArbitrarySize',
        fingerprintslabelmode => 'FingerprintsLabelOnly',
        keeplargestcomponent => 'Yes',
        output => 'text',
        outdelim => 'comma',
        quote => 'yes',
        valuesprecision => 3,
        vectorstringformat => '',
    );

    my @OptionSpecs = (
        "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s",
        "datafields=s", "datafieldsmode|d=s", "estateatomtypessettouse|e=s", "filter|f=s",
        "fingerprintslabelmode=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",
        "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s",
        "valuesprecision=s", "vectorstringformat|v=s", "workingdir|w=s",
    );
    GetOptions(\%Options, @OptionSpecs)
        or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

    if ($Options{workingdir}) {
        (-d $Options{workingdir})
            or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
        my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
        die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
    }

    $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
        or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

    $Options{datafieldsmode} =~ /^(All|Common|Specify|CompoundID)$/i
        or die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";

    # This value is matched case sensitively, unlike the other options...
    die "Error: The value specified, $Options{estateatomtypessettouse}, for option \"-e, --EStateAtomTypesSetToUse\" is not valid. Allowed values: ArbitrarySize or FixedSize\n"
        if ($Options{estateatomtypessettouse} && $Options{estateatomtypessettouse} !~ /^(ArbitrarySize|FixedSize)$/);

    $Options{filter} =~ /^(Yes|No)$/i
        or die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";

    $Options{fingerprintslabelmode} =~ /^(FingerprintsLabelOnly|FingerprintsLabelWithIDs)$/i
        or die "Error: The value specified, $Options{fingerprintslabelmode}, for option \"--FingerprintsLabelMode\" is not valid. Allowed values: FingerprintsLabelOnly or FingerprintsLabelWithIDs\n";

    $Options{keeplargestcomponent} =~ /^(Yes|No)$/i
        or die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";

    $Options{output} =~ /^(SD|FP|text|all)$/i
        or die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";

    $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
        or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

    $Options{quote} =~ /^(Yes|No)$/i
        or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

    # Fingerprints strings contain semicolons, so they can't go unquoted
    # into a semicolon delimited file...
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n"
        if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i);

    IsPositiveInteger($Options{valuesprecision})
        or die "Error: The value specified, $Options{valuesprecision}, for option \"--ValuesPrecision\" is not valid. Allowed values: > 0 \n";

    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n"
        if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i);
}
677
678 __END__
679
680 =head1 NAME
681
682 EStateIndiciesFingerprints.pl - Generate E-state indicies fingerprints for SD files
683
684 =head1 SYNOPSIS
685
686 EStateIndiciesFingerprints.pl SDFile(s)...
687
688 EStateIndiciesFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
689 [B<--CompoundID> I<DataFieldName or LabelPrefixString>]
690 [B<--CompoundIDLabel> I<text>] [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
691 [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">] [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>]
692 [B<-e, --EStateAtomTypesSetToUse> I<ArbitrarySize or FixedSize>] [B<-f, --Filter> I<Yes | No>]
693 [B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>] [B<--FingerprintsLabel> I<text>]
694 [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
695 [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>]
696 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<--ValuesPrecision> I<number>]
697 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
698 [B<-w, --WorkingDir> I<DirName>]
699
700 =head1 DESCRIPTION
701
702 Generate E-state indicies fingerprints [ Ref 75-78 ] for I<SDFile(s)> and create appropriate SD,
703 FP, or CSV/TSV text file(s) containing fingerprints bit-vector or vector strings corresponding to
704 molecular fingerprints.
705
706 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
707 and I<.sd>. All other file names are ignored. All the SD files in a current directory
708 can be specified either by I<*.sdf> or the current directory name.
709
710 E-state atom types are assigned to all non-hydrogen atoms in a molecule using module
711 AtomTypes::EStateAtomTypes.pm and E-state values are calculated using module
712 AtomicDescriptors::EStateValues.pm. Using E-state atom types and E-state values,
713 B<EStateIndiciesFingerprints>, constituting the sum of E-state values for each E-state atom type,
714 are generated.
715
716 Two types of E-state atom types set size are allowed:
717
718 ArbitrarySize - Corresponds to only E-state atom types detected
719 in molecule
720 FixedSize - Corresponds to fixed number of E-state atom types previously
721 defined
722
723 Module AtomTypes::EStateAtomTypes.pm, used to assign E-state atom types to
724 non-hydrogen atoms in the molecule, is able to assign atom types to any valid
725 atom group. However, for I<FixedSize> value of B<EStateAtomTypesSetToUse>, only a
726 fixed set of E-state atom types corresponding to specific atom groups [ Appendix III in
727 Ref 77 ] are used for fingerprints.
728
729 The fixed size E-state atom types set used during generation of fingerprints contains
730 the 87 E-state non-hydrogen atom types listed in the EStateAtomTypes.csv data file distributed with
731 MayaChemTools.
732
733 The combination of Type and EStateAtomTypesSetToUse allows generation of 2 different types of
734 E-state indicies fingerprints:
735
736 Type EStateAtomTypesSetToUse
737
738 EStateIndicies ArbitrarySize [ default fingerprints ]
739 EStateIndicies FixedSize
740
741 Example of I<SD> file containing E-state indicies fingerprints string data:
742
743 ... ...
744 ... ...
745 $$$$
746 ... ...
747 ... ...
748 ... ...
749 41 44 0 0 0 0 0 0 0 0999 V2000
750 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
751 ... ...
752 2 3 1 0 0 0 0
753 ... ...
754 M END
755 > <CmpdID>
756 Cmpd1
757
758 > <EStateIndiciesFingerprints>
759 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDsA
760 ndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssNH
761 SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3.02
762 4 -2.270
763
764 $$$$
765 ... ...
766 ... ...
767
768 Example of I<FP> file containing E-state indicies fingerprints string data:
769
770 #
771 # Package = MayaChemTools 7.4
772 # Release Date = Oct 21, 2010
773 #
774 # TimeStamp = Fri Mar 11 14:35:11 2011
775 #
776 # FingerprintsStringType = FingerprintsVector
777 #
778 # Description = EStateIndicies:ArbitrarySize
779 # VectorStringFormat = IDsAndValuesString
780 # VectorValuesType = NumericalValues
781 #
782 Cmpd1 11;SaaCH SaasC SaasN SdO SdssC...;24.778 4.387 1.993 25.023 -1...
783 Cmpd2 9;SdNH SdO SdssC SsCH3 SsNH...;7.418 22.984 -1.583 5.387 5.400...
784 ... ...
785 ... ..
786
787 Example of CSV I<Text> file containing E-state indicies fingerprints string data:
788
789 "CompoundID","EStateIndiciesFingerprints"
790 "Cmpd1","FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalVa
791 lues;IDsAndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssC
792 H2 SssNH SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0
793 .073 3.024 -2.270"
794 "Cmpd2","FingerprintsVector;EStateIndicies:ArbitrarySize;9;NumericalVal
795 ues;IDsAndValuesString;SdNH SdO SdssC SsCH3 SsNH2 SsOH SssCH2 SssNH Sss
796 sCH;7.418 22.984 -1.583 5.387 5.400 19.852 1.737 5.624 -3.319"
797 ... ...
798 ... ...
799
800 The current release of MayaChemTools generates the following types of E-state
801 fingerprints vector strings:
802
803 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
804 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
805 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
806 .024 -2.270
807
808 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
809 ValuesString;0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435
810 4.387 0 0 0 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 1
811 4.006 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
812 0 0 0 0 0 0 0 0 0 0 0 0 0 0
813
814 FingerprintsVector;EStateIndicies:FixedSize;87;OrderedNumericalValues;
815 IDsAndValuesString;SsLi SssBe SssssBem SsBH2 SssBH SsssB SssssBm SsCH3
816 SdCH2 SssCH2 StCH SdsCH SaaCH SsssCH SddC StsC SdssC SaasC SaaaC Sssss
817 C SsNH3p SsNH2 SssNH2p SdNH SssNH SaaNH StN SsssNHp SdsN SaaN SsssN Sd
818 0 0 0 0 0 0 0 3.975 0 -0.073 0 0 24.778 -2.270 0 0 -1.435 4.387 0 0 0
819 0 0 0 3.024 0 0 0 0 0 0 0 1.993 0 29.759 25.023 0 0 0 0 14.006 0 0 0 0
820 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
821
822 =head1 OPTIONS
823
824 =over 4
825
826 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
827
828 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
829 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
830 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
831 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
832
833 The supported aromaticity model names along with model specific control parameters
834 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
835 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
836 this file during class instantiation and makes it available to method B<DetectAromaticity>
837 for detecting aromaticity corresponding to a specific model.
838
839 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
840
841 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
842
843 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
844 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
845 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
846 look like Cmpd<Number>.
847
848 Examples for I<DataField> value of B<--CompoundIDMode>:
849
850 MolID
851 ExtReg
852
853 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
854
855 Compound
856
857 The value specified above generates compound IDs which correspond to Compound<Number>
858 instead of default value of Cmpd<Number>.
859
860 =item B<--CompoundIDLabel> I<text>
861
862 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value
863 of B<--DataFieldsMode> option. Default: I<CompoundID>.
864
865 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
866
867 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
868 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
869 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
870 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
871
872 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
873 Default: I<LabelPrefix>.
874
875 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
876 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
877 values are replaced with sequential compound IDs.
878
879 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
880
881 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
882
883 Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along
884 with generated fingerprints for I<text | all> values of B<--output> option.
885
886 This is only used for I<Specify> value of B<--DataFieldsMode> option.
887
888 Examples:
889
890 Extreg
891 MolID,CompoundName
892
893 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
894
895 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
896 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
897 data fields; transfer SD data fields common to all compounds; extract specified data fields;
898 generate a compound ID using molname line, a compound prefix, or a combination of both.
899 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
900
901 =item B<-e, --EStateAtomTypesSetToUse> I<ArbitrarySize | FixedSize>
902
903 E-state atom types set size to use during generation of E-state indicies fingerprints.
904 Possible values: I<ArbitrarySize | FixedSize>; Default value: I<ArbitrarySize>.
905
906 I<ArbitrarySize> corresponds to only E-state atom types detected in molecule; I<FixedSize>
907 corresponds to fixed number of previously defined E-state atom types.
908
909 For I<EStateIndicies>, a fingerprint vector string is generated. The vector string corresponding to
910 I<EStateIndicies> contains sum of E-state values for E-state atom types.
911
912 Module B<AtomTypes::EStateAtomTypes.pm> is used to assign E-state atom types to
913 non-hydrogen atoms in the molecule which is able to assign atom types to any valid
914 atom group. However, for I<FixedSize> value of B<EStateAtomTypesSetToUse>,
915 only a fixed set of E-state atom types corresponding to specific atom groups [ Appendix
916 III in Ref 77 ] are used for fingerprints.
917
918 The fixed size E-state atom type set used during generation of fingerprints contains
919 87 E-state non-hydrogen atom types in EStateAtomTypes.csv data file distributed with
920 MayaChemTools.
921
922 =item B<-f, --Filter> I<Yes | No>
923
924 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
925 Default value: I<Yes>.
926
927 By default, compound data is checked before calculating fingerprints and compounds containing
928 atom data corresponding to non-element symbols or no atom data are ignored.
929
930 =item B<--FingerprintsLabelMode> I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>
931
932 Specify how fingerprints label is generated in conjunction with B<--FingerprintsLabel> option value:
933 use fingerprints label generated only by B<--FingerprintsLabel> option value or append E-state
934 atom type value IDs to B<--FingerprintsLabel> option value.
935
936 Possible values: I<FingerprintsLabelOnly | FingerprintsLabelWithIDs>. Default value:
937 I<FingerprintsLabelOnly>.
938
939 This option is only used for I<FixedSize> value of B<-e, --EStateAtomTypesSetToUse> option during
940 generation of I<EStateIndicies> E-state fingerprints.
941
942 E-state atom type IDs appended to B<--FingerprintsLabel> value during I<FingerprintsLabelWithIDs>
943 values of B<--FingerprintsLabelMode> correspond to fixed number of previously defined E-state
944 atom types.
945
946 =item B<--FingerprintsLabel> I<text>
947
948 SD data label or text file column label to use for fingerprints string in output SD or
949 CSV/TSV text file(s) specified by B<--output>. Default value: I<EStateIndiciesFingerprints>.
950
951 =item B<-h, --help>
952
953 Print this help message.
954
955 =item B<-k, --KeepLargestComponent> I<Yes | No>
956
957 Generate fingerprints for only the largest component in molecule. Possible values:
958 I<Yes or No>. Default value: I<Yes>.
959
960 For molecules containing multiple connected components, fingerprints can be generated
961 in two different ways: use all connected components or just the largest connected
962 component. By default, all atoms except for the largest connected component are
963 deleted before generation of fingerprints.
964
965 =item B<--OutDelim> I<comma | tab | semicolon>
966
967 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
968 Default value: I<comma>.
969
970 =item B<--output> I<SD | FP | text | all>
971
972 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
973
974 =item B<-o, --overwrite>
975
976 Overwrite existing files.
977
978 =item B<-q, --quote> I<Yes | No>
979
980 Put quote around column values in output CSV/TSV text file(s). Possible values:
981 I<Yes or No>. Default value: I<Yes>.
982
983 =item B<-r, --root> I<RootName>
984
985 New file name is generated using the root: <Root>.<Ext>. Default for new file
986 names: <SDFileName><EStateIndiciesFP>.<Ext>. The file type determines <Ext> value.
987 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
988 delimited text files, respectively. This option is ignored for multiple input files.
989
990 =item B<--ValuesPrecision> I<number>
991
992 Precision of values for E-state indicies option. Default value: up to I<3> decimal places.
993 Valid values: positive integers.
994
995 =item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
996
997 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
998 B<--output> used for I<EStateIndicies>. Possible values: I<ValuesString, IDsAndValuesString,
999 IDsAndValuesPairsString, ValuesAndIDsString, ValuesAndIDsPairsString>.
1000
1001 Default value during I<ArbitrarySize> value of B<-e, --EStateAtomTypesSetToUse>
1002 option: I<IDsAndValuesString>. Default value during I<FixedSize> value of
1003 B<-e, --EStateAtomTypesSetToUse> option: I<ValuesString>.
1004
1005 Examples:
1006
1007 FingerprintsVector;EStateIndicies:ArbitrarySize;11;NumericalValues;IDs
1008 AndValuesString;SaaCH SaasC SaasN SdO SdssC SsCH3 SsF SsOH SssCH2 SssN
1009 H SsssCH;24.778 4.387 1.993 25.023 -1.435 3.975 14.006 29.759 -0.073 3
1010 .024 -2.270
1011
1012 =item B<-w, --WorkingDir> I<DirName>
1013
1014 Location of working directory. Default: current directory.
1015
1016 =back
1017
1018 =head1 EXAMPLES
1019
1020 To generate E-state fingerprints of arbitrary size in vector string format and create a
1021 SampleESFP.csv file containing sequential compound IDs along with fingerprints
1022 vector strings data, type:
1023
1024 % EStateIndiciesFingerprints.pl -r SampleESFP -o Sample.sdf
1025
1026 To generate E-state fingerprints of fixed size in vector string format and create a
1027 SampleESFP.csv file containing sequential compound IDs along with fingerprints
1028 vector strings data, type:
1029
1030 % EStateIndiciesFingerprints.pl -e FixedSize -r SampleESFP
1031 -o Sample.sdf
1032
1033 To generate E-state fingerprints of fixed size in vector string with IDsAndValues
1034 format and create a SampleESFP.csv file containing sequential compound IDs
1035 along with fingerprints vector strings data, type:
1036
1037 % EStateIndiciesFingerprints.pl -e FixedSize -v IDsAndValuesString
1038 -r SampleESFP -o Sample.sdf
1039
1040 To generate E-state fingerprints of fixed size in vector string format
1041 and create a SampleESFP.csv file containing compound ID from molecule
1042 name line along with fingerprints vector strings data, type:
1043
1044 % EStateIndiciesFingerprints.pl -e FixedSize
1045 --DataFieldsMode CompoundID --CompoundIDMode MolName
1046 -r SampleESFP -o Sample.sdf
1047
1048 To generate E-state fingerprints of fixed size in vector string format
1049 and create a SampleESFP.csv file containing compound IDs using specified
1050 data field along with fingerprints vector strings data, type:
1051
1052 % EStateIndiciesFingerprints.pl -e FixedSize
1053 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID
1054 Mol_ID -r SampleESFP -o Sample.sdf
1055
1056 To generate E-state fingerprints of fixed size in vector string format
1057 and create a SampleESFP.csv file containing compound ID using combination
1058 of molecule name line and an explicit compound prefix along with fingerprints vector
1059 strings data, type:
1060
1061 % EStateIndiciesFingerprints.pl -e FixedSize
1062 --DataFieldsMode CompoundID --CompoundIDMode MolNameOrLabelPrefix
1063 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleESFP -o Sample.sdf
1064
1065 To generate E-state fingerprints of fixed size in vector string format
1066 and create a SampleESFP.csv file containing specific data fields columns along
1067 with fingerprints vector strings data, type:
1068
1069 % EStateIndiciesFingerprints.pl -e FixedSize
1070 --DataFieldsMode Specify --DataFields Mol_ID -r SampleESFP
1071 -o Sample.sdf
1072
1073 To generate E-state fingerprints of fixed size in vector string format
1074 and create a SampleESFP.csv file containing common data fields columns along
1075 with fingerprints vector strings data, type:
1076
1077 % EStateIndiciesFingerprints.pl -e FixedSize
1078 --DataFieldsMode Common -r SampleESFP -o Sample.sdf
1079
1080 To generate E-state fingerprints of fixed size in vector string format and create
1081 SampleESFP.sdf, SampleESFP.fpf, and SampleESFP.csv files containing all data
1082 fields columns in CSV file along with fingerprints vector strings data, type:
1083
1084 % EStateIndiciesFingerprints.pl -e FixedSize
1085 --DataFieldsMode All --output all -r SampleESFP -o Sample.sdf
1086
1087 =head1 AUTHOR
1088
1089 Manish Sud <msud@san.rr.com>
1090
1091 =head1 SEE ALSO
1092
1093 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1094 ExtendedConnectivityFingerprints.pl, MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
1095 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1096 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1097
1098 =head1 COPYRIGHT
1099
1100 Copyright (C) 2015 Manish Sud. All rights reserved.
1101
1102 This file is part of MayaChemTools.
1103
1104 MayaChemTools is free software; you can redistribute it and/or modify it under
1105 the terms of the GNU Lesser General Public License as published by the Free
1106 Software Foundation; either version 3 of the License, or (at your option)
1107 any later version.
1108
1109 =cut