comparison mayachemtools/bin/ExtendedConnectivityFingerprints.pl @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: ExtendedConnectivityFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.37 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use AtomTypes::AtomicInvariantsAtomTypes;
43 use AtomTypes::FunctionalClassAtomTypes;
44 use Fingerprints::ExtendedConnectivityFingerprints;
45
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Flush STDOUT after every write so progress messages appear immediately.
$| = 1;

# Announce startup and start timing the run.
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; print the usage text and stop when help is requested
# or no input file is named.
SetupScriptUsage();
if ($Options{help} || !@ARGV) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Turn raw option values into the settings used by the rest of the script.
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Validate input files and work out output file names.
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate fingerprints for every input file that passed validation.
print "\nProcessing SD files...\n" if @SDFilesList > 1;
for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateExtendedConnectivityFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time.
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
91
92 ###############################################################################
93
# Generate fingerprints for one input SD file...
#
# $FileIndex is the position of the file in @SDFilesList. Each compound is read,
# optionally filtered, fingerprinted and written to whichever output files are
# enabled; a summary of processed and ignored compounds is printed at the end.
#
sub GenerateExtendedConnectivityFingerprints {
  my($FileIndex) = @_;

  my $InputSDFile = $SDFilesList[$FileIndex];

  # Open whichever output writers are enabled; disabled ones come back undefined...
  my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

  my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
  $Reader->Open();

  my $TotalCount = 0;
  my $IgnoredCount = 0;

  while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Skip compounds rejected by the optional filter...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $IgnoredCount++;
      next;
    }

    my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
    unless ($Fingerprints) {
      $IgnoredCount++;
      ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
      next;
    }

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
  }
  $Reader->Close();

  # Close the writers that were actually opened: SD, text, then FP...
  for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
    $Writer->Close() if $Writer;
  }

  WriteFingerprintsGenerationSummaryStatistics($TotalCount, $IgnoredCount);
}
146
# Warn about a compound being ignored due to problems in fingerprints generation...
#
# $Mode selects the warning text: ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed. Any other value gets the same text as
# FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $DataFieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);

  my $Message;
  if ($Mode =~ /^ContainsNonElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers FingerprintsGenerationFailed as well as any unrecognized mode...
    $Message = "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Fingerprints generation didn't succeed...\n\n";
  }
  warn $Message;
}
174
# Check and filter compounds...
#
# Returns 1, after issuing a warning, when the compound contains non-elemental
# atom data or no elemental atom data at all; returns 0 otherwise.
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data takes precedence over a missing-elements condition...
  my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
              : !$NumOfElements   ? 'ContainsNoElementalData'
              :                     '';

  return 0 unless $Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}
195
# Print summary statistics for fingerprints generation...
#
# Reports the total number of compounds, how many were processed successfully
# (total minus ignored) and how many were ignored.
#
sub WriteFingerprintsGenerationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount) = @_;

  my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n",
    "Number of compounds processed successfully during fingerprints generation: $ProcessedCmpdCount\n",
    "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
208
# Open output files...
#
# Creates and opens a fingerprints writer for each enabled output type (SD, FP,
# text) of the input file at $FileIndex. Returns the list (SD writer, text writer,
# FP writer); entries for disabled output types are undefined.
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($SDWriter, $TextWriter, $FPWriter);

  # Parameters shared by all writers depend on vector versus bit-vector output...
  my %CommonParams = ();
  if ($OptionsInfo{Mode} =~ /^(ExtendedConnectivity|ExtendedConnectivityCount)$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsVectorString', 'VectorStringFormat' => $OptionsInfo{VectorStringFormat});
  }
  elsif ($OptionsInfo{Mode} =~ /^ExtendedConnectivityBits$/i) {
    %CommonParams = ('Mode' => 'Write', 'Overwrite' => $OptionsInfo{OverwriteFiles}, 'FingerprintsStringMode' => 'FingerprintsBitVectorString', 'BitStringFormat' => $OptionsInfo{BitStringFormat}, 'BitsOrder' => $OptionsInfo{BitsOrder});
  }

  if ($OptionsInfo{SDOutput}) {
    my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $SDOutFile...\n";
    $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
    $SDWriter->Open();
  }

  if ($OptionsInfo{FPOutput}) {
    my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
    print "Generating FP file $FPOutFile...\n";
    $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonParams);
    $FPWriter->Open();
  }

  if ($OptionsInfo{TextOutput}) {
    my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

    print "Generating text file $TextOutFile...\n";
    $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
    $TextWriter->Open();
  }

  return ($SDWriter, $TextWriter, $FPWriter);
}
254
# Write fingerprints and other data to appropriate output files...
#
# Any of the three writer arguments may be undefined, in which case that output
# type is skipped.
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $ExtendedConnectivityFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

  # Data field values are needed only by the text and FP writers...
  my $DataFieldsRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

  if ($NewFPSDFileIO) {
    my $InputCmpdString = $Molecule->GetInputMoleculeString();
    $NewFPSDFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $InputCmpdString);
  }

  if ($NewFPTextFileIO) {
    my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPTextFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $RowValuesRef);
  }

  if ($NewFPFileIO) {
    my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldsRef);
    $NewFPFileIO->WriteFingerprints($ExtendedConnectivityFingerprints, $ID);
  }
}
287
# Generate appropriate column labels for FPText output file...
#
# Returns a reference to the list of data column labels selected by the
# DataFieldsMode option, followed by the fingerprints label.
#
sub SetupFPTextFileCoulmnLabels {
  my($FileIndex) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};
  my @Labels = ();

  if ($FieldsMode =~ /^All$/i) {
    @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Labels = @{$OptionsInfo{SpecifiedDataFields}};
  }
  elsif ($FieldsMode =~ /^CompoundID$/i) {
    @Labels = ($OptionsInfo{CompoundIDLabel});
  }

  # Fingerprints label is always the last column...
  return [@Labels, $OptionsInfo{FingerprintsLabel}];
}
312
# Generate column values for FPText output file...
#
# Returns a reference to the list of data column values for one compound. In
# CompoundID mode it holds just the compound ID; in the other modes it holds the
# value of each selected data field, with an empty string for fields the compound
# doesn't have.
#
sub SetupFPTextFileCoulmnValues {
  my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $FieldsMode = $OptionsInfo{DataFieldsMode};

  # Value of a data field for this compound, or '' when it's missing...
  my $ValueOf = sub {
    my($Label) = @_;
    return exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
  };

  my @Values = ();
  if ($FieldsMode =~ /^CompoundID$/i) {
    push @Values, SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
  }
  elsif ($FieldsMode =~ /^All$/i) {
    @Values = map { $ValueOf->($_) } @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Common$/i) {
    @Values = map { $ValueOf->($_) } @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
  }
  elsif ($FieldsMode =~ /^Specify$/i) {
    @Values = map { $ValueOf->($_) } @{$OptionsInfo{SpecifiedDataFields}};
  }

  return \@Values;
}
335
# Generate compound ID for FP and FPText output files...
#
# The ID is chosen according to the CompoundIDMode option:
#   MolNameOrLabelPrefix - molecule name, or CompoundID prefix plus record number
#                          when the name is empty
#   LabelPrefix          - CompoundID prefix plus record number
#   DataField            - value of the data field named by CompoundID, or ''
#   MolName              - molecule name
# Any other mode yields an empty string.
#
sub SetupCmpdIDForOutputFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    my $Name = $Molecule->GetName();
    return $Name if $Name;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    if (exists $DataFieldLabelAndValuesRef->{$FieldLabel}) {
      return $DataFieldLabelAndValuesRef->{$FieldLabel};
    }
    return '';
  }

  if ($IDMode =~ /^MolName$/i) {
    # Fetch the name in scalar context before returning it...
    my $Name = $Molecule->GetName();
    return $Name;
  }

  return '';
}
361
# Generate fingerprints for molecule...
#
# Prepares the molecule (optional largest component selection, ring and
# aromaticity detection), builds the fingerprints object for the selected mode
# and generates the fingerprints. Returns the fingerprints object, or undef when
# ring detection or fingerprints generation fails.
#
sub GenerateMoleculeFingerprints {
  my($Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  # Constructor parameters common to all modes, kept in their original order...
  my @FingerprintsParams = ('Type' => $OptionsInfo{Mode}, 'Molecule' => $Molecule, 'NeighborhoodRadius' => $OptionsInfo{NeighborhoodRadius}, 'AtomIdentifierType' => $OptionsInfo{AtomIdentifierType});

  if ($OptionsInfo{Mode} =~ /^(ExtendedConnectivity|ExtendedConnectivityCount)$/i ) {
    # No additional parameters...
  }
  elsif ($OptionsInfo{Mode} =~ /^ExtendedConnectivityBits$/i) {
    push @FingerprintsParams, ('Size' => $OptionsInfo{Size}, 'UsePerlCoreRandom' => $OptionsInfo{UsePerlCoreRandom});
  }
  else {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: ExtendedConnectivity, ExtendedConnectivityCount or ExtendedConnectivityBits\n";
  }

  my $Fingerprints = Fingerprints::ExtendedConnectivityFingerprints->new(@FingerprintsParams);
  SetAtomIdentifierTypeValuesToUse($Fingerprints);

  $Fingerprints->GenerateFingerprints();

  # Hand back the object only when generation succeeded...
  return undef unless $Fingerprints->IsFingerprintsGenerationSuccessful();

  return $Fingerprints;
}
399
# Set atom identifier type values to use for generating fingerprints...
#
# Passes the atomic invariants or functional classes selected on the command line
# to the fingerprints object. The other supported atom identifier types need no
# additional setup; unsupported types are a fatal error.
#
sub SetAtomIdentifierTypeValuesToUse {
  my($ExtendedConnectivityFingerprints) = @_;

  my $IdentifierType = $OptionsInfo{AtomIdentifierType};

  if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    $ExtendedConnectivityFingerprints->SetAtomicInvariantsToUse(\@{$OptionsInfo{AtomicInvariantsToUse}});
    return;
  }

  if ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    $ExtendedConnectivityFingerprints->SetFunctionalClassesToUse(\@{$OptionsInfo{FunctionalClassesToUse}});
    return;
  }

  # Nothing to do for these for now...
  return if $IdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i;

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
418
# Retrieve information about SD files...
#
# Validates each file in @SDFilesList and fills in %SDFilesInfo with one entry per
# input file under these keys:
#
#   FileOkay            - 1 when the file is to be processed; 0 otherwise
#   OutFileRoot         - root of output file names
#   SDOutFileNames      - SD output file name
#   FPOutFileNames      - FP output file name
#   TextOutFileNames    - CSV/TSV output file name
#   AllDataFieldsRef    - all data field labels; collected only for text output in
#                         All or Common data fields mode, '' otherwise
#   CommonDataFieldsRef - common data field labels; collected under the same
#                         conditions, '' otherwise
#
# A file is skipped with a warning when it doesn't exist, isn't a SD file, lacks
# the data field needed for compound IDs, would be overwritten by its own SD
# output, or when an output file already exists and overwriting is off. Dies when
# an input file can't be opened.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{FPOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Mark the file as not okay until every check below has passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{FPOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in the first compound of SD file...
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      # Three-argument open with a lexical handle: the file name is never
      # interpreted as an open mode or a command...
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);

      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;
    $FPOutFileExt = "fpf";

    # A user supplied root is honored only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}ExtendedConnectivityFP";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Match the output name literally: \Q...\E keeps characters such as '.',
      # '+' or '(' in file names from being treated as regex syntax...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD, FP and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{FPOutput}) {
        if (-e $NewFPFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
551
# Process option values...
#
# Copies the command line values in %Options into %OptionsInfo, converting Yes/No
# values into 1/0 flags and filling in defaults that depend on other options.
# Dies with an error message when a required companion option is missing or the
# fingerprints size is invalid.
#
sub ProcessOptions {
  %OptionsInfo = ();

  ProcessAtomIdentifierTypeOptions();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  $OptionsInfo{BitsOrder} = $Options{bitsorder};
  $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

  $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
  $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
  $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

  my(@SpecifiedDataFields);
  @SpecifiedDataFields = ();

  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      if (!$Options{compoundid}) {
        die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      }
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      # Default label prefix for generated compound IDs...
      $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    if (!$Options{datafields}) {
      die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    }
    @SpecifiedDataFields = split /\,/, $Options{datafields};
    push @{$OptionsInfo{SpecifiedDataFields}}, @SpecifiedDataFields;
  }

  $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'ExtendedConnectivityFingerprints';

  $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{NeighborhoodRadius} = $Options{neighborhoodradius};

  $OptionsInfo{UsePerlCoreRandom} = ($Options{useperlcorerandom} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
  $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

  $OptionsInfo{OutDelim} = $Options{outdelim};
  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

  # Size must be a power of 2 between the minimum and maximum sizes...
  my($Size, $MinSize, $MaxSize);
  $MinSize = 32;
  $MaxSize = 2**32;
  $Size = $Options{size};
  if (!(IsPositiveInteger($Size) && $Size >= $MinSize && $Size <= $MaxSize && IsNumberPowerOfNumber($Size, 2))) {
    die "Error: Invalid size value, $Size, for \"-s, --size\" option. Allowed values: power of 2, >= minimum size of $MinSize, and <= maximum size of $MaxSize.\n";
  }
  $OptionsInfo{Size} = $Size;

  # Setup default vector string format...
  #
  # The mode value is validated and dispatched case-insensitively everywhere else
  # in this script, so the default has to be picked case-insensitively as well;
  # otherwise a value such as "extendedconnectivity" would silently get the
  # default meant for the other modes.
  #
  my($VectorStringFormat);
  $VectorStringFormat = '';
  if ($Options{vectorstringformat}) {
    $VectorStringFormat = $Options{vectorstringformat};
  }
  else {
    $VectorStringFormat = ($Options{mode} =~ /^ExtendedConnectivity$/i) ? "ValuesString" : "IDsAndValuesString";
  }
  $OptionsInfo{VectorStringFormat} = $VectorStringFormat;
}
636
# Process atom identifier type and related options...
#
# Records the atom identifier type in %OptionsInfo and processes the companion
# option for types that have one. Unsupported types are a fatal error.
#
sub ProcessAtomIdentifierTypeOptions {
  my $IdentifierType = $Options{atomidentifiertype};

  $OptionsInfo{AtomIdentifierType} = $IdentifierType;

  if ($IdentifierType =~ /^AtomicInvariantsAtomTypes$/i) {
    ProcessAtomicInvariantsToUseOption();
    return;
  }

  if ($IdentifierType =~ /^FunctionalClassAtomTypes$/i) {
    ProcessFunctionalClassesToUse();
    return;
  }

  # Nothing to do for these for now...
  return if $IdentifierType =~ /^(DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i;

  die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
}
656
# Process specified atomic invariants to use...
#
# Splits the comma delimited value of the --AtomicInvariantsToUse option, checks
# each entry and stores the list in $OptionsInfo{AtomicInvariantsToUse}. Dies when
# the value is empty, an entry is not an available atomic invariant, or neither AS
# nor AtomSymbol is in the list.
#
# Error messages end with a newline, like the rest of this script, so that Perl
# doesn't append "at FILE line N" to them.
#
sub ProcessAtomicInvariantsToUseOption {
  my($AtomicInvariant, $AtomSymbolSpecified, @AtomicInvariantsWords);

  @{$OptionsInfo{AtomicInvariantsToUse}} = ();
  if (IsEmpty($Options{atomicinvariantstouse})) {
    die "Error: Atomic invariants value specified using \"--AtomicInvariantsToUse\" option is empty\n";
  }
  $AtomSymbolSpecified = 0;
  @AtomicInvariantsWords = split /\,/, $Options{atomicinvariantstouse};
  for $AtomicInvariant (@AtomicInvariantsWords) {
    if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($AtomicInvariant)) {
      die "Error: Atomic invariant specified, $AtomicInvariant, using \"--AtomicInvariantsToUse\" option is not valid...\n";
    }
    if ($AtomicInvariant =~ /^(AS|AtomSymbol)$/i) {
      $AtomSymbolSpecified = 1;
    }
    push @{$OptionsInfo{AtomicInvariantsToUse}}, $AtomicInvariant;
  }
  if (!$AtomSymbolSpecified) {
    die "Error: Atomic invariant, AS or AtomSymbol, must be specified as using \"--AtomicInvariantsToUse\" option...\n";
  }
}
681
# Process specified functional classes to use...
#
# Splits the comma delimited value of the --FunctionalClassesToUse option, checks
# each entry and stores the list in $OptionsInfo{FunctionalClassesToUse}. Dies
# when the value is empty or an entry is not an available functional class.
#
# Error messages end with a newline, like the rest of this script, so that Perl
# doesn't append "at FILE line N" to them.
#
sub ProcessFunctionalClassesToUse {
  my($FunctionalClass, @FunctionalClassesToUseWords);

  @{$OptionsInfo{FunctionalClassesToUse}} = ();
  if (IsEmpty($Options{functionalclassestouse})) {
    die "Error: Functional classes value specified using \"--FunctionalClassesToUse\" option is empty\n";
  }
  @FunctionalClassesToUseWords = split /\,/, $Options{functionalclassestouse};
  for $FunctionalClass (@FunctionalClassesToUseWords) {
    if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($FunctionalClass)) {
      die "Error: Functional class specified, $FunctionalClass, using \"--FunctionalClassesToUse\" option is not valid...\n";
    }
    push @{$OptionsInfo{FunctionalClassesToUse}}, $FunctionalClass;
  }
}
699
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with default values, overlays the values given on the command
# line, changes to the working directory when one is specified and validates each
# option value. Dies with an error message on the first invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{aromaticitymodel} = 'MayaChemToolsAromaticityModel';

  $Options{atomidentifiertype} = 'AtomicInvariantsAtomTypes';
  $Options{atomicinvariantstouse} = 'AS,X,BO,H,FC,MN';
  $Options{functionalclassestouse} = 'HBD,HBA,PI,NI,Ar,Hal';

  $Options{bitsorder} = 'Ascending';
  $Options{bitstringformat} = 'HexadecimalString';

  $Options{compoundidmode} = 'LabelPrefix';
  $Options{compoundidlabel} = 'CompoundID';
  $Options{datafieldsmode} = 'CompoundID';

  $Options{filter} = 'Yes';

  $Options{keeplargestcomponent} = 'Yes';

  $Options{mode} = 'ExtendedConnectivity';

  $Options{neighborhoodradius} = 2;

  $Options{useperlcorerandom} = 'yes';

  $Options{output} = 'text';
  $Options{outdelim} = 'comma';
  $Options{quote} = 'yes';

  $Options{size} = 1024;

  # Empty means the default is picked later, based on the mode...
  $Options{vectorstringformat} = '';

  if (!GetOptions(\%Options, "aromaticitymodel=s", "atomidentifiertype|a=s", "atomicinvariantstouse=s", "functionalclassestouse=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s", "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s", "mode|m=s", "neighborhoodradius|n=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s", "size|s=i", "useperlcorerandom=s", "vectorstringformat|v=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if (!Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }
  if ($Options{atomidentifiertype} !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
    die "Error: The value specified, $Options{atomidentifiertype}, for option \"-a, --AtomIdentifierType\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes\n";
  }
  if ($Options{bitsorder} !~ /^(Ascending|Descending)$/i) {
    die "Error: The value specified, $Options{bitsorder}, for option \"--BitsOrder\" is not valid. Allowed values: Ascending or Descending\n";
  }
  if ($Options{bitstringformat} !~ /^(BinaryString|HexadecimalString)$/i) {
    die "Error: The value specified, $Options{bitstringformat}, for option \"-b, --bitstringformat\" is not valid. Allowed values: BinaryString or HexadecimalString\n";
  }
  if ($Options{compoundidmode} !~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i) {
    die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";
  }
  if ($Options{datafieldsmode} !~ /^(All|Common|Specify|CompoundID)$/i) {
    die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";
  }
  if ($Options{filter} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{keeplargestcomponent} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{mode} !~ /^(ExtendedConnectivity|ExtendedConnectivityCount|ExtendedConnectivityBits)$/i) {
    # The allowed values listed here must match the pattern above...
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: ExtendedConnectivity, ExtendedConnectivityCount, or ExtendedConnectivityBits\n";
  }
  if (!(IsInteger($Options{neighborhoodradius}) && ($Options{neighborhoodradius} >= 0))) {
    die "Error: The value specified, $Options{neighborhoodradius}, for option \"-n, --NeighborhoodRadius\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{output} !~ /^(SD|FP|text|all)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, FP, text, or all\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";
  }
  if (!IsPositiveInteger($Options{size})) {
    die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: > 0 \n";
  }
  # Unquoted output can't be combined with the semicolon delimiter...
  if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
  }
  if ($Options{useperlcorerandom} !~ /^(Yes|No)$/i) {
    die "Error: The value specified, $Options{useperlcorerandom}, for option \"--UsePerlCoreRandom\" is not valid. Allowed values: Yes or No\n";
  }
  if ($Options{vectorstringformat} && $Options{vectorstringformat} !~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
    die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
  }
}
799
800 __END__
801
802 =head1 NAME
803
804 ExtendedConnectivityFingerprints.pl - Generate extended connectivity fingerprints for SD files
805
806 =head1 SYNOPSIS
807
808 ExtendedConnectivityFingerprints.pl SDFile(s)...
809
810 ExtendedConnectivityFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
811 [B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes>]
812 [B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">]
813 [B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">]
814 [B<--BitsOrder> I<Ascending | Descending>] [B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>]
815 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
816 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>] [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">]
817 [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>] [B<-f, --Filter> I<Yes | No>]
818 [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
819 [B<-m, --mode> I<ExtendedConnectivity | ExtendedConnectivityCount | ExtendedConnectivityBits>]
820 [B<-n, --NeighborhoodRadius> I<number>] [B<--OutDelim> I<comma | tab | semicolon>] [B<--output> I<SD | FP | text | all>]
821 [B<-o, --overwrite>] [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<-s, --size> I<number>]
822 [B<--UsePerlCoreRandom> I<Yes | No>]
823 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
824 [B<-w, --WorkingDir> dirname] SDFile(s)...
825
826 =head1 DESCRIPTION
827
828 Generate extended connectivity fingerprints [ Ref 48, Ref 52 ] for I<SDFile(s)> and create appropriate
829 SD, FP or CSV/TSV text file(s) containing fingerprints vector strings corresponding to molecular fingerprints.
830
831 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
832 and I<.sd>. All other file names are ignored. All the SD files in the current directory
833 can be specified either by I<*.sdf> or the current directory name.
834
835 The current release of MayaChemTools supports generation of extended connectivity fingerprints
836 corresponding to the following B<-a, --AtomIdentifierType> values:
837
838 AtomicInvariantsAtomTypes, DREIDINGAtomTypes, EStateAtomTypes,
839 FunctionalClassAtomTypes, MMFF94AtomTypes, SLogPAtomTypes,
840 SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes
841
842 Based on values specified for B<-a, --AtomIdentifierType>, B<--AtomicInvariantsToUse>
843 and B<--FunctionalClassesToUse>, initial atom types are assigned to all non-hydrogen atoms in
844 a molecule and these atom types strings are converted into initial atom identifier integers using
845 B<TextUtil::HashCode> function. The duplicate atom identifiers are removed.
846
847 For B<-n, --NeighborhoodRadius> value of I<0>, the initial set of unique atom identifiers comprises
848 the molecule fingerprints. Otherwise, atom neighborhoods are generated for each non-hydrogen
849 atom up to specified B<-n, --NeighborhoodRadius> value. For each non-hydrogen central atom
850 at a specific radius, its neighbors at next radius level along with their bond orders and previously
851 calculated atom identifiers are collected which in turn are used to generate a new integer
852 atom identifier; the bond orders and atom identifier pairs list is first sorted by bond order
853 followed by atom identifiers to make these values graph invariant.
854
855 After integer atom identifiers have been generated for all non-hydrogen atoms at all specified
856 neighborhood radii, the duplicate integer atom identifiers corresponding to same hash code
857 value generated using B<TextUtil::HashCode> are tracked by keeping the atom identifiers at
858 lower radius. Additionally, all structurally duplicate integer atom identifiers at each specified
859 radius are also tracked by identifying equivalent atoms and bonds corresponding to substructures
860 used for generating atom identifier and keeping integer atom identifier with lowest value.
861
862 For I<ExtendedConnectivity> value of fingerprints B<-m, --mode>, the duplicate identifiers are
863 removed from the list and the unique atom identifiers constitute the extended connectivity
864 fingerprints of a molecule.
865
866 For I<ExtendedConnectivityCount> value of fingerprints B<-m, --mode>, the occurrences of each
867 unique atom identifier are counted and the unique atom identifiers along with their
868 count constitute the extended connectivity fingerprints of a molecule.
869
870 For I<ExtendedConnectivityBits> value of fingerprints B<-m, --mode>, the unique atom identifiers
871 are used as a random number seed to generate a random integer value between 0 and B<--Size> which
872 in turn is used to set corresponding bits in the fingerprint bit-vector string.
873
874 Example of I<SD> file containing extended connectivity fingerprints string data:
875
876 ... ...
877 ... ...
878 $$$$
879 ... ...
880 ... ...
881 ... ...
882 41 44 0 0 0 0 0 0 0 0999 V2000
883 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
884 ... ...
885 2 3 1 0 0 0 0
886 ... ...
887 M END
888 > <CmpdID>
889 Cmpd1
890
891 > <ExtendedConnectivityFingerprints>
892 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radiu
893 s2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391 66
894 6191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414087
895 99 49532520 64643108 79385615 96062769 273726379 564565671 855141035 90
896 6706094 988546669 1018231313 1032696425 1197507444 1331250018 133853...
897
898 $$$$
899 ... ...
900 ... ...
901
902 Example of I<FP> file containing extended connectivity fingerprints string data:
903
904 #
905 # Package = MayaChemTools 7.4
906 # Release Date = Oct 21, 2010
907 #
908 # TimeStamp = Fri Mar 11 14:43:57 2011
909 #
910 # FingerprintsStringType = FingerprintsVector
911 #
912 # Description = ExtendedConnectivity:AtomicInvariantsAtomTypes:Radius2
913 # VectorStringFormat = ValuesString
914 # VectorValuesType = AlphaNumericalValues
915 #
916 Cmpd1 60;73555770 333564680 352413391 666191900 1001270906 137167432...
917 Cmpd2 41;73555770 333564680 666191900 1142173602 1363635752 14814699...
918 ... ...
919 ... ..
920
921 Example of CSV I<Text> file containing extended connectivity fingerprints string data:
922
923 "CompoundID","ExtendedConnectivityFingerprints"
924 "Cmpd1","FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTy
925 pes:Radius2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352
926 413391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
927 2141408799 49532520 64643108 79385615 96062769 273726379 564565671 8551
928 41035 906706094 988546669 1018231313 1032696425 1197507444 13312500..."
929 ... ...
930 ... ...
931
932 The current release of MayaChemTools generates the following types of extended connectivity
933 fingerprints vector strings:
934
935 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
936 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
937 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
938 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
939 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
940 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
941
942 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
943 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
944 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
945 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
946 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
947 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
948
949 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
950 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
951 0000000001010000000110000011000000000000100000000000000000000000100001
952 1000000110000000000000000000000000010011000000000000000000000000010000
953 0000000000000000000000000010000000000000000001000000000000000000000000
954 0000000000010000100001000000000000101000000000000000100000000000000...
955
956 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
957 es:Radius2;1024;HexadecimalString;Ascending;000000010050c0600800000803
958 0300000091000004000000020000100000000124008200020000000040020000000000
959 2080000000820040010020000000008040000000000080001000000000400000000000
960 4040000090000061010000000800200000000000001400000000020080000000000020
961 00008020200000408000
962
963 FingerprintsVector;ExtendedConnectivity:FunctionalClassAtomTypes:Radiu
964 s2;57;AlphaNumericalValues;ValuesString;24769214 508787397 850393286 8
965 62102353 981185303 1231636850 1649386610 1941540674 263599683 32920567
966 1 571109041 639579325 683993318 723853089 810600886 885767127 90326012
967 7 958841485 981022393 1126908698 1152248391 1317567065 1421489994 1455
968 632544 1557272891 1826413669 1983319256 2015750777 2029559552 20404...
969
970 FingerprintsVector;ExtendedConnectivityCount:FunctionalClassAtomTypes:
971 Radius2;57;NumericalValues;IDsAndValuesString;24769214 508787397 85039
972 3286 862102353 981185303 1231636850 1649386610 1941540674 263599683 32
973 9205671 571109041 639579325 683993318 723853089 810600886 885767127...;
974 1 1 1 10 2 22 3 1 3 3 1 1 1 3 2 2 1 2 2 2 3 1 1 1 1 1 14 1 1 1 1 1 1 2
975 1 2 1 1 2 2 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1 1 1
976
977 FingerprintsBitVector;ExtendedConnectivityBits:FunctionalClassAtomType
978 s:Radius2;1024;BinaryString;Ascending;00000000000000000000100000000000
979 0000000001000100000000001000000000000000000000000000000000101000000010
980 0000001000000000010000000000000000000000000000000000000000000000000100
981 0000000000001000000000000001000000000001001000000000000000000000000000
982 0000000000000000100000000000001000000000000000000000000000000000000...
983
984 FingerprintsVector;ExtendedConnectivity:DREIDINGAtomTypes:Radius2;56;A
985 lphaNumericalValues;ValuesString;280305427 357928343 721790579 1151822
986 898 1207111054 1380963747 1568213839 1603445250 4559268 55012922 18094
987 0813 335715751 534801009 684609658 829361048 972945982 999881534 10076
988 55741 1213692591 1222032501 1224517934 1235687794 1244268533 152812070
989 0 1629595024 1856308891 1978806036 2001865095 2096549435 172675415 ...
990
991 FingerprintsVector;ExtendedConnectivity:EStateAtomTypes:Radius2;62;Alp
992 haNumericalValues;ValuesString;25189973 528584866 662581668 671034184
993 926543080 1347067490 1738510057 1759600920 2034425745 2097234755 21450
994 44754 96779665 180364292 341712110 345278822 386540408 387387308 50430
995 1706 617094135 771528807 957666640 997798220 1158349170 1291258082 134
996 1138533 1395329837 1420277211 1479584608 1486476397 1487556246 1566...
997
998 FingerprintsVector;ExtendedConnectivity:MMFF94AtomTypes:Radius2;64;Alp
999 haNumericalValues;ValuesString;224051550 746527773 998750766 103704190
1000 2 1239701709 1248384926 1259447756 1521678386 1631549126 1909437580 20
1001 37095052 2104274756 2117729376 8770364 31445800 81450228 314289324 344
1002 041929 581773587 638555787 692022098 811840536 929651561 936421792 988
1003 636432 1048624296 1054288509 1369487579 1454058929 1519352190 17271...
1004
1005 FingerprintsVector;ExtendedConnectivity:SLogPAtomTypes:Radius2;71;Alph
1006 aNumericalValues;ValuesString;78989290 116507218 489454042 888737940 1
1007 162561799 1241797255 1251494264 1263717127 1471206899 1538061784 17654
1008 07295 1795036542 1809833874 2020454493 2055310842 2117729376 11868981
1009 56731842 149505242 184525155 196984339 288181334 481409282 556716568 6
1010 41915747 679881756 721736571 794256218 908276640 992898760 10987549...
1011
1012 FingerprintsVector;ExtendedConnectivity:SYBYLAtomTypes:Radius2;58;Alph
1013 aNumericalValues;ValuesString;199957044 313356892 455463968 465982819
1014 1225318176 1678585943 1883366064 1963811677 2117729376 113784599 19153
1015 8837 196629033 263865277 416380653 477036669 681527491 730724924 90906
1016 5537 1021959189 1133014972 1174311016 1359441203 1573452838 1661585138
1017 1668649038 1684198062 1812312554 1859266290 1891651106 2072549404 ...
1018
1019 FingerprintsVector;ExtendedConnectivity:TPSAAtomTypes:Radius2;47;Alpha
1020 NumericalValues;ValuesString;20818206 259344053 862102353 1331904542 1
1021 700688206 265614156 363161397 681332588 810600886 885767127 950172500
1022 951454814 1059668746 1247054493 1382302230 1399502637 1805025917 19189
1023 39561 2114677228 2126402271 8130483 17645742 32278373 149975755 160327
1024 654 256360355 279492740 291251259 317592700 333763396 972105960 101...
1025
1026 FingerprintsVector;ExtendedConnectivity:UFFAtomTypes:Radius2;56;AlphaN
1027 umericalValues;ValuesString;280305427 357928343 721790579 1151822898 1
1028 207111054 1380963747 1568213839 1603445250 4559268 55012922 180940813
1029 335715751 534801009 684609658 829361048 972945982 999881534 1007655741
1030 1213692591 1222032501 1224517934 1235687794 1244268533 1528120700 162
1031 9595024 1856308891 1978806036 2001865095 2096549435 172675415 18344...
1032
1033 =head1 OPTIONS
1034
1035 =over 4
1036
1037 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
1038
1039 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
1040 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
1041 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
1042 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
1043
1044 The supported aromaticity model names along with model specific control parameters
1045 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
1046 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
1047 this file during class instantiation and makes it available to method B<DetectAromaticity>
1048 for detecting aromaticity corresponding to a specific model.
1049
1050 =item B<-a, --AtomIdentifierType> I<AtomicInvariantsAtomTypes | FunctionalClassAtomTypes | DREIDINGAtomTypes | EStateAtomTypes | MMFF94AtomTypes | SLogPAtomTypes | SYBYLAtomTypes | TPSAAtomTypes | UFFAtomTypes>
1051
1052 Specify atom identifier type to use for assignment of initial atom identifier to non-hydrogen
1053 atoms during calculation of extended connectivity fingerprints [ Ref 48, Ref 52]. Possible values
1054 in the current release are: I<AtomicInvariantsAtomTypes, FunctionalClassAtomTypes,
1055 DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes,
1056 TPSAAtomTypes, UFFAtomTypes>. Default value: I<AtomicInvariantsAtomTypes>.
1057
1058 =item B<--AtomicInvariantsToUse> I<"AtomicInvariant,AtomicInvariant...">
1059
1060 This value is used during I<AtomicInvariantsAtomTypes> value of B<-a, --AtomIdentifierType>
1061 option. It's a list of comma separated valid atomic invariant atom types.
1062
1063 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB,
1064 H, Ar, RA, FC, MN, SM>. Default value [ Ref 24 ]: I<AS,X,BO,H,FC,MN>.
1065
1066 The atomic invariants abbreviations correspond to:
1067
1068 AS = Atom symbol corresponding to element symbol
1069
1070 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
1071 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
1072 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
1073 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
1074 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
1075 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
1076 H<n> = Number of implicit and explicit hydrogens for atom
1077 Ar = Aromatic annotation indicating whether atom is aromatic
1078 RA = Ring atom annotation indicating whether atom is in a ring
1079 FC<+n/-n> = Formal charge assigned to atom
1080 MN<n> = Mass number indicating isotope other than most abundant isotope
1081 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
1082 3 (triplet)
1083
1084 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1085
1086 AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1087
1088 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1089 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1090
1091 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1092 are also allowed:
1093
1094 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1095 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1096 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1097 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1098 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1099 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1100 H : NumOfImplicitAndExplicitHydrogens
1101 Ar : Aromatic
1102 RA : RingAtom
1103 FC : FormalCharge
1104 MN : MassNumber
1105 SM : SpinMultiplicity
1106
1107 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1108 atom types.
1109
1110 =item B<--BitsOrder> I<Ascending | Descending>
1111
1112 Bits order to use during generation of fingerprints bit-vector string for I<ExtendedConnectivityBits>
1113 value of B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>.
1114
1115 I<Ascending> bit order corresponds to the first bit in each byte being the lowest bit, as
1116 opposed to the highest bit.
1117
1118 Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless
1119 of machine order, big-endian or little-endian, vec function always considers first
1120 string byte as the lowest byte and first bit within each byte as the lowest bit.
1121
1122 =item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>
1123
1124 Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by
1125 B<--output> used during I<ExtendedConnectivityBits> value of B<-m, --mode> option. Possible
1126 values: I<BinaryString, HexadecimalString>. Default value: I<BinaryString>.
1127
1128 I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString>
1129 contains bit values in ASCII hexadecimal format.
1130
1131 Examples:
1132
1133 FingerprintsBitVector;ExtendedConnectivityBits:AtomicInvariantsAtomTyp
1134 es:Radius2;1024;BinaryString;Ascending;0000000000000000000000000000100
1135 0000000001010000000110000011000000000000100000000000000000000000100001
1136 1000000110000000000000000000000000010011000000000000000000000000010000
1137 0000000000000000000000000010000000000000000001000000000000000000000000
1138 0000000000010000100001000000000000101000000000000000100000000000000...
1139
1140 FingerprintsBitVector;ExtendedConnectivityBits:FunctionalClassAtomType
1141 s:Radius2;1024;BinaryString;Ascending;00000000000000000000100000000000
1142 0000000001000100000000001000000000000000000000000000000000101000000010
1143 0000001000000000010000000000000000000000000000000000000000000000000100
1144 0000000000001000000000000001000000000001001000000000000000000000000000
1145 0000000000000000100000000000001000000000000000000000000000000000000...
1146
1147 =item B<--FunctionalClassesToUse> I<"FunctionalClass1,FunctionalClass2...">
1148
1149 This value is used during I<FunctionalClassAtomTypes> value of B<-a, --AtomIdentifierType>
1150 option. It's a list of comma separated valid functional classes.
1151
1152 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1153 Default value [ Ref 24 ]: I<HBD,HBA,PI,NI,Ar,Hal>.
1154
1155 The functional class abbreviations correspond to:
1156
1157 HBD: HydrogenBondDonor
1158 HBA: HydrogenBondAcceptor
1159 PI : PositivelyIonizable
1160 NI : NegativelyIonizable
1161 Ar : Aromatic
1162 Hal : Halogen
1163 H : Hydrophobic
1164 RA : RingAtom
1165 CA : ChainAtom
1166
1167 Functional class atom type specification for an atom corresponds to:
1168
1169 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1170
1171 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1172 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1173
1174 HydrogenBondDonor: NH, NH2, OH
1175 HydrogenBondAcceptor: N[!H], O
1176 PositivelyIonizable: +, NH2
1177 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1178
1179 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1180
1181 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1182
1183 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1184 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1185 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1186 look like Cmpd<Number>.
1187
1188 Examples for I<DataField> value of B<--CompoundIDMode>:
1189
1190 MolID
1191 ExtReg
1192
1193 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1194
1195 Compound
1196
1197 The value specified above generates compound IDs which correspond to Compound<Number>
1198 instead of default value of Cmpd<Number>.
1199
1200 =item B<--CompoundIDLabel> I<text>
1201
1202 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value
1203 of B<--DataFieldsMode> option. Default: I<CompoundID>.
1204
1205 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1206
1207 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
1208 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
1209 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
1210 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
1211
1212 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1213 Default: I<LabelPrefix>.
1214
1215 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1216 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1217 values are replaced with sequential compound IDs.
1218
1219 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1220
1221 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1222
1223 Comma delimited list of I<SDFile(s)> data fields to extract and write to CSV/TSV text file(s) along
1224 with generated fingerprints for I<text | all> values of B<--output> option.
1225
1226 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1227
1228 Examples:
1229
1230 Extreg
1231 MolID,CompoundName
1232
1233 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1234
1235 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1236 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
1237 data fields; transfer SD data fields common to all compounds; extract specified data fields;
1238 generate a compound ID using molname line, a compound prefix, or a combination of both.
1239 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
1240
1241 =item B<-f, --Filter> I<Yes | No>
1242
1243 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1244 Default value: I<Yes>.
1245
1246 By default, compound data is checked before calculating fingerprints and compounds containing
1247 atom data corresponding to non-element symbols or no atom data are ignored.
1248
1249 =item B<--FingerprintsLabel> I<text>
1250
1251 SD data label or text file column label to use for fingerprints string in output SD or
1252 CSV/TSV text file(s) specified by B<--output>. Default value: I<ExtendedConnectivityFingerprints>.
1253
1254 =item B<-h, --help>
1255
1256 Print this help message.
1257
1258 =item B<-k, --KeepLargestComponent> I<Yes | No>
1259
1260 Generate fingerprints for only the largest component in molecule. Possible values:
1261 I<Yes or No>. Default value: I<Yes>.
1262
1263 For molecules containing multiple connected components, fingerprints can be generated
1264 in two different ways: use all connected components or just the largest connected
1265 component. By default, all atoms except for the largest connected component are
1266 deleted before generation of fingerprints.
1267
1268 =item B<-m, --mode> I<ExtendedConnectivity | ExtendedConnectivityCount | ExtendedConnectivityBits>
1269
1270 Specify type of extended connectivity fingerprints to generate for molecules in I<SDFile(s)>.
1271 Possible values: I<ExtendedConnectivity, ExtendedConnectivityCount or
1272 ExtendedConnectivityBits>. Default value: I<ExtendedConnectivity>.
1273
1274 For I<ExtendedConnectivity> value of fingerprints B<-m, --mode>, a fingerprint vector
1275 containing unique atom identifiers constitutes the extended connectivity fingerprints
1276 of a molecule.
1277
1278 For I<ExtendedConnectivityCount> value of fingerprints B<-m, --mode>, a fingerprint vector
1279 containing unique atom identifiers along with their count constitutes the extended connectivity
1280 fingerprints of a molecule.
1281
1282 For I<ExtendedConnectivityBits> value of fingerprints B<-m, --mode>, a fingerprint bit vector
1283 indicating presence/absence of structurally unique atom identifiers constitutes the extended
1284 connectivity fingerprints of a molecule.
1285
1286 =item B<-n, --NeighborhoodRadius> I<number>
1287
1288 Atomic neighborhood radius for generating extended connectivity neighborhoods. Default
1289 value: I<2>. Valid values: >= 0. Neighborhood radius of zero corresponds to just the list
1290 of non-hydrogen atoms.
1291
1292 Default value of I<2> for atomic neighborhood radius generates extended connectivity
1293 fingerprints corresponding to path length or diameter value of I<4> [ Ref 52b ].
1294
1295 =item B<--OutDelim> I<comma | tab | semicolon>
1296
1297 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1298 Default value: I<comma>.
1299
1300 =item B<--output> I<SD | FP | text | all>
1301
1302 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1303
1304 =item B<-o, --overwrite>
1305
1306 Overwrite existing files.
1307
1308 =item B<-q, --quote> I<Yes | No>
1309
1310 Put quotes around column values in output CSV/TSV text file(s). Possible values:
1311 I<Yes or No>. Default value: I<Yes>.
1312
1313 =item B<-r, --root> I<RootName>
1314
1315 New file name is generated using the root: <Root>.<Ext>. Default for new file names:
1316 <SDFileName><ExtendedConnectivityFP>.<Ext>. The file type determines <Ext>
1317 value. The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1318 delimited text files, respectively. This option is ignored for multiple input files.
1319
1320 =item B<-s, --size> I<number>
1321
1322 Size of bit-vector to use during generation of fingerprints bit-vector string for
1323 I<ExtendedConnectivityBits> value of B<-m, --mode>. Default value: I<1024>.
1324 Valid values correspond to any positive integer which satisfies the following criteria:
1325 power of 2, >= 32 and <= 2 ** 32.
1326
1327 Examples:
1328
1329 512
1330 1024
1331 2048
1332
1333 =item B<--UsePerlCoreRandom> I<Yes | No>
1334
1335 Specify whether to use Perl CORE::rand or MayaChemTools MathUtil::random function
1336 during random number generation for setting bits in fingerprints bit-vector strings. Possible
1337 values: I<Yes or No>. Default value: I<Yes>.
1338
1339 I<No> value option for B<--UsePerlCoreRandom> allows the generation of fingerprints
1340 bit-vector strings which are same across different platforms.
1341
1342 The random number generator implemented in MayaChemTools is a variant of
1343 linear congruential generator (LCG) as described by Miller et al. [ Ref 120 ].
1344 It is also referred to as Lehmer random number generator or Park-Miller
1345 random number generator.
1346
1347 Unlike Perl's core random number generator function rand, the random number
1348 generator implemented in MayaChemTools, MathUtil::random, generates consistent
1349 random values across different platforms for a specific random seed and leads
1350 to generation of portable fingerprints bit-vector strings.
1351
1352 =item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1353
1354 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1355 B<--output> used during I<ExtendedConnectivityCount> value of B<-m, --mode> option. Possible
1356 values: I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString |
1357 ValuesAndIDsPairsString>.
1358
1359 Default value during I<ExtendedConnectivityCount> value of B<-m, --mode> option:
1360 I<IDsAndValuesString>.
1361
1362 Default value during I<ExtendedConnectivity> value of B<-m, --mode> option: I<ValuesString>.
1363
1364 Examples:
1365
1366 FingerprintsVector;ExtendedConnectivity:AtomicInvariantsAtomTypes:Radi
1367 us2;60;AlphaNumericalValues;ValuesString;73555770 333564680 352413391
1368 666191900 1001270906 1371674323 1481469939 1977749791 2006158649 21414
1369 08799 49532520 64643108 79385615 96062769 273726379 564565671 85514103
1370 5 906706094 988546669 1018231313 1032696425 1197507444 1331250018 1338
1371 532734 1455473691 1607485225 1609687129 1631614296 1670251330 17303...
1372
1373 FingerprintsVector;ExtendedConnectivityCount:AtomicInvariantsAtomTypes
1374 :Radius2;60;NumericalValues;IDsAndValuesString;73555770 333564680 3524
1375 13391 666191900 1001270906 1371674323 1481469939 1977749791 2006158649
1376 2141408799 49532520 64643108 79385615 96062769 273726379 564565671...;
1377 3 2 1 1 14 1 2 10 4 3 1 1 1 1 2 1 2 1 1 1 2 3 1 1 2 1 3 3 8 2 2 2 6 2
1378 1 2 1 1 2 1 1 1 2 1 1 2 1 2 1 1 1 1 1 1 1 1 1 2 1 1
1379
1380 =item B<-w, --WorkingDir> I<DirName>
1381
1382 Location of working directory. Default: current directory.
1383
1384 =back
1385
1386 =head1 EXAMPLES
1387
1388 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1389 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1390 file containing sequential compound IDs along with fingerprints vector strings data, type:
1391
1392 % ExtendedConnectivityFingerprints.pl -r SampleECAIFP -o Sample.sdf
1393
1394 To generate extended connectivity count fingerprints corresponding to neighborhood radius up to
1395 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1396 file containing sequential compound IDs along with fingerprints vector strings data, type:
1397
1398 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityCount
1399 -r SampleECAIFP -o Sample.sdf
1400
1401 To generate extended connectivity bits fingerprints as hexadecimal bit-string corresponding to
1402 neighborhood radius up to 2 using atomic invariants atom types in bit-vector string format and
1403 create a SampleECAIFP.csv file containing sequential compound IDs along with fingerprints
1404 bit-vector strings data, type:
1405
1406 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityBits
1407 -r SampleECAIFP -o Sample.sdf
1408
1409 To generate extended connectivity bits fingerprints as binary bit-string corresponding to
1410 neighborhood radius up to 2 using atomic invariants atom types in bit-vector string format and
1411 create a SampleECAIFP.csv file containing sequential compound IDs along with fingerprints
1412 bit-vector strings data, type:
1413
1414 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityBits
1415 --BitStringFormat BinaryString -r SampleECAIFP -o Sample.sdf
1416
1417 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1418 2 using atomic invariants atom types in vector string format and create SampleECAIFP.sdf, SampleECAIFP.fpf
1419 and SampleECAIFP.csv files containing sequential compound IDs in CSV file along with fingerprints
1420 vector strings data, type:
1421
1422 % ExtendedConnectivityFingerprints.pl --output all -r SampleECAIFP
1423 -o Sample.sdf
1424
1425 To generate extended connectivity count fingerprints corresponding to neighborhood radius up to
1426 2 using atomic invariants atom types in vector string format and create SampleECAIFP.sdf, SampleECAIFP.fpf
1427 and SampleECAIFP.csv files containing sequential compound IDs in CSV file along with fingerprints
1428 vector strings data, type:
1429
1430 % ExtendedConnectivityFingerprints.pl -m ExtendedConnectivityCount
1431 --output all -r SampleECAIFP -o Sample.sdf
1432
1433 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1434 2 using functional class atom types in vector string format and create a SampleECFCFP.csv file
1435 containing sequential compound IDs along with fingerprints vector strings data, type:
1436
1437 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes
1438 -r SampleECFCFP -o Sample.sdf
1439
1440 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1441 2 using DREIDING atom types in vector string format and create a SampleECFP.csv file
1442 containing sequential compound IDs along with fingerprints vector strings data, type:
1443
1444 % ExtendedConnectivityFingerprints.pl -a DREIDINGAtomTypes
1445 -r SampleECFP -o Sample.sdf
1446
1447 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1448 2 using E-state atom types in vector string format and create a SampleECFP.csv file
1449 containing sequential compound IDs along with fingerprints vector strings data, type:
1450
1451 % ExtendedConnectivityFingerprints.pl -a EStateAtomTypes
1452 -r SampleECFP -o Sample.sdf
1453
1454 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1455 2 using MMFF94 atom types in vector string format and create a SampleECFP.csv file
1456 containing sequential compound IDs along with fingerprints vector strings data, type:
1457
1458 % ExtendedConnectivityFingerprints.pl -a MMFF94AtomTypes
1459 -r SampleECFP -o Sample.sdf
1460
1461 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1462 2 using SLogP atom types in vector string format and create a SampleECFP.csv file
1463 containing sequential compound IDs along with fingerprints vector strings data, type:
1464
1465 % ExtendedConnectivityFingerprints.pl -a SLogPAtomTypes
1466 -r SampleECFP -o Sample.sdf
1467
1468 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1469 2 using SYBYL atom types in vector string format and create a SampleECFP.csv file
1470 containing sequential compound IDs along with fingerprints vector strings data, type:
1471
1472 % ExtendedConnectivityFingerprints.pl -a SYBYLAtomTypes
1473 -r SampleECFP -o Sample.sdf
1474
1475 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1476 2 using TPSA atom types in vector string format and create a SampleECFP.csv file
1477 containing sequential compound IDs along with fingerprints vector strings data, type:
1478
1479 % ExtendedConnectivityFingerprints.pl -a TPSAAtomTypes
1480 -r SampleECFP -o Sample.sdf
1481
1482 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1483 2 using UFF atom types in vector string format and create a SampleECFP.csv file
1484 containing sequential compound IDs along with fingerprints vector strings data, type:
1485
1486 % ExtendedConnectivityFingerprints.pl -a UFFAtomTypes
1487 -r SampleECFP -o Sample.sdf
1488
1489 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1490 3 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1491 file containing sequential compound IDs along with fingerprints vector strings data, type:
1492
1493 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes -n 3
1494 -r SampleECAIFP -o Sample.sdf
1495
1496 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1497 3 using functional class atom types in vector string format and create a SampleECFCFP.csv file
1498 containing sequential compound IDs along with fingerprints vector strings data, type:
1499
1500 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes -n 3
1501 -r SampleECFCFP -o Sample.sdf
1502
1503 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1504 2 using only AS,X atomic invariants atom types in vector string format and create a
1505 SampleECAIFP.csv file containing sequential compound IDs along with fingerprints vector
1506 strings data, type:
1507
1508 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes
1509 --AtomicInvariantsToUse "AS,X" -r SampleECAIFP -o Sample.sdf
1510
1511 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1512 2 using only HBD,HBA functional class atom types in vector string format and create a
1513 SampleECFCFP.csv file containing sequential compound IDs along with fingerprints vector
1514 strings data, type:
1515
1516 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes
1517 --FunctionalClassesToUse "HBD,HBA" -r SampleECFCFP -o Sample.sdf
1518
1519 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1520 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1521 file containing compound ID from molecule name line along with fingerprints vector strings
1522 data, type:
1523
1524 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes
1525 --DataFieldsMode CompoundID --CompoundIDMode MolName
1526 -r SampleECAIFP -o Sample.sdf
1527
1528 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1529 2 using functional class atom types in vector string format and create a SampleECFCFP.csv
1530 file containing compound IDs using specified data field along with fingerprints vector strings
1531 data, type:
1532
1533 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes
1534 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID Mol_ID
1535 -r SampleECFCFP -o Sample.sdf
1536
1537 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1538 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1539 file containing compound ID using a combination of molecule name line and an explicit compound
1540 prefix along with fingerprints vector strings data, type:
1541
1542 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes
1543 --DataFieldsMode CompoundID --CompoundIDMode MolNameOrLabelPrefix
1544 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleECAIFP -o Sample.sdf
1545
1546 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1547 2 using functional class atom types in vector string format and create a SampleECFCFP.csv
1548 file containing specific data fields columns along with fingerprints vector strings
1549 data, type:
1550
1551 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes
1552 --DataFieldsMode Specify --DataFields Mol_ID -r SampleECFCFP
1553 -o Sample.sdf
1554
1555 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1556 2 using atomic invariants atom types in vector string format and create a SampleECAIFP.csv
1557 file containing common data fields columns along with fingerprints vector strings data, type:
1558
1559 % ExtendedConnectivityFingerprints.pl -a AtomicInvariantsAtomTypes
1560 --DataFieldsMode Common -r SampleECAIFP -o Sample.sdf
1561
1562 To generate extended connectivity fingerprints corresponding to neighborhood radius up to
1563 2 using functional class atom types in vector string format and create SampleECFCFP.sdf, SampleECFCFP.fpf
1564 and SampleECFCFP.csv files containing all data fields columns in CSV file along with fingerprints
1565 vector strings data, type:
1566
1567 % ExtendedConnectivityFingerprints.pl -a FunctionalClassAtomTypes
1568 --DataFieldsMode All --output all -r SampleECFCFP
1569 -o Sample.sdf
1570
1571 =head1 AUTHOR
1572
1573 Manish Sud <msud@san.rr.com>
1574
1575 =head1 SEE ALSO
1576
1577 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1578 MACCSKeysFingerprints.pl, PathLengthFingerprints.pl,
1579 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1580 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1581
1582 =head1 COPYRIGHT
1583
1584 Copyright (C) 2015 Manish Sud. All rights reserved.
1585
1586 This file is part of MayaChemTools.
1587
1588 MayaChemTools is free software; you can redistribute it and/or modify it under
1589 the terms of the GNU Lesser General Public License as published by the Free
1590 Software Foundation; either version 3 of the License, or (at your option)
1591 any later version.
1592
1593 =cut