comparison mayachemtools/bin/MACCSKeysFingerprints.pl @ 0:73ae111cf86f draft

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 11:55:01 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:73ae111cf86f
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: MACCSKeysFingerprints.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.31 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use FileIO::FingerprintsSDFileIO;
40 use FileIO::FingerprintsTextFileIO;
41 use FileIO::FingerprintsFPFileIO;
42 use Fingerprints::MACCSKeys;
43
# Script-level state; %Options is filled by SetupScriptUsage() and is read by
# the subroutines defined further down in this file.
my($ScriptName, $StartTime, $EndTime, $TotalTime);
my(%Options);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; print usage and stop when help is
# requested or no input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files, skipping those flagged as not okay during setup...
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for my $FileIndex (0 .. $#SDFilesList) {
    next unless $SDFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateMACCSKeysFingerprints($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
89
90 ###############################################################################
91
# Generate fingerprints for all compounds in one input SD file...
#
# $FileIndex is the index of the file in @SDFilesList. Compounds rejected by
# the optional filter, or for which no fingerprints object could be generated,
# are counted as ignored and skipped; fingerprints for all other compounds are
# written to the output files opened for this input file. Summary counts are
# printed once the whole file has been read.
#
sub GenerateMACCSKeysFingerprints {
    my($FileIndex) = @_;

    my $InputSDFile = $SDFilesList[$FileIndex];

    # Setup output files; a writer is undefined when its output type is off...
    my($SDWriter, $TextWriter, $FPWriter) = SetupAndOpenOutputFiles($FileIndex);

    my $Reader = MoleculeFileIO->new('Name' => $InputSDFile);
    $Reader->Open();

    my $TotalCount = 0;
    my $SkippedCount = 0;

    COMPOUND: while (my $Molecule = $Reader->ReadMolecule()) {
        $TotalCount++;

        # Filter compound data before calculating fingerprints...
        if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
            $SkippedCount++;
            next COMPOUND;
        }

        my $Fingerprints = GenerateMoleculeFingerprints($Molecule);
        if (!$Fingerprints) {
            $SkippedCount++;
            ProcessIgnoredCompound('FingerprintsGenerationFailed', $TotalCount, $Molecule);
            next COMPOUND;
        }

        WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $Fingerprints, $SDWriter, $TextWriter, $FPWriter);
    }
    $Reader->Close();

    # Close whichever output files were opened...
    for my $Writer ($SDWriter, $TextWriter, $FPWriter) {
        $Writer->Close() if $Writer;
    }

    WriteFingerprintsGenerationSummaryStatistics($TotalCount, $SkippedCount);
}
144
# Process compound being ignored due to problems in fingerprints generation...
#
# Emits a warning naming the compound record number, its compound ID and the
# reason selected by $Mode: ContainsNonElementalData, ContainsNoElementalData
# or FingerprintsGenerationFailed. Any other $Mode value is reported with the
# same text as FingerprintsGenerationFailed.
#
sub ProcessIgnoredCompound {
    my($Mode, $CmpdCount, $Molecule) = @_;

    my $FieldValuesRef = $Molecule->GetDataFieldLabelAndValues();
    my $CmpdID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);

    my $Reason;
    if ($Mode =~ /^ContainsNonElementalData$/i) {
        $Reason = "Compound contains atom data corresponding to non-elemental atom symbol(s)...";
    }
    elsif ($Mode =~ /^ContainsNoElementalData$/i) {
        $Reason = "Compound contains no atom data...";
    }
    else {
        # FingerprintsGenerationFailed and unrecognized modes share this text...
        $Reason = "Fingerprints generation didn't succeed...";
    }

    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: $Reason\n\n";
}
172
# Check and filter compounds...
#
# Returns 1, after reporting the compound through ProcessIgnoredCompound, when
# the molecule contains any non-elemental atom data or no elemental atom data
# at all; returns 0 when the compound is fine to process.
#
sub CheckAndFilterCompound {
    my($CmpdCount, $Molecule) = @_;

    my($NumElements, $NumNonElements) = $Molecule->GetNumOfElementsAndNonElements();

    # Non-elemental data takes precedence over missing elemental data...
    my $Problem = $NumNonElements ? 'ContainsNonElementalData'
                : !$NumElements   ? 'ContainsNoElementalData'
                :                   '';

    return 0 unless $Problem;

    ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
    return 1;
}
193
# Write out compounds fingerprints generation summary statistics...
#
# Prints the total number of compounds read, the number processed successfully
# (total minus ignored) and the number ignored.
#
sub WriteFingerprintsGenerationSummaryStatistics {
    my($CmpdCount, $IgnoredCmpdCount) = @_;

    my $SucceededCmpdCount = $CmpdCount - $IgnoredCmpdCount;

    print "\nNumber of compounds: $CmpdCount\n",
        "Number of compounds processed successfully during fingerprints generation: $SucceededCmpdCount\n",
        "Number of compounds ignored during fingerprints generation: $IgnoredCmpdCount\n";
}
206
# Open output files...
#
# Creates and opens a fingerprints file IO object for each output type that is
# turned on (SD, FP and text) using the output file names stored for
# $FileIndex in %SDFilesInfo. Returns the list (SD writer, text writer,
# FP writer); an entry is undef when that output type is off.
#
sub SetupAndOpenOutputFiles {
    my($FileIndex) = @_;
    my($SDWriter, $TextWriter, $FPWriter);

    # Setup common parameters for fingerprints file IO objects...
    #
    my %CommonParams = ();
    if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
        %CommonParams = (
            'Mode' => 'Write',
            'Overwrite' => $OptionsInfo{OverwriteFiles},
            'FingerprintsStringMode' => 'FingerprintsBitVectorString',
            'BitStringFormat' => $OptionsInfo{BitStringFormat},
            'BitsOrder' => $OptionsInfo{BitsOrder},
        );
    }
    elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
        %CommonParams = (
            'Mode' => 'Write',
            'Overwrite' => $OptionsInfo{OverwriteFiles},
            'FingerprintsStringMode' => 'FingerprintsVectorString',
            'VectorStringFormat' => $OptionsInfo{VectorStringFormat},
        );
    }

    if ($OptionsInfo{SDOutput}) {
        my $SDOutFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
        print "Generating SD file $SDOutFile...\n";
        $SDWriter = FileIO::FingerprintsSDFileIO->new('Name' => $SDOutFile, %CommonParams, 'FingerprintsFieldLabel' => $OptionsInfo{FingerprintsLabel});
        $SDWriter->Open();
    }

    if ($OptionsInfo{FPOutput}) {
        my $FPOutFile = $SDFilesInfo{FPOutFileNames}[$FileIndex];
        print "Generating FP file $FPOutFile...\n";
        $FPWriter = FileIO::FingerprintsFPFileIO->new('Name' => $FPOutFile, %CommonParams);
        $FPWriter->Open();
    }

    if ($OptionsInfo{TextOutput}) {
        my $TextOutFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
        my $ColLabelsRef = SetupFPTextFileCoulmnLabels($FileIndex);

        print "Generating text file $TextOutFile...\n";
        $TextWriter = FileIO::FingerprintsTextFileIO->new('Name' => $TextOutFile, %CommonParams, 'DataColLabels' => $ColLabelsRef, 'OutDelim' => $OptionsInfo{OutDelim}, 'OutQuote' => $OptionsInfo{OutQuote});
        $TextWriter->Open();
    }

    return ($SDWriter, $TextWriter, $FPWriter);
}
252
# Write fingerprints and other data to appropriate output files...
#
# Each of the three file IO arguments may be undef, in which case nothing is
# written for that output type. Data field labels and values are retrieved
# from the molecule only when the text or FP output needs them.
#
sub WriteDataToOutputFiles {
    my($FileIndex, $CmpdCount, $Molecule, $MACCSKeysFingerprints, $NewFPSDFileIO, $NewFPTextFileIO, $NewFPFileIO) = @_;

    my $FieldValuesRef = ($NewFPTextFileIO || $NewFPFileIO) ? $Molecule->GetDataFieldLabelAndValues() : undef;

    # SD output: fingerprints go along with the input compound string...
    if ($NewFPSDFileIO) {
        my $InputCmpdString = $Molecule->GetInputMoleculeString();
        $NewFPSDFileIO->WriteFingerprints($MACCSKeysFingerprints, $InputCmpdString);
    }

    # Text output: fingerprints go along with the selected column values...
    if ($NewFPTextFileIO) {
        my $RowValuesRef = SetupFPTextFileCoulmnValues($FileIndex, $CmpdCount, $Molecule, $FieldValuesRef);
        $NewFPTextFileIO->WriteFingerprints($MACCSKeysFingerprints, $RowValuesRef);
    }

    # FP output: fingerprints go along with the compound ID...
    if ($NewFPFileIO) {
        my $ID = SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $FieldValuesRef);
        $NewFPFileIO->WriteFingerprints($MACCSKeysFingerprints, $ID);
    }
}
285
# Generate appropriate column labels for FPText output file...
#
# The labels depend on the data fields mode: all or common data field labels
# collected for file $FileIndex, the explicitly specified data fields, or the
# compound ID label. The fingerprints label is always the last column.
# Returns a reference to the list of labels.
#
sub SetupFPTextFileCoulmnLabels {
    my($FileIndex) = @_;

    my $FieldsMode = $OptionsInfo{DataFieldsMode};
    my @Labels = ();

    if ($FieldsMode =~ /^All$/i) {
        @Labels = @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]};
    }
    elsif ($FieldsMode =~ /^Common$/i) {
        @Labels = @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]};
    }
    elsif ($FieldsMode =~ /^Specify$/i) {
        @Labels = @{$OptionsInfo{SpecifiedDataFields}};
    }
    elsif ($FieldsMode =~ /^CompoundID$/i) {
        @Labels = ($OptionsInfo{CompoundIDLabel});
    }

    # Add fingerprints label...
    return [@Labels, $OptionsInfo{FingerprintsLabel}];
}
310
# Generate column values for FPText output file...
#
# In CompoundID data fields mode the only value is the compound ID. In the
# other modes, one value is looked up in $DataFieldLabelAndValuesRef for each
# selected data field label, with an empty string used for labels missing
# from the compound. Returns a reference to the list of values.
#
sub SetupFPTextFileCoulmnValues {
    my($FileIndex, $CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $FieldsMode = $OptionsInfo{DataFieldsMode};

    if ($FieldsMode =~ /^CompoundID$/i) {
        return [ SetupCmpdIDForOutputFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) ];
    }

    # Pick the list of data field labels to report...
    my $LabelsRef;
    if ($FieldsMode =~ /^All$/i) {
        $LabelsRef = $SDFilesInfo{AllDataFieldsRef}[$FileIndex];
    }
    elsif ($FieldsMode =~ /^Common$/i) {
        $LabelsRef = $SDFilesInfo{CommonDataFieldsRef}[$FileIndex];
    }
    elsif ($FieldsMode =~ /^Specify$/i) {
        $LabelsRef = $OptionsInfo{SpecifiedDataFields};
    }
    else {
        return [];
    }

    my @Values = ();
    for my $Label (@{$LabelsRef}) {
        push @Values, exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '';
    }

    return \@Values;
}
333
# Generate compound ID for FP and FPText output files...
#
# The ID depends on the compound ID mode:
#   MolNameOrLabelPrefix - molecule name, or label prefix followed by the
#                          compound count when the name is false
#   LabelPrefix          - label prefix followed by the compound count
#   DataField            - value of the specified data field, or an empty
#                          string when the compound doesn't have that field
#   MolName              - molecule name
# An empty string is returned for any other mode.
#
sub SetupCmpdIDForOutputFiles {
    my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

    my $IDMode = $OptionsInfo{CompoundIDMode};

    if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
        my $MolName = $Molecule->GetName();
        return $MolName if $MolName;
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^LabelPrefix$/i) {
        return "$OptionsInfo{CompoundID}${CmpdCount}";
    }

    if ($IDMode =~ /^DataField$/i) {
        my $FieldLabel = $OptionsInfo{CompoundID};
        return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
    }

    if ($IDMode =~ /^MolName$/i) {
        # Fetch the name into a scalar first so that GetName is always called
        # in scalar context regardless of how this function is called...
        my $MolName = $Molecule->GetName();
        return $MolName;
    }

    return '';
}
359
# Generate fingerprints for molecule...
#
# Optionally reduces the molecule to its largest component, detects rings and
# aromaticity using the selected aromaticity model, and generates MACCS keys
# of the requested type and size. Returns the fingerprints object, or undef
# when ring detection fails. Dies on an unrecognized mode.
#
sub GenerateMoleculeFingerprints {
    my($Molecule) = @_;

    $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

    return undef unless $Molecule->DetectRings();

    $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
    $Molecule->DetectAromaticity();

    # Map the user specified mode to the fingerprints type name...
    my $FingerprintsType;
    if ($OptionsInfo{Mode} =~ /^MACCSKeyBits$/i) {
        $FingerprintsType = 'MACCSKeyBits';
    }
    elsif ($OptionsInfo{Mode} =~ /^MACCSKeyCount$/i) {
        $FingerprintsType = 'MACCSKeyCount';
    }
    else {
        die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: MACCSKeyBits or MACCSKeyCount\n";
    }

    my $Fingerprints = Fingerprints::MACCSKeys->new('Molecule' => $Molecule, 'Type' => $FingerprintsType, 'Size' => $OptionsInfo{Size});
    $Fingerprints->GenerateMACCSKeys();

    return $Fingerprints;
}
389
# Retrieve information about SD files and setup output file names...
#
# For each file in @SDFilesList, the following parallel lists in %SDFilesInfo
# are filled in: FileOkay, OutFileRoot, SDOutFileNames, FPOutFileNames,
# TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef.
#
# A file is marked okay only after it passes all checks: it exists; it has a
# SD file extension; its first compound contains the data field used for
# compound IDs (text output in CompoundID data fields mode with DataField
# compound ID mode only); its name doesn't collide with the SD output file
# name; and none of the requested output files already exists unless
# overwriting is turned on. A file failing a check is reported with a warning
# and left flagged as not okay.
#
sub RetrieveSDFilesInfo {
    my($CheckDataField, $CollectDataFields);

    %SDFilesInfo = ();
    @{$SDFilesInfo{FileOkay}} = ();
    @{$SDFilesInfo{OutFileRoot}} = ();
    @{$SDFilesInfo{SDOutFileNames}} = ();
    @{$SDFilesInfo{FPOutFileNames}} = ();
    @{$SDFilesInfo{TextOutFileNames}} = ();
    @{$SDFilesInfo{AllDataFieldsRef}} = ();
    @{$SDFilesInfo{CommonDataFieldsRef}} = ();

    $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
    $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

    FILELIST: for my $Index (0 .. $#SDFilesList) {
        my($SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $FPOutFileExt, $NewSDFileName, $NewFPFileName, $NewTextFileName, $AllDataFieldsRef, $CommonDataFieldsRef);

        $SDFile = $SDFilesList[$Index];

        # Assume the worst until all the checks have passed...
        $SDFilesInfo{FileOkay}[$Index] = 0;
        $SDFilesInfo{OutFileRoot}[$Index] = '';
        $SDFilesInfo{SDOutFileNames}[$Index] = '';
        $SDFilesInfo{FPOutFileNames}[$Index] = '';
        $SDFilesInfo{TextOutFileNames}[$Index] = '';

        if (!(-e $SDFile)) {
            warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
            next FILELIST;
        }
        if (!CheckFileType($SDFile, "sd sdf")) {
            warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
            next FILELIST;
        }

        if ($CheckDataField) {
            # Make sure data field exists in SD file..
            my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

            # Three-argument open on a lexical handle: the file name is never
            # interpreted as an open mode or a pipe command...
            open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            $CmpdString = ReadCmpdString($SDFileHandle);
            close $SDFileHandle;

            @CmpdLines = split "\n", $CmpdString;
            %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
            $SpecifiedDataField = $OptionsInfo{CompoundID};
            if (!exists $DataFieldValues{$SpecifiedDataField}) {
                warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
                next FILELIST;
            }
        }

        $AllDataFieldsRef = '';
        $CommonDataFieldsRef = '';
        if ($CollectDataFields) {
            my($CmpdCount);

            open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
            ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
            close $SDFileHandle;
        }

        # Setup output file names...
        $FileDir = ""; $FileName = ""; $FileExt = "";
        ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

        $TextOutFileExt = "csv";
        if ($Options{outdelim} =~ /^tab$/i) {
            $TextOutFileExt = "tsv";
        }
        $SDOutFileExt = $FileExt;
        $FPOutFileExt = "fpf";

        # The root name option is honored only for a single input file...
        if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
            my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
            if ($RootFileName && $RootFileExt) {
                $FileName = $RootFileName;
            }
            else {
                $FileName = $OptionsInfo{OutFileRoot};
            }
            $OutFileRoot = $FileName;
        }
        else {
            $OutFileRoot = "${FileName}MACCSKeysFP";
        }

        $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
        $NewFPFileName = "${OutFileRoot}.${FPOutFileExt}";
        $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

        if ($OptionsInfo{SDOutput}) {
            # The output name is matched literally (\Q...\E) against the whole
            # input name or its trailing path component. An unquoted,
            # unanchored match would treat characters such as . ( + in file
            # names as regex syntax and would also reject inputs whose names
            # merely contain the output name...
            if ($SDFile =~ m{(?:\A|[\\/])\Q$NewSDFileName\E\z}i) {
                warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
                print "Specify a different name using \"-r --root\" option or use default name.\n";
                next FILELIST;
            }
        }

        if (!$OptionsInfo{OverwriteFiles}) {
            # Check SD, FP and text output files...
            if ($OptionsInfo{SDOutput}) {
                if (-e $NewSDFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{FPOutput}) {
                if (-e $NewFPFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewFPFileName already exists\n";
                    next FILELIST;
                }
            }
            if ($OptionsInfo{TextOutput}) {
                if (-e $NewTextFileName) {
                    warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
                    next FILELIST;
                }
            }
        }

        $SDFilesInfo{FileOkay}[$Index] = 1;

        $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
        $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
        $SDFilesInfo{FPOutFileNames}[$Index] = $NewFPFileName;
        $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

        $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
        $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
    }
}
522
# Process option values...
#
# Copies the validated command line options from %Options into %OptionsInfo,
# converting Yes/No and output type values into 0/1 flags and splitting the
# list of specified data fields.
#
# The compound ID value is set up according to the compound ID mode alone:
# compound IDs are used for FP output and for warnings about ignored
# compounds irrespective of the data fields mode, so the label prefix, along
# with its default value Cmpd, and any specified data field name are always
# recorded. A missing data field name is fatal only in CompoundID data fields
# mode, where the text output depends on it.
#
sub ProcessOptions {
    %OptionsInfo = ();

    $OptionsInfo{Mode} = $Options{mode};
    $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

    $OptionsInfo{BitsOrder} = $Options{bitsorder};
    $OptionsInfo{BitStringFormat} = $Options{bitstringformat};

    $OptionsInfo{CompoundIDMode} = $Options{compoundidmode};
    $OptionsInfo{CompoundIDLabel} = $Options{compoundidlabel};
    $OptionsInfo{DataFieldsMode} = $Options{datafieldsmode};

    $OptionsInfo{Filter} = ($Options{filter} =~ /^Yes$/i) ? 1 : 0;

    @{$OptionsInfo{SpecifiedDataFields}} = ();
    $OptionsInfo{CompoundID} = '';

    if ($Options{compoundidmode} =~ /^DataField$/i) {
        if ($Options{compoundid}) {
            $OptionsInfo{CompoundID} = $Options{compoundid};
        }
        elsif ($Options{datafieldsmode} =~ /^CompoundID$/i) {
            die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
        }
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
        $OptionsInfo{CompoundID} = $Options{compoundid} ? $Options{compoundid} : 'Cmpd';
    }

    if ($Options{datafieldsmode} =~ /^Specify$/i) {
        if (!$Options{datafields}) {
            die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
        }
        push @{$OptionsInfo{SpecifiedDataFields}}, split(/\,/, $Options{datafields});
    }

    $OptionsInfo{FingerprintsLabel} = $Options{fingerprintslabel} ? $Options{fingerprintslabel} : 'MACCSKeysFingerprints';

    $OptionsInfo{KeepLargestComponent} = ($Options{keeplargestcomponent} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{Output} = $Options{output};
    $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|All)$/i) ? 1 : 0;
    $OptionsInfo{FPOutput} = ($Options{output} =~ /^(FP|All)$/i) ? 1 : 0;
    $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|All)$/i) ? 1 : 0;

    $OptionsInfo{OutDelim} = $Options{outdelim};
    $OptionsInfo{OutQuote} = ($Options{quote} =~ /^Yes$/i) ? 1 : 0;

    $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
    $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;

    $OptionsInfo{Size} = $Options{size};

    $OptionsInfo{VectorStringFormat} = $Options{vectorstringformat};
}
583
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options with default values, overrides them from the command line
# using GetOptions, changes to the working directory when one is specified
# and validates the value of each option. Dies with an error message on the
# first invalid value found.
#
sub SetupScriptUsage {

    # Default values for all the options...
    %Options = (
        aromaticitymodel => 'MayaChemToolsAromaticityModel',
        bitsorder => 'Ascending',
        bitstringformat => 'BinaryString',
        compoundidmode => 'LabelPrefix',
        compoundidlabel => 'CompoundID',
        datafieldsmode => 'CompoundID',
        filter => 'Yes',
        keeplargestcomponent => 'Yes',
        mode => 'MACCSKeyBits',
        output => 'text',
        outdelim => 'comma',
        quote => 'yes',
        size => 166,
        vectorstringformat => 'ValuesString',
    );

    # Retrieve all the options...
    my @OptionSpecs = (
        "aromaticitymodel=s", "bitsorder=s", "bitstringformat|b=s", "compoundid=s",
        "compoundidlabel=s", "compoundidmode=s", "datafields=s", "datafieldsmode|d=s",
        "filter|f=s", "fingerprintslabel=s", "help|h", "keeplargestcomponent|k=s",
        "mode|m=s", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "root|r=s",
        "size|s=i", "vectorstringformat|v=s", "workingdir|w=s",
    );
    unless (GetOptions(\%Options, @OptionSpecs)) {
        die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
    }

    if ($Options{workingdir}) {
        (-d $Options{workingdir}) or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
        chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
    }

    unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
        my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
        die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
    }

    # Options with a fixed set of allowed values, checked in this order:
    # option key, pattern of valid values, option name and allowed values
    # text used in the error message...
    my @EnumeratedOptionChecks = (
        ['bitsorder', qr/^(Ascending|Descending)$/i, '--BitsOrder', 'Ascending or Descending'],
        ['bitstringformat', qr/^(BinaryString|HexadecimalString)$/i, '-b, --bitstringformat', 'BinaryString or HexadecimalString'],
        ['compoundidmode', qr/^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i, '--CompoundIDMode', 'DataField, MolName, LabelPrefix or MolNameOrLabelPrefix'],
        ['datafieldsmode', qr/^(All|Common|Specify|CompoundID)$/i, '-d, --DataFieldsMode', 'All, Common, Specify or CompoundID'],
        ['filter', qr/^(Yes|No)$/i, '-f, --Filter', 'Yes or No'],
        ['keeplargestcomponent', qr/^(Yes|No)$/i, '-k, --KeepLargestComponent', 'Yes or No'],
        ['mode', qr/^(MACCSKeyBits|MACCSKeyCount)$/i, '-m, --mode', 'MACCSKeyBits or MACCSKeyCount'],
        ['output', qr/^(SD|FP|text|all)$/i, '--output', 'SD, FP, text, or all'],
        ['outdelim', qr/^(comma|semicolon|tab)$/i, '--outdelim', 'comma, tab, or semicolon'],
        ['quote', qr/^(Yes|No)$/i, '-q --quote', 'Yes or No'],
    );
    for my $CheckRef (@EnumeratedOptionChecks) {
        my($OptionKey, $ValidPattern, $OptionName, $AllowedValues) = @{$CheckRef};

        next if $Options{$OptionKey} =~ $ValidPattern;
        die "Error: The value specified, $Options{$OptionKey}, for option \"$OptionName\" is not valid. Allowed values: $AllowedValues\n";
    }

    # Semicolon delimited output requires quoted values...
    if ($Options{outdelim} =~ /semicolon/i && $Options{quote} =~ /^No$/i) {
        die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not allowed with, semicolon value of \"--outdelim\" option: Fingerprints string use semicolon as delimiter for various data fields and must be quoted.\n";
    }

    my $SizeIsValid = IsPositiveInteger($Options{size}) && ($Options{size} == 166 || $Options{size} == 322);
    unless ($SizeIsValid) {
        die "Error: The value specified, $Options{size}, for option \"-s, --size\" is not valid. Allowed values: 166 or 322 \n";
    }

    unless ($Options{vectorstringformat} =~ /^(ValuesString|IDsAndValuesString|IDsAndValuesPairsString|ValuesAndIDsString|ValuesAndIDsPairsString)$/i) {
        die "Error: The value specified, $Options{vectorstringformat}, for option \"-v, --VectorStringFormat\" is not valid. Allowed values: ValuesString, IDsAndValuesString, IDsAndValuesPairsString, ValuesAndIDsString or ValuesAndIDsPairsString\n";
    }
}
666
667 __END__
668
669 =head1 NAME
670
671 MACCSKeysFingerprints.pl - Generate MACCS key fingerprints for SD files
672
673 =head1 SYNOPSIS
674
675 MACCSKeysFingerprints.pl SDFile(s)...
676
677 MACCSKeysFingerprints.pl [B<--AromaticityModel> I<AromaticityModelType>]
678 [B<--BitsOrder> I<Ascending | Descending>]
679 [B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>]
680 [B<--CompoundID> I<DataFieldName or LabelPrefixString>] [B<--CompoundIDLabel> I<text>]
681 [B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>]
682 [B<--DataFields> I<"FieldLabel1,FieldLabel2,...">] [B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>]
683 [B<-f, --Filter> I<Yes | No>] [B<--FingerprintsLabel> I<text>] [B<-h, --help>] [B<-k, --KeepLargestComponent> I<Yes | No>]
684 [B<-m, --mode> I<MACCSKeyBits | MACCSKeyCount>] [B<--OutDelim> I<comma | tab | semicolon>]
685 [B<--output> I<SD | FP | text | all>] [B<-o, --overwrite>]
686 [B<-q, --quote> I<Yes | No>] [B<-r, --root> I<RootName>] [B<-s, --size> I<number>]
687 [B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>]
688 [B<-w, --WorkingDir> I<DirName>]
689
690 =head1 DESCRIPTION
691
692 Generate MACCS (Molecular ACCess System) keys fingerprints [ Ref 45-47 ] for I<SDFile(s)>
693 and create appropriate SD, FP or CSV/TSV text file(s) containing fingerprints bit-vector or
694 vector strings corresponding to molecular fingerprints.
695
696 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
697 and I<.sd>. All other file names are ignored. All the SD files in a current directory
698 can be specified either by I<*.sdf> or the current directory name.
699
700 For each MACCS keys definition, atoms are processed to determine their membership to the key
701 and the appropriate molecular fingerprints strings are generated. An atom can belong to multiple
702 MACCS keys.
703
704 For I<MACCSKeyBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing
705 zeros and ones is generated and for I<MACCSKeyCount> value, a fingerprint vector string
706 corresponding to number of MACCS keys [ Ref 45-47 ] is generated.
707
708 I<MACCSKeyBits | MACCSKeyCount> values for B<-m, --mode> option along with two possible
709 I<166 | 322> values of B<-s, --size> support generation of four different types of MACCS
710 keys fingerprint: I<MACCS166KeyBits, MACCS166KeyCount, MACCS322KeyBits, MACCS322KeyCount>.
711
712 Example of I<SD> file containing MACCS keys fingerprints string data:
713
714 ... ...
715 ... ...
716 $$$$
717 ... ...
718 ... ...
719 ... ...
720 41 44 0 0 0 0 0 0 0 0999 V2000
721 -3.3652 1.4499 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
722 ... ...
723 2 3 1 0 0 0 0
724 ... ...
725 M END
726 > <CmpdID>
727 Cmpd1
728
729 > <MACCSKeysFingerprints>
730 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;000000000
731 00000000000000000000000000000000100100001001000000001001000000001110001
732 00101010111100011011000100110110000011011110100110111111111111011111111
733 11111111110111000
734
735 $$$$
736 ... ...
737 ... ...
738
739 Example of I<FP> file containing MACCS keys fingerprints string data:
740
741 #
742 # Package = MayaChemTools 7.4
743 # Release Date = Oct 21, 2010
744 #
745 # TimeStamp = Fri Mar 11 14:57:24 2011
746 #
747 # FingerprintsStringType = FingerprintsBitVector
748 #
749 # Description = MACCSKeyBits
750 # Size = 166
751 # BitStringFormat = BinaryString
752 # BitsOrder = Ascending
753 #
754 Cmpd1 00000000000000000000000000000000000000000100100001001000000001...
755 Cmpd2 00000000000000000000000010000000001000000010000000001000000000...
756 ... ...
757 ... ..
758
759 Example of CSV I<Text> file containing MACCS keys fingerprints string data:
760
761 "CompoundID","MACCSKeysFingerprints"
762 "Cmpd1","FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;
763 00000000000000000000000000000000000000000100100001001000000001001000000
764 00111000100101010111100011011000100110110000011011110100110111111111111
765 01111111111111111110111000"
766 ... ...
767 ... ...
768
769 The current release of MayaChemTools generates the following types of MACCS keys
770 fingerprints bit-vector and vector strings:
771
772 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
773 0000000000000000000000000000000001001000010010000000010010000000011100
774 0100101010111100011011000100110110000011011110100110111111111111011111
775 11111111111110111000
776
777 FingerprintsBitVector;MACCSKeyBits;166;HexadecimalString;Ascending;000
778 000000021210210e845f8d8c60b79dffbffffd1
779
780 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
781 1110011111100101111111000111101100110000000000000011100010000000000000
782 0000000000000000000000000000000000000000000000101000000000000000000000
783 0000000000000000000000000000000000000000000000000000000000000000000000
784 0000000000000000000000000000000000000011000000000000000000000000000000
785 0000000000000000000000000000000000000000
786
787 FingerprintsBitVector;MACCSKeyBits;322;HexadecimalString;Ascending;7d7
788 e7af3edc000c1100000000000000500000000000000000000000000000000300000000
789 000000000
790
791 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
792 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
793 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
794 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
795 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
796 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
797
798 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
799 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
800 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
801 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
802 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
803 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
804
805 =head1 OPTIONS
806
807 =over 4
808
809 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
810
811 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
812 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
813 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
814 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
815
816 The supported aromaticity model names along with model specific control parameters
817 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
818 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
819 this file during class instantiation and makes it available to method B<DetectAromaticity>
820 for detecting aromaticity corresponding to a specific model.
821
822 =item B<--BitsOrder> I<Ascending | Descending>
823
824 Bits order to use during generation of fingerprints bit-vector string for I<MACCSKeyBits> value of
825 B<-m, --mode> option. Possible values: I<Ascending, Descending>. Default: I<Ascending>.
826
827 I<Ascending> bit order corresponds to the first bit in each byte being the lowest bit, as
828 opposed to the highest bit.
829
830 Internally, bits are stored in I<Ascending> order using Perl vec function. Regardless
831 of machine order, big-endian or little-endian, vec function always considers first
832 string byte as the lowest byte and first bit within each byte as the lowest bit.
833
834 =item B<-b, --BitStringFormat> I<BinaryString | HexadecimalString>
835
836 Format of fingerprints bit-vector string data in output SD, FP or CSV/TSV text file(s) specified by
837 B<--output> used during I<MACCSKeyBits> value of B<-m, --mode> option. Possible
838 values: I<BinaryString, HexadecimalString>. Default value: I<BinaryString>.
839
840 I<BinaryString> corresponds to an ASCII string containing 1s and 0s. I<HexadecimalString>
841 contains bit values in ASCII hexadecimal format.
842
843 Examples:
844
845 FingerprintsBitVector;MACCSKeyBits;166;BinaryString;Ascending;00000000
846 0000000000000000000000000000000001001000010010000000010010000000011100
847 0100101010111100011011000100110110000011011110100110111111111111011111
848 11111111111110111000
849
850 FingerprintsBitVector;MACCSKeyBits;166;HexadecimalString;Ascending;000
851 000000021210210e845f8d8c60b79dffbffffd1
852
853 FingerprintsBitVector;MACCSKeyBits;322;BinaryString;Ascending;11101011
854 1110011111100101111111000111101100110000000000000011100010000000000000
855 0000000000000000000000000000000000000000000000101000000000000000000000
856 0000000000000000000000000000000000000000000000000000000000000000000000
857 0000000000000000000000000000000000000011000000000000000000000000000000
858 0000000000000000000000000000000000000000
859
860 FingerprintsBitVector;MACCSKeyBits;322;HexadecimalString;Ascending;7d7
861 e7af3edc000c1100000000000000500000000000000000000000000000000300000000
862 000000000
863
864 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
865
866 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
867
868 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
869 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
870 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
871 look like Cmpd<Number>.
872
873 Examples for I<DataField> value of B<--CompoundIDMode>:
874
875 MolID
876 ExtReg
877
878 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
879
880 Compound
881
882 The value specified above generates compound IDs which correspond to Compound<Number>
883 instead of default value of Cmpd<Number>.
884
885 =item B<--CompoundIDLabel> I<text>
886
887 Specify compound ID column label for FP or CSV/TSV text file(s) used during I<CompoundID> value
888 of B<--DataFieldsMode> option. Default: I<CompoundID>.
889
890 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
891
892 Specify how to generate compound IDs and write to FP or CSV/TSV text file(s) along with generated
893 fingerprints for I<FP | text | all> values of B<--output> option: use a I<SDFile(s)> datafield value;
894 use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix; use combination
895 of both MolName and LabelPrefix with usage of LabelPrefix values for empty molname lines.
896
897 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
898 Default: I<LabelPrefix>.
899
900 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
901 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
902 values are replaced with sequential compound IDs.
903
904 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
905
906 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
907
908 Comma delimited list of I<SDFile(s)> data fields to extract and write to CSV/TSV text file(s) along
909 with generated fingerprints for I<text | all> values of B<--output> option.
910
911 This is only used for I<Specify> value of B<--DataFieldsMode> option.
912
913 Examples:
914
915 Extreg
916 MolID,CompoundName
917
918 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
919
920 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
921 with generated fingerprints for I<text | all> values of B<--output> option: transfer all SD
922 data fields; transfer SD data fields common to all compounds; extract specified data fields;
923 generate a compound ID using molname line, a compound prefix, or a combination of both.
924 Possible values: I<All | Common | Specify | CompoundID>. Default value: I<CompoundID>.
925
926 =item B<-f, --Filter> I<Yes | No>
927
928 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
929 Default value: I<Yes>.
930
931 By default, compound data is checked before calculating fingerprints and compounds containing
932 atom data corresponding to non-element symbols or no atom data are ignored.
933
934 =item B<--FingerprintsLabel> I<text>
935
936 SD data label or text file column label to use for fingerprints string in output SD or
937 CSV/TSV text file(s) specified by B<--output>. Default value: I<MACCSKeyFingerprints>.
938
939 =item B<-h, --help>
940
941 Print this help message.
942
943 =item B<-k, --KeepLargestComponent> I<Yes | No>
944
945 Generate fingerprints for only the largest component in molecule. Possible values:
946 I<Yes or No>. Default value: I<Yes>.
947
948 For molecules containing multiple connected components, fingerprints can be generated
949 in two different ways: use all connected components or just the largest connected
950 component. By default, all atoms except for the largest connected component are
951 deleted before generation of fingerprints.
952
953 =item B<-m, --mode> I<MACCSKeyBits | MACCSKeyCount>
954
955 Specify type of MACCS keys [ Ref 45-47 ] fingerprints to generate for molecules in I<SDFile(s)>.
956 Possible values: I<MACCSKeyBits, MACCSKeyCount>. Default value: I<MACCSKeyBits>.
957
958 For I<MACCSKeyBits> value of B<-m, --mode> option, a fingerprint bit-vector string containing
959 zeros and ones is generated and for I<MACCSKeyCount> value, a fingerprint vector string
960 corresponding to number of MACCS keys is generated.
961
962 I<MACCSKeyBits | MACCSKeyCount> values for B<-m, --mode> option along with two possible
963 I<166 | 322> values of B<-s, --size> support generation of four different types of MACCS
964 keys fingerprints: I<MACCS166KeyBits, MACCS166KeyCount, MACCS322KeyBits, MACCS322KeyCount>.
965
966 Definition of MACCS keys uses the following atom and bond symbols to define atom and
967 bond environments:
968
969 Atom symbols for 166 keys [ Ref 47 ]:
970
971 A : Any valid periodic table element symbol
972 Q : Hetero atoms; any non-C or non-H atom
973 X : Halogens; F, Cl, Br, I
974 Z : Others; other than H, C, N, O, Si, P, S, F, Cl, Br, I
975
976 Atom symbols for 322 keys [ Ref 46 ]:
977
978 A : Any valid periodic table element symbol
979 Q : Hetero atoms; any non-C or non-H atom
980 X : Others; other than H, C, N, O, Si, P, S, F, Cl, Br, I
981 Z is neither defined nor used
982
983 Bond types:
984
985 - : Single
986 = : Double
987 T : Triple
988 # : Triple
989 ~ : Single or double query bond
990 % : An aromatic query bond
991
992 None : Any bond type; no explicit bond specified
993
994 $ : Ring bond; $ before a bond type specifies ring bond
995 ! : Chain or non-ring bond; ! before a bond type specifies chain bond
996
997 @ : A ring linkage and the number following it specifies the
998 atom's position in the line, thus @1 means linked back to the first
999 atom in the list.
1000
1001 Aromatic: Kekule or Arom5
1002
1003 Kekule: Bonds in 6-membered rings with alternate single/double bonds
1004 or perimeter bonds
1005 Arom5: Bonds in 5-membered rings with two double bonds and a hetero
1006 atom at the apex of the ring.
1007
1008 MACCS 166 keys [ Ref 45-47 ] are defined as follows:
1009
1010 Key Description
1011
1012 1 ISOTOPE
1013 2 103 < ATOMIC NO. < 256
1014 3 GROUP IVA,VA,VIA PERIODS 4-6 (Ge...)
1015 4 ACTINIDE
1016 5 GROUP IIIB,IVB (Sc...)
1017 6 LANTHANIDE
1018 7 GROUP VB,VIB,VIIB (V...)
1019 8 QAAA@1
1020 9 GROUP VIII (Fe...)
1021 10 GROUP IIA (ALKALINE EARTH)
1022 11 4M RING
1023 12 GROUP IB,IIB (Cu...)
1024 13 ON(C)C
1025 14 S-S
1026 15 OC(O)O
1027 16 QAA@1
1028 17 CTC
1029 18 GROUP IIIA (B...)
1030 19 7M RING
1031 20 SI
1032 21 C=C(Q)Q
1033 22 3M RING
1034 23 NC(O)O
1035 24 N-O
1036 25 NC(N)N
1037 26 C$=C($A)$A
1038 27 I
1039 28 QCH2Q
1040 29 P
1041 30 CQ(C)(C)A
1042 31 QX
1043 32 CSN
1044 33 NS
1045 34 CH2=A
1046 35 GROUP IA (ALKALI METAL)
1047 36 S HETEROCYCLE
1048 37 NC(O)N
1049 38 NC(C)N
1050 39 OS(O)O
1051 40 S-O
1052 41 CTN
1053 42 F
1054 43 QHAQH
1055 44 OTHER
1056 45 C=CN
1057 46 BR
1058 47 SAN
1059 48 OQ(O)O
1060 49 CHARGE
1061 50 C=C(C)C
1062 51 CSO
1063 52 NN
1064 53 QHAAAQH
1065 54 QHAAQH
1066 55 OSO
1067 56 ON(O)C
1068 57 O HETEROCYCLE
1069 58 QSQ
1070 59 Snot%A%A
1071 60 S=O
1072 61 AS(A)A
1073 62 A$A!A$A
1074 63 N=O
1075 64 A$A!S
1076 65 C%N
1077 66 CC(C)(C)A
1078 67 QS
1079 68 QHQH (&...)
1080 69 QQH
1081 70 QNQ
1082 71 NO
1083 72 OAAO
1084 73 S=A
1085 74 CH3ACH3
1086 75 A!N$A
1087 76 C=C(A)A
1088 77 NAN
1089 78 C=N
1090 79 NAAN
1091 80 NAAAN
1092 81 SA(A)A
1093 82 ACH2QH
1094 83 QAAAA@1
1095 84 NH2
1096 85 CN(C)C
1097 86 CH2QCH2
1098 87 X!A$A
1099 88 S
1100 89 OAAAO
1101 90 QHAACH2A
1102 91 QHAAACH2A
1103 92 OC(N)C
1104 93 QCH3
1105 94 QN
1106 95 NAAO
1107 96 5M RING
1108 97 NAAAO
1109 98 QAAAAA@1
1110 99 C=C
1111 100 ACH2N
1112 101 8M RING
1113 102 QO
1114 103 CL
1115 104 QHACH2A
1116 105 A$A($A)$A
1117 106 QA(Q)Q
1118 107 XA(A)A
1119 108 CH3AAACH2A
1120 109 ACH2O
1121 110 NCO
1122 111 NACH2A
1123 112 AA(A)(A)A
1124 113 Onot%A%A
1125 114 CH3CH2A
1126 115 CH3ACH2A
1127 116 CH3AACH2A
1128 117 NAO
1129 118 ACH2CH2A > 1
1130 119 N=A
1131 120 HETEROCYCLIC ATOM > 1 (&...)
1132 121 N HETEROCYCLE
1133 122 AN(A)A
1134 123 OCO
1135 124 QQ
1136 125 AROMATIC RING > 1
1137 126 A!O!A
1138 127 A$A!O > 1 (&...)
1139 128 ACH2AAACH2A
1140 129 ACH2AACH2A
1141 130 QQ > 1 (&...)
1142 131 QH > 1
1143 132 OACH2A
1144 133 A$A!N
1145 134 X (HALOGEN)
1146 135 Nnot%A%A
1147 136 O=A > 1
1148 137 HETEROCYCLE
1149 138 QCH2A > 1 (&...)
1150 139 OH
1151 140 O > 3 (&...)
1152 141 CH3 > 2 (&...)
1153 142 N > 1
1154 143 A$A!O
1155 144 Anot%A%Anot%A
1156 145 6M RING > 1
1157 146 O > 2
1158 147 ACH2CH2A
1159 148 AQ(A)A
1160 149 CH3 > 1
1161 150 A!A$A!A
1162 151 NH
1163 152 OC(C)C
1164 153 QCH2A
1165 154 C=O
1166 155 A!CH2!A
1167 156 NA(A)A
1168 157 C-O
1169 158 C-N
1170 159 O > 1
1171 160 CH3
1172 161 N
1173 162 AROMATIC
1174 163 6M RING
1175 164 O
1176 165 RING
1177 166 FRAGMENTS
1178
1179 MACCS 322 keys set as defined in tables 1, 2 and 3 [ Ref 46 ] include:
1180
1181 . 26 atom properties of type P, as listed in Table 1
1182 . 32 one-atom environments, as listed in Table 3
1183 . 264 atom-bond-atom combinations listed in Table 4
1184
1185 Total number of keys in three tables is : 322
1186
1187 Atom symbol, X, used for 322 keys [ Ref 46 ] doesn't refer to Halogens as it does for 166 keys. In
1188 order to keep the definition of 322 keys consistent with the published definitions, the symbol X is
1189 used to imply "others" atoms, but it's internally mapped to symbol Z as defined for 166 keys
1190 during the generation of key values.
1191
1192 Atom properties-based keys (26):
1193
1194 Key Description
1195 1 A(AAA) or AA(A)A - atom with at least three neighbors
1196 2 Q - heteroatom
1197 3 Anot%not-A - atom involved in one or more multiple bonds, not aromatic
1198 4 A(AAAA) or AA(A)(A)A - atom with at least four neighbors
1199 5 A(QQ) or QA(Q) - atom with at least two heteroatom neighbors
1200 6 A(QQQ) or QA(Q)Q - atom with at least three heteroatom neighbors
1201 7 QH - heteroatom with at least one hydrogen attached
1202 8 CH2(AA) or ACH2A - carbon with at least two single bonds and at least
1203 two hydrogens attached
1204 9 CH3(A) or ACH3 - carbon with at least one single bond and at least three
1205 hydrogens attached
1206 10 Halogen
1207 11 A(-A-A-A) or A-A(-A)-A - atom has at least three single bonds
1208 12 AAAAAA@1 > 2 - atom is in at least two different six-membered rings
1209 13 A($A$A$A) or A$A($A)$A - atom has more than two ring bonds
1210 14 A$A!A$A - atom is at a ring/chain boundary. When a comparison is done
1211 with another atom the path passes through the chain bond.
1212 15 Anot%A%Anot%A - atom is at an aromatic/nonaromatic boundary. When a
1213 comparison is done with another atom the path
1214 passes through the aromatic bond.
1215 16 A!A!A - atom with more than one chain bond
1216 17 A!A$A!A - atom is at a ring/chain boundary. When a comparison is done
1217 with another atom the path passes through the ring bond.
1218 18 A%Anot%A%A - atom is at an aromatic/nonaromatic boundary. When a
1219 comparison is done with another atom the
1220 path passes through the nonaromatic bond.
1221 19 HETEROCYCLE - atom is a heteroatom in a ring.
1222 20 rare properties: atom with five or more neighbors, atom in
1223 four or more rings, or atom types other than
1224 H, C, N, O, S, F, Cl, Br, or I
1225 21 rare properties: atom has a charge, is an isotope, has two or
1226 more multiple bonds, or has a triple bond.
1227 22 N - nitrogen
1228 23 S - sulfur
1229 24 O - oxygen
1230 25 A(AA)A(A)A(AA) - atom has two neighbors, each with three or
1231 more neighbors (including the central atom).
1232 26 CH2ACH2 - atom has two hydrocarbon (CH2) neighbors
1233
1234 Atomic environments properties-based keys (32):
1235
1236 Key Description
1237 27 C(CC)
1238 28 C(CCC)
1239 29 C(CN)
1240 30 C(CCN)
1241 31 C(NN)
1242 32 C(NNC)
1243 33 C(NNN)
1244 34 C(CO)
1245 35 C(CCO)
1246 36 C(NO)
1247 37 C(NCO)
1248 38 C(NNO)
1249 39 C(OO)
1250 40 C(COO)
1251 41 C(NOO)
1252 42 C(OOO)
1253 43 Q(CC)
1254 44 Q(CCC)
1255 45 Q(CN)
1256 46 Q(CCN)
1257 47 Q(NN)
1258 48 Q(CNN)
1259 49 Q(NNN)
1260 50 Q(CO)
1261 51 Q(CCO)
1262 52 Q(NO)
1263 53 Q(CNO)
1264 54 Q(NNO)
1265 55 Q(OO)
1266 56 Q(COO)
1267 57 Q(NOO)
1268 58 Q(OOO)
1269
1270 Note: The first symbol is the central atom, with atoms bonded to the central atom listed in
1271 parentheses. Q is any non-C, non-H atom. If only two atoms are in parentheses, there is
1272 no implication concerning the other atoms bonded to the central atom.
1273
1274 Atom-Bond-Atom properties-based keys: (264)
1275
1276 Key Description
1277 59 C-C
1278 60 C-N
1279 61 C-O
1280 62 C-S
1281 63 C-Cl
1282 64 C-P
1283 65 C-F
1284 66 C-Br
1285 67 C-Si
1286 68 C-I
1287 69 C-X
1288 70 N-N
1289 71 N-O
1290 72 N-S
1291 73 N-Cl
1292 74 N-P
1293 75 N-F
1294 76 N-Br
1295 77 N-Si
1296 78 N-I
1297 79 N-X
1298 80 O-O
1299 81 O-S
1300 82 O-Cl
1301 83 O-P
1302 84 O-F
1303 85 O-Br
1304 86 O-Si
1305 87 O-I
1306 88 O-X
1307 89 S-S
1308 90 S-Cl
1309 91 S-P
1310 92 S-F
1311 93 S-Br
1312 94 S-Si
1313 95 S-I
1314 96 S-X
1315 97 Cl-Cl
1316 98 Cl-P
1317 99 Cl-F
1318 100 Cl-Br
1319 101 Cl-Si
1320 102 Cl-I
1321 103 Cl-X
1322 104 P-P
1323 105 P-F
1324 106 P-Br
1325 107 P-Si
1326 108 P-I
1327 109 P-X
1328 110 F-F
1329 111 F-Br
1330 112 F-Si
1331 113 F-I
1332 114 F-X
1333 115 Br-Br
1334 116 Br-Si
1335 117 Br-I
1336 118 Br-X
1337 119 Si-Si
1338 120 Si-I
1339 121 Si-X
1340 122 I-I
1341 123 I-X
1342 124 X-X
1343 125 C=C
1344 126 C=N
1345 127 C=O
1346 128 C=S
1347 129 C=Cl
1348 130 C=P
1349 131 C=F
1350 132 C=Br
1351 133 C=Si
1352 134 C=I
1353 135 C=X
1354 136 N=N
1355 137 N=O
1356 138 N=S
1357 139 N=Cl
1358 140 N=P
1359 141 N=F
1360 142 N=Br
1361 143 N=Si
1362 144 N=I
1363 145 N=X
1364 146 O=O
1365 147 O=S
1366 148 O=Cl
1367 149 O=P
1368 150 O=F
1369 151 O=Br
1370 152 O=Si
1371 153 O=I
1372 154 O=X
1373 155 S=S
1374 156 S=Cl
1375 157 S=P
1376 158 S=F
1377 159 S=Br
1378 160 S=Si
1379 161 S=I
1380 162 S=X
1381 163 Cl=Cl
1382 164 Cl=P
1383 165 Cl=F
1384 166 Cl=Br
1385 167 Cl=Si
1386 168 Cl=I
1387 169 Cl=X
1388 170 P=P
1389 171 P=F
1390 172 P=Br
1391 173 P=Si
1392 174 P=I
1393 175 P=X
1394 176 F=F
1395 177 F=Br
1396 178 F=Si
1397 179 F=I
1398 180 F=X
1399 181 Br=Br
1400 182 Br=Si
1401 183 Br=I
1402 184 Br=X
1403 185 Si=Si
1404 186 Si=I
1405 187 Si=X
1406 188 I=I
1407 189 I=X
1408 190 X=X
1409 191 C#C
1410 192 C#N
1411 193 C#O
1412 194 C#S
1413 195 C#Cl
1414 196 C#P
1415 197 C#F
1416 198 C#Br
1417 199 C#Si
1418 200 C#I
1419 201 C#X
1420 202 N#N
1421 203 N#O
1422 204 N#S
1423 205 N#Cl
1424 206 N#P
1425 207 N#F
1426 208 N#Br
1427 209 N#Si
1428 210 N#I
1429 211 N#X
1430 212 O#O
1431 213 O#S
1432 214 O#Cl
1433 215 O#P
1434 216 O#F
1435 217 O#Br
1436 218 O#Si
1437 219 O#I
1438 220 O#X
1439 221 S#S
1440 222 S#Cl
1441 223 S#P
1442 224 S#F
1443 225 S#Br
1444 226 S#Si
1445 227 S#I
1446 228 S#X
1447 229 Cl#Cl
1448 230 Cl#P
1449 231 Cl#F
1450 232 Cl#Br
1451 233 Cl#Si
1452 234 Cl#I
1453 235 Cl#X
1454 236 P#P
1455 237 P#F
1456 238 P#Br
1457 239 P#Si
1458 240 P#I
1459 241 P#X
1460 242 F#F
1461 243 F#Br
1462 244 F#Si
1463 245 F#I
1464 246 F#X
1465 247 Br#Br
1466 248 Br#Si
1467 249 Br#I
1468 250 Br#X
1469 251 Si#Si
1470 252 Si#I
1471 253 Si#X
1472 254 I#I
1473 255 I#X
1474 256 X#X
1475 257 C$C
1476 258 C$N
1477 259 C$O
1478 260 C$S
1479 261 C$Cl
1480 262 C$P
1481 263 C$F
1482 264 C$Br
1483 265 C$Si
1484 266 C$I
1485 267 C$X
1486 268 N$N
1487 269 N$O
1488 270 N$S
1489 271 N$Cl
1490 272 N$P
1491 273 N$F
1492 274 N$Br
1493 275 N$Si
1494 276 N$I
1495 277 N$X
1496 278 O$O
1497 279 O$S
1498 280 O$Cl
1499 281 O$P
1500 282 O$F
1501 283 O$Br
1502 284 O$Si
1503 285 O$I
1504 286 O$X
1505 287 S$S
1506 288 S$Cl
1507 289 S$P
1508 290 S$F
1509 291 S$Br
1510 292 S$Si
1511 293 S$I
1512 294 S$X
1513 295 Cl$Cl
1514 296 Cl$P
1515 297 Cl$F
1516 298 Cl$Br
1517 299 Cl$Si
1518 300 Cl$I
1519 301 Cl$X
1520 302 P$P
1521 303 P$F
1522 304 P$Br
1523 305 P$Si
1524 306 P$I
1525 307 P$X
1526 308 F$F
1527 309 F$Br
1528 310 F$Si
1529 311 F$I
1530 312 F$X
1531 313 Br$Br
1532 314 Br$Si
1533 315 Br$I
1534 316 Br$X
1535 317 Si$Si
1536 318 Si$I
1537 319 Si$X
1538 320 I$I
1539 321 I$X
1540 322 X$X
1541
1542 =item B<--OutDelim> I<comma | tab | semicolon>
1543
1544 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1545 Default value: I<comma>.
1546
1547 =item B<--output> I<SD | FP | text | all>
1548
1549 Type of output files to generate. Possible values: I<SD, FP, text, or all>. Default value: I<text>.
1550
1551 =item B<-o, --overwrite>
1552
1553 Overwrite existing files.
1554
1555 =item B<-q, --quote> I<Yes | No>
1556
1557 Put quotes around column values in output CSV/TSV text file(s). Possible values:
1558 I<Yes or No>. Default value: I<Yes>.
1559
1560 =item B<-r, --root> I<RootName>
1561
1562 New file name is generated using the root: <Root>.<Ext>. Default for new file
1563 names: <SDFileName><MACCSKeysFP>.<Ext>. The file type determines <Ext> value.
1564 The sdf, fpf, csv, and tsv <Ext> values are used for SD, FP, comma/semicolon, and tab
1565 delimited text files, respectively. This option is ignored for multiple input files.
1566
1567 =item B<-s, --size> I<number>
1568
1569 Size of MACCS keys [ Ref 45-47 ] set to use during fingerprints generation. Possible values: I<166 or 322>.
1570 Default value: I<166>.
1571
1572 =item B<-v, --VectorStringFormat> I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString | ValuesAndIDsPairsString>
1573
1574 Format of fingerprints vector string data in output SD, FP or CSV/TSV text file(s) specified by
1575 B<--output> used during I<MACCSKeyCount> value of B<-m, --mode> option. Possible
1576 values: I<ValuesString | IDsAndValuesString | IDsAndValuesPairsString | ValuesAndIDsString |
1577 ValuesAndIDsPairsString>. Default value: I<ValuesString>.
1578
1579 Examples:
1580
1581 FingerprintsVector;MACCSKeyCount;166;OrderedNumericalValues;ValuesStri
1582 ng;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1583 0 0 0 0 0 0 0 1 0 0 3 0 0 0 0 4 0 0 2 0 0 0 0 0 0 0 0 2 0 0 2 0 0 0 0
1584 0 0 0 0 1 1 8 0 0 0 1 0 0 1 0 1 0 1 0 3 1 3 1 0 0 0 1 2 0 11 1 0 0 0
1585 5 0 0 1 2 0 1 1 0 0 0 0 0 1 1 0 1 1 1 1 0 4 0 0 1 1 0 4 6 1 1 1 2 1 1
1586 3 5 2 2 0 5 3 5 1 1 2 5 1 2 1 2 4 8 3 5 5 2 2 0 3 5 4 1
1587
1588 FingerprintsVector;MACCSKeyCount;322;OrderedNumericalValues;ValuesStri
1589 ng;14 8 2 0 2 0 4 4 2 1 4 0 0 2 5 10 5 2 1 0 0 2 0 5 13 3 28 5 5 3 0 0
1590 0 4 2 1 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 22 5 3 0 0 0 1 0
1591 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1592 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
1593 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1594
1595 =item B<-w, --WorkingDir> I<DirName>
1596
1597 Location of working directory. Default: current directory.
1598
1599 =back
1600
1601 =head1 EXAMPLES
1602
1603 To generate MACCS keys fingerprints of size 166 in binary bit-vector string format
1604 and create a SampleMACCS166FPBin.csv file containing sequential compound IDs along with
1605 fingerprints bit-vector strings data, type:
1606
1607 % MACCSKeysFingerprints.pl -r SampleMACCS166FPBin -o Sample.sdf
1608
1609 To generate MACCS keys fingerprints of size 166 in binary bit-vector string format
1610 and create SampleMACCS166FPBin.sdf, SampleMACCS166FPBin.fpf and SampleMACCS166FPBin.csv
1611 files containing sequential compound IDs in CSV file along with fingerprints bit-vector strings data, type:
1612
1613 % MACCSKeysFingerprints.pl --output all -r SampleMACCS166FPBin
1614 -o Sample.sdf
1615
1616 To generate MACCS keys fingerprints of size 322 in binary bit-vector string format
1617 and create a SampleMACCS322FPBin.csv file containing sequential compound IDs along with
1618 fingerprints bit-vector strings data, type:
1619
1620 % MACCSKeysFingerprints.pl -size 322 -r SampleMACCS322FPBin -o Sample.sdf
1621
1622 To generate MACCS keys fingerprints of size 166 corresponding to count of keys in
1623 ValuesString format and create a SampleMACCS166FPCount.csv file containing sequential
1624 compound IDs along with fingerprints vector strings data, type:
1625
1626 % MACCSKeysFingerprints.pl -m MACCSKeyCount -r SampleMACCS166FPCount
1627 -o Sample.sdf
1628
1629 To generate MACCS keys fingerprints of size 322 corresponding to count of keys in
1630 ValuesString format and create a SampleMACCS322FPCount.csv file containing sequential
1631 compound IDs along with fingerprints vector strings data, type:
1632
1633 % MACCSKeysFingerprints.pl -m MACCSKeyCount -size 322
1634 -r SampleMACCS322FPCount -o Sample.sdf
1635
1636 To generate MACCS keys fingerprints of size 166 in hexadecimal bit-vector string format with
1637 ascending bits order and create a SampleMACCS166FPHex.csv file containing compound IDs
1638 from MolName along with fingerprints bit-vector strings data, type:
1639
1640 % MACCSKeysFingerprints.pl -m MACCSKeyBits --size 166 --BitStringFormat
1641 HexadecimalString --BitsOrder Ascending --DataFieldsMode CompoundID
1642 --CompoundIDMode MolName -r SampleMACCS166FPHex -o Sample.sdf
1643
1644 To generate MACCS keys fingerprints of size 166 corresponding to count of keys in
1645 IDsAndValuesString format and create a SampleMACCS166FPCount.csv file containing
1646 compound IDs from MolName line along with fingerprints vector strings data, type:
1647
1648 % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166
1649 --VectorStringFormat IDsAndValuesString --DataFieldsMode CompoundID
1650 --CompoundIDMode MolName -r SampleMACCS166FPCount -o Sample.sdf
1651
1652 To generate MACCS keys fingerprints of size 166 corresponding to count of keys in
1653 IDsAndValuesString format and create a SampleMACCS166FPCount.csv file containing
1654 compound IDs using specified data field along with fingerprints vector strings data, type:
1655
1656 % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166
1657 --VectorStringFormat IDsAndValuesString --DataFieldsMode CompoundID
1658 --CompoundIDMode DataField --CompoundID Mol_ID -r
1659 SampleMACCS166FPCount -o Sample.sdf
1660
1661 To generate MACCS keys fingerprints of size 322 corresponding to count of keys in
1662 ValuesString format and create a SampleMACCS322FPCount.tsv file containing compound
1663 IDs derived from combination of molecule name line and an explicit compound prefix
1664 along with fingerprints vector strings data in a column labeled MACCSKeyCountFP, type:
1665
1666 % MACCSKeysFingerprints.pl -m MACCSKeyCount -size 322 --DataFieldsMode
1667 CompoundID --CompoundIDMode MolnameOrLabelPrefix --CompoundID Cmpd
1668 --CompoundIDLabel MolID --FingerprintsLabel MACCSKeyCountFP --OutDelim
1669 Tab -r SampleMACCS322FPCount -o Sample.sdf
1670
1671 To generate MACCS keys fingerprints of size 166 corresponding to count of keys in
1672 ValuesString format and create a SampleMACCS166FPCount.csv file containing
1673 specific data fields columns along with fingerprints vector strings data, type:
1674
1675 % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166
1676 --VectorStringFormat ValuesString --DataFieldsMode Specify --DataFields
1677 Mol_ID -r SampleMACCS166FPCount -o Sample.sdf
1678
1679 To generate MACCS keys fingerprints of size 322 corresponding to count of keys in
1680 ValuesString format and create a SampleMACCS322FPCount.csv file containing
1681 common data fields columns along with fingerprints vector strings data, type:
1682
1683 % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 322
1684 --VectorStringFormat ValuesString --DataFieldsMode Common -r
1685 SampleMACCS322FPCount -o Sample.sdf
1686
1687 To generate MACCS keys fingerprints of size 166 corresponding to count of keys in
1688 ValuesString format and create SampleMACCS166FPCount.sdf, SampleMACCS166FPCount.fpf and
1689 SampleMACCS166FPCount.csv files containing all data fields columns in CSV file
1690 along with fingerprints vector strings data, type:
1691
1692 % MACCSKeysFingerprints.pl -m MACCSKeyCount --size 166 --output all
1693 --VectorStringFormat ValuesString --DataFieldsMode All -r
1694 SampleMACCS166FPCount -o Sample.sdf
1695
1696 =head1 AUTHOR
1697
1698 Manish Sud <msud@san.rr.com>
1699
1700 =head1 SEE ALSO
1701
1702 InfoFingerprintsFiles.pl, SimilarityMatricesFingerprints.pl, AtomNeighborhoodsFingerprints.pl,
1703 ExtendedConnectivityFingerprints.pl, PathLengthFingerprints.pl,
1704 TopologicalAtomPairsFingerprints.pl, TopologicalAtomTorsionsFingerprints.pl,
1705 TopologicalPharmacophoreAtomPairsFingerprints.pl, TopologicalPharmacophoreAtomTripletsFingerprints.pl
1706
1707 =head1 COPYRIGHT
1708
1709 Copyright (C) 2015 Manish Sud. All rights reserved.
1710
1711 This file is part of MayaChemTools.
1712
1713 MayaChemTools is free software; you can redistribute it and/or modify it under
1714 the terms of the GNU Lesser General Public License as published by the Free
1715 Software Foundation; either version 3 of the License, or (at your option)
1716 any later version.
1717
1718 =cut