comparison bin/CalculatePhysicochemicalProperties.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: CalculatePhysicochemicalProperties.pl,v $
4 # $Date: 2015/02/28 20:46:19 $
5 # $Revision: 1.20 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use TextUtil;
37 use SDFileUtil;
38 use MoleculeFileIO;
39 use Molecule;
40 use AtomTypes::AtomicInvariantsAtomTypes;
41 use AtomTypes::FunctionalClassAtomTypes;
42 use MolecularDescriptors::MolecularDescriptorsGenerator;
43
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Flush STDOUT after every write so progress messages appear immediately...
$| = 1;

# Announce the start of the run and start timing it...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Retrieve command line options; show usage when help is requested or no
# input file is specified...
SetupScriptUsage();
if ($Options{help} || !@ARGV) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Validate and cache option values...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect per file information and output file names...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process each input file which passed the checks...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  CalculatePhysicochemicalProperties($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report total elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
89
90 ###############################################################################
91
# Calculate physicochemical properties for all compounds in the SD file at the
# specified index in the input files list, write them out to the requested
# output file(s), and print summary statistics...
#
sub CalculatePhysicochemicalProperties {
  my($FileIndex) = @_;

  my $SDFile = $SDFilesList[$FileIndex];

  # Open output files first, then set up the descriptors generator used to
  # calculate values for the specified property names...
  my($NewSDFileRef, $NewTextFileRef) = SetupAndOpenOutputFiles($FileIndex);
  my $DescriptorsGenerator = SetupMolecularDescriptorsGenerator();

  my($TotalCount, $SkippedCount, $RuleOf5Hits, $RuleOf3Hits) = (0, 0, 0, 0);

  my $Reader = MoleculeFileIO->new('Name' => $SDFile);
  $Reader->Open();

  while (my $Molecule = $Reader->ReadMolecule()) {
    $TotalCount++;

    # Optionally filter compound data before calculating properties...
    if ($OptionsInfo{Filter} && CheckAndFilterCompound($TotalCount, $Molecule)) {
      $SkippedCount++;
      next;
    }

    # An undefined result indicates properties couldn't be calculated...
    my $PropertiesRef = CalculateMoleculeProperties($DescriptorsGenerator, $Molecule);
    unless (defined $PropertiesRef) {
      $SkippedCount++;
      ProcessIgnoredCompound('PropertiesCalculationFailed', $TotalCount, $Molecule);
      next;
    }

    # Track compounds with one or more rule violations...
    $RuleOf5Hits++ if ($OptionsInfo{RuleOf5Violations} && $PropertiesRef->{RuleOf5Violations});
    $RuleOf3Hits++ if ($OptionsInfo{RuleOf3Violations} && $PropertiesRef->{RuleOf3Violations});

    WriteDataToOutputFiles($FileIndex, $TotalCount, $Molecule, $PropertiesRef, $NewSDFileRef, $NewTextFileRef);
  }
  $Reader->Close();

  close $NewSDFileRef if ($OptionsInfo{SDOutput} && $NewSDFileRef);
  close $NewTextFileRef if ($OptionsInfo{TextOutput} && $NewTextFileRef);

  WriteCalculationSummaryStatistics($TotalCount, $SkippedCount, $RuleOf5Hits, $RuleOf3Hits);
}
156
# Emit a warning for a compound being ignored. The mode selects the reason
# reported; any unrecognized mode is reported as a failed properties
# calculation...
#
sub ProcessIgnoredCompound {
  my($Mode, $CmpdCount, $Molecule) = @_;

  my $FieldsRef = $Molecule->GetDataFieldLabelAndValues();
  my $CmpdID = SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $FieldsRef);

  if ($Mode =~ /^ContainsNonElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains atom data corresponding to non-elemental atom symbol(s)...\n\n";
  }
  elsif ($Mode =~ /^ContainsNoElementalData$/i) {
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Compound contains no atom data...\n\n";
  }
  else {
    # Covers PropertiesCalculationFailed along with any other mode value...
    warn "\nWarning: Ignoring compound record number $CmpdCount with ID $CmpdID: Physicochemical properties calculation didn't succeed...\n\n";
  }
}
184
# Decide whether a compound should be filtered out. Returns 1, after issuing a
# warning, for compounds containing non-elemental atom data or no elemental
# atom data at all; otherwise returns 0...
#
sub CheckAndFilterCompound {
  my($CmpdCount, $Molecule) = @_;

  my($NumOfElements, $NumOfNonElements) = $Molecule->GetNumOfElementsAndNonElements();

  # Non-elemental data is checked before missing elemental data...
  my $Problem = $NumOfNonElements ? 'ContainsNonElementalData'
              : !$NumOfElements   ? 'ContainsNoElementalData'
              :                     '';

  return 0 unless $Problem;

  ProcessIgnoredCompound($Problem, $CmpdCount, $Molecule);
  return 1;
}
205
# Print summary statistics for a processed file: total, successfully processed
# and ignored compound counts, followed by rule violation counts for any rule
# violation calculations which were requested...
#
sub WriteCalculationSummaryStatistics {
  my($CmpdCount, $IgnoredCmpdCount, $RuleOf5ViolationsCount, $RuleOf3ViolationsCount) = @_;

  my $ProcessedCmpdCount = $CmpdCount - $IgnoredCmpdCount;

  print "\nNumber of compounds: $CmpdCount\n";
  print "Number of compounds processed successfully during physicochemical properties calculation: $ProcessedCmpdCount\n";
  print "Number of compounds ignored during physicochemical properties calculation: $IgnoredCmpdCount\n";

  print "Number of compounds with one or more RuleOf5 violations: $RuleOf5ViolationsCount\n"
    if $OptionsInfo{RuleOf5Violations};

  print "Number of compounds with one or more RuleOf3 violations: $RuleOf3ViolationsCount\n"
    if $OptionsInfo{RuleOf3Violations};
}
227
# Open the SD and/or text output files for the input file at the specified
# index and return a pair of file handles: (SD handle, text handle). A handle
# is an empty string when the corresponding output type is not requested.
# Column labels are written to the text file as soon as it is opened.
#
# Files are opened using three-argument open on lexical handles, so that file
# names containing leading/trailing whitespace or characters such as '>', '<'
# and '|' are treated literally and handles are not shared through package
# globals. Dies when a file can't be opened...
#
sub SetupAndOpenOutputFiles {
  my($FileIndex) = @_;
  my($NewSDFile, $NewTextFile, $NewSDFileRef, $NewTextFileRef);

  $NewSDFileRef = '';
  $NewTextFileRef = '';

  if ($OptionsInfo{SDOutput}) {
    $NewSDFile = $SDFilesInfo{SDOutFileNames}[$FileIndex];
    print "Generating SD file $NewSDFile...\n";
    open my $SDFileHandle, '>', $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";
    $NewSDFileRef = $SDFileHandle;
  }
  if ($OptionsInfo{TextOutput}) {
    $NewTextFile = $SDFilesInfo{TextOutFileNames}[$FileIndex];
    print "Generating text file $NewTextFile...\n";
    open my $TextFileHandle, '>', $NewTextFile or die "Error: Couldn't open $NewTextFile: $! \n";
    WriteTextFileCoulmnLabels($FileIndex, $TextFileHandle);
    $NewTextFileRef = $TextFileHandle;
  }
  return ($NewSDFileRef, $NewTextFileRef);
}
252
# Write calculated physicochemical properties for a compound to the SD and/or
# text output files. Values are written in order of the specified property
# names, followed by RuleOf5Violations and RuleOf3Violations when requested...
#
sub WriteDataToOutputFiles {
  my($FileIndex, $CmpdCount, $Molecule, $PhysicochemicalPropertiesDataRef, $NewSDFileRef, $NewTextFileRef) = @_;

  # Names of calculated values to write out, in output order...
  my @OutputNames = @{$OptionsInfo{SpecifiedPropertyNames}};
  push @OutputNames, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @OutputNames, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  if ($OptionsInfo{SDOutput}) {
    # Write out the input compound string without its trailing delimiter line...
    my $CmpdString = $Molecule->GetInputMoleculeString();
    $CmpdString =~ s/\$\$\$\$$//;
    print $NewSDFileRef "$CmpdString";

    # Append calculated values as data fields...
    for my $PropertyName (@OutputNames) {
      my $PropertyValue = $PhysicochemicalPropertiesDataRef->{$PropertyName};
      print $NewSDFileRef "> <$PropertyName>\n$PropertyValue\n\n";
    }

    # Terminate the compound record...
    print $NewSDFileRef "\$\$\$\$\n";
  }

  if ($OptionsInfo{TextOutput}) {
    my $DataFieldLabelAndValuesRef = $Molecule->GetDataFieldLabelAndValues();
    my $DataFieldsMode = $OptionsInfo{DataFieldsMode};
    my @LineWords = ();

    if ($DataFieldsMode =~ /^CompoundID$/i) {
      push @LineWords, SetupCmpdIDForTextFiles($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef);
    }
    else {
      # Pick the data field labels to write out; missing values become empty strings...
      my $LabelsRef = ($DataFieldsMode =~ /^All$/i)     ? $SDFilesInfo{AllDataFieldsRef}[$FileIndex]
                    : ($DataFieldsMode =~ /^Common$/i)  ? $SDFilesInfo{CommonDataFieldsRef}[$FileIndex]
                    : ($DataFieldsMode =~ /^Specify$/i) ? $OptionsInfo{SpecifiedDataFields}
                    :                                     [];

      for my $Label (@{$LabelsRef}) {
        push @LineWords, (exists $DataFieldLabelAndValuesRef->{$Label} ? $DataFieldLabelAndValuesRef->{$Label} : '');
      }
    }

    # Append calculated values...
    push @LineWords, map { $PhysicochemicalPropertiesDataRef->{$_} } @OutputNames;

    my $Line = JoinWords(\@LineWords, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
    print $NewTextFileRef "$Line\n";
  }
}
329
# Write out appropriate column labels to the text file: data field labels
# selected by the data fields mode, followed by the specified property names
# and any requested rule violation labels...
#
sub WriteTextFileCoulmnLabels {
  my($FileIndex, $NewTextFileRef) = @_;

  my $DataFieldsMode = $OptionsInfo{DataFieldsMode};

  my @ColLabels = ($DataFieldsMode =~ /^All$/i)        ? @{$SDFilesInfo{AllDataFieldsRef}[$FileIndex]}
                : ($DataFieldsMode =~ /^Common$/i)     ? @{$SDFilesInfo{CommonDataFieldsRef}[$FileIndex]}
                : ($DataFieldsMode =~ /^Specify$/i)    ? @{$OptionsInfo{SpecifiedDataFields}}
                : ($DataFieldsMode =~ /^CompoundID$/i) ? ($OptionsInfo{CompoundIDLabel})
                :                                        ();

  # Physicochemical properties column labels...
  push @ColLabels, @{$OptionsInfo{SpecifiedPropertyNames}};

  # Rule violations column labels...
  push @ColLabels, 'RuleOf5Violations' if $OptionsInfo{RuleOf5Violations};
  push @ColLabels, 'RuleOf3Violations' if $OptionsInfo{RuleOf3Violations};

  my $Line = JoinWords(\@ColLabels, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $NewTextFileRef "$Line\n";
}
366
# Generate compound ID for text files according to the compound ID mode:
# molecule name, label prefix followed by compound number, or the value of a
# specified data field. An empty string is returned for a missing data field
# or an unrecognized mode...
#
sub SetupCmpdIDForTextFiles {
  my($CmpdCount, $Molecule, $DataFieldLabelAndValuesRef) = @_;

  my $IDMode = $OptionsInfo{CompoundIDMode};

  if ($IDMode =~ /^MolNameOrLabelPrefix$/i) {
    # Fall back to prefix and compound number for an empty molecule name...
    my $MolName = $Molecule->GetName();
    return $MolName if $MolName;
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^LabelPrefix$/i) {
    return "$OptionsInfo{CompoundID}${CmpdCount}";
  }

  if ($IDMode =~ /^DataField$/i) {
    my $FieldLabel = $OptionsInfo{CompoundID};
    return exists $DataFieldLabelAndValuesRef->{$FieldLabel} ? $DataFieldLabelAndValuesRef->{$FieldLabel} : '';
  }

  if ($IDMode =~ /^MolName$/i) {
    # Retrieve name into a scalar so that a single value is always returned...
    my $MolName = $Molecule->GetName();
    return $MolName;
  }

  return '';
}
392
# Prepare a molecule and calculate its physicochemical properties using the
# supplied descriptors generator. Returns a reference to a hash mapping
# property names to values, including any requested rule violation counts, or
# undef when ring detection or descriptors generation doesn't succeed...
#
sub CalculateMoleculeProperties {
  my($MolecularDescriptorsGenerator, $Molecule) = @_;

  $Molecule->KeepLargestComponent() if $OptionsInfo{KeepLargestComponent};

  # Give up on the molecule when ring detection fails...
  return undef unless $Molecule->DetectRings();

  $Molecule->SetAromaticityModel($OptionsInfo{AromaticityModel});
  $Molecule->DetectAromaticity();

  $Molecule->AddHydrogens() if $OptionsInfo{AddHydrogens};

  # Generate descriptors for the specified property names...
  $MolecularDescriptorsGenerator->SetMolecule($Molecule);
  $MolecularDescriptorsGenerator->GenerateDescriptors();

  return undef unless $MolecularDescriptorsGenerator->IsDescriptorsGenerationSuccessful();

  my %PropertyValues = $MolecularDescriptorsGenerator->GetDescriptorNamesAndValues();

  # Count RuleOf3 violations followed by RuleOf5 violations...
  for my $RuleViolationsType ('RuleOf3Violations', 'RuleOf5Violations') {
    if ($OptionsInfo{$RuleViolationsType}) {
      CalculateRuleViolationsCount($RuleViolationsType, \%PropertyValues);
    }
  }

  return \%PropertyValues;
}
437
# Create a molecular descriptors generator for the specified property names
# and pass along descriptor class parameters for those descriptor classes
# whose properties have been requested. Returns the generator object...
#
sub SetupMolecularDescriptorsGenerator {
  my $Generator = MolecularDescriptors::MolecularDescriptorsGenerator->new('Mode' => 'Specify', 'DescriptorNames' => \@{$OptionsInfo{SpecifiedPropertyNames}});

  # True when at least one of the named properties has been specified...
  my $IsSpecified = sub {
    my(@Names) = @_;
    for my $Name (@Names) {
      return 1 if exists($OptionsInfo{SpecifiedPropertyNamesMap}{lc($Name)});
    }
    return 0;
  };

  # Setup molecular descriptor calculation parameters...
  if ($IsSpecified->('MolecularWeight', 'ExactMass')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'WeightAndMassDescriptors', %{$OptionsInfo{PrecisionParametersMap}});
  }

  if ($IsSpecified->('RotatableBonds')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'RotatableBondsDescriptors', %{$OptionsInfo{RotatableBondsParametersMap}});
  }

  if ($IsSpecified->('HydrogenBondDonors', 'HydrogenBondAcceptors')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'HydrogenBondsDescriptors', 'HydrogenBondsType' => $OptionsInfo{HydrogenBonds});
  }

  if ($IsSpecified->('TPSA')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'TPSADescriptors', %{$OptionsInfo{TPSAParametersMap}});
  }

  if ($IsSpecified->('MolecularComplexity')) {
    $Generator->SetDescriptorClassParameters('DescriptorClassName' => 'MolecularComplexityDescriptors', %{$OptionsInfo{MolecularComplexityParametersMap}});
  }

  return $Generator;
}
469
# Count RuleOf3 or RuleOf5 violations, i.e. the number of rule properties whose
# calculated value is greater than the maximum allowed value, and store the
# count in the calculated properties map under the rule violations type. Dies
# for an unknown rule violations type...
#
sub CalculateRuleViolationsCount {
  my($RuleViolationsType, $CalculatedPropertiesMapRef) = @_;

  # Pick property names and maximum values corresponding to the rule type...
  my($NamesKey, $MaxValuesKey);
  if ($RuleViolationsType =~ /^RuleOf3Violations$/i) {
    ($NamesKey, $MaxValuesKey) = ('RuleOf3PropertyNames', 'RuleOf3MaxPropertyValuesMap');
  }
  elsif ($RuleViolationsType =~ /^RuleOf5Violations$/i) {
    ($NamesKey, $MaxValuesKey) = ('RuleOf5PropertyNames', 'RuleOf5MaxPropertyValuesMap');
  }
  else {
    die "Warning: Unknown rule violation type: $RuleViolationsType...";
  }

  # grep in scalar context yields the number of violated properties...
  my $ViolationsCount = grep {
    $CalculatedPropertiesMapRef->{$_} > $OptionsInfo{$MaxValuesKey}{$_}
  } @{$OptionsInfo{$NamesKey}};

  # Set rule violation count...
  $CalculatedPropertiesMapRef->{$RuleViolationsType} = $ViolationsCount;
}
504
# Retrieve information about input SD files and set up output file names.
#
# For each file in the input files list, the following entries in %SDFilesInfo
# are filled in at the index of the file: FileOkay, OutFileRoot, SDOutFileNames,
# TextOutFileNames, AllDataFieldsRef and CommonDataFieldsRef. A file is marked
# as okay only after it passes all the checks: it exists, it's a SD file, it
# contains the compound ID data field when one is needed, its SD output file
# name doesn't match the input file name, and its output files don't already
# exist unless overwriting has been requested. Files which fail a check are
# skipped with a warning.
#
# Input files are opened using three-argument open on lexical handles so that
# file names are always treated literally. The output file name is quoted
# before being matched against the input file name: characters such as '.',
# '+' and '(' in file names are not interpreted as regular expression syntax...
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $OutFileRoot, $TextOutFileExt, $SDOutFileExt, $NewSDFileName, $NewTextFileName, $CheckDataField, $CollectDataFields, $AllDataFieldsRef, $CommonDataFieldsRef);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();
  @{$SDFilesInfo{SDOutFileNames}} = ();
  @{$SDFilesInfo{TextOutFileNames}} = ();
  @{$SDFilesInfo{AllDataFieldsRef}} = ();
  @{$SDFilesInfo{CommonDataFieldsRef}} = ();

  # Data field checks and collection are only needed for text output...
  $CheckDataField = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^CompoundID$/i) && ($OptionsInfo{CompoundIDMode} =~ /^DataField$/i)) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{TextOutput} && ($OptionsInfo{DataFieldsMode} =~ /^(All|Common)$/i)) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Mark file as not okay until all checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';
    $SDFilesInfo{SDOutFileNames}[$Index] = '';
    $SDFilesInfo{TextOutFileNames}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    if ($CheckDataField) {
      # Make sure data field exists in the first compound of SD file..
      my($CmpdString, $SpecifiedDataField, @CmpdLines, %DataFieldValues);

      @CmpdLines = ();
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      $SpecifiedDataField = $OptionsInfo{CompoundID};
      if (!exists $DataFieldValues{$SpecifiedDataField}) {
        warn "Warning: Ignoring file $SDFile: Data field value, $SpecifiedDataField, using  \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\" doesn't exist\n";
        next FILELIST;
      }
    }

    $AllDataFieldsRef = '';
    $CommonDataFieldsRef = '';
    if ($CollectDataFields) {
      my($CmpdCount);
      open my $SDFileHandle, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
      ($CmpdCount, $AllDataFieldsRef, $CommonDataFieldsRef) = GetAllAndCommonCmpdDataHeaderLabels($SDFileHandle);
      close $SDFileHandle;
    }

    # Setup output file names...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    $TextOutFileExt = "csv";
    if ($Options{outdelim} =~ /^tab$/i) {
      $TextOutFileExt = "tsv";
    }
    $SDOutFileExt = $FileExt;

    # A specified output root is used only for a single input file...
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "${FileName}PhysicochemicalProperties";
    }

    $NewSDFileName = "${OutFileRoot}.${SDOutFileExt}";
    $NewTextFileName = "${OutFileRoot}.${TextOutFileExt}";

    if ($OptionsInfo{SDOutput}) {
      # Match output file name literally against input file name...
      if ($SDFile =~ /\Q$NewSDFileName\E/i) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{OverwriteFiles}) {
      # Check SD and text output files...
      if ($OptionsInfo{SDOutput}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewSDFileName already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{TextOutput}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: The file $NewTextFileName already exists\n";
          next FILELIST;
        }
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
    $SDFilesInfo{SDOutFileNames}[$Index] = $NewSDFileName;
    $SDFilesInfo{TextOutFileNames}[$Index] = $NewTextFileName;

    $SDFilesInfo{AllDataFieldsRef}[$Index] = $AllDataFieldsRef;
    $SDFilesInfo{CommonDataFieldsRef}[$Index] = $CommonDataFieldsRef;
  }
}
626
# Process option values: validate command line options and cache them in
# %OptionsInfo for use during the calculation. Dies when a value required by
# the specified compound ID or data fields mode is missing...
#
sub ProcessOptions {
  # Map a Yes/No option value to 1/0...
  my $IsYes = sub { return ($_[0] =~ /^Yes$/i) ? 1 : 0; };

  %OptionsInfo = ();

  $OptionsInfo{AromaticityModel} = $Options{aromaticitymodel};

  # Process property name related options...
  ProcessPropertyNamesOption();

  # Setup RuleOf3 and RuleOf5 violation calculations...
  $OptionsInfo{RuleOf3Violations} = $IsYes->($Options{ruleof3violations});
  $OptionsInfo{RuleOf5Violations} = $IsYes->($Options{ruleof5violations});

  @OptionsInfo{qw(CompoundIDMode CompoundIDLabel DataFieldsMode)} = @Options{qw(compoundidmode compoundidlabel datafieldsmode)};

  # Compound ID and data fields to write out to text file...
  @{$OptionsInfo{SpecifiedDataFields}} = ();
  $OptionsInfo{CompoundID} = '';

  if ($Options{datafieldsmode} =~ /^CompoundID$/i) {
    if ($Options{compoundidmode} =~ /^DataField$/i) {
      $Options{compoundid}
        or die "Error: You must specify a value for \"--CompoundID\" option in \"DataField\" \"--CompoundIDMode\". \n";
      $OptionsInfo{CompoundID} = $Options{compoundid};
    }
    elsif ($Options{compoundidmode} =~ /^(LabelPrefix|MolNameOrLabelPrefix)$/i) {
      # Default label prefix...
      $OptionsInfo{CompoundID} = $Options{compoundid} || 'Cmpd';
    }
  }
  elsif ($Options{datafieldsmode} =~ /^Specify$/i) {
    $Options{datafields}
      or die "Error: You must specify a value for \"--DataFields\" option in \"Specify\" \"-d, --DataFieldsMode\". \n";
    my @DataFields = split /\,/, $Options{datafields};
    @{$OptionsInfo{SpecifiedDataFields}} = @DataFields;
  }

  # Types of hydrogen bonds...
  $OptionsInfo{HydrogenBonds} = $Options{hydrogenbonds};

  # Process precision, rotatable bonds, TPSA and molecular complexity
  # parameters, in that order...
  ProcessPrecisionOption();
  ProcessRotatableBondsOption();
  ProcessTPSAOption();
  ProcessMolecularComplexityOption();

  $OptionsInfo{Filter} = $IsYes->($Options{filter});

  $OptionsInfo{KeepLargestComponent} = $IsYes->($Options{keeplargestcomponent});

  # Output types...
  $OptionsInfo{Output} = $Options{output};
  $OptionsInfo{SDOutput} = ($Options{output} =~ /^(SD|Both)$/i) ? 1 : 0;
  $OptionsInfo{TextOutput} = ($Options{output} =~ /^(Text|Both)$/i) ? 1 : 0;

  # Text output delimiter defaults to comma...
  if ($Options{outdelim} =~ /tab/i) {
    $OptionsInfo{OutDelim} = "\t";
  }
  elsif ($Options{outdelim} =~ /semicolon/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  else {
    $OptionsInfo{OutDelim} = "\,";
  }
  $OptionsInfo{OutQuote} = $IsYes->($Options{quote});

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
698
# Process property name related options.
#
# Sets up supported, default, RuleOf3 and RuleOf5 property names along with
# maximum property values for the rules, and figures out the list of property
# names to calculate from the "-m --mode" option value: All, RuleOf5, RuleOf3,
# empty for default names, or a comma delimited list of supported property
# names. Property names required for calculating any requested rule violations
# are added to the list with a warning. Results are stored in %OptionsInfo
# under SpecifiedPropertyNames, as canonical mixed case names, and
# SpecifiedPropertyNamesMap, keyed by lower case names; AddHydrogens is set
# when MolecularVolume, SLogP or SMR is to be calculated.
#
# A property name specified multiple times is reported with a warning and kept
# only once, so that it doesn't show up as duplicate columns or data fields in
# output files.
#
# Dies for unsupported property names and for rule violation options which are
# not valid in the specified mode...
#
sub ProcessPropertyNamesOption {

  # Setup supported physicochemical properties...
  my($SupportedProperty);

  @{$OptionsInfo{SupportedPropertyNames}} = qw(MolecularWeight ExactMass HeavyAtoms Rings AromaticRings MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP SMR TPSA Fsp3Carbons Sp3Carbons MolecularComplexity);

  @{$OptionsInfo{RuleOf5PropertyNames}} = qw(MolecularWeight HydrogenBondDonors HydrogenBondAcceptors SLogP);
  %{$OptionsInfo{RuleOf5MaxPropertyValuesMap}} = ('MolecularWeight' => 500, 'HydrogenBondDonors' => 5, 'HydrogenBondAcceptors' => 10, 'SLogP' => 5);

  @{$OptionsInfo{RuleOf3PropertyNames}} = qw(MolecularWeight RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA);
  %{$OptionsInfo{RuleOf3MaxPropertyValuesMap}} = ('MolecularWeight' => 300, 'RotatableBonds' => 3, 'HydrogenBondDonors' => 3, 'HydrogenBondAcceptors' => 3, 'SLogP' => 3, 'TPSA' => 60);

  @{$OptionsInfo{DefaultPropertyNames}} = qw(MolecularWeight HeavyAtoms MolecularVolume RotatableBonds HydrogenBondDonors HydrogenBondAcceptors SLogP TPSA);

  # Map lower case names to supported mixed case names...
  %{$OptionsInfo{SupportedPropertyNamesMap}} = ();
  for $SupportedProperty (@{$OptionsInfo{SupportedPropertyNames}}) {
    $OptionsInfo{SupportedPropertyNamesMap}{lc($SupportedProperty)} = $SupportedProperty;
  }

  # Process specified properties....
  my($SpecifiedPropertyName, @SpecifiedPropertyNames, %SpecifiedPropertyNamesMap);

  @SpecifiedPropertyNames = ();
  %SpecifiedPropertyNamesMap = ();

  @{$OptionsInfo{SpecifiedPropertyNames}} = ();
  %{$OptionsInfo{SpecifiedPropertyNamesMap}} = ();

  if ($Options{mode} =~ /^All$/i) {
    @SpecifiedPropertyNames = @{$OptionsInfo{SupportedPropertyNames}};
  }
  elsif ($Options{mode} =~ /^RuleOf5$/i) {
    @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf5PropertyNames}};
  }
  elsif ($Options{mode} =~ /^RuleOf3$/i) {
    @SpecifiedPropertyNames = @{$OptionsInfo{RuleOf3PropertyNames}};
  }
  elsif (IsEmpty($Options{mode})) {
    @SpecifiedPropertyNames = @{$OptionsInfo{DefaultPropertyNames}};
  }
  else {
    # Comma delimited list of specified property names...
    my($Mode, $PropertyName, @PropertyNames, @UnsupportedPropertyNames);

    $Mode = $Options{mode};
    $Mode =~ s/ //g;

    @PropertyNames = split ",", $Mode;
    @UnsupportedPropertyNames = ();

    for $PropertyName (@PropertyNames) {
      if (exists($OptionsInfo{SupportedPropertyNamesMap}{lc($PropertyName)})) {
        push @SpecifiedPropertyNames, $PropertyName;
      }
      else {
        push @UnsupportedPropertyNames, $PropertyName;
      }
    }
    if (@UnsupportedPropertyNames) {
      if (@UnsupportedPropertyNames > 1) {
        warn "Error: The physicochemical property names specified - ", JoinWords(\@UnsupportedPropertyNames, ", ", 0)," - for option \"-m --mode\" are not valid.\n";
      }
      else {
        warn "Error: The physicochemical property name specified, @UnsupportedPropertyNames , for option \"-m --mode\" is not valid.\n";
      }
      die "Allowed values:", JoinWords(\@{$OptionsInfo{SupportedPropertyNames}}, ", ", 0), "\n";
    }
    if (!@SpecifiedPropertyNames) {
      die "Error: No valid physicochemical property names specified for option \"-m --mode\".\n";
    }
  }

  # Set up specified property names map and keep only the first occurrence
  # of each property name...
  my(@UniquePropertyNames);

  @UniquePropertyNames = ();
  PROPERTY: for $SpecifiedPropertyName (@SpecifiedPropertyNames) {
    if (exists $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)}) {
      warn "Warning: The physicochemical property name, $SpecifiedPropertyName, is specified multiple times as value of option \"-m --mode\" .\n";
      next PROPERTY;
    }
    # Canonical specified property name...
    $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($SpecifiedPropertyName)};
    push @UniquePropertyNames, $SpecifiedPropertyName;
  }
  @SpecifiedPropertyNames = @UniquePropertyNames;

  # Make sure for calculation of RuleOf3Violations, all appropriate property names are specified...
  if ($Options{ruleof3violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf5$/i) {
    die "Error: The value specified, $Options{ruleof3violations}, for \"--RuleOf3Violations\" option in \"RuleOf5\" \"-m --Mode\" is not valid. You must specify RuleOf3 value for \"-m --Mode\" to calculate RuleOf3 violations.\n";
  }

  if ($Options{ruleof3violations} =~ /^Yes$/i) {
    my($RuleOf3PropertyName, @MissingRuleOf3Names);

    @MissingRuleOf3Names = ();
    PROPERTY: for $RuleOf3PropertyName (@{$OptionsInfo{RuleOf3PropertyNames}}) {
      if (exists $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)}) {
        next PROPERTY;
      }
      push @MissingRuleOf3Names, $RuleOf3PropertyName;

      # Add property name to specified properties names list and map...
      push @SpecifiedPropertyNames, $RuleOf3PropertyName;
      $SpecifiedPropertyNamesMap{lc($RuleOf3PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf3PropertyName)};
    }
    if (@MissingRuleOf3Names) {
      warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf3Violations and have been added to the list of property names: @MissingRuleOf3Names\n";
    }
  }

  # Make sure for calculation of RuleOf5Violations, all appropriate property names are specified...
  if ($Options{ruleof5violations} =~ /^Yes$/i && $Options{mode} =~ /^RuleOf3$/i) {
    die "Error: The value specified, $Options{ruleof5violations}, for \"--RuleOf5Violations\" option in \"RuleOf3\" \"-m --Mode\" is not valid. You must specify RuleOf5 value for \"-m --Mode\" to calculate RuleOf5 violations.\n";
  }

  if ($Options{ruleof5violations} =~ /^Yes$/i) {
    my($RuleOf5PropertyName, @MissingRuleOf5Names);

    @MissingRuleOf5Names = ();
    PROPERTY: for $RuleOf5PropertyName (@{$OptionsInfo{RuleOf5PropertyNames}}) {
      if (exists $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)}) {
        next PROPERTY;
      }
      push @MissingRuleOf5Names, $RuleOf5PropertyName;

      # Add property name to specified properties names list and map...
      push @SpecifiedPropertyNames, $RuleOf5PropertyName;
      $SpecifiedPropertyNamesMap{lc($RuleOf5PropertyName)} = $OptionsInfo{SupportedPropertyNamesMap}{lc($RuleOf5PropertyName)};
    }
    if (@MissingRuleOf5Names) {
      warn "Warning: The following physicochemical property names not specified in \"-m --Mode\" option are required for calculating RuleOf5Violations and have been added to the list of property names: @MissingRuleOf5Names\n";
    }
  }
  $OptionsInfo{Mode} = $Options{mode};

  # Setup canonical specified property names corresponding to supported names in mixed case...
  my(@SpecifiedCanonicalPropertyNames);

  @SpecifiedCanonicalPropertyNames = ();
  for $SpecifiedPropertyName (@SpecifiedPropertyNames) {
    push @SpecifiedCanonicalPropertyNames, $SpecifiedPropertyNamesMap{lc($SpecifiedPropertyName)};
  }
  @{$OptionsInfo{SpecifiedPropertyNames}} = @SpecifiedCanonicalPropertyNames;
  %{$OptionsInfo{SpecifiedPropertyNamesMap}} = %SpecifiedPropertyNamesMap;

  # Based on specified property names, figure out whether hydrogens need to be added before
  # calculation of properties...
  #
  $OptionsInfo{AddHydrogens} = 0;
  if (exists($SpecifiedPropertyNamesMap{lc('MolecularVolume')}) || exists($SpecifiedPropertyNamesMap{lc('SLogP')}) || exists($SpecifiedPropertyNamesMap{lc('SMR')})) {
    $OptionsInfo{AddHydrogens} = 1;
  }
}
861
# Process precision option...
#
# Parse the value of "--Precision" option, a comma delimited list of property
# name and number pairs, and set up the precision to use for MolecularWeight
# and ExactMass values. Property names are matched case insensitively; any
# invalid name or value terminates the script.
#
# The raw option value is saved as $OptionsInfo{Precision} and the final
# parameters, including defaults for anything not specified, are saved in
# $OptionsInfo{PrecisionParametersMap}.
#
sub ProcessPrecisionOption {
  my(%PrecisionSettings, %SettingNameFor);

  %{$OptionsInfo{PrecisionParametersMap}} = ();

  # Default precisions along with mapping of lowercase property names to
  # precision parameter names...
  %PrecisionSettings = ('WeightPrecision' => 2, 'MassPrecision' => 4);
  %SettingNameFor = ('molecularweight' => 'WeightPrecision', 'exactmass' => 'MassPrecision');

  if ($Options{precision}) {
    my($ParameterName, $ParameterValue, $Specification, @NameValueList);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{precision}) =~ s/ //g;
    @NameValueList = split ",", $Specification;
    die "Error: Invalid number of values specified using \"--Precision\" option: It must contain even number of values.\n" if (@NameValueList % 2);

    # Consume the list one name and value pair at a time...
    while (@NameValueList) {
      ($ParameterName, $ParameterValue) = splice(@NameValueList, 0, 2);

      exists $SettingNameFor{lc($ParameterName)}
        or die "Error: The precision parameter name specified, $ParameterName, for option \"--Precision\" is not valid.\n";
      IsPositiveInteger($ParameterValue)
        or die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--Precision\" is not valid. Allowed values: positive integer. \n";

      $PrecisionSettings{$SettingNameFor{lc($ParameterName)}} = $ParameterValue;
    }
  }

  $OptionsInfo{Precision} = $Options{precision};
  %{$OptionsInfo{PrecisionParametersMap}} = %PrecisionSettings;
}
898
# Process rotatable bonds option...
#
# Parse the value of "--RotatableBonds" option, a comma delimited list of
# parameter name and Yes/No value pairs. Parameter names are matched case
# insensitively and Yes/No values are stored as 1/0; any invalid name or
# value terminates the script.
#
# The raw option value is saved as $OptionsInfo{RotatableBonds} and the final
# parameters, including defaults for anything not specified, are saved in
# $OptionsInfo{RotatableBondsParametersMap}.
#
sub ProcessRotatableBondsOption {
  my(%RotatableBondsSettings, %CanonicalNameFor);

  %{$OptionsInfo{RotatableBondsParametersMap}} = ();

  # All parameters are turned on by default...
  %RotatableBondsSettings = ('IgnoreTerminalBonds' => 1, 'IgnoreBondsToTripleBonds' => 1, 'IgnoreAmideBonds' => 1, 'IgnoreThioamideBonds' => 1, 'IgnoreSulfonamideBonds' => 1);

  # Lowercase name to canonical mixed case name...
  %CanonicalNameFor = map { (lc($_), $_) } keys %RotatableBondsSettings;

  if ($Options{rotatablebonds}) {
    my($ParameterName, $ParameterValue, $Specification, @NameValueList);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{rotatablebonds}) =~ s/ //g;
    @NameValueList = split ",", $Specification;
    die "Error: Invalid number of values specified using \"--RotatableBonds\" option: It must contain even number of values.\n" if (@NameValueList % 2);

    # Consume the list one name and value pair at a time...
    while (@NameValueList) {
      ($ParameterName, $ParameterValue) = splice(@NameValueList, 0, 2);

      exists $CanonicalNameFor{lc($ParameterName)}
        or die "Error: The rotatable bonds parameter name specified, $ParameterName, for option \"--RotatableBonds\" is not valid.\n";
      $ParameterValue =~ /^(Yes|No)$/i
        or die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--RotatableBonds\" is not valid. Allowed values: Yes or No. \n";

      $RotatableBondsSettings{$CanonicalNameFor{lc($ParameterName)}} = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
    }
  }

  $OptionsInfo{RotatableBonds} = $Options{rotatablebonds};
  %{$OptionsInfo{RotatableBondsParametersMap}} = %RotatableBondsSettings;
}
937
# Process TPSA option...
#
# Parse the value of "--TPSA" option, a comma delimited list of parameter
# name and Yes/No value pairs. Parameter names are matched case insensitively
# and Yes/No values are stored as 1/0; any invalid name or value terminates
# the script.
#
# The raw option value is saved as $OptionsInfo{TPSA} and the final
# parameters, including defaults for anything not specified, are saved in
# $OptionsInfo{TPSAParametersMap}.
#
sub ProcessTPSAOption {
  my(%TPSASettings, %CanonicalNameFor);

  %{$OptionsInfo{TPSAParametersMap}} = ();

  # Both parameters are turned on by default...
  %TPSASettings = ('IgnorePhosphorus' => 1, 'IgnoreSulfur' => 1);

  # Lowercase name to canonical mixed case name...
  %CanonicalNameFor = map { (lc($_), $_) } keys %TPSASettings;

  if ($Options{tpsa}) {
    my($ParameterName, $ParameterValue, $Specification, @NameValueList);

    # Spaces are not significant anywhere in the specified value...
    ($Specification = $Options{tpsa}) =~ s/ //g;
    @NameValueList = split ",", $Specification;
    die "Error: Invalid number of values specified using \"--TPSA\" option: It must contain even number of values.\n" if (@NameValueList % 2);

    # Consume the list one name and value pair at a time...
    while (@NameValueList) {
      ($ParameterName, $ParameterValue) = splice(@NameValueList, 0, 2);

      exists $CanonicalNameFor{lc($ParameterName)}
        or die "Error: The TPSA parameter name specified, $ParameterName, for option \"--TPSA\" is not valid.\n";
      $ParameterValue =~ /^(Yes|No)$/i
        or die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--TPSA\" is not valid. Allowed values: Yes or No. \n";

      $TPSASettings{$CanonicalNameFor{lc($ParameterName)}} = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
    }
  }

  $OptionsInfo{TPSA} = $Options{tpsa};
  %{$OptionsInfo{TPSAParametersMap}} = %TPSASettings;
}
977
# Process molecular complexity parameters...
#
# Parse the value of "--MolecularComplexity" option, a comma delimited list of
# parameter name and value pairs, validate each pair, fill in defaults for
# parameters left unspecified, and save the parameters relevant to the selected
# MolecularComplexityType in $OptionsInfo{MolecularComplexityParametersMap}.
# The raw option value is saved as $OptionsInfo{MolecularComplexity}.
#
# Notes:
#   . Parameter names are matched case insensitively and mapped to their
#     canonical mixed case names.
#   . AtomicInvariantsToUse and FunctionalClassesToUse values are space
#     delimited lists; they are stored as array references.
#   . Yes/No values for other parameters are stored as 1/0.
#   . MolecularComplexityType defaults to MACCSKeys.
#   . Any invalid parameter name or value terminates the script.
#
sub ProcessMolecularComplexityOption {
  my($MolecularComplexityType, $ParameterName, $ParameterValue, @ParameterNames, @ParameterValues, @AtomIdentifierTypeParameters, %ComplexityParametersMap, %ComplexityParameterNamesMap, %ParameterNamesByType);

  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();

  # All supported parameters; an empty value indicates that its default depends
  # on other parameters and is set after processing specified values...
  %ComplexityParametersMap = ('MolecularComplexityType' => '', 'AtomIdentifierType' => '',
                              'AtomicInvariantsToUse' => '', 'FunctionalClassesToUse' => '',
                              'MACCSKeysSize' => '166', 'NeighborhoodRadius' => '2',
                              'MinPathLength' => '1', 'MaxPathLength' => '8', 'UseBondSymbols' => '1',
                              'MinDistance' => '1', 'MaxDistance' => '10', 'UseTriangleInequality' => '',
                              'DistanceBinSize' => '2', 'NormalizationMethodology' => 'None');

  # Lowercase name to canonical mixed case name...
  %ComplexityParameterNamesMap = ();
  for $ParameterName (keys %ComplexityParametersMap) {
    $ComplexityParameterNamesMap{lc($ParameterName)} = $ParameterName;
  }

  if ($Options{molecularcomplexity}) {
    # Process specified values...
    my($Index, $SpecifiedComplexity, @SpecifiedComplexityValuePairs);

    $SpecifiedComplexity = $Options{molecularcomplexity};

    # Spaces can't be removed up front: they delimit values in AtomicInvariantsToUse
    # and FunctionalClassesToUse lists...
    @SpecifiedComplexityValuePairs = split ",", $SpecifiedComplexity;
    if (@SpecifiedComplexityValuePairs % 2) {
      die "Error: Invalid number of values specified using \"--MolecularComplexity\" option: It must contain even number of values.\n";
    }

    for ($Index = 0; (($Index + 1) < @SpecifiedComplexityValuePairs); $Index += 2 ) {
      $ParameterName = $SpecifiedComplexityValuePairs[$Index];
      $ParameterValue = $SpecifiedComplexityValuePairs[$Index + 1];

      $ParameterName = RemoveLeadingAndTrailingWhiteSpaces($ParameterName);
      $ParameterValue = RemoveLeadingAndTrailingWhiteSpaces($ParameterValue);

      if (!exists $ComplexityParameterNamesMap{lc($ParameterName)}) {
        die "Error: The molecular complexity parameter name specified, $ParameterName, for option \"--MolecularComplexity\" is not valid.\n";
      }
      $ParameterName = $ComplexityParameterNamesMap{lc($ParameterName)};

      # Normalize the value: space delimited lists are converted to comma delimited
      # lists; Yes/No values are converted to 1/0...
      if ($ParameterName =~ /^AtomicInvariantsToUse$/i) {
        my($AtomSymbolFound);

        $AtomSymbolFound = 0;
        @ParameterValues = split(' ', $ParameterValue);
        # foreach localizes its loop variable: $ParameterValue reverts to the whole
        # specified value after the loop and is explicitly reset below...
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::AtomicInvariantsAtomTypes::IsAtomicInvariantAvailable($ParameterValue)) {
            die "Error: The atomic invariant specified, $ParameterValue, for AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
          if ($ParameterValue =~ /^(AS|AtomSymbol)$/i) {
            $AtomSymbolFound = 1;
          }
        }
        if (!$AtomSymbolFound) {
          die "Error: The atomic invariants specified using AtomicInvariantsToUse in option \"--MolecularComplexity\" is not valid: AtomicInvariant atom symbol, AS or AtomSymbol, must be specified.\n";
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      elsif ($ParameterName =~ /^FunctionalClassesToUse$/i) {
        @ParameterValues = split(' ', $ParameterValue);
        for $ParameterValue (@ParameterValues) {
          if (!AtomTypes::FunctionalClassAtomTypes::IsFunctionalClassAvailable($ParameterValue)) {
            die "Error: The functional class specified, $ParameterValue, for FunctionalClassesToUse in option \"--MolecularComplexity\" is not valid.\n";
          }
        }
        $ParameterValue = JoinWords(\@ParameterValues, ",", 0);
      }
      else {
        if ($ParameterValue =~ / /) {
          $ParameterValue =~ s/ //g;
        }
        if ($ParameterValue =~ /^(Yes|No)$/i) {
          $ParameterValue = ($ParameterValue =~ /^Yes$/i) ? 1 : 0;
        }
      }

      # Validate the value...
      if ($ParameterName =~ /^MolecularComplexityType$/i) {
        if ($ParameterValue !~ /^(AtomTypesFingerprints|ExtendedConnectivityFingerprints|MACCSKeys|PathLengthFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints|TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: AtomTypesFingerprints, ExtendedConnectivityFingerprints, MACCSKeys, PathLengthFingerprints, TopologicalAtomPairsFingerprints, TopologicalAtomTripletsFingerprints, TopologicalAtomTorsionsFingerprints, TopologicalPharmacophoreAtomPairsFingerprints, or TopologicalPharmacophoreAtomTripletsFingerprints..\n";
        }
      }
      elsif ($ParameterName =~ /^AtomIdentifierType$/i) {
        if ($ParameterValue !~ /^(AtomicInvariantsAtomTypes|FunctionalClassAtomTypes|DREIDINGAtomTypes|EStateAtomTypes|MMFF94AtomTypes|SLogPAtomTypes|SYBYLAtomTypes|TPSAAtomTypes|UFFAtomTypes)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Supported atom identifier types in current release of MayaChemTools: AtomicInvariantsAtomTypes, FunctionalClassAtomTypes, DREIDINGAtomTypes, EStateAtomTypes, MMFF94AtomTypes, SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes and UFFAtomTypes.\n";
        }
      }
      elsif ($ParameterName =~ /^(MACCSKeysSize|MinPathLength|MaxPathLength|MinDistance|MaxDistance|DistanceBinSize)$/i) {
        if (!IsPositiveInteger($ParameterValue)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NeighborhoodRadius$/i) {
        if (!(IsInteger($ParameterValue) && $ParameterValue >=0)) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: 0 or positive integer. \n";
        }
      }
      elsif ($ParameterName =~ /^NormalizationMethodology$/i) {
        if ($ParameterValue !~ /^(None|ByHeavyAtomsCount|ByPossibleKeysCount)$/i) {
          die "Error: The parameter value specified, $ParameterValue, for parameter name, $ParameterName in option \"--MolecularComplexity\" is not valid. Allowed values: None, ByHeavyAtomsCount, or ByPossibleKeysCount\n";
        }
      }
      $ComplexityParametersMap{$ParameterName} = $ParameterValue;
    }

    # Checks involving more than one value or a restricted set of integers...
    if ($ComplexityParametersMap{MACCSKeysSize} !~ /^(166|322)$/i) {
      die "Error: The parameter value specified, $ComplexityParametersMap{MACCSKeysSize}, for parameter name, MACCSKeysSize in option \"--MolecularComplexity\" is not valid. Allowed values: 166 or 322\n";
    }
    if ($ComplexityParametersMap{MinPathLength} > $ComplexityParametersMap{MaxPathLength}) {
      die "Error: The parameter value specified for MinPathLength, $ComplexityParametersMap{MinPathLength}, must be <= MaxPathLength, $ComplexityParametersMap{MaxPathLength} ...\n";
    }
    if ($ComplexityParametersMap{MinDistance} > $ComplexityParametersMap{MaxDistance}) {
      die "Error: The parameter value specified for MinDistance, $ComplexityParametersMap{MinDistance}, must be <= MaxDistance, $ComplexityParametersMap{MaxDistance} ...\n";
    }
  }

  # Set default parameter values...

  if (IsEmpty($ComplexityParametersMap{MolecularComplexityType})) {
    $ComplexityParametersMap{MolecularComplexityType} = 'MACCSKeys';
  }
  $MolecularComplexityType = $ComplexityParametersMap{MolecularComplexityType};

  if (IsEmpty($ComplexityParametersMap{AtomIdentifierType})) {
    $ComplexityParametersMap{AtomIdentifierType} = ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) ? "FunctionalClassAtomTypes" : "AtomicInvariantsAtomTypes";
  }

  if (IsEmpty($ComplexityParametersMap{AtomicInvariantsToUse})) {
    my($AtomicInvariantsToUse);

    if ($MolecularComplexityType =~ /^(AtomTypesFingerprints|TopologicalAtomPairsFingerprints|TopologicalAtomTripletsFingerprints|TopologicalAtomTorsionsFingerprints)$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC";
    }
    elsif ($MolecularComplexityType =~ /^ExtendedConnectivityFingerprints$/i) {
      $AtomicInvariantsToUse = "AS,X,BO,H,FC,MN";
    }
    else {
      $AtomicInvariantsToUse = "AS";
    }
    $ComplexityParametersMap{AtomicInvariantsToUse} = $AtomicInvariantsToUse;
  }

  if (IsEmpty($ComplexityParametersMap{FunctionalClassesToUse})) {
    my($FunctionalClassesToUse);

    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomPairsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H";
    }
    elsif ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar";
    }
    else {
      $FunctionalClassesToUse = "HBD,HBA,PI,NI,H,Ar,Hal";
    }
    $ComplexityParametersMap{FunctionalClassesToUse} = $FunctionalClassesToUse;
  }

  # Convert comma delimited lists into array references...
  my(@AtomicInvariantsToUse);
  @AtomicInvariantsToUse = split ',', $ComplexityParametersMap{AtomicInvariantsToUse};
  $ComplexityParametersMap{AtomicInvariantsToUse} = \@AtomicInvariantsToUse;

  my(@FunctionalClassesToUse);
  @FunctionalClassesToUse = split ',', $ComplexityParametersMap{FunctionalClassesToUse};
  $ComplexityParametersMap{FunctionalClassesToUse} = \@FunctionalClassesToUse;

  if (IsEmpty($ComplexityParametersMap{UseTriangleInequality})) {
    $ComplexityParametersMap{UseTriangleInequality} = 0;
    if ($MolecularComplexityType =~ /^TopologicalPharmacophoreAtomTripletsFingerprints$/i) {
      $ComplexityParametersMap{UseTriangleInequality} = 1;
    }
  }

  if ($MolecularComplexityType =~ /^(TopologicalPharmacophoreAtomPairsFingerprints|TopologicalPharmacophoreAtomTripletsFingerprints)$/i) {
    if ($ComplexityParametersMap{AtomIdentifierType} !~ /^FunctionalClassAtomTypes$/i) {
      die "Error: The parameter value specified for AtomIdentifierType, $ComplexityParametersMap{AtomIdentifierType}, in option \"--MolecularComplexity\" is not valid for MolecularComplexityType, $MolecularComplexityType: Allowed value: FunctionalClassAtomTypes...\n";
    }
  }

  # Set up appropriate parameter names for specified molecular complexity...

  @AtomIdentifierTypeParameters = ();
  push @AtomIdentifierTypeParameters, 'AtomIdentifierType';
  if ($ComplexityParametersMap{AtomIdentifierType} =~ /^AtomicInvariantsAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'AtomicInvariantsToUse';
  }
  elsif ($ComplexityParametersMap{AtomIdentifierType} =~ /^FunctionalClassAtomTypes$/i) {
    push @AtomIdentifierTypeParameters, 'FunctionalClassesToUse';
  }

  # Parameter names, in addition to MolecularComplexityType, to pass along for each
  # molecular complexity type keyed by its lowercase name...
  %ParameterNamesByType = (
    'atomtypesfingerprints' => [@AtomIdentifierTypeParameters],
    'extendedconnectivityfingerprints' => [@AtomIdentifierTypeParameters, 'NeighborhoodRadius', 'NormalizationMethodology'],
    'maccskeys' => ['MACCSKeysSize'],
    'pathlengthfingerprints' => [@AtomIdentifierTypeParameters, 'MinPathLength', 'MaxPathLength', 'UseBondSymbols'],
    'topologicalatompairsfingerprints' => [@AtomIdentifierTypeParameters, 'MinDistance', 'MaxDistance'],
    'topologicalatomtripletsfingerprints' => [@AtomIdentifierTypeParameters, 'MinDistance', 'MaxDistance', 'UseTriangleInequality'],
    'topologicalatomtorsionsfingerprints' => [@AtomIdentifierTypeParameters],
    'topologicalpharmacophoreatompairsfingerprints' => ['AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'NormalizationMethodology'],
    'topologicalpharmacophoreatomtripletsfingerprints' => ['AtomIdentifierType', 'FunctionalClassesToUse', 'MinDistance', 'MaxDistance', 'UseTriangleInequality', 'NormalizationMethodology', 'DistanceBinSize'],
  );

  if (!exists $ParameterNamesByType{lc($MolecularComplexityType)}) {
    # Report the offending complexity type; $ParameterValue holds an unrelated
    # value at this point...
    die "Error: The parameter value specified, $MolecularComplexityType, for parameter name MolecularComplexityType using \"--MolecularComplexity\" is not valid.\n";
  }

  @ParameterNames = ('MolecularComplexityType', @{$ParameterNamesByType{lc($MolecularComplexityType)}});

  $OptionsInfo{MolecularComplexity} = $Options{molecularcomplexity};

  # Save only the parameters relevant to the specified molecular complexity type...
  %{$OptionsInfo{MolecularComplexityParametersMap}} = ();
  for $ParameterName (@ParameterNames) {
    $OptionsInfo{MolecularComplexityParametersMap}{$ParameterName} = $ComplexityParametersMap{$ParameterName};
  }
}
1224
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initialize %Options with default values, retrieve command line options into
# it using GetOptions, change to the working directory if one is specified,
# and validate option values having a fixed set of allowed values. Any invalid
# value terminates the script.
#
sub SetupScriptUsage {
  my(@OptionSpecifications);

  # Default values. An empty value indicates that the defaults for that option
  # are set later...
  %Options = (
    aromaticitymodel => 'MayaChemToolsAromaticityModel',

    compoundidmode => 'LabelPrefix',
    compoundidlabel => 'CompoundID',
    datafieldsmode => 'CompoundID',

    filter => 'Yes',

    hydrogenbonds => 'HBondsType2',

    keeplargestcomponent => 'Yes',

    # Default mode values are set later...
    mode => '',

    # Default molecular complexity values are set later...
    molecularcomplexity => '',

    # Default precision values are set later...
    precision => '',

    output => 'text',
    outdelim => 'comma',
    quote => 'yes',

    # Default rotatable bond parameter values are set later...
    rotatablebonds => '',

    ruleof3violations => 'No',
    ruleof5violations => 'No',

    # Default TPSA parameter values are set later...
    tpsa => '',
  );

  # Retrieve all the options...
  @OptionSpecifications = (
    "aromaticitymodel=s", "compoundid=s", "compoundidlabel=s", "compoundidmode=s",
    "datafields=s", "datafieldsmode|d=s", "filter|f=s", "help|h", "hydrogenbonds=s",
    "keeplargestcomponent|k=s", "mode|m=s", "molecularcomplexity=s", "outdelim=s",
    "output=s", "overwrite|o", "precision=s", "rotatablebonds=s", "ruleof3violations=s",
    "ruleof5violations=s", "quote|q=s", "root|r=s", "tpsa=s", "workingdir|w=s",
  );
  GetOptions(\%Options, @OptionSpecifications)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to working directory before anything else...
  if ($Options{workingdir}) {
    (-d $Options{workingdir})
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  unless (Molecule::IsSupportedAromaticityModel($Options{aromaticitymodel})) {
    my(@SupportedModels) = Molecule::GetSupportedAromaticityModels();
    die "Error: The value specified, $Options{aromaticitymodel}, for option \"--AromaticityModel\" is not valid. Supported aromaticity models in current release of MayaChemTools: @SupportedModels\n";
  }

  # Validate options with a fixed set of allowed values...
  $Options{compoundidmode} =~ /^(DataField|MolName|LabelPrefix|MolNameOrLabelPrefix)$/i
    or die "Error: The value specified, $Options{compoundidmode}, for option \"--CompoundIDMode\" is not valid. Allowed values: DataField, MolName, LabelPrefix or MolNameOrLabelPrefix\n";

  $Options{datafieldsmode} =~ /^(All|Common|Specify|CompoundID)$/i
    or die "Error: The value specified, $Options{datafieldsmode}, for option \"-d, --DataFieldsMode\" is not valid. Allowed values: All, Common, Specify or CompoundID\n";

  $Options{filter} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{filter}, for option \"-f, --Filter\" is not valid. Allowed values: Yes or No\n";

  $Options{hydrogenbonds} =~ /^(HBondsType1|HydrogenBondsType1|HBondsType2|HydrogenBondsType2)$/i
    or die "Error: The value specified, $Options{hydrogenbonds}, for option \"--HydrogenBonds\" is not valid. Allowed values: HBondsType1, HydrogenBondsType1, HBondsType2, HydrogenBondsType2\n";

  $Options{keeplargestcomponent} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{keeplargestcomponent}, for option \"-k, --KeepLargestComponent\" is not valid. Allowed values: Yes or No\n";

  $Options{output} =~ /^(SD|text|both)$/i
    or die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";

  $Options{outdelim} =~ /^(comma|semicolon|tab)$/i
    or die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";

  $Options{quote} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: Yes or No\n";

  $Options{ruleof3violations} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{ruleof3violations}, for option \"--RuleOf3Violations\" is not valid. Allowed values: Yes or No\n";

  $Options{ruleof5violations} =~ /^(Yes|No)$/i
    or die "Error: The value specified, $Options{ruleof5violations}, for option \"--RuleOf5Violations\" is not valid. Allowed values: Yes or No\n";
}
1309
1310 __END__
1311
1312 =head1 NAME
1313
1314 CalculatePhysicochemicalProperties.pl - Calculate physicochemical properties for SD files
1315
1316 =head1 SYNOPSIS
1317
1318 CalculatePhysicochemicalProperties.pl SDFile(s)...
1319
1320 CalculatePhysicochemicalProperties.pl [B<--AromaticityModel> I<AromaticityModelType>]
1321 [B<--CompoundID> DataFieldName or LabelPrefixString]
1322 [B<--CompoundIDLabel> text] [B<--CompoundIDMode>] [B<--DataFields> "FieldLabel1, FieldLabel2,..."]
1323 [B<-d, --DataFieldsMode> All | Common | Specify | CompoundID] [B<-f, --Filter> Yes | No] [B<-h, --help>]
1324 [B<--HydrogenBonds> HBondsType1 | HBondsType2] [B<-k, --KeepLargestComponent> Yes | No]
1325 [B<-m, --mode> All | RuleOf5 | RuleOf3 | "name1, [name2,...]"]
1326 [B<--MolecularComplexity> I<Name,Value, [Name,Value,...]>]
1327 [B<--OutDelim> comma | tab | semicolon] [B<--output> SD | text | both] [B<-o, --overwrite>]
1328 [B<--Precision> Name,Number,[Name,Number,..]] [B<--RotatableBonds> Name,Value, [Name,Value,...]]
1329 [B<--RuleOf3Violations> Yes | No] [B<--RuleOf5Violations> Yes | No]
1330 [B<-q, --quote> Yes | No] [B<-r, --root> RootName] [B<--TPSA> Name,Value, [Name,Value,...]]
1331 [B<-w, --WorkingDir> dirname] SDFile(s)...
1332
1333 =head1 DESCRIPTION
1334
1335 Calculate physicochemical properties for I<SDFile(s)> and create appropriate SD or CSV/TSV
1336 text file(s) containing calculated properties.
1337
1338 The current release of MayaChemTools supports the calculation of these physicochemical
1339 properties:
1340
1341 MolecularWeight, ExactMass, HeavyAtoms, Rings, AromaticRings,
1342 van der Waals MolecularVolume [ Ref 93 ], RotatableBonds,
1343 HydrogenBondDonors, HydrogenBondAcceptors, LogP and
1344 Molar Refractivity (SLogP and SMR) [ Ref 89 ], Topological Polar
1345 Surface Area (TPSA) [ Ref 90 ], Fraction of SP3 carbons (Fsp3Carbons)
1346 and SP3 carbons (Sp3Carbons) [ Ref 115-116, Ref 119 ],
1347 MolecularComplexity [ Ref 117-119 ]
1348
1349 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
1350 and I<.sd>. All other file names are ignored. All the SD files in a current directory
1351 can be specified either by I<*.sdf> or the current directory name.
1352
1353 The calculation of molecular complexity using I<MolecularComplexityType> parameter
1354 corresponds to the number of bits-set or unique keys [ Ref 117-119 ] in molecular fingerprints.
1355 Default value for I<MolecularComplexityType>: I<MACCSKeys> of size 166. The calculation
1356 of MACCSKeys is relatively expensive and can take a rather substantial amount of time.
1357
1358 =head1 OPTIONS
1359
1360 =over 4
1361
1362 =item B<--AromaticityModel> I<MDLAromaticityModel | TriposAromaticityModel | MMFFAromaticityModel | ChemAxonBasicAromaticityModel | ChemAxonGeneralAromaticityModel | DaylightAromaticityModel | MayaChemToolsAromaticityModel>
1363
1364 Specify aromaticity model to use during detection of aromaticity. Possible values in the current
1365 release are: I<MDLAromaticityModel, TriposAromaticityModel, MMFFAromaticityModel,
1366 ChemAxonBasicAromaticityModel, ChemAxonGeneralAromaticityModel, DaylightAromaticityModel
1367 or MayaChemToolsAromaticityModel>. Default value: I<MayaChemToolsAromaticityModel>.
1368
1369 The supported aromaticity model names along with model specific control parameters
1370 are defined in B<AromaticityModelsData.csv>, which is distributed with the current release
1371 and is available under B<lib/data> directory. B<Molecule.pm> module retrieves data from
1372 this file during class instantiation and makes it available to method B<DetectAromaticity>
1373 for detecting aromaticity corresponding to a specific model.
1374
1375 =item B<--CompoundID> I<DataFieldName or LabelPrefixString>
1376
1377 This value is B<--CompoundIDMode> specific and indicates how compound ID is generated.
1378
1379 For I<DataField> value of B<--CompoundIDMode> option, it corresponds to datafield label name
1380 whose value is used as compound ID; otherwise, it's a prefix string used for generating compound
1381 IDs like LabelPrefixString<Number>. Default value, I<Cmpd>, generates compound IDs which
1382 look like Cmpd<Number>.
1383
1384 Examples for I<DataField> value of B<--CompoundIDMode>:
1385
1386 MolID
1387 ExtReg
1388
1389 Examples for I<LabelPrefix> or I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>:
1390
1391 Compound
1392
1393 The value specified above generates compound IDs which correspond to Compound<Number>
1394 instead of default value of Cmpd<Number>.
1395
1396 =item B<--CompoundIDLabel> I<text>
1397
1398 Specify compound ID column label for CSV/TSV text file(s) used during I<CompoundID> value
1399 of B<--DataFieldsMode> option. Default value: I<CompoundID>.
1400
1401 =item B<--CompoundIDMode> I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>
1402
1403 Specify how to generate compound IDs and write to CSV/TSV text file(s) along with calculated
1404 physicochemical properties for I<text | both> values of B<--output> option: use a I<SDFile(s)>
1405 datafield value; use molname line from I<SDFile(s)>; generate a sequential ID with specific prefix;
1406 use combination of both MolName and LabelPrefix with usage of LabelPrefix values for empty
1407 molname lines.
1408
1409 Possible values: I<DataField | MolName | LabelPrefix | MolNameOrLabelPrefix>.
1410 Default value: I<LabelPrefix>.
1411
1412 For I<MolNameOrLabelPrefix> value of B<--CompoundIDMode>, molname line in I<SDFile(s)> takes
1413 precedence over sequential compound IDs generated using I<LabelPrefix> and only empty molname
1414 values are replaced with sequential compound IDs.
1415
1416 This is only used for I<CompoundID> value of B<--DataFieldsMode> option.
1417
1418 =item B<--DataFields> I<"FieldLabel1,FieldLabel2,...">
1419
1420 Comma delimited list of I<SDFiles(s)> data fields to extract and write to CSV/TSV text file(s) along
1421 with calculated physicochemical properties for I<text | both> values of B<--output> option.
1422
1423 This is only used for I<Specify> value of B<--DataFieldsMode> option.
1424
1425 Examples:
1426
1427 Extreg
1428 MolID,CompoundName
1429
1430 =item B<-d, --DataFieldsMode> I<All | Common | Specify | CompoundID>
1431
1432 Specify how data fields in I<SDFile(s)> are transferred to output CSV/TSV text file(s) along
1433 with calculated physicochemical properties for I<text | both> values of B<--output> option:
1434 transfer all SD data fields; transfer SD data fields common to all compounds; extract specified
1435 data fields; generate a compound ID using molname line, a compound prefix, or a combination
1436 of both. Possible values: I<All | Common | specify | CompoundID>. Default value: I<CompoundID>.
1437
1438 =item B<-f, --Filter> I<Yes | No>
1439
1440 Specify whether to check and filter compound data in SDFile(s). Possible values: I<Yes or No>.
1441 Default value: I<Yes>.
1442
1443 By default, compound data is checked before calculating physicochemical properties and compounds
1444 containing atom data corresponding to non-element symbols or no atom data are ignored.
1445
1446 =item B<-h, --help>
1447
1448 Print this help message.
1449
1450 =item B<--HydrogenBonds> I<HBondsType1 | HBondsType2>
1451
1452 Parameters to control calculation of hydrogen bond donors and acceptors. Possible values:
1453 I<HBondsType1, HydrogenBondsType1, HBondsType2, HydrogenBondsType2>. Default value:
1454 I<HBondsType2> which corresponds to B<RuleOf5> definition for number of hydrogen bond
1455 donors and acceptors.
1456
1457 The current release of MayaChemTools supports identification of two types of hydrogen bond
1458 donor and acceptor atoms with these names:
1459
1460 HBondsType1 or HydrogenBondsType1
1461 HBondsType2 or HydrogenBondsType2
1462
1463 The names of these hydrogen bond types are rather arbitrary. However, their definitions have
1464 specific meaning and are as follows:
1465
1466 HydrogenBondsType1 [ Ref 60-61, Ref 65-66 ]:
1467
1468 Donor: NH, NH2, OH - Any N and O with available H
1469 Acceptor: N[!H], O - Any N without available H and any O
1470
1471 HydrogenBondsType2 [ Ref 91 ]:
1472
1473 Donor: NH, NH2, OH - N and O with available H
1474 Acceptor: N, O - Any N and O
1475
1476 =item B<-k, --KeepLargestComponent> I<Yes | No>
1477
1478 Calculate physicochemical properties for only the largest component in molecule. Possible values:
1479 I<Yes or No>. Default value: I<Yes>.
1480
1481 For molecules containing multiple connected components, physicochemical properties can be
1482 calculated in two different ways: use all connected components or just the largest connected
1483 component. By default, all atoms except for the largest connected component are
1484 deleted before calculation of physicochemical properties.
1485
1486 =item B<-m, --mode> I<All | RuleOf5 | RuleOf3 | "name1, [name2,...]">
1487
1488 Specify physicochemical properties to calculate for SDFile(s): calculate all available
1489 physicochemical properties; calculate properties corresponding to Rule of 5; or use a comma delimited
1490 list of supported physicochemical properties. Possible values: I<All | RuleOf5 | RuleOf3 |
1491 "name1, [name2,...]">.
1492
1493 Default value: I<MolecularWeight, HeavyAtoms, MolecularVolume, RotatableBonds, HydrogenBondDonors,
1494 HydrogenBondAcceptors, SLogP, TPSA>. These properties are calculated by default.
1495
1496 I<RuleOf5> [ Ref 91 ] includes these properties: I<MolecularWeight, HydrogenBondDonors, HydrogenBondAcceptors,
1497 SLogP>. I<RuleOf5> states: MolecularWeight <= 500, HydrogenBondDonors <= 5, HydrogenBondAcceptors <= 10, and
1498 logP <= 5.
1499
1500 I<RuleOf3> [ Ref 92 ] includes these properties: I<MolecularWeight, RotatableBonds, HydrogenBondDonors,
1501 HydrogenBondAcceptors, SLogP, TPSA>. I<RuleOf3> states: MolecularWeight <= 300, RotatableBonds <= 3,
1502 HydrogenBondDonors <= 3, HydrogenBondAcceptors <= 3, logP <= 3, and TPSA <= 60.
1503
1504 I<All> calculates all supported physicochemical properties: I<MolecularWeight, ExactMass,
1505 HeavyAtoms, Rings, AromaticRings, MolecularVolume, RotatableBonds, HydrogenBondDonors,
1506 HydrogenBondAcceptors, SLogP, SMR, TPSA, Fsp3Carbons, Sp3Carbons, MolecularComplexity>.
1507
1508 =item B<--MolecularComplexity> I<Name,Value, [Name,Value,...]>
1509
1510 Parameters to control calculation of molecular complexity: it's a comma delimited list of parameter
1511 name and value pairs.
1512
1513 Possible parameter names: I<MolecularComplexityType, AtomIdentifierType,
1514 AtomicInvariantsToUse, FunctionalClassesToUse, MACCSKeysSize, NeighborhoodRadius,
1515 MinPathLength, MaxPathLength, UseBondSymbols, MinDistance, MaxDistance,
1516 UseTriangleInequality, DistanceBinSize, NormalizationMethodology>.
1517
1518 The valid parameter values for each parameter name are described in the following sections.
1519
1520 The current release of MayaChemTools supports calculation of molecular complexity using
1521 I<MolecularComplexityType> parameter corresponding to the number of bits-set or unique
1522 keys [ Ref 117-119 ] in molecular fingerprints. The valid values for I<MolecularComplexityType>
1523 are:
1524
1525 AtomTypesFingerprints
1526 ExtendedConnectivityFingerprints
1527 MACCSKeys
1528 PathLengthFingerprints
1529 TopologicalAtomPairsFingerprints
1530 TopologicalAtomTripletsFingerprints
1531 TopologicalAtomTorsionsFingerprints
1532 TopologicalPharmacophoreAtomPairsFingerprints
1533 TopologicalPharmacophoreAtomTripletsFingerprints
1534
1535 Default value for I<MolecularComplexityType>: I<MACCSKeys>.
1536
1537 I<AtomIdentifierType> parameter name corresponds to atom types used during generation of
1538 fingerprints. The valid values for I<AtomIdentifierType> are: I<AtomicInvariantsAtomTypes,
1539 DREIDINGAtomTypes, EStateAtomTypes, FunctionalClassAtomTypes, MMFF94AtomTypes,
1540 SLogPAtomTypes, SYBYLAtomTypes, TPSAAtomTypes, UFFAtomTypes>. I<AtomicInvariantsAtomTypes>
1541 is not supported during the following values of I<MolecularComplexityType>: I<MACCSKeys,
1542 TopologicalPharmacophoreAtomPairsFingerprints, TopologicalPharmacophoreAtomTripletsFingerprints>.
1543 I<FunctionalClassAtomTypes> is the only valid value for I<AtomIdentifierType> for topological
1544 pharmacophore fingerprints.
1545
1546 Default value for I<AtomIdentifierType>: I<AtomicInvariantsAtomTypes>
1547 for all except topological pharmacophore fingerprints where it is I<FunctionalClassAtomTypes>.
1548
1549 I<AtomicInvariantsToUse> parameter name and values are used during I<AtomicInvariantsAtomTypes>
1550 value of parameter I<AtomIdentifierType>. It's a list of space separated valid atomic invariant atom types.
1551
1552 Possible values for atomic invariants are: I<AS, X, BO, LBO, SB, DB, TB, H, Ar, RA, FC, MN, SM>.
1553 Default values for I<AtomicInvariantsToUse> parameter are set differently for different fingerprints
1554 using I<MolecularComplexityType> parameter as shown below:
1555
1556 MolecularComplexityType AtomicInvariantsToUse
1557
1558 AtomTypesFingerprints AS X BO H FC
1559 TopologicalAtomPairsFingerprints AS X BO H FC
1560 TopologicalAtomTripletsFingerprints AS X BO H FC
1561 TopologicalAtomTorsionsFingerprints AS X BO H FC
1562
1563 ExtendedConnectivityFingerprints AS X BO H FC MN
1564 PathLengthFingerprints AS
1565
1566
1567 The atomic invariants abbreviations correspond to:
1568
1569 AS = Atom symbol corresponding to element symbol
1570
1571 X<n> = Number of non-hydrogen atom neighbors or heavy atoms
1572 BO<n> = Sum of bond orders to non-hydrogen atom neighbors or heavy atoms
1573 LBO<n> = Largest bond order of non-hydrogen atom neighbors or heavy atoms
1574 SB<n> = Number of single bonds to non-hydrogen atom neighbors or heavy atoms
1575 DB<n> = Number of double bonds to non-hydrogen atom neighbors or heavy atoms
1576 TB<n> = Number of triple bonds to non-hydrogen atom neighbors or heavy atoms
1577 H<n> = Number of implicit and explicit hydrogens for atom
1578 Ar = Aromatic annotation indicating whether atom is aromatic
1579 RA = Ring atom annotation indicating whether atom is in a ring
1580 FC<+n/-n> = Formal charge assigned to atom
1581 MN<n> = Mass number indicating isotope other than most abundant isotope
1582 SM<n> = Spin multiplicity of atom. Possible values: 1 (singlet), 2 (doublet) or
1583 3 (triplet)
1584
1585 Atom type generated by AtomTypes::AtomicInvariantsAtomTypes class corresponds to:
1586
1587 AS.X<n>.BO<n>.LBO<n>.<SB><n>.<DB><n>.<TB><n>.H<n>.Ar.RA.FC<+n/-n>.MN<n>.SM<n>
1588
1589 Except for AS which is a required atomic invariant in atom types, all other atomic invariants are
1590 optional. Atom type specification doesn't include atomic invariants with zero or undefined values.
1591
1592 In addition to usage of abbreviations for specifying atomic invariants, the following descriptive words
1593 are also allowed:
1594
1595 X : NumOfNonHydrogenAtomNeighbors or NumOfHeavyAtomNeighbors
1596 BO : SumOfBondOrdersToNonHydrogenAtoms or SumOfBondOrdersToHeavyAtoms
1597 LBO : LargestBondOrderToNonHydrogenAtoms or LargestBondOrderToHeavyAtoms
1598 SB : NumOfSingleBondsToNonHydrogenAtoms or NumOfSingleBondsToHeavyAtoms
1599 DB : NumOfDoubleBondsToNonHydrogenAtoms or NumOfDoubleBondsToHeavyAtoms
1600 TB : NumOfTripleBondsToNonHydrogenAtoms or NumOfTripleBondsToHeavyAtoms
1601 H : NumOfImplicitAndExplicitHydrogens
1602 Ar : Aromatic
1603 RA : RingAtom
1604 FC : FormalCharge
1605 MN : MassNumber
1606 SM : SpinMultiplicity
1607
1608 I<AtomTypes::AtomicInvariantsAtomTypes> module is used to assign atomic invariant
1609 atom types.
1610
1611 I<FunctionalClassesToUse> parameter name and values are used during I<FunctionalClassAtomTypes>
1612 value of parameter I<AtomIdentifierType>. It's a list of space separated valid atom functional classes.
1613
1614 Possible values for atom functional classes are: I<Ar, CA, H, HBA, HBD, Hal, NI, PI, RA>.
1615
1616 Default value for I<FunctionalClassesToUse> parameter is set to:
1617
1618 HBD HBA PI NI Ar Hal
1619
1620 for all fingerprints except for the following two I<MolecularComplexityType> fingerprints:
1621
1622 MolecularComplexityType FunctionalClassesToUse
1623
1624 TopologicalPharmacophoreAtomPairsFingerprints HBD HBA PI NI H
1625 TopologicalPharmacophoreAtomTripletsFingerprints HBD HBA PI NI H Ar
1626
1627 The functional class abbreviations correspond to:
1628
1629 HBD: HydrogenBondDonor
1630 HBA: HydrogenBondAcceptor
1631 PI : PositivelyIonizable
1632 NI : NegativelyIonizable
1633 Ar : Aromatic
1634 Hal : Halogen
1635 H : Hydrophobic
1636 RA : RingAtom
1637 CA : ChainAtom
1638
1639 Functional class atom type specification for an atom corresponds to:
1640
1641 Ar.CA.H.HBA.HBD.Hal.NI.PI.RA
1642
1643 I<AtomTypes::FunctionalClassAtomTypes> module is used to assign functional class atom
1644 types. It uses following definitions [ Ref 60-61, Ref 65-66 ]:
1645
1646 HydrogenBondDonor: NH, NH2, OH
1647 HydrogenBondAcceptor: N[!H], O
1648 PositivelyIonizable: +, NH2
1649 NegativelyIonizable: -, C(=O)OH, S(=O)OH, P(=O)OH
1650
1651 I<MACCSKeysSize> parameter name is only used during I<MACCSKeys> value of
1652 I<MolecularComplexityType> and corresponds to the size of MACCS key set. Possible
1653 values: I<166 or 322>. Default value: I<166>.
1654
1655 I<NeighborhoodRadius> parameter name is only used during I<ExtendedConnectivityFingerprints>
1656 value of I<MolecularComplexityType> and corresponds to atomic neighborhoods radius for
1657 generating extended connectivity fingerprints. Possible values: positive integer. Default value:
1658 I<2>.
1659
1660 I<MinPathLength> and I<MaxPathLength> parameters are only used during I<PathLengthFingerprints>
1661 value of I<MolecularComplexityType> and correspond to minimum and maximum path lengths to use
1662 for generating path length fingerprints. Possible values: positive integers. Default value: I<MinPathLength - 1>;
1663 I<MaxPathLength - 8>.
1664
1665 I<UseBondSymbols> parameter is only used during I<PathLengthFingerprints> value of
1666 I<MolecularComplexityType> and indicates whether bond symbols are included in atom path
1667 strings used to generate path length fingerprints. Possible value: I<Yes or No>. Default value:
1668 I<Yes>.
1669
1670 I<MinDistance> and I<MaxDistance> parameters are only used during I<TopologicalAtomPairsFingerprints>
1671 and I<TopologicalAtomTripletsFingerprints> values of I<MolecularComplexityType> and correspond to
1672 minimum and maximum bond distance between atom pairs during topological pharmacophore fingerprints.
1673 Possible values: positive integers. Default value: I<MinDistance - 1>; I<MaxDistance - 10>.
1674
1675 I<UseTriangleInequality> parameter is used during these values for I<MolecularComplexityType>:
1676 I<TopologicalAtomTripletsFingerprints> and I<TopologicalPharmacophoreAtomTripletsFingerprints>.
1677 Possible values: I<Yes or No>. It determines whether to apply triangle inequality to distance triplets.
1678 Default value: I<TopologicalAtomTripletsFingerprints - No>;
1679 I<TopologicalPharmacophoreAtomTripletsFingerprints - Yes>.
1680
1681 I<DistanceBinSize> parameter is used during I<TopologicalPharmacophoreAtomTripletsFingerprints>
1682 value of I<MolecularComplexityType> and corresponds to distance bin size used for binning
1683 distances during generation of topological pharmacophore atom triplets fingerprints. Possible
1684 value: positive integer. Default value: I<2>.
1685
1686 I<NormalizationMethodology> is only used for these values for I<MolecularComplexityType>:
1687 I<ExtendedConnectivityFingerprints>, I<TopologicalPharmacophoreAtomPairsFingerprints>
1688 and I<TopologicalPharmacophoreAtomTripletsFingerprints>. It corresponds to normalization
1689 methodology to use for scaling the number of bits-set or unique keys during generation of
1690 fingerprints. Possible values during I<ExtendedConnectivityFingerprints>: I<None or
1691 ByHeavyAtomsCount>; Default value: I<None>. Possible values during topological
1692 pharmacophore atom pairs and triplets fingerprints: I<None or ByPossibleKeysCount>;
1693 Default value: I<None>. I<ByPossibleKeysCount> corresponds to total number of
1694 possible topological pharmacophore atom pairs or triplets in a molecule.
1695
1696 Examples of I<MolecularComplexity> name and value parameters:
1697
1698 MolecularComplexityType,AtomTypesFingerprints,AtomIdentifierType,
1699 AtomicInvariantsAtomTypes,AtomicInvariantsToUse,AS X BO H FC
1700
1701 MolecularComplexityType,ExtendedConnectivityFingerprints,
1702 AtomIdentifierType,AtomicInvariantsAtomTypes,
1703 AtomicInvariantsToUse,AS X BO H FC MN,NeighborhoodRadius,2,
1704 NormalizationMethodology,None
1705
1706 MolecularComplexityType,MACCSKeys,MACCSKeysSize,166
1707
1708 MolecularComplexityType,PathLengthFingerprints,AtomIdentifierType,
1709 AtomicInvariantsAtomTypes,AtomicInvariantsToUse,AS,MinPathLength,
1710 1,MaxPathLength,8,UseBondSymbols,Yes
1711
1712 MolecularComplexityType,TopologicalAtomPairsFingerprints,
1713 AtomIdentifierType,AtomicInvariantsAtomTypes,AtomicInvariantsToUse,
1714 AS X BO H FC,MinDistance,1,MaxDistance,10
1715
1716 MolecularComplexityType,TopologicalAtomTripletsFingerprints,
1717 AtomIdentifierType,AtomicInvariantsAtomTypes,AtomicInvariantsToUse,
1718 AS X BO H FC,MinDistance,1,MaxDistance,10,UseTriangleInequality,No
1719
1720 MolecularComplexityType,TopologicalAtomTorsionsFingerprints,
1721 AtomIdentifierType,AtomicInvariantsAtomTypes,AtomicInvariantsToUse,
1722 AS X BO H FC
1723
1724 MolecularComplexityType,TopologicalPharmacophoreAtomPairsFingerprints,
1725 AtomIdentifierType,FunctionalClassAtomTypes,FunctionalClassesToUse,
1726 HBD HBA PI NI H,MinDistance,1,MaxDistance,10,NormalizationMethodology,
1727 None
1728
1729 MolecularComplexityType,TopologicalPharmacophoreAtomTripletsFingerprints,
1730 AtomIdentifierType,FunctionalClassAtomTypes,FunctionalClassesToUse,
1731 HBD HBA PI NI H Ar,MinDistance,1,MaxDistance,10,NormalizationMethodology,
1732 None,UseTriangleInequality,Yes,NormalizationMethodology,None,
1733 DistanceBinSize,2
1734
1735 =item B<--OutDelim> I<comma | tab | semicolon>
1736
1737 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1738 Default value: I<comma>.
1739
1740 =item B<--output> I<SD | text | both>
1741
1742 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<text>.
1743
1744 =item B<-o, --overwrite>
1745
1746 Overwrite existing files.
1747
1748 =item B<--Precision> I<Name,Number,[Name,Number,..]>
1749
1750 Precision of calculated property values in the output file: it's a comma delimited list of
1751 property name and precision value pairs. Possible property names: I<MolecularWeight,
1752 ExactMass>. Possible values: positive integers. Default value: I<MolecularWeight,2,
1753 ExactMass,4>.
1754
1755 Examples:
1756
1757 ExactMass,3
1758 MolecularWeight,1,ExactMass,2
1759
1760 =item B<-q, --quote> I<Yes | No>
1761
1762 Put quote around column values in output CSV/TSV text file(s). Possible values:
1763 I<Yes or No>. Default value: I<Yes>.
1764
1765 =item B<-r, --root> I<RootName>
1766
1767 New file name is generated using the root: <Root>.<Ext>. Default for new file names:
1768 <SDFileName><PhysicochemicalProperties>.<Ext>. The file type determines <Ext> value.
1769 The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab
1770 delimited text files, respectively. This option is ignored for multiple input files.
1771
1772 =item B<--RotatableBonds> I<Name,Value, [Name,Value,...]>
1773
1774 Parameters to control calculation of rotatable bonds [ Ref 92 ]: it's a comma delimited list of parameter
1775 name and value pairs. Possible parameter names: I<IgnoreTerminalBonds, IgnoreBondsToTripleBonds,
1776 IgnoreAmideBonds, IgnoreThioamideBonds, IgnoreSulfonamideBonds>. Possible parameter values:
1777 I<Yes or No>. By default, value of all parameters is set to I<Yes>.
1778
1779 =item B<--RuleOf3Violations> I<Yes | No>
1780
1781 Specify whether to calculate B<RuleOf3Violations> for SDFile(s). Possible values: I<Yes or No>.
1782 Default value: I<No>.
1783
1784 For I<Yes> value of B<RuleOf3Violations>, in addition to calculating total number of B<RuleOf3> violations,
1785 individual violations for compounds are also written to output files.
1786
1787 B<RuleOf3> [ Ref 92 ] states: MolecularWeight <= 300, RotatableBonds <= 3, HydrogenBondDonors <= 3,
1788 HydrogenBondAcceptors <= 3, logP <= 3, and TPSA <= 60.
1789
1790 =item B<--RuleOf5Violations> I<Yes | No>
1791
1792 Specify whether to calculate B<RuleOf5Violations> for SDFile(s). Possible values: I<Yes or No>.
1793 Default value: I<No>.
1794
1795 For I<Yes> value of B<RuleOf5Violations>, in addition to calculating total number of B<RuleOf5> violations,
1796 individual violations for compounds are also written to output files.
1797
1798 B<RuleOf5> [ Ref 91 ] states: MolecularWeight <= 500, HydrogenBondDonors <= 5, HydrogenBondAcceptors <= 10,
1799 and logP <= 5.
1800
1801 =item B<--TPSA> I<Name,Value, [Name,Value,...]>
1802
1803 Parameters to control calculation of TPSA: it's a comma delimited list of parameter name and value
1804 pairs. Possible parameter names: I<IgnorePhosphorus, IgnoreSulfur>. Possible parameter values:
1805 I<Yes or No>. By default, value of all parameters is set to I<Yes>.
1806
1807 By default, TPSA atom contributions from Phosphorus and Sulfur atoms are not included during
1808 TPSA calculations. [ Ref 91 ]
1809
1810 =item B<-w, --WorkingDir> I<DirName>
1811
1812 Location of working directory. Default value: current directory.
1813
1814 =back
1815
1816 =head1 EXAMPLES
1817
1818 To calculate default set of physicochemical properties - MolecularWeight, HeavyAtoms,
1819 MolecularVolume, RotatableBonds, HydrogenBondDonors, HydrogenBondAcceptors, SLogP,
1820 TPSA - and generate a SamplePhysicochemicalProperties.csv file containing sequential
1821 compound IDs along with properties data, type:
1822
1823 % CalculatePhysicochemicalProperties.pl -o Sample.sdf
1824
1825 To calculate all available physicochemical properties and generate both SampleAllProperties.csv
1826 and SampleAllProperties.sdf files containing sequential compound IDs in CSV file along with
1827 properties data, type:
1828
1829 % CalculatePhysicochemicalProperties.pl -m All --output both
1830 -r SampleAllProperties -o Sample.sdf
1831
1832 To calculate RuleOf5 physicochemical properties and generate a SampleRuleOf5Properties.csv file
1833 containing sequential compound IDs along with properties data, type:
1834
1835 % CalculatePhysicochemicalProperties.pl -m RuleOf5
1836 -r SampleRuleOf5Properties -o Sample.sdf
1837
1838 To calculate RuleOf5 physicochemical properties along with counting RuleOf5 violations and generate
1839 a SampleRuleOf5Properties.csv file containing sequential compound IDs along with properties data, type:
1840
1841 % CalculatePhysicochemicalProperties.pl -m RuleOf5 --RuleOf5Violations Yes
1842 -r SampleRuleOf5Properties -o Sample.sdf
1843
1844 To calculate RuleOf3 physicochemical properties and generate a SampleRuleOf3Properties.csv file
1845 containing sequential compound IDs along with properties data, type:
1846
1847 % CalculatePhysicochemicalProperties.pl -m RuleOf3
1848 -r SampleRuleOf3Properties -o Sample.sdf
1849
1850 To calculate RuleOf3 physicochemical properties along with counting RuleOf3 violations and generate
1851 a SampleRuleOf3Properties.csv file containing sequential compound IDs along with properties data, type:
1852
1853 % CalculatePhysicochemicalProperties.pl -m RuleOf3 --RuleOf3Violations Yes
1854 -r SampleRuleOf3Properties -o Sample.sdf
1855
1856 To calculate a specific set of physicochemical properties and generate a SampleProperties.csv file
1857 containing sequential compound IDs along with properties data, type:
1858
1859 % CalculatePhysicochemicalProperties.pl -m "Rings,AromaticRings"
1860 -r SampleProperties -o Sample.sdf
1861
1862 To calculate HydrogenBondDonors and HydrogenBondAcceptors using HydrogenBondsType1 definition
1863 and generate a SampleProperties.csv file containing sequential compound IDs along with properties
1864 data, type:
1865
1866 % CalculatePhysicochemicalProperties.pl -m "HydrogenBondDonors,HydrogenBondAcceptors"
1867 --HydrogenBonds HBondsType1 -r SampleProperties -o Sample.sdf
1868
1869 To calculate TPSA using sulfur and phosphorus atoms along with nitrogen and oxygen atoms and
1870 generate a SampleProperties.csv file containing sequential compound IDs along with properties
1871 data, type:
1872
1873 % CalculatePhysicochemicalProperties.pl -m "TPSA" --TPSA "IgnorePhosphorus,No,
1874 IgnoreSulfur,No" -r SampleProperties -o Sample.sdf
1875
1876 To calculate MolecularComplexity using extended connectivity fingerprints corresponding
1877 to atom neighborhood radius of 2 with atomic invariant atom types without any scaling and
1878 generate a SampleProperties.csv file containing sequential compound IDs along with properties
1879 data, type:
1880
1881 % CalculatePhysicochemicalProperties.pl -m MolecularComplexity --MolecularComplexity
1882 "MolecularComplexityType,ExtendedConnectivityFingerprints,NeighborhoodRadius,2,
1883 AtomIdentifierType, AtomicInvariantsAtomTypes,
1884 AtomicInvariantsToUse,AS X BO H FC MN,NormalizationMethodology,None"
1885 -r SampleProperties -o Sample.sdf
1886
1887 To calculate RuleOf5 physicochemical properties along with counting RuleOf5 violations and generate
1888 a SampleRuleOf5Properties.csv file containing compound IDs from molecule name line along with
1889 properties data, type:
1890
1891 % CalculatePhysicochemicalProperties.pl -m RuleOf5 --RuleOf5Violations Yes
1892 --DataFieldsMode CompoundID --CompoundIDMode MolName
1893 -r SampleRuleOf5Properties -o Sample.sdf
1894
1895 To calculate all available physicochemical properties and generate a SampleAllProperties.csv
1896 file containing compound ID using specified data field along with properties data,
1897 type:
1898
1899 % CalculatePhysicochemicalProperties.pl -m All
1900 --DataFieldsMode CompoundID --CompoundIDMode DataField --CompoundID Mol_ID
1901 -r SampleAllProperties -o Sample.sdf
1902
1903 To calculate all available physicochemical properties and generate a SampleAllProperties.csv
1904 file containing compound ID using combination of molecule name line and an explicit compound
1905 prefix along with properties data, type:
1906
1907 % CalculatePhysicochemicalProperties.pl -m All
1908 --DataFieldsMode CompoundID --CompoundIDMode MolnameOrLabelPrefix
1909 --CompoundID Cmpd --CompoundIDLabel MolID -r SampleAllProperties
1910 -o Sample.sdf
1911
1912 To calculate all available physicochemical properties and generate a SampleAllProperties.csv
1913 file containing specific data fields columns along with properties data, type:
1914
1915 % CalculatePhysicochemicalProperties.pl -m All
1916 --DataFieldsMode Specify --DataFields Mol_ID -r SampleAllProperties
1917 -o Sample.sdf
1918
1919 To calculate all available physicochemical properties and generate a SampleAllProperties.csv
1920 file containing common data fields columns along with properties data, type:
1921
1922 % CalculatePhysicochemicalProperties.pl -m All
1923 --DataFieldsMode Common -r SampleAllProperties -o Sample.sdf
1924
1925 To calculate all available physicochemical properties and generate both SampleAllProperties.csv
1926 and SampleAllProperties.sdf files containing all data fields columns in CSV file along with properties data, type:
1927
1928 % CalculatePhysicochemicalProperties.pl -m All
1929 --DataFieldsMode All --output both -r SampleAllProperties
1930 -o Sample.sdf
1931
1932 =head1 AUTHOR
1933
1934 Manish Sud <msud@san.rr.com>
1935
1936 =head1 SEE ALSO
1937
1938 ExtractFromSDFiles.pl, ExtractFromTextFiles.pl, InfoSDFiles.pl, InfoTextFiles.pl
1939
1940 =head1 COPYRIGHT
1941
1942 Copyright (C) 2015 Manish Sud. All rights reserved.
1943
1944 This file is part of MayaChemTools.
1945
1946 MayaChemTools is free software; you can redistribute it and/or modify it under
1947 the terms of the GNU Lesser General Public License as published by the Free
1948 Software Foundation; either version 3 of the License, or (at your option)
1949 any later version.
1950
1951 =cut