comparison bin/ExtractFromSDFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: ExtractFromSDFiles.pl,v $
4 # $Date: 2015/03/22 19:11:27 $
5 # $Revision: 1.48 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use SDFileUtil;
36 use FileUtil;
37 use TextUtil;
38
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so progress messages are not held back in a buffer...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark"...
$StartTime = Benchmark->new;

# Get the options and setup script; show the usage text and stop when help
# is requested or no input file is named on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files; only files flagged okay by RetrieveSDFilesInfo
# are processed...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    ExtractFromSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84
85 ###############################################################################
86
# Extract data from a SD file...
#
# Looks up the handler for the mode in $OptionsInfo{Mode} (case insensitive),
# opens the input and output files for the file at $FileIndex, runs the
# handler and closes the files again.
#
# Dies with the list of allowed values when the mode is not recognized. The
# mode is checked before any file is opened so an invalid mode does not
# create or truncate output files.
sub ExtractFromSDFile {
  my($FileIndex) = @_;

  my %ModeHandlers = (
    'alldatafields'         => \&ExtractAllDataFields,
    'commondatafields'      => \&ExtractCommonDataFields,
    'datafields'            => \&ExtractDataFields,
    'datafieldbylist'       => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldnotbylist'    => \&ExtractDataFieldNotByList,
    'datafieldsbyvalue'     => \&ExtractDataFieldsByValue,
    'datafieldsbyregex'     => \&ExtractDataFieldsByRegex,
    'randomcmpds'           => \&ExtractRandomCompounds,
    'molnames'              => \&ExtractMolNames,
    'recordnum'             => \&ExtractRecordNum,
    'recordnums'            => \&ExtractRecordNums,
    'recordrange'           => \&ExtractRecordRange,
    '2dcmpdrecords'         => \&Extract2DCmpdRecords,
    '3dcmpdrecords'         => \&Extract3DCmpdRecords,
  );

  my $ModeHandler = $ModeHandlers{lc($OptionsInfo{Mode})};
  if (!$ModeHandler) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $ModeHandler->($FileIndex);
  CloseInputAndOutputFiles();
}
155
# Extract all data fields...
#
# Uses every label collected for the file at $FileIndex as the output
# columns and writes each compound to the text and SD outputs.
sub ExtractAllDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  while (1) {
    # Stop as soon as no further compound string is returned...
    $SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef);
    last if !$SDFilesInfo{CmpdString};

    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
173
# Extract common data fields...
#
# Uses the labels recorded as common for the file at $FileIndex as the
# output columns and writes each compound to the text and SD outputs.
sub ExtractCommonDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  while (1) {
    # Stop as soon as no further compound string is returned...
    $SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef);
    last if !$SDFilesInfo{CmpdString};

    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
191
# Extract specified data fields...
#
# Uses the labels in $OptionsInfo{SpecifiedDataFieldLabels} as the output
# columns and writes each compound to the text and SD outputs.
sub ExtractDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  my $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  while (1) {
    # Stop as soon as no further compound string is returned...
    $SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef);
    last if !$SDFilesInfo{CmpdString};

    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
209
# Extract data fields using a list...
#
# Handles both the DataFieldByList and DataFieldUniqueByList modes. Compounds
# whose value for $OptionsInfo{SpecifiedDataFieldLabel} is a key of
# $OptionsInfo{SpecifiedDataFieldValues} are candidates for output:
#
#   . DataFieldUniqueByList: a compound is written only the first time its
#     value is seen.
#   . DataFieldByList: every matching compound is written.
#
# In both modes reading stops once each listed value has been seen at least
# once.
sub ExtractDataFieldByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Mark all the listed values as not seen yet for this file...
  for my $ListedValue (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$ListedValue} = "NotFound";
  }

  my $FoundCount = 0;
  my $ListedCount = $OptionsInfo{SpecifiedDataFieldValuesCount};
  my $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};

  my $UniqueMode = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;
  my $ListMode = ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) ? 1 : 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    # Compounds without the data field can't match...
    next CMPDSTRING if !exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

    SetupDataValues();

    my $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
    next CMPDSTRING if !exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue};

    if ($FoundCount < $ListedCount) {
      my $FirstHit = 0;
      if ($OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} eq "NotFound") {
        $FirstHit = 1;
        $FoundCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} = "Found";
      }
      if (($FirstHit && $UniqueMode) || $ListMode) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }

    # Every listed value has been seen; nothing more to look for...
    last CMPDSTRING if $FoundCount >= $ListedCount;
  }
}
260
# Extract data field whose values are not on the specified list...
#
# Writes the compounds which have the data field named by
# $OptionsInfo{SpecifiedDataFieldLabel} with a non-empty value that is not a
# key of $OptionsInfo{SpecifiedDataFieldValues}.
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    # Ignore compounds without the data field...
    next CMPDSTRING unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

    SetupDataValues();

    my $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};

    # Ignore empty values and values present on the specified list...
    next CMPDSTRING if IsEmpty($FieldValue);
    next CMPDSTRING if exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue};

    WriteSDFileCmpdString();
    WriteTextFileCmpdData();
  }
}
292
# Extract data fields by value...
#
# For each compound, every label in $OptionsInfo{SpecifiedDataFieldLabels}
# present in the compound is compared against its specified value using its
# criterion (eq, le or ge). Comparison is numerical or alphanumerical
# depending on $OptionsInfo{NumericalComparison}. A compound is written when
# the number of failed comparisons doesn't exceed $OptionsInfo{Violations}.
# Labels missing from a compound are not counted as violations.
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();

    my $Violations = 0;
    LABEL: for my $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL if !exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      my $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      my $Criterion = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$FieldLabel};
      my $TargetValue = $OptionsInfo{SpecifiedDataFieldValuesMap}{$FieldLabel};

      # Unknown criteria never count as a violation...
      my $Violated;
      if ($OptionsInfo{NumericalComparison}) {
        $Violated = ($Criterion =~ /^eq$/i) ? ($FieldValue != $TargetValue)
                  : ($Criterion =~ /^le$/i) ? ($FieldValue > $TargetValue)
                  : ($Criterion =~ /^ge$/i) ? ($FieldValue < $TargetValue)
                  :                           0;
      }
      else {
        $Violated = ($Criterion =~ /^eq$/i) ? ($FieldValue ne $TargetValue)
                  : ($Criterion =~ /^le$/i) ? ($FieldValue gt $TargetValue)
                  : ($Criterion =~ /^ge$/i) ? ($FieldValue lt $TargetValue)
                  :                           0;
      }
      $Violations++ if $Violated;
    }

    if ($Violations <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
338
# Extract data fields by value using regular expression match...
#
# For each compound, every label in $OptionsInfo{SpecifiedDataFieldLabels}
# present in the compound is matched against its specified regular
# expression: criterion eq requires a match and criterion ne requires no
# match. Matching ignores case when $OptionsInfo{RegexIgnoreCase} is set. A
# compound is written when the number of failed criteria doesn't exceed
# $OptionsInfo{Violations}.
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();

    my $Violations = 0;
    LABEL: for my $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL if !exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      my $FieldValue = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      my $Criterion = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$FieldLabel};
      my $Regex = $OptionsInfo{SpecifiedDataFieldRegexMap}{$FieldLabel};

      # The criterion is tested before the value so that the value is only
      # matched for the eq and ne criteria...
      my $Violated = 0;
      if ($OptionsInfo{RegexIgnoreCase}) {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($FieldValue !~ /$Regex/i) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ne$/i) {
          $Violated = ($FieldValue =~ /$Regex/i) ? 1 : 0;
        }
      }
      else {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($FieldValue !~ /$Regex/) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ne$/i) {
          $Violated = ($FieldValue =~ /$Regex/) ? 1 : 0;
        }
      }
      $Violations += $Violated;
    }

    if ($Violations <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
382
# Extract random compounds...
#
# Picks $OptionsInfo{NumOfCmpds} distinct record numbers at random, or all
# the records when the file at $FileIndex contains fewer compounds than
# that, and writes the corresponding compounds in file order. The random
# number generator is seeded with $OptionsInfo{Seed}.
#
# Record numbers are drawn until the required number of distinct values has
# been collected. Drawing a fixed number of times would select fewer
# compounds whenever the same record number comes up more than once.
sub ExtractRandomCompounds {
  my($FileIndex) = @_;
  my($CmpdNum, $CmpdCount, $NumToSelect, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
  srand($OptionsInfo{Seed});

  # Can't select more distinct records than the file contains; this also
  # guarantees that the selection loop terminates...
  $NumToSelect = ($OptionsInfo{NumOfCmpds} < $CmpdCount) ? $OptionsInfo{NumOfCmpds} : $CmpdCount;

  %RandomCmpdIndexMap = ();
  while (keys(%RandomCmpdIndexMap) < $NumToSelect) {
    # Record numbers start from 1...
    $RandomIndex = int (rand $CmpdCount) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }

  $CmpdNum = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
      next CMPDSTRING;
    }

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
420
# Extract mol names...
#
# Writes a text file containing a single MolName column with the molecule
# name parsed from the first line of each compound.
sub ExtractMolNames {
  my($FileIndex) = @_;
  my($MolName, $NewTextFileRef, @CmpdLines);

  # Assign rather than push: the labels are shared across input files, so
  # appending would add one more MolName column label for each file
  # processed...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}
436
# Extract a specific compound record...
#
# Writes the compound whose position in the input file, counting from 1,
# equals $OptionsInfo{RecordNum} and stops reading.
sub ExtractRecordNum {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $RecordCount = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordCount++;
    next CMPDSTRING unless $RecordCount == $OptionsInfo{RecordNum};

    WriteSDFileCmpdString();

    # Data fields are only needed for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      my @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # Found it; no need to read any further...
    last CMPDSTRING;
  }
}
464
# Extract specific compound records...
#
# Writes the compounds whose position in the input file, counting from 1, is
# a key of $OptionsInfo{RecordNums}. Reading stops at the first unlisted
# record found after either passing $OptionsInfo{RecordNumsMax} or writing
# $OptionsInfo{RecordNumsCount} compounds.
sub ExtractRecordNums {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $RecordCount = 0;
  my $WrittenCount = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordCount++;

    if (!exists $OptionsInfo{RecordNums}{$RecordCount}) {
      # Not a listed record: give up when nothing is left to find...
      last CMPDSTRING if ($RecordCount > $OptionsInfo{RecordNumsMax} || $WrittenCount >= $OptionsInfo{RecordNumsCount});
      next CMPDSTRING;
    }

    $WrittenCount++;
    WriteSDFileCmpdString();

    # Data fields are only needed for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      my @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
496
497
# Extract compounds in a specific record range...
#
# Writes the compounds whose position in the input file, counting from 1,
# lies between $OptionsInfo{StartRecordNum} and $OptionsInfo{EndRecordNum},
# both inclusive, and stops reading after the end of the range.
sub ExtractRecordRange {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my $RecordCount = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordCount++;

    # Past the end of the range: done. Before its start: keep reading...
    last CMPDSTRING if $RecordCount > $OptionsInfo{EndRecordNum};
    next CMPDSTRING if $RecordCount < $OptionsInfo{StartRecordNum};

    WriteSDFileCmpdString();

    # Data fields are only needed for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      my @Lines = split "\n", $SDFilesInfo{CmpdString};
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
526
# Extract 2D compound records...
#
# Writes the compounds for which IsCmpd2D returns a true value.
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};

    if (IsCmpd2D(\@Lines)) {
      WriteSDFileCmpdString();

      # Data fields are only needed for the text file...
      if ($OptionsInfo{OutputTextFile}) {
        %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
        SetupDataValues();
        WriteTextFileCmpdData();
      }
    }
  }
}
551
# Extract 3D compound records...
#
# Writes the compounds for which IsCmpd3D returns a true value.
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my @Lines = split "\n", $SDFilesInfo{CmpdString};

    if (IsCmpd3D(\@Lines)) {
      WriteSDFileCmpdString();

      # Data fields are only needed for the text file...
      if ($OptionsInfo{OutputTextFile}) {
        %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
        SetupDataValues();
        WriteTextFileCmpdData();
      }
    }
  }
}
576
577
# Open input and output files...
#
# Opens the SD and text output files selected by $OptionsInfo{OutputSDFile}
# and $OptionsInfo{OutputTextFile} along with the input SD file at
# $FileIndex, and stores the file handles in $SDFilesInfo{NewSDFileRef},
# $SDFilesInfo{NewTextFileRef} and $SDFilesInfo{InputSDFileRef}.
#
# Files are opened using three argument open with lexical file handles so
# that file names are always treated literally: leading or trailing white
# space and characters such as > or | in a name can't change the open mode.
#
# Dies when any of the files can't be opened.
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;
  my($InputSDFileName, $NewSDFileName, $NewTextFileName);

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  $InputSDFileName = $SDFilesList[$FileIndex];
  $NewSDFileName = $SDFilesInfo{NewSDFileName}[$FileIndex];
  $NewTextFileName = $SDFilesInfo{NewTextFileName}[$FileIndex];

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open my $NewSDFileHandle, ">", $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileHandle;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open my $NewTextFileHandle, ">", $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileHandle;
  }

  open my $InputSDFileHandle, "<", $InputSDFileName or die "Error: Couldn't open $InputSDFileName: $! \n";
  $SDFilesInfo{InputSDFileRef} = $InputSDFileHandle;
}
608
# Close open input and output files...
#
# Closes whichever of the file handles in $SDFilesInfo{NewSDFileRef},
# $SDFilesInfo{NewTextFileRef} and $SDFilesInfo{InputSDFileRef} are set and
# resets all three to undef.
#
# A failure to close an output file is reported as a warning: buffered
# write errors, such as a full disk, only show up at close time and would
# otherwise leave a truncated output file behind without any message.
sub CloseInputAndOutputFiles {
  if ($SDFilesInfo{NewSDFileRef}) {
    close($SDFilesInfo{NewSDFileRef}) or warn "Warning: Couldn't close output SD file: $! \n";
  }
  if ($SDFilesInfo{NewTextFileRef}) {
    close($SDFilesInfo{NewTextFileRef}) or warn "Warning: Couldn't close output text file: $! \n";
  }

  if ($SDFilesInfo{InputSDFileRef}) {
    close $SDFilesInfo{InputSDFileRef};
  }

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;
  $SDFilesInfo{InputSDFileRef} = undef;
}
626
# Write out column labels for text file...
#
# Does nothing unless a text file is being generated. The labels in
# $SDFilesInfo{DataLabels} are joined using the output delimiter and quote
# settings; a StructureDataString label is added at the end when
# $OptionsInfo{OutputStrDataString} is set.
sub WriteTextFileColLabels {
  return if !$OptionsInfo{OutputTextFile};

  # The extra label goes into a copy so the stored labels stay unchanged...
  my $LabelsRef = $OptionsInfo{OutputStrDataString}
    ? [ @{$SDFilesInfo{DataLabels}}, "StructureDataString" ]
    : \@{$SDFilesInfo{DataLabels}};

  my $LabelsLine = JoinWords($LabelsRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  my $TextFileRef = $SDFilesInfo{NewTextFileRef};
  print $TextFileRef "$LabelsLine\n";
}
651
# Setup values for data fields...
#
# Fills $SDFilesInfo{DataValues} with one value for each label in
# $SDFilesInfo{DataLabels}, in the same order, taken from
# $SDFilesInfo{DataFieldValues}. Labels missing from the current compound
# get an empty string.
sub SetupDataValues {
  my @Values;

  for my $Label (@{$SDFilesInfo{DataLabels}}) {
    if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
      push @Values, $SDFilesInfo{DataFieldValues}{$Label};
    }
    else {
      push @Values, "";
    }
  }
  @{$SDFilesInfo{DataValues}} = @Values;
}
656
# Write out structure data and specific data fields to SD file...
#
# Does nothing unless a SD file is being generated. Writes the structure
# part of $SDFilesInfo{CmpdString}, that is everything before the molfile
# terminator line, followed by the terminator, one data field for each
# label in $SDFilesInfo{DataLabels} with the matching value from
# $SDFilesInfo{DataValues}, and the $$$$ record delimiter.
sub WriteSDFileCmpdData {
  my($MolString, $Count, $NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # The molfile terminator line is "M  END" with two spaces. Match it at the
  # start of a line and allow any number of spaces so that the structure
  # block is always cut at the terminator and the data fields after it
  # are dropped...
  ($MolString) = split /^M +END/m, $SDFilesInfo{CmpdString};
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef ">  <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}
676
# Write out compound string...
#
# Writes $SDFilesInfo{CmpdString} as is, followed by a new line, to the new
# SD file. Does nothing unless a SD file is being generated.
sub WriteSDFileCmpdString {
  return unless $OptionsInfo{OutputSDFile};

  my $SDFileRef = $SDFilesInfo{NewSDFileRef};
  print $SDFileRef "$SDFilesInfo{CmpdString}\n";
}
688
# Write out data for text file...
#
# Does nothing unless a text file is being generated. Writes one line with
# the values in $SDFilesInfo{DataValues} joined using the output delimiter
# and quote settings. When $OptionsInfo{OutputStrDataString} is set, the
# structure data string is added as the last column: either the complete
# compound string or only its structure part up to the molfile terminator,
# depending on $OptionsInfo{StrDataStringWithFields}.
sub WriteTextFileCmpdData {
  my($DataValuesLine, $NewTextFileRef);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  $DataValuesLine =~ s/\n/;/g;

  if ($OptionsInfo{OutputStrDataString}) {
    # Append structure data string...
    my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);

    if ($OptionsInfo{StrDataStringWithFields}) {
      $StrDataString = $SDFilesInfo{CmpdString};
    }
    else {
      # The molfile terminator line is "M  END" with two spaces. Match it at
      # the start of a line and allow any number of spaces so that the data
      # fields after the structure block are always dropped...
      ($StrDataString) = split /^M +END/m, $SDFilesInfo{CmpdString};
      $StrDataString .= "M  END";
    }

    # Turn the multi-line structure data into a single line value...
    $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
    $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

    $OutDelim = $OptionsInfo{OutDelim};
    $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";

    print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
  }
  else {
    print $NewTextFileRef "$DataValuesLine\n";
  }
}
728
# Retrieve information about input SD files...
#
# Resets %SDFilesInfo and, for each file in @SDFilesList, records:
#
#   . FileOkay: 1 when the file passed all the checks; 0 otherwise.
#   . NewSDFileName, NewTextFileName: names of the output files.
#   . CmpdCount: number of compounds read while scanning the file; stays 0
#     when the mode doesn't need a scan.
#   . AllDataFieldLabels: data field labels collected during the scan.
#   . CommonDataFieldLabels: labels present in every compound; only set
#     in commondatafields mode.
#
# A file is skipped with a warning when it doesn't exist, isn't a SD file,
# can't be opened, would be overwritten by its own SD output, or when an
# output file already exists and overwriting hasn't been requested.
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileName}} = ();
  @{$SDFilesInfo{NewSDFileName}} = ();

  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{CommonDataFieldLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all the checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileName}[$Index] = "";
    $SDFilesInfo{NewSDFileName}[$Index] = "";

    @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
    @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file. The specified root
    # is used only for a single input file...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . $OptionsInfo{FileNameMode};
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $OptionsInfo{OutFileRoot};
      }
    }
    $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
    $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";

    if ($OptionsInfo{OutputSDFile}) {
      if (lc($NewSDFileName) eq lc($SDFile)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{Overwrite}) {
      if ($OptionsInfo{OutputSDFile}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{OutputTextFile}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Three argument open with a lexical file handle: the file name is
    # always treated literally...
    my($SDFileHandle);
    if (!open $SDFileHandle, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CountCmpds, $CollectDataFields);
    my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap, @CommonDataFieldLabels);

    $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;

    # Data field labels are needed only to write out a text file and only
    # for these modes...
    $CollectDataFields = ($OptionsInfo{OutputTextFile} && $OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|recordnum|recordnums|recordrange)$/i) ? 1 : 0;

    $CmpdCount = 0;
    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    %DataFieldLabelsMap = ();

    if ($CountCmpds || $CollectDataFields) {
      CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
        $CmpdCount++;

        # For a single record, only the labels of that record matter...
        if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
          if ($CmpdCount == $OptionsInfo{RecordNum}) {
            @CmpdLines = split "\n", $CmpdString;
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            last CMPDSTRING;
          }
        }

        if ($CollectDataFields) {
          my($Label);
          @CmpdLines = split "\n", $CmpdString;

          # Process compound data header labels and figure out which ones are present for
          # all the compounds...
          if (@DataFieldLabels) {
            my(@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);
            my(%CmpdDataFieldLabelsMap) = ();

            # Setup a map for the current labels...
            for $Label (@CmpdDataFieldLabels) {
              $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
            }
            # Known labels missing from this compound are no longer present in all...
            for $Label (@DataFieldLabels) {
              if (!$CmpdDataFieldLabelsMap{$Label}) {
                $DataFieldLabelsMap{$Label} = "PresentInSome";
              }
            }
            # Labels seen for the first time in this compound can't be present in all...
            for $Label (@CmpdDataFieldLabels) {
              if (!$DataFieldLabelsMap{$Label}) {
                push @DataFieldLabels, $Label;
                $DataFieldLabelsMap{$Label} = "PresentInSome";
              }
            }
          }
          else {
            # Get the initial label set and set up a map...
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            for $Label (@DataFieldLabels) {
              $DataFieldLabelsMap{$Label} = "PresentInAll";
            }
          }
        }
      }

      # Identify the common data field labels once after the scan instead
      # of rebuilding the list for every compound...
      if ($CollectDataFields && $OptionsInfo{Mode} =~ /^commondatafields$/i) {
        @CommonDataFieldLabels = grep { $DataFieldLabelsMap{$_} eq "PresentInAll" } @DataFieldLabels;
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
    $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;

    close $SDFileHandle;
  }
}
891
892 # Process options...
893 sub ProcessOptions {
894 %OptionsInfo = ();
895
896 $OptionsInfo{Mode} = $Options{mode};
897
898 $OptionsInfo{InDelim} = "\,";
899 if ($Options{indelim} =~ /^semicolon$/i) {
900 $OptionsInfo{InDelim} = "\;";
901 }
902 elsif ($Options{indelim} =~ /^tab$/i) {
903 $OptionsInfo{InDelim} = "\t";
904 }
905
906 $OptionsInfo{OutDelim} = "\,";
907 if ($Options{outdelim} =~ /^semicolon$/i) {
908 $OptionsInfo{OutDelim} = "\;";
909 }
910 elsif ($Options{outdelim} =~ /^tab$/i) {
911 $OptionsInfo{OutDelim} = "\t";
912 }
913
914 $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;
915
916 $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;
917
918 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
919 $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;
920
921 $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};
922
923 $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
924 $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;
925
926 $OptionsInfo{Violations} = $Options{violations};
927 $OptionsInfo{Seed} = $Options{seed};
928
929
930 if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
931 if ($Options{datafields} || $Options{datafieldsfile}) {
932 if ($Options{datafields} && $Options{datafieldsfile}) {
933 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
934 }
935 }
936 else {
937 die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
938 }
939 }
940 $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
941 $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;
942
943 $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;
944
945 %{$OptionsInfo{RecordNums}} = ();
946 $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;
947
948 $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;
949
950 if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
951 if ($Options{record}) {
952 my($Record, @RecordSplit);
953
954 $Record = $Options{record};
955 $Record =~ s/ //g;
956
957 @RecordSplit = split ",", $Record;
958
959 if ($Options{mode} =~ /^recordnum$/i ) {
960 if (@RecordSplit == 1) {
961 $OptionsInfo{RecordNum} = $RecordSplit[0];
962 if ($OptionsInfo{RecordNum} <= 0) {
963 die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--records\" is not valid. Allowed values: > 0 \n";
964 }
965 }
966 else {
967 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
968 }
969 }
970 elsif ($Options{mode} =~ /^recordnums$/i ) {
971 my($RecordNum, $RecordCount, @SortedRecordSplit);
972
973 @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;
974
975 $RecordCount = 0;
976 RECORDNUM: for $RecordNum (@SortedRecordSplit) {
977 if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
978 next RECORDNUM;
979 }
980 $RecordCount++;
981 $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
982 }
983 $OptionsInfo{RecordNumsCount} = $RecordCount;
984 $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
985 $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
986 }
987 else {
988 if (@RecordSplit == 2) {
989 $OptionsInfo{StartRecordNum} = $RecordSplit[0];
990 $OptionsInfo{EndRecordNum} = $RecordSplit[1];
991 if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
992 die "Error: The value pair specified, $Options{record}, for option \"--records\" is not valid. Allowed values: > 0 \n";
993 }
994 }
995 else {
996 die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
997 }
998 if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
999 die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
1000 }
1001 }
1002 }
1003 else {
1004 die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
1005 }
1006 }
1007
1008 @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1009
1010 my(@Words, $Line, $Value);
1011 if ($Options{mode} =~ /^datafields$/i) {
1012 @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1013 if ($Options{datafields}) {
1014 @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
1015 }
1016 elsif ($Options{datafieldsfile}) {
1017 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1018 while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1019 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1020 if (@Words) {
1021 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, @Words;
1022 }
1023 }
1024 close DATAFIELDSFILE;
1025 }
1026 }
1027 elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
1028 my(@DataFieldsByValueTriplets);
1029 @DataFieldsByValueTriplets = ();
1030 if ($Options{datafields}) {
1031 @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
1032 }
1033 elsif ($Options{datafieldsfile}) {
1034 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1035 while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1036 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1037 if (@Words) {
1038 push @DataFieldsByValueTriplets, @Words;
1039 }
1040 }
1041 close DATAFIELDSFILE;
1042 }
1043 if ((@DataFieldsByValueTriplets % 3)) {
1044 if ($Options{datafields}) {
1045 die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
1046 }
1047 elsif ($Options{datafieldsfile}) {
1048 die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
1049 }
1050 }
1051 my($Index, $Label, $Value, $Criterion);
1052
1053 @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1054 %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
1055 %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();
1056
1057 for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
1058 $Label = $DataFieldsByValueTriplets[$Index];
1059 $Value = $DataFieldsByValueTriplets[$Index + 1];
1060 $Criterion = $DataFieldsByValueTriplets[$Index + 2];
1061
1062 if ($Criterion =~ /^(eq|le|ge)$/i) {
1063 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1064 $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
1065 $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
1066 }
1067 else {
1068 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
1069 }
1070 }
1071 }
1072 elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
1073 my(@DataFieldsByRegexTriplets);
1074
1075 @DataFieldsByRegexTriplets = ();
1076 if ($Options{datafields}) {
1077 @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
1078 }
1079 elsif ($Options{datafieldsfile}) {
1080 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1081 while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1082 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1083 if (@Words) {
1084 push @DataFieldsByRegexTriplets, @Words;
1085 }
1086 }
1087 close DATAFIELDSFILE;
1088 }
1089 if ((@DataFieldsByRegexTriplets % 3)) {
1090 if ($Options{datafields}) {
1091 die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
1092 }
1093 elsif ($Options{datafieldsfile}) {
1094 die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
1095 }
1096 }
1097
1098 my($Index, $Label, $Value, $Criterion);
1099
1100 @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
1101 %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
1102 %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();
1103
1104 for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
1105 $Label = $DataFieldsByRegexTriplets[$Index];
1106 $Value = $DataFieldsByRegexTriplets[$Index + 1];
1107 $Criterion = $DataFieldsByRegexTriplets[$Index + 2];
1108
1109 if ($Criterion =~ /^(eq|ne)$/i) {
1110 push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
1111 $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
1112 $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
1113 }
1114 else {
1115 warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
1116 }
1117 }
1118 }
1119 elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
1120 my($Index, @DataFieldAndValuesList);
1121 if ($Options{datafields}) {
1122 @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
1123 }
1124 elsif ($Options{datafieldsfile}) {
1125 open DATAFIELDSFILE, "$Options{datafieldsfile}" or die "Error: Couldn't open $Options{datafieldsfile}: $! \n";
1126 while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
1127 @Words = quotewords($OptionsInfo{InDelim}, 0, $Line);
1128 if (@Words) {
1129 push @DataFieldAndValuesList, @Words;
1130 }
1131 }
1132 close DATAFIELDSFILE;
1133 }
1134 if (@DataFieldAndValuesList < 2) {
1135 if ($Options{datafields}) {
1136 die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
1137 }
1138 elsif ($Options{datafieldsfile}) {
1139 die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
1140 }
1141 }
1142
1143 $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
1144 $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
1145 %{$OptionsInfo{SpecifiedDataFieldValues}} = ();
1146
1147 for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
1148 $Value = $DataFieldAndValuesList[$Index];
1149 $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
1150 }
1151 }
1152
1153 $OptionsInfo{SDFileExt} = "sdf";
1154 $OptionsInfo{TextFileExt} = "csv";
1155
1156 if ($Options{outdelim} =~ /^tab$/i) {
1157 $OptionsInfo{TextFileExt} = "tsv";
1158 }
1159
1160 if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
1161 $OptionsInfo{OutputSDFile} = 0;
1162 $OptionsInfo{OutputTextFile} = 1;
1163 }
1164 else {
1165 $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
1166 $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
1167 }
1168
1169 $OptionsInfo{StrDataString} = $Options{strdatastring};
1170 $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;
1171
1172 $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};
1173
1174 if (IsEmpty($Options{strdatastringdelimiter})) {
1175 die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
1176 }
1177 $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
1178 $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;
1179
1180 MODE: {
1181 if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
1182 if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
1183 if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
1184 if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
1185 if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
1186 if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
1187 if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
1188 if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
1189 if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
1190 if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
1191 if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
1192 if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
1193 if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
1194 if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
1195 if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
1196 die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldbylist, datafielduniquebylist, , datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
1197 }
1198
1199 }
1200
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-level %Options hash to its default values, overlays the values
# specified on the command line using Getopt::Long, changes to the working
# directory when "-w --workingdir" is specified, and validates the option values
# which have a fixed set or range of allowed values.
#
# Takes no arguments and returns nothing useful; dies with an explanatory message
# on the first invalid option or option value. Non-option arguments (SD file
# names) are left in @ARGV by GetOptions.
#
sub SetupScriptUsage {

  # Default values for all options; GetOptions only overwrites the keys which
  # are explicitly specified on the command line...
  %Options = (
    numofcmpds => 1,
    mode => "alldatafields",
    indelim => "comma",
    outdelim => "comma",
    output => "SD",
    quote => "yes",
    regexignorecase => "yes",
    valuecomparisonmode => "numeric",
    violations => 0,
    seed => 123456789,
    strdatastring => "no",
    strdatastringdelimiter => "|",
    strdatastringmode => "StrOnly",
  );

  # Retrieve all the options...
  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any other validation is performed...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate numerical and enumerated option values...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  # The list of allowed values in the message must stay in sync with the regex:
  # datafieldsbyregex is a valid mode and is reported as such.
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
1264
1265 __END__
1266
1267 =head1 NAME
1268
1269 ExtractFromSDFiles.pl - Extract specific data from SDFile(s)
1270
1271 =head1 SYNOPSIS
1272
1273 ExtractFromSDFiles.pl SDFile(s)...
1274
1275 ExtractFromSDFiles.pl [B<-h, --help>]
1276 [B<-d, --datafields> "fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value..."]
1277 [B<--datafieldsfile> filename] [B<--indelim> comma | tab | semicolon] [B<-m, --mode> alldatafields |
1278 commondatafields | datafieldnotbylist | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist |
1279 datafielduniquebylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords |
1280 3dcmpdrecords ] [B<-n, --numofcmpds> number] [B<--outdelim> comma | tab | semicolon]
1281 [B<--output> SD | text | both] [B<-o, --overwrite>] [B<-q, --quote> yes | no]
1282 [B<--record> recnum | recnums | startrecnum,endrecnum] [B<--RegexIgnoreCase> yes | no]
1283 [B<-r, --root> rootname] [B<-s, --seed> number] [B<--StrDataString> yes | no]
1284 [B<--StrDataStringDelimiter> text] [B<--StrDataStringMode> StrOnly | StrAndDataFields]
1285 [B<--ValueComparisonMode> I<Numeric | Alphanumeric>]
1286 [B<-v, --violations> number] [B<-w, --workingdir> dirname] SDFile(s)...
1287
1288 =head1 DESCRIPTION
1289
1290 Extract specific data from I<SDFile(s)> and generate appropriate SD or CSV/TSV text
1291 file(s). The structure data from SDFile(s) is not transferred to CSV/TSV text file(s).
1292 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
1293 and I<.sd>. All other file names are ignored. All the SD files in a current directory
1294 can be specified either by I<*.sdf> or the current directory name.
1295
1296 =head1 OPTIONS
1297
1298 =over 4
1299
1300 =item B<-h, --help>
1301
1302 Print this help message.
1303
1304 =item B<-d, --datafields> I<"fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value,...">
1305
1306 This value is mode specific. In general, it's a list of comma separated data field labels
1307 and associated mode specific values.
1308
1309 For I<datafields> mode, input value format is: I<fieldlabel,...>. Examples:
1310
1311 Extreg
1312 Extreg,CompoundName,ID
1313
1314 For I<datafieldsbyvalue> mode, input value format contains these triplets:
1315 I<fieldlabel,value, criteria...>. Possible values for criteria: I<le, ge or eq>.
1316 The value of B<--ValueComparisonMode> indicates whether values are
1317 compared using numerical or string comparison operators. Default is to consider
1318 data field values as numerical values and use numerical comparison operators.
1319 Examples:
1320
1321 MolWt,450,le
1322 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le
1323
1324 For I<datafieldsbyregex> mode, input value format contains these triplets:
1325 I<fieldlabel,regex, criteria...>. I<regex> corresponds to any valid regular expression
1326 and is used to match the values for specified I<fieldlabel>. Possible values for criteria:
1327 I<eq or ne>. For I<eq> and I<ne> criteria, the data field value is matched with the
1328 regular expression using =~ and !~ respectively. B<--RegexIgnoreCase> option
1329 value is used to determine whether to ignore letter upper/lower case during
1330 regular expression match. Examples:
1331
1332 Name,ol,eq
1333 Name,'^pat',ne
1334
1335 For I<datafieldbylist> and I<datafielduniquebylist> mode, input value format is:
1336 I<fieldlabel,value1,value2...>. This is equivalent to I<datafieldsbyvalue> mode with
1337 this input value format: I<fieldlabel,value1,eq,fieldlabel,value2,eq,...>. For
1338 I<datafielduniquebylist> mode, only unique compounds identified by first occurrence
1339 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds
1340 are simply ignored.
1341
1342 For I<datafieldnotbylist> mode, input value format is: I<fieldlabel,value1,value2...>. In this
1343 mode, the script behaves exactly opposite of I<datafieldbylist> mode, and only those compounds
1344 are extracted whose data field values don't match any specified data field value.
1345
1346 =item B<--datafieldsfile> I<filename>
1347
1348 Filename which contains various mode specific values. This option provides a way
1349 to specify mode specific values in a file instead of entering them on the command
1350 line using B<-d --datafields>.
1351
1352 For I<datafields> mode, input file lines contain comma delimited field labels:
1353 I<fieldlabel,...>. Example:
1354
1355 Line 1:MolId
1356 Line 2:"Extreg",CompoundName,ID
1357
1358 For I<datafieldsbyvalue> mode, input file lines contains these comma separated triplets:
1359 I<fieldlabel,value, criteria>. Possible values for criteria: I<le, ge or eq>. Examples:
1360
1361 Line 1:MolWt,450,le
1362
1363 Line 1:"MolWt",450,le,"LogP",5,le,"SumNumNO",10,le,"SumNHOH",5,le
1364
1365 Line 1:MolWt,450,le
1366 Line 2:"LogP",5,le
1367 Line 3:"SumNumNO",10,le
1368 Line 4: SumNHOH,5,le
1369
1370 For I<datafieldbylist> and I<datafielduniquebylist> mode, input file line format is:
1371
1372 Line 1:fieldlabel;
1373 Subsequent lines:value1,value2...
1374
1375 For I<datafieldbylist>, I<datafielduniquebylist>, and I<datafieldnotbylist> mode, input file
1376 line format is:
1377
1378 Line 1:fieldlabel;
1379 Subsequent lines:value1,value2...
1380
1381 For I<datafielduniquebylist> mode, only unique compounds identified by first occurrence
1382 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds
1383 are simply ignored. Example:
1384
1385 Line 1: MolID
1386 Subsequent Lines:
1387 907508
1388 832291,4642
1389 "1254","907303"
1390
1391 =item B<--indelim> I<comma | tab | semicolon>
1392
1393 Delimiter used to specify text values for B<-d --datafields> and B<--datafieldsfile> options.
1394 Possible values: I<comma, tab, or semicolon>. Default value: I<comma>.
1395
1396 =item B<-m, --mode> I<alldatafields | commondatafields | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | datafielduniquebylist | datafieldnotbylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | 3dcmpdrecords>
1397
1398 Specify what to extract from I<SDFile(s)>. Possible values: I<alldatafields, commondatafields,
1399 datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist,
1400 molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords>.
1401 Default value: I<alldatafields>.
1402
1403 For I<alldatafields> and I<molnames> mode, only a CSV/TSV text file is generated; for all
1404 other modes, however, a SD file is generated by default - you can change the behavior to generate
1405 text file using I<--output> option.
1406
1407 For I<3DCmpdRecords> mode, only those compounds with at least one non-zero value for Z atomic coordinates
1408 are retrieved; however, during retrieval of compounds in I<2DCmpdRecords> mode, all Z atomic coordinates must
1409 be zero.
1410
1411 =item B<-n, --numofcmpds> I<number>
1412
1413 Number of compounds to extract during I<randomcmpds> mode.
1414
1415 =item B<--outdelim> I<comma | tab | semicolon>
1416
1417 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
1418 Default value: I<comma>.
1419
1420 =item B<--output> I<SD | text | both>
1421
1422 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<SD>. For
1423 I<alldatafields> and I<molnames> mode, this option is ignored and only a CSV/TSV text file is generated.
1424
1425 =item B<-o, --overwrite>
1426
1427 Overwrite existing files.
1428
1429 =item B<-q, --quote> I<yes | no>
1430
1431 Put quote around column values in output CSV/TSV text file(s). Possible values:
1432 I<yes or no>. Default value: I<yes>.
1433
1434 =item B<--record> I<recnum | recnums | startrecnum,endrecnum>
1435
1436 Record number, record numbers or range of records to extract during I<recordnum>, I<recordnums>
1437 and I<recordrange> mode. Input value format is: <num>, <num1,num2,...> and <startnum, endnum>
1438 for I<recordnum>, I<recordnums> and I<recordrange> modes respectively. Default value: none.
1439
1440 =item B<--RegexIgnoreCase> I<yes or no>
1441
1442 Specify whether to ignore case during I<datafieldsbyregex> mode of B<-m, --mode> option.
1443 Possible values: I<yes or no>. Default value: I<yes>.
1444
1445 =item B<-r, --root> I<rootname>
1446
1447 New file name is generated using the root: <Root>.<Ext>. Default for new file
1448 names: <SDFileName><mode>.<Ext>. The file type determines <Ext> value.
1449 The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab
1450 delimited text files respectively. This option is ignored for multiple input files.
1451
1452 =item B<-s, --seed> I<number>
1453
1454 Random number seed used for I<randomcmpds> mode. Default:123456789.
1455
1456 =item B<--StrDataString> I<yes | no>
1457
1458 Specify whether to write out structure data string to CSV/TSV text file(s). Possible values:
1459 I<yes or no>. Default value: I<no>.
1460
1461 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure
1462 data lines into a structure data string.
1463
1464 This option is ignored during generation of SD file(s).
1465
1466 =item B<--StrDataStringDelimiter> I<text>
1467
1468 Delimiter for joining multiple structure data lines into a string before writing to CSV/TSV text
1469 file(s). Possible values: I<any alphanumeric text>. Default value: I<|>.
1470
1471 This option is ignored during generation of SD file(s).
1472
1473 =item B<--StrDataStringMode> I<StrOnly | StrAndDataFields>
1474
1475 Specify whether to include SD data fields and values along with the structure data into structure
1476 data string before writing it out to CSV/TSV text file(s). Possible values: I<StrOnly or StrAndDataFields>.
1477 Default value: I<StrOnly>.
1478
1479 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure
1480 data lines into a structure data string.
1481
1482 This option is ignored during generation of SD file(s).
1483
1484 =item B<--ValueComparisonMode> I<Numeric | Alphanumeric>
1485
1486 Specify how to compare data field values during I<datafieldsbyvalue> mode: Compare
1487 values using either numeric or string (eq, le, ge) comparison operators. Possible values:
1488 I<Numeric or Alphanumeric>. Default value: I<Numeric>.
1489
1490 =item B<-v, --violations> I<number>
1491
1492 Number of criterion violations allowed for values specified during I<datafieldsbyvalue>
1493 and I<datafieldsbyregex> mode. Default value: I<0>.
1494
1495 =item B<-w, --workingdir> I<dirname>
1496
1497 Location of working directory. Default: current directory.
1498
1499 =back
1500
1501 =head1 EXAMPLES
1502
1503 To retrieve all data fields from SD files and generate CSV text files, type:
1504
1505 % ExtractFromSDFiles.pl -o Sample.sdf
1506 % ExtractFromSDFiles.pl -o *.sdf
1507
1508 To retrieve all data fields from SD file and generate CSV text files containing
1509 a column with structure data as a string with | as line delimiter, type:
1510
1511 % ExtractFromSDFiles.pl --StrDataString Yes -o Sample.sdf
1512
1513 To retrieve Mol_ID data field from SD file and generate CSV text files containing
1514 a column with structure data along with all data fields as a string with | as line
1515 delimiter, type:
1516
1517 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID" --StrDataString Yes
1518 --StrDataStringMode StrAndDataFields --StrDataStringDelimiter "|"
1519 --output text -o Sample.sdf
1520
1521 To retrieve common data fields which exists for all the compounds in
1522 a SD file and generate a TSV text file NewSample.tsv, type:
1523
1524 % ExtractFromSDFiles.pl -m commondatafields --outdelim tab -r NewSample
1525 --output Text -o Sample.sdf
1526
1527 To retrieve Mol_ID, MolWeight, and CompoundName data fields from a SD file and generate a
1528 CSV text file NewSample.csv, type:
1529
1530 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID,MolWeight,
1531 CompoundName" -r NewSample --output Text -o Sample.sdf
1532
1533 To retrieve compounds which meet a specific set of criteria - MolWt <= 450,
1534 LogP <= 5 and SumNO <= 10 - from a SD file and generate a new SD file NewSample.sdf,
1535 type:
1536
1537 % ExtractFromSDFiles.pl -m datafieldsbyvalue -d "MolWt,450,le,LogP
1538 ,5,le,SumNO,10,le" -r NewSample -o Sample.sdf
1539
1540 To retrieve compounds from a SD file with a specific set of values for MolID and
1541 generate a new SD file NewSample.sdf, type:
1542
1543 % ExtractFromSDFiles.pl -m datafieldbylist -d "Mol_ID,159,4509,4619"
1544 -r NewSample -o Sample.sdf
1545
1546 To retrieve compounds from a SD file with values for MolID not on a list of specified
1547 values and generate a new SD file NewSample.sdf, type:
1548
1549 % ExtractFromSDFiles.pl -m datafieldnotbylist -d "Mol_ID,159,4509,4619"
1550 -r NewSample -o Sample.sdf
1551
1552 To retrieve 10 random compounds from a SD file and generate a new SD file RandomSample.sdf, type:
1553
1554 % ExtractFromSDFiles.pl -m randomcmpds -n 10 -r RandomSample
1555 -o Sample.sdf
1556
1557 To retrieve compound record number 10 from a SD file and generate a new SD file NewSample.sdf, type:
1558
1559 % ExtractFromSDFiles.pl -m recordnum --record 10 -r NewSample
1560 -o Sample.sdf
1561
1562 To retrieve compound record numbers 10, 20 and 30 from a SD file and generate a new SD file
1563 NewSample.sdf, type:
1564
1565 % ExtractFromSDFiles.pl -m recordnums --record 10,20,30 -r NewSample
1566 -o Sample.sdf
1567
1568 To retrieve compound records from 10 to 20 from a SD file and generate a new SD
1569 file NewSample.sdf, type:
1570
1571 % ExtractFromSDFiles.pl -m recordrange --record 10,20 -r NewSample
1572 -o Sample.sdf
1573
1574 =head1 AUTHOR
1575
1576 Manish Sud <msud@san.rr.com>
1577
1578 =head1 SEE ALSO
1579
1580 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
1581
1582 =head1 COPYRIGHT
1583
1584 Copyright (C) 2015 Manish Sud. All rights reserved.
1585
1586 This file is part of MayaChemTools.
1587
1588 MayaChemTools is free software; you can redistribute it and/or modify it under
1589 the terms of the GNU Lesser General Public License as published by the Free
1590 Software Foundation; either version 3 of the License, or (at your option)
1591 any later version.
1592
1593 =cut