0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: ExtractFromSDFiles.pl,v $
|
|
4 # $Date: 2015/03/22 19:11:27 $
|
|
5 # $Revision: 1.48 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use SDFileUtil;
|
|
36 use FileUtil;
|
|
37 use TextUtil;
|
|
38
|
|
# File-scoped state; the subroutines defined further down read these directly.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Turn off output buffering for the currently selected handle (STDOUT here)...
$| = 1;

# Announce the run and start the clock...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; print the usage text and stop when help was
# requested or no input file names were given...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Turn the raw options into %OptionsInfo...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Validate the input files and gather per-file information in %SDFilesInfo...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process each input file that passed the checks above...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);
for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  ExtractFromSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
84
|
|
85 ###############################################################################
|
|
86
|
|
87 # Extract data from a SD file...
|
|
# Extract data from one input SD file using the extraction mode selected
# through the "-m --mode" option.
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList; it is handed to
#                OpenInputAndOutputFiles and to the mode specific routine.
#
# The mode name is looked up case-insensitively in the table below. An
# unknown mode is reported with die before any file is opened, so an invalid
# mode value does not leave newly created or truncated output files behind.
sub ExtractFromSDFile {
  my($FileIndex) = @_;

  my(%ModeToExtractor) = (
    'alldatafields'         => \&ExtractAllDataFields,
    'commondatafields'      => \&ExtractCommonDataFields,
    'datafields'            => \&ExtractDataFields,
    'datafieldbylist'       => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldnotbylist'    => \&ExtractDataFieldNotByList,
    'datafieldsbyvalue'     => \&ExtractDataFieldsByValue,
    'datafieldsbyregex'     => \&ExtractDataFieldsByRegex,
    'randomcmpds'           => \&ExtractRandomCompounds,
    'molnames'              => \&ExtractMolNames,
    'recordnum'             => \&ExtractRecordNum,
    'recordnums'            => \&ExtractRecordNums,
    'recordrange'           => \&ExtractRecordRange,
    '2dcmpdrecords'         => \&Extract2DCmpdRecords,
    '3dcmpdrecords'         => \&Extract3DCmpdRecords,
  );

  my($Extractor) = $ModeToExtractor{lc $OptionsInfo{Mode}};
  if (!defined $Extractor) {
    # The list of allowed values includes datafieldsbyregex, which is a
    # supported mode...
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $Extractor->($FileIndex);
  CloseInputAndOutputFiles();
}
|
|
155
|
|
156 # Extract all data fields...
|
|
# Write out, for every compound in the input file, the values of all data
# field labels collected for that file (AllDataFieldLabels).
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractAllDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($InputSDFileRef) = $SDFilesInfo{InputSDFileRef};

  # The loop ends when ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
|
|
173
|
|
174 # Extract common data fields...
|
|
# Write out, for every compound in the input file, the values of the data
# field labels recorded as common for that file (CommonDataFieldLabels).
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractCommonDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($InputSDFileRef) = $SDFilesInfo{InputSDFileRef};

  # The loop ends when ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
|
|
191
|
|
192 # Extract specified data fields...
|
|
# Write out, for every compound in the input file, the values of the data
# field labels named on the command line (SpecifiedDataFieldLabels).
#
# Parameters:
#   $FileIndex - accepted for symmetry with the other extraction routines;
#                it is not used here.
sub ExtractDataFields {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  my($InputSDFileRef) = $SDFilesInfo{InputSDFileRef};

  # The loop ends when ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
|
|
209
|
|
210 # Extract data fields using a list...
|
|
# Extract compounds whose value for the specified data field appears in the
# specified list of values. Serves both the DataFieldByList and the
# DataFieldUniqueByList modes:
#
#   . DataFieldUniqueByList writes a compound only the first time its value
#     is seen.
#   . DataFieldByList writes every matching compound.
#
# In both modes reading stops as soon as each listed value has been seen at
# least once.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractDataFieldByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Reset the per-value status, which is shared across input files...
  for (keys %{$OptionsInfo{SpecifiedDataFieldValues}}) {
    $OptionsInfo{SpecifiedDataFieldValues}{$_} = "NotFound";
  }

  my($FieldLabel) = $OptionsInfo{SpecifiedDataFieldLabel};
  my($UniqueMode) = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;
  my($ListMode) = ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) ? 1 : 0;
  my($FoundCount) = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    next CMPDSTRING unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

    SetupDataValues();

    my($FieldValue) = $SDFilesInfo{DataFieldValues}{$FieldLabel};
    next CMPDSTRING unless exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue};

    if ($FoundCount < $OptionsInfo{SpecifiedDataFieldValuesCount}) {
      my($FirstHit) = ($OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} eq "NotFound") ? 1 : 0;
      if ($FirstHit) {
        $FoundCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue} = "Found";
      }
      if (($FirstHit && $UniqueMode) || $ListMode) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }

    # Every listed value has been seen; nothing more to look for...
    last CMPDSTRING if ($FoundCount >= $OptionsInfo{SpecifiedDataFieldValuesCount});
  }
}
|
|
260
|
|
261 # Extract data field whose values are not on the specified list...
|
|
# Extract compounds that carry the specified data field with a non-empty
# value that is not on the specified list of values.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($FieldLabel) = $OptionsInfo{SpecifiedDataFieldLabel};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    # Compounds without the data field are skipped...
    next CMPDSTRING unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

    SetupDataValues();

    my($FieldValue) = $SDFilesInfo{DataFieldValues}{$FieldLabel};

    # Write only when the value is not empty and is not on the specified list...
    if (!IsEmpty($FieldValue) && !exists $OptionsInfo{SpecifiedDataFieldValues}{$FieldValue}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
|
|
292
|
|
293 # Extract data fields by value...
|
|
# Extract compounds by comparing data field values against specified values.
#
# For each specified label present in a compound, the value is compared with
# the specified value using the label's criterion (eq, le or ge), numerically
# or as strings depending on NumericalComparison. A failed comparison counts
# as a violation; the compound is written when the number of violations does
# not exceed the Violations option. Labels missing from a compound are not
# counted as violations.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();

    my($FailedCount) = 0;
    LABEL: for my $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      my($Actual) = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      my($Criterion) = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$FieldLabel};
      my($Wanted) = $OptionsInfo{SpecifiedDataFieldValuesMap}{$FieldLabel};
      my($Numerical) = $OptionsInfo{NumericalComparison};

      # Each criterion names the relation that must hold; the test below is
      # its negation...
      my($Failed) = 0;
      if ($Criterion =~ /^eq$/i) {
        $Failed = $Numerical ? ($Actual != $Wanted) : ($Actual ne $Wanted);
      }
      elsif ($Criterion =~ /^le$/i) {
        $Failed = $Numerical ? ($Actual > $Wanted) : ($Actual gt $Wanted);
      }
      elsif ($Criterion =~ /^ge$/i) {
        $Failed = $Numerical ? ($Actual < $Wanted) : ($Actual lt $Wanted);
      }
      $FailedCount++ if ($Failed);
    }

    if ($FailedCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
|
|
338
|
|
339 # Extract data fields by value using regular expression match...
|
|
# Extract compounds by matching data field values against specified regular
# expressions.
#
# For each specified label present in a compound, criterion "eq" requires the
# value to match the label's regular expression and "ne" requires it not to
# match; matching ignores case when RegexIgnoreCase is set. A failed
# requirement counts as a violation; the compound is written when the number
# of violations does not exceed the Violations option.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();

    my($FailedCount) = 0;
    LABEL: for my $FieldLabel (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless exists $SDFilesInfo{DataFieldValues}{$FieldLabel};

      my($Actual) = $SDFilesInfo{DataFieldValues}{$FieldLabel};
      my($Criterion) = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$FieldLabel};
      my($Pattern) = $OptionsInfo{SpecifiedDataFieldRegexMap}{$FieldLabel};
      my($IgnoreCase) = $OptionsInfo{RegexIgnoreCase};

      # The value is matched only after its criterion has been recognized,
      # in that order...
      if ($Criterion =~ /^eq$/i) {
        my($Matched) = $IgnoreCase ? ($Actual =~ /$Pattern/i) : ($Actual =~ /$Pattern/);
        $FailedCount++ if (!$Matched);
      }
      elsif ($Criterion =~ /^ne$/i) {
        my($Matched) = $IgnoreCase ? ($Actual =~ /$Pattern/i) : ($Actual =~ /$Pattern/);
        $FailedCount++ if ($Matched);
      }
    }

    if ($FailedCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
|
|
382
|
|
383 # Extract random compounds...
|
|
# Extract a random selection of compounds from the input file.
#
# Exactly min(NumOfCmpds, compound count of the file) distinct record numbers
# are drawn after seeding the generator with the Seed option; the file is then
# read once and the selected records are written in file order. Drawing
# continues until enough distinct record numbers have been collected, so
# repeated draws of the same number do not reduce the size of the selection.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractRandomCompounds {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($CmpdCount) = $SDFilesInfo{CmpdCount}[$FileIndex];

  # A file can't supply more distinct records than it contains...
  my($WantedCount) = $OptionsInfo{NumOfCmpds};
  if ($WantedCount > $CmpdCount) {
    $WantedCount = $CmpdCount;
  }

  srand($OptionsInfo{Seed});

  # Record numbers are 1-based. The loop is skipped altogether when
  # $WantedCount is zero, which also covers an empty file...
  my(%SelectedRecordNums) = ();
  while (scalar(keys %SelectedRecordNums) < $WantedCount) {
    my($RecordNum) = int(rand $CmpdCount) + 1;
    $SelectedRecordNums{$RecordNum} = $RecordNum;
  }

  my($CmpdNum) = 0;
  my($WrittenCount) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    if (!exists $SelectedRecordNums{$CmpdNum}) {
      next CMPDSTRING;
    }

    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # No need to read the rest of the file once the whole selection is out...
    $WrittenCount++;
    if ($WrittenCount >= $WantedCount) {
      last CMPDSTRING;
    }
  }
}
|
|
420
|
|
421 # Extract mol names...
|
|
# Write the molecule name of every compound in the input file to the text
# file, one name per line, under a "MolName" column label.
#
# The name is taken from the first line of each compound record through
# ParseCmpdMolNameLine and quoted according to the OutQuote option.
#
# Parameters:
#   $FileIndex - accepted for symmetry with the other extraction routines;
#                it is not used here.
sub ExtractMolNames {
  my($FileIndex) = @_;

  # Assign instead of appending: DataLabels lives in %SDFilesInfo across
  # input files, and appending would add one more "MolName" column label
  # for every file processed...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  my($NewTextFileRef) = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@CmpdLines) = split "\n", $SDFilesInfo{CmpdString};
    my($MolName) = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}
|
|
436
|
|
437 # Extract a specific compound record...
|
|
# Extract the single compound record whose 1-based position in the input
# file equals the RecordNum option; reading stops right after it.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractRecordNum {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($RecordNum) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;
    next CMPDSTRING if ($RecordNum != $OptionsInfo{RecordNum});

    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    WriteSDFileCmpdString();

    # Data field values are needed for the text file only...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
    last CMPDSTRING;
  }
}
|
|
464
|
|
465 # Extract specific compound records...
|
|
# Extract the compound records whose 1-based positions are keys of the
# RecordNums option hash.
#
# Reading stops at the first non-selected record that lies beyond
# RecordNumsMax or that is reached after RecordNumsCount records have been
# written.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractRecordNums {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($RecordNum, $WrittenCount) = (0, 0);

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    if (!exists $OptionsInfo{RecordNums}{$RecordNum}) {
      # Not a requested record: stop when nothing further can be requested...
      last CMPDSTRING if ($RecordNum > $OptionsInfo{RecordNumsMax} || $WrittenCount >= $OptionsInfo{RecordNumsCount});
      next CMPDSTRING;
    }

    $WrittenCount++;
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
|
|
496
|
|
497
|
|
498 # Extract compounds in a specific record range...
|
|
# Extract the compound records whose 1-based positions fall between the
# StartRecordNum and EndRecordNum options, both inclusive; reading stops at
# the first record past EndRecordNum.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub ExtractRecordRange {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  my($RecordNum) = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $RecordNum++;

    # Past the end of the range: done. Before its start: keep reading...
    last CMPDSTRING if ($RecordNum > $OptionsInfo{EndRecordNum});
    next CMPDSTRING if ($RecordNum < $OptionsInfo{StartRecordNum});

    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
|
|
526
|
|
527 # Extract 2D compound records...
|
|
# Extract the compound records for which IsCmpd2D returns a true value.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    next CMPDSTRING unless IsCmpd2D(\@Lines);

    WriteSDFileCmpdString();

    # Data field values are needed for the text file only...
    next CMPDSTRING unless $OptionsInfo{OutputTextFile};

    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
    SetupDataValues();
    WriteTextFileCmpdData();
  }
}
|
|
551
|
|
552 # Extract 3D compound records...
|
|
# Extract the compound records for which IsCmpd3D returns a true value.
#
# Parameters:
#   $FileIndex - index of the input file in the per-file arrays of %SDFilesInfo.
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split "\n", $SDFilesInfo{CmpdString};
    next CMPDSTRING unless IsCmpd3D(\@Lines);

    WriteSDFileCmpdString();

    # Data field values are needed for the text file only...
    next CMPDSTRING unless $OptionsInfo{OutputTextFile};

    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
    SetupDataValues();
    WriteTextFileCmpdData();
  }
}
|
|
576
|
|
577
|
|
578 # Open input and output files...
|
|
# Open the input SD file and the requested output file(s) for one input file.
#
# The handles are stored in %SDFilesInfo under InputSDFileRef, NewSDFileRef
# and NewTextFileRef; an output entry stays undef when that kind of output is
# not requested. Dies when a file can't be opened.
#
# Files are opened with the three-argument form of open, so characters in a
# file name are never interpreted as an open mode.
#
# Parameters:
#   $FileIndex - index of the input file in @SDFilesList and in the per-file
#                arrays of %SDFilesInfo.
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;
  my($SDFileName, $NewSDFileName, $NewTextFileName);

  $SDFileName = $SDFilesList[$FileIndex];
  $NewSDFileName = $SDFilesInfo{NewSDFileName}[$FileIndex];
  $NewTextFileName = $SDFilesInfo{NewTextFileName}[$FileIndex];

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open my $NewSDFileRef, ">", $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileRef;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open my $NewTextFileRef, ">", $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileRef;
  }

  open my $InputSDFileRef, "<", $SDFileName or die "Error: Couldn't open $SDFileName: $! \n";
  $SDFilesInfo{InputSDFileRef} = $InputSDFileRef;
}
|
|
608
|
|
609 # Close open input and output files...
|
|
# Close whichever of the input and output file handles stored in %SDFilesInfo
# are open, and reset the three handle entries to undef.
#
# A failed close on an output handle is reported with a warning: buffered
# write errors can surface only at close time and would otherwise go
# unnoticed. Processing continues after the warning.
sub CloseInputAndOutputFiles {
  if ($SDFilesInfo{NewSDFileRef}) {
    close $SDFilesInfo{NewSDFileRef} or warn "Warning: Couldn't close new SD file: $! \n";
  }
  if ($SDFilesInfo{NewTextFileRef}) {
    close $SDFilesInfo{NewTextFileRef} or warn "Warning: Couldn't close new text file: $! \n";
  }

  if ($SDFilesInfo{InputSDFileRef}) {
    close $SDFilesInfo{InputSDFileRef};
  }

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;
  $SDFilesInfo{InputSDFileRef} = undef;
}
|
|
626
|
|
627 # Write out column labels for text file...
|
|
# Write the column labels line to the text file; does nothing when text
# output is not requested.
#
# The labels come from $SDFilesInfo{DataLabels}; a "StructureDataString"
# label is added at the end when the OutputStrDataString option is set.
sub WriteTextFileColLabels {
  return if (!$OptionsInfo{OutputTextFile});

  my($LabelsRef) = \@{$SDFilesInfo{DataLabels}};
  if ($OptionsInfo{OutputStrDataString}) {
    # Work on a copy so that DataLabels itself is left untouched...
    $LabelsRef = [@{$LabelsRef}, "StructureDataString"];
  }

  my($Line) = JoinWords($LabelsRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print {$SDFilesInfo{NewTextFileRef}} "$Line\n";
}
|
|
651
|
|
652 # Setup values for data fields...
|
|
# Fill $SDFilesInfo{DataValues} with the current compound's value for each
# label in $SDFilesInfo{DataLabels}, in label order; a label the compound
# doesn't have gets an empty string.
sub SetupDataValues {
  my(@Values) = ();

  for my $Label (@{$SDFilesInfo{DataLabels}}) {
    if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
      push @Values, $SDFilesInfo{DataFieldValues}{$Label};
    }
    else {
      push @Values, "";
    }
  }
  @{$SDFilesInfo{DataValues}} = @Values;
}
|
|
656
|
|
657 # Write out structure data and specific data fields to SD file...
|
|
# Write the structure part of the current compound followed by the selected
# data fields to the new SD file; does nothing when SD output is not
# requested.
#
# The structure part is everything in $SDFilesInfo{CmpdString} before the
# molfile terminator line. The terminator is "M  END" (two spaces) in the
# CTfile format; one or more spaces are accepted on input and the standard
# form is written out. Each label in $SDFilesInfo{DataLabels} is written with
# the value at the same position in $SDFilesInfo{DataValues}.
sub WriteSDFileCmpdData {
  my($MolString, $Count, $NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # Keep only the text in front of the terminator line, then put the
  # terminator back...
  ($MolString) = split /^M +END/m, $SDFilesInfo{CmpdString};
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}
|
|
676
|
|
677 # Write out compound string...
|
|
# Write the current compound string, followed by a newline, to the new SD
# file; does nothing when SD output is not requested.
sub WriteSDFileCmpdString {
  return if (!$OptionsInfo{OutputSDFile});

  print {$SDFilesInfo{NewSDFileRef}} "$SDFilesInfo{CmpdString}\n";
}
|
|
688
|
|
689 # Write out data for text file...
|
|
# Write one line with the current compound's data values to the text file;
# does nothing when text output is not requested.
#
# The values in $SDFilesInfo{DataValues} are joined using the OutDelim and
# OutQuote options, and newlines inside values are replaced by semicolons.
# When OutputStrDataString is set, a structure data string is added as the
# last column: the whole compound string when StrDataStringWithFields is set,
# otherwise only the part up to and including the molfile terminator line.
# Its newlines are replaced by the StrDataStringDelimiter option.
#
# The terminator is "M  END" (two spaces) in the CTfile format; one or more
# spaces are accepted on input and the standard form is written out.
sub WriteTextFileCmpdData {
  my($DataValuesLine, $NewTextFileRef);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  $DataValuesLine =~ s/\n/;/g;

  if (!$OptionsInfo{OutputStrDataString}) {
    print $NewTextFileRef "$DataValuesLine\n";
    return;
  }

  # Append structure data string...
  my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);

  if ($OptionsInfo{StrDataStringWithFields}) {
    $StrDataString = $SDFilesInfo{CmpdString};
  }
  else {
    ($StrDataString) = split /^M +END/m, $SDFilesInfo{CmpdString};
    $StrDataString .= "M  END";
  }
  $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
  $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

  $OutDelim = $OptionsInfo{OutDelim};
  $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";

  print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
}
|
|
728
|
|
729 # Retrieve information about input SD files...
|
|
# Check the input SD files and collect the per-file information used by the
# extraction routines.
#
# %SDFilesInfo is reset and then filled, for each index into @SDFilesList,
# with:
#
#   FileOkay              - 1 when the file passed all checks, 0 otherwise
#   CmpdCount             - number of compounds read during the scan; stays 0
#                           when the selected mode needs no scan
#   NewSDFileName         - name of the SD file to generate
#   NewTextFileName       - name of the text file to generate
#   AllDataFieldLabels    - data field labels found in the file, in order of
#                           first appearance; in recordnum mode, the labels
#                           of the requested record only
#   CommonDataFieldLabels - labels present in every compound; collected in
#                           commondatafields mode only
#
# A file is skipped with a warning, and keeps FileOkay set to 0, when it
# doesn't exist, isn't a SD file, would be overwritten by its own output,
# can't be opened, or when an output file already exists and overwriting has
# not been requested.
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileName}} = ();
  @{$SDFilesInfo{NewSDFileName}} = ();

  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{CommonDataFieldLabels}} = ();

  # What has to be found out by scanning a file depends only on the options,
  # so work it out once. Labels are collected only for these modes and only
  # when a text file is to be generated...
  my($CountCmpds, $CollectDataFields, $RecordNumMode, $CommonDataFieldsMode);

  $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;
  $CollectDataFields = ($OptionsInfo{OutputTextFile} && $OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|recordnum|recordnums|recordrange)$/i) ? 1 : 0;
  $RecordNumMode = ($OptionsInfo{Mode} =~ /^recordnum$/i) ? 1 : 0;
  $CommonDataFieldsMode = ($OptionsInfo{Mode} =~ /^commondatafields$/i) ? 1 : 0;

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileName}[$Index] = "";
    $SDFilesInfo{NewSDFileName}[$Index] = "";

    @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
    @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file. The root given by
    # OutFileRoot is used only when there is a single input file...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . $OptionsInfo{FileNameMode};
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $OptionsInfo{OutFileRoot};
      }
    }
    $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
    $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";

    if ($OptionsInfo{OutputSDFile}) {
      if (lc($NewSDFileName) eq lc($SDFile)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{Overwrite}) {
      if ($OptionsInfo{OutputSDFile}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{OutputTextFile}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Three-argument open: the file name is never interpreted as a mode...
    my($SDFileRef);
    if (!open $SDFileRef, "<", $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CmpdString, $Label, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap, @CommonDataFieldLabels);

    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    %DataFieldLabelsMap = ();

    $CmpdCount = 0;
    if ($CountCmpds || $CollectDataFields) {
      CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileRef)) {
        $CmpdCount++;
        @CmpdLines = split "\n", $CmpdString;

        # In recordnum mode only the labels of the requested record matter...
        if ($RecordNumMode && $CmpdCount == $OptionsInfo{RecordNum}) {
          @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
          last CMPDSTRING;
        }

        if (!$CollectDataFields) {
          next CMPDSTRING;
        }

        # Process compound data header labels and figure out which ones are
        # present for all the compounds...
        my(@CmpdDataFieldLabels) = GetCmpdDataHeaderLabels(\@CmpdLines);

        if ($CmpdCount == 1) {
          # The first compound provides the initial label set. It's picked
          # out by its position and not by the label list being empty, so a
          # first compound without data fields doesn't make the labels of a
          # later compound look as if they were present in all compounds...
          @DataFieldLabels = @CmpdDataFieldLabels;
          for $Label (@DataFieldLabels) {
            $DataFieldLabelsMap{$Label} = "PresentInAll";
          }
          next CMPDSTRING;
        }

        # Setup a map for the current labels...
        my(%CmpdDataFieldLabelsMap) = ();
        for $Label (@CmpdDataFieldLabels) {
          $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
        }

        # A known label missing from this compound is no longer present in
        # all compounds...
        for $Label (@DataFieldLabels) {
          if (!$CmpdDataFieldLabelsMap{$Label}) {
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }

        # A label seen for the first time can't be present in all compounds...
        for $Label (@CmpdDataFieldLabels) {
          if (!$DataFieldLabelsMap{$Label}) {
            push @DataFieldLabels, $Label;
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }
      }

      # Identify the common data field labels once the whole file has been seen...
      if ($CollectDataFields && $CommonDataFieldsMode) {
        @CommonDataFieldLabels = grep { $DataFieldLabelsMap{$_} eq "PresentInAll" } @DataFieldLabels;
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
    $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;

    close $SDFileRef;
  }
}
|
|
891
|
|
# Process options...
#
# Translate the raw command line values stored in %Options (collected by
# SetupScriptUsage) into the %OptionsInfo hash used by the rest of the script.
# Mode specific values specified using "-d --datafields", "--datafieldsfile" and
# "--record" options are parsed and validated here; the script dies with an
# error message for missing or inconsistent values.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Delimiter used to parse "-d --datafields" and "--datafieldsfile" values...
  $OptionsInfo{InDelim} = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $OptionsInfo{InDelim} = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $OptionsInfo{InDelim} = "\t";
  }

  # Delimiter used for output text files...
  $OptionsInfo{OutDelim} = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }

  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};

  $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
  $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;

  $OptionsInfo{Violations} = $Options{violations};
  $OptionsInfo{Seed} = $Options{seed};

  # Data field based modes need exactly one of "-d --datafields" or "--datafieldsfile"...
  if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    if ($Options{datafields} || $Options{datafieldsfile}) {
      if ($Options{datafields} && $Options{datafieldsfile}) {
        die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
      }
    }
    else {
      die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
    }
  }
  $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
  $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;

  $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;

  %{$OptionsInfo{RecordNums}} = ();
  $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;

  $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;

  if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
    if ($Options{record}) {
      my($Record, @RecordSplit);

      # Spaces are not significant in the comma delimited record value...
      $Record = $Options{record};
      $Record =~ s/ //g;

      @RecordSplit = split ",", $Record;

      if ($Options{mode} =~ /^recordnum$/i ) {
        if (@RecordSplit == 1) {
          $OptionsInfo{RecordNum} = $RecordSplit[0];
          if ($OptionsInfo{RecordNum} <= 0) {
            die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
        }
      }
      elsif ($Options{mode} =~ /^recordnums$/i ) {
        my($RecordNum, $RecordCount, @SortedRecordSplit);

        # Validate record numbers the same way as recordnum and recordrange modes;
        # otherwise, min/max values end up undefined or refer to records which
        # can never be matched...
        if (!@RecordSplit) {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: at least 1 value is required.\n";
        }
        for $RecordNum (@RecordSplit) {
          if ($RecordNum <= 0) {
            die "Error: The value specified, $RecordNum, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }

        @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;

        # Track unique record numbers...
        $RecordCount = 0;
        RECORDNUM: for $RecordNum (@SortedRecordSplit) {
          if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
            next RECORDNUM;
          }
          $RecordCount++;
          $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
        }
        $OptionsInfo{RecordNumsCount} = $RecordCount;
        $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
        $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
      }
      else {
        # recordrange mode...
        if (@RecordSplit == 2) {
          $OptionsInfo{StartRecordNum} = $RecordSplit[0];
          $OptionsInfo{EndRecordNum} = $RecordSplit[1];
          if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
            die "Error: The value pair specified, $Options{record}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
        }
        if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
          die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
        }
      }
    }
    else {
      die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
    }
  }

  @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();

  my($Value);
  if ($Options{mode} =~ /^datafields$/i) {
    # A plain list of data field labels...
    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    if ($Options{datafields}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
    # Triplets: label, value, criterion (eq, le or ge)...
    my(@DataFieldsByValueTriplets);
    @DataFieldsByValueTriplets = ();
    if ($Options{datafields}) {
      @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByValueTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByValueTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
      }
    }
    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByValueTriplets[$Index];
      $Value = $DataFieldsByValueTriplets[$Index + 1];
      $Criterion = $DataFieldsByValueTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|le|ge)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
    # Triplets: label, regex, criterion (eq or ne). The command line value is parsed
    # using quotewords to allow quoted regular expressions...
    my(@DataFieldsByRegexTriplets);

    @DataFieldsByRegexTriplets = ();
    if ($Options{datafields}) {
      @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByRegexTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByRegexTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
      }
    }

    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByRegexTriplets[$Index];
      $Value = $DataFieldsByRegexTriplets[$Index + 1];
      $Criterion = $DataFieldsByRegexTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|ne)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    # First value is the data field label; all remaining values are data field values...
    my($Index, @DataFieldAndValuesList);
    if ($Options{datafields}) {
      @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldAndValuesList = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if (@DataFieldAndValuesList < 2) {
      if ($Options{datafields}) {
        die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
      }
    }

    $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
    $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
    %{$OptionsInfo{SpecifiedDataFieldValues}} = ();

    for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
      $Value = $DataFieldAndValuesList[$Index];
      $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
    }
  }

  $OptionsInfo{SDFileExt} = "sdf";
  $OptionsInfo{TextFileExt} = "csv";

  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{TextFileExt} = "tsv";
  }

  # Only text files are generated for alldatafields and molnames modes...
  if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
    $OptionsInfo{OutputSDFile} = 0;
    $OptionsInfo{OutputTextFile} = 1;
  }
  else {
    $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
    $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
  }

  $OptionsInfo{StrDataString} = $Options{strdatastring};
  $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};

  if (IsEmpty($Options{strdatastringdelimiter})) {
    die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
  }
  $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
  $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;

  # Setup mode specific suffix stored as FileNameMode. The "Dields" spelling for
  # alldatafields and commondatafields modes is intentionally left unchanged:
  # correcting it would alter any name derived from FileNameMode...
  MODE: {
    if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
    if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
    if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
    if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
    if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
    if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
    if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
    if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
    if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
    if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
    if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
    if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
    if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

}

# Read a data fields file and return a flat list of all the words found on its
# lines. Each line is retrieved using GetTextLine and split into words by
# quotewords using the specified delimiter...
#
# The file is opened read-only using three argument open and a lexical file
# handle to make sure the file name is never interpreted as an open mode or a
# command.
#
sub _ReadDataFieldsFileWords {
  my($DataFieldsFile, $Delim) = @_;
  my($DataFieldsFileHandle, $Line, @LineWords, @FileWords);

  @FileWords = ();

  open $DataFieldsFileHandle, "<", $DataFieldsFile or die "Error: Couldn't open $DataFieldsFile: $! \n";
  while ($Line = GetTextLine($DataFieldsFileHandle)) {
    @LineWords = quotewords($Delim, 0, $Line);
    if (@LineWords) {
      push @FileWords, @LineWords;
    }
  }
  close $DataFieldsFileHandle;

  return @FileWords;
}
|
|
1200
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initialize %Options with default values, parse the command line using
# GetOptions, change to the working directory when "-w --workingdir" is
# specified, and validate option values which have a fixed set of allowed
# values. The script dies with an error message for any invalid value.
#
sub SetupScriptUsage {

  # Setup default values...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  # Retrieve all the options...
  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any file names are processed...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    # The list of allowed values must match the modes accepted by the regular expression above...
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
|
|
1264
|
|
1265 __END__
|
|
1266
|
|
1267 =head1 NAME
|
|
1268
|
|
1269 ExtractFromSDFiles.pl - Extract specific data from SDFile(s)
|
|
1270
|
|
1271 =head1 SYNOPSIS
|
|
1272
|
|
1273 ExtractFromSDFiles.pl SDFile(s)...
|
|
1274
|
|
1275 ExtractFromSDFiles.pl [B<-h, --help>]
|
|
1276 [B<-d, --datafields> "fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value..."]
|
|
1277 [B<--datafieldsfile> filename] [B<--indelim> comma | tab | semicolon] [B<-m, --mode> alldatafields |
|
|
1278 commondatafields | datafieldnotbylist | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist |
|
|
1279 datafielduniquebylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords |
|
|
1280 3dcmpdrecords ] [B<-n, --numofcmpds> number] [B<--outdelim> comma | tab | semicolon]
|
|
1281 [B<--output> SD | text | both] [B<-o, --overwrite>] [B<-q, --quote> yes | no]
|
|
1282 [B<--record> recnum | recnums | startrecnum,endrecnum] [B<--RegexIgnoreCase> yes | no]
|
|
1283 [B<-r, --root> rootname] [B<-s, --seed> number] [B<--StrDataString> yes | no]
|
|
1284 [B<--StrDataStringDelimiter> text] [B<--StrDataStringMode> StrOnly | StrAndDataFields]
|
|
1285 [B<--ValueComparisonMode> I<Numeric | Alphanumeric>]
|
|
1286 [B<-v, --violations> number] [B<-w, --workingdir> dirname] SDFile(s)...
|
|
1287
|
|
1288 =head1 DESCRIPTION
|
|
1289
|
|
1290 Extract specific data from I<SDFile(s)> and generate appropriate SD or CSV/TSV text
|
|
1291 file(s). The structure data from SDFile(s) is not transferred to CSV/TSV text file(s).
|
|
1292 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf>
|
|
1293 and I<.sd>. All other file names are ignored. All the SD files in a current directory
|
|
1294 can be specified either by I<*.sdf> or the current directory name.
|
|
1295
|
|
1296 =head1 OPTIONS
|
|
1297
|
|
1298 =over 4
|
|
1299
|
|
1300 =item B<-h, --help>
|
|
1301
|
|
1302 Print this help message.
|
|
1303
|
|
1304 =item B<-d, --datafields> I<"fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value,...">
|
|
1305
|
|
1306 This value is mode specific. In general, it's a list of comma separated data field labels
|
|
1307 and associated mode specific values.
|
|
1308
|
|
1309 For I<datafields> mode, input value format is: I<fieldlabel,...>. Examples:
|
|
1310
|
|
1311 Extreg
|
|
1312 Extreg,CompoundName,ID
|
|
1313
|
|
1314 For I<datafieldsbyvalue> mode, input value format contains these triplets:
|
|
1315 I<fieldlabel,value, criteria...>. Possible values for criteria: I<le, ge or eq>.
|
|
1316 The value of B<--ValueComparisonMode> indicates whether values are
|
|
1317 compared using numerical or string comparison operators. Default is to consider
|
|
1318 data field values as numerical values and use numerical comparison operators.
|
|
1319 Examples:
|
|
1320
|
|
1321 MolWt,450,le
|
|
1322 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le
|
|
1323
|
|
1324 For I<datafieldsbyregex> mode, input value format contains these triplets:
|
|
1325 I<fieldlabel,regex, criteria...>. I<regex> corresponds to any valid regular expression
|
|
1326 and is used to match the values for specified I<fieldlabel>. Possible values for criteria:
|
|
1327 I<eq or ne>. During I<eq> and I<ne> values, data field label value is matched with
|
|
1328 regular expression using =~ and !~ respectively. B<--RegexIgnoreCase> option
|
|
1329 value is used to determine whether to ignore letter upper/lower case during
|
|
1330 regular expression match. Examples:
|
|
1331
|
|
1332 Name,ol,eq
|
|
1333 Name,'^pat',ne
|
|
1334
|
|
1335 For I<datafieldbylist> and I<datafielduniquebylist> mode, input value format is:
|
|
1336 I<fieldlabel,value1,value2...>. This is equivalent to I<datafieldsbyvalue> mode with
|
|
1337 this input value format:I<fieldlabel,value1,eq,fieldlabel,value2,eq,...>. For
|
|
1338 I<datafielduniquebylist> mode, only unique compounds identified by first occurrence
|
|
1339 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds
|
|
1340 are simply ignored.
|
|
1341
|
|
1342 For I<datafieldnotbylist> mode, input value format is: I<fieldlabel,value1,value2...>. In this
|
|
1343 mode, the script behaves exactly opposite of I<datafieldbylist> mode, and only those compounds
|
|
1344 are extracted whose data field values don't match any specified data field value.
|
|
1345
|
|
1346 =item B<--datafieldsfile> I<filename>
|
|
1347
|
|
1348 Filename which contains various mode specific values. This option provides a way
|
|
1349 to specify mode specific values in a file instead of entering them on the command
|
|
1350 line using B<-d --datafields>.
|
|
1351
|
|
1352 For I<datafields> mode, input file lines contain comma delimited field labels:
|
|
1353 I<fieldlabel,...>. Example:
|
|
1354
|
|
1355 Line 1:MolId
|
|
1356 Line 2:"Extreg",CompoundName,ID
|
|
1357
|
|
1358 For I<datafieldsbyvalue> mode, input file lines contains these comma separated triplets:
|
|
1359 I<fieldlabel,value, criteria>. Possible values for criteria: I<le, ge or eq>. Examples:
|
|
1360
|
|
1361 Line 1:MolWt,450,le
|
|
1362
|
|
1363 Line 1:"MolWt",450,le,"LogP",5,le,"SumNumNO",10,le,"SumNHOH",5,le
|
|
1364
|
|
1365 Line 1:MolWt,450,le
|
|
1366 Line 2:"LogP",5,le
|
|
1367 Line 3:"SumNumNO",10,le
|
|
1368 Line 4: SumNHOH,5,le
|
|
1369
|
|
1370 For I<datafieldbylist> and I<datafielduniquebylist> mode, input file line format is:
|
|
1371
|
|
1372 Line 1:fieldlabel;
|
|
1373 Subsequent lines:value1,value2...
|
|
1374
|
|
1375 For I<datafieldbylist>, I<datafielduniquebylist>, and I<datafieldnotbylist> mode, input file
|
|
1376 line format is:
|
|
1377
|
|
1378 Line 1:fieldlabel;
|
|
1379 Subsequent lines:value1,value2...
|
|
1380
|
|
1381 For I<datafielduniquebylist> mode, only unique compounds identified by first occurrence
|
|
1382 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds
|
|
1383 are simply ignored. Example:
|
|
1384
|
|
1385 Line 1: MolID
|
|
1386 Subsequent Lines:
|
|
1387 907508
|
|
1388 832291,4642
|
|
1389 "1254","907303"
|
|
1390
|
|
1391 =item B<--indelim> I<comma | tab | semicolon>
|
|
1392
|
|
1393 Delimiter used to specify text values for B<-d --datafields> and B<--datafieldsfile> options.
|
|
1394 Possible values: I<comma, tab, or semicolon>. Default value: I<comma>.
|
|
1395
|
|
1396 =item B<-m, --mode> I<alldatafields | commondatafields | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | datafielduniquebylist | datafieldnotbylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | 3dcmpdrecords>
|
|
1397
|
|
1398 Specify what to extract from I<SDFile(s)>. Possible values: I<alldatafields, commondatafields,
|
|
1399 datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist,
|
|
1400 molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords>.
|
|
1401 Default value: I<alldatafields>.
|
|
1402
|
|
1403 For I<alldatafields> and I<molnames> mode, only a CSV/TSV text file is generated; for all
|
|
1404 other modes, however, a SD file is generated by default - you can change the behavior to generate
|
|
1405 text file using I<--output> option.
|
|
1406
|
|
1407 For I<3DCmpdRecords> mode, only those compounds with at least one non-zero value for Z atomic coordinates
|
|
1408 are retrieved; however, during retrieval of compounds in I<2DCmpdRecords> mode, all Z atomic coordinates must
|
|
1409 be zero.
|
|
1410
|
|
1411 =item B<-n, --numofcmpds> I<number>
|
|
1412
|
|
1413 Number of compounds to extract during I<randomcmpds> mode. Default value: I<1>.
|
|
1414
|
|
1415 =item B<--outdelim> I<comma | tab | semicolon>
|
|
1416
|
|
1417 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon>.
|
|
1418 Default value: I<comma>
|
|
1419
|
|
1420 =item B<--output> I<SD | text | both>
|
|
1421
|
|
1422 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<SD>. For
|
|
1423 I<alldatafields> and I<molnames> mode, this option is ignored and only a CSV/TSV text file is generated.
|
|
1424
|
|
1425 =item B<-o, --overwrite>
|
|
1426
|
|
1427 Overwrite existing files.
|
|
1428
|
|
1429 =item B<-q, --quote> I<yes | no>
|
|
1430
|
|
1431 Put quote around column values in output CSV/TSV text file(s). Possible values:
|
|
1432 I<yes or no>. Default value: I<yes>.
|
|
1433
|
|
1434 =item B<--record> I<recnum | recnums | startrecnum,endrecnum>
|
|
1435
|
|
1436 Record number, record numbers or range of records to extract during I<recordnum>, I<recordnums>
|
|
1437 and I<recordrange> mode. Input value format is: <num>, <num1,num2,...> and <startnum, endnum>
|
|
1438 for I<recordnum>, I<recordnums> and I<recordrange> modes respectively. Default value: none.
|
|
1439
|
|
1440 =item B<--RegexIgnoreCase> I<yes or no>
|
|
1441
|
|
1442 Specify whether to ignore case during I<datafieldsbyregex> value of B<-m, --mode> option.
|
|
1443 Possible values: I<yes or no>. Default value: I<yes>.
|
|
1444
|
|
1445 =item B<-r, --root> I<rootname>
|
|
1446
|
|
1447 New file name is generated using the root: <Root>.<Ext>. Default for new file
|
|
1448 names: <SDFileName><mode>.<Ext>. The file type determines <Ext> value.
|
|
1449 The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab
|
|
1450 delimited text files respectively. This option is ignored for multiple input files.
|
|
1451
|
|
1452 =item B<-s, --seed> I<number>
|
|
1453
|
|
1454 Random number seed used for I<randomcmpds> mode. Default:123456789.
|
|
1455
|
|
1456 =item B<--StrDataString> I<yes | no>
|
|
1457
|
|
1458 Specify whether to write out structure data string to CSV/TSV text file(s). Possible values:
|
|
1459 I<yes or no>. Default value: I<no>.
|
|
1460
|
|
1461 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure
|
|
1462 data lines into a structure data string.
|
|
1463
|
|
1464 This option is ignored during generation of SD file(s).
|
|
1465
|
|
1466 =item B<--StrDataStringDelimiter> I<text>
|
|
1467
|
|
1468 Delimiter for joining multiple structure data lines into a string before writing to CSV/TSV text
|
|
1469 file(s). Possible values: I<any alphanumeric text>. Default value: I<|>.
|
|
1470
|
|
1471 This option is ignored during generation of SD file(s).
|
|
1472
|
|
1473 =item B<--StrDataStringMode> I<StrOnly | StrAndDataFields>
|
|
1474
|
|
1475 Specify whether to include SD data fields and values along with the structure data into structure
|
|
1476 data string before writing it out to CSV/TSV text file(s). Possible values: I<StrOnly or StrAndDataFields>.
|
|
1477 Default value: I<StrOnly>.
|
|
1478
|
|
1479 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure
|
|
1480 data lines into a structure data string.
|
|
1481
|
|
1482 This option is ignored during generation of SD file(s).
|
|
1483
|
|
1484 =item B<--ValueComparisonMode> I<Numeric | Alphanumeric>
|
|
1485
|
|
1486 Specify how to compare data field values during I<datafieldsbyvalue> mode: Compare
|
|
1487 values using either numeric or string (eq, le, ge) comparison operators. Possible values:
|
|
1488 I<Numeric or Alphanumeric>. Default value: I<Numeric>.
|
|
1489
|
|
1490 =item B<-v, --violations> I<number>
|
|
1491
|
|
1492 Number of criterion violations allowed for values specified during I<datafieldsbyvalue>
|
|
1493 and I<datafieldsbyregex> mode. Default value: I<0>.
|
|
1494
|
|
1495 =item B<-w, --workingdir> I<dirname>
|
|
1496
|
|
1497 Location of working directory. Default: current directory.
|
|
1498
|
|
1499 =back
|
|
1500
|
|
1501 =head1 EXAMPLES
|
|
1502
|
|
1503 To retrieve all data fields from SD files and generate CSV text files, type:
|
|
1504
|
|
1505 % ExtractFromSDFiles.pl -o Sample.sdf
|
|
1506 % ExtractFromSDFiles.pl -o *.sdf
|
|
1507
|
|
1508 To retrieve all data fields from SD file and generate CSV text files containing
|
|
1509 a column with structure data as a string with | as line delimiter, type:
|
|
1510
|
|
1511 % ExtractFromSDFiles.pl --StrDataString Yes -o Sample.sdf
|
|
1512
|
|
1513 To retrieve MOL_ID data field from SD file and generate CSV text files containing
|
|
1514 a column with structure data along with all data fields as a string with | as line
|
|
1515 delimiter, type:
|
|
1516
|
|
1517 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID" --StrDataString Yes
|
|
1518 --StrDataStringMode StrAndDataFields --StrDataStringDelimiter "|"
|
|
1519 --output text -o Sample.sdf
|
|
1520
|
|
1521 To retrieve common data fields which exist for all the compounds in
|
|
1522 a SD file and generate a TSV text file NewSample.tsv, type:
|
|
1523
|
|
1524 % ExtractFromSDFiles.pl -m commondatafields --outdelim tab -r NewSample
|
|
1525 --output Text -o Sample.sdf
|
|
1526
|
|
1527 To retrieve Mol_ID, MolWeight, and CompoundName data fields from a SD file and generate a
|
|
1528 CSV text file NewSample.csv, type:
|
|
1529
|
|
1530 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID,MolWeight,
|
|
1531 CompoundName" -r NewSample --output Text -o Sample.sdf
|
|
1532
|
|
1533 To retrieve compounds which meet a specific set of criteria - MolWt <= 450,
|
|
1534 LogP <= 5 and SumNO < 10 - from a SD file and generate a new SD file NewSample.sdf,
|
|
1535 type:
|
|
1536
|
|
1537 % ExtractFromSDFiles.pl -m datafieldsbyvalue -d "MolWt,450,le,LogP
|
|
1538 ,5,le,SumNO,10" -r NewSample -o Sample.sdf
|
|
1539
|
|
1540 To retrieve compounds from a SD file with a specific set of values for MolID and
|
|
1541 generate a new SD file NewSample.sdf, type:
|
|
1542
|
|
1543 % ExtractFromSDFiles.pl -m datafieldbylist -d "Mol_ID,159,4509,4619"
|
|
1544 -r NewSample -o Sample.sdf
|
|
1545
|
|
1546 To retrieve compounds from a SD file with values for MolID not on a list of specified
|
|
1547 values and generate a new SD file NewSample.sdf, type:
|
|
1548
|
|
1549 % ExtractFromSDFiles.pl -m datafieldnotbylist -d "Mol_ID,159,4509,4619"
|
|
1550 -r NewSample -o Sample.sdf
|
|
1551
|
|
1552 To retrieve 10 random compounds from a SD file and generate a new SD file RandomSample.sdf, type:
|
|
1553
|
|
1554 % ExtractFromSDFiles.pl -m randomcmpds -n 10 -r RandomSample
|
|
1555 -o Sample.sdf
|
|
1556
|
|
1557 To retrieve compound record number 10 from a SD file and generate a new SD file NewSample.sdf, type:
|
|
1558
|
|
1559 % ExtractFromSDFiles.pl -m recordnum --record 10 -r NewSample
|
|
1560 -o Sample.sdf
|
|
1561
|
|
1562 To retrieve compound record numbers 10, 20 and 30 from a SD file and generate a new SD file
|
|
1563 NewSample.sdf, type:
|
|
1564
|
|
1565 % ExtractFromSDFiles.pl -m recordnums --record 10,20,30 -r NewSample
|
|
1566 -o Sample.sdf
|
|
1567
|
|
1568 To retrieve compound records between 10 and 20 from a SD file and generate a new SD
|
|
1569 file NewSample.sdf, type:
|
|
1570
|
|
1571 % ExtractFromSDFiles.pl -m recordrange --record 10,20 -r NewSample
|
|
1572 -o Sample.sdf
|
|
1573
|
|
1574 =head1 AUTHOR
|
|
1575
|
|
1576 Manish Sud <msud@san.rr.com>
|
|
1577
|
|
1578 =head1 SEE ALSO
|
|
1579
|
|
1580 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl
|
|
1581
|
|
1582 =head1 COPYRIGHT
|
|
1583
|
|
1584 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
1585
|
|
1586 This file is part of MayaChemTools.
|
|
1587
|
|
1588 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
1589 the terms of the GNU Lesser General Public License as published by the Free
|
|
1590 Software Foundation; either version 3 of the License, or (at your option)
|
|
1591 any later version.
|
|
1592
|
|
1593 =cut
|