Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/ExtractFromSDFiles.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: ExtractFromSDFiles.pl,v $ | |
4 # $Date: 2015/03/22 19:11:27 $ | |
5 # $Revision: 1.48 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability or fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use SDFileUtil; | |
36 use FileUtil; | |
37 use TextUtil; | |
38 | |
# File-scope state; %Options is also used by the subroutines defined below.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Announce the start of the run and start the timer...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Set up script usage; show the usage text and stop when help is requested
# or no input file is given on the command line...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Expand the command line arguments into the list of SD files...
my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options; %OptionsInfo is filled in by ProcessOptions...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Collect information about SD files; %SDFilesInfo is filled in by
# RetrieveSDFilesInfo...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files for each input file marked okay...
my($FileIndex);
print "\nProcessing SD files...\n" if (@SDFilesList > 1);

for $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  ExtractFromSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report the elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
84 | |
85 ############################################################################### | |
86 | |
# Extract data from a SD file...
#
# Looks up the extraction routine for the mode in $OptionsInfo{Mode} (matched
# case-insensitively), opens the input and output files for the SD file at
# $FileIndex in @SDFilesList, runs the routine and closes the files.
#
# Dies with the list of allowed values when the mode is not recognized. The
# mode is checked before any file is opened so that an invalid mode doesn't
# leave behind an empty output file.
#
sub ExtractFromSDFile {
  my($FileIndex) = @_;
  my($Mode, $ExtractorRef, %ModeToExtractorMap);

  # Lowercase mode name to extraction routine; both list based modes share
  # the same routine, which checks the mode itself...
  %ModeToExtractorMap = (
    'alldatafields' => \&ExtractAllDataFields,
    'commondatafields' => \&ExtractCommonDataFields,
    'datafields' => \&ExtractDataFields,
    'datafieldbylist' => \&ExtractDataFieldByList,
    'datafielduniquebylist' => \&ExtractDataFieldByList,
    'datafieldnotbylist' => \&ExtractDataFieldNotByList,
    'datafieldsbyvalue' => \&ExtractDataFieldsByValue,
    'datafieldsbyregex' => \&ExtractDataFieldsByRegex,
    'randomcmpds' => \&ExtractRandomCompounds,
    'molnames' => \&ExtractMolNames,
    'recordnum' => \&ExtractRecordNum,
    'recordnums' => \&ExtractRecordNums,
    'recordrange' => \&ExtractRecordRange,
    '2dcmpdrecords' => \&Extract2DCmpdRecords,
    '3dcmpdrecords' => \&Extract3DCmpdRecords,
  );

  $Mode = defined($OptionsInfo{Mode}) ? lc($OptionsInfo{Mode}) : '';
  $ExtractorRef = $ModeToExtractorMap{$Mode};

  if (!$ExtractorRef) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

  OpenInputAndOutputFiles($FileIndex);
  $ExtractorRef->($FileIndex);
  CloseInputAndOutputFiles();
}
155 | |
# Extract all data fields...
#
# Uses the data field labels collected for the file at $FileIndex as the
# output columns and writes out each compound read from the input SD file to
# the text and SD output files.
#
sub ExtractAllDataFields {
  my($FileIndex) = @_;
  my($InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  # Keep reading until ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
173 | |
# Extract common data fields...
#
# Uses the common data field labels collected for the file at $FileIndex as
# the output columns and writes out each compound read from the input SD file
# to the text and SD output files.
#
sub ExtractCommonDataFields {
  my($FileIndex) = @_;
  my($InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{CommonDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  # Keep reading until ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
191 | |
# Extract specified data fields...
#
# Uses the labels in $OptionsInfo{SpecifiedDataFieldLabels} as the output
# columns and writes out each compound read from the input SD file to the
# text and SD output files.
#
sub ExtractDataFields {
  my($FileIndex) = @_;
  my($InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$OptionsInfo{SpecifiedDataFieldLabels}};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  # Keep reading until ReadCmpdString returns a false value...
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    WriteTextFileCmpdData();
    WriteSDFileCmpdData();
  }
}
209 | |
# Extract data fields using a list...
#
# Handles both DataFieldByList and DataFieldUniqueByList modes. Compounds
# whose value for the specified data field is on the specified list are
# written out; in the unique mode only the first compound for each listed
# value is written. Reading stops as soon as every listed value has been
# seen at least once.
#
sub ExtractDataFieldByList {
  my($FileIndex) = @_;
  my($FieldLabel, $TotalValuesCount, $FoundValuesCount, $UniqueMode, $ListMode);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  # Mark all the specified values as not yet seen in this file...
  $OptionsInfo{SpecifiedDataFieldValues}{$_} = "NotFound" for (keys %{$OptionsInfo{SpecifiedDataFieldValues}});

  $FoundValuesCount = 0;
  $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
  $TotalValuesCount = $OptionsInfo{SpecifiedDataFieldValuesCount};

  $UniqueMode = ($OptionsInfo{Mode} =~ /^DataFieldUniqueByList$/i) ? 1 : 0;
  $ListMode = ($OptionsInfo{Mode} =~ /^DataFieldByList$/i) ? 1 : 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    # Skip compounds without the specified data field...
    next CMPDSTRING unless (exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

    SetupDataValues();

    my($Value) = $SDFilesInfo{DataFieldValues}{$FieldLabel};

    # Skip compounds whose value is not on the list...
    next CMPDSTRING unless (exists $OptionsInfo{SpecifiedDataFieldValues}{$Value});

    if ($FoundValuesCount < $TotalValuesCount) {
      my($FirstOccurrence) = ($OptionsInfo{SpecifiedDataFieldValues}{$Value} eq "NotFound") ? 1 : 0;

      if ($FirstOccurrence) {
        $FoundValuesCount++;
        $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "Found";
      }

      # Unique mode writes first occurrences only; list mode writes every match...
      if (($FirstOccurrence && $UniqueMode) || $ListMode) {
        WriteSDFileCmpdString();
        WriteTextFileCmpdData();
      }
    }

    last CMPDSTRING if ($FoundValuesCount >= $TotalValuesCount);
  }
}
260 | |
# Extract compounds whose value for the specified data field is not on the
# specified list...
#
# Compounds which don't have the specified data field, or have an empty value
# for it, are skipped as well.
#
sub ExtractDataFieldNotByList {
  my($FileIndex) = @_;
  my($FieldLabel, $InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $FieldLabel = $OptionsInfo{SpecifiedDataFieldLabel};
  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    next CMPDSTRING unless (exists $SDFilesInfo{DataFieldValues}{$FieldLabel});

    SetupDataValues();

    my($Value) = $SDFilesInfo{DataFieldValues}{$FieldLabel};

    # Make sure the current value is not empty and is not on the specified list of values...
    next CMPDSTRING if (IsEmpty($Value) || exists $OptionsInfo{SpecifiedDataFieldValues}{$Value});

    WriteSDFileCmpdString();
    WriteTextFileCmpdData();
  }
}
292 | |
# Extract data fields by value...
#
# For each compound, every specified data field present in the compound is
# compared against its specified value using its criterion (eq, le or ge),
# numerically or as strings depending on $OptionsInfo{NumericalComparison}.
# A compound is written out when the number of failed comparisons doesn't
# exceed $OptionsInfo{Violations}. Specified data fields missing from a
# compound are not counted as violations.
#
sub ExtractDataFieldsByValue {
  my($FileIndex) = @_;
  my($Label, $ViolationCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    $ViolationCount = 0;

    LABEL: for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless (exists $SDFilesInfo{DataFieldValues}{$Label});

      my($Value) = $SDFilesInfo{DataFieldValues}{$Label};
      my($Criterion) = $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label};
      my($Target) = $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label};
      my($Violated) = 0;

      # Only the comparison for the matching criterion is evaluated...
      if ($OptionsInfo{NumericalComparison}) {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($Value != $Target) ? 1 : 0;
        }
        elsif ($Criterion =~ /^le$/i) {
          $Violated = ($Value > $Target) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ge$/i) {
          $Violated = ($Value < $Target) ? 1 : 0;
        }
      }
      else {
        if ($Criterion =~ /^eq$/i) {
          $Violated = ($Value ne $Target) ? 1 : 0;
        }
        elsif ($Criterion =~ /^le$/i) {
          $Violated = ($Value gt $Target) ? 1 : 0;
        }
        elsif ($Criterion =~ /^ge$/i) {
          $Violated = ($Value lt $Target) ? 1 : 0;
        }
      }
      $ViolationCount++ if ($Violated);
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
338 | |
# Extract data fields by value using regular expression match...
#
# For each compound, every specified data field present in the compound is
# matched against its specified regular expression, ignoring case when
# $OptionsInfo{RegexIgnoreCase} is set. Criterion eq expects a match and ne
# expects no match. A compound is written out when the number of failed
# expectations doesn't exceed $OptionsInfo{Violations}.
#
sub ExtractDataFieldsByRegex {
  my($FileIndex) = @_;
  my($Label, $ViolationCount);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);

    SetupDataValues();
    $ViolationCount = 0;

    LABEL: for $Label (@{$OptionsInfo{SpecifiedDataFieldLabels}}) {
      next LABEL unless (exists $SDFilesInfo{DataFieldValues}{$Label});

      my($Value) = $SDFilesInfo{DataFieldValues}{$Label};
      my($Criterion) = $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label};
      my($Regex) = $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label};

      # Figure out whether a match is expected; other criteria are ignored
      # without evaluating the regular expression...
      my($MatchExpected);
      if ($Criterion =~ /^eq$/i) {
        $MatchExpected = 1;
      }
      elsif ($Criterion =~ /^ne$/i) {
        $MatchExpected = 0;
      }
      else {
        next LABEL;
      }

      my($Matched);
      if ($OptionsInfo{RegexIgnoreCase}) {
        $Matched = ($Value =~ /$Regex/i) ? 1 : 0;
      }
      else {
        $Matched = ($Value =~ /$Regex/) ? 1 : 0;
      }

      $ViolationCount++ if ($Matched != $MatchExpected);
    }

    if ($ViolationCount <= $OptionsInfo{Violations}) {
      WriteSDFileCmpdString();
      WriteTextFileCmpdData();
    }
  }
}
382 | |
# Extract random compounds...
#
# Selects $OptionsInfo{NumOfCmpds} distinct record numbers at random, or all
# records when the file contains fewer compounds, using
# $OptionsInfo{Seed} to seed the random number generator, and writes out the
# compounds at those record numbers in file order.
#
# Random record numbers are drawn until the required number of distinct
# values has been collected, so that duplicate draws don't reduce the number
# of extracted compounds.
#
sub ExtractRandomCompounds {
  my($FileIndex) = @_;
  my($CmpdNum, $CmpdCount, $NumToSelect, $ExtractedCount, $RandomIndex, @CmpdLines, %RandomCmpdIndexMap);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $CmpdCount = $SDFilesInfo{CmpdCount}[$FileIndex];
  srand($OptionsInfo{Seed});

  # Can't select more distinct records than the file contains...
  $NumToSelect = $OptionsInfo{NumOfCmpds};
  if ($NumToSelect > $CmpdCount) {
    $NumToSelect = $CmpdCount;
  }

  # Terminates because $NumToSelect never exceeds the number of possible
  # record numbers, 1 .. $CmpdCount...
  %RandomCmpdIndexMap = ();
  while (scalar(keys %RandomCmpdIndexMap) < $NumToSelect) {
    $RandomIndex = int(rand($CmpdCount)) + 1;
    $RandomCmpdIndexMap{$RandomIndex} = $RandomIndex;
  }

  $CmpdNum = 0;
  $ExtractedCount = 0;
  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    $CmpdNum++;
    if (!exists $RandomCmpdIndexMap{$CmpdNum}) {
      next CMPDSTRING;
    }

    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }

    # No need to read the rest of the file after the last selected record...
    $ExtractedCount++;
    if ($ExtractedCount >= $NumToSelect) {
      last CMPDSTRING;
    }
  }
}
420 | |
# Extract mol names...
#
# Writes a "MolName" column label followed by one line per compound
# containing the molecule name parsed from the first line of the compound
# string, quoted according to $OptionsInfo{OutQuote}.
#
sub ExtractMolNames {
  my($FileIndex) = @_;
  my($MolName, $NewTextFileRef, @CmpdLines);

  # Reset the labels instead of appending to them: the labels from the
  # previous file are still around while processing multiple files and
  # appending would repeat the "MolName" column label for each additional file...
  @{$SDFilesInfo{DataLabels}} = ("MolName");
  WriteTextFileColLabels();

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  while ($SDFilesInfo{CmpdString} = ReadCmpdString($SDFilesInfo{InputSDFileRef})) {
    @CmpdLines = split "\n", $SDFilesInfo{CmpdString};
    $MolName = QuoteAWord(ParseCmpdMolNameLine($CmpdLines[0]), $OptionsInfo{OutQuote});
    print $NewTextFileRef "$MolName\n";
  }
}
436 | |
# Extract a specific compound record...
#
# Skips compounds until the record number in $OptionsInfo{RecordNum} is
# reached, writes out that compound and stops reading.
#
sub ExtractRecordNum {
  my($FileIndex) = @_;
  my($RecordNum, $InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  $RecordNum = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    $RecordNum++;
    next CMPDSTRING unless ($RecordNum == $OptionsInfo{RecordNum});

    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    WriteSDFileCmpdString();

    # Data field values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
    last CMPDSTRING;
  }
}
464 | |
# Extract specific compound records...
#
# Writes out the compounds whose record numbers are keys of
# $OptionsInfo{RecordNums}. Reading stops at the first unlisted record which
# is either past $OptionsInfo{RecordNumsMax} or found after
# $OptionsInfo{RecordNumsCount} compounds have already been written.
#
sub ExtractRecordNums {
  my($FileIndex) = @_;
  my($RecordNum, $WrittenCount, $InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  $RecordNum = 0;
  $WrittenCount = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    $RecordNum++;

    if (!exists $OptionsInfo{RecordNums}{$RecordNum}) {
      # Not a listed record; see whether there is anything left to find...
      last CMPDSTRING if ($RecordNum > $OptionsInfo{RecordNumsMax} || $WrittenCount >= $OptionsInfo{RecordNumsCount});
      next CMPDSTRING;
    }

    $WrittenCount++;
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    # Data field values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
496 | |
497 | |
# Extract compounds in a specific record range...
#
# Writes out the compounds with record numbers from
# $OptionsInfo{StartRecordNum} to $OptionsInfo{EndRecordNum}, both inclusive,
# and stops reading at the first record past the end of the range.
#
sub ExtractRecordRange {
  my($FileIndex) = @_;
  my($RecordNum, $InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};
  $RecordNum = 0;

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    $RecordNum++;

    last CMPDSTRING if ($RecordNum > $OptionsInfo{EndRecordNum});
    next CMPDSTRING if ($RecordNum < $OptionsInfo{StartRecordNum});

    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};

    WriteSDFileCmpdString();

    # Data field values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
526 | |
# Extract 2D compound records...
#
# Writes out the compounds for which IsCmpd2D returns a true value.
#
sub Extract2DCmpdRecords {
  my($FileIndex) = @_;
  my($InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    next CMPDSTRING unless (IsCmpd2D(\@Lines));

    WriteSDFileCmpdString();

    # Data field values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
551 | |
# Extract 3D compound records...
#
# Writes out the compounds for which IsCmpd3D returns a true value.
#
sub Extract3DCmpdRecords {
  my($FileIndex) = @_;
  my($InputSDFileRef);

  @{$SDFilesInfo{DataLabels}} = @{$SDFilesInfo{AllDataFieldLabels}[$FileIndex]};
  WriteTextFileColLabels();

  $InputSDFileRef = $SDFilesInfo{InputSDFileRef};

  CMPDSTRING: while ($SDFilesInfo{CmpdString} = ReadCmpdString($InputSDFileRef)) {
    my(@Lines) = split /\n/, $SDFilesInfo{CmpdString};
    next CMPDSTRING unless (IsCmpd3D(\@Lines));

    WriteSDFileCmpdString();

    # Data field values are needed only for the text file...
    if ($OptionsInfo{OutputTextFile}) {
      %{$SDFilesInfo{DataFieldValues}} = GetCmpdDataHeaderLabelsAndValues(\@Lines);
      SetupDataValues();
      WriteTextFileCmpdData();
    }
  }
}
576 | |
577 | |
# Open input and output files...
#
# Opens the output SD and/or text files selected by the output options for
# the SD file at $FileIndex, opens the input SD file for reading, and stores
# the file handles in $SDFilesInfo{NewSDFileRef}, $SDFilesInfo{NewTextFileRef}
# and $SDFilesInfo{InputSDFileRef}. Dies when a file can't be opened.
#
# Files are opened using the three argument form of open along with lexical
# file handles so that characters in file names, such as leading ">" or "|"
# or surrounding whitespace, are never interpreted as part of the open mode.
#
sub OpenInputAndOutputFiles {
  my($FileIndex) = @_;
  my($SDFile, $NewSDFileName, $NewTextFileName);

  $SDFilesInfo{NewTextFileRef} = undef;
  $SDFilesInfo{NewSDFileRef} = undef;

  $SDFile = $SDFilesList[$FileIndex];
  $NewSDFileName = $SDFilesInfo{NewSDFileName}[$FileIndex];
  $NewTextFileName = $SDFilesInfo{NewTextFileName}[$FileIndex];

  if ($OptionsInfo{OutputTextFile} && $OptionsInfo{OutputSDFile}) {
    print "Generating files $NewSDFileName and $NewTextFileName...\n";
  }
  elsif ($OptionsInfo{OutputSDFile}) {
    print "Generating file $NewSDFileName...\n";
  }
  else {
    print "Generating file $NewTextFileName...\n";
  }

  if ($OptionsInfo{OutputSDFile}) {
    open my $NewSDFileRef, '>', $NewSDFileName or die "Error: Couldn't open $NewSDFileName: $! \n";
    $SDFilesInfo{NewSDFileRef} = $NewSDFileRef;
  }
  if ($OptionsInfo{OutputTextFile}) {
    open my $NewTextFileRef, '>', $NewTextFileName or die "Error: Couldn't open $NewTextFileName: $! \n";
    $SDFilesInfo{NewTextFileRef} = $NewTextFileRef;
  }

  open my $InputSDFileRef, '<', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
  $SDFilesInfo{InputSDFileRef} = $InputSDFileRef;
}
608 | |
# Close open input and output files...
#
# Closes whichever of the output SD, output text and input SD file handles
# are set in %SDFilesInfo and resets all three entries to undef.
#
sub CloseInputAndOutputFiles {
  my($FileRefKey);

  for $FileRefKey ("NewSDFileRef", "NewTextFileRef", "InputSDFileRef") {
    if ($SDFilesInfo{$FileRefKey}) {
      close $SDFilesInfo{$FileRefKey};
    }
  }

  for $FileRefKey ("NewTextFileRef", "NewSDFileRef", "InputSDFileRef") {
    $SDFilesInfo{$FileRefKey} = undef;
  }
}
626 | |
# Write out column labels for text file...
#
# Does nothing unless a text file is being generated. The labels in
# $SDFilesInfo{DataLabels} are joined using the output delimiter and quote
# options; a "StructureDataString" label is appended at the end when
# $OptionsInfo{OutputStrDataString} is set.
#
sub WriteTextFileColLabels {
  my($LabelsRef, $LabelsLine, $TextFileRef);

  return unless ($OptionsInfo{OutputTextFile});

  $TextFileRef = $SDFilesInfo{NewTextFileRef};

  $LabelsRef = \@{$SDFilesInfo{DataLabels}};
  if ($OptionsInfo{OutputStrDataString}) {
    # Use a copy to append structure data string label...
    $LabelsRef = [@{$LabelsRef}, "StructureDataString"];
  }

  $LabelsLine = JoinWords($LabelsRef, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});
  print $TextFileRef "$LabelsLine\n";
}
651 | |
# Setup values for data fields...
#
# Fills $SDFilesInfo{DataValues} with the current compound's value for each
# label in $SDFilesInfo{DataLabels}, in label order; an empty string is used
# for labels the compound doesn't have.
#
sub SetupDataValues {
  my($Label, @Values);

  @Values = ();
  for $Label (@{$SDFilesInfo{DataLabels}}) {
    if (exists $SDFilesInfo{DataFieldValues}{$Label}) {
      push @Values, $SDFilesInfo{DataFieldValues}{$Label};
    }
    else {
      push @Values, "";
    }
  }
  @{$SDFilesInfo{DataValues}} = @Values;
}
656 | |
# Write out structure data and specific data fields to SD file...
#
# Does nothing unless a SD file is being generated. Writes the structure part
# of the current compound string, everything up to and including the
# "M  END" line, followed by a data field entry for each label in
# $SDFilesInfo{DataLabels} with the corresponding value from
# $SDFilesInfo{DataValues}, and the "$$$$" record terminator.
#
sub WriteSDFileCmpdData {
  my($MolString, $Count, $NewSDFileRef);

  if (!$OptionsInfo{OutputSDFile}) {
    return;
  }

  $NewSDFileRef = $SDFilesInfo{NewSDFileRef};

  # The structure block ends with "M  END": M followed by two spaces and END.
  # Splitting on any other spelling would leave the original data fields in
  # the structure part...
  ($MolString) = split /M  END/, $SDFilesInfo{CmpdString};
  $MolString .= "M  END";
  print $NewSDFileRef "$MolString\n";

  for $Count (0 .. $#{$SDFilesInfo{DataLabels}}) {
    print $NewSDFileRef "> <$SDFilesInfo{DataLabels}[$Count]>\n$SDFilesInfo{DataValues}[$Count]\n\n";
  }
  print $NewSDFileRef "\$\$\$\$\n";
}
676 | |
# Write out compound string...
#
# Writes the current compound string as is, followed by a newline, to the
# output SD file; does nothing unless a SD file is being generated.
#
sub WriteSDFileCmpdString {
  return unless ($OptionsInfo{OutputSDFile});

  my($SDFileRef) = $SDFilesInfo{NewSDFileRef};
  print $SDFileRef "$SDFilesInfo{CmpdString}\n";
}
688 | |
# Write out data for text file...
#
# Does nothing unless a text file is being generated. Writes one line with
# the values in $SDFilesInfo{DataValues} joined using the output delimiter
# and quote options. When $OptionsInfo{OutputStrDataString} is set, the
# structure data string of the current compound is appended as the last
# column with its newlines replaced by $OptionsInfo{StrDataStringDelimiter}.
#
sub WriteTextFileCmpdData {
  my($DataValuesLine, $NewTextFileRef);

  if (!$OptionsInfo{OutputTextFile}) {
    return;
  }

  $NewTextFileRef = $SDFilesInfo{NewTextFileRef};
  $DataValuesLine = JoinWords(\@{$SDFilesInfo{DataValues}}, $OptionsInfo{OutDelim}, $OptionsInfo{OutQuote});

  # Handle multiple lines data values for data fields by joining 'em using semicolons...
  $DataValuesLine =~ s/\n/;/g;

  if (!$OptionsInfo{OutputStrDataString}) {
    print $NewTextFileRef "$DataValuesLine\n";
    return;
  }

  # Append structure data string...
  my($StrDataString, $OutQuoteValue, $OutDelim, $StrDataStringDelimiter);

  if ($OptionsInfo{StrDataStringWithFields}) {
    $StrDataString = $SDFilesInfo{CmpdString};
  }
  else {
    # Keep the structure block only; it ends with "M  END": M followed by
    # two spaces and END...
    ($StrDataString) = split /M  END/, $SDFilesInfo{CmpdString};
    $StrDataString .= "M  END";
  }
  $StrDataStringDelimiter = $OptionsInfo{StrDataStringDelimiter};
  $StrDataString =~ s/\n/$StrDataStringDelimiter/g;

  $OutDelim = $OptionsInfo{OutDelim};
  $OutQuoteValue = $OptionsInfo{OutQuote} ? "\"" : "";

  print $NewTextFileRef "$DataValuesLine${OutDelim}${OutQuoteValue}${StrDataString}${OutQuoteValue}\n";
}
728 | |
# Retrieve information about input SD files...
#
# Resets %SDFilesInfo and, for each file in @SDFilesList, records at the
# file's index: whether the file is okay to process (FileOkay), the names of
# the new SD and text files, the number of compounds counted and the data
# field labels collected (all labels and labels common to all compounds).
#
# A file is skipped with a warning, leaving FileOkay set to 0, when it
# doesn't exist, isn't a SD file, would be overwritten by its own SD output,
# has existing output files and overwriting isn't allowed, or can't be opened.
#
# The file is scanned only for the modes which need a compound count
# (randomcmpds) or data field labels for the text file; for other
# combinations the compound count stays 0 and the label lists stay empty.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileDir, $FileExt, $FileName, $NewFileName, $NewSDFileName, $NewTextFileName, $CmpdCount);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{CmpdCount}} = ();
  @{$SDFilesInfo{NewTextFileName}} = ();
  @{$SDFilesInfo{NewSDFileName}} = ();

  @{$SDFilesInfo{AllDataFieldLabels}} = ();
  @{$SDFilesInfo{CommonDataFieldLabels}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;

    $SDFilesInfo{CmpdCount}[$Index] = 0;
    $SDFilesInfo{NewTextFileName}[$Index] = "";
    $SDFilesInfo{NewSDFileName}[$Index] = "";

    @{$SDFilesInfo{AllDataFieldLabels}[$Index]} = ();
    @{$SDFilesInfo{CommonDataFieldLabels}[$Index]} = ();

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Generate appropriate name for the new output file: input file name
    # plus the mode specific suffix, unless a root name is specified for a
    # single input file...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    $NewFileName = $FileName . $OptionsInfo{FileNameMode};
    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $NewFileName = $RootFileName;
      }
      else {
        $NewFileName = $OptionsInfo{OutFileRoot};
      }
    }
    $NewSDFileName = $NewFileName . ".$OptionsInfo{SDFileExt}";
    $NewTextFileName = $NewFileName . ".$OptionsInfo{TextFileExt}";

    if ($OptionsInfo{OutputSDFile}) {
      if (lc($NewSDFileName) eq lc($SDFile)) {
        warn "Warning: Ignoring input file $SDFile: Same output, $NewSDFileName, and input file names.\n";
        print "Specify a different name using \"-r --root\" option or use default name.\n";
        next FILELIST;
      }
    }

    if (!$OptionsInfo{Overwrite}) {
      if ($OptionsInfo{OutputSDFile}) {
        if (-e $NewSDFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewSDFileName, already exists\n";
          next FILELIST;
        }
      }
      if ($OptionsInfo{OutputTextFile}) {
        if (-e $NewTextFileName) {
          warn "Warning: Ignoring file $SDFile: New file, $NewTextFileName, already exists\n";
          next FILELIST;
        }
      }
    }

    # Three argument open with a lexical file handle: characters in the
    # file name are never interpreted as part of the open mode...
    my($SDFileRef);
    if (!open $SDFileRef, '<', $SDFile) {
      warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    my($CountCmpds, $CollectDataFields);
    my($CmpdString, @CmpdLines, @DataFieldLabels, %DataFieldLabelsMap, @CommonDataFieldLabels);

    $CountCmpds = ($OptionsInfo{Mode} =~ /^randomcmpds$/i) ? 1 : 0;

    # Data field labels are needed as text file column labels by all the
    # modes which use the labels collected for the file, including the 2D and
    # 3D compound records modes...
    $CollectDataFields = ($OptionsInfo{OutputTextFile} && $OptionsInfo{Mode} =~ /^(alldatafields|commondatafields|randomcmpds|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) ? 1 : 0;

    $CmpdCount = 0;
    @DataFieldLabels = ();
    @CommonDataFieldLabels = ();
    %DataFieldLabelsMap = ();

    if ($CountCmpds || $CollectDataFields) {
      CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileRef)) {
        $CmpdCount++;

        # For a specific record, only its own labels are used...
        if ($OptionsInfo{Mode} =~ /^recordnum$/i) {
          if ($CmpdCount == $OptionsInfo{RecordNum}) {
            @CmpdLines = split "\n", $CmpdString;
            @DataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);
            last CMPDSTRING;
          }
        }
        if (!$CollectDataFields) {
          next CMPDSTRING;
        }

        # Process compound data header labels and figure out which ones are present for
        # all the compounds...
        my($Label, @CmpdDataFieldLabels, %CmpdDataFieldLabelsMap);

        @CmpdLines = split "\n", $CmpdString;
        @CmpdDataFieldLabels = GetCmpdDataHeaderLabels(\@CmpdLines);

        if ($CmpdCount == 1) {
          # The first compound supplies the initial label set. Checking the
          # compound number, instead of whether any labels have been
          # collected so far, makes sure labels first seen in a later
          # compound are never marked as present in all the compounds...
          @DataFieldLabels = @CmpdDataFieldLabels;
          for $Label (@DataFieldLabels) {
            $DataFieldLabelsMap{$Label} = "PresentInAll";
          }
          next CMPDSTRING;
        }

        # Setup a map for the current labels...
        %CmpdDataFieldLabelsMap = ();
        for $Label (@CmpdDataFieldLabels) {
          $CmpdDataFieldLabelsMap{$Label} = "PresentInSome";
        }
        # Old labels missing from this compound are no longer common...
        for $Label (@DataFieldLabels) {
          if (!$CmpdDataFieldLabelsMap{$Label}) {
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }
        # Labels of this compound missing from the old labels are new...
        for $Label (@CmpdDataFieldLabels) {
          if (!$DataFieldLabelsMap{$Label}) {
            push @DataFieldLabels, $Label;
            $DataFieldLabelsMap{$Label} = "PresentInSome";
          }
        }
      }

      # Identify the common data field labels once, after all the compounds
      # have been processed...
      if ($CollectDataFields && $OptionsInfo{Mode} =~ /^commondatafields$/i) {
        @CommonDataFieldLabels = grep { $DataFieldLabelsMap{$_} eq "PresentInAll" } @DataFieldLabels;
      }
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;

    $SDFilesInfo{NewTextFileName}[$Index] = $NewTextFileName;
    $SDFilesInfo{NewSDFileName}[$Index] = $NewSDFileName;

    $SDFilesInfo{CmpdCount}[$Index] = $CmpdCount;

    push @{$SDFilesInfo{AllDataFieldLabels}[$Index]}, @DataFieldLabels;
    push @{$SDFilesInfo{CommonDataFieldLabels}[$Index]}, @CommonDataFieldLabels;

    close $SDFileRef;
  }
}
891 | |
# Process command line option values...
#
# Populates the file-level %OptionsInfo hash from the values available in
# %Options. Takes no arguments. Dies with an error message whenever a mode
# specific option value is missing or invalid.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  # Delimiter used to parse "-d --datafields" and "--datafieldsfile" values...
  $OptionsInfo{InDelim} = "\,";
  if ($Options{indelim} =~ /^semicolon$/i) {
    $OptionsInfo{InDelim} = "\;";
  }
  elsif ($Options{indelim} =~ /^tab$/i) {
    $OptionsInfo{InDelim} = "\t";
  }

  # Delimiter for output text file(s)...
  $OptionsInfo{OutDelim} = "\,";
  if ($Options{outdelim} =~ /^semicolon$/i) {
    $OptionsInfo{OutDelim} = "\;";
  }
  elsif ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{OutDelim} = "\t";
  }

  $OptionsInfo{OutQuote} = ($Options{quote} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{RegexIgnoreCase} = ($Options{regexignorecase} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : undef;
  $OptionsInfo{Overwrite} = $Options{overwrite} ? $Options{overwrite} : undef;

  $OptionsInfo{NumOfCmpds} = $Options{numofcmpds};

  $OptionsInfo{ValueComparisonMode} = $Options{valuecomparisonmode};
  $OptionsInfo{NumericalComparison} = ($Options{valuecomparisonmode} =~ /^Numeric$/i) ? 1 : 0;

  $OptionsInfo{Violations} = $Options{violations};
  $OptionsInfo{Seed} = $Options{seed};

  # Data field based modes need exactly one of "-d --datafields" or "--datafieldsfile"...
  if ($Options{mode} =~ /^(datafields|datafieldsbyregex|datafieldsbyvalue|datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    if ($Options{datafields} || $Options{datafieldsfile}) {
      if ($Options{datafields} && $Options{datafieldsfile}) {
        die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify only one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
      }
    }
    else {
      die "Error: For \"-m --mode\" option values of datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, or datafieldnotbylist specify one of the \"-d --datafields\" or \"--datafieldsfile\" option.\n";
    }
  }
  $OptionsInfo{DataFields} = $Options{datafields} ? $Options{datafields} : undef;
  $OptionsInfo{DataFieldsFile} = $Options{datafieldsfile} ? $Options{datafieldsfile} : undef;

  $OptionsInfo{RecordNum} = 0; $OptionsInfo{StartRecordNum} = 0; $OptionsInfo{EndRecordNum} = 0;

  %{$OptionsInfo{RecordNums}} = ();
  $OptionsInfo{RecordNumsMin} = 0; $OptionsInfo{RecordNumsMax} = 0; $OptionsInfo{RecordNumsCount} = 0;

  $OptionsInfo{Record} = $Options{record} ? $Options{record} : undef;

  if ($Options{mode} =~ /^(recordnum|recordnums|recordrange)$/i) {
    if ($Options{record}) {
      my($Record, @RecordSplit);

      # Spaces are not significant in the comma delimited record value(s)...
      $Record = $Options{record};
      $Record =~ s/ //g;

      @RecordSplit = split ",", $Record;

      if ($Options{mode} =~ /^recordnum$/i ) {
        if (@RecordSplit == 1) {
          $OptionsInfo{RecordNum} = $RecordSplit[0];
          if ($OptionsInfo{RecordNum} <= 0) {
            die "Error: The value specified, $OptionsInfo{RecordNum}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 1 value is allowed.\n";
        }
      }
      elsif ($Options{mode} =~ /^recordnums$/i ) {
        my($RecordNum, $RecordCount, @SortedRecordSplit);

        # Validate before sorting: an empty list would leave min/max record
        # numbers undefined and non-numeric values can't be sorted numerically...
        if (!@RecordSplit) {
          die "Error: Invalid number of values, 0, specified using \"--record\" option: at least 1 value is required.\n";
        }
        for $RecordNum (@RecordSplit) {
          if ($RecordNum !~ /^[0-9]+$/ || $RecordNum <= 0) {
            die "Error: The value specified, $RecordNum, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }

        @SortedRecordSplit = sort { $a <=> $b } @RecordSplit;

        # Track unique record numbers only...
        $RecordCount = 0;
        RECORDNUM: for $RecordNum (@SortedRecordSplit) {
          if (exists $OptionsInfo{RecordNums}{$RecordNum}) {
            next RECORDNUM;
          }
          $RecordCount++;
          $OptionsInfo{RecordNums}{$RecordNum} = $RecordNum;
        }
        $OptionsInfo{RecordNumsCount} = $RecordCount;
        $OptionsInfo{RecordNumsMin} = $SortedRecordSplit[0];
        $OptionsInfo{RecordNumsMax} = $SortedRecordSplit[$#SortedRecordSplit];
      }
      else {
        # Record range...
        if (@RecordSplit == 2) {
          $OptionsInfo{StartRecordNum} = $RecordSplit[0];
          $OptionsInfo{EndRecordNum} = $RecordSplit[1];
          if ($OptionsInfo{StartRecordNum} <= 0 || $OptionsInfo{EndRecordNum} <= 0) {
            die "Error: The value pair specified, $Options{record}, for option \"--record\" is not valid. Allowed values: > 0 \n";
          }
        }
        else {
          die "Error: Invalid number of values, ", scalar(@RecordSplit), ", specified using \"--record\" option: only 2 values is allowed.\n";
        }
        if ($OptionsInfo{StartRecordNum} > $OptionsInfo{EndRecordNum}) {
          die "Error: Start record number, $OptionsInfo{StartRecordNum}, must be smaller than end record number, $OptionsInfo{EndRecordNum}.\nSpecify different values using \"--record\" option.\n";
        }
      }
    }
    else {
      die "Error: For \"-m --mode\" option values recordnum, recordnums or recordrange, specify \"--record\" option value.\n";
    }
  }

  @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();

  my($Value);
  if ($Options{mode} =~ /^datafields$/i) {
    # A plain list of data field labels...
    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    if ($Options{datafields}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @{$OptionsInfo{SpecifiedDataFieldLabels}} = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyvalue$/i) {
    # Label, value and criterion triplets...
    my(@DataFieldsByValueTriplets);
    @DataFieldsByValueTriplets = ();
    if ($Options{datafields}) {
      @DataFieldsByValueTriplets = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByValueTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByValueTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplets not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplets not found in values specified by \"--datafieldsfile\" option\n";
      }
    }
    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldValuesMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByValueTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByValueTriplets[$Index];
      $Value = $DataFieldsByValueTriplets[$Index + 1];
      $Criterion = $DataFieldsByValueTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|le|ge)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldValuesMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^datafieldsbyregex$/i) {
    # Label, regex and criterion triplets...
    my(@DataFieldsByRegexTriplets);

    @DataFieldsByRegexTriplets = ();
    if ($Options{datafields}) {
      # Quoted words are supported here to allow delimiters inside a regex...
      @DataFieldsByRegexTriplets = quotewords($OptionsInfo{InDelim}, 0, $Options{datafields});
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldsByRegexTriplets = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if ((@DataFieldsByRegexTriplets % 3)) {
      if ($Options{datafields}) {
        die "Error: Triplet not found in values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Triplet not found in values specified by \"--datafieldsfile\" option\n";
      }
    }

    my($Index, $Label, $Value, $Criterion);

    @{$OptionsInfo{SpecifiedDataFieldLabels}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexMap}} = ();
    %{$OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}} = ();

    for ($Index = 0; $Index < @DataFieldsByRegexTriplets; $Index = $Index + 3) {
      $Label = $DataFieldsByRegexTriplets[$Index];
      $Value = $DataFieldsByRegexTriplets[$Index + 1];
      $Criterion = $DataFieldsByRegexTriplets[$Index + 2];

      if ($Criterion =~ /^(eq|ne)$/i) {
        push @{$OptionsInfo{SpecifiedDataFieldLabels}}, $Label;
        $OptionsInfo{SpecifiedDataFieldRegexMap}{$Label} = $Value;
        $OptionsInfo{SpecifiedDataFieldRegexCriteriaMap}{$Label} = $Criterion;
      }
      else {
        warn "Warning: Ignoring triplet value, $Label $Value $Criterion , specified using \"-d --datafields\" or \"--datafieldsfile\" option: Invalid criterion value: $Criterion; Supported values: eq or ne\n";
      }
    }
  }
  elsif ($Options{mode} =~ /^(datafieldbylist|datafielduniquebylist|datafieldnotbylist)$/i) {
    # A data field label followed by one or more values...
    my($Index, @DataFieldAndValuesList);
    if ($Options{datafields}) {
      @DataFieldAndValuesList = split $OptionsInfo{InDelim}, $Options{datafields};
    }
    elsif ($Options{datafieldsfile}) {
      @DataFieldAndValuesList = _ReadDataFieldsFileWords($Options{datafieldsfile}, $OptionsInfo{InDelim});
    }
    if (@DataFieldAndValuesList < 2) {
      if ($Options{datafields}) {
        die "Error: Invalid number of values specified by \"-d --datafields\" option\n";
      }
      elsif ($Options{datafieldsfile}) {
        die "Error: Invalid number values specified by \"--datafieldsfile\" option\n";
      }
    }

    $OptionsInfo{SpecifiedDataFieldLabel} = $DataFieldAndValuesList[0];
    $OptionsInfo{SpecifiedDataFieldValuesCount} = @DataFieldAndValuesList - 1;
    %{$OptionsInfo{SpecifiedDataFieldValues}} = ();

    # Every specified value starts out as not found...
    for ($Index = 1; $Index < @DataFieldAndValuesList; $Index++) {
      $Value = $DataFieldAndValuesList[$Index];
      $OptionsInfo{SpecifiedDataFieldValues}{$Value} = "NotFound";
    }
  }

  $OptionsInfo{SDFileExt} = "sdf";
  $OptionsInfo{TextFileExt} = "csv";

  if ($Options{outdelim} =~ /^tab$/i) {
    $OptionsInfo{TextFileExt} = "tsv";
  }

  # alldatafields and molnames modes always generate only a text file; for
  # all other modes "--output" option value is used...
  if ($Options{mode} =~ /^(alldatafields|molnames)$/i) {
    $OptionsInfo{OutputSDFile} = 0;
    $OptionsInfo{OutputTextFile} = 1;
  }
  else {
    $OptionsInfo{OutputSDFile} = ($Options{output} =~ /^(SD|both)$/i) ? 1 : 0;
    $OptionsInfo{OutputTextFile} = ($Options{output} =~ /^(text|both)$/i) ? 1 : 0;
  }

  $OptionsInfo{StrDataString} = $Options{strdatastring};
  $OptionsInfo{OutputStrDataString} = ($Options{strdatastring} =~ /^Yes$/i) ? 1 : 0;

  $OptionsInfo{StrDataStringDelimiter} = $Options{strdatastringdelimiter};

  if (IsEmpty($Options{strdatastringdelimiter})) {
    die "Error: No value specified for \"--StrDataStringDelimiter\" option.\n";
  }
  $OptionsInfo{StrDataStringMode} = $Options{strdatastringmode};
  $OptionsInfo{StrDataStringWithFields} = $Options{strdatastringmode} =~ /^StrAndDataFields$/i ? 1 : 0;

  # Set up mode specific name...
  #
  # NOTE(review): FileNameMode presumably ends up in default output file names;
  # the "DataDields" spelling is kept as is to avoid renaming any generated
  # files - confirm before correcting it.
  #
  MODE: {
    if ($Options{mode} =~ /^alldatafields$/i) { $OptionsInfo{FileNameMode} = "AllDataDields"; last MODE; }
    if ($Options{mode} =~ /^commondatafields$/i) { $OptionsInfo{FileNameMode} = "CommonDataDields"; last MODE; }
    if ($Options{mode} =~ /^datafields$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFields"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyvalue$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByValue"; last MODE; }
    if ($Options{mode} =~ /^datafieldsbyregex$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldsByRegex"; last MODE; }
    if ($Options{mode} =~ /^datafieldbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataField"; last MODE; }
    if ($Options{mode} =~ /^datafielduniquebylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedUniqueDataField"; last MODE; }
    if ($Options{mode} =~ /^datafieldnotbylist$/i) { $OptionsInfo{FileNameMode} = "SpecifiedDataFieldNotByList"; last MODE; }
    if ($Options{mode} =~ /^molnames$/i) { $OptionsInfo{FileNameMode} = "MolName"; last MODE; }
    if ($Options{mode} =~ /^randomcmpds$/i) { $OptionsInfo{FileNameMode} = "RandomCmpds"; last MODE; }
    if ($Options{mode} =~ /^recordnum$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{RecordNum}"; last MODE; }
    if ($Options{mode} =~ /^recordnums$/i) { $OptionsInfo{FileNameMode} = "RecordNums"; last MODE; }
    if ($Options{mode} =~ /^recordrange$/i) { $OptionsInfo{FileNameMode} = "RecordNum$OptionsInfo{StartRecordNum}" . "To" . "$OptionsInfo{EndRecordNum}"; last MODE; }
    if ($Options{mode} =~ /^2dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "2DCmpdRecords"; last MODE; }
    if ($Options{mode} =~ /^3dcmpdrecords$/i) { $OptionsInfo{FileNameMode} = "3DCmpdRecords"; last MODE; }
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }

}

# Read the file specified using "--datafieldsfile" option and return a flat
# list of all the words found on its lines. Lines are split into words by
# quotewords using the specified delimiter; quotes around words are dropped.
#
# Dies when the file can't be opened.
#
sub _ReadDataFieldsFileWords {
  my($FileName, $Delim) = @_;
  my($Line, @Words, @FileWords);

  @FileWords = ();

  # Three argument open makes sure a user supplied file name is always treated
  # as a plain file to read and never as an open mode or a pipe...
  open DATAFIELDSFILE, "<", $FileName or die "Error: Couldn't open $FileName: $! \n";
  while ($Line = GetTextLine(\*DATAFIELDSFILE)) {
    @Words = quotewords($Delim, 0, $Line);
    if (@Words) {
      push @FileWords, @Words;
    }
  }
  close DATAFIELDSFILE;

  return @FileWords;
}
1200 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes the file-level %Options hash with default values, lets GetOptions
# override them from the command line, changes to the working directory when
# "-w --workingdir" is specified, and validates the enumerated and numerical
# option values. Takes no arguments; dies with an error message on the first
# invalid value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{numofcmpds} = 1;
  $Options{mode} = "alldatafields";
  $Options{indelim} = "comma";
  $Options{outdelim} = "comma";
  $Options{output} = "SD";
  $Options{quote} = "yes";
  $Options{regexignorecase} = "yes";
  $Options{valuecomparisonmode} = "numeric";
  $Options{violations} = 0;
  $Options{seed} = 123456789;

  $Options{strdatastring} = "no";
  $Options{strdatastringdelimiter} = "|";
  $Options{strdatastringmode} = "StrOnly";

  if (!GetOptions(\%Options, "help|h", "datafields|d=s", "datafieldsfile=s", "indelim=s", "mode|m=s", "numofcmpds|n=i", "outdelim=s", "output=s", "overwrite|o", "quote|q=s", "regexignorecase=s", "record=s", "root|r=s", "seed|s=i", "strdatastring=s", "strdatastringdelimiter=s", "strdatastringmode=s", "valuecomparisonmode=s", "violations|v=i", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any other processing...
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate numerical and enumerated option values...
  if ($Options{numofcmpds} < 1) {
    die "Error: The value specified, $Options{numofcmpds}, for option \"-n --numofcmpds\" is not valid. Allowed values: >= 1 \n";
  }
  if ($Options{valuecomparisonmode} !~ /^(Numeric|Alphanumeric)$/i) {
    die "Error: The value specified, $Options{valuecomparisonmode}, for option \"--ValueComparisonMode\" is not valid. Allowed values: Numeric or Alphanumeric\n";
  }
  if ($Options{violations} < 0) {
    die "Error: The value specified, $Options{violations}, for option \"-v --violations\" is not valid. Allowed values: >= 0 \n";
  }
  if ($Options{mode} !~ /^(alldatafields|commondatafields|datafields|datafieldsbyvalue|datafieldsbyregex|datafieldbylist|datafielduniquebylist|datafieldnotbylist|molnames|randomcmpds|recordnum|recordnums|recordrange|2dcmpdrecords|3dcmpdrecords)$/i) {
    # The list of allowed values mirrors the values accepted by the check above...
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: alldatafields, commondatafields, datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords\n";
  }
  if ($Options{output} !~ /^(SD|text|both)$/i) {
    die "Error: The value specified, $Options{output}, for option \"--output\" is not valid. Allowed values: SD, text, or both\n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{outdelim} !~ /^(comma|semicolon|tab)$/i) {
    die "Error: The value specified, $Options{outdelim}, for option \"--outdelim\" is not valid. Allowed values: comma, tab, or semicolon\n";
  }
  if ($Options{quote} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{quote}, for option \"-q --quote\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{regexignorecase} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{regexignorecase}, for option \"--regexignorecase\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastring} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{strdatastring}, for option \"--StrDataString\" is not valid. Allowed values: yes or no\n";
  }
  if ($Options{strdatastringmode} !~ /^(StrOnly|StrAndDataFields)$/i) {
    die "Error: The value specified, $Options{strdatastringmode}, for option \"--StrDataStringMode\" is not valid. Allowed values: StrOnly or StrAndDataFields\n";
  }
}
1264 | |
1265 __END__ | |
1266 | |
1267 =head1 NAME | |
1268 | |
1269 ExtractFromSDFiles.pl - Extract specific data from SDFile(s) | |
1270 | |
1271 =head1 SYNOPSIS | |
1272 | |
1273 ExtractFromSDFiles.pl SDFile(s)... | |
1274 | |
1275 ExtractFromSDFiles.pl [B<-h, --help>] | |
1276 [B<-d, --datafields> "fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value..."] | |
1277 [B<--datafieldsfile> filename] [B<--indelim> comma | tab | semicolon] [B<-m, --mode> alldatafields | | |
1278 commondatafields | datafieldnotbylist | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | | |
1279 datafielduniquebylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | | |
1280 3dcmpdrecords ] [B<-n, --numofcmpds> number] [B<--outdelim> comma | tab | semicolon] | |
1281 [B<--output> SD | text | both] [B<-o, --overwrite>] [B<-q, --quote> yes | no] | |
1282 [B<--record> recnum | recnums | startrecnum,endrecnum] [B<--RegexIgnoreCase> yes | no] | |
1283 [B<-r, --root> rootname] [B<-s, --seed> number] [B<--StrDataString> yes | no] | |
1284 [B<--StrDataStringDelimiter> text] [B<--StrDataStringMode> StrOnly | StrAndDataFields] | |
1285 [B<--ValueComparisonMode> I<Numeric | Alphanumeric>] | |
1286 [B<-v, --violations> number] [B<-w, --workingdir> dirname] SDFile(s)... | |
1287 | |
1288 =head1 DESCRIPTION | |
1289 | |
1290 Extract specific data from I<SDFile(s)> and generate appropriate SD or CSV/TSV text | |
1291 file(s). The structure data from SDFile(s) is not transferred to CSV/TSV text file(s). | |
1292 Multiple SDFile names are separated by spaces. The valid file extensions are I<.sdf> | |
1293 and I<.sd>. All other file names are ignored. All the SD files in a current directory | |
1294 can be specified either by I<*.sdf> or the current directory name. | |
1295 | |
1296 =head1 OPTIONS | |
1297 | |
1298 =over 4 | |
1299 | |
1300 =item B<-h, --help> | |
1301 | |
1302 Print this help message. | |
1303 | |
1304 =item B<-d, --datafields> I<"fieldlabel,..." | "fieldlabel,value,criteria..." | "fieldlabel,value,value,..."> | |
1305 | |
1306 This value is mode specific. In general, it's a list of comma separated data field labels | |
1307 and associated mode specific values. | |
1308 | |
1309 For I<datafields> mode, input value format is: I<fieldlabel,...>. Examples: | |
1310 | |
1311 Extreg | |
1312 Extreg,CompoundName,ID | |
1313 | |
1314 For I<datafieldsbyvalue> mode, input value format contains these triplets: | |
1315 I<fieldlabel,value, criteria...>. Possible values for criteria: I<le, ge or eq>. | |
1316 The value of B<--ValueComparisonMode> indicates whether values are | |
1317 compared using numerical or string comparison operators. Default is to consider | |
1318 data field values as numerical values and use numerical comparison operators. | |
1319 Examples: | |
1320 | |
1321 MolWt,450,le | |
1322 MolWt,450,le,LogP,5,le,SumNumNO,10,le,SumNHOH,5,le | |
1323 | |
1324 For I<datafieldsbyregex> mode, input value format contains these triplets: | |
1325 I<fieldlabel,regex, criteria...>. I<regex> corresponds to any valid regular expression | |
1326 and is used to match the values for specified I<fieldlabel>. Possible values for criteria: | |
1327 I<eq or ne>. For I<eq> and I<ne> criteria, the data field value is matched against the | |
1328 regular expression using =~ and !~ respectively. B<--RegexIgnoreCase> option | |
1329 value is used to determine whether to ignore letter upper/lower case during | |
1330 regular expression match. Examples: | |
1331 | |
1332 Name,ol,eq | |
1333 Name,'^pat',ne | |
1334 | |
1335 For I<datafieldbylist> and I<datafielduniquebylist> mode, input value format is: | |
1336 I<fieldlabel,value1,value2...>. This is equivalent to I<datafieldsbyvalue> mode with | |
1337 this input value format: I<fieldlabel,value1,eq,fieldlabel,value2,eq,...>. For | |
1338 I<datafielduniquebylist> mode, only unique compounds identified by first occurrence | |
1339 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds | |
1340 are simply ignored. | |
1341 | |
1342 For I<datafieldnotbylist> mode, input value format is: I<fieldlabel,value1,value2...>. In this | |
1343 mode, the script behaves exactly opposite of I<datafieldbylist> mode, and only those compounds | |
1344 are extracted whose data field values don't match any specified data field value. | |
1345 | |
1346 =item B<--datafieldsfile> I<filename> | |
1347 | |
1348 Filename which contains various mode specific values. This option provides a way | |
1349 to specify mode specific values in a file instead of entering them on the command | |
1350 line using B<-d --datafields>. | |
1351 | |
1352 For I<datafields> mode, input file lines contain comma delimited field labels: | |
1353 I<fieldlabel,...>. Example: | |
1354 | |
1355 Line 1:MolId | |
1356 Line 2:"Extreg",CompoundName,ID | |
1357 | |
1358 For I<datafieldsbyvalue> mode, input file lines contain these comma separated triplets: | |
1359 I<fieldlabel,value, criteria>. Possible values for criteria: I<le, ge or eq>. Examples: | |
1360 | |
1361 Line 1:MolWt,450,le | |
1362 | |
1363 Line 1:"MolWt",450,le,"LogP",5,le,"SumNumNO",10,le,"SumNHOH",5,le | |
1364 | |
1365 Line 1:MolWt,450,le | |
1366 Line 2:"LogP",5,le | |
1367 Line 3:"SumNumNO",10,le | |
1368 Line 4: SumNHOH,5,le | |
1369 | |
1370 For I<datafieldbylist> and I<datafielduniquebylist> mode, input file line format is: | |
1371 | |
1372 Line 1:fieldlabel; | |
1373 Subsequent lines:value1,value2... | |
1374 | |
1375 For I<datafieldbylist>, I<datafielduniquebylist>, and I<datafieldnotbylist> mode, input file | |
1376 line format is: | |
1377 | |
1378 Line 1:fieldlabel; | |
1379 Subsequent lines:value1,value2... | |
1380 | |
1381 For I<datafielduniquebylist> mode, only unique compounds identified by first occurrence | |
1382 of I<value> associated with I<fieldlabel> in I<SDFile(s)> are kept; any subsequent compounds | |
1383 are simply ignored. Example: | |
1384 | |
1385 Line 1: MolID | |
1386 Subsequent Lines: | |
1387 907508 | |
1388 832291,4642 | |
1389 "1254","907303" | |
1390 | |
1391 =item B<--indelim> I<comma | tab | semicolon> | |
1392 | |
1393 Delimiter used to specify text values for B<-d --datafields> and B<--datafieldsfile> options. | |
1394 Possible values: I<comma, tab, or semicolon>. Default value: I<comma>. | |
1395 | |
1396 =item B<-m, --mode> I<alldatafields | commondatafields | datafields | datafieldsbyvalue | datafieldsbyregex | datafieldbylist | datafielduniquebylist | datafieldnotbylist | molnames | randomcmpds | recordnum | recordnums | recordrange | 2dcmpdrecords | 3dcmpdrecords> | |
1397 | |
1398 Specify what to extract from I<SDFile(s)>. Possible values: I<alldatafields, commondatafields, | |
1399 datafields, datafieldsbyvalue, datafieldsbyregex, datafieldbylist, datafielduniquebylist, datafieldnotbylist, | |
1400 molnames, randomcmpds, recordnum, recordnums, recordrange, 2dcmpdrecords, 3dcmpdrecords>. | |
1401 Default value: I<alldatafields>. | |
1402 | |
1403 For I<alldatafields> and I<molnames> mode, only a CSV/TSV text file is generated; for all | |
1404 other modes, however, a SD file is generated by default - you can change the behavior to generate | |
1405 text file using I<--output> option. | |
1406 | |
1407 For I<3DCmpdRecords> mode, only those compounds with at least one non-zero value for Z atomic coordinates | |
1408 are retrieved; however, during retrieval of compounds in I<2DCmpdRecords> mode, all Z atomic coordinates must | |
1409 be zero. | |
1410 | |
1411 =item B<-n, --numofcmpds> I<number> | |
1412 | |
1413 Number of compounds to extract during I<randomcmpds> mode. | |
1414 | |
1415 =item B<--outdelim> I<comma | tab | semicolon> | |
1416 | |
1417 Delimiter for output CSV/TSV text file(s). Possible values: I<comma, tab, or semicolon> | |
1418 Default value: I<comma> | |
1419 | |
1420 =item B<--output> I<SD | text | both> | |
1421 | |
1422 Type of output files to generate. Possible values: I<SD, text, or both>. Default value: I<SD>. For | |
1423 I<alldatafields> and I<molnames> mode, this option is ignored and only a CSV/TSV text file is generated. | |
1424 | |
1425 =item B<-o, --overwrite> | |
1426 | |
1427 Overwrite existing files. | |
1428 | |
1429 =item B<-q, --quote> I<yes | no> | |
1430 | |
1431 Put quote around column values in output CSV/TSV text file(s). Possible values: | |
1432 I<yes or no>. Default value: I<yes>. | |
1433 | |
1434 =item B<--record> I<recnum | recnums | startrecnum,endrecnum> | |
1435 | |
1436 Record number, record numbers or range of records to extract during I<recordnum>, I<recordnums> | |
1437 and I<recordrange> mode. Input value format is: <num>, <num1,num2,...> and <startnum, endnum> | |
1438 for I<recordnum>, I<recordnums> and I<recordrange> modes respectively. Default value: none. | |
1439 | |
1440 =item B<--RegexIgnoreCase> I<yes or no> | |
1441 | |
1442 Specify whether to ignore case during I<datafieldsbyregex> value of B<-m, --mode> option. | |
1443 Possible values: I<yes or no>. Default value: I<yes>. | |
1444 | |
1445 =item B<-r, --root> I<rootname> | |
1446 | |
1447 New file name is generated using the root: <Root>.<Ext>. Default for new file | |
1448 names: <SDFileName><mode>.<Ext>. The file type determines <Ext> value. | |
1449 The sdf, csv, and tsv <Ext> values are used for SD, comma/semicolon, and tab | |
1450 delimited text files respectively. This option is ignored for multiple input files. | |
1451 | |
1452 =item B<-s, --seed> I<number> | |
1453 | |
1454 Random number seed used for I<randomcmpds> mode. Default:123456789. | |
1455 | |
1456 =item B<--StrDataString> I<yes | no> | |
1457 | |
1458 Specify whether to write out structure data string to CSV/TSV text file(s). Possible values: | |
1459 I<yes or no>. Default value: I<no>. | |
1460 | |
1461 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure | |
1462 data lines into a structure data string. | |
1463 | |
1464 This option is ignored during generation of SD file(s). | |
1465 | |
1466 =item B<--StrDataStringDelimiter> I<text> | |
1467 | |
1468 Delimiter for joining multiple structure data lines into a string before writing to CSV/TSV text | |
1469 file(s). Possible values: I<any alphanumeric text>. Default value: I<|>. | |
1470 | |
1471 This option is ignored during generation of SD file(s). | |
1472 | |
1473 =item B<--StrDataStringMode> I<StrOnly | StrAndDataFields> | |
1474 | |
1475 Specify whether to include SD data fields and values along with the structure data into structure | |
1476 data string before writing it out to CSV/TSV text file(s). Possible values: I<StrOnly or StrAndDataFields>. | |
1477 Default value: I<StrOnly>. | |
1478 | |
1479 The value of B<StrDataStringDelimiter> option is used as a delimiter to join structure | |
1480 data lines into a structure data string. | |
1481 | |
1482 This option is ignored during generation of SD file(s). | |
1483 | |
1484 =item B<--ValueComparisonMode> I<Numeric | Alphanumeric> | |
1485 | |
1486 Specify how to compare data field values during I<datafieldsbyvalue> mode: Compare | |
1487 values using either numeric or string (eq, le, ge) comparison operators. Possible values: | |
1488 I<Numeric or Alphanumeric>. Default value: I<Numeric>. | |
1489 | |
1490 =item B<-v, --violations> I<number> | |
1491 | |
1492 Number of criterion violations allowed for values specified during I<datafieldsbyvalue> | |
1493 and I<datafieldsbyregex> mode. Default value: I<0>. | |
1494 | |
1495 =item B<-w, --workingdir> I<dirname> | |
1496 | |
1497 Location of working directory. Default: current directory. | |
1498 | |
1499 =back | |
1500 | |
1501 =head1 EXAMPLES | |
1502 | |
1503 To retrieve all data fields from SD files and generate CSV text files, type: | |
1504 | |
1505 % ExtractFromSDFiles.pl -o Sample.sdf | |
1506 % ExtractFromSDFiles.pl -o *.sdf | |
1507 | |
1508 To retrieve all data fields from SD file and generate CSV text files containing | |
1509 a column with structure data as a string with | as line delimiter, type: | |
1510 | |
1511 % ExtractFromSDFiles.pl --StrDataString Yes -o Sample.sdf | |
1512 | |
1513 To retrieve Mol_ID data field from SD file and generate CSV text files containing | |
1514 a column with structure data along with all data fields as a string with | as line | |
1515 delimiter, type: | |
1516 | |
1517 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID" --StrDataString Yes | |
1518 --StrDataStringMode StrAndDataFields --StrDataStringDelimiter "|" | |
1519 --output text -o Sample.sdf | |
1520 | |
1521 To retrieve common data fields which exist for all the compounds in | |
1522 a SD file and generate a TSV text file NewSample.tsv, type: | |
1523 | |
1524 % ExtractFromSDFiles.pl -m commondatafields --outdelim tab -r NewSample | |
1525 --output Text -o Sample.sdf | |
1526 | |
1527 To retrieve Mol_ID, MolWeight, and CompoundName data fields from a SD file and generate a | |
1528 CSV text file NewSample.csv, type: | |
1529 | |
1530 % ExtractFromSDFiles.pl -m datafields -d "Mol_ID,MolWeight, | |
1531 CompoundName" -r NewSample --output Text -o Sample.sdf | |
1532 | |
1533 To retrieve compounds which meet a specific set of criteria - MolWt <= 450, | |
1534 LogP <= 5 and SumNO <= 10 - from a SD file and generate a new SD file NewSample.sdf, | |
1535 type: | |
1536 | |
1537 % ExtractFromSDFiles.pl -m datafieldsbyvalue -d "MolWt,450,le,LogP | |
1538 ,5,le,SumNO,10,le" -r NewSample -o Sample.sdf | |
1539 | |
1540 To retrieve compounds from a SD file with a specific set of values for Mol_ID and | |
1541 generate a new SD file NewSample.sdf, type: | |
1542 | |
1543 % ExtractFromSDFiles.pl -m datafieldbylist -d "Mol_ID,159,4509,4619" | |
1544 -r NewSample -o Sample.sdf | |
1545 | |
1546 To retrieve compounds from a SD file with values for Mol_ID not on a list of specified | |
1547 values and generate a new SD file NewSample.sdf, type: | |
1548 | |
1549 % ExtractFromSDFiles.pl -m datafieldnotbylist -d "Mol_ID,159,4509,4619" | |
1550 -r NewSample -o Sample.sdf | |
1551 | |
1552 To retrieve 10 random compounds from a SD file and generate a new SD file RandomSample.sdf, type: | |
1553 | |
1554 % ExtractFromSDFiles.pl -m randomcmpds -n 10 -r RandomSample | |
1555 -o Sample.sdf | |
1556 | |
1557 To retrieve compound record number 10 from a SD file and generate a new SD file NewSample.sdf, type: | |
1558 | |
1559 % ExtractFromSDFiles.pl -m recordnum --record 10 -r NewSample | |
1560 -o Sample.sdf | |
1561 | |
1562 To retrieve compound record numbers 10, 20 and 30 from a SD file and generate a new SD file | |
1563 NewSample.sdf, type: | |
1564 | |
1565 % ExtractFromSDFiles.pl -m recordnums --record 10,20,30 -r NewSample | |
1566 -o Sample.sdf | |
1567 | |
1568 To retrieve compound records between 10 and 20 from a SD file and generate a new SD | |
1569 file NewSample.sdf, type: | |
1570 | |
1571 % ExtractFromSDFiles.pl -m recordrange --record 10,20 -r NewSample | |
1572 -o Sample.sdf | |
1573 | |
1574 =head1 AUTHOR | |
1575 | |
1576 Manish Sud <msud@san.rr.com> | |
1577 | |
1578 =head1 SEE ALSO | |
1579 | |
1580 FilterSDFiles.pl, InfoSDFiles.pl, SplitSDFiles.pl, MergeTextFilesWithSD.pl | |
1581 | |
1582 =head1 COPYRIGHT | |
1583 | |
1584 Copyright (C) 2015 Manish Sud. All rights reserved. | |
1585 | |
1586 This file is part of MayaChemTools. | |
1587 | |
1588 MayaChemTools is free software; you can redistribute it and/or modify it under | |
1589 the terms of the GNU Lesser General Public License as published by the Free | |
1590 Software Foundation; either version 3 of the License, or (at your option) | |
1591 any later version. | |
1592 | |
1593 =cut |