comparison bin/SortSDFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: SortSDFiles.pl,v $
4 # $Date: 2015/02/28 20:46:21 $
5 # $Revision: 1.26 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use FileUtil;
36 use SDFileUtil;
37 use TextUtil;
38
# File-level state shared with the subroutines defined below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark")...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Names of SD files to process; indexed in parallel with the arrays in %SDFilesInfo...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files; files which failed the checks above are skipped...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  SortSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
82
83 ###############################################################################
84
# Sort compound records of one SD file by key data field value and write them
# to a new SD file...
#
# Parameters:
#   $Index - Position of the input file in @SDFilesList and in the arrays
#            stored in %SDFilesInfo.
#
# The input file is read one compound record at a time using ReadCmpdString.
# Records whose key data field value is empty, contains a newline, or - for
# numeric key data - is not a float are not sorted; they are written after the
# sorted records in the order they were read. Records whose key values compare
# equal are written in the order they were read.
#
# Dies when the input or output file can't be opened or the output file can't
# be closed.
#
sub SortSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $KeyDataFieldName, $SDFileHandle, $NewSDFileHandle);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $KeyDataFieldName = $SDFilesInfo{KeyDataFieldName}[$Index];
  if (!defined $KeyDataFieldName) {
    # No key data field name is available; use an empty name to avoid
    # uninitialized value warnings during hash lookups and messages...
    $KeyDataFieldName = "";
  }

  print "Generating new SD file $NewSDFile...\n";

  # Open input before output so that an unreadable input file doesn't leave a
  # truncated output file behind. Three-argument open keeps any special
  # characters in file names from being interpreted as open modes...
  open($SDFileHandle, "<", $SDFile) or die "Error: Can't open $SDFile: $! \n";
  open($NewSDFileHandle, ">", $NewSDFile) or die "Error: Couldn't open $NewSDFile: $! \n";

  my($ValidateNumericKeys, $AlphanumericSort, $AscendingSort);
  $ValidateNumericKeys = ($OptionsInfo{KeyData} =~ /^numeric$/i) ? 1 : 0;
  $AlphanumericSort = ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) ? 1 : 0;
  $AscendingSort = ($OptionsInfo{Sort} =~ /^ascending$/i) ? 1 : 0;

  # Go over all compound records and collect the ones with usable key values.
  # Each entry in @SortableRecords is [KeyValue, InputPosition, CmpdString]...
  my(@SortableRecords, @InvalidCompoundRecords, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues, $KeyDataFieldValue, $InvalidReason);
  @SortableRecords = ();
  @InvalidCompoundRecords = ();
  $CmpdCount = 0;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    $KeyDataFieldValue = $DataFieldValues{$KeyDataFieldName};

    # Make sure data field value is okay...
    $InvalidReason = "";
    if (!IsNotEmpty($KeyDataFieldValue) || $KeyDataFieldValue =~ /\n/) {
      $InvalidReason = "empty";
    }
    elsif ($ValidateNumericKeys && !IsFloat($KeyDataFieldValue)) {
      $InvalidReason = "non-numerical";
    }

    if ($InvalidReason) {
      push @InvalidCompoundRecords, $CmpdString;
      if ($OptionsInfo{DetailLevel} >= 3 ) {
        print "Ignoring compound record $CmpdCount: Contains $InvalidReason value for key data field $KeyDataFieldName :\n $CmpdString\n\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Ignoring compound record $CmpdCount: Contains $InvalidReason value for key data field $KeyDataFieldName...\n";
      }
      next COMPOUND;
    }
    push @SortableRecords, [$KeyDataFieldValue, $CmpdCount, $CmpdString];
  }
  close $SDFileHandle;

  # Sort by key value in the requested direction. Keys which compare equal -
  # including ones which differ only as strings, such as 1 and 1.0 or abc and
  # ABC - fall back to input position, for both sorting directions...
  my(@SortedRecords, $Record);
  @SortedRecords = sort {
    my($First, $Second) = $AscendingSort ? ($a, $b) : ($b, $a);
    my($KeyOrder) = $AlphanumericSort
      ? (lc($First->[0]) cmp lc($Second->[0]))
      : ($First->[0] <=> $Second->[0]);
    $KeyOrder || ($a->[1] <=> $b->[1]);
  } @SortableRecords;

  for $Record (@SortedRecords) {
    print {$NewSDFileHandle} "$Record->[2]\n";
  }

  # Append the records containing data not appropriate for sorting...
  if (@InvalidCompoundRecords) {
    print "Placing ", scalar(@InvalidCompoundRecords)," compound record(s) with invalid data field key data the end...\n";
    for $CmpdString (@InvalidCompoundRecords) {
      print {$NewSDFileHandle} "$CmpdString\n";
    }
  }

  # Buffered write errors show up at close time...
  close($NewSDFileHandle) or die "Error: Couldn't close $NewSDFile: $! \n";
}
175
# Retrieve information about input SD files...
#
# Fills the file-level hash %SDFilesInfo with three arrays indexed in parallel
# with @SDFilesList:
#
#   FileOkay         - 1 when the file passed all the checks; otherwise, 0.
#   OutFile          - Name of the SD file to generate.
#   KeyDataFieldName - Data field name used as key for sorting: value of the
#                      key option when specified; otherwise, first data field
#                      label in the first compound record, or an empty string
#                      when no label is found.
#
# A file which fails a check is reported using a warning and stays marked as
# not okay.
#
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $DataFieldName);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFile}} = ();
  @{$SDFilesInfo{KeyDataFieldName}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = "";
    $SDFilesInfo{KeyDataFieldName}[$Index] = "";

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file name; root option is used only for a single input file...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    if ($Options{root} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName . "SortedByDataField";
    }

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    # Setup data field name...
    if ($OptionsInfo{SpecifiedDataFieldName}) {
      $DataFieldName = $OptionsInfo{SpecifiedDataFieldName};
    }
    else {
      # Use first data field label from the first compound record...
      my($SDFileHandle, $CmpdString, @CmpdLines, @DataFieldNames);

      # Three-argument open with a lexical handle: file name is never
      # interpreted as an open mode and the handle can't clash with others...
      if (!open($SDFileHandle, "<", $SDFile)) {
        warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
        next FILELIST;
      }
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      # Nothing to parse when no compound record could be read...
      @DataFieldNames = ();
      if (defined($CmpdString) && length($CmpdString)) {
        @CmpdLines = split "\n", $CmpdString;
        @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines);
      }

      # Fall back to an empty name instead of an undefined value when the
      # first compound record has no data field labels...
      $DataFieldName = defined($DataFieldNames[0]) ? $DataFieldNames[0] : "";
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = "$OutFile";
    $SDFilesInfo{KeyDataFieldName}[$Index] = $DataFieldName;
  }
}
251
# Copy command line option values from %Options into %OptionsInfo...
#
# Option values are copied as is under the %OptionsInfo key names listed
# below; options which are not set stay undefined. SpecifiedDataFieldName holds
# the value of the key option, or an empty string when no key was specified.
#
sub ProcessOptions {
  my(%OptionNameToInfoKey, $OptionName);

  %OptionNameToInfoKey = (
    detail    => 'DetailLevel',
    key       => 'Key',
    keydata   => 'KeyData',
    sort      => 'Sort',
    overwrite => 'Overwrite',
    root      => 'Root',
  );

  for $OptionName (keys %OptionNameToInfoKey) {
    $OptionsInfo{$OptionNameToInfoKey{$OptionName}} = $Options{$OptionName};
  }

  $OptionsInfo{SpecifiedDataFieldName} = defined($Options{key}) ? $Options{key} : "";
}
268
# Set default option values, retrieve command line options into %Options and
# validate the specified values...
#
# Defaults: detail is 1, sort is ascending and keydata is numeric. When a
# working directory is specified, the script changes into it. Dies with an
# error message for unknown options, a working directory which isn't a
# directory or can't be entered, and invalid keydata, sort or detail values.
#
sub SetupScriptUsage {
  my(@OptionSpecifications);

  # Start with default values; GetOptions overrides the ones specified...
  %Options = (detail => 1, sort => "ascending", keydata => "numeric");

  @OptionSpecifications = ("detail|d=i", "help|h", "key|k=s", "keydata=s", "overwrite|o", "root|r=s", "sort|s=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecifications)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to working directory before any file names are looked at...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Validate option values...
  die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n"
    unless $Options{keydata} =~ /^(numeric|alphanumeric)$/i;

  die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n"
    unless $Options{sort} =~ /^(ascending|descending)$/i;

  die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n"
    unless IsPositiveInteger($Options{detail});
}
296
297 __END__
298
299 =head1 NAME
300
301 SortSDFiles.pl - Sort SDFile(s) using values for a data field
302
303 =head1 SYNOPSIS
304
305 SortSDFiles.pl SDFile(s)...
306
307 SortSDFiles.pl [B<-d, --detail> infolevel] [B<-h, --help>] [B<-k, --key> I<SD data field name>]
308 [B<--keydata> numeric | alphanumeric] [B<-o, --overwrite>] [B<-r, --root> rootname]
309 [B<-s, --sort> ascending | descending] [B<-w, --workingdir> dirname] SDFile(s)...
310
311 =head1 DESCRIPTION
312
313 Sort I<SDFile(s)> using values for a specified data field name key. Only one SD
314 data field name key can be specified for sorting. In the event of a conflict during the sorting
315 process, two similar values for a SD data field name key are simply transferred to
316 output files in order of their presence in input files. Additionally, compound records
317 with no data field name, empty field values, or field values containing multiple lines
318 are simply placed at the end. The file names are separated by spaces. The valid file
319 extensions are I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the
320 current directory can be specified either by I<*.sdf> or the current directory name.
321
322 =head1 OPTIONS
323
324 =over 4
325
326 =item B<-d, --detail> I<infolevel>
327
328 Level of information to print about compound records being ignored. Default: I<1>. Possible
329 values: I<1, 2 or 3>.
330
331 =item B<-h, --help>
332
333 Print this help message.
334
335 =item B<-k, --key> I<SD data field name>
336
337 I<SDFile(s)> data field name used for sorting compound records. Default value: I<first
338 data field name>. Compound records with no I<sdfieldname>, empty field values, field
339 values containing multiple lines, or field values inappropriate for sorting are simply placed
340 at the end.
341
342 =item B<--keydata> I<numeric | alphanumeric>
343
344 Data type for I<sdfieldname> values. Possible values: I<numeric or alphanumeric>. Default
345 value: I<numeric>. For I<alphanumeric> data values, comparison is case insensitive.
346
347 =item B<-o, --overwrite>
348
349 Overwrite existing files.
350
351 =item B<-r, --root> I<rootname>
352
353 New SD file name is generated using the root: <Root>.<Ext>. Default new file
354 name: <InitialSDFileName>SortedByDataField.<Ext>. This option is ignored for multiple
355 input files.
356
357 =item B<-s, --sort> I<ascending | descending>
358
359 Sorting order for SD data field values. Possible values: I<ascending or descending>.
360 Default value: I<ascending>.
361
362 =item B<-w, --workingdir> I<dirname>
363
364 Location of working directory. Default: current directory.
365
366 =back
367
368 =head1 EXAMPLES
369
370 To perform numerical sort in ascending order using first data field values and
371 generate a new SD file NewSample1.sdf, type:
372
373 % SortSDFiles.pl -o -r NewSample1 Sample1.sdf
374
375 To perform numerical sort in descending order using MolWeight data field and
376 generate a new SD text file NewSample1.sdf, type:
377
378 % SortSDFiles.pl -k MolWeight --keydata numeric -s descending
379 -r NewSample1 -o Sample1.sdf
380
381 =head1 AUTHOR
382
383 Manish Sud <msud@san.rr.com>
384
385 =head1 SEE ALSO
386
387 JoinSDFiles.pl, MergeTextFilesWithSD.pl, SplitSDFiles.pl, SDFilesToHTML.pl
388
389 =head1 COPYRIGHT
390
391 Copyright (C) 2015 Manish Sud. All rights reserved.
392
393 This file is part of MayaChemTools.
394
395 MayaChemTools is free software; you can redistribute it and/or modify it under
396 the terms of the GNU Lesser General Public License as published by the Free
397 Software Foundation; either version 3 of the License, or (at your option)
398 any later version.
399
400 =cut