#!/usr/bin/perl -w
#
# $RCSfile: SortSDFiles.pl,v $
# $Date: 2015/02/28 20:46:21 $
# $Revision: 1.26 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose.  See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use SDFileUtil;
use TextUtil;

# File-scoped state shared by the subroutines below: %Options holds the raw
# command line values filled in by SetupScriptUsage().
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName: Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Generate output files...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Only files that passed every check in RetrieveSDFilesInfo() are sorted...
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    SortSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Sort the compound records of one input SD file by the value of its key data
# field and write them to the corresponding output file.
#
# Parameters:
#   $Index - Position of the file in @SDFilesList; the same index selects the
#            OutFile and KeyDataFieldName entries in %SDFilesInfo.
#
# Records sharing the same key value are kept together in input order. Records
# whose key value is empty, contains a new line or, for numeric keys, doesn't
# pass IsFloat() are not sorted: they are appended, in input order, after the
# sorted records.
#
# Dies when the input or output file can't be opened or when the output file
# can't be closed cleanly.
#
sub SortSDFile {
  my($Index) = @_;
  my($SDFile, $NewSDFile, $KeyDataFieldName);

  $SDFile = $SDFilesList[$Index];
  $NewSDFile = $SDFilesInfo{OutFile}[$Index];
  $KeyDataFieldName = $SDFilesInfo{KeyDataFieldName}[$Index];

  print "Generating new SD file $NewSDFile...\n";

  # Open the input file first: failing to read it must not leave behind an
  # already truncated output file. The three argument form of open keeps any
  # special characters in the file names from being treated as an open mode...
  my($SDFileHandle, $NewSDFileHandle);
  open $SDFileHandle, "<", $SDFile or die "Error: Can't open $SDFile: $! \n";
  open $NewSDFileHandle, ">", $NewSDFile or die "Error: Couldn't open $NewSDFile: $! \n";

  # Go over all compound records and store 'em using key value as hash...
  my(%KeyToCompoundRecordsMap, @InvalidCompoundRecords, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues, $KeyDataFieldValue, $IsNumericKeyData, $InvalidValueType);
  %KeyToCompoundRecordsMap = ();
  @InvalidCompoundRecords = ();
  $CmpdCount = 0;
  $IsNumericKeyData = ($OptionsInfo{KeyData} =~ /^numeric$/i) ? 1 : 0;

  COMPOUND: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;
    @CmpdLines = split "\n", $CmpdString;
    %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
    $KeyDataFieldValue = $DataFieldValues{$KeyDataFieldName};

    # Make sure data field value is okay: it must be a non-empty, single line
    # value and, for numeric sorting, a number...
    $InvalidValueType = "";
    if (!(IsNotEmpty($KeyDataFieldValue) && ($KeyDataFieldValue !~ /\n/))) {
      $InvalidValueType = "empty";
    }
    elsif ($IsNumericKeyData && !IsFloat($KeyDataFieldValue)) {
      $InvalidValueType = "non-numerical";
    }

    if ($InvalidValueType) {
      push @InvalidCompoundRecords, $CmpdString;
      if ($OptionsInfo{DetailLevel} >= 3 ) {
        print "Ignoring compound record $CmpdCount: Contains $InvalidValueType value for key data field $KeyDataFieldName :\n $CmpdString\n\n";
      }
      elsif ($OptionsInfo{DetailLevel} >= 2) {
        print "Ignoring compound record $CmpdCount: Contains $InvalidValueType value for key data field $KeyDataFieldName...\n";
      }
      next COMPOUND;
    }

    if (exists($KeyToCompoundRecordsMap{$KeyDataFieldValue})) {
      # Append to existing compound data...
      $KeyToCompoundRecordsMap{$KeyDataFieldValue} .= "\n" . $CmpdString;
    }
    else {
      $KeyToCompoundRecordsMap{$KeyDataFieldValue} = $CmpdString;
    }
  }

  # Sort the key values in ascending order. Distinct keys can compare as equal
  # (e.g. 1 and 1.0, or abc and ABC); falling back to a plain string comparison
  # keeps their relative order the same from run to run instead of leaving it
  # to the iteration order of the hash...
  my(@SortedKeyValues);
  if ($OptionsInfo{KeyData} =~ /^alphanumeric$/i) {
    @SortedKeyValues = sort { lc($a) cmp lc($b) or $a cmp $b } keys %KeyToCompoundRecordsMap;
  }
  else {
    @SortedKeyValues = sort { $a <=> $b or $a cmp $b } keys %KeyToCompoundRecordsMap;
  }
  # Any value other than ascending means descending...
  if ($OptionsInfo{Sort} !~ /^ascending$/i) {
    @SortedKeyValues = reverse @SortedKeyValues;
  }

  for $KeyDataFieldValue (@SortedKeyValues) {
    print $NewSDFileHandle "$KeyToCompoundRecordsMap{$KeyDataFieldValue}\n";
  }

  # Append the records containing data not appropriate for sorting...
  if (@InvalidCompoundRecords) {
    print "Placing ", scalar(@InvalidCompoundRecords)," compound record(s) with invalid data field key data the end...\n";
    for $CmpdString (@InvalidCompoundRecords) {
      print $NewSDFileHandle "$CmpdString\n";
    }
  }

  close $SDFileHandle;
  # Buffered write errors, such as a full disk, only show up at close...
  close $NewSDFileHandle or die "Error: Couldn't close $NewSDFile: $! \n";
}

# Retrieve information about input SD files...
#
# Fills the file-scoped %SDFilesInfo with three arrays indexed like
# @SDFilesList:
#
#   FileOkay         - 1 when the file passed all checks; otherwise 0
#   OutFile          - Name of the sorted output file
#   KeyDataFieldName - Data field to sort on: the value of the key option when
#                      specified; otherwise the first data field label found
#                      in the first compound record
#
# A file is skipped with a warning, leaving FileOkay at 0, when it doesn't
# exist, isn't a SD file, would be overwritten by its own output, has an
# existing output file without the overwrite option, can't be opened, or
# offers no data field to sort on.
#
sub RetrieveSDFilesInfo {
  my($Index, $SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot, $OutFile, $DataFieldName);

  %SDFilesInfo = ();

  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFile}} = ();
  @{$SDFilesInfo{KeyDataFieldName}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Assume the worst until all checks have passed...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFile}[$Index] = "";
    $SDFilesInfo{KeyDataFieldName}[$Index] = "";

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file name. The root option is honored only for a single
    # input file...
    $FileDir = ""; $FileName = ""; $FileExt = "";
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    if ($Options{root} && (@SDFilesList == 1)) {
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $Options{root};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = $FileName . "SortedByDataField";
    }

    $OutFile = $OutFileRoot . ".$FileExt";
    if (lc($OutFile) eq lc($SDFile)) {
      warn "Warning: Ignoring file $SDFile:Output file name, $OutFile, is same as input SD file name, $SDFile\n";
      next FILELIST;
    }
    if (!$Options{overwrite}) {
      if (-e $OutFile) {
        warn "Warning: Ignoring file $SDFile: The file $OutFile already exists\n";
        next FILELIST;
      }
    }

    # Setup data field name...
    if ($OptionsInfo{SpecifiedDataFieldName}) {
      $DataFieldName = $OptionsInfo{SpecifiedDataFieldName};
    }
    else {
      # No key specified: use the first data field of the first compound...
      my($SDFileHandle, $CmpdString, @CmpdLines, @DataFieldNames);
      @DataFieldNames = ();
      if (!open($SDFileHandle, "<", $SDFile)) {
        warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
        next FILELIST;
      }
      $CmpdString = ReadCmpdString($SDFileHandle);
      close $SDFileHandle;

      @CmpdLines = defined($CmpdString) ? split("\n", $CmpdString) : ();
      @DataFieldNames = GetCmpdDataHeaderLabels(\@CmpdLines);
      if (!@DataFieldNames) {
        # Without a data field there is nothing to sort on and an undefined
        # field name would only trigger warnings later on...
        warn "Warning: Ignoring file $SDFile: Its first compound record doesn't contain any data field to use as sort key\n";
        next FILELIST;
      }
      $DataFieldName = $DataFieldNames[0];
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFile}[$Index] = "$OutFile";
    $SDFilesInfo{KeyDataFieldName}[$Index] = $DataFieldName;
  }
}

# Process option values...
#
# Copies the command line values from %Options into the file-scoped
# %OptionsInfo under the names used by the rest of the script. Options which
# were not specified are stored as undef, except SpecifiedDataFieldName which
# defaults to an empty string.
#
sub ProcessOptions {
  $OptionsInfo{DetailLevel} = $Options{detail};

  $OptionsInfo{Key} = defined $Options{key} ? $Options{key} : undef;
  $OptionsInfo{SpecifiedDataFieldName} = "";
  if (defined $Options{key}) {
    $OptionsInfo{SpecifiedDataFieldName} = $Options{key};
  }

  $OptionsInfo{KeyData} = $Options{keydata};
  $OptionsInfo{Sort} = $Options{sort};

  $OptionsInfo{Overwrite} = defined $Options{overwrite} ? $Options{overwrite} : undef;
  $OptionsInfo{Root} = defined $Options{root} ? $Options{root} : undef;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to its defaults (detail: 1; sort: ascending; keydata:
# numeric), lets GetOptions() override them from the command line, changes to
# the working directory when one is specified and validates the option values.
#
# Dies on an unknown option, an invalid working directory or an invalid value
# for the keydata, sort or detail options.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{detail} = 1;
  $Options{sort} = "ascending";
  $Options{keydata} = "numeric";
  if (!GetOptions(\%Options, "detail|d=i", "help|h", "key|k=s", "keydata=s", "overwrite|o", "root|r=s", "sort|s=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{keydata} !~ /^(numeric|alphanumeric)$/i) {
    die "Error: The value specified, $Options{keydata}, for option \"--keydata\" is not valid. Allowed values: numeric or alphanumeric\n";
  }
  if ($Options{sort} !~ /^(ascending|descending)$/i) {
    die "Error: The value specified, $Options{sort}, for option \"-s --sort\" is not valid. Allowed values: ascending or descending\n";
  }
  if (!IsPositiveInteger($Options{detail})) {
    die "Error: The value specified, $Options{detail}, for option \"-d --detail\" is not valid. Allowed values: > 0\n";
  }
}
