MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: SplitSDFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:21 $
   5 # $Revision: 1.36 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability of fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Benchmark;
  34 use SDFileUtil;
  35 use FileUtil;
  36 
  37 my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);
  38 
  39 # Autoflush STDOUT
  40 $| = 1;
  41 
  42 # Starting message...
  43 $ScriptName = basename $0;
  44 print "\n$ScriptName:Starting...\n\n";
  45 $StartTime = new Benchmark;
  46 
  47 # Get the options and setup script...
  48 SetupScriptUsage();
  49 if ($Options{help} || @ARGV < 1) {
  50   die GetUsageFromPod("$FindBin::Bin/$ScriptName");
  51 }
  52 
  53 my(@SDFilesList);
  54 @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");
  55 
  56 # Process options...
  57 print "Processing options...\n";
  58 my(%OptionsInfo);
  59 ProcessOptions();
  60 
  61 # Setup information about input files...
  62 my(%SDFilesInfo);
  63 print "Checking input SD file(s)...\n";
  64 RetrieveSDFilesInfo();
  65 
  66 # Process input files..
  67 my($FileIndex);
  68 if (@SDFilesList > 1) {
  69   print "\nProcessing SD files...\n";
  70 }
  71 for $FileIndex (0 .. $#SDFilesList) {
  72   if ($SDFilesInfo{FileOkay}[$FileIndex]) {
  73     print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  74     SplitSDFile($FileIndex);
  75   }
  76 }
  77 print "\n$ScriptName:Done...\n\n";
  78 
  79 $EndTime = new Benchmark;
  80 $TotalTime = timediff ($EndTime, $StartTime);
  81 print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  85 # Split a SD file...
  86 #
  87 sub SplitSDFile {
  88   my($FileIndex) = @_;
  89 
  90   if ($OptionsInfo{Mode} =~ /^Files$/i) {
  91     SplitSDFileByNumOfFiles($FileIndex);
  92   }
  93   elsif ($OptionsInfo{Mode} =~ /^Cmpds$/i) {
  94     SplitSDFileByNumOfCmpds($FileIndex);
  95   }
  96 }
  97 
  98 # Split SD into specified number of files...
  99 #
 100 sub SplitSDFileByNumOfFiles {
 101   my($FileIndex) = @_;
 102   my($SDFile, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);
 103 
 104   $SDFile = $SDFilesList[$FileIndex];
 105 
 106   if (!open SDFILE, "$SDFile") {
 107     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 108     return;
 109   }
 110 
 111   $MaxNumOfFiles = $OptionsInfo{NumOfFiles};
 112 
 113   # Count number of compounds to figure out maximum number of compound per file...
 114   $CmpdCount = 0;
 115   while (<SDFILE>) {
 116     if (/^\$\$\$\$/) {
 117       $CmpdCount++;
 118     }
 119   }
 120   close SDFILE;
 121 
 122   if ($CmpdCount < $MaxNumOfFiles) {
 123     warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
 124     return;
 125   }
 126 
 127   $MaxCmpdsPerFile = int $CmpdCount / $MaxNumOfFiles;
 128 
 129   SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
 130 }
 131 
 132 # Split SD into files containing specified number of compounds...
 133 #
 134 sub SplitSDFileByNumOfCmpds {
 135   my($FileIndex) = @_;
 136 
 137   if ($OptionsInfo{NumOfCmpds} == 1) {
 138     SplitSDFileByOneCmpdPerFile($FileIndex);
 139   }
 140   else {
 141     SplitSDFileByNumOfCmpdsPerFile($FileIndex);
 142   }
 143 }
 144 
 145 # Split SD into files containing one compound per file...
 146 #
 147 sub SplitSDFileByOneCmpdPerFile {
 148   my($FileIndex) = @_;
 149   my($SDFile, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);
 150 
 151   $SDFile = $SDFilesList[$FileIndex];
 152 
 153   if (!open SDFILE, "$SDFile") {
 154     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 155     return;
 156   }
 157 
 158   print "\n";
 159 
 160   $CmpdCount = 0;
 161 
 162   $FileExt = $SDFilesInfo{FileExt}[$FileIndex];
 163 
 164   $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
 165   $OverwriteFiles = $OptionsInfo{OverwriteFiles};
 166 
 167   $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
 168   $DataFieldName = $OptionsInfo{DataField};
 169 
 170   $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;
 171 
 172   CMPDSTRING: while ($CmpdString = ReadCmpdString(\*SDFILE)) {
 173     $CmpdCount++;
 174 
 175     # Setup SD file name...
 176     $NewSDFileRoot = '';
 177     if ($UseDataField) {
 178       @CmpdLines = split "\n", $CmpdString;
 179       %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
 180       if (exists $DataFieldValues{$DataFieldName}) {
 181         $NewSDFileRoot = $DataFieldValues{$DataFieldName};
 182       }
 183     }
 184     elsif ($UseMolName) {
 185       @CmpdLines = split "\n", $CmpdString;
 186       $NewSDFileRoot = $CmpdLines[0];
 187     }
 188 
 189     # Check for any invalid file name characters in data field or molname values...
 190     if ($NewSDFileRoot && $NewSDFileRoot =~ /[^a-zA-Z0-9_]/) {
 191       $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;
 192     }
 193 
 194     # Fall back plan for SD file name...
 195     if (!$NewSDFileRoot) {
 196       $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
 197     }
 198 
 199     $NewSDFile = "${NewSDFileRoot}.${FileExt}";
 200 
 201     if (!$OverwriteFiles) {
 202       if (-e $NewSDFile) {
 203         warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
 204         next CMPDSTRING;
 205       }
 206     }
 207 
 208     # Write out new SD file...
 209 
 210     print "Generating $NewSDFile file\n";
 211     open NEWSDFILE, ">$NewSDFile" or die "Error: Can't open $NewSDFile: $! \n";
 212     print NEWSDFILE "$CmpdString\n";
 213     close NEWSDFILE;
 214 
 215   }
 216   close SDFILE;
 217 }
 218 
 219 # Split SD into files containing specified number of compounds per file...
 220 #
 221 sub SplitSDFileByNumOfCmpdsPerFile {
 222   my($FileIndex) = @_;
 223   my($SDFile, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);
 224 
 225   $SDFile = $SDFilesList[$FileIndex];
 226 
 227   if (!open SDFILE, "$SDFile") {
 228     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 229     return;
 230   }
 231 
 232   $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};
 233 
 234   # Count number of compounds to figure out maximum number of files...
 235   $CmpdCount = 0;
 236   while (<SDFILE>) {
 237     if (/^\$\$\$\$/) {
 238       $CmpdCount++;
 239     }
 240   }
 241   close SDFILE;
 242 
 243   $MaxNumOfFiles = int $CmpdCount / $MaxCmpdsPerFile;
 244 
 245   if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
 246     $MaxNumOfFiles++;
 247   }
 248 
 249   if ($CmpdCount <= $MaxCmpdsPerFile) {
 250     warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
 251     return;
 252   }
 253 
 254   SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
 255 }
 256 
 257 # Split SD files into specified number of files with specified number of compounds
 258 # in each file...
 259 #
 260 sub SplitSDFileByNumOfFilesAndCmpds {
 261   my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
 262   my($SDFile, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);
 263 
 264   $SDFile = $SDFilesList[$FileIndex];
 265 
 266   if (!open SDFILE, "$SDFile") {
 267     warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
 268     return;
 269   }
 270 
 271   # Setup new file names list...
 272   @NewSDFilesList = ();
 273   for $NewFileIndex (1 .. $NumOfFiles) {
 274     $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${NewFileIndex}." . $SDFilesInfo{FileExt}[$FileIndex];
 275     if (!$OptionsInfo{OverwriteFiles}) {
 276       if (-e $NewFileName) {
 277         warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
 278         return;
 279       }
 280     }
 281     push @NewSDFilesList, $NewFileName;
 282   }
 283 
 284   $MaxCmpdsCount = $NumOfCmpdsPerFile;
 285 
 286   $CmpdCount = 0;
 287   $NewFileIndex = 1;
 288 
 289   open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex -1]: $! \n";
 290   print "\nGenerating $NewSDFilesList[$NewFileIndex - 1] file\n";
 291 
 292   open SDFILE, "$SDFile" or die "Error: Can't open $SDFile: $! \n";
 293 
 294   while (<SDFILE>) {
 295     s/(\r\n)|(\r)/\n/g;
 296     print NEWSDFILE;
 297 
 298     if ( /^\$\$\$\$/ ) {
 299       $CmpdCount++;
 300       if ($NewFileIndex <= $NumOfFiles) {
 301         if ($CmpdCount >= $MaxCmpdsCount) {
 302           if ($NewFileIndex < $NumOfFiles) {
 303             close NEWSDFILE;
 304           }
 305           $NewFileIndex++;
 306           $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;
 307 
 308           if ($NewFileIndex <= $NumOfFiles) {
 309             open NEWSDFILE, ">$NewSDFilesList[$NewFileIndex - 1]" or die "Error: Can't open $NewSDFilesList[$NewFileIndex - 1]: $! \n";
 310             print "Generating $NewSDFilesList[$NewFileIndex - 1] file...\n";
 311           }
 312         }
 313       }
 314     }
 315   }
 316   close NEWSDFILE;
 317 }
 318 
 319 # Retrieve information about SD files...
 320 #
 321 sub RetrieveSDFilesInfo {
 322   my($SDFile, $Index, $FileDir, $FileName, $FileExt, $OutFileRoot);
 323 
 324   %SDFilesInfo = ();
 325   @{$SDFilesInfo{FileOkay}} = ();
 326   @{$SDFilesInfo{FileExt}} = ();
 327   @{$SDFilesInfo{OutFileRoot}} = ();
 328 
 329   FILELIST: for $Index (0 .. $#SDFilesList) {
 330     $SDFile = $SDFilesList[$Index];
 331 
 332     $SDFilesInfo{FileOkay}[$Index] = 0;
 333     $SDFilesInfo{FileExt}[$Index] = '';
 334     $SDFilesInfo{OutFileRoot}[$Index] = '';
 335 
 336     $SDFile = $SDFilesList[$Index];
 337     if (!(-e $SDFile)) {
 338       warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
 339       next FILELIST;
 340     }
 341     if (!CheckFileType($SDFile, "sd sdf")) {
 342       warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
 343       next FILELIST;
 344     }
 345 
 346     # Setup output file root...
 347     $FileDir = ""; $FileName = ""; $FileExt = "";
 348     ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
 349 
 350     if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
 351       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
 352       if ($RootFileName && $RootFileExt) {
 353         $FileName = $RootFileName;
 354       }
 355       else {
 356         $FileName = $OptionsInfo{OutFileRoot};
 357       }
 358       $OutFileRoot = $FileName;
 359     }
 360     else {
 361       $OutFileRoot = "$FileName";
 362     }
 363 
 364     $SDFilesInfo{FileOkay}[$Index] = 1;
 365     $SDFilesInfo{FileExt}[$Index] = $FileExt;
 366     $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
 367   }
 368 }
 369 
 370 # Process option values...
 371 sub ProcessOptions {
 372   %OptionsInfo = ();
 373 
 374   $OptionsInfo{Mode} = $Options{mode};
 375 
 376   $OptionsInfo{CmpdsMode} = $Options{cmpdsmode};
 377 
 378   $OptionsInfo{NumOfFiles} = $Options{numfiles};
 379   $OptionsInfo{NumOfCmpds} = $Options{numcmpds};
 380 
 381   $OptionsInfo{DataField} = '';
 382   if ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i) {
 383     if (!$Options{datafield}) {
 384       die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
 385     }
 386     $OptionsInfo{DataField} = $Options{datafield};
 387   }
 388 
 389   $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
 390 
 391   $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
 392 }
 393 
 394 
 395 # Setup script usage  and retrieve command line arguments specified using various options...
 396 sub SetupScriptUsage {
 397 
 398   # Retrieve all the options...
 399   %Options = ();
 400 
 401   $Options{cmpdsmode} = 'RootPrefix';
 402   $Options{mode} = 'Files';
 403 
 404   $Options{numfiles} = 2;
 405   $Options{numcmpds} = 1;
 406 
 407 
 408   if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
 409     die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
 410   }
 411   if ($Options{workingdir}) {
 412     if (! -d $Options{workingdir}) {
 413       die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
 414     }
 415     chdir $Options{workingdir} or die "Error: Error: Couldn't chdir $Options{workingdir}: $! \n";
 416   }
 417   if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
 418     die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
 419   }
 420   if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
 421     die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
 422   }
 423   if ($Options{numfiles} < 2) {
 424     die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
 425   }
 426   if ($Options{numcmpds} < 1) {
 427     die "Error: The value specified, $Options{numcmpds}, for option \"-n --numcmpds\" is not valid. Allowed values: >= 1 \n";
 428   }
 429 }
 430