MayaChemTools

   1 #!/usr/bin/perl -w
   2 #
   3 # $RCSfile: TextFilesToSDFiles.pl,v $
   4 # $Date: 2015/02/28 20:46:21 $
   5 # $Revision: 1.25 $
   6 #
   7 # Author: Manish Sud <msud@san.rr.com>
   8 #
   9 # Copyright (C) 2015 Manish Sud. All rights reserved.
  10 #
  11 # This file is part of MayaChemTools.
  12 #
  13 # MayaChemTools is free software; you can redistribute it and/or modify it under
  14 # the terms of the GNU Lesser General Public License as published by the Free
  15 # Software Foundation; either version 3 of the License, or (at your option) any
  16 # later version.
  17 #
  18 # MayaChemTools is distributed in the hope that it will be useful, but without
  19 # any warranty; without even the implied warranty of merchantability or fitness
  20 # for a particular purpose.  See the GNU Lesser General Public License for more
  21 # details.
  22 #
  23 # You should have received a copy of the GNU Lesser General Public License
  24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
  25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
  26 # Boston, MA, 02111-1307, USA.
  27 #
  28 
  29 use strict;
  30 use FindBin; use lib "$FindBin::Bin/../lib";
  31 use Getopt::Long;
  32 use File::Basename;
  33 use Text::ParseWords;
  34 use Benchmark;
  35 use FileUtil;
  36 use TextUtil;
  37 use SDFileUtil;
  38 
  # File-scoped state; the subroutines defined later in this file read and
  # write %Options and $ScriptName directly...
  my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

  # Autoflush STDOUT
  $| = 1;

  # Announce startup and note the starting time...
  $ScriptName = basename($0);
  print "\n$ScriptName:Starting...\n\n";
  $StartTime = Benchmark->new;

  # Parse the command line; print usage and quit when help is requested or
  # no input files are named...
  SetupScriptUsage();
  die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

  # Input files named on the command line, expanded for csv and tsv types...
  my(@TextFilesList) = ExpandFileNames(\@ARGV, "csv tsv");

  print "Processing options...\n";
  my(%OptionsInfo);
  ProcessOptions();

  print "Checking input text file(s)...\n";
  my(%TextFilesInfo);
  RetrieveTextFilesInfo();

  # Generate one SD file for each input file that passed the checks...
  print "\nProcessing text files...\n" if (@TextFilesList > 1);
  for my $FileIndex (0 .. $#TextFilesList) {
    next unless $TextFilesInfo{FileOkay}[$FileIndex];

    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ConvertTextFile($FileIndex);
  }
  print "\n$ScriptName:Done...\n\n";

  # Report the elapsed time...
  $EndTime = Benchmark->new;
  $TotalTime = timediff($EndTime, $StartTime);
  print "Total time: ", timestr($TotalTime), "\n";
  82 
  83 ###############################################################################
  84 
  # Convert text file to SD file...
  #
  # Parameters:
  #   $Index - position of the text file in @TextFilesList; the same index
  #            selects its delimiter, output file name and column labels in
  #            %TextFilesInfo.
  #
  # For each data line of the text file, one SD record is written: an empty
  # CTAB block, one "> <label>" data field per column, and the "$$$$" record
  # terminator. Columns beyond the number of known column labels are dropped.
  #
  # Dies when the input file can't be opened, or when the output file can't be
  # opened or closed.
  #
  sub ConvertTextFile {
    my($Index) = @_;
    my($TextFile, $SDFile, $Line, $InDelim, $Label, $Value, $ColIndex, $ColCount, @ColLabels, @LineWords);

    $TextFile = $TextFilesList[$Index];
    $InDelim = $TextFilesInfo{InDelim}[$Index];
    $SDFile = $TextFilesInfo{OutSDFile}[$Index];
    @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
    $ColCount = @ColLabels;

    print "Generating SD file $SDFile...\n";

    # Open the input first, so an unreadable input never creates or truncates
    # the output file. Three-argument open keeps characters in the file names
    # from being interpreted as an open mode.
    open my $TextFH, "<", $TextFile or die "Error: Can't open $TextFile: $! \n";
    open my $SDFH, ">", $SDFile or die "Error: Couldn't open $SDFile: $! \n";

    if ($OptionsInfo{ColLabelsPresent}) {
      # Skip over column labels from old file...
      $Line = GetTextLine($TextFH);
    }
    my($Date) = GenerateMiscLineDateStamp();

    # Test for a defined, non-empty line rather than a true one: a data line
    # containing just "0" is valid input and must not end the conversion.
    # NOTE(review): assumes GetTextLine signals end of file with undef or an
    # empty string, and accepts a lexical file handle - confirm in TextUtil.
    while (defined($Line = GetTextLine($TextFH)) && length($Line)) {
      @LineWords = quotewords($InDelim, 0, $Line);

      # Write out empty CTAB block...
      print {$SDFH} GenerateEmptyCtabBlockLines($Date), "\n";

      # Write out data fields and values...
      for $ColIndex (0 .. $#LineWords) {
        if ($ColIndex < $ColCount) {
          $Label = $ColLabels[$ColIndex];
          # An undefined field is written as an empty value...
          $Value = defined($LineWords[$ColIndex]) ? $LineWords[$ColIndex] : "";
          print {$SDFH} "> <$Label>\n$Value\n\n";
        }
      }
      print {$SDFH} "\$\$\$\$\n";
    }
    close $TextFH;

    # Buffered write errors only surface at close time, so check it...
    close $SDFH or die "Error: Couldn't close $SDFile: $! \n";
  }
 123 
 # Retrieve information about input text files...
 #
 # Rebuilds %TextFilesInfo from scratch. It holds parallel arrays indexed like
 # @TextFilesList:
 #
 #   FileOkay  - 1 when the file passed every check below, 0 otherwise
 #   ColCount  - number of column labels
 #   ColLabels - list of column labels
 #   InDelim   - delimiter used to split the lines of the file
 #   OutSDFile - name of the SD file to generate
 #
 # A file that fails a check is reported with a warning and stays marked as
 # not okay; the remaining files are still examined.
 #
 sub RetrieveTextFilesInfo {
   my($Index, $TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @LineWords, @ColLabels, $OutFileRoot, $OutFile);

   %TextFilesInfo = ();

   @{$TextFilesInfo{FileOkay}} = ();
   @{$TextFilesInfo{ColCount}} = ();
   @{$TextFilesInfo{ColLabels}} = ();
   @{$TextFilesInfo{InDelim}} = ();
   @{$TextFilesInfo{OutSDFile}} = ();

   FILELIST: for $Index (0 .. $#TextFilesList) {
     $TextFile = $TextFilesList[$Index];

     # Start out pessimistic; the entry is completed only after all checks pass...
     $TextFilesInfo{FileOkay}[$Index] = 0;
     $TextFilesInfo{ColCount}[$Index] = 0;
     $TextFilesInfo{InDelim}[$Index] = "";
     $TextFilesInfo{OutSDFile}[$Index] = "";

     @{$TextFilesInfo{ColLabels}[$Index]} = ();

     if (!(-e $TextFile)) {
       warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
       next FILELIST;
     }
     if (!CheckFileType($TextFile, "csv tsv")) {
       warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
       next FILELIST;
     }

     # Tab for tsv files; comma or semicolon, as set by --indelim, for the rest...
     ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
     if ($FileExt =~ /^tsv$/i) {
       $InDelim = "\t";
     }
     else {
       if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
         warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
         next FILELIST;
       }
       $InDelim = ($Options{indelim} =~ /^semicolon$/i) ? ";" : ",";
     }

     # Read the first line to determine the columns. Three-argument open keeps
     # characters in the file name from being interpreted as an open mode.
     my($TextFH);
     if (!open $TextFH, "<", $TextFile) {
       warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
       next FILELIST;
     }
     # NOTE(review): assumes GetTextLine accepts a lexical file handle the same
     # way it accepts a glob reference - confirm in TextUtil.
     $Line = GetTextLine($TextFH);
     close $TextFH;

     # An empty file has no first line; treat it as having no columns...
     @LineWords = (defined($Line) && length($Line)) ? quotewords($InDelim, 0, $Line) : ();

     if ($OptionsInfo{ColLabelsPresent}) {
       @ColLabels = @LineWords;
     }
     else {
       # No label line: make up Column<N>Data labels, one per column...
       @ColLabels = map { "Column${_}Data" } (1 .. @LineWords);
     }

     # Output file root: the --root value (minus any extension) when there is
     # only one input file; otherwise derived from the input file name...
     if ($Options{root} && (@TextFilesList == 1)) {
       my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
       $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
     }
     else {
       $OutFileRoot = "${FileName}WithNoStrData";
     }

     $OutFile = "${OutFileRoot}.sdf";
     if (!$Options{overwrite} && (-e $OutFile)) {
       warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
       next FILELIST;
     }

     $TextFilesInfo{FileOkay}[$Index] = 1;
     $TextFilesInfo{InDelim}[$Index] = $InDelim;
     $TextFilesInfo{OutSDFile}[$Index] = "$OutFile";

     $TextFilesInfo{ColCount}[$Index] = @ColLabels;
     push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
   }
 }
 218 
 # Process option values...
 #
 # Rebuilds %OptionsInfo from the command line values held in %Options:
 #
 #   Label            - value of the label option
 #   ColLabelsPresent - 1 when the label option is "yes" (any case), else 0
 #   InDelim          - value of the indelim option
 #   Overwrite        - value of the overwrite option; undef when not given
 #   OutFileRoot      - value of the root option; undef when not given
 #
 sub ProcessOptions {
   my($LabelMode) = $Options{label};

   %OptionsInfo = (
     Label            => $LabelMode,
     ColLabelsPresent => (($LabelMode =~ /^yes$/i) ? 1 : 0),
     InDelim          => $Options{indelim},
     Overwrite        => $Options{overwrite},
     OutFileRoot      => $Options{root},
   );
 }
 232 
 # Setup script usage and retrieve command line arguments specified using various options...
 #
 # Fills %Options with the defaults (label "yes", indelim "comma") overlaid by
 # the command line, changes into the working directory when one is given, and
 # dies with an error message on an unknown option or an invalid value.
 #
 sub SetupScriptUsage {
   my(@OptionSpecs) = ("help|h", "indelim=s", "label|l=s", "overwrite|o", "root|r=s", "workingdir|w=s");

   # Defaults first; GetOptions overrides them with whatever was specified...
   %Options = (label => "yes", indelim => "comma");

   GetOptions(\%Options, @OptionSpecs)
     or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

   # Switch to the working directory, when one was specified...
   if ($Options{workingdir}) {
     (-d $Options{workingdir})
       or die "Error: The value specified, $Options{workingdir},  for option \"-w --workingdir\" is not a directory name.\n";
     chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
   }

   # Validate option values...
   unless ($Options{indelim} =~ /^(comma|semicolon)$/i) {
     die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
   }
   unless ($Options{label} =~ /^(yes|no)$/i) {
     die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
   }
 }
 256