#!/usr/bin/perl -w
#
# $RCSfile: TextFilesToSDFiles.pl,v $
# $Date: 2015/02/28 20:46:21 $
# $Revision: 1.25 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use FileUtil;
use TextUtil;
use SDFileUtil;

# File-scoped state; %Options and $ScriptName are read by the subroutines
# defined further down in this file.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input files named on the command line, restricted to csv/tsv...
my(@TextFilesList);
@TextFilesList = ExpandFileNames(\@ARGV, "csv tsv");

print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Checking input text file(s)...\n";
my(%TextFilesInfo);
RetrieveTextFilesInfo();

# Generate output files...
if (@TextFilesList > 1) {
  print "\nProcessing text files...\n";
}
for my $FileIndex (0 .. $#TextFilesList) {
  # Only files that passed the checks in RetrieveTextFilesInfo are converted.
  if ($TextFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $TextFilesList[$FileIndex]...\n";
    ConvertTextFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Convert text file to SD file...
#
# Takes the index of an input file in @TextFilesList and writes the SD file
# recorded for it in %TextFilesInfo. Each data line of the text file becomes
# one SD record: an empty CTAB block, one "> <Label>" data field per column
# and a terminating "$$$$" line. Columns beyond the number of known column
# labels are dropped.
#
# Dies if either file can't be opened or if closing the SD file fails.
#
sub ConvertTextFile {
  my($Index) = @_;
  my($TextFile, $SDFile, $Line, $InDelim, $ColIndex, $ColCount, $LastColIndex, @ColLabels, @LineWords);

  $TextFile = $TextFilesList[$Index];
  $InDelim = $TextFilesInfo{InDelim}[$Index];
  $SDFile = $TextFilesInfo{OutSDFile}[$Index];
  @ColLabels = @{$TextFilesInfo{ColLabels}[$Index]};
  $ColCount = @ColLabels;

  print "Generating SD file $SDFile...\n";

  # Three-argument open with lexical handles: the file name is never parsed
  # for mode characters and the handles don't leak into the global namespace.
  open my $SDFileHandle, '>', $SDFile or die "Error: Couldn't open $SDFile: $! \n";
  open my $TextFileHandle, '<', $TextFile or die "Error: Can't open $TextFile: $! \n";

  if ($OptionsInfo{ColLabelsPresent}) {
    # Skip over column labels from old file...
    $Line = GetTextLine($TextFileHandle);
  }

  my($Date) = GenerateMiscLineDateStamp();

  # NOTE(review): the loop ends on the first false value returned by
  # GetTextLine; this presumably signals end of file - confirm in TextUtil.
  while ($Line = GetTextLine($TextFileHandle)) {
    @LineWords = quotewords($InDelim, 0, $Line);

    # Write out empty CTAB block...
    print {$SDFileHandle} GenerateEmptyCtabBlockLines($Date), "\n";

    # Write out data fields and values; ignore any values for which no
    # column label is available...
    $LastColIndex = ($#LineWords < $ColCount - 1) ? $#LineWords : $ColCount - 1;
    for $ColIndex (0 .. $LastColIndex) {
      print {$SDFileHandle} "> <$ColLabels[$ColIndex]>\n$LineWords[$ColIndex]\n\n";
    }
    print {$SDFileHandle} "\$\$\$\$\n";
  }

  close $TextFileHandle;

  # Buffered write errors (e.g. a full disk) only surface at close time.
  close $SDFileHandle or die "Error: Couldn't close $SDFile: $! \n";
}

# Retrieve information about input text files...
#
# Rebuilds the file-scoped %TextFilesInfo hash with one entry per file in
# @TextFilesList, stored in parallel arrays:
#
#   FileOkay  - 1 when the file passed all checks, 0 otherwise
#   InDelim   - delimiter used to split lines (tab, comma or semicolon)
#   OutSDFile - name of the SD file to generate
#   ColCount  - number of column labels
#   ColLabels - column labels taken from the first line, or generated as
#               Column<N>Data when labels are not present in the file
#
# Files failing a check are reported with a warning and left with FileOkay
# set to 0.
#
sub RetrieveTextFilesInfo {
  my($TextFile, $FileDir, $FileName, $FileExt, $InDelim, $Line, @LineWords, @ColLabels, $OutFileRoot, $OutFile);

  %TextFilesInfo = ();

  @{$TextFilesInfo{FileOkay}} = ();
  @{$TextFilesInfo{ColCount}} = ();
  @{$TextFilesInfo{ColLabels}} = ();
  @{$TextFilesInfo{InDelim}} = ();
  @{$TextFilesInfo{OutSDFile}} = ();

  FILELIST: for my $Index (0 .. $#TextFilesList) {
    $TextFile = $TextFilesList[$Index];

    # Start from "not okay" defaults so every early exit below leaves a
    # complete entry behind.
    $TextFilesInfo{FileOkay}[$Index] = 0;
    $TextFilesInfo{ColCount}[$Index] = 0;
    $TextFilesInfo{InDelim}[$Index] = "";
    $TextFilesInfo{OutSDFile}[$Index] = "";

    @{$TextFilesInfo{ColLabels}[$Index]} = ();

    if (!(-e $TextFile)) {
      warn "Warning: Ignoring file $TextFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($TextFile, "csv tsv")) {
      warn "Warning: Ignoring file $TextFile: It's not a csv or tsv file\n";
      next FILELIST;
    }

    # The delimiter follows the file extension; "--indelim" only applies
    # to files which are not tsv files.
    ($FileDir, $FileName, $FileExt) = ParseFileName($TextFile);
    if ($FileExt =~ /^tsv$/i) {
      $InDelim = "\t";
    }
    else {
      $InDelim = "\,";
      if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
        warn "Warning: Ignoring file $TextFile: The value specified, $Options{indelim}, for option \"--indelim\" is not valid for csv files\n";
        next FILELIST;
      }
      if ($Options{indelim} =~ /^semicolon$/i) {
        $InDelim = "\;";
      }
    }

    # Three-argument open on a lexical handle: the file name is never
    # parsed for mode characters.
    my $TextFileHandle;
    if (!open $TextFileHandle, '<', $TextFile) {
      warn "Warning: Ignoring file $TextFile: Couldn't open it: $! \n";
      next FILELIST;
    }
    $Line = GetTextLine($TextFileHandle);
    close $TextFileHandle;

    # An empty file yields no first line; treat it explicitly as having
    # no columns instead of handing an undefined value to quotewords.
    @LineWords = defined $Line ? quotewords($InDelim, 0, $Line) : ();

    if ($OptionsInfo{ColLabelsPresent}) {
      @ColLabels = @LineWords;
    }
    else {
      @ColLabels = map { "Column${_}Data" } 1 .. scalar(@LineWords);
    }

    # $FileName from the ParseFileName call above is still valid here, so
    # the file name is not parsed a second time.
    if ($Options{root} && (@TextFilesList == 1)) {
      # "--root" is only honored for a single input file. When the value
      # parses into both a name and an extension, the name part is used.
      my ($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($Options{root});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $Options{root};
    }
    else {
      $OutFileRoot = "${FileName}WithNoStrData";
    }

    $OutFile = "${OutFileRoot}.sdf";
    if (!$Options{overwrite} && -e $OutFile) {
      warn "Warning: Ignoring file $TextFile: The file $OutFile already exists\n";
      next FILELIST;
    }

    $TextFilesInfo{FileOkay}[$Index] = 1;
    $TextFilesInfo{InDelim}[$Index] = $InDelim;
    $TextFilesInfo{OutSDFile}[$Index] = "$OutFile";

    $TextFilesInfo{ColCount}[$Index] = @ColLabels;
    push @{$TextFilesInfo{ColLabels}[$Index]}, @ColLabels;
  }
}

# Process option values...
#
# Copies the command line options from %Options into the file-scoped
# %OptionsInfo hash and derives ColLabelsPresent (1 when "--label" is
# "yes", 0 otherwise). Options that were not specified stay undefined.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Label} = $Options{label};
  $OptionsInfo{ColLabelsPresent} = ($Options{label} =~ /^yes$/i) ? 1 : 0;

  $OptionsInfo{InDelim} = $Options{indelim};
  $OptionsInfo{Overwrite} = $Options{overwrite};

  $OptionsInfo{OutFileRoot} = $Options{root};
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills the file-scoped %Options hash with defaults (label "yes", indelim
# "comma") overridden by the command line. Changes into the directory given
# by "-w --workingdir" when specified. Dies on unknown options, on a
# working directory which is not a directory or can't be entered, and on
# invalid "--indelim" or "--label" values.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{label} = "yes";
  $Options{indelim} = "comma";
  if (!GetOptions(\%Options, "help|h", "indelim=s", "label|l=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{indelim} !~ /^(comma|semicolon)$/i) {
    die "Error: The value specified, $Options{indelim}, for option \"--indelim\" is not valid. Allowed values: comma or semicolon\n";
  }
  if ($Options{label} !~ /^(yes|no)$/i) {
    die "Error: The value specified, $Options{label}, for option \"-l --label\" is not valid. Allowed values: yes or no\n";
  }
}