#!/usr/bin/perl -w
#
# $RCSfile: SDToMolFiles.pl,v $
# $Date: 2015/02/28 20:46:20 $
# $Revision: 1.35 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;

# File-scoped state shared by the subroutines below:
#   $ScriptName  - base name of this script, used in messages
#   %Options     - raw command line options as filled in by GetOptions
#   %OptionsInfo - processed option values (set up by ProcessOptions)
#   @SDFilesList - input SD file names after expansion
#   %SDFilesInfo - per-file FileOkay / OutFileRoot arrays (set up by RetrieveSDFilesInfo)
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files..
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    GenerateMolFiles($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate MOL files for a SD file...
#
# Takes the index of an input file in @SDFilesList, reads it one compound at a
# time using ReadCmpdString and writes the part of each compound up to the
# first MOL end delimiter into its own ".mol" file in the current directory.
#
# The MOL file name root is chosen according to the mode option:
#   DataField  - value of the data field named by the datafield option
#   MolName    - first line of the compound string
#   otherwise  - "<OutFileRoot>Cmpd<CompoundNumber>"
# Any character outside [a-zA-Z0-9_] is removed from DataField/MolName values;
# an empty result falls back to the "<OutFileRoot>Cmpd<CompoundNumber>" form.
#
# Compounds are skipped with a warning when the target file already exists and
# overwriting has not been requested, or when the compound string contains no
# MOL end delimiter. An unreadable SD file is skipped with a warning; failure
# to create or close a MOL file is fatal.
#
sub GenerateMolFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $MOLFile, $MOLFileRoot, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $MolEndDelimiter, $MolEndPos, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open: the file name is never interpreted as a mode or pipe...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $CmpdCount = 0;

  # MOL block terminator as defined by the CTfile format: "M", two spaces, "END"...
  $MolEndDelimiter = "M  END";

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{Mode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{Mode} =~ /^MolName$/i) ? 1 : 0;

  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup MOL file name...
    $MOLFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $MOLFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $MOLFileRoot = $CmpdLines[0];
    }

    # A missing molname line or data field value is treated as an empty name...
    if (!defined $MOLFileRoot) {
      $MOLFileRoot = '';
    }

    # Remove any invalid file name characters in data field or molname values...
    $MOLFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for MOL file name. Test the length rather than the truth
    # value so that a name such as "0" is still used...
    if (!length $MOLFileRoot) {
      $MOLFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $MOLFile = "${MOLFileRoot}.mol";

    if (!$OverwriteFiles) {
      if (-e $MOLFile) {
        warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New MOL file, $MOLFile, already exists\n";
        next CMPDSTRING;
      }
    }

    # Locate the delimiter as a literal string instead of interpolating it into a regex...
    $MolEndPos = index($CmpdString, $MolEndDelimiter);
    if ($MolEndPos < 0) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: Invalid compound data\n";
      next CMPDSTRING;
    }

    # Write out MOL file: everything before the first delimiter followed by
    # the delimiter line itself...

    print "Generating $MOLFile file...\n";
    open my $MOLFileHandle, '>', $MOLFile or die "Error: Can't open $MOLFile: $! \n";
    print {$MOLFileHandle} substr($CmpdString, 0, $MolEndPos);
    print {$MOLFileHandle} "$MolEndDelimiter\n";

    # Buffered write errors only surface at close time...
    close $MOLFileHandle or die "Error: Couldn't close $MOLFile: $! \n";
  }

  close $SDFileHandle;
}

# Retrieve information about SD files...
#
# Fills %SDFilesInfo with two arrays indexed like @SDFilesList:
#   FileOkay    - 1 when the file exists and passes CheckFileType for "sd sdf", otherwise 0
#   OutFileRoot - root used for default MOL file names, '' for files which are not okay
#
# The value of the root option is used as the output root only when exactly
# one input file has been specified; otherwise the name part returned by
# ParseFileName for the SD file is used.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $Index, $FileName, $OutFileRoot);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();

  FILELIST: for $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    if (!(-e $SDFile)) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    if (!CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    $FileName = "";
    (undef, $FileName, undef) = ParseFileName($SDFile);

    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      # Use the specified root without its extension when it parses into both
      # a name and an extension; otherwise use it as specified...
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      if ($RootFileName && $RootFileExt) {
        $FileName = $RootFileName;
      }
      else {
        $FileName = $OptionsInfo{OutFileRoot};
      }
      $OutFileRoot = $FileName;
    }
    else {
      $OutFileRoot = "$FileName";
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}

# Process option values...
#
# Copies the command line values from %Options into %OptionsInfo as Mode,
# DataField, OverwriteFiles (1 or 0) and OutFileRoot (0 when not specified).
# Dies when the mode is DataField and no datafield value has been specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{DataField} = '';
  if ($Options{mode} =~ /^DataField$/i) {
    if (!$Options{datafield}) {
      die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" \"-m, --mode\". \n";
    }
    $OptionsInfo{DataField} = $Options{datafield};
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options using GetOptions with a default mode of RootPrefix, changes
# into the working directory when one has been specified and validates the
# mode value. Dies on unknown options, on a working directory which is not a
# directory or can't be entered, and on an invalid mode.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{mode} = 'RootPrefix';

  if (!GetOptions(\%Options, "datafield|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  if ($Options{mode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
}
