#!/usr/bin/perl -w
#
# $RCSfile: MolFilesToSD.pl,v $
# $Date: 2015/02/28 20:46:20 $
# $Revision: 1.38 $
#
# Author: Manish Sud <msud@san.rr.com>
#
# Copyright (C) 2015 Manish Sud. All rights reserved.
#
# This file is part of MayaChemTools.
#
# MayaChemTools is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# MayaChemTools is distributed in the hope that it will be useful, but without
# any warranty; without even the implied warranty of merchantability of fitness
# for a particular purpose. See the GNU Lesser General Public License for more
# details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
# write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
# Boston, MA, 02111-1307, USA.
#

use strict;
use FindBin; use lib "$FindBin::Bin/../lib";
use Getopt::Long;
use File::Basename;
use Text::ParseWords;
use Benchmark;
use SDFileUtil;
use FileUtil;
use TextUtil;

my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Remaining command line arguments name the MOL files to combine...
my(@MOLFilesList);
@MOLFilesList = ExpandFileNames(\@ARGV, "mol");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";

###############################################################################

# Generate a SD file using all valid MDL MOL files...
#
# Reads the file names in @MOLFilesList and the settings in %OptionsInfo, and
# writes one record per usable MOL file to $OptionsInfo{SDFile}. Each record is
# the MOL file contents with line endings converted to "\n", optionally with
# the first line replaced by a compound ID and/or a compound ID data field
# appended, followed by an empty line and the "$$$$" record delimiter.
#
# Files which don't exist, fail CheckFileType, or can't be opened are skipped
# with a warning. Dies when the SD file can't be opened or closed. A summary of
# the file counts is printed at the end.
#
sub GenerateSDFile {
  my($FileCount, $FileOkayCount, $SDFileHandle);

  # Three-argument open keeps characters in the file name from being treated as
  # an open mode...
  open($SDFileHandle, ">", $OptionsInfo{SDFile}) or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for my $MOLFile (@MOLFilesList) {
    my($MOLFileHandle, $CmpdID, $Line, $LastLine);

    # Every listed file is counted, including ignored ones, as the count is
    # also used to number generated compound IDs...
    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    if (!open($MOLFileHandle, "<", $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Setup compound ID only when it's going to be written out...
    if ($OptionsInfo{ModifyData}) {
      if ($OptionsInfo{UseFilePrefix}) {
        # Second value returned by ParseFileName is used as the ID...
        (undef, $CmpdID) = ParseFileName($MOLFile);
      }
      else {
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }
    }

    # Track the last text written for this record to detect a missing final
    # newline...
    $LastLine = "";

    if ($OptionsInfo{AddMolNameLine}) {
      # Drop the first line of MOL file and write out compound ID in its place.
      # An empty file simply has no first line to drop...
      $Line = <$MOLFileHandle>;
      $LastLine = "$CmpdID\n";
      print {$SDFileHandle} $LastLine;
    }

    # Copy rest of the file using "\n" as line ending...
    while (defined($Line = <$MOLFileHandle>)) {
      $Line =~ s/(\r\n)|(\r)/\n/g;
      print {$SDFileHandle} $Line;
      $LastLine = $Line;
    }
    close $MOLFileHandle;

    # Terminate the last line when MOL file didn't end with a newline; otherwise,
    # the data field header or record delimiter gets appended to it...
    if (length($LastLine) && $LastLine !~ /\n\z/) {
      print {$SDFileHandle} "\n";
    }

    if ($OptionsInfo{AddDataField}) {
      print {$SDFileHandle} "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
    }
    print {$SDFileHandle} "\n\$\$\$\$\n";
  }

  # Buffered write errors show up during close...
  close($SDFileHandle) or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}

# Process option values...
#
# Copies command line option values from %Options into %OptionsInfo, sets up
# flags derived from "-m --mode" and "-c --compoundid" options, and determines
# name of the SD file. Dies when the SD file already exists and "-o --overwrite"
# option hasn't been specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # These stay undefined unless specified on the command line...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if ($Options{root}) {
    # Use name part of the root value when it parses into both a name and an
    # extension; otherwise, use the root value as specified...
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
    $SDFile .= ".sdf";
  }
  else {
    # Default name: <NameOfFirstMOLFile>1To<NumberOfMOLFiles>.sdf
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
  }

  if (!$Options{overwrite}) {
    if (-e $SDFile) {
      die "Error: The file $SDFile already exists.\n";
    }
  }
  $OptionsInfo{SDFile} = $SDFile;

}

# Setup script usage and retrieve command line arguments specified using various options...
#
# Initializes %Options with default values, parses the command line using
# GetOptions, changes to the directory specified by "-w --workingdir" option,
# and validates value of "-m --mode" option. Dies on an invalid option, a
# working directory which isn't a directory or can't be entered, or an invalid
# mode value.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();
  $Options{compoundid} = "Cmpd";
  $Options{datafieldlabel} = "Cmpd_ID";
  $Options{mode} = "none";

  if (!GetOptions(\%Options, "compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{mode} !~ /^(molnameline|datafield|both|none)$/i ) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
