| 0 | 1 #!/usr/bin/perl -w | 
|  | 2 # | 
|  | 3 # $RCSfile: SDToMolFiles.pl,v $ | 
|  | 4 # $Date: 2015/02/28 20:46:20 $ | 
|  | 5 # $Revision: 1.35 $ | 
|  | 6 # | 
|  | 7 # Author: Manish Sud <msud@san.rr.com> | 
|  | 8 # | 
|  | 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | 
|  | 10 # | 
|  | 11 # This file is part of MayaChemTools. | 
|  | 12 # | 
|  | 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | 
|  | 14 # the terms of the GNU Lesser General Public License as published by the Free | 
|  | 15 # Software Foundation; either version 3 of the License, or (at your option) any | 
|  | 16 # later version. | 
|  | 17 # | 
|  | 18 # MayaChemTools is distributed in the hope that it will be useful, but without | 
|  | 19 # any warranty; without even the implied warranty of merchantability of fitness | 
|  | 20 # for a particular purpose.  See the GNU Lesser General Public License for more | 
|  | 21 # details. | 
|  | 22 # | 
|  | 23 # You should have received a copy of the GNU Lesser General Public License | 
|  | 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | 
|  | 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | 
|  | 26 # Boston, MA, 02111-1307, USA. | 
|  | 27 # | 
|  | 28 | 
|  | 29 use strict; | 
|  | 30 use FindBin; use lib "$FindBin::Bin/../lib"; | 
|  | 31 use Getopt::Long; | 
|  | 32 use File::Basename; | 
|  | 33 use Text::ParseWords; | 
|  | 34 use Benchmark; | 
|  | 35 use SDFileUtil; | 
|  | 36 use FileUtil; | 
|  | 37 | 
# File-scoped state. $ScriptName and %Options are also read by the subroutines
# defined further down in this file, so they must stay declared at this level.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so progress messages appear as soon as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of indirect object syntax ("new Benchmark"), which
# the parser can misinterpret.
$StartTime = Benchmark->new;

# Get the options and setup script; show usage when help is requested or no
# input file is specified...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Turn command line arguments into the list of input files; this list is shared
# with RetrieveSDFilesInfo and GenerateMolFiles...
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
print "Checking input SD file(s)...\n";
my(%SDFilesInfo);
RetrieveSDFilesInfo();

# Process input files; files which failed the checks in RetrieveSDFilesInfo
# are silently skipped here as a warning has already been issued for them...
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  GenerateMolFiles($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|  | 83 | 
|  | 84 ############################################################################### | 
|  | 85 | 
# Generate MOL files for a SD file...
#
# Parameters:
#   $FileIndex - index of the SD file in @SDFilesList; the same index is used to
#                look up its output file root in %SDFilesInfo.
#
# Each compound record read from the SD file is written, up to and including its
# "M  END" line, to "<MOLFileRoot>.mol". MOLFileRoot is chosen as follows:
#   . DataField mode: value of data field $OptionsInfo{DataField}
#   . MolName mode: first line of the compound record
#   . Otherwise, or when the value above is missing or empty after removing all
#     characters other than a-zA-Z0-9_: <OutFileRoot>Cmpd<CmpdCount>
#
# A warning is issued and processing is skipped for: a SD file which can't be
# opened; a compound whose MOL file already exists unless overwrite is on; a
# compound without a "M  END" line. Failure to open or close a MOL file is fatal.
#
sub GenerateMolFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $MOLFile, $MOLFileHandle, $MOLFileRoot, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $MolEndDelimiter, $CmpdString, $MolBlock, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: characters in the file name are
  # never interpreted as an open mode and the handle is private to this call...
  if (!open $SDFileHandle, "<", $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $CmpdCount = 0;
  $MolEndDelimiter = "M  END";

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{Mode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{Mode} =~ /^MolName$/i) ? 1 : 0;

  # The loop ends as soon as ReadCmpdString returns a false value...
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup MOL file name...
    $MOLFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $MOLFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $MOLFileRoot = $CmpdLines[0];
    }
    $MOLFileRoot = '' unless defined $MOLFileRoot;

    # Drop any invalid file name characters in data field or molname values...
    $MOLFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for MOL file name. The test is on length rather than truth
    # so that a value of "0" is still accepted as a file name...
    if (!length $MOLFileRoot) {
      $MOLFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $MOLFile = "${MOLFileRoot}.mol";

    if (!$OverwriteFiles && -e $MOLFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New MOL file, $MOLFile, already exists\n";
      next CMPDSTRING;
    }

    if ($CmpdString !~ /\Q$MolEndDelimiter\E/) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: Invalid compound data\n";
      next CMPDSTRING;
    }

    # Keep only the text in front of the first "M  END"; data fields after it
    # don't belong in a MOL file...
    ($MolBlock) = split /\Q$MolEndDelimiter\E/, $CmpdString;
    $MolBlock = '' unless defined $MolBlock;

    # Write out MOL file...
    print "Generating $MOLFile file...\n";
    open $MOLFileHandle, ">", $MOLFile or die "Error: Can't open $MOLFile: $! \n";
    print {$MOLFileHandle} "$MolBlock";
    print {$MOLFileHandle} "$MolEndDelimiter\n";

    # Buffered write errors only surface at close, so check it...
    close $MOLFileHandle or die "Error: Couldn't close $MOLFile: $! \n";
  }

  close $SDFileHandle;
}
|  | 163 | 
# Collect information about the input SD files...
#
# Resets %SDFilesInfo and fills two arrays which run parallel to @SDFilesList:
#   . FileOkay    - 1 for a file which exists and passes CheckFileType; else 0
#   . OutFileRoot - root name for naming generated MOL files; '' for files
#                   which are being ignored
#
# The root specified in $OptionsInfo{OutFileRoot} is only used for a single
# input file; otherwise, the name part of the SD file name is used.
#
sub RetrieveSDFilesInfo {
  my($UseSpecifiedRoot);

  %SDFilesInfo = ();
  @{$SDFilesInfo{FileOkay}} = ();
  @{$SDFilesInfo{OutFileRoot}} = ();

  # Same answer for every file, so work it out only once...
  $UseSpecifiedRoot = ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Mark the file as not usable until it has passed all the checks...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);
    my $OutFileRoot;

    if ($UseSpecifiedRoot) {
      # Use only the name part when the specified root parses into both a name
      # and an extension; otherwise, use the specified root as it is...
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    else {
      $OutFileRoot = "$FileName";
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}
|  | 211 | 
# Process option values...
#
# Rebuilds %OptionsInfo from %Options:
#   . Mode           - value of mode option, as specified
#   . DataField      - value of datafield option in DataField mode; '' otherwise
#   . OverwriteFiles - 1 when overwrite option is set; 0 otherwise
#   . OutFileRoot    - value of root option; 0 when it is not specified
#
# Dies when mode is DataField and no datafield value has been specified.
#
sub ProcessOptions {
  %OptionsInfo = (
    Mode => $Options{mode},
    DataField => '',
    OverwriteFiles => ($Options{overwrite} ? 1 : 0),
  );

  # Data field name is only needed, and only picked up, in DataField mode...
  if ($Options{mode} =~ /^DataField$/i) {
    unless ($Options{datafield}) {
      die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" \"-m, --mode\". \n";
    }
    $OptionsInfo{DataField} = $Options{datafield};
  }

  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
|  | 230 | 
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills %Options using GetOptions after setting mode to its default value of
# RootPrefix. When a working directory is specified, the script changes over to
# it. Dies for any one of these: invalid command line options; a working
# directory value which is not a directory or can't be changed to; a mode value
# other than DataField, MolName or RootPrefix.
#
sub SetupScriptUsage {
  my(@OptionSpecs);

  # Start with no options other than the default mode...
  %Options = (mode => 'RootPrefix');

  # Retrieve all the options...
  @OptionSpecs = ("datafield|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Mode names are matched irrespective of their case...
  unless ($Options{mode} =~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
}
|  | 253 | 
|  | 254 __END__ | 
|  | 255 | 
|  | 256 =head1 NAME | 
|  | 257 | 
|  | 258 SDToMolFiles.pl - Generate MDLMOL file(s) from SD file(s) | 
|  | 259 | 
|  | 260 =head1 SYNOPSIS | 
|  | 261 | 
|  | 262 SDToMolFiles.pl SDFile(s)... | 
|  | 263 | 
|  | 264 SDToMolFiles.pl [B<-d, --DataField> DataFieldName] | 
|  | 265 [B<-m, --mode> DataField | MolName | RootPrefix] [B<-h, --help>] | 
|  | 266 [B<-o, --overwrite>] [B<-r, --root> rootname] | 
|  | 267 [B<-w, --workingdir> dirname] SDFile(s)... | 
|  | 268 | 
|  | 269 =head1 DESCRIPTION | 
|  | 270 | 
Generate MDLMOL file(s) from I<SDFile(s)>. All header data labels and values in
SDFile(s) are simply ignored; other appropriate data from SDFile(s) is transferred to MDLMOL
files. Multiple I<SDFile(s)> names are separated by spaces. The valid file extensions are
I<.sdf> and I<.sd>. All other file names are ignored. All the SD files in the current
directory can be specified either by I<*.sdf> or the current directory name.
|  | 276 | 
|  | 277 =head1 OPTIONS | 
|  | 278 | 
|  | 279 =over 4 | 
|  | 280 | 
|  | 281 =item B<-d, --DataField> I<DataFieldName> | 
|  | 282 | 
|  | 283 Specify I<SDFile(s)> datafield label name whose value is used for generation of MDLMOL | 
|  | 284 file names. Default value: I<None>. | 
|  | 285 | 
|  | 286 =item B<-h, --help> | 
|  | 287 | 
|  | 288 Print this help message. | 
|  | 289 | 
|  | 290 =item B<-m, --mode> I<DataField | MolName | RootPrefix> | 
|  | 291 | 
|  | 292 Specify how to generate MDLMOL file names: use a I<SDFile(s)> datafield value; use | 
|  | 293 molname line from I<SDFile(s)>; generate a sequential ID using root prefix specified | 
|  | 294 by B<-r, --root> option. | 
|  | 295 | 
Possible values: I<DataField | MolName | RootPrefix>.
Default: I<RootPrefix>.
|  | 298 | 
|  | 299 For empty I<MolName> and I<DataField> values during these specified modes, file | 
|  | 300 name is automatically generated using I<RootPrefix>. | 
|  | 301 | 
For I<RootPrefix> value of B<-m, --mode> option, MDLMOL file names are generated
by appending compound record number to value of B<-r, --root> option. For
example: I<RootName>Cmpd<RecordNumber>.mol.
|  | 305 | 
Allowed characters in file names are: a-zA-Z0-9_. All other characters in datafield
values, molname line, and root prefix are ignored during generation of file names.
|  | 308 | 
|  | 309 =item B<-o, --overwrite> | 
|  | 310 | 
|  | 311 Overwrite existing files. | 
|  | 312 | 
|  | 313 =item B<-r, --root> I<rootname> | 
|  | 314 | 
Specify root name to be used during I<RootPrefix> B<-m, --mode> option value.
|  | 316 New MDLMOL file names are generated using the root: <Root>Cmpd<RecordNumber>.mol | 
|  | 317 Default for new file names: <InitialSDFileName>Cmpd<RecordNumber>.mol. This option | 
|  | 318 is ignored for multiple input files. | 
|  | 319 | 
|  | 320 =item B<-w, --workingdir> I<dirname> | 
|  | 321 | 
|  | 322 Location of working directory. Default: current directory. | 
|  | 323 | 
|  | 324 =back | 
|  | 325 | 
|  | 326 =head1 EXAMPLES | 
|  | 327 | 
|  | 328 To generate MDLMOL files from Sample1*.sdf and Sample2*.sd files, type: | 
|  | 329 | 
|  | 330     % SDToMolFiles.pl -o Sample1*.sdf Sample2*.sd | 
|  | 331 | 
|  | 332 To generate Sample*.mol files from Sample1.sdf, type: | 
|  | 333 | 
|  | 334     % SDToMolFiles.pl -r Sample -o Sample1.sdf | 
|  | 335 | 
|  | 336 To generate MOL files from Sample1.sdf using molname line data for generating | 
|  | 337 MOL file names, type: | 
|  | 338 | 
|  | 339     % SDToMolFiles.pl -m MolName -r Sample -o Sample1.sdf | 
|  | 340 | 
|  | 341 To generate MOL files from Sample1.sdf using a specific data field values for | 
|  | 342 generating MOL file names, type: | 
|  | 343 | 
|  | 344     % SDToMolFiles.pl -m DataField --DataField MolID -r Sample | 
|  | 345       -o Sample1.sdf | 
|  | 346 | 
=head1 AUTHOR
|  | 351 | 
|  | 352 Manish Sud <msud@san.rr.com> | 
|  | 353 | 
|  | 354 =head1 SEE ALSO | 
|  | 355 | 
|  | 356 InfoSDFiles.pl, MolFilesToSD.pl | 
|  | 357 | 
|  | 358 =head1 COPYRIGHT | 
|  | 359 | 
|  | 360 Copyright (C) 2015 Manish Sud. All rights reserved. | 
|  | 361 | 
|  | 362 This file is part of MayaChemTools. | 
|  | 363 | 
|  | 364 MayaChemTools is free software; you can redistribute it and/or modify it under | 
|  | 365 the terms of the GNU Lesser General Public License as published by the Free | 
|  | 366 Software Foundation; either version 3 of the License, or (at your option) | 
|  | 367 any later version. | 
|  | 368 | 
|  | 369 =cut |