Mercurial > repos > deepakjadmin > mayatool3_test3
comparison mayachemtools/bin/MolFilesToSD.pl @ 0:73ae111cf86f draft
Uploaded
| author | deepakjadmin |
|---|---|
| date | Wed, 20 Jan 2016 11:55:01 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:73ae111cf86f |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 # | |
| 3 # $RCSfile: MolFilesToSD.pl,v $ | |
| 4 # $Date: 2015/02/28 20:46:20 $ | |
| 5 # $Revision: 1.38 $ | |
| 6 # | |
| 7 # Author: Manish Sud <msud@san.rr.com> | |
| 8 # | |
| 9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 10 # | |
| 11 # This file is part of MayaChemTools. | |
| 12 # | |
| 13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 14 # the terms of the GNU Lesser General Public License as published by the Free | |
| 15 # Software Foundation; either version 3 of the License, or (at your option) any | |
| 16 # later version. | |
| 17 # | |
| 18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
| 19 # any warranty; without even the implied warranty of merchantability of fitness | |
| 20 # for a particular purpose. See the GNU Lesser General Public License for more | |
| 21 # details. | |
| 22 # | |
| 23 # You should have received a copy of the GNU Lesser General Public License | |
| 24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
| 25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
| 26 # Boston, MA, 02111-1307, USA. | |
| 27 # | |
| 28 | |
| 29 use strict; | |
| 30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
| 31 use Getopt::Long; | |
| 32 use File::Basename; | |
| 33 use Text::ParseWords; | |
| 34 use Benchmark; | |
| 35 use SDFileUtil; | |
| 36 use FileUtil; | |
| 37 use TextUtil; | |
| 38 | |
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT so that progress messages show up as they are printed...
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";

# Use a direct class method call instead of the indirect object form
# "new Benchmark", which Perl can misparse...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@MOLFilesList);
@MOLFilesList = ExpandFileNames(\@ARGV, "mol");

# Without at least one file name there is nothing to convert and no first file
# name to derive the default SD file name from...
if (!@MOLFilesList) {
  die "Error: No MDLMOL files found for the specified file name(s).\n";
}

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
| 71 | |
| 72 ############################################################################### | |
| 73 | |
# Generate a SD file using all valid MDL MOL files...
#
# Goes through @MOLFilesList in order and appends each usable MOL file to
# $OptionsInfo{SDFile}, followed by a "$$$$" record terminator. A file is skipped
# with a warning when it doesn't exist, fails CheckFileType, or can't be opened.
#
# When $OptionsInfo{ModifyData} is set, the first (molname) line is replaced by
# the compound ID for $OptionsInfo{AddMolNameLine} and a compound ID data field is
# appended for $OptionsInfo{AddDataField}. The compound ID is either the file name
# prefix or $OptionsInfo{CompoundID} followed by the position of the file in the
# list. All line endings are converted to "\n".
#
# Dies when the SD file can't be opened or can't be closed cleanly.
#
sub GenerateSDFile {
  my($MOLFile, $Index, $FileCount, $FileOkayCount, $MolNameLine, $CmpdID, $FileDir, $FileName, $FileExt, $Line, $LastText, $SDFileHandle, $MOLFileHandle);

  # Three-argument open: characters such as ">" or "|" in a file name are never
  # interpreted as an open mode...
  open $SDFileHandle, ">", $OptionsInfo{SDFile} or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for $Index (0 .. $#MOLFilesList) {
    $MOLFile = $MOLFilesList[$Index];
    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    if (!open $MOLFileHandle, "<", $MOLFile) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Track the last piece of text written for this record to know whether the
    # MOL data ended with a newline...
    $LastText = "";

    if ($OptionsInfo{ModifyData}) {
      $MolNameLine = <$MOLFileHandle>;
      if (!defined $MolNameLine) {
        # Empty file: there is no molname line to clean up...
        $MolNameLine = "";
      }

      if ($OptionsInfo{UseFilePrefix}) {
        ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
        $CmpdID = $FileName;
      }
      else {
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }

      if ($OptionsInfo{AddMolNameLine}) {
        $LastText = "$CmpdID\n";
      }
      else {
        $MolNameLine =~ s/(\r\n)|(\r)/\n/g;
        $LastText = $MolNameLine;
      }
      print $SDFileHandle $LastText;
    }

    # Copy the remaining lines using a lexical variable instead of global $_ ...
    while (defined($Line = <$MOLFileHandle>)) {
      $Line =~ s/(\r\n)|(\r)/\n/g;
      print $SDFileHandle $Line;
      $LastText = $Line;
    }

    if ($OptionsInfo{ModifyData} && $OptionsInfo{AddDataField}) {
      # A MOL file without a trailing newline would otherwise get the data field
      # header appended to its last line...
      if ($LastText ne "" && $LastText !~ /\n\z/) {
        print $SDFileHandle "\n";
      }
      print $SDFileHandle "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
    }

    print $SDFileHandle "\n\$\$\$\$\n";
    close $MOLFileHandle;
  }

  # Buffered write errors, such as a full disk, are reported only at close time...
  close $SDFileHandle or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}
| 147 | |
# Process option values...
#
# Rebuilds %OptionsInfo from %Options and @MOLFilesList:
#
#   Mode, CompoundID, DataFieldLabel, Overwrite, OutFileRoot - copied from %Options
#   AddMolNameLine, AddDataField - 1 or 0 depending on the mode value
#   ModifyData - 1 when either of the two flags above is set
#   UseFilePrefix - 1 when the compound ID option is "usefileprefix"
#   SDFile - name of the SD file to generate
#
# Dies when the SD file already exists and overwrite option hasn't been specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # Unspecified options simply stay undefined...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if ($Options{root}) {
    # Use only the name part when the root parses into a name and an extension;
    # otherwise, use the root as specified...
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
    $SDFile .= ".sdf";
  }
  else {
    # Default: <InitialMOLFileName>1To<Count>.sdf
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
  }

  if (!$OptionsInfo{Overwrite} && -e $SDFile) {
    die "Error: The file $SDFile already exists.\n";
  }
  $OptionsInfo{SDFile} = $SDFile;
}
| 197 | |
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options to the default values, lets GetOptions fill it in from @ARGV,
# changes into the working directory when one has been specified, and validates
# the mode value. Dies with an error message for any invalid option or value.
#
sub SetupScriptUsage {
  my(@OptionSpecs, $WorkingDir);

  # Start out with default values for the options...
  %Options = (
    compoundid => "Cmpd",
    datafieldlabel => "Cmpd_ID",
    mode => "none",
  );

  # Retrieve all the options...
  @OptionSpecs = ("compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before any file names are processed...
  $WorkingDir = $Options{workingdir};
  if ($WorkingDir) {
    -d $WorkingDir
      or die "Error: The value specified, $WorkingDir, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir
      or die "Error: Couldn't chdir $WorkingDir: $! \n";
  }

  unless ($Options{mode} =~ /^(molnameline|datafield|both|none)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
| 220 | |
| 221 __END__ | |
| 222 | |
| 223 =head1 NAME | |
| 224 | |
| 225 MolFilesToSD.pl - Generate a SD file from MDLMOL File(s) | |
| 226 | |
| 227 =head1 SYNOPSIS | |
| 228 | |
| 229 MolFilesToSD.pl MDLMOLFile(s)... | |
| 230 | |
| 231 MolFilesToSD.pl [B<-c, --compoundid> usefileprefix | idlabel] [B<-d, --datafieldlabel> fieldlabel] | |
| 232 [B<-h, --help>] [B<-m, --mode> molnameline | datafield | both | none] [B<-o, --overwrite>] | |
| 233 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] MDLMOLFile(s)... | |
| 234 | |
| 235 =head1 DESCRIPTION | |
| 236 | |
| 237 Generate a SD file from I<MDLMOL File(s)>. Multiple file names are separated by spaces. | |
| 238 The valid file extension is I<.mol>. All other file names are ignored. All the files in the current | |
| 239 directory can be specified by I<*.mol>, or the current directory name. | |
| 240 | |
| 241 =head1 OPTIONS | |
| 242 | |
| 243 =over 4 | |
| 244 | |
| 245 =item B<-c, --compoundid> I<usefileprefix | idlabel> | |
| 246 | |
| 247 Specify how to generate compound IDs: use MOL filename prefix or generate | |
| 248 a new compound ID by combining I<idlabel> with compound number. Possible | |
| 249 values: I<usefileprefix | idlabel>. By default, I<Cmpd> is used as an I<idlabel> to generate | |
| 250 these types of compound IDs: Cmpd1, Cmpd2 and so on. | |
| 251 | |
| 252 Example: To generate compound IDs like Mol_ID1, Mol_ID2 and so on, specify | |
| 253 "Mol_ID" value for this option. | |
| 254 | |
| 255 =item B<-d, --datafieldlabel> I<fieldlabel> | |
| 256 | |
| 257 Specify data field label for adding compound ID field into SD file during I<datafield | both> | |
| 258 values of B<-m, --mode> option. Default: <Cmpd_ID>. | |
| 259 | |
| 260 =item B<-h, --help> | |
| 261 | |
| 262 Print this help message. | |
| 263 | |
| 264 =item B<-m, --mode> I<molnameline | datafield | both | none> | |
| 265 | |
| 266 Specify how to add compound ID into SD file: replace the molname line, | |
| 267 add a new data field, replace the molname line and add data field, or do | |
| 268 nothing. Possible values: I<molnameline | datafield | both | none>. | |
| 269 Default: I<none>. | |
| 270 | |
| 271 Use B<-c, --compoundid> to specify compound ID generation process. | |
| 272 | |
| 273 =item B<-o, --overwrite> | |
| 274 | |
| 275 Overwrite existing files. | |
| 276 | |
| 277 =item B<-r, --root> I<rootname> | |
| 278 | |
| 279 New SD file name is generated using the root: <Root>.sdf. Default new file | |
| 280 name: <InitialMOLFileName>1To<Count>.sdf. | |
| 281 | |
| 282 =item B<-w, --workingdir> I<dirname> | |
| 283 | |
| 284 Location of working directory. Default: current directory. | |
| 285 | |
| 286 =back | |
| 287 | |
| 288 =head1 EXAMPLES | |
| 289 | |
| 290 To generate NewSample.sdf file from Sample*.mol files, type: | |
| 291 | |
| 292 % MolFilesToSD.pl -r NewSample -o Sample*.mol | |
| 293 | |
| 294 To generate NewSample.sdf with Cmpd1, Cmpd2 and so on as compound ID in | |
| 295 MolName line and Cmpd_ID datafield from Sample*.mol files, type: | |
| 296 | |
| 297 % MolFilesToSD.pl -r NewSample -m both -o Sample*.mol | |
| 298 | |
| 299 =head1 AUTHOR | |
| 300 | |
| 301 Manish Sud <msud@san.rr.com> | |
| 302 | |
| 303 =head1 SEE ALSO | |
| 304 | |
| 305 InfoSDFiles.pl, SDToMolFiles.pl | |
| 306 | |
| 307 =head1 COPYRIGHT | |
| 308 | |
| 309 Copyright (C) 2015 Manish Sud. All rights reserved. | |
| 310 | |
| 311 This file is part of MayaChemTools. | |
| 312 | |
| 313 MayaChemTools is free software; you can redistribute it and/or modify it under | |
| 314 the terms of the GNU Lesser General Public License as published by the Free | |
| 315 Software Foundation; either version 3 of the License, or (at your option) | |
| 316 any later version. | |
| 317 | |
| 318 =cut |
