Mercurial > repos > deepakjadmin > mayatool3_test2
comparison bin/MolFilesToSD.pl @ 0:4816e4a8ae95 draft default tip
Uploaded
author | deepakjadmin |
---|---|
date | Wed, 20 Jan 2016 09:23:18 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4816e4a8ae95 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # $RCSfile: MolFilesToSD.pl,v $ | |
4 # $Date: 2015/02/28 20:46:20 $ | |
5 # $Revision: 1.38 $ | |
6 # | |
7 # Author: Manish Sud <msud@san.rr.com> | |
8 # | |
9 # Copyright (C) 2015 Manish Sud. All rights reserved. | |
10 # | |
11 # This file is part of MayaChemTools. | |
12 # | |
13 # MayaChemTools is free software; you can redistribute it and/or modify it under | |
14 # the terms of the GNU Lesser General Public License as published by the Free | |
15 # Software Foundation; either version 3 of the License, or (at your option) any | |
16 # later version. | |
17 # | |
18 # MayaChemTools is distributed in the hope that it will be useful, but without | |
19 # any warranty; without even the implied warranty of merchantability of fitness | |
20 # for a particular purpose. See the GNU Lesser General Public License for more | |
21 # details. | |
22 # | |
23 # You should have received a copy of the GNU Lesser General Public License | |
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or | |
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330, | |
26 # Boston, MA, 02111-1307, USA. | |
27 # | |
28 | |
29 use strict; | |
30 use FindBin; use lib "$FindBin::Bin/../lib"; | |
31 use Getopt::Long; | |
32 use File::Basename; | |
33 use Text::ParseWords; | |
34 use Benchmark; | |
35 use SDFileUtil; | |
36 use FileUtil; | |
37 use TextUtil; | |
38 | |
# File-level state shared with the subroutines defined later in this file:
# %Options holds the raw command line values filled in by SetupScriptUsage().
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of indirect object syntax (new Benchmark), which
# is ambiguous to parse and discouraged by perlobj.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
# Option parsing has already consumed the switches, so anything left in @ARGV
# is a file or directory name; at least one is required.
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# MOL files to combine; read by ProcessOptions() and GenerateSDFile()...
my(@MOLFilesList);
@MOLFilesList = ExpandFileNames(\@ARGV, "mol");

# Process options...
print "Processing options...\n";
# Processed option values; filled in by ProcessOptions()...
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
71 | |
72 ############################################################################### | |
73 | |
# Generate a SD file using all valid MDL MOL files...
#
# Reads the file-level @MOLFilesList and %OptionsInfo (SDFile, ModifyData,
# UseFilePrefix, CompoundID, AddMolNameLine, AddDataField, DataFieldLabel) and
# writes one record, terminated by a "$$$$" line, for each usable MOL file.
# Files which don't exist, fail CheckFileType(), or can't be opened are skipped
# with a warning. A summary of processed and ignored files is printed at the end.
#
# Dies when the SD file can't be opened for writing or can't be closed cleanly.
#
sub GenerateSDFile {
  my($FileCount, $FileOkayCount, $MolData, $CmpdID, $FileDir, $FileName, $FileExt, $SDFileRef, $MOLFileRef);

  # Three-argument open with lexical handles, so that file names are never
  # interpreted as open modes or pipes...
  open $SDFileRef, ">", $OptionsInfo{SDFile} or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for my $MOLFile (@MOLFilesList) {
    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    if (!open $MOLFileRef, "<", $MOLFile) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Read the whole file at once and normalize line endings before looking at
    # individual lines: a file using bare \r line endings contains no \n at all,
    # so reading it line by line would treat the entire file as the molname line...
    $MolData = do { local $/; <$MOLFileRef> };
    close $MOLFileRef;
    $MolData = "" if (!defined $MolData);
    $MolData =~ s/\r\n|\r/\n/g;

    if ($OptionsInfo{ModifyData}) {
      if ($OptionsInfo{UseFilePrefix}) {
        ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
        $CmpdID = $FileName;
      }
      else {
        # Compound number is the position in the file list; ignored files leave gaps...
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }

      if ($OptionsInfo{AddMolNameLine}) {
        # Replace the first line, the molname line, with the compound ID...
        $MolData =~ s/\A[^\n]*\n?//;
        $MolData = "$CmpdID\n" . $MolData;
      }

      if ($OptionsInfo{AddDataField}) {
        # Make sure the data field header starts on its own line even when the
        # MOL file doesn't end with a newline...
        if ($MolData ne "" && $MolData !~ /\n\z/) {
          $MolData .= "\n";
        }
        $MolData .= "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
      }
    }
    print {$SDFileRef} $MolData, "\n\$\$\$\$\n";
  }
  # Buffered write errors surface at close...
  close $SDFileRef or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}
147 | |
# Process option values...
#
# Fills the file-level %OptionsInfo from the raw command line values in %Options:
# copies mode, compound ID, data field label, overwrite and root; derives the
# AddMolNameLine, AddDataField, ModifyData and UseFilePrefix flags; and works
# out the name of the SD file to generate (SDFile).
#
# Dies when no MOL files are available to derive a default SD file name from,
# or when the SD file already exists and "-o --overwrite" hasn't been specified.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # Both stay undef when the corresponding option isn't specified...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if ($Options{root}) {
    # A root which parses into a name and an extension contributes just the
    # name; otherwise the root is used as specified...
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
    $SDFile .= ".sdf";
  }
  else {
    # The default name comes from the first MOL file, so there has to be one...
    if (!@MOLFilesList) {
      die "Error: No MDLMOL files available to generate SD file name. Specify MDLMOL file(s) or use \"-r --root\" option.\n";
    }
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . @MOLFilesList . ".sdf";
  }

  if (!$Options{overwrite}) {
    if (-e $SDFile) {
      die "Error: The file $SDFile already exists.\n";
    }
  }
  $OptionsInfo{SDFile} = $SDFile;

}
197 | |
# Set option defaults, retrieve command line arguments into the file-level
# %Options hash, and validate working directory and mode values...
#
# Dies on unknown options, on a working directory which isn't a directory or
# can't be entered, and on an unsupported mode value.
#
sub SetupScriptUsage {
  my(@OptionSpecs, $WorkingDir);

  # Defaults for the options which always carry a value...
  %Options = (compoundid => "Cmpd", datafieldlabel => "Cmpd_ID", mode => "none");

  # Retrieve all the options...
  @OptionSpecs = ("compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory, when one is specified...
  $WorkingDir = $Options{workingdir};
  if ($WorkingDir) {
    -d $WorkingDir
      or die "Error: The value specified, $WorkingDir, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $WorkingDir or die "Error: Couldn't chdir $WorkingDir: $! \n";
  }

  unless ($Options{mode} =~ /^(molnameline|datafield|both|none)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
220 | |
221 __END__ | |
222 | |
223 =head1 NAME | |
224 | |
225 MolFilesToSD.pl - Generate a SD file from MDLMOL File(s) | |
226 | |
227 =head1 SYNOPSIS | |
228 | |
229 MolFilesToSD.pl MDLMOLFile(s)... | |
230 | |
231 MolFilesToSD.pl [B<-c, --compoundid> usefileprefix | idlabel] [B<-d, --datafieldlabel> fieldlabel] | |
232 [B<-h, --help>] [B<-m, --mode> molnameline | datafield | both | none] [B<-o, --overwrite>] | |
233 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] MDLMOLFile(s)... | |
234 | |
235 =head1 DESCRIPTION | |
236 | |
237 Generate a SD file from I<MDLMOL File(s)>. Multiple file names are separated by spaces. | |
238 The valid file extension is I<.mol>. All other file names are ignored. All the files in the current | |
239 directory can be specified by I<*.mol>, or the current directory name. | |
240 | |
241 =head1 OPTIONS | |
242 | |
243 =over 4 | |
244 | |
245 =item B<-c, --compoundid> I<usefileprefix | idlabel> | |
246 | |
247 Specify how to generate compound IDs: use MOL filename prefix or generate | |
248 a new compound ID by combining I<idlabel> with compound number. Possible | |
249 values: I<usefileprefix | idlabel>. By default, I<Cmpd> is used as an I<idlabel> to generate | |
250 these types of compound IDs: Cmpd1, Cmpd2 and so on. | |
251 | |
252 Example: To generate compound IDs like Mol_ID1, Mol_ID2 and so on, specify | |
253 "Mol_ID" value for this option. | |
254 | |
255 =item B<-d, --datafieldlabel> I<fieldlabel> | |
256 | |
257 Specify data field label for adding compound ID field into SD file during I<datafield | both> | |
258 values of B<-m, --mode> option. Default: <Cmpd_ID>. | |
259 | |
260 =item B<-h, --help> | |
261 | |
262 Print this help message. | |
263 | |
264 =item B<-m, --mode> I<molnameline | datafield | both | none> | |
265 | |
266 Specify how to add compound ID into SD file: replace the molname line, | |
267 add a new data field, replace the molname line and add data field, or do | |
268 nothing. Possible values: I<molnameline | datafield | both | none>. | |
269 Default: I<none>. | |
270 | |
271 Use B<-c, --compoundid> to specify compound ID generation process. | |
272 | |
273 =item B<-o, --overwrite> | |
274 | |
275 Overwrite existing files. | |
276 | |
277 =item B<-r, --root> I<rootname> | |
278 | |
279 New SD file name is generated using the root: <Root>.sdf. Default new file | |
280 name: <InitialMOLFileName>1To<Count>.sdf. | |
281 | |
282 =item B<-w, --workingdir> I<dirname> | |
283 | |
284 Location of working directory. Default: current directory. | |
285 | |
286 =back | |
287 | |
288 =head1 EXAMPLES | |
289 | |
290 To generate NewSample.sdf file from Sample*.mol files, type: | |
291 | |
292 % MolFilesToSD.pl -r NewSample -o Sample*.mol | |
293 | |
294 To generate NewSample.sdf with Cmpd1, Cmpd2 and so on as compound ID in | |
295 MolName line and Cmpd_ID datafield from Sample*.mol files, type: | |
296 | |
297 % MolFilesToSD.pl -r NewSample -m both -o Sample*.mol | |
298 | |
299 =head1 AUTHOR | |
300 | |
301 Manish Sud <msud@san.rr.com> | |
302 | |
303 =head1 SEE ALSO | |
304 | |
305 InfoSDFiles.pl, SDToMolFiles.pl | |
306 | |
307 =head1 COPYRIGHT | |
308 | |
309 Copyright (C) 2015 Manish Sud. All rights reserved. | |
310 | |
311 This file is part of MayaChemTools. | |
312 | |
313 MayaChemTools is free software; you can redistribute it and/or modify it under | |
314 the terms of the GNU Lesser General Public License as published by the Free | |
315 Software Foundation; either version 3 of the License, or (at your option) | |
316 any later version. | |
317 | |
318 =cut |