0
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # $RCSfile: MolFilesToSD.pl,v $
|
|
4 # $Date: 2015/02/28 20:46:20 $
|
|
5 # $Revision: 1.38 $
|
|
6 #
|
|
7 # Author: Manish Sud <msud@san.rr.com>
|
|
8 #
|
|
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
10 #
|
|
11 # This file is part of MayaChemTools.
|
|
12 #
|
|
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
14 # the terms of the GNU Lesser General Public License as published by the Free
|
|
15 # Software Foundation; either version 3 of the License, or (at your option) any
|
|
16 # later version.
|
|
17 #
|
|
18 # MayaChemTools is distributed in the hope that it will be useful, but without
|
|
19 # any warranty; without even the implied warranty of merchantability or fitness
|
|
20 # for a particular purpose. See the GNU Lesser General Public License for more
|
|
21 # details.
|
|
22 #
|
|
23 # You should have received a copy of the GNU Lesser General Public License
|
|
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
|
|
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
|
|
26 # Boston, MA, 02111-1307, USA.
|
|
27 #
|
|
28
|
|
29 use strict;
|
|
30 use FindBin; use lib "$FindBin::Bin/../lib";
|
|
31 use Getopt::Long;
|
|
32 use File::Basename;
|
|
33 use Text::ParseWords;
|
|
34 use Benchmark;
|
|
35 use SDFileUtil;
|
|
36 use FileUtil;
|
|
37 use TextUtil;
|
|
38
|
|
# File-scoped state shared with the subroutines defined further down:
# SetupScriptUsage() fills %Options and reads $ScriptName.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
# Direct method call instead of the indirect object form "new Benchmark",
# which Perl can parse ambiguously.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Whatever is left in @ARGV after option parsing names the input files.
# @MOLFilesList is read by ProcessOptions() and GenerateSDFile().
my(@MOLFilesList) = ExpandFileNames(\@ARGV, "mol");

# Process options...
print "Processing options...\n";
# %OptionsInfo is populated by ProcessOptions() and read by GenerateSDFile().
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

# Report the elapsed time for the whole run.
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
|
|
71
|
|
72 ###############################################################################
|
|
73
|
|
# Generate a SD file using all valid MDL MOL files...
#
# Concatenates every usable file in the file-scoped @MOLFilesList into
# $OptionsInfo{SDFile}, terminating each record with a "$$$$" delimiter line.
# Files that don't exist, fail the CheckFileType() test, or can't be opened
# are skipped with a warning. Depending on %OptionsInfo, the molname line is
# replaced by a compound ID and/or a compound ID data field is appended to
# each record. Line endings (CRLF or CR) are normalized to LF.
#
# Takes no arguments and returns nothing useful. Dies when the SD file can't
# be opened or closed. Prints a summary of the number of files seen,
# processed and ignored to STDOUT.
sub GenerateSDFile {
  my($FileCount, $FileOkayCount, $MolNameLine, $CmpdID, $FileDir, $FileName, $FileExt, $Line, $LastText, $SDFileHandle);

  # Three-argument open with a lexical handle: the file name is never
  # interpreted as a mode or a pipe specification.
  open $SDFileHandle, '>', $OptionsInfo{SDFile} or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for my $MOLFile (@MOLFilesList) {
    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    my $MOLFileHandle;
    if (!open $MOLFileHandle, '<', $MOLFile) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Last non-empty chunk of text written for this record; used below to
    # decide whether the record already ends with a newline.
    $LastText = "";

    if ($OptionsInfo{ModifyData}) {
      if ($OptionsInfo{UseFilePrefix}) {
        ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
        $CmpdID = $FileName;
      }
      else {
        # The ID number is the position in the input list, so ignored files
        # leave gaps in the numbering.
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }

      # An empty MOL file yields undef here; treat it as an empty line.
      $MolNameLine = <$MOLFileHandle>;
      $MolNameLine = "" if !defined $MolNameLine;
      $MolNameLine =~ s/\r\n|\r/\n/g;

      if ($OptionsInfo{AddMolNameLine}) {
        # For a CR-only file the first read returns the whole file, so drop
        # just its first line instead of discarding the entire molecule.
        $MolNameLine =~ s/\A[^\n]*\n?//;
        print {$SDFileHandle} "$CmpdID\n";
        $LastText = "$CmpdID\n";
      }
      print {$SDFileHandle} $MolNameLine;
      $LastText = $MolNameLine if length $MolNameLine;
    }

    # Copy the (rest of the) MOL file...
    while (defined($Line = <$MOLFileHandle>)) {
      $Line =~ s/\r\n|\r/\n/g;
      print {$SDFileHandle} $Line;
      $LastText = $Line;
    }

    if ($OptionsInfo{ModifyData} && $OptionsInfo{AddDataField}) {
      # A MOL file without a final newline would otherwise get the data
      # header glued onto its last line.
      if (length($LastText) && $LastText !~ /\n\z/) {
        print {$SDFileHandle} "\n";
      }
      print {$SDFileHandle} "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
    }

    print {$SDFileHandle} "\n\$\$\$\$\n";
    close $MOLFileHandle;
  }
  # Buffered write errors only surface at close time.
  close $SDFileHandle or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}
|
|
147
|
|
# Process option values...
#
# Rebuilds the file-scoped %OptionsInfo hash from the parsed command line
# options in %Options and derives the name of the SD file to generate, which
# is stored in $OptionsInfo{SDFile}.
#
# Takes no arguments. Dies when no default SD file name can be derived
# because @MOLFilesList is empty, or when the SD file already exists and the
# overwrite option was not specified.
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # These stay undef when the corresponding option was not specified.
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if (defined $Options{root} && length $Options{root}) {
    # A root that already carries an extension contributes only its name.
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});
    $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
    $SDFile .= ".sdf";
  }
  else {
    # The default name is built from the first MOL file; without one there is
    # nothing to derive it from.
    if (!@MOLFilesList) {
      die "Error: No MOL files available to generate a default SD file name. Use \"-r --root\" option to specify it.\n";
    }
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
  }

  if (!$Options{overwrite} && -e $SDFile) {
    die "Error: The file $SDFile already exists.\n";
  }
  $OptionsInfo{SDFile} = $SDFile;
}
|
|
197
|
|
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets the file-scoped %Options hash to its defaults, lets GetOptions()
# overlay the values given on the command line, changes into the working
# directory when one is specified, and validates the mode value. Dies with an
# error message on any invalid option or value.
sub SetupScriptUsage {

  # Defaults for options which always carry a value...
  %Options = (
    compoundid => "Cmpd",
    datafieldlabel => "Cmpd_ID",
    mode => "none",
  );

  # Retrieve all the options...
  my @OptionSpecs = ("compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  unless (GetOptions(\%Options, @OptionSpecs)) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }

  # Switch to the working directory before any file names are examined...
  if ($Options{workingdir}) {
    unless (-d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Only the documented mode names are accepted, in any letter case...
  unless ($Options{mode} =~ /^(molnameline|datafield|both|none)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
|
|
220
|
|
221 __END__
|
|
222
|
|
223 =head1 NAME
|
|
224
|
|
225 MolFilesToSD.pl - Generate a SD file from MDLMOL File(s)
|
|
226
|
|
227 =head1 SYNOPSIS
|
|
228
|
|
229 MolFilesToSD.pl MDLMOLFile(s)...
|
|
230
|
|
231 MolFilesToSD.pl [B<-c, --compoundid> usefileprefix | idlabel] [B<-d, --datafieldlabel> fieldlabel]
|
|
232 [B<-h, --help>] [B<-m, --mode> molnameline | datafield | both | none] [B<-o, --overwrite>]
|
|
233 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] MDLMOLFile(s)...
|
|
234
|
|
235 =head1 DESCRIPTION
|
|
236
|
|
237 Generate a SD file from I<MDLMOL File(s)>. Multiple file names are separated by spaces.
|
|
238 The valid file extension is I<.mol>. All other file names are ignored. All the files in the current
|
|
239 directory can be specified by I<*.mol>, or the current directory name.
|
|
240
|
|
241 =head1 OPTIONS
|
|
242
|
|
243 =over 4
|
|
244
|
|
245 =item B<-c, --compoundid> I<usefileprefix | idlabel>
|
|
246
|
|
247 Specify how to generate compound IDs: use MOL filename prefix or generate
|
|
248 a new compound ID by combining I<idlabel> with compound number. Possible
|
|
249 values: I<usefileprefix | idlabel>. By default, I<Cmpd> is used as an I<idlabel> to generate
|
|
250 these types of compound IDs: Cmpd1, Cmpd2 and so on.
|
|
251
|
|
252 Example: To generate compound IDs like Mol_ID1, Mol_ID2 and so on, specify
|
|
253 "Mol_ID" value for this option.
|
|
254
|
|
255 =item B<-d, --datafieldlabel> I<fieldlabel>
|
|
256
|
|
257 Specify data field label for adding compound ID field into SD file during I<datafield | both>
|
|
258 values of B<-m, --mode> option. Default: <Cmpd_ID>.
|
|
259
|
|
260 =item B<-h, --help>
|
|
261
|
|
262 Print this help message.
|
|
263
|
|
264 =item B<-m, --mode> I<molnameline | datafield | both | none>
|
|
265
|
|
266 Specify how to add compound ID into SD file: replace the molname line,
|
|
267 add a new data field, replace the molname line and add data field, or do
|
|
268 nothing. Possible values: I<molnameline | datafield | both | none>.
|
|
269 Default: I<none>.
|
|
270
|
|
271 Use B<-c, --compoundid> to specify compound ID generation process.
|
|
272
|
|
273 =item B<-o, --overwrite>
|
|
274
|
|
275 Overwrite existing files.
|
|
276
|
|
277 =item B<-r, --root> I<rootname>
|
|
278
|
|
279 New SD file name is generated using the root: <Root>.sdf. Default new file
|
|
280 name: <InitialMOLFileName>1To<Count>.sdf.
|
|
281
|
|
282 =item B<-w, --workingdir> I<dirname>
|
|
283
|
|
284 Location of working directory. Default: current directory.
|
|
285
|
|
286 =back
|
|
287
|
|
288 =head1 EXAMPLES
|
|
289
|
|
290 To generate NewSample.sdf file from Sample*.mol files, type:
|
|
291
|
|
292 % MolFilesToSD.pl -r NewSample -o Sample*.mol
|
|
293
|
|
294 To generate NewSample.sdf with Cmpd1, Cmpd2 and so on as compound ID in
|
|
295 MolName line and Cmpd_ID datafield from Sample*.mol files, type:
|
|
296
|
|
297 % MolFilesToSD.pl -r NewSample -m both -o Sample*.mol
|
|
298
|
|
299 =head1 AUTHOR
|
|
300
|
|
301 Manish Sud <msud@san.rr.com>
|
|
302
|
|
303 =head1 SEE ALSO
|
|
304
|
|
305 InfoSDFiles.pl, SDToMolFiles.pl
|
|
306
|
|
307 =head1 COPYRIGHT
|
|
308
|
|
309 Copyright (C) 2015 Manish Sud. All rights reserved.
|
|
310
|
|
311 This file is part of MayaChemTools.
|
|
312
|
|
313 MayaChemTools is free software; you can redistribute it and/or modify it under
|
|
314 the terms of the GNU Lesser General Public License as published by the Free
|
|
315 Software Foundation; either version 3 of the License, or (at your option)
|
|
316 any later version.
|
|
317
|
|
318 =cut
|