comparison bin/MolFilesToSD.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: MolFilesToSD.pl,v $
4 # $Date: 2015/02/28 20:46:20 $
5 # $Revision: 1.38 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Text::ParseWords;
34 use Benchmark;
35 use SDFileUtil;
36 use FileUtil;
37 use TextUtil;
38
# File-level state shared with the subroutines defined below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename $0;
print "\n$ScriptName:Starting...\n\n";

# Benchmark->new is a direct method call; the indirect-object form "new Benchmark"
# depends on how the parser guesses at the bareword and is best avoided...
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Names left in @ARGV after option parsing are expanded into the list of MOL files...
my(@MOLFilesList);
@MOLFilesList = ExpandFileNames(\@ARGV, "mol");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

print "Generating SD file $OptionsInfo{SDFile}...\n";
GenerateSDFile();

print "\n$ScriptName:Done...\n\n";

# Report the time taken by the whole run...
$EndTime = Benchmark->new;
$TotalTime = timediff ($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
71
72 ###############################################################################
73
# Generate a SD file using all valid MDL MOL files...
#
# Walks the file-level @MOLFilesList and appends each usable MOL file to
# $OptionsInfo{SDFile} as one record terminated by a "$$$$" line. Files that
# don't exist, fail CheckFileType or can't be opened are skipped with a warning.
# Depending on the flags in %OptionsInfo, the first line of a MOL file is replaced
# by the compound ID and/or a data field holding the compound ID is added after
# the MOL data. Carriage returns in the copied lines are converted to "\n".
#
# Dies when the SD file can't be opened or closed. Prints progress messages and
# a summary of the processed and ignored file counts to STDOUT.
sub GenerateSDFile {
  my($FileCount, $FileOkayCount, $MolNameLine, $CmpdID, $FileDir, $FileName, $FileExt, $EndsWithNewLine);

  # Three-argument open keeps characters such as ">", "|" or surrounding spaces in
  # a file name from being interpreted as part of the open mode...
  open my $SDFileHandle, ">", $OptionsInfo{SDFile} or die "Error: Can't open $OptionsInfo{SDFile}: $! \n";
  $FileCount = 0;
  $FileOkayCount = 0;

  FILELIST: for my $MOLFile (@MOLFilesList) {
    $FileCount++;

    print "Processing file $MOLFile...\n";

    if (!(-e $MOLFile)) {
      warn "Warning: Ignoring file $MOLFile: It doesn't exist\n";
      next FILELIST;
    }

    if (!CheckFileType($MOLFile, "mol")) {
      warn "Warning: Ignoring file $MOLFile: It's not a MDLMOL file\n";
      next FILELIST;
    }

    my $MOLFileHandle;
    if (!open $MOLFileHandle, "<", $MOLFile) {
      warn "Warning: Ignoring file $MOLFile: Couldn't open it: $! \n";
      next FILELIST;
    }

    $FileOkayCount++;

    # Tracks whether the text written so far for this record ends with a newline;
    # nothing written yet counts as being at the start of a line...
    $EndsWithNewLine = 1;

    if ($OptionsInfo{ModifyData}) {
      # The first line of the MOL file is the molname line; it's read here so that
      # it can be either replaced by the compound ID or copied as is...
      $MolNameLine = <$MOLFileHandle>;
      if ($OptionsInfo{UseFilePrefix}) {
        ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFile);
        $CmpdID = $FileName;
      }
      else {
        $CmpdID = $OptionsInfo{CompoundID} . "$FileCount";
      }

      if ($OptionsInfo{AddMolNameLine}) {
        print $SDFileHandle "$CmpdID\n";
      }
      elsif (defined $MolNameLine) {
        # An empty MOL file has no molname line; skip it instead of operating on
        # an undefined value...
        $MolNameLine =~ s/\r\n?/\n/g;
        print $SDFileHandle $MolNameLine;
        $EndsWithNewLine = ($MolNameLine =~ /\n\z/) ? 1 : 0;
      }
    }

    # Copy the remaining lines with normalized line endings...
    while (my $Line = <$MOLFileHandle>) {
      $Line =~ s/\r\n?/\n/g;
      print $SDFileHandle $Line;
      $EndsWithNewLine = ($Line =~ /\n\z/) ? 1 : 0;
    }
    close $MOLFileHandle;

    if ($OptionsInfo{ModifyData} && $OptionsInfo{AddDataField}) {
      # A MOL file without a final newline would otherwise get the data header
      # appended to its last line...
      print $SDFileHandle "\n" if !$EndsWithNewLine;
      print $SDFileHandle "> <$OptionsInfo{DataFieldLabel}>\n${CmpdID}\n";
    }
    print $SDFileHandle "\n\$\$\$\$\n";
  }

  # Buffered write errors surface at close time, so the result is checked...
  close $SDFileHandle or die "Error: Couldn't close $OptionsInfo{SDFile}: $! \n";

  print "\nNumber of files: $FileCount\n";
  print "Number of files processed successfully: $FileOkayCount\n";
  print "Number of files ignored: " . ($FileCount - $FileOkayCount) . "\n";
}
147
# Process option values...
#
# Copies the command line values from the file-level %Options into %OptionsInfo,
# derives the mode flags used during SD file generation and sets up the name of
# the SD file in $OptionsInfo{SDFile}.
#
# Dies when the SD file already exists and overwriting wasn't requested, or when
# the default SD file name is needed and @MOLFilesList is empty.
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CompoundID} = $Options{compoundid};
  $OptionsInfo{DataFieldLabel} = $Options{datafieldlabel};

  # Both values stay undefined when the corresponding option wasn't specified...
  $OptionsInfo{Overwrite} = $Options{overwrite};
  $OptionsInfo{OutFileRoot} = $Options{root};

  $OptionsInfo{AddMolNameLine} = ($Options{mode} =~ /^(molnameline|both)$/i) ? 1 : 0;
  $OptionsInfo{AddDataField} = ($Options{mode} =~ /^(datafield|both)$/i) ? 1 : 0;

  # MOL data is copied unchanged unless at least one of the two flags is set...
  $OptionsInfo{ModifyData} = ($OptionsInfo{AddMolNameLine} || $OptionsInfo{AddDataField}) ? 1 : 0;

  $OptionsInfo{UseFilePrefix} = ($Options{compoundid} =~ /^usefileprefix$/i) ? 1 : 0;

  # Setup SD file name...
  my($FileDir, $FileName, $FileExt, $SDFile);
  if ($Options{root}) {
    ($FileDir, $FileName, $FileExt) = ParseFileName($Options{root});

    # When ParseFileName reports both a name and an extension for the root, only
    # the name is used; otherwise the root is taken as specified...
    $SDFile = ($FileName && $FileExt) ? $FileName : $Options{root};
    $SDFile .= ".sdf";
  }
  else {
    # The default name is built from the first MOL file, so there must be one...
    if (!@MOLFilesList) {
      die "Error: No MOL files available to generate default SD file name. Specify a name using \"-r --root\" option.\n";
    }
    ($FileDir, $FileName, $FileExt) = ParseFileName($MOLFilesList[0]);
    $SDFile = $FileName . "1To" . scalar(@MOLFilesList) . ".sdf";
  }

  if (!$OptionsInfo{Overwrite} && -e $SDFile) {
    die "Error: The file $SDFile already exists.\n";
  }
  $OptionsInfo{SDFile} = $SDFile;
}
197
# Setup script usage and retrieve command line arguments specified using various options...
#
# Fills the file-level %Options with default values, lets GetOptions override them
# from the command line, changes into the working directory when one is specified
# and validates the value of the mode option. Dies on any invalid input.
sub SetupScriptUsage {

  # Default values; the remaining options stay unset unless specified...
  %Options = (
    compoundid => "Cmpd",
    datafieldlabel => "Cmpd_ID",
    mode => "none",
  );

  # Retrieve all the options...
  my(@OptionSpecs) = ("compoundid|c=s", "datafieldlabel|d=s", "help|h", "mode|m=s", "overwrite|o", "root|r=s", "workingdir|w=s");
  GetOptions(\%Options, @OptionSpecs)
    or die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";

  # Switch to the working directory before any file names get used...
  if ($Options{workingdir}) {
    -d $Options{workingdir}
      or die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }

  # Only the four documented mode values are accepted, in any letter case...
  unless ($Options{mode} =~ /^(molnameline|datafield|both|none)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m --mode\" is not valid. Allowed values: molnameline, datafield, both, or none\n";
  }
}
220
221 __END__
222
223 =head1 NAME
224
225 MolFilesToSD.pl - Generate a SD file from MDLMOL File(s)
226
227 =head1 SYNOPSIS
228
229 MolFilesToSD.pl MDLMOLFile(s)...
230
231 MolFilesToSD.pl [B<-c, --compoundid> usefileprefix | idlabel] [B<-d, --datafieldlabel> fieldlabel]
232 [B<-h, --help>] [B<-m, --mode> molnameline | datafield | both | none] [B<-o, --overwrite>]
233 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] MDLMOLFile(s)...
234
235 =head1 DESCRIPTION
236
237 Generate a SD file from I<MDLMOL File(s)>. Multiple file names are separated by spaces.
238 The valid file extension is I<.mol>. All other file names are ignored. All the files in the current
239 directory can be specified by I<*.mol>, or the current directory name.
240
241 =head1 OPTIONS
242
243 =over 4
244
245 =item B<-c, --compoundid> I<usefileprefix | idlabel>
246
247 Specify how to generate compound IDs: use MOL filename prefix or generate
248 a new compound ID by combining I<idlabel> with compound number. Possible
249 values: I<usefileprefix | idlabel>. By default, I<Cmpd> is used as an I<idlabel> to generate
250 these types of compound IDs: Cmpd1, Cmpd2 and so on.
251
252 Example: To generate compound IDs like Mol_ID1, Mol_ID2 and so on, specify
253 "Mol_ID" value for this option.
254
255 =item B<-d, --datafieldlabel> I<fieldlabel>
256
257 Specify data field label for adding compound ID field into SD file during I<datafield | both>
258 values of B<-m, --mode> option. Default: <Cmpd_ID>.
259
260 =item B<-h, --help>
261
262 Print this help message.
263
264 =item B<-m, --mode> I<molnameline | datafield | both | none>
265
266 Specify how to add compound ID into SD file: replace the molname line,
267 add a new data field, replace the molname line and add data field, or do
268 nothing. Possible values: I<molnameline | datafield | both | none>.
269 Default: I<none>.
270
271 Use B<-c, --compoundid> to specify compound ID generation process.
272
273 =item B<-o, --overwrite>
274
275 Overwrite existing files.
276
277 =item B<-r, --root> I<rootname>
278
279 New SD file name is generated using the root: <Root>.sdf. Default new file
280 name: <InitialMOLFileName>1To<Count>.sdf.
281
282 =item B<-w, --workingdir> I<dirname>
283
284 Location of working directory. Default: current directory.
285
286 =back
287
288 =head1 EXAMPLES
289
290 To generate NewSample.sdf file from Sample*.mol files, type:
291
292 % MolFilesToSD.pl -r NewSample -o Sample*.mol
293
294 To generate NewSample.sdf with Cmpd1, Cmpd2 and so on as compound ID in
295 MolName line and Cmpd_ID datafield from Sample*.mol files, type:
296
297 % MolFilesToSD.pl -r NewSample -m both -o Sample*.mol
298
299 =head1 AUTHOR
300
301 Manish Sud <msud@san.rr.com>
302
303 =head1 SEE ALSO
304
305 InfoSDFiles.pl, SDToMolFiles.pl
306
307 =head1 COPYRIGHT
308
309 Copyright (C) 2015 Manish Sud. All rights reserved.
310
311 This file is part of MayaChemTools.
312
313 MayaChemTools is free software; you can redistribute it and/or modify it under
314 the terms of the GNU Lesser General Public License as published by the Free
315 Software Foundation; either version 3 of the License, or (at your option)
316 any later version.
317
318 =cut