annotate mayachemtools/bin/SplitSDFiles.pl @ 9:ab29fa5c8c1f draft default tip

Uploaded
author deepakjadmin
date Thu, 15 Dec 2016 14:18:03 -0500
parents 73ae111cf86f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
2 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: SplitSDFiles.pl,v $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:21 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.36 $
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
6 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
8 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
10 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
12 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
17 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability of fitness
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
22 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
27 #
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
28
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
33 use Benchmark;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
34 use SDFileUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
36
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# File-scoped state. %Options is read by ProcessOptions; the remaining
# variables are only used by the main flow below...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Get the options and setup script; show usage and stop when help is requested
# or no input file is specified...
SetupScriptUsage();
die GetUsageFromPod("$FindBin::Bin/$ScriptName") if ($Options{help} || @ARGV < 1);

# Input SD files named on the command line...
my @SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files, skipping the ones which didn't pass the checks performed
# by RetrieveSDFilesInfo...
my($FileIndex);
print "\nProcessing SD files...\n" if @SDFilesList > 1;
FILE: for $FileIndex (0 .. $#SDFilesList) {
  next FILE unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  SplitSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
82
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
83 ###############################################################################
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
84
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split a SD file...
#
# Dispatches on the split mode stored in $OptionsInfo{Mode}: "Files" splits
# into a specified number of files, "Cmpds" splits by number of compounds per
# file. Nothing is done for any other mode value.
#
sub SplitSDFile {
  my($FileIndex) = @_;
  my($Mode);

  $Mode = $OptionsInfo{Mode};

  if ($Mode =~ /^Files$/i) {
    return SplitSDFileByNumOfFiles($FileIndex);
  }
  if ($Mode =~ /^Cmpds$/i) {
    return SplitSDFileByNumOfCmpds($FileIndex);
  }
  return;
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
97
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into specified number of files...
#
# Counts the compounds in the input SD file (lines starting with the $$$$
# record delimiter), derives the number of compounds per new file by integer
# division and hands the actual splitting over to
# SplitSDFileByNumOfFilesAndCmpds.
#
# The file is skipped with a warning when it can't be opened or when it
# contains fewer compounds than the number of requested files.
#
sub SplitSDFileByNumOfFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is never parsed
  # for mode characters and no package-wide handle is shared between subs...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of compounds to figure out maximum number of compound per file.
  # A named loop variable is used to leave the global $_ untouched...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  if ($CmpdCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
    return;
  }

  # Integer division; SplitSDFileByNumOfFilesAndCmpds keeps writing any
  # left over compounds into the last file...
  $MaxCmpdsPerFile = int($CmpdCount / $MaxNumOfFiles);

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
131
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds...
#
# One compound per file is handled by SplitSDFileByOneCmpdPerFile; any other
# value of $OptionsInfo{NumOfCmpds} goes through SplitSDFileByNumOfCmpdsPerFile.
#
sub SplitSDFileByNumOfCmpds {
  my($FileIndex) = @_;

  return SplitSDFileByOneCmpdPerFile($FileIndex) if $OptionsInfo{NumOfCmpds} == 1;

  return SplitSDFileByNumOfCmpdsPerFile($FileIndex);
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
144
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing one compound per file...
#
# Each compound read from the input SD file is written to its own new SD file.
# The root of the new file name is taken from:
#
#   . The value of data field $OptionsInfo{DataField}, for CmpdsMode DataField
#   . The first line of the compound string, for CmpdsMode MolName
#   . <OutFileRoot>Cmpd<CmpdNumber>, otherwise or when the above is empty
#
# Characters other than a-z, A-Z, 0-9 and _ are removed from data field and
# molname values. A compound is skipped with a warning when its new file already
# exists and overwriting is not requested.
#
sub SplitSDFileByOneCmpdPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is never parsed
  # for mode characters and no package-wide handle is shared between subs...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  print "\n";

  $CmpdCount = 0;

  $FileExt = $SDFilesInfo{FileExt}[$FileIndex];

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;

  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup SD file name...
    $NewSDFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $NewSDFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $NewSDFileRoot = $CmpdLines[0];
    }
    if (!defined $NewSDFileRoot) {
      $NewSDFileRoot = '';
    }

    # Remove any invalid file name characters in data field or molname values...
    $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for SD file name. The test is against the empty string
    # instead of truth, so a value of "0" is kept as a valid name...
    if ($NewSDFileRoot eq '') {
      $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $NewSDFile = "${NewSDFileRoot}.${FileExt}";

    if (!$OverwriteFiles && -e $NewSDFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
      next CMPDSTRING;
    }

    # Write out new SD file...

    print "Generating $NewSDFile file\n";
    open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
    print {$NewSDFileHandle} "$CmpdString\n";

    # Buffered write errors show up at close time...
    close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
  }
  close $SDFileHandle;
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
218
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds per file...
#
# Counts the compounds in the input SD file (lines starting with the $$$$
# record delimiter), derives the number of new files needed to hold
# $OptionsInfo{NumOfCmpds} compounds each and hands the actual splitting over
# to SplitSDFileByNumOfFilesAndCmpds.
#
# The file is skipped with a warning when it can't be opened or when it doesn't
# contain more compounds than the specified number of compounds per file.
#
sub SplitSDFileByNumOfCmpdsPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is never parsed
  # for mode characters and no package-wide handle is shared between subs...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};

  # Count number of compounds to figure out maximum number of files. A named
  # loop variable is used to leave the global $_ untouched...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  # Nothing to split; checked before any arithmetic on the counts...
  if ($CmpdCount <= $MaxCmpdsPerFile) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
    return;
  }

  # Round up to make room for compounds left over after the integer division...
  $MaxNumOfFiles = int($CmpdCount / $MaxCmpdsPerFile);
  if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
    $MaxNumOfFiles++;
  }

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
256
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Split SD files into specified number of files with specified number of compounds
# in each file...
#
# New files are named <OutFileRoot>Part<N>.<FileExt> with N going from 1 to
# $NumOfFiles. Input lines are copied with line endings converted to \n; after
# every $NumOfCmpdsPerFile compounds ($$$$ lines) the next part file is started.
# Once the last part file is in use, all remaining compounds are written to it.
#
# The file is skipped with a warning when it can't be opened or, without
# overwrite option, when any one of the new files already exists. Failure to
# open or close a new file is fatal.
#
sub SplitSDFileByNumOfFilesAndCmpds {
  my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileHandle, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);

  $SDFile = $SDFilesList[$FileIndex];

  # Input file is opened only once, using three-argument open with a lexical
  # handle which is released on every return path...
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  # Setup new file names list...
  @NewSDFilesList = ();
  for my $PartNum (1 .. $NumOfFiles) {
    $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${PartNum}." . $SDFilesInfo{FileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles} && -e $NewFileName) {
      warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
      close $SDFileHandle;
      return;
    }
    push @NewSDFilesList, $NewFileName;
  }

  # $MaxCmpdsCount is the running compound count at which current part file is full...
  $MaxCmpdsCount = $NumOfCmpdsPerFile;

  $CmpdCount = 0;
  $NewFileIndex = 1;

  $NewSDFile = $NewSDFilesList[$NewFileIndex - 1];
  open $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
  print "\nGenerating $NewSDFile file\n";

  LINE: while (my $Line = <$SDFileHandle>) {
    $Line =~ s/(\r\n)|(\r)/\n/g;
    print {$NewSDFileHandle} $Line;

    # Only the end of a compound record may trigger a switch to the next file...
    next LINE unless $Line =~ /^\$\$\$\$/;
    $CmpdCount++;

    next LINE if $NewFileIndex > $NumOfFiles || $CmpdCount < $MaxCmpdsCount;

    # Current part file is full...
    $NewFileIndex++;
    $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;

    # Last part file stays open to collect any remaining compounds...
    if ($NewFileIndex <= $NumOfFiles) {
      close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";

      $NewSDFile = $NewSDFilesList[$NewFileIndex - 1];
      open $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
      print "Generating $NewSDFile file...\n";
    }
  }

  # Buffered write errors show up at close time...
  close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
  close $SDFileHandle;
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
318
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about SD files...
#
# Fills %SDFilesInfo with three arrays indexed like @SDFilesList:
#
#   FileOkay    - 1 for files which exist and pass CheckFileType; otherwise, 0
#   FileExt     - File extension returned by ParseFileName
#   OutFileRoot - Root used to name new files. It comes from the
#                 $OptionsInfo{OutFileRoot} value only when exactly one input
#                 file is specified; otherwise, it's the input file name.
#
# Files failing the checks are reported by a warning and keep FileOkay as 0
# along with empty FileExt and OutFileRoot values.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $FileDir, $FileName, $FileExt, $OutFileRoot);

  %SDFilesInfo = (FileOkay => [], FileExt => [], OutFileRoot => []);

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out by marking the file as unusable...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileExt}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    ($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    if ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) {
      # Use specified root without its extension, provided both name and
      # extension are present in it...
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    else {
      $OutFileRoot = "$FileName";
    }

    # File passed all the checks...
    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileExt}[$Index] = $FileExt;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
369
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
370 # Process option values...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Copy the validated command line values from %Options into %OptionsInfo.
#
# Reads:  %Options (mode, cmpdsmode, numfiles, numcmpds, datafield, overwrite, root)
# Writes: %OptionsInfo (Mode, CmpdsMode, NumOfFiles, NumOfCmpds, DataField,
#         OverwriteFiles, OutFileRoot)
# Dies:   when mode is Cmpds and cmpdsmode is DataField but no datafield
#         value has been supplied.
#
sub ProcessOptions {
  %OptionsInfo = ();

  # Values carried over unchanged...
  @OptionsInfo{qw(Mode CmpdsMode NumOfFiles NumOfCmpds)} = @Options{qw(mode cmpdsmode numfiles numcmpds)};

  # Data field name is only picked up for the Cmpds/DataField combination...
  $OptionsInfo{DataField} = '';
  my $UseDataField = ($Options{mode} =~ /^Cmpds$/i) && ($Options{cmpdsmode} =~ /^DataField$/i);
  if ($UseDataField) {
    $Options{datafield}
      or die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
    $OptionsInfo{DataField} = $Options{datafield};
  }

  # Normalize the flag to 1/0 and a missing root name to 0...
  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
393
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
394
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
395 # Setup script usage and retrieve command line arguments specified using various options...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
# Set default option values, parse the command line into %Options, and
# validate the values that were supplied.
#
# Reads:  command line arguments (parsed by GetOptions), $ScriptName
# Writes: %Options; changes the current directory when a working directory
#         is specified
# Dies:   on unparsable options, a working directory that is not a directory
#         or can't be entered, an unknown cmpdsmode or mode value,
#         numfiles < 2, or numcmpds < 1.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  # Defaults; overridden by whatever is specified on the command line...
  $Options{cmpdsmode} = 'RootPrefix';
  $Options{mode} = 'Files';

  $Options{numfiles} = 2;
  $Options{numcmpds} = 1;

  if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
  if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{numcmpds} < 1) {
    # --numcmpds has no short form; -n belongs to --numfiles...
    die "Error: The value specified, $Options{numcmpds}, for option \"--numcmpds\" is not valid. Allowed values: >= 1 \n";
  }
}
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
430
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
431 __END__
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
432
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
433 =head1 NAME
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
434
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
435 SplitSDFiles.pl - Split SDFile(s) into multiple SD files
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
436
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
437 =head1 SYNOPSIS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
438
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
439 SplitSDFiles.pl SDFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
440
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
441 SplitSDFiles.pl [B<-c, --CmpdsMode> DataField | MolName | RootPrefix]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
442 [B<-d, --DataField> DataFieldName] [B<-h, --help>] [B<-m, --mode> Cmpds | Files]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
443 [B<-n, --numfiles> number] [B<--numcmpds> number] [B<-o, --overwrite>]
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
444 [B<-r, --root> rootname] [B<-w,--workingdir> dirname] SDFile(s)...
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
445
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
446 =head1 DESCRIPTION
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
447
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
448 Split I<SDFile(s)> into multiple SD files. Each new SDFile contains a compound
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
449 subset of similar size from the initial file. Multiple I<SDFile(s)> names are separated
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
450 by space. The valid file extensions are I<.sdf> and I<.sd>. All other file names are
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
451 ignored. All the SD files in a current directory can be specified either by I<*.sdf>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
452 or the current directory name.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
453
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
454 =head1 OPTIONS
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
455
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
456 =over 4
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
457
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
458 =item B<-c, --CmpdsMode> I<DataField | MolName | RootPrefix>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
459
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
460 This option is only used during I<Cmpds> value of B<-m, --mode> option with
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
461 specified B<--numcmpds> value of 1.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
462
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
463 Specify how to generate new file names during I<Cmpds> value of B<-m, --mode>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
464 option: use I<SDFile(s)> datafield value or molname line for a specific compound;
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
465 generate a sequential ID using root prefix specified by B<-r, --root> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
466
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
467 Possible values: I<DataField | MolName | RootPrefix>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
468 Default: I<RootPrefix>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
469
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
470 For empty I<MolName> and I<DataField> values during these specified modes, file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
471 name is automatically generated using I<RootPrefix>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
472
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
473 For I<RootPrefix> value of B<-c, --CmpdsMode> option, new file names are
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
474 generated by appending compound record number to value of B<-r, --root> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
475 For example: I<RootName>Cmd<RecordNumber>.sdf.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
476
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
477 Allowed characters in file names are: a-zA-Z0-9_. All other characters in datafield
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
478 values, molname line, and root prefix are ignored during generation of file names.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
479
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
480 =item B<-d, --DataField> I<DataFieldName>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
481
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
482 This option is only used during I<DataField> value of B<-c, --CmpdsMode> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
483
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
484 Specify I<SDFile(s)> datafield label name whose value is used for generation of new file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
485 name for a specific compound. Default value: I<None>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
486
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
487 =item B<-h, --help>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
488
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
489 Print this help message.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
490
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
491 =item B<-m, --mode> I<Cmpds | Files>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
492
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
493 Specify how to split I<SDFile(s)>: split into files with each file containing specified
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
494 number of compounds or split into a specified number of files.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
495
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
496 Possible values: I<Cmpds | Files>. Default: I<Files>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
497
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
498 For I<Cmpds> value of B<-m, --mode> option, value of B<--numcmpds> option
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
499 determines the number of new files. And value of B<-n, --numfiles> option is
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
500 used to figure out the number of new files for I<Files> value of B<-m, --mode> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
501
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
502 =item B<-n, --numfiles> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
503
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
504 Number of new files to generate for each I<SDFile(s)>. Default: I<2>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
505
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
506 This value is only used during I<Files> value of B<-m, --mode> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
507
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
508 =item B<--numcmpds> I<number>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
509
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
510 Number of compounds in each new file corresponding to each I<SDFile(s)>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
511 Default: I<1>.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
512
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
513 This value is only used during I<Cmpds> value of B<-m, --mode> option.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
514
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
515 =item B<-o, --overwrite>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
516
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
517 Overwrite existing files.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
518
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
519 =item B<-r, --root> I<rootname>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
520
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
521 New SD file names are generated using the root: <Root>Part<Count>.sdf.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
522 Default new file names: <InitialSDFileName>Part<Count>.sdf. This option
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
523 is ignored for multiple input files.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
524
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
525 =item B<-w,--workingdir> I<dirname>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
526
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
527 Location of working directory. Default: current directory.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
528
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
529 =back
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
530
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
531 =head1 EXAMPLES
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
532
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
533 To split each SD file into 5 new SD files, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
534
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
535 % SplitSDFiles.pl -n 5 -o Sample1.sdf Sample2.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
536 % SplitSDFiles.pl -n 5 -o *.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
537
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
538 To split Sample1.sdf into 10 new NewSample*.sdf files, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
539
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
540 % SplitSDFiles.pl -m Files -n 10 -r NewSample -o Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
541
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
542 To split Sample1.sdf into new NewSample*.sdf files containing maximum of 5 compounds
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
543 in each file, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
544
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
545 % SplitSDFiles.pl -m Cmpds --numcmpds 5 -r NewSample -o Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
546
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
547 To split Sample1.sdf into new SD files containing one compound each with new file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
548 names corresponding to molname line, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
549
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
550 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c MolName -o Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
551
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
552 To split Sample1.sdf into new SD files containing one compound each with new file
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
553 names corresponding to value of datafield MolID, type:
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
554
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
555 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c DataField -d MolID
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
556 -o Sample1.sdf
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
557
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
558 =head1 AUTHOR
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
559
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
560 Manish Sud <msud@san.rr.com>
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
561
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
562 =head1 SEE ALSO
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
563
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
564 InfoSDFiles.pl, JoinSDFiles.pl, MolFilesToSD.pl, SDToMolFiles.pl
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
565
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
566 =head1 COPYRIGHT
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
567
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
568 Copyright (C) 2015 Manish Sud. All rights reserved.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
569
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
570 This file is part of MayaChemTools.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
571
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
572 MayaChemTools is free software; you can redistribute it and/or modify it under
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
573 the terms of the GNU Lesser General Public License as published by the Free
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
574 Software Foundation; either version 3 of the License, or (at your option)
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
575 any later version.
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
576
73ae111cf86f Uploaded
deepakjadmin
parents:
diff changeset
577 =cut