annotate mayachemtool/mayachemtools/bin/SplitSDFiles.pl @ 0:68300206e90d draft default tip

Uploaded
author deepakjadmin
date Thu, 05 Nov 2015 02:41:30 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
2 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: SplitSDFiles.pl,v $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:21 $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.36 $
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
6 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
8 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
10 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
12 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
17 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability or fitness
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
22 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
27 #
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
28
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
33 use Benchmark;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
34 use SDFileUtil;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
36
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# File-scoped state shared with the subroutines defined further down.
# %Options is expected to be filled in by SetupScriptUsage(), which is not
# defined in this part of the file - TODO confirm.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";

# Explicit method call rather than the indirect-object form ("new Benchmark"),
# whose parsing depends on which subs happen to be in scope.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input files named on the command line; ExpandFileNames() is given the
# accepted extensions "sdf sd".
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files; entries that failed the checks in RetrieveSDFilesInfo()
# have FileOkay set to 0 and are skipped here...
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    SplitSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
82
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
83 ###############################################################################
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
84
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split a SD file using the strategy selected by the mode option...
#
# $FileIndex is the position of the input file in @SDFilesList. A mode of
# "Files" (any letter case) splits into a fixed number of files, a mode of
# "Cmpds" splits by number of compounds per file; any other mode does nothing.
#
sub SplitSDFile {
  my($FileIndex) = @_;
  my $Mode = $OptionsInfo{Mode};

  return SplitSDFileByNumOfFiles($FileIndex) if $Mode =~ /^Files$/i;
  return SplitSDFileByNumOfCmpds($FileIndex) if $Mode =~ /^Cmpds$/i;

  return;
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
97
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into specified number of files...
#
# Counts the compounds in the input file (one per line starting with "$$$$"),
# derives the number of compounds per new file from $OptionsInfo{NumOfFiles}
# and passes the actual writing on to SplitSDFileByNumOfFilesAndCmpds().
#
# $FileIndex is the position of the input file in @SDFilesList. The file is
# skipped with a warning when it can't be opened or when it contains fewer
# compounds than the requested number of files.
#
sub SplitSDFileByNumOfFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $Line, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally, so
  # leading/trailing whitespace or mode characters such as ">" and "|" in a
  # file name can't change how the file is opened.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of compounds to figure out maximum number of compound per file...
  $CmpdCount = 0;
  while (defined($Line = <$SDFileHandle>)) {
    $CmpdCount++ if $Line =~ /^\$\$\$\$/;
  }
  close $SDFileHandle;

  if ($CmpdCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
    return;
  }

  # Integer division: the per-file count is rounded down.
  $MaxCmpdsPerFile = int($CmpdCount / $MaxNumOfFiles);

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
131
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds...
#
# $FileIndex is the position of the input file in @SDFilesList. When exactly
# one compound per file is requested the one-compound splitter is used;
# every other count goes to the compounds-per-file splitter.
#
sub SplitSDFileByNumOfCmpds {
  my($FileIndex) = @_;

  my $Splitter = ($OptionsInfo{NumOfCmpds} == 1)
    ? \&SplitSDFileByOneCmpdPerFile
    : \&SplitSDFileByNumOfCmpdsPerFile;

  return $Splitter->($FileIndex);
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
144
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing one compound per file...
#
# $FileIndex is the position of the input file in @SDFilesList.
#
# The name of each new file is taken, depending on $OptionsInfo{CmpdsMode},
# from a compound data field ("DataField" mode) or from the first line of the
# compound record ("MolName" mode), with every character outside
# [a-zA-Z0-9_] removed. When that yields nothing usable the name falls back
# to <OutFileRoot>Cmpd<compound number>. Existing files are skipped with a
# warning unless overwriting is enabled.
#
# Dies when a new file can't be opened or written.
#
sub SplitSDFileByOneCmpdPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally, so
  # whitespace or mode characters in a file name can't alter the open mode.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  print "\n";

  $CmpdCount = 0;

  $FileExt = $SDFilesInfo{FileExt}[$FileIndex];

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;

  # ReadCmpdString() is defined outside this file section; it is handed the
  # file handle and presumably returns one compound record per call and a
  # false value at end of file - TODO confirm.
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup SD file name...
    $NewSDFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $NewSDFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $NewSDFileRoot = $CmpdLines[0];
    }

    # Remove any invalid file name characters in data field or molname values...
    if ($NewSDFileRoot) {
      $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;
    }

    # Fall back plan for SD file name...
    if (!$NewSDFileRoot) {
      $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $NewSDFile = "${NewSDFileRoot}.${FileExt}";

    if (!$OverwriteFiles && -e $NewSDFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
      next CMPDSTRING;
    }

    # Write out new SD file...
    print "Generating $NewSDFile file\n";
    open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
    print {$NewSDFileHandle} "$CmpdString\n";

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $NewSDFileHandle or die "Error: Can't write $NewSDFile: $! \n";
  }
  close $SDFileHandle;
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
218
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds per file...
#
# Counts the compounds in the input file (one per line starting with "$$$$"),
# derives the number of new files needed for $OptionsInfo{NumOfCmpds}
# compounds per file (rounding up so a partial last file is included) and
# passes the actual writing on to SplitSDFileByNumOfFilesAndCmpds().
#
# $FileIndex is the position of the input file in @SDFilesList. The file is
# skipped with a warning when it can't be opened or when it doesn't contain
# more compounds than fit into a single new file.
#
sub SplitSDFileByNumOfCmpdsPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $Line, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally, so
  # leading/trailing whitespace or mode characters such as ">" and "|" in a
  # file name can't change how the file is opened.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};

  # Count number of compounds to figure out maximum number of files...
  $CmpdCount = 0;
  while (defined($Line = <$SDFileHandle>)) {
    $CmpdCount++ if $Line =~ /^\$\$\$\$/;
  }
  close $SDFileHandle;

  # Ceiling of $CmpdCount / $MaxCmpdsPerFile...
  $MaxNumOfFiles = int($CmpdCount / $MaxCmpdsPerFile);
  if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
    $MaxNumOfFiles++;
  }

  if ($CmpdCount <= $MaxCmpdsPerFile) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
    return;
  }

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
256
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD files into specified number of files with specified number of compounds
# in each file...
#
# $FileIndex         - position of the input file in @SDFilesList.
# $NumOfFiles        - number of new files to generate.
# $NumOfCmpdsPerFile - number of compounds written to each new file.
#
# New files are named <OutFileRoot>Part<N>.<FileExt>. Unless overwriting is
# enabled, the input file is skipped with a warning as soon as one of the new
# file names already exists. Line endings are normalized to "\n" on output.
# Once the last new file has been started nothing switches files any more, so
# any remaining input lines are appended to that last file.
#
# Dies when a new file can't be opened or written.
#
sub SplitSDFileByNumOfFilesAndCmpds {
  my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
  my($SDFile, $SDFileHandle, $NewSDFileHandle, $Line, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);

  $SDFile = $SDFilesList[$FileIndex];

  # The input is opened once, with a three-argument open and a lexical handle,
  # so the file name is taken literally whatever characters it contains.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  # Setup new file names list...
  @NewSDFilesList = ();
  for my $PartNum (1 .. $NumOfFiles) {
    $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${PartNum}." . $SDFilesInfo{FileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles} && -e $NewFileName) {
      warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
      # Don't leave the input handle open on the early exit.
      close $SDFileHandle;
      return;
    }
    push @NewSDFilesList, $NewFileName;
  }

  # $MaxCmpdsCount is the running compound total at which the current new
  # file is complete.
  $MaxCmpdsCount = $NumOfCmpdsPerFile;

  $CmpdCount = 0;
  $NewFileIndex = 1;

  # $NewFileName tracks the file currently open for writing.
  $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
  open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewSDFilesList[$NewFileIndex -1]: $! \n";
  print "\nGenerating $NewSDFilesList[$NewFileIndex - 1] file\n";

  LINE: while (defined($Line = <$SDFileHandle>)) {
    $Line =~ s/(\r\n)|(\r)/\n/g;
    print {$NewSDFileHandle} $Line;

    # Only a compound terminator line can trigger a switch to the next file.
    next LINE unless $Line =~ /^\$\$\$\$/;
    $CmpdCount++;

    next LINE if $NewFileIndex > $NumOfFiles;
    next LINE if $CmpdCount < $MaxCmpdsCount;

    # The last file stays open so that trailing input still has somewhere to go.
    if ($NewFileIndex < $NumOfFiles) {
      # Buffered write errors only surface at close time.
      close $NewSDFileHandle or die "Error: Can't write $NewFileName: $! \n";
    }
    $NewFileIndex++;
    $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;

    if ($NewFileIndex <= $NumOfFiles) {
      $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
      open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewSDFilesList[$NewFileIndex - 1]: $! \n";
      print "Generating $NewSDFilesList[$NewFileIndex - 1] file...\n";
    }
  }
  close $NewSDFileHandle or die "Error: Can't write $NewFileName: $! \n";
  close $SDFileHandle;
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
318
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about SD files...
#
# Rebuilds %SDFilesInfo with three arrays indexed like @SDFilesList:
#
#   FileOkay    - 1 when the file exists and passes CheckFileType(), else 0
#   FileExt     - extension reported by ParseFileName() for the input file
#   OutFileRoot - root used to name the new files
#
# The root option from %OptionsInfo is honored only when a single input file
# was given; otherwise the name of the input file itself is used as the root.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], FileExt => [], OutFileRoot => []);

  # Neither operand changes inside the loop, so evaluate the test once.
  my $UseRootOption = ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1));

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Start from "not usable"; the entry is upgraded once all checks pass.
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileExt}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root...
    my($FileDir, $FileName, $FileExt) = ParseFileName($SDFile);

    my $OutFileRoot = "$FileName";
    if ($UseRootOption) {
      my($RootFileDir, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});

      # A root given with an extension contributes only its name part; a root
      # without one is used as specified.
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileExt}[$Index] = $FileExt;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
369
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Process option values...
#
# Copies the command line values held in %Options into %OptionsInfo, the hash
# consulted by the rest of the script. Both hashes are declared outside this
# sub. Takes no arguments and returns nothing useful.
#
# Dies when mode is Cmpds and cmpdsmode is DataField but no data field label
# was supplied through the "-d, --DataField" option.
#
sub ProcessOptions {
  %OptionsInfo = ();

  $OptionsInfo{Mode} = $Options{mode};

  $OptionsInfo{CmpdsMode} = $Options{cmpdsmode};

  $OptionsInfo{NumOfFiles} = $Options{numfiles};
  $OptionsInfo{NumOfCmpds} = $Options{numcmpds};

  # Data field label is needed only for naming files in Cmpds/DataField mode...
  $OptionsInfo{DataField} = '';
  if ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i) {
    # Check for a non-empty string rather than truthiness: a label of "0" is a
    # specified value and must not be reported as missing...
    if (!(defined($Options{datafield}) && length($Options{datafield}))) {
      die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
    }
    $OptionsInfo{DataField} = $Options{datafield};
  }

  # Normalize the overwrite flag to 1 or 0...
  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;

  # 0 marks "no root specified"...
  $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
393
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
394
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
# Setup script usage and retrieve command line arguments specified using various options...
#
# Resets %Options (declared outside this sub) to the default option values,
# lets GetOptions() overlay whatever was given on the command line, changes
# into the working directory when one is requested, and validates the values.
# Takes no arguments.
#
# Dies with an explanatory message when the command line cannot be parsed, the
# working directory is unusable, or an option value is out of range.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  # Defaults; GetOptions() only overwrites keys actually present on the command line...
  $Options{cmpdsmode} = 'RootPrefix';
  $Options{mode} = 'Files';

  $Options{numfiles} = 2;
  $Options{numcmpds} = 1;

  if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
  if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{numcmpds} < 1) {
    # "--numcmpds" has no short form; "-n" belongs to "--numfiles"...
    die "Error: The value specified, $Options{numcmpds}, for option \"--numcmpds\" is not valid. Allowed values: >= 1 \n";
  }
}
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
430
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
431 __END__
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
432
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
433 =head1 NAME
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
434
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
435 SplitSDFiles.pl - Split SDFile(s) into multiple SD files
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
436
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
437 =head1 SYNOPSIS
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
438
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
439 SplitSDFiles.pl SDFile(s)...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
440
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
441 SplitSDFiles.pl [B<-c, --CmpdsMode> DataField | MolName | RootPrefix]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
442 [B<-d, --DataField> DataFieldName] [B<-h, --help>] [B<-m, --mode> Cmpds | Files]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
443 [B<-n, --numfiles> number] [B<--numcmpds> number] [B<-o, --overwrite>]
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
444 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] SDFile(s)...
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
445
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
446 =head1 DESCRIPTION
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
447
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
448 Split I<SDFile(s)> into multiple SD files. Each new SDFile contains a compound
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
449 subset of similar size from the initial file. Multiple I<SDFile(s)> names are separated
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
450 by space. The valid file extensions are I<.sdf> and I<.sd>. All other file names are
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
451 ignored. All the SD files in the current directory can be specified either by I<*.sdf>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
452 or the current directory name.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
453
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
454 =head1 OPTIONS
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
455
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
456 =over 4
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
457
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
458 =item B<-c, --CmpdsMode> I<DataField | MolName | RootPrefix>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
459
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
460 This option is only used during I<Cmpds> value of B<-m, --mode> option with
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
461 specified B<--numcmpds> value of 1.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
462
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
463 Specify how to generate new file names during I<Cmpds> value of B<-m, --mode>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
464 option: use I<SDFile(s)> datafield value or molname line for a specific compound;
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
465 generate a sequential ID using root prefix specified by B<-r, --root> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
466
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
467 Possible values: I<DataField | MolName | RootPrefix>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
468 Default: I<RootPrefix>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
469
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
470 For empty I<MolName> and I<DataField> values during these specified modes, file
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
471 name is automatically generated using I<RootPrefix>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
472
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
473 For I<RootPrefix> value of B<-c, --CmpdsMode> option, new file names are
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
474 generated by appending compound record number to value of B<-r, --root> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
475 For example: I<RootName>Cmd<RecordNumber>.sdf.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
476
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
477 Allowed characters in file names are: a-zA-Z0-9_. All other characters in datafield
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
478 values, molname line, and root prefix are ignored during generation of file names.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
479
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
480 =item B<-d, --DataField> I<DataFieldName>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
481
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
482 This option is only used during I<DataField> value of B<-c, --CmpdsMode> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
483
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
484 Specify I<SDFile(s)> datafield label name whose value is used for generation of new file name
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
485 for a specific compound. Default value: I<None>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
486
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
487 =item B<-h, --help>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
488
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
489 Print this help message.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
490
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
491 =item B<-m, --mode> I<Cmpds | Files>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
492
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
493 Specify how to split I<SDFile(s)>: split into files with each file containing specified
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
494 number of compounds or split into a specified number of files.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
495
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
496 Possible values: I<Cmpds | Files>. Default: I<Files>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
497
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
498 For I<Cmpds> value of B<-m, --mode> option, value of B<--numcmpds> option
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
499 determines the number of new files. And value of B<-n, --numfiles> option is
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
500 used to figure out the number of new files for I<Files> value of B<-m, --mode> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
501
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
502 =item B<-n, --numfiles> I<number>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
503
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
504 Number of new files to generate for each I<SDFile(s)>. Default: I<2>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
505
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
506 This value is only used during I<Files> value of B<-m, --mode> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
507
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
508 =item B<--numcmpds> I<number>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
509
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
510 Number of compounds in each new file corresponding to each I<SDFile(s)>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
511 Default: I<1>.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
512
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
513 This value is only used during I<Cmpds> value of B<-m, --mode> option.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
514
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
515 =item B<-o, --overwrite>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
516
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
517 Overwrite existing files.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
518
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
519 =item B<-r, --root> I<rootname>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
520
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
521 New SD file names are generated using the root: <Root>Part<Count>.sdf.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
522 Default new file names: <InitialSDFileName>Part<Count>.sdf. This option
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
523 is ignored for multiple input files.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
524
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
525 =item B<-w, --workingdir> I<dirname>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
526
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
527 Location of working directory. Default: current directory.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
528
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
529 =back
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
530
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
531 =head1 EXAMPLES
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
532
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
533 To split each SD file into 5 new SD files, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
534
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
535 % SplitSDFiles.pl -n 5 -o Sample1.sdf Sample2.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
536 % SplitSDFiles.pl -n 5 -o *.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
537
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
538 To split Sample1.sdf into 10 new NewSample*.sdf files, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
539
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
540 % SplitSDFiles.pl -m Files -n 10 -r NewSample -o Sample1.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
541
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
542 To split Sample1.sdf into new NewSample*.sdf files containing maximum of 5 compounds
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
543 in each file, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
544
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
545 % SplitSDFiles.pl -m Cmpds --numcmpds 5 -r NewSample -o Sample1.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
546
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
547 To split Sample1.sdf into new SD files containing one compound each with new file
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
548 names corresponding to molname line, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
549
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
550 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c MolName -o Sample1.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
551
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
552 To split Sample1.sdf into new SD files containing one compound each with new file
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
553 names corresponding to value of datafield MolID, type:
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
554
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
555 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c DataField -d MolID
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
556 -o Sample1.sdf
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
557
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
558 =head1 AUTHOR
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
559
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
560 Manish Sud <msud@san.rr.com>
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
561
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
562 =head1 SEE ALSO
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
563
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
564 InfoSDFiles.pl, JoinSDFiles.pl, MolFilesToSD.pl, SDToMolFiles.pl
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
565
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
566 =head1 COPYRIGHT
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
567
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
568 Copyright (C) 2015 Manish Sud. All rights reserved.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
569
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
570 This file is part of MayaChemTools.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
571
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
572 MayaChemTools is free software; you can redistribute it and/or modify it under
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
573 the terms of the GNU Lesser General Public License as published by the Free
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
574 Software Foundation; either version 3 of the License, or (at your option)
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
575 any later version.
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
576
68300206e90d Uploaded
deepakjadmin
parents:
diff changeset
577 =cut