annotate bin/SplitSDFiles.pl @ 3:90ea638ce878 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:11:59 -0500
parents 2abf0d43254d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
1 #!/usr/bin/perl -w
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
2 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
3 # $RCSfile: SplitSDFiles.pl,v $
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
4 # $Date: 2015/02/28 20:46:21 $
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
5 # $Revision: 1.36 $
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
6 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
7 # Author: Manish Sud <msud@san.rr.com>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
8 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
10 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
11 # This file is part of MayaChemTools.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
12 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
14 # the terms of the GNU Lesser General Public License as published by the Free
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
15 # Software Foundation; either version 3 of the License, or (at your option) any
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
16 # later version.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
17 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
18 # MayaChemTools is distributed in the hope that it will be useful, but without
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
19 # any warranty; without even the implied warranty of merchantability or fitness
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
20 # for a particular purpose. See the GNU Lesser General Public License for more
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
21 # details.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
22 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
23 # You should have received a copy of the GNU Lesser General Public License
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
26 # Boston, MA, 02111-1307, USA.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
27 #
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
28
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
29 use strict;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
30 use FindBin; use lib "$FindBin::Bin/../lib";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
31 use Getopt::Long;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
32 use File::Basename;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
33 use Benchmark;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
34 use SDFileUtil;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
35 use FileUtil;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
36
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Script-level state. These are file-scoped lexicals referenced by name from
# the subroutines in this file, so their names must stay as they are.
# %Options is presumably populated by SetupScriptUsage() - confirm against its
# definition.
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Starting message...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";

# Direct method call instead of the indirect object form (new Benchmark),
# which perl has to guess how to parse.
$StartTime = Benchmark->new;

# Get the options and setup script...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

# Input files named on the command line, filtered/expanded by ExpandFileNames
# for the "sdf" and "sd" extensions.
my(@SDFilesList);
@SDFilesList = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Process input files..
my($FileIndex);
if (@SDFilesList > 1) {
  print "\nProcessing SD files...\n";
}
for $FileIndex (0 .. $#SDFilesList) {
  # Files which failed the checks in RetrieveSDFilesInfo are skipped silently
  # here; a warning has already been issued for them.
  if ($SDFilesInfo{FileOkay}[$FileIndex]) {
    print "\nProcessing file $SDFilesList[$FileIndex]...\n";
    SplitSDFile($FileIndex);
  }
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
82
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
83 ###############################################################################
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
84
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split a SD file...
#
# Hands the SD file at position $FileIndex of @SDFilesList over to the
# splitting routine selected by $OptionsInfo{Mode}: "Files" or "Cmpds"
# (matched case-insensitively). Nothing is done for any other mode value.
#
sub SplitSDFile {
  my($FileIndex) = @_;
  my($Mode);

  $Mode = $OptionsInfo{Mode};

  return SplitSDFileByNumOfFiles($FileIndex) if ($Mode =~ /^Files$/i);
  return SplitSDFileByNumOfCmpds($FileIndex) if ($Mode =~ /^Cmpds$/i);

  return;
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
97
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into specified number of files...
#
# Counts the compound records in the SD file at position $FileIndex of
# @SDFilesList (one record per line starting with the "$$$$" delimiter),
# derives the number of compounds per new file from $OptionsInfo{NumOfFiles}
# and passes the actual splitting on to SplitSDFileByNumOfFilesAndCmpds.
#
# The file is skipped with a warning when it can't be opened or when it
# contains fewer compounds than the requested number of files.
#
sub SplitSDFileByNumOfFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $Line, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is taken
  # literally (no mode characters or surrounding whitespace get interpreted)
  # and the handle can't collide with handles used by other routines.
  if (!open $SDFileHandle, "<", $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of compounds to figure out maximum number of compounds per file.
  # A named loop variable is used to leave the caller's $_ untouched.
  $CmpdCount = 0;
  while (defined($Line = <$SDFileHandle>)) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  if ($CmpdCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
    return;
  }

  # Integer division; compounds left over from rounding down end up in the
  # last file written by SplitSDFileByNumOfFilesAndCmpds.
  $MaxCmpdsPerFile = int($CmpdCount / $MaxNumOfFiles);

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
131
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds...
#
# Chooses the splitting routine based on $OptionsInfo{NumOfCmpds}: a value of
# one goes to the one-compound-per-file routine, anything else to the routine
# writing several compounds per file.
#
sub SplitSDFileByNumOfCmpds {
  my($FileIndex) = @_;

  return SplitSDFileByOneCmpdPerFile($FileIndex) if ($OptionsInfo{NumOfCmpds} == 1);

  return SplitSDFileByNumOfCmpdsPerFile($FileIndex);
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
144
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing one compound per file...
#
# Reads the SD file at position $FileIndex of @SDFilesList one compound at a
# time and writes each compound to its own file. The new file name root is
# taken from:
#
#   . the value of data field $OptionsInfo{DataField}, when
#     $OptionsInfo{CmpdsMode} is "DataField"
#   . the first line of the compound record, when $OptionsInfo{CmpdsMode}
#     is "MolName"
#   . <OutFileRoot>Cmpd<CmpdCount> otherwise, or whenever the value above is
#     missing or empty after removing characters other than a-z, A-Z, 0-9
#     and _
#
# Existing files are skipped with a warning unless $OptionsInfo{OverwriteFiles}
# is set. The input file is skipped with a warning when it can't be opened;
# failure to open or close a new file is fatal.
#
sub SplitSDFileByOneCmpdPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is taken
  # literally and the handle can't collide with handles used elsewhere.
  if (!open $SDFileHandle, "<", $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  print "\n";

  $CmpdCount = 0;

  $FileExt = $SDFilesInfo{FileExt}[$FileIndex];

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;

  # The lexical handle is a glob reference, just like the \*SDFILE reference
  # passed to ReadCmpdString previously.
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup SD file name...
    $NewSDFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (exists $DataFieldValues{$DataFieldName}) {
        $NewSDFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $NewSDFileRoot = $CmpdLines[0];
    }
    if (!defined $NewSDFileRoot) {
      $NewSDFileRoot = '';
    }

    # Remove any invalid file name characters in data field or molname values...
    $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for SD file name. The test is on length rather than
    # truth so that a legitimate name of "0" is not mistaken for a missing one.
    if (!length $NewSDFileRoot) {
      $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $NewSDFile = "${NewSDFileRoot}.${FileExt}";

    if (!$OverwriteFiles && -e $NewSDFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
      next CMPDSTRING;
    }

    # Write out new SD file...
    print "Generating $NewSDFile file\n";
    open my $NewSDFileHandle, ">", $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
    print $NewSDFileHandle "$CmpdString\n";

    # Buffered write errors (e.g. a full disk) only surface at close.
    close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
  }
  close $SDFileHandle;
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
218
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD into files containing specified number of compounds per file...
#
# Counts the compound records in the SD file at position $FileIndex of
# @SDFilesList (one record per line starting with the "$$$$" delimiter),
# works out how many files are needed to hold $OptionsInfo{NumOfCmpds}
# compounds each and passes the actual splitting on to
# SplitSDFileByNumOfFilesAndCmpds.
#
# The file is skipped with a warning when it can't be opened or when it
# doesn't contain more compounds than requested per file.
#
sub SplitSDFileByNumOfCmpdsPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $Line, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the file name is taken
  # literally and the handle can't collide with handles used elsewhere.
  if (!open $SDFileHandle, "<", $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};

  # Count number of compounds to figure out maximum number of files. A named
  # loop variable is used to leave the caller's $_ untouched.
  $CmpdCount = 0;
  while (defined($Line = <$SDFileHandle>)) {
    if ($Line =~ /^\$\$\$\$/) {
      $CmpdCount++;
    }
  }
  close $SDFileHandle;

  # Number of files rounded up, so that a partially filled last file is
  # accounted for.
  $MaxNumOfFiles = int($CmpdCount / $MaxCmpdsPerFile);
  if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
    $MaxNumOfFiles++;
  }

  if ($CmpdCount <= $MaxCmpdsPerFile) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
    return;
  }

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
256
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Split SD files into specified number of files with specified number of compounds
# in each file...
#
# Parameters:
#   $FileIndex         - position of the input file in @SDFilesList
#   $NumOfFiles        - number of new files to generate
#   $NumOfCmpdsPerFile - number of compounds to write to each new file
#
# New files are named <OutFileRoot>Part<N>.<FileExt> with N running from 1 to
# $NumOfFiles. Any compounds left over once the last file has received its
# share are written to that last file as well. Line endings are normalized
# to "\n" while copying.
#
# The input file is skipped with a warning when it can't be opened or when any
# of the new files already exists and $OptionsInfo{OverwriteFiles} isn't set;
# failure to open or close a new file is fatal.
#
sub SplitSDFileByNumOfFilesAndCmpds {
  my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileHandle, $Line, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);

  $SDFile = $SDFilesList[$FileIndex];

  # Open the input file once, using a three-argument open and a lexical
  # handle; it stays open until all the compounds have been copied.
  if (!open $SDFileHandle, "<", $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  # Setup new file names list...
  @NewSDFilesList = ();
  for my $PartNum (1 .. $NumOfFiles) {
    $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${PartNum}." . $SDFilesInfo{FileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles} && -e $NewFileName) {
      warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
      # Don't leave the input handle dangling on the way out.
      close $SDFileHandle;
      return;
    }
    push @NewSDFilesList, $NewFileName;
  }

  # Running compound count at which the current new file is complete...
  $MaxCmpdsCount = $NumOfCmpdsPerFile;

  $CmpdCount = 0;
  $NewFileIndex = 1;

  $NewSDFile = $NewSDFilesList[$NewFileIndex - 1];
  open $NewSDFileHandle, ">", $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
  print "\nGenerating $NewSDFile file\n";

  while (defined($Line = <$SDFileHandle>)) {
    # Convert DOS (CRLF) and bare CR line endings to "\n"...
    $Line =~ s/(\r\n)|(\r)/\n/g;
    print $NewSDFileHandle $Line;

    # Only a record delimiter can complete the current file...
    if ($Line !~ /^\$\$\$\$/) {
      next;
    }
    $CmpdCount++;

    if ($NewFileIndex <= $NumOfFiles && $CmpdCount >= $MaxCmpdsCount) {
      # The last file is deliberately left open: it collects any remaining
      # compounds and is closed after the loop.
      if ($NewFileIndex < $NumOfFiles) {
        close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
      }
      $NewFileIndex++;
      $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;

      if ($NewFileIndex <= $NumOfFiles) {
        $NewSDFile = $NewSDFilesList[$NewFileIndex - 1];
        open $NewSDFileHandle, ">", $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
        print "Generating $NewSDFile file...\n";
      }
    }
  }

  # Buffered write errors (e.g. a full disk) only surface at close.
  close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
  close $SDFileHandle;
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
318
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
# Retrieve information about SD files...
#
# Fills %SDFilesInfo with three lists indexed in parallel with @SDFilesList:
#
#   FileOkay    - 1 when the file exists and passes CheckFileType for
#                 "sd sdf"; 0 otherwise
#   FileExt     - extension reported by ParseFileName for the file
#   OutFileRoot - root used to name the new files: derived from
#                 $OptionsInfo{OutFileRoot} when that option is set and exactly
#                 one input file was specified; the input file name otherwise
#
# A warning is issued for each file which fails the checks; its FileExt and
# OutFileRoot entries stay empty.
#
sub RetrieveSDFilesInfo {
  my($SDFile, $FileName, $FileExt, $OutFileRoot, $UseOutFileRootOption);

  %SDFilesInfo = (FileOkay => [], FileExt => [], OutFileRoot => []);

  # The user supplied root is honored only for a single input file...
  $UseOutFileRootOption = ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1)) ? 1 : 0;

  FILELIST: for my $Index (0 .. $#SDFilesList) {
    $SDFile = $SDFilesList[$Index];

    # Start out assuming the file is unusable...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileExt}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next FILELIST;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next FILELIST;
    }

    # Setup output file root; the directory part of the input file name is not needed...
    (undef, $FileName, $FileExt) = ParseFileName($SDFile);

    if ($UseOutFileRootOption) {
      # Drop the extension when the specified root carries one; use the
      # specified value as is otherwise.
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }
    else {
      $OutFileRoot = "$FileName";
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileExt}[$Index] = $FileExt;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
369
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
370 # Process option values...
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
371 sub ProcessOptions {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
372 %OptionsInfo = ();
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
373
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
374 $OptionsInfo{Mode} = $Options{mode};
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
375
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
376 $OptionsInfo{CmpdsMode} = $Options{cmpdsmode};
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
377
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
378 $OptionsInfo{NumOfFiles} = $Options{numfiles};
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
379 $OptionsInfo{NumOfCmpds} = $Options{numcmpds};
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
380
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
381 $OptionsInfo{DataField} = '';
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
382 if ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
383 if (!$Options{datafield}) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
384 die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
385 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
386 $OptionsInfo{DataField} = $Options{datafield};
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
387 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
388
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
389 $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
390
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
391 $OptionsInfo{OutFileRoot} = $Options{root} ? $Options{root} : 0;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
392 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
393
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
394
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
395 # Setup script usage and retrieve command line arguments specified using various options...
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
396 sub SetupScriptUsage {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
397
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
398 # Retrieve all the options...
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
399 %Options = ();
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
400
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
401 $Options{cmpdsmode} = 'RootPrefix';
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
402 $Options{mode} = 'Files';
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
403
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
404 $Options{numfiles} = 2;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
405 $Options{numcmpds} = 1;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
406
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
407
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
408 if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
409 die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
410 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
411 if ($Options{workingdir}) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
412 if (! -d $Options{workingdir}) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
413 die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
414 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
415 chdir $Options{workingdir} or die "Error: Error: Couldn't chdir $Options{workingdir}: $! \n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
416 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
417 if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
418 die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
419 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
420 if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
421 die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
422 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
423 if ($Options{numfiles} < 2) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
424 die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
425 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
426 if ($Options{numcmpds} < 1) {
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
427 die "Error: The value specified, $Options{numcmpds}, for option \"-n --numcmpds\" is not valid. Allowed values: >= 1 \n";
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
428 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
429 }
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
430
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
431 __END__
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
432
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
433 =head1 NAME
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
434
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
435 SplitSDFiles.pl - Split SDFile(s) into multiple SD files
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
436
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
437 =head1 SYNOPSIS
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
438
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
439 SplitSDFiles.pl SDFile(s)...
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
440
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
441 SplitSDFiles.pl [B<-c, --CmpdsMode> DataField | MolName | RootPrefix]
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
442 [B<-d, --DataField> DataFieldName] [B<-h, --help>] [B<-m, --mode> Cmpds | Files]
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
443 [B<-n, --numfiles> number] [B<--numcmpds> number] [B<-o, --overwrite>]
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
444 [B<-r, --root> rootname] [B<-w, --workingdir> dirname] SDFile(s)...
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
445
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
446 =head1 DESCRIPTION
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
447
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
448 Split I<SDFile(s)> into multiple SD files. Each new SDFile contains a compound
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
449 subset of similar size from the initial file. Multiple I<SDFile(s)> names are separated
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
450 by space. The valid file extensions are I<.sdf> and I<.sd>. All other file names are
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
451 ignored. All the SD files in a current directory can be specified either by I<*.sdf>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
452 or the current directory name.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
453
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
454 =head1 OPTIONS
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
455
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
456 =over 4
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
457
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
458 =item B<-c, --CmpdsMode> I<DataField | MolName | RootPrefix>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
459
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
460 This option is only used during I<Cmpds> value of B<-m, --mode> option with
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
461 specified B<--numcmpds> value of 1.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
462
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
463 Specify how to generate new file names during I<Cmpds> value of B<-m, --mode>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
464 option: use I<SDFile(s)> datafield value or molname line for a specific compound;
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
465 generate a sequential ID using root prefix specified by B<-r, --root> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
466
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
467 Possible values: I<DataField | MolName | RootPrefix>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
468 Default: I<RootPrefix>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
469
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
470 For empty I<MolName> and I<DataField> values during these specified modes, file
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
471 name is automatically generated using I<RootPrefix>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
472
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
473 For I<RootPrefix> value of B<-c, --CmpdsMode> option, new file names are
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
474 generated by appending compound record number to value of B<-r, --root> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
475 For example: I<RootName>Cmd<RecordNumber>.sdf.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
476
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
477 Allowed characters in file names are: a-zA-Z0-9_. All other characters in datafield
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
478 values, molname line, and root prefix are ignored during generation of file names.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
479
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
480 =item B<-d, --DataField> I<DataFieldName>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
481
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
482 This option is only used during I<DataField> value of B<-c, --CmpdsMode> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
483
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
484 Specify I<SDFile(s)> datafield label name whose value is used to generate the new file name
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
485 for a specific compound. Default value: I<None>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
486
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
487 =item B<-h, --help>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
488
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
489 Print this help message.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
490
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
491 =item B<-m, --mode> I<Cmpds | Files>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
492
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
493 Specify how to split I<SDFile(s)>: split into files with each file containing specified
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
494 number of compounds or split into a specified number of files.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
495
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
496 Possible values: I<Cmpds | Files>. Default: I<Files>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
497
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
498 For I<Cmpds> value of B<-m, --mode> option, value of B<--numcmpds> option
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
499 determines the number of new files. And value of B<-n, --numfiles> option is
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
500 used to figure out the number of new files for I<Files> value of B<-m, --mode> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
501
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
502 =item B<-n, --numfiles> I<number>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
503
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
504 Number of new files to generate for each I<SDFile(s)>. Default: I<2>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
505
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
506 This value is only used during I<Files> value of B<-m, --mode> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
507
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
508 =item B<--numcmpds> I<number>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
509
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
510 Number of compounds in each new file corresponding to each I<SDFile(s)>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
511 Default: I<1>.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
512
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
513 This value is only used during I<Cmpds> value of B<-m, --mode> option.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
514
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
515 =item B<-o, --overwrite>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
516
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
517 Overwrite existing files.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
518
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
519 =item B<-r, --root> I<rootname>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
520
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
521 New SD file names are generated using the root: <Root>Part<Count>.sdf.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
522 Default new file names: <InitialSDFileName>Part<Count>.sdf. This option
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
523 is ignored for multiple input files.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
524
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
525 =item B<-w, --workingdir> I<dirname>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
526
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
527 Location of working directory. Default: current directory.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
528
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
529 =back
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
530
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
531 =head1 EXAMPLES
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
532
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
533 To split each SD file into 5 new SD files, type:
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
534
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
535 % SplitSDFiles.pl -n 5 -o Sample1.sdf Sample2.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
536 % SplitSDFiles.pl -n 5 -o *.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
537
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
538 To split Sample1.sdf into 10 new NewSample*.sdf files, type:
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
539
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
540 % SplitSDFiles.pl -m Files -n 10 -r NewSample -o Sample1.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
541
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
542 To split Sample1.sdf into new NewSample*.sdf files containing maximum of 5 compounds
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
543 in each file, type:
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
544
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
545 % SplitSDFiles.pl -m Cmpds --numcmpds 5 -r NewSample -o Sample1.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
546
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
547 To split Sample1.sdf into new SD files containing one compound each with new file
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
548 names corresponding to molname line, type:
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
549
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
550 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c MolName -o Sample1.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
551
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
552 To split Sample1.sdf into new SD files containing one compound each with new file
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
553 names corresponding to value of datafield MolID, type:
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
554
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
555 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c DataField -d MolID
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
556 -o Sample1.sdf
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
557
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
558 =head1 AUTHOR
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
559
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
560 Manish Sud <msud@san.rr.com>
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
561
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
562 =head1 SEE ALSO
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
563
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
564 InfoSDFiles.pl, JoinSDFiles.pl, MolFilesToSD.pl, SDToMolFiles.pl
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
565
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
566 =head1 COPYRIGHT
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
567
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
568 Copyright (C) 2015 Manish Sud. All rights reserved.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
569
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
570 This file is part of MayaChemTools.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
571
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
572 MayaChemTools is free software; you can redistribute it and/or modify it under
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
573 the terms of the GNU Lesser General Public License as published by the Free
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
574 Software Foundation; either version 3 of the License, or (at your option)
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
575 any later version.
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
576
2abf0d43254d Uploaded
deepakjadmin
parents:
diff changeset
577 =cut