comparison bin/SplitSDFiles.pl @ 0:4816e4a8ae95 draft default tip

Uploaded
author deepakjadmin
date Wed, 20 Jan 2016 09:23:18 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4816e4a8ae95
1 #!/usr/bin/perl -w
2 #
3 # $RCSfile: SplitSDFiles.pl,v $
4 # $Date: 2015/02/28 20:46:21 $
5 # $Revision: 1.36 $
6 #
7 # Author: Manish Sud <msud@san.rr.com>
8 #
9 # Copyright (C) 2015 Manish Sud. All rights reserved.
10 #
11 # This file is part of MayaChemTools.
12 #
13 # MayaChemTools is free software; you can redistribute it and/or modify it under
14 # the terms of the GNU Lesser General Public License as published by the Free
15 # Software Foundation; either version 3 of the License, or (at your option) any
16 # later version.
17 #
18 # MayaChemTools is distributed in the hope that it will be useful, but without
19 # any warranty; without even the implied warranty of merchantability of fitness
20 # for a particular purpose. See the GNU Lesser General Public License for more
21 # details.
22 #
23 # You should have received a copy of the GNU Lesser General Public License
24 # along with MayaChemTools; if not, see <http://www.gnu.org/licenses/> or
25 # write to the Free Software Foundation Inc., 59 Temple Place, Suite 330,
26 # Boston, MA, 02111-1307, USA.
27 #
28
29 use strict;
30 use FindBin; use lib "$FindBin::Bin/../lib";
31 use Getopt::Long;
32 use File::Basename;
33 use Benchmark;
34 use SDFileUtil;
35 use FileUtil;
36
# Script-wide state; these lexicals are shared with the subroutines defined
# further down in this file...
my($ScriptName, %Options, $StartTime, $EndTime, $TotalTime);

# Autoflush STDOUT
$| = 1;

# Announce the start and begin timing the run...
$ScriptName = basename($0);
print "\n$ScriptName:Starting...\n\n";
$StartTime = Benchmark->new;

# Parse the command line; show usage and stop when help is requested or no
# input file is named...
SetupScriptUsage();
if ($Options{help} || @ARGV < 1) {
  die GetUsageFromPod("$FindBin::Bin/$ScriptName");
}

my(@SDFilesList) = ExpandFileNames(\@ARGV, "sdf sd");

# Process options...
print "Processing options...\n";
my(%OptionsInfo);
ProcessOptions();

# Setup information about input files...
my(%SDFilesInfo);
print "Checking input SD file(s)...\n";
RetrieveSDFilesInfo();

# Split every input file that passed the checks above...
print "\nProcessing SD files...\n" if @SDFilesList > 1;

for my $FileIndex (0 .. $#SDFilesList) {
  next unless $SDFilesInfo{FileOkay}[$FileIndex];

  print "\nProcessing file $SDFilesList[$FileIndex]...\n";
  SplitSDFile($FileIndex);
}
print "\n$ScriptName:Done...\n\n";

# Report elapsed time...
$EndTime = Benchmark->new;
$TotalTime = timediff($EndTime, $StartTime);
print "Total time: ", timestr($TotalTime), "\n";
82
83 ###############################################################################
84
# Split the SD file at position $FileIndex in @SDFilesList, dispatching on the
# value of $OptionsInfo{Mode}: "Files" splits into a fixed number of files and
# "Cmpds" splits by number of compounds per file. Any other value does nothing.
#
sub SplitSDFile {
  my($FileIndex) = @_;
  my $Mode = $OptionsInfo{Mode};

  if ($Mode =~ /^Files$/i) {
    SplitSDFileByNumOfFiles($FileIndex);
  }
  elsif ($Mode =~ /^Cmpds$/i) {
    SplitSDFileByNumOfCmpds($FileIndex);
  }
}
97
# Split SD into specified number of files...
#
# Counts the compound records ("$$$$" terminator lines) in the input file,
# derives the number of compounds per new file as the integer part of
# <compound count> / $OptionsInfo{NumOfFiles}, and hands the actual splitting
# to SplitSDFileByNumOfFilesAndCmpds. The file is skipped with a warning when
# it can't be opened or holds fewer compounds than the requested number of files.
#
sub SplitSDFileByNumOfFiles {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally (no
  # mode or pipe characters are interpreted) and no global handle is shared.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxNumOfFiles = $OptionsInfo{NumOfFiles};

  # Count number of compounds to figure out maximum number of compound per file...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    $CmpdCount++ if $Line =~ /^\$\$\$\$/;
  }
  close $SDFileHandle;

  if ($CmpdCount < $MaxNumOfFiles) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is smaller than number of new files, $MaxNumOfFiles\n";
    return;
  }

  $MaxCmpdsPerFile = int($CmpdCount / $MaxNumOfFiles);

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
131
# Split SD into files holding $OptionsInfo{NumOfCmpds} compounds each. A value
# of exactly 1 takes the one-compound-per-file route; any other value takes the
# multiple-compounds-per-file route.
#
sub SplitSDFileByNumOfCmpds {
  my($FileIndex) = @_;
  my $OneCmpdPerFile = ($OptionsInfo{NumOfCmpds} == 1);

  if (!$OneCmpdPerFile) {
    SplitSDFileByNumOfCmpdsPerFile($FileIndex);
  }
  else {
    SplitSDFileByOneCmpdPerFile($FileIndex);
  }
}
144
# Split SD into files containing one compound per file...
#
# Every compound record read from the input file is written to its own new
# file. The root of the new file name is taken from:
#
#   . the value of data field $OptionsInfo{DataField}, for CmpdsMode DataField
#   . the first line of the compound record, for CmpdsMode MolName
#   . <OutFileRoot>Cmpd<RecordNumber> otherwise, or whenever the value above
#     is missing or empty once invalid characters have been removed
#
# Unless overwriting is enabled, a compound whose target file already exists
# is skipped with a warning. Dies when a new file can't be opened or closed.
#
sub SplitSDFileByOneCmpdPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $NewSDFile, $NewSDFileRoot, $FileExt, $OutFileRoot, $OverwriteFiles, $UseDataField, $DataFieldName, $UseMolName, $CmpdCount, $CmpdString, @CmpdLines, %DataFieldValues);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally (no
  # mode or pipe characters are interpreted) and no global handle is shared.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  print "\n";

  $CmpdCount = 0;

  $FileExt = $SDFilesInfo{FileExt}[$FileIndex];

  $OutFileRoot = $SDFilesInfo{OutFileRoot}[$FileIndex];
  $OverwriteFiles = $OptionsInfo{OverwriteFiles};

  $UseDataField = ($OptionsInfo{CmpdsMode} =~ /^DataField$/i) ? 1 : 0;
  $DataFieldName = $OptionsInfo{DataField};

  $UseMolName = ($OptionsInfo{CmpdsMode} =~ /^MolName$/i) ? 1 : 0;

  # The lexical handle is a glob reference, just like the \*SDFILE it replaces.
  CMPDSTRING: while ($CmpdString = ReadCmpdString($SDFileHandle)) {
    $CmpdCount++;

    # Setup SD file name...
    $NewSDFileRoot = '';
    if ($UseDataField) {
      @CmpdLines = split "\n", $CmpdString;
      %DataFieldValues = GetCmpdDataHeaderLabelsAndValues(\@CmpdLines);
      if (defined $DataFieldValues{$DataFieldName}) {
        $NewSDFileRoot = $DataFieldValues{$DataFieldName};
      }
    }
    elsif ($UseMolName) {
      @CmpdLines = split "\n", $CmpdString;
      $NewSDFileRoot = $CmpdLines[0] if defined $CmpdLines[0];
    }

    # Drop any invalid file name characters in data field or molname values...
    $NewSDFileRoot =~ s/[^a-zA-Z0-9_]//g;

    # Fall back plan for SD file name. Test the length rather than the truth
    # value so that a legitimate name of "0" is kept instead of being replaced...
    if (!length $NewSDFileRoot) {
      $NewSDFileRoot = "${OutFileRoot}Cmpd${CmpdCount}";
    }

    $NewSDFile = "${NewSDFileRoot}.${FileExt}";

    if (!$OverwriteFiles && -e $NewSDFile) {
      warn "Warning: Ignoring compound number, $CmpdCount, in $SDFile: New SD file, $NewSDFile, already exists\n";
      next CMPDSTRING;
    }

    # Write out new SD file...
    print "Generating $NewSDFile file\n";
    open my $NewSDFileHandle, '>', $NewSDFile or die "Error: Can't open $NewSDFile: $! \n";
    print {$NewSDFileHandle} "$CmpdString\n";

    # Buffered write errors (e.g. a full disk) only surface at close time...
    close $NewSDFileHandle or die "Error: Can't close $NewSDFile: $! \n";
  }
  close $SDFileHandle;
}
218
# Split SD into files containing specified number of compounds per file...
#
# Counts the compound records ("$$$$" terminator lines) in the input file,
# derives the number of new files needed to hold $OptionsInfo{NumOfCmpds}
# compounds each (rounding up for a partial last file), and hands the actual
# splitting to SplitSDFileByNumOfFilesAndCmpds. The file is skipped with a
# warning when it can't be opened or doesn't hold more compounds than fit in a
# single new file.
#
sub SplitSDFileByNumOfCmpdsPerFile {
  my($FileIndex) = @_;
  my($SDFile, $SDFileHandle, $CmpdCount, $MaxCmpdsPerFile, $MaxNumOfFiles);

  $SDFile = $SDFilesList[$FileIndex];

  # Three-argument open with a lexical handle: the name is taken literally (no
  # mode or pipe characters are interpreted) and no global handle is shared.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  $MaxCmpdsPerFile = $OptionsInfo{NumOfCmpds};

  # Count number of compounds to figure out maximum number of files...
  $CmpdCount = 0;
  while (my $Line = <$SDFileHandle>) {
    $CmpdCount++ if $Line =~ /^\$\$\$\$/;
  }
  close $SDFileHandle;

  # Nothing to split when everything fits into a single new file...
  if ($CmpdCount <= $MaxCmpdsPerFile) {
    warn "Warning: Ignoring file $SDFile: Total number of compounds, $CmpdCount, is <= specified number of compunds per file, $MaxCmpdsPerFile\n";
    return;
  }

  # Round up so that a partial last batch still gets its own file...
  $MaxNumOfFiles = int($CmpdCount / $MaxCmpdsPerFile);
  if (($MaxNumOfFiles * $MaxCmpdsPerFile) < $CmpdCount) {
    $MaxNumOfFiles++;
  }

  SplitSDFileByNumOfFilesAndCmpds($FileIndex, $MaxNumOfFiles, $MaxCmpdsPerFile);
}
256
# Split SD files into specified number of files with specified number of compounds
# in each file...
#
# Parameters:
#   $FileIndex         - position of the input file in @SDFilesList
#   $NumOfFiles        - number of new files to generate
#   $NumOfCmpdsPerFile - number of compounds written to each new file
#
# New files are named <OutFileRoot>Part<N>.<FileExt>. Input lines are copied
# with "\r\n" and "\r" line endings converted to "\n". After every
# $NumOfCmpdsPerFile compound terminators ("$$$$" lines) output moves on to the
# next new file; the last new file receives all remaining input. Unless
# overwriting is enabled, the whole input file is skipped with a warning when
# any of the new files already exists. Dies when a new file can't be opened
# or closed.
#
sub SplitSDFileByNumOfFilesAndCmpds {
  my($FileIndex, $NumOfFiles, $NumOfCmpdsPerFile) = @_;
  my($SDFile, $SDFileHandle, $NewSDFileHandle, $CmpdCount, $NewFileIndex, $NewFileName, $MaxCmpdsCount, @NewSDFilesList);

  $SDFile = $SDFilesList[$FileIndex];

  # Open the input exactly once, using three-argument open with a lexical
  # handle so the name is taken literally and no global handle is shared.
  if (!open $SDFileHandle, '<', $SDFile) {
    warn "Warning: Ignoring file $SDFile: Couldn't open it: $! \n";
    return;
  }

  # Setup new file names list...
  @NewSDFilesList = ();
  for my $PartNum (1 .. $NumOfFiles) {
    $NewFileName = $SDFilesInfo{OutFileRoot}[$FileIndex] . "Part${PartNum}." . $SDFilesInfo{FileExt}[$FileIndex];
    if (!$OptionsInfo{OverwriteFiles} && -e $NewFileName) {
      warn "Warning: Ignoring file $SDFile: New SD file, $NewFileName, already exists\n";
      close $SDFileHandle;
      return;
    }
    push @NewSDFilesList, $NewFileName;
  }

  # Running compound count at which output switches to the next new file...
  $MaxCmpdsCount = $NumOfCmpdsPerFile;

  $CmpdCount = 0;
  $NewFileIndex = 1;

  $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
  open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewFileName: $! \n";
  print "\nGenerating $NewFileName file\n";

  LINE: while (my $Line = <$SDFileHandle>) {
    $Line =~ s/\r\n?/\n/g;
    print {$NewSDFileHandle} $Line;

    next LINE unless $Line =~ /^\$\$\$\$/;
    $CmpdCount++;

    # The last new file takes everything that is left...
    next LINE if $NewFileIndex >= $NumOfFiles;
    next LINE if $CmpdCount < $MaxCmpdsCount;

    # Current new file is full: move on to the next one. Buffered write errors
    # (e.g. a full disk) only surface at close time...
    close $NewSDFileHandle or die "Error: Can't close $NewFileName: $! \n";

    $NewFileIndex++;
    $MaxCmpdsCount = $NumOfCmpdsPerFile * $NewFileIndex;

    $NewFileName = $NewSDFilesList[$NewFileIndex - 1];
    open $NewSDFileHandle, '>', $NewFileName or die "Error: Can't open $NewFileName: $! \n";
    print "Generating $NewFileName file...\n";
  }
  close $NewSDFileHandle or die "Error: Can't close $NewFileName: $! \n";
  close $SDFileHandle;
}
318
# Fill %SDFilesInfo with one entry per file in @SDFilesList:
#
#   FileOkay    - 1 when the file exists and is recognized as a SD file, else 0
#   FileExt     - extension of the input file, reused for the new files
#   OutFileRoot - root used to build the new file names
#
# The root given by $OptionsInfo{OutFileRoot} is honored only for a single
# input file; otherwise the input file name itself serves as the root.
#
sub RetrieveSDFilesInfo {
  %SDFilesInfo = (FileOkay => [], FileExt => [], OutFileRoot => []);

  my $UseRootOption = ($OptionsInfo{OutFileRoot} && (@SDFilesList == 1));

  for my $Index (0 .. $#SDFilesList) {
    my $SDFile = $SDFilesList[$Index];

    # Start out pessimistic; the entry is upgraded once all checks pass...
    $SDFilesInfo{FileOkay}[$Index] = 0;
    $SDFilesInfo{FileExt}[$Index] = '';
    $SDFilesInfo{OutFileRoot}[$Index] = '';

    unless (-e $SDFile) {
      warn "Warning: Ignoring file $SDFile: It doesn't exist\n";
      next;
    }
    unless (CheckFileType($SDFile, "sd sdf")) {
      warn "Warning: Ignoring file $SDFile: It's not a SD file\n";
      next;
    }

    # Setup output file root...
    my(undef, $FileName, $FileExt) = ParseFileName($SDFile);
    my $OutFileRoot = "$FileName";

    if ($UseRootOption) {
      # Use only the name part when the specified root parses into a name and
      # an extension; otherwise use the specified root as is...
      my(undef, $RootFileName, $RootFileExt) = ParseFileName($OptionsInfo{OutFileRoot});
      $OutFileRoot = ($RootFileName && $RootFileExt) ? $RootFileName : $OptionsInfo{OutFileRoot};
    }

    $SDFilesInfo{FileOkay}[$Index] = 1;
    $SDFilesInfo{FileExt}[$Index] = $FileExt;
    $SDFilesInfo{OutFileRoot}[$Index] = $OutFileRoot;
  }
}
369
# Copy the parsed command line values from %Options into %OptionsInfo under
# the key names used by the rest of the script. Dies when the combination of
# "Cmpds" mode and "DataField" compounds mode lacks a data field name.
#
sub ProcessOptions {
  %OptionsInfo = (
    Mode       => $Options{mode},
    CmpdsMode  => $Options{cmpdsmode},
    NumOfFiles => $Options{numfiles},
    NumOfCmpds => $Options{numcmpds},
    DataField  => '',
  );

  # A data field name is needed only to name files after data field values...
  my $NeedsDataField = ($Options{mode} =~ /^Cmpds$/i && $Options{cmpdsmode} =~ /^DataField$/i);
  if ($NeedsDataField) {
    $Options{datafield}
      or die "Error: You must specify a value for \"-d, --DataField\" option in \"DataField\" value of \"-c, --CmpdsMode\" during \"Cmpds\" \"-m, --mode\" value. \n";
    $OptionsInfo{DataField} = $Options{datafield};
  }

  $OptionsInfo{OverwriteFiles} = $Options{overwrite} ? 1 : 0;

  # 0 stands for "no root specified"...
  $OptionsInfo{OutFileRoot} = $Options{root} || 0;
}
393
394
# Setup script usage and retrieve command line arguments specified using various options...
#
# Seeds %Options with the default values, lets GetOptions override them from
# the command line, changes into the working directory when one is specified,
# and validates the mode, compounds mode and numeric option values. Dies with
# an explanatory message on the first invalid option.
#
sub SetupScriptUsage {

  # Retrieve all the options...
  %Options = ();

  $Options{cmpdsmode} = 'RootPrefix';
  $Options{mode} = 'Files';

  $Options{numfiles} = 2;
  $Options{numcmpds} = 1;

  if (!GetOptions(\%Options, "cmpdsmode|c=s", "datafield|d=s", "help|h", "mode|m=s", "numfiles|n=i", "numcmpds=i", "overwrite|o", "root|r=s", "workingdir|w=s")) {
    die "\nTo get a list of valid options and their values, use \"$ScriptName -h\" or\n\"perl -S $ScriptName -h\" command and try again...\n";
  }
  if ($Options{workingdir}) {
    if (! -d $Options{workingdir}) {
      die "Error: The value specified, $Options{workingdir}, for option \"-w --workingdir\" is not a directory name.\n";
    }
    chdir $Options{workingdir} or die "Error: Couldn't chdir $Options{workingdir}: $! \n";
  }
  if ($Options{cmpdsmode} !~ /^(DataField|MolName|RootPrefix)$/i) {
    die "Error: The value specified, $Options{cmpdsmode}, for option \"-c, --CmpdsMode\" is not valid. Allowed values: DataField, MolName, RootPrefix\n";
  }
  if ($Options{mode} !~ /^(Cmpds|Files)$/i) {
    die "Error: The value specified, $Options{mode}, for option \"-m, --mode\" is not valid. Allowed values: Cmpds, Files\n";
  }
  if ($Options{numfiles} < 2) {
    die "Error: The value specified, $Options{numfiles}, for option \"-n --numfiles\" is not valid. Allowed values: >= 2 \n";
  }
  if ($Options{numcmpds} < 1) {
    # --numcmpds has no short form; "-n" belongs to --numfiles...
    die "Error: The value specified, $Options{numcmpds}, for option \"--numcmpds\" is not valid. Allowed values: >= 1 \n";
  }
}
430
431 __END__
432
433 =head1 NAME
434
435 SplitSDFiles.pl - Split SDFile(s) into multiple SD files
436
437 =head1 SYNOPSIS
438
439 SplitSDFiles.pl SDFile(s)...
440
441 SplitSDFiles.pl [B<-c, --CmpdsMode> DataField | MolName | RootPrefix]
442 [B<-d, --DataField> DataFieldName] [B<-h, --help>] [B<-m, --mode> Cmpds | Files]
443 [B<-n, --numfiles> number] [B<--numcmpds> number] [B<-o, --overwrite>]
444 [B<-r, --root> rootname] [B<-w,--workingdir> dirname] SDFile(s)...
445
446 =head1 DESCRIPTION
447
448 Split I<SDFile(s)> into multiple SD files. Each new SDFile contains a compound
449 subset of similar size from the initial file. Multiple I<SDFile(s)> names are separated
450 by space. The valid file extensions are I<.sdf> and I<.sd>. All other file names are
451 ignored. All the SD files in a current directory can be specified either by I<*.sdf>
452 or the current directory name.
453
454 =head1 OPTIONS
455
456 =over 4
457
458 =item B<-c, --CmpdsMode> I<DataField | MolName | RootPrefix>
459
460 This option is only used during I<Cmpds> value of <-m, --mode> option with
461 specified B<--numcmpds> value of 1.
462
463 Specify how to generate new file names during I<Cmpds> value of <-m, --mode>
464 option: use I<SDFile(s)> datafield value or molname line for a specific compound;
465 generate a sequential ID using root prefix specified by B<-r, --root> option.
466
467 Possible values: I<DataField | MolName | RootPrefix | RootPrefix>.
468 Default: I<RootPrefix>.
469
470 For empty I<MolName> and I<DataField> values during these specified modes, file
471 name is automatically generated using I<RootPrefix>.
472
473 For I<RootPrefix> value of B<-c, --CmpdsMode> option, new file names are
474 generated using by appending compound record number to value of B<-r, --root> option.
475 For example: I<RootName>Cmd<RecordNumber>.sdf.
476
477 Allowed characters in file names are: a-zA-Z0-9_. All other characters in datafield
478 values, molname line, and root prefix are ignore during generation of file names.
479
480 =item B<-d, --DataField> I<DataFieldName>
481
482 This option is only used during I<DataField> value of <-c, --CmpdsMode> option.
483
484 Specify I<SDFile(s)> datafield label name whose value is used for generation of new file
485 for a specific compound. Default value: I<None>.
486
487 =item B<-h, --help>
488
489 Print this help message.
490
491 =item B<-m, --mode> I<Cmpds | Files>
492
493 Specify how to split I<SDFile(s)>: split into files with each file containing specified
494 number of compounds or split into a specified number of files.
495
496 Possible values: I<Cmpds | Files>. Default: I<Files>.
497
498 For I<Cmpds> value of B<-m, --mode> option, value of B<--numcmpds> option
499 determines the number of new files. And value of B<-n, --numfiles> option is
500 used to figure out the number of new files for I<Files> value of B<-m, --mode> option.
501
502 =item B<-n, --numfiles> I<number>
503
504 Number of new files to generate for each I<SDFile(s)>. Default: I<2>.
505
506 This value is only used during I<Files> value of B<-m, --mode> option.
507
508 =item B<--numcmpds> I<number>
509
510 Number of compounds in each new file corresponding to each I<SDFile(s)>.
511 Default: I<1>.
512
513 This value is only used during I<Cmpds> value of B<-m, --mode> option.
514
515 =item B<-o, --overwrite>
516
517 Overwrite existing files.
518
519 =item B<-r, --root> I<rootname>
520
521 New SD file names are generated using the root: <Root>Part<Count>.sdf.
522 Default new file names: <InitialSDFileName> Part<Count>.sdf. This option
523 is ignored for multiple input files.
524
525 =item B<-w,--workingdir> I<dirname>
526
527 Location of working directory. Default: current directory.
528
529 =back
530
531 =head1 EXAMPLES
532
533 To split each SD file into 5 new SD files, type:
534
535 % SplitSDFiles.pl -n 5 -o Sample1.sdf Sample2.sdf
536 % SplitSDFiles.pl -n 5 -o *.sdf
537
538 To split Sample1.sdf into 10 new NewSample*.sdf files, type:
539
540 % SplitSDFiles.pl -m Files -n 10 -r NewSample -o Sample1.sdf
541
542 To split Sample1.sdf into new NewSample*.sdf files containing maximum of 5 compounds
543 in each file, type:
544
545 % SplitSDFiles.pl -m Cmpds --numcmpds 5 -r NewSample -o Sample1.sdf
546
547 To split Sample1.sdf into new SD files containing one compound each with new file
548 names corresponding to molname line, type:
549
550 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c MolName -o Sample1.sdf
551
552 To split Sample1.sdf into new SD files containing one compound each with new file
553 names corresponding to value of datafield MolID, type:
554
555 % SplitSDFiles.pl -m Cmpds --numcmpds 1 -c DataField -d MolID
556 -o Sample1.sdf
557
558 =head1 AUTHOR
559
560 Manish Sud <msud@san.rr.com>
561
562 =head1 SEE ALSO
563
564 InfoSDFiles.pl, JoinSDFiles.pl, MolFilesToSD.pl, SDToMolFiles.pl
565
566 =head1 COPYRIGHT
567
568 Copyright (C) 2015 Manish Sud. All rights reserved.
569
570 This file is part of MayaChemTools.
571
572 MayaChemTools is free software; you can redistribute it and/or modify it under
573 the terms of the GNU Lesser General Public License as published by the Free
574 Software Foundation; either version 3 of the License, or (at your option)
575 any later version.
576
577 =cut