|
0
|
1 #!/usr/bin/perl
|
|
|
2
|
|
|
3 #BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}};
|
|
|
4 #use lib (@INC,$ENV{"PERL_MOD_DIR"});
|
|
|
5 #no lib "$ENV{PERL_MOD_DIR}/i686-linux";
|
|
|
6 #no lib ".";
|
|
|
7
|
|
|
8 =head1 NAME
|
|
|
9
|
|
|
10 split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files.
|
|
|
11
|
|
|
12 =head1 SYNOPSIS
|
|
|
13
|
|
|
14 USAGE: split_multifasta.pl
|
|
|
15 --input_file=/path/to/some_file.fsa
|
|
|
16 --output_dir=/path/to/somedir
|
|
|
17 [ --output_list=/path/to/somefile.list
|
|
|
18 --output_subdir_size=1000
|
|
|
19 --output_subdir_prefix=fasta
|
|
|
20 --seqs_per_file=1
|
|
|
21 --compress_output=1
|
|
|
22 ]
|
|
|
23
|
|
|
24
|
|
|
25 split_multifasta.pl --in snapdmel.aa --output_dir=./ --f=snaa --seqs_per_file=1000
|
|
|
26
|
|
|
27 =head1 OPTIONS
|
|
|
28
|
|
|
29 B<--input_file,-i>
|
|
|
30 The input multi-fasta file to split.
|
|
|
31
|
|
|
32 B<--output_dir,-o>
|
|
|
33 The directory to which the output files will be written.
|
|
|
34
|
|
|
35 B<--output_list,-s>
|
|
|
36 Write a list file containing the paths of each of the regular output files. This may be useful
|
|
|
37 for later scripts that can accept a list as input.
|
|
|
38
|
|
|
39 B<--output_file_prefix,-f>
|
|
|
40 If defined, each file created will have this string prepended to its name. This is ignored unless
|
|
|
41 writing multiple sequences to each output file using the --seqs_per_file option with a value greater
|
|
|
42 than 1, else each file created will just be a number.
|
|
|
43
|
|
|
44 B<--output_subdir_size,-u>
|
|
|
45 If defined, this script will create numbered subdirectories in the output directory, each
|
|
|
46 containing this many sequences files. Once this limit is reached, another subdirectory
|
|
|
47 is created.
|
|
|
48
|
|
|
49 B<--output_subdir_prefix,-p>
|
|
|
50 To be used along with --output_subdir_size, this allows more control of the names of the
|
|
|
51 subdirectories created. Rather than just incrementing numbers (like 10), each subdirectory
|
|
|
52 will be named with this prefix (like prefix10).
|
|
|
53
|
|
|
54 B<--compress_output,-c>
|
|
|
55 Output fasta files will be gzipped when written.
|
|
|
56
|
|
|
57 B<--debug,-d>
|
|
|
58 Debug level. Use a large number to turn on verbose debugging.
|
|
|
59
|
|
|
60 B<--log,-l>
|
|
|
61 Log file
|
|
|
62
|
|
|
63 B<--help,-h>
|
|
|
64 This help message
|
|
|
65
|
|
|
66 =head1 DESCRIPTION
|
|
|
67
|
|
|
68 This script is used to split a single FASTA file containing multiple sequences into separate
|
|
|
69 files containing one sequence each.
|
|
|
70
|
|
|
71 =head1 INPUT
|
|
|
72
|
|
|
73 The input is defined with --input_file and should be a single fasta file. File extensions are
|
|
|
74 ignored. When creating this multi-entry FASTA file, one should take care to make the first
|
|
|
75 *word* after the > symbol a unique value, as it will be used as the file name for that sequence.
|
|
|
76 For example:
|
|
|
77
|
|
|
78 >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt
|
|
|
79 ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA
|
|
|
80 AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA
|
|
|
81 AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG
|
|
|
82
|
|
|
83 >gi15387669 Corynebacterium callunae pCC1 plasmid
|
|
|
84 ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC
|
|
|
85 CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG
|
|
|
86 GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG
|
|
|
87
|
|
|
88 Whitespace is ignored within the input file. See the OUTPUT section for more on creation of
|
|
|
89 output files.
|
|
|
90
|
|
|
91 =head1 OUTPUT
|
|
|
92
|
|
|
93 The name of each output sequence file is pulled from the FASTA header of that sequence. The
|
|
|
94 first *word* after the > symbol will be used as the file name, along with the extension .fsa.
|
|
|
95 The word is defined as all the text after the > symbol up to the first whitespace.
|
|
|
96
|
|
|
97 If the above example were your input file, two files would be created:
|
|
|
98
|
|
|
99 gi53791237.fsa
|
|
|
100 gi15387669.fsa
|
|
|
101
|
|
|
102 Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an
|
|
|
103 underscore. This only occurs in the file name; the original FASTA header within the file
|
|
|
104 will be unmodified.
|
|
|
105
|
|
|
106 You can pass a path to the optional --output_list to create a text file containing the full paths
|
|
|
107 to each of the FASTA files created by this script.
|
|
|
108
|
|
|
109 Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used
|
|
|
110 on input sets that are too large to write out to one directory. This depends on the limitations
|
|
|
111 of your file system, but you usually don't want 100,000 files written in the same directory.
|
|
|
112
|
|
|
113 If you have an FASTA file containing 95000 sequences, and use the following option:
|
|
|
114
|
|
|
115 --output_dir=/some/path
|
|
|
116 --output_subdir_size=30000
|
|
|
117
|
|
|
118 The following will be created:
|
|
|
119
|
|
|
120 directory file count
|
|
|
121 ---------------------------------
|
|
|
122 /some/path/1/ 30000
|
|
|
123 /some/path/2/ 30000
|
|
|
124 /some/path/3/ 30000
|
|
|
125 /some/path/4/ 5000
|
|
|
126
|
|
|
127 If you choose to create a list file (and you probably want to), it will contain these proper paths.
|
|
|
128
|
|
|
129 You may not want the subdirectories to simply be numbers, as above, so you can use the
|
|
|
130 --output_subdir_prefix option. For example:
|
|
|
131
|
|
|
132 --output_dir=/some/path
|
|
|
133 --output_subdir_size=30000
|
|
|
134 --output_subdir_prefix=fasta
|
|
|
135
|
|
|
136 The following will be created:
|
|
|
137
|
|
|
138 directory file count
|
|
|
139 ---------------------------------
|
|
|
140 /some/path/fasta1/ 30000
|
|
|
141 /some/path/fasta2/ 30000
|
|
|
142 /some/path/fasta3/ 30000
|
|
|
143 /some/path/fasta4/ 5000
|
|
|
144
|
|
|
145 Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which
|
|
|
146 can be used along with --outupt_subdir_size and --output_subdir_prefix. The main difference to note
|
|
|
147 is that, if you use --seqs_per_file, the fasta file created will no longer be named using values
|
|
|
148 taken from the header, since it will contain multiple headers. Instead, the file will simply be
|
|
|
149 named using sequential numbers starting at 1 (like 1.fsa). For example:
|
|
|
150
|
|
|
151 --output_dir=/some/path
|
|
|
152 --output_subdir_size=3000
|
|
|
153 --output_subdir_prefix=fasta
|
|
|
154 --seqs_per_file=10
|
|
|
155
|
|
|
156 The following will be created:
|
|
|
157
|
|
|
158 directory file count
|
|
|
159 ---------------------------------
|
|
|
160 /some/path/fasta1/ 3000
|
|
|
161 /some/path/fasta2/ 3000
|
|
|
162 /some/path/fasta3/ 3000
|
|
|
163 /some/path/fasta4/ 500
|
|
|
164
|
|
|
165 =head1 CONTACT
|
|
|
166
|
|
|
167 Joshua Orvis
|
|
|
168 jorvis@tigr.org
|
|
|
169
|
|
|
170 =cut
|
|
|
171
|
|
|
172 use strict;
|
|
|
173 use Getopt::Long;
|
|
|
174 # qw(:config no_ignore_case no_auto_abbrev pass_through);
|
|
|
175 use Pod::Usage;
|
|
|
176 # BEGIN {
|
|
|
177 # use Ergatis::Logger;
|
|
|
178 # }
|
|
|
179
|
|
|
180 my %options = ();
|
|
|
181 my $results = GetOptions (\%options,
|
|
|
182 'input_file|i=s',
|
|
|
183 'output_dir|o=s',
|
|
|
184 'output_file_prefix|f=s',
|
|
|
185 'output_list|s=s',
|
|
|
186 'output_subdir_size|u=s',
|
|
|
187 'output_subdir_prefix|p=s',
|
|
|
188 'seqs_per_file|n|e=s',
|
|
|
189 'compress_output|c=s',
|
|
|
190 'log|l=s',
|
|
|
191 'debug=s',
|
|
|
192 'help|h') || pod2usage();
|
|
|
193
|
|
|
194 # my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename();
|
|
|
195 # my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile,
|
|
|
196 # 'LOG_LEVEL'=>$options{'debug'});
|
|
|
197 # $logger = $logger->get_logger();
|
|
|
198
|
|
|
199
|
|
|
200 my $logfile = $options{'log'} || "log.file";
|
|
|
201 my $logger = new logger('LOG_FILE'=>$logfile,
|
|
|
202 'LOG_LEVEL'=>$options{'debug'});
|
|
|
203
|
|
|
204 ## display documentation
|
|
|
205 if( $options{'help'} ){
|
|
|
206 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
|
|
|
207 }
|
|
|
208
|
|
|
209 ## make sure everything passed was peachy
|
|
|
210 &check_parameters(\%options);
|
|
|
211
|
|
|
212 ## open the list file if one was passed
|
|
|
213 my $listfh;
|
|
|
214 if (defined $options{output_list}) {
|
|
|
215 open($listfh, ">$options{output_list}") || $logger->logdie("couldn't create $options{output_list} list file");
|
|
|
216 }
|
|
|
217
|
|
|
218 my $first = 1;
|
|
|
219 my $seq = '';
|
|
|
220 my $header;
|
|
|
221
|
|
|
222 my $sfh;
|
|
|
223
|
|
|
224 ## load the sequence file
|
|
|
225 if ($options{'input_file'} =~ /\.(gz|gzip)$/) {
|
|
|
226 open ($sfh, "<:gzip", $options{'input_file'})
|
|
|
227 || $logger->logdie("can't open sequence file:\n$!");
|
|
|
228 } else {
|
|
|
229 open ($sfh, "<$options{'input_file'}")
|
|
|
230 || $logger->logdie("can't open sequence file:\n$!");
|
|
|
231 }
|
|
|
232
|
|
|
233 my $sub_dir = 1;
|
|
|
234 my $seq_file_count = 0;
|
|
|
235
|
|
|
236 ## keep track of how many sequences are in the current output file
|
|
|
237 my $seqs_in_file = 0;
|
|
|
238 my $group_filename_prefix = 1;
|
|
|
239
|
|
|
240 ## holds the output file handle
|
|
|
241 my $ofh;
|
|
|
242
|
|
|
243 while (<$sfh>) {
|
|
|
244 ## if we find a header line ...
|
|
|
245 if (/^\>(.*)/) {
|
|
|
246
|
|
|
247 ## write the previous sequence before continuing with this one
|
|
|
248 unless ($first) {
|
|
|
249 &writeSequence(\$header, \$seq);
|
|
|
250
|
|
|
251 ## reset the sequence
|
|
|
252 $seq = '';
|
|
|
253 }
|
|
|
254
|
|
|
255 $first = 0;
|
|
|
256 $header = $1;
|
|
|
257
|
|
|
258 ## else we've found a sequence line
|
|
|
259 } else {
|
|
|
260 ## skip it if it is just whitespace
|
|
|
261 next if (/^\s*$/);
|
|
|
262
|
|
|
263 ## record this portion of the sequence
|
|
|
264 $seq .= $_;
|
|
|
265 }
|
|
|
266 }
|
|
|
267
|
|
|
268 ## don't forget the last sequence
|
|
|
269 &writeSequence(\$header, \$seq);
|
|
|
270
|
|
|
271 exit;
|
|
|
272
|
|
|
273 sub check_parameters {
|
|
|
274 my $options = shift;
|
|
|
275
|
|
|
276 ## make sure input_file and output_dir were passed
|
|
|
277 unless ( $options{input_file} && $options{output_dir} ) {
|
|
|
278 $logger->logdie("You must pass both --input_file and --output_dir");
|
|
|
279 }
|
|
|
280
|
|
|
281 ## make sure input_file exists
|
|
|
282 if (! -e $options{input_file} ) {
|
|
|
283 if ( -e "$options{input_file}.gz" ) {
|
|
|
284 $options{input_file} .= '.gz';
|
|
|
285 } else {
|
|
|
286 $logger->logdie("the input file passed ($options{input_file}) cannot be read or does not exist");
|
|
|
287 }
|
|
|
288 }
|
|
|
289
|
|
|
290 ## make sure the output_dir exists
|
|
|
291 if (! -e "$options{output_dir}") {
|
|
|
292 $logger->logdie("the output directory passed could not be read or does not exist");
|
|
|
293 }
|
|
|
294
|
|
|
295 ## seqs_per_file, if passed, must be at least one
|
|
|
296 if (defined $options{seqs_per_file} && $options{seqs_per_file} < 1) {
|
|
|
297 $logger->logdie("seq_per_file setting cannot be less than one");
|
|
|
298 }
|
|
|
299
|
|
|
300 ## handle some defaults
|
|
|
301 $options{output_subdir_size} = 0 unless ($options{output_subdir_size});
|
|
|
302 $options{output_subdir_prefix} = '' unless ($options{output_subdir_prefix});
|
|
|
303 $options{seqs_per_file} = 1 unless ($options{seqs_per_file});
|
|
|
304 $options{output_file_prefix} = '' unless ($options{output_file_prefix});
|
|
|
305 }
|
|
|
306
|
|
|
307 sub writeSequence {
|
|
|
308 my ($header, $seq) = @_;
|
|
|
309
|
|
|
310 ## the id used to write the output file will be the first thing
|
|
|
311 ## in the header up to the first whitespace. get that.
|
|
|
312 $$header =~ /^(\S+)/ || $logger->logdie( "can't pull out an id on header $$header" );
|
|
|
313 my $id = $1;
|
|
|
314
|
|
|
315 ## because it is going to be the filename, we're going to take out the characters that are bad form to use
|
|
|
316 ## legal characters = a-z A-Z 0-9 - . _
|
|
|
317 $id =~ s/[^a-z0-9\-_.]/_/gi;
|
|
|
318
|
|
|
319 my $dirpath;
|
|
|
320
|
|
|
321 ## if we're writing more than one sequence to a file, change the id from
|
|
|
322 ## fasta header to the current group file name
|
|
|
323 if ($options{seqs_per_file} > 1) {
|
|
|
324 $id = $group_filename_prefix;
|
|
|
325
|
|
|
326 ## did the user ask for a file prefix?
|
|
|
327 if ( $options{output_file_prefix} ) {
|
|
|
328 $id = $options{output_file_prefix} . $id;
|
|
|
329 }
|
|
|
330 }
|
|
|
331
|
|
|
332
|
|
|
333 ## the path depends on whether we are using output subdirectories
|
|
|
334 if ($options{output_subdir_size}) {
|
|
|
335 $dirpath = "$options{'output_dir'}/$options{output_subdir_prefix}$sub_dir";
|
|
|
336 } else {
|
|
|
337 $dirpath = "$options{'output_dir'}";
|
|
|
338 }
|
|
|
339
|
|
|
340 ## did the user ask for a file prefix?
|
|
|
341 my $filepath = "$dirpath/$id.fsa";
|
|
|
342
|
|
|
343 ## take any // out of the filepath
|
|
|
344 $filepath =~ s|/+|/|g;
|
|
|
345
|
|
|
346 ## write the sequence
|
|
|
347 $logger->debug("Writing sequence to $filepath") if ($logger->is_debug());
|
|
|
348
|
|
|
349 ## open a new output file if we need to
|
|
|
350 ## if we're writing multiple sequences per file, we only open a new
|
|
|
351 ## one when $seqs_in_file = 0 (first sequence)
|
|
|
352 if ($seqs_in_file == 0) {
|
|
|
353
|
|
|
354 ## if the directory we want to write to doesn't exist yet, create it
|
|
|
355 mkdir($dirpath) unless (-e $dirpath);
|
|
|
356
|
|
|
357
|
|
|
358 if ($options{'compress_output'}) {
|
|
|
359 open ($ofh, ">:gzip", $filepath.".gz")
|
|
|
360 || $logger->logdie("can't create '$filepath.gz':\n$!");
|
|
|
361 } else {
|
|
|
362 open ($ofh, ">$filepath") || $logger->logdie("can't create '$filepath':\n$!");
|
|
|
363
|
|
|
364 }
|
|
|
365 $seq_file_count++;
|
|
|
366
|
|
|
367 ## add the file we just wrote to the list, if we were asked to
|
|
|
368 if (defined $options{output_list}) {
|
|
|
369 print $listfh "$filepath\n";
|
|
|
370 }
|
|
|
371 }
|
|
|
372
|
|
|
373 ## if we're doing output subdirs and hit our size limit, increment to the next dir
|
|
|
374 if ($options{output_subdir_size} && $options{output_subdir_size} == $seq_file_count) {
|
|
|
375 $seq_file_count = 0;
|
|
|
376 $sub_dir++;
|
|
|
377 }
|
|
|
378
|
|
|
379 ## write the sequence
|
|
|
380 print $ofh ">$$header\n$$seq\n";
|
|
|
381 $seqs_in_file++;
|
|
|
382
|
|
|
383 ## if we hit the limit of how many we want in each file, set the next file name and
|
|
|
384 ## reset the count of seqs within the file
|
|
|
385 if ($options{seqs_per_file} == $seqs_in_file) {
|
|
|
386 $seqs_in_file = 0;
|
|
|
387 $group_filename_prefix++;
|
|
|
388 }
|
|
|
389 }
|
|
|
390
|
|
|
391
|
|
|
392 package logger;
|
|
|
393
|
|
|
394 sub new {
|
|
|
395 my $packname= shift;
|
|
|
396 my %args= @_;
|
|
|
397 my $self= \%args;
|
|
|
398 bless($self,$packname);
|
|
|
399 return $self;
|
|
|
400 }
|
|
|
401
|
|
|
402 sub get_logger {
|
|
|
403 my $self= shift;
|
|
|
404 return $self;
|
|
|
405 }
|
|
|
406
|
|
|
407 sub logdie {
|
|
|
408 my $self= shift;
|
|
|
409 die @_;
|
|
|
410 }
|
|
|
411
|
|
|
412 sub debug {
|
|
|
413 my $self= shift;
|
|
|
414 warn @_;
|
|
|
415 }
|
|
|
416
|
|
|
417 sub is_debug {
|
|
|
418 shift->{LOG_LEVEL} || 0;
|
|
|
419 }
|
|
|
420
|
|
|
421
|
|
|
422 1;
|