comparison split_multifasta.pl @ 0:efd5c022b54d draft

planemo upload
author mingchen0919
date Mon, 09 Apr 2018 12:27:49 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:efd5c022b54d
1 #!/usr/bin/perl
2
3 #BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}};
4 #use lib (@INC,$ENV{"PERL_MOD_DIR"});
5 #no lib "$ENV{PERL_MOD_DIR}/i686-linux";
6 #no lib ".";
7
8 =head1 NAME
9
10 split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files.
11
12 =head1 SYNOPSIS
13
14 USAGE: split_multifasta.pl
15 --input_file=/path/to/some_file.fsa
16 --output_dir=/path/to/somedir
17 [ --output_list=/path/to/somefile.list
18 --output_subdir_size=1000
19 --output_subdir_prefix=fasta
20 --seqs_per_file=1
21 --compress_output=1
22 ]
23
24
25 split_multifasta.pl --in snapdmel.aa --output_dir=./ --f=snaa --seqs_per_file=1000
26
27 =head1 OPTIONS
28
29 B<--input_file,-i>
30 The input multi-fasta file to split.
31
32 B<--output_dir,-o>
33 The directory to which the output files will be written.
34
35 B<--output_list,-s>
36 Write a list file containing the paths of each of the regular output files. This may be useful
37 for later scripts that can accept a list as input.
38
39 B<--output_file_prefix,-f>
40 If defined, each file created will have this string prepended to its name. This is ignored unless
41 writing multiple sequences to each output file using the --seqs_per_file option with a value greater
42 than 1, else each file created will just be a number.
43
44 B<--output_subdir_size,-u>
45 If defined, this script will create numbered subdirectories in the output directory, each
46 containing this many sequences files. Once this limit is reached, another subdirectory
47 is created.
48
49 B<--output_subdir_prefix,-p>
50 To be used along with --output_subdir_size, this allows more control of the names of the
51 subdirectories created. Rather than just incrementing numbers (like 10), each subdirectory
52 will be named with this prefix (like prefix10).
53
54 B<--compress_output,-c>
55 Output fasta files will be gzipped when written.
56
57 B<--debug,-d>
58 Debug level. Use a large number to turn on verbose debugging.
59
60 B<--log,-l>
61 Log file
62
63 B<--help,-h>
64 This help message
65
66 =head1 DESCRIPTION
67
68 This script is used to split a single FASTA file containing multiple sequences into separate
69 files containing one sequence each.
70
71 =head1 INPUT
72
73 The input is defined with --input_file and should be a single fasta file. File extensions are
74 ignored. When creating this multi-entry FASTA file, one should take care to make the first
75 *word* after the > symbol a unique value, as it will be used as the file name for that sequence.
76 For example:
77
78 >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt
79 ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA
80 AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA
81 AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG
82
83 >gi15387669 Corynebacterium callunae pCC1 plasmid
84 ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC
85 CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG
86 GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG
87
88 Whitespace is ignored within the input file. See the OUTPUT section for more on creation of
89 output files.
90
91 =head1 OUTPUT
92
93 The name of each output sequence file is pulled from the FASTA header of that sequence. The
94 first *word* after the > symbol will be used as the file name, along with the extension .fsa.
95 The word is defined as all the text after the > symbol up to the first whitespace.
96
97 If the above example were your input file, two files would be created:
98
99 gi53791237.fsa
100 gi15387669.fsa
101
102 Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an
103 underscore. This only occurs in the file name; the original FASTA header within the file
104 will be unmodified.
105
106 You can pass a path to the optional --output_list to create a text file containing the full paths
107 to each of the FASTA files created by this script.
108
109 Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used
110 on input sets that are too large to write out to one directory. This depends on the limitations
111 of your file system, but you usually don't want 100,000 files written in the same directory.
112
113 If you have an FASTA file containing 95000 sequences, and use the following option:
114
115 --output_dir=/some/path
116 --output_subdir_size=30000
117
118 The following will be created:
119
120 directory file count
121 ---------------------------------
122 /some/path/1/ 30000
123 /some/path/2/ 30000
124 /some/path/3/ 30000
125 /some/path/4/ 5000
126
127 If you choose to create a list file (and you probably want to), it will contain these proper paths.
128
129 You may not want the subdirectories to simply be numbers, as above, so you can use the
130 --output_subdir_prefix option. For example:
131
132 --output_dir=/some/path
133 --output_subdir_size=30000
134 --output_subdir_prefix=fasta
135
136 The following will be created:
137
138 directory file count
139 ---------------------------------
140 /some/path/fasta1/ 30000
141 /some/path/fasta2/ 30000
142 /some/path/fasta3/ 30000
143 /some/path/fasta4/ 5000
144
145 Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which
146 can be used along with --outupt_subdir_size and --output_subdir_prefix. The main difference to note
147 is that, if you use --seqs_per_file, the fasta file created will no longer be named using values
148 taken from the header, since it will contain multiple headers. Instead, the file will simply be
149 named using sequential numbers starting at 1 (like 1.fsa). For example:
150
151 --output_dir=/some/path
152 --output_subdir_size=3000
153 --output_subdir_prefix=fasta
154 --seqs_per_file=10
155
156 The following will be created:
157
158 directory file count
159 ---------------------------------
160 /some/path/fasta1/ 3000
161 /some/path/fasta2/ 3000
162 /some/path/fasta3/ 3000
163 /some/path/fasta4/ 500
164
165 =head1 CONTACT
166
167 Joshua Orvis
168 jorvis@tigr.org
169
170 =cut
171
172 use strict;
173 use Getopt::Long;
174 # qw(:config no_ignore_case no_auto_abbrev pass_through);
175 use Pod::Usage;
176 # BEGIN {
177 # use Ergatis::Logger;
178 # }
179
180 my %options = ();
181 my $results = GetOptions (\%options,
182 'input_file|i=s',
183 'output_dir|o=s',
184 'output_file_prefix|f=s',
185 'output_list|s=s',
186 'output_subdir_size|u=s',
187 'output_subdir_prefix|p=s',
188 'seqs_per_file|n|e=s',
189 'compress_output|c=s',
190 'log|l=s',
191 'debug=s',
192 'help|h') || pod2usage();
193
194 # my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename();
195 # my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile,
196 # 'LOG_LEVEL'=>$options{'debug'});
197 # $logger = $logger->get_logger();
198
199
200 my $logfile = $options{'log'} || "log.file";
201 my $logger = new logger('LOG_FILE'=>$logfile,
202 'LOG_LEVEL'=>$options{'debug'});
203
204 ## display documentation
205 if( $options{'help'} ){
206 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
207 }
208
209 ## make sure everything passed was peachy
210 &check_parameters(\%options);
211
212 ## open the list file if one was passed
213 my $listfh;
214 if (defined $options{output_list}) {
215 open($listfh, ">$options{output_list}") || $logger->logdie("couldn't create $options{output_list} list file");
216 }
217
218 my $first = 1;
219 my $seq = '';
220 my $header;
221
222 my $sfh;
223
224 ## load the sequence file
225 if ($options{'input_file'} =~ /\.(gz|gzip)$/) {
226 open ($sfh, "<:gzip", $options{'input_file'})
227 || $logger->logdie("can't open sequence file:\n$!");
228 } else {
229 open ($sfh, "<$options{'input_file'}")
230 || $logger->logdie("can't open sequence file:\n$!");
231 }
232
233 my $sub_dir = 1;
234 my $seq_file_count = 0;
235
236 ## keep track of how many sequences are in the current output file
237 my $seqs_in_file = 0;
238 my $group_filename_prefix = 1;
239
240 ## holds the output file handle
241 my $ofh;
242
243 while (<$sfh>) {
244 ## if we find a header line ...
245 if (/^\>(.*)/) {
246
247 ## write the previous sequence before continuing with this one
248 unless ($first) {
249 &writeSequence(\$header, \$seq);
250
251 ## reset the sequence
252 $seq = '';
253 }
254
255 $first = 0;
256 $header = $1;
257
258 ## else we've found a sequence line
259 } else {
260 ## skip it if it is just whitespace
261 next if (/^\s*$/);
262
263 ## record this portion of the sequence
264 $seq .= $_;
265 }
266 }
267
268 ## don't forget the last sequence
269 &writeSequence(\$header, \$seq);
270
271 exit;
272
273 sub check_parameters {
274 my $options = shift;
275
276 ## make sure input_file and output_dir were passed
277 unless ( $options{input_file} && $options{output_dir} ) {
278 $logger->logdie("You must pass both --input_file and --output_dir");
279 }
280
281 ## make sure input_file exists
282 if (! -e $options{input_file} ) {
283 if ( -e "$options{input_file}.gz" ) {
284 $options{input_file} .= '.gz';
285 } else {
286 $logger->logdie("the input file passed ($options{input_file}) cannot be read or does not exist");
287 }
288 }
289
290 ## make sure the output_dir exists
291 if (! -e "$options{output_dir}") {
292 $logger->logdie("the output directory passed could not be read or does not exist");
293 }
294
295 ## seqs_per_file, if passed, must be at least one
296 if (defined $options{seqs_per_file} && $options{seqs_per_file} < 1) {
297 $logger->logdie("seq_per_file setting cannot be less than one");
298 }
299
300 ## handle some defaults
301 $options{output_subdir_size} = 0 unless ($options{output_subdir_size});
302 $options{output_subdir_prefix} = '' unless ($options{output_subdir_prefix});
303 $options{seqs_per_file} = 1 unless ($options{seqs_per_file});
304 $options{output_file_prefix} = '' unless ($options{output_file_prefix});
305 }
306
307 sub writeSequence {
308 my ($header, $seq) = @_;
309
310 ## the id used to write the output file will be the first thing
311 ## in the header up to the first whitespace. get that.
312 $$header =~ /^(\S+)/ || $logger->logdie( "can't pull out an id on header $$header" );
313 my $id = $1;
314
315 ## because it is going to be the filename, we're going to take out the characters that are bad form to use
316 ## legal characters = a-z A-Z 0-9 - . _
317 $id =~ s/[^a-z0-9\-_.]/_/gi;
318
319 my $dirpath;
320
321 ## if we're writing more than one sequence to a file, change the id from
322 ## fasta header to the current group file name
323 if ($options{seqs_per_file} > 1) {
324 $id = $group_filename_prefix;
325
326 ## did the user ask for a file prefix?
327 if ( $options{output_file_prefix} ) {
328 $id = $options{output_file_prefix} . $id;
329 }
330 }
331
332
333 ## the path depends on whether we are using output subdirectories
334 if ($options{output_subdir_size}) {
335 $dirpath = "$options{'output_dir'}/$options{output_subdir_prefix}$sub_dir";
336 } else {
337 $dirpath = "$options{'output_dir'}";
338 }
339
340 ## did the user ask for a file prefix?
341 my $filepath = "$dirpath/$id.fsa";
342
343 ## take any // out of the filepath
344 $filepath =~ s|/+|/|g;
345
346 ## write the sequence
347 $logger->debug("Writing sequence to $filepath") if ($logger->is_debug());
348
349 ## open a new output file if we need to
350 ## if we're writing multiple sequences per file, we only open a new
351 ## one when $seqs_in_file = 0 (first sequence)
352 if ($seqs_in_file == 0) {
353
354 ## if the directory we want to write to doesn't exist yet, create it
355 mkdir($dirpath) unless (-e $dirpath);
356
357
358 if ($options{'compress_output'}) {
359 open ($ofh, ">:gzip", $filepath.".gz")
360 || $logger->logdie("can't create '$filepath.gz':\n$!");
361 } else {
362 open ($ofh, ">$filepath") || $logger->logdie("can't create '$filepath':\n$!");
363
364 }
365 $seq_file_count++;
366
367 ## add the file we just wrote to the list, if we were asked to
368 if (defined $options{output_list}) {
369 print $listfh "$filepath\n";
370 }
371 }
372
373 ## if we're doing output subdirs and hit our size limit, increment to the next dir
374 if ($options{output_subdir_size} && $options{output_subdir_size} == $seq_file_count) {
375 $seq_file_count = 0;
376 $sub_dir++;
377 }
378
379 ## write the sequence
380 print $ofh ">$$header\n$$seq\n";
381 $seqs_in_file++;
382
383 ## if we hit the limit of how many we want in each file, set the next file name and
384 ## reset the count of seqs within the file
385 if ($options{seqs_per_file} == $seqs_in_file) {
386 $seqs_in_file = 0;
387 $group_filename_prefix++;
388 }
389 }
390
391
392 package logger;
393
394 sub new {
395 my $packname= shift;
396 my %args= @_;
397 my $self= \%args;
398 bless($self,$packname);
399 return $self;
400 }
401
402 sub get_logger {
403 my $self= shift;
404 return $self;
405 }
406
407 sub logdie {
408 my $self= shift;
409 die @_;
410 }
411
412 sub debug {
413 my $self= shift;
414 warn @_;
415 }
416
417 sub is_debug {
418 shift->{LOG_LEVEL} || 0;
419 }
420
421
422 1;