annotate split_multifasta.pl @ 1:8ef62ca3938b draft default tip

initial tool
author mingchen0919
date Mon, 09 Apr 2018 12:30:44 -0400
parents efd5c022b54d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
1 #!/usr/bin/perl
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
2
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
3 #BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}};
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
4 #use lib (@INC,$ENV{"PERL_MOD_DIR"});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
5 #no lib "$ENV{PERL_MOD_DIR}/i686-linux";
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
6 #no lib ".";
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
7
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
8 =head1 NAME
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
9
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
10 split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
11
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
12 =head1 SYNOPSIS
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
13
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
14 USAGE: split_multifasta.pl
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
15 --input_file=/path/to/some_file.fsa
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
16 --output_dir=/path/to/somedir
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
17 [ --output_list=/path/to/somefile.list
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
18 --output_subdir_size=1000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
19 --output_subdir_prefix=fasta
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
20 --seqs_per_file=1
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
21 --compress_output=1
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
22 ]
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
23
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
24
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
25 split_multifasta.pl --in snapdmel.aa --output_dir=./ --f=snaa --seqs_per_file=1000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
26
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
27 =head1 OPTIONS
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
28
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
29 B<--input_file,-i>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
30 The input multi-fasta file to split.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
31
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
32 B<--output_dir,-o>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
33 The directory to which the output files will be written.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
34
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
35 B<--output_list,-s>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
36 Write a list file containing the paths of each of the regular output files. This may be useful
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
37 for later scripts that can accept a list as input.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
38
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
39 B<--output_file_prefix,-f>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
40 If defined, each file created will have this string prepended to its name. This is ignored unless
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
41 writing multiple sequences to each output file using the --seqs_per_file option with a value greater
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
42 than 1. Without a prefix, each such multi-sequence file is named with just a sequential number.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
43
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
44 B<--output_subdir_size,-u>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
45 If defined, this script will create numbered subdirectories in the output directory, each
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
46 containing this many sequence files. Once this limit is reached, another subdirectory
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
47 is created.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
48
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
49 B<--output_subdir_prefix,-p>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
50 To be used along with --output_subdir_size, this allows more control of the names of the
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
51 subdirectories created. Rather than just incrementing numbers (like 10), each subdirectory
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
52 will be named with this prefix (like prefix10).
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
53
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
54 B<--compress_output,-c>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
55 Output fasta files will be gzipped when written.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
56
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
57 B<--debug,-d>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
58 Debug level. Use a large number to turn on verbose debugging.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
59
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
60 B<--log,-l>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
61 Log file
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
62
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
63 B<--help,-h>
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
64 This help message
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
65
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
66 =head1 DESCRIPTION
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
67
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
68 This script is used to split a single FASTA file containing multiple sequences into separate
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
69 files containing one sequence each (or several sequences each, when --seqs_per_file is greater than 1).
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
70
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
71 =head1 INPUT
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
72
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
73 The input is defined with --input_file and should be a single fasta file. File extensions are
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
74 ignored. When creating this multi-entry FASTA file, one should take care to make the first
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
75 *word* after the > symbol a unique value, as it will be used as the file name for that sequence.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
76 For example:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
77
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
78 >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
79 ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
80 AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
81 AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
82
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
83 >gi15387669 Corynebacterium callunae pCC1 plasmid
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
84 ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
85 CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
86 GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
87
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
88 Whitespace is ignored within the input file. See the OUTPUT section for more on creation of
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
89 output files.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
90
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
91 =head1 OUTPUT
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
92
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
93 The name of each output sequence file is pulled from the FASTA header of that sequence. The
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
94 first *word* after the > symbol will be used as the file name, along with the extension .fsa.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
95 The word is defined as all the text after the > symbol up to the first whitespace.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
96
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
97 If the above example were your input file, two files would be created:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
98
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
99 gi53791237.fsa
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
100 gi15387669.fsa
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
101
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
102 Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
103 underscore. This only occurs in the file name; the original FASTA header within the file
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
104 will be unmodified.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
105
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
106 You can pass a path to the optional --output_list to create a text file containing the full paths
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
107 to each of the FASTA files created by this script.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
108
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
109 Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
110 on input sets that are too large to write out to one directory. This depends on the limitations
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
111 of your file system, but you usually don't want 100,000 files written in the same directory.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
112
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
113 If you have a FASTA file containing 95000 sequences, and use the following options:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
114
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
115 --output_dir=/some/path
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
116 --output_subdir_size=30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
117
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
118 The following will be created:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
119
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
120 directory file count
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
121 ---------------------------------
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
122 /some/path/1/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
123 /some/path/2/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
124 /some/path/3/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
125 /some/path/4/ 5000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
126
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
127 If you choose to create a list file (and you probably want to), it will contain these proper paths.
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
128
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
129 You may not want the subdirectories to simply be numbers, as above, so you can use the
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
130 --output_subdir_prefix option. For example:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
131
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
132 --output_dir=/some/path
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
133 --output_subdir_size=30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
134 --output_subdir_prefix=fasta
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
135
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
136 The following will be created:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
137
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
138 directory file count
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
139 ---------------------------------
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
140 /some/path/fasta1/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
141 /some/path/fasta2/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
142 /some/path/fasta3/ 30000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
143 /some/path/fasta4/ 5000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
144
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
145 Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
146 can be used along with --output_subdir_size and --output_subdir_prefix. The main difference to note
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
147 is that, if you use --seqs_per_file, the fasta file created will no longer be named using values
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
148 taken from the header, since it will contain multiple headers. Instead, the file will simply be
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
149 named using sequential numbers starting at 1 (like 1.fsa). For example:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
150
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
151 --output_dir=/some/path
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
152 --output_subdir_size=3000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
153 --output_subdir_prefix=fasta
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
154 --seqs_per_file=10
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
155
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
156 The following will be created:
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
157
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
158 directory file count
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
159 ---------------------------------
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
160 /some/path/fasta1/ 3000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
161 /some/path/fasta2/ 3000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
162 /some/path/fasta3/ 3000
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
163 /some/path/fasta4/ 500
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
164
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
165 =head1 CONTACT
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
166
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
167 Joshua Orvis
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
168 jorvis@tigr.org
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
169
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
170 =cut
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
171
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
172 use strict;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
173 use Getopt::Long;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
174 # qw(:config no_ignore_case no_auto_abbrev pass_through);
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
175 use Pod::Usage;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
176 # BEGIN {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
177 # use Ergatis::Logger;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
178 # }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
179
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
180 my %options = ();
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
181 my $results = GetOptions (\%options,
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
182 'input_file|i=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
183 'output_dir|o=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
184 'output_file_prefix|f=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
185 'output_list|s=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
186 'output_subdir_size|u=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
187 'output_subdir_prefix|p=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
188 'seqs_per_file|n|e=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
189 'compress_output|c=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
190 'log|l=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
191 'debug=s',
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
192 'help|h') || pod2usage();
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
193
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
194 # my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename();
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
195 # my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile,
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
196 # 'LOG_LEVEL'=>$options{'debug'});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
197 # $logger = $logger->get_logger();
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
198
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
199
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
200 my $logfile = $options{'log'} || "log.file";
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
201 my $logger = new logger('LOG_FILE'=>$logfile,
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
202 'LOG_LEVEL'=>$options{'debug'});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
203
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
204 ## display documentation
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
205 if( $options{'help'} ){
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
206 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
207 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
208
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
209 ## make sure everything passed was peachy
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
210 &check_parameters(\%options);
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
211
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
212 ## open the list file if one was passed
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
213 my $listfh;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
214 if (defined $options{output_list}) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
215 open($listfh, ">$options{output_list}") || $logger->logdie("couldn't create $options{output_list} list file");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
216 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
217
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
218 my $first = 1;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
219 my $seq = '';
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
220 my $header;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
221
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
222 my $sfh;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
223
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
224 ## load the sequence file
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
225 if ($options{'input_file'} =~ /\.(gz|gzip)$/) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
226 open ($sfh, "<:gzip", $options{'input_file'})
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
227 || $logger->logdie("can't open sequence file:\n$!");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
228 } else {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
229 open ($sfh, "<$options{'input_file'}")
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
230 || $logger->logdie("can't open sequence file:\n$!");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
231 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
232
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
233 my $sub_dir = 1;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
234 my $seq_file_count = 0;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
235
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
236 ## keep track of how many sequences are in the current output file
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
237 my $seqs_in_file = 0;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
238 my $group_filename_prefix = 1;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
239
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
240 ## holds the output file handle
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
241 my $ofh;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
242
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
243 while (<$sfh>) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
244 ## if we find a header line ...
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
245 if (/^\>(.*)/) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
246
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
247 ## write the previous sequence before continuing with this one
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
248 unless ($first) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
249 &writeSequence(\$header, \$seq);
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
250
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
251 ## reset the sequence
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
252 $seq = '';
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
253 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
254
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
255 $first = 0;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
256 $header = $1;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
257
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
258 ## else we've found a sequence line
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
259 } else {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
260 ## skip it if it is just whitespace
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
261 next if (/^\s*$/);
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
262
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
263 ## record this portion of the sequence
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
264 $seq .= $_;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
265 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
266 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
267
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
268 ## don't forget the last sequence
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
269 &writeSequence(\$header, \$seq);
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
270
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
271 exit;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
272
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
273 sub check_parameters {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
274 my $options = shift;
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
275
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
276 ## make sure input_file and output_dir were passed
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
277 unless ( $options{input_file} && $options{output_dir} ) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
278 $logger->logdie("You must pass both --input_file and --output_dir");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
279 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
280
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
281 ## make sure input_file exists
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
282 if (! -e $options{input_file} ) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
283 if ( -e "$options{input_file}.gz" ) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
284 $options{input_file} .= '.gz';
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
285 } else {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
286 $logger->logdie("the input file passed ($options{input_file}) cannot be read or does not exist");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
287 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
288 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
289
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
290 ## make sure the output_dir exists
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
291 if (! -e "$options{output_dir}") {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
292 $logger->logdie("the output directory passed could not be read or does not exist");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
293 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
294
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
295 ## seqs_per_file, if passed, must be at least one
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
296 if (defined $options{seqs_per_file} && $options{seqs_per_file} < 1) {
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
297 $logger->logdie("seq_per_file setting cannot be less than one");
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
298 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
299
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
300 ## handle some defaults
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
301 $options{output_subdir_size} = 0 unless ($options{output_subdir_size});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
302 $options{output_subdir_prefix} = '' unless ($options{output_subdir_prefix});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
303 $options{seqs_per_file} = 1 unless ($options{seqs_per_file});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
304 $options{output_file_prefix} = '' unless ($options{output_file_prefix});
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
305 }
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
306
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
## writeSequence( \$header, \$seq )
##
## Writes one FASTA record to the current output file, first opening a new
## output file when no sequence has been written to the current one yet.
##
##   $header - reference to the header text; the leading '>' is added here
##   $seq    - reference to the sequence text, printed exactly as passed
##
## Reads %options and updates the file-level state $ofh, $seqs_in_file,
## $seq_file_count, $sub_dir and $group_filename_prefix.  Dies through
## $logger->logdie() when no id can be taken from the header, when the
## output directory cannot be created, or when the output file cannot be
## opened.
sub writeSequence {
    my ($header, $seq) = @_;

    ## the id used to write the output file will be the first thing
    ##  in the header up to the first whitespace.  get that.
    my ($id) = $$header =~ /^(\S+)/;
    $logger->logdie( "can't pull out an id on header $$header" ) unless (defined $id);

    ## because it is going to be the filename, we're going to take out the
    ##  characters that are bad form to use
    ## legal characters = a-z A-Z 0-9 - . _
    $id =~ s/[^a-z0-9\-_.]/_/gi;

    ## if we're writing more than one sequence to a file, change the id from
    ##  fasta header to the current group file name
    if ($options{seqs_per_file} > 1) {
        $id = $group_filename_prefix;

        ## the file prefix is only honored for grouped output files
        if ( $options{output_file_prefix} ) {
            $id = $options{output_file_prefix} . $id;
        }
    }

    ## the path depends on whether we are using output subdirectories
    my $dirpath;
    if ($options{output_subdir_size}) {
        $dirpath = "$options{'output_dir'}/$options{output_subdir_prefix}$sub_dir";
    } else {
        $dirpath = "$options{'output_dir'}";
    }

    my $filepath = "$dirpath/$id.fsa";

    ## take any // out of the filepath
    $filepath =~ s|/+|/|g;

    $logger->debug("Writing sequence to $filepath") if ($logger->is_debug());

    ## open a new output file if we need to
    ##  if we're writing multiple sequences per file, we only open a new
    ##  one when $seqs_in_file = 0 (first sequence)
    if ($seqs_in_file == 0) {

        ## if the directory we want to write to doesn't exist yet, create it.
        ##  a failed mkdir is only fatal when the directory is still missing
        ##  afterwards, so losing a race with another creator is harmless.
        unless (-d $dirpath) {
            unless (mkdir($dirpath)) {
                my $mkdir_error = $!;
                $logger->logdie("can't create directory '$dirpath':\n$mkdir_error")
                    unless (-d $dirpath);
            }
        }

        if ($options{'compress_output'}) {
            ## NOTE(review): the :gzip layer is presumably supplied by a
            ##  module loaded elsewhere in this file (e.g. PerlIO::gzip) - confirm
            open ($ofh, ">:gzip", "$filepath.gz")
                || $logger->logdie("can't create '$filepath.gz':\n$!");
        } else {
            ## three-argument open keeps the mode separate from the path, so
            ##  unusual characters in the path can't be read as part of the mode
            open ($ofh, ">", $filepath)
                || $logger->logdie("can't create '$filepath':\n$!");
        }
        $seq_file_count++;

        ## add the file we just opened to the list, if we were asked to
        ## NOTE(review): the path is listed without the .gz suffix even when
        ##  compress_output is set - confirm list consumers expect that
        if (defined $options{output_list}) {
            print $listfh "$filepath\n";
        }
    }

    ## if we're doing output subdirs and hit our size limit, increment to the next dir
    if ($options{output_subdir_size} && $options{output_subdir_size} == $seq_file_count) {
        $seq_file_count = 0;
        $sub_dir++;
    }

    ## write the sequence
    print $ofh ">$$header\n$$seq\n";
    $seqs_in_file++;

    ## if we hit the limit of how many we want in each file, set the next file name and
    ##  reset the count of seqs within the file
    if ($options{seqs_per_file} == $seqs_in_file) {
        $seqs_in_file = 0;
        $group_filename_prefix++;
    }
}
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
390
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
391
efd5c022b54d planemo upload
mingchen0919
parents:
diff changeset
package logger;

## Minimal logger object: a blessed hash of whatever key/value settings were
## handed to new().  The only key it reads itself is LOG_LEVEL.

## new( %settings ) - build a logger holding the given settings.
sub new {
    my ($class, %settings) = @_;
    return bless(\%settings, $class);
}

## get_logger() - hand back the invocant unchanged.
sub get_logger {
    my ($invocant) = @_;
    return $invocant;
}

## logdie( @message ) - die with the given message parts.
sub logdie {
    my ($invocant, @message) = @_;
    die @message;
}

## debug( @message ) - emit the given message parts with warn.
sub debug {
    my ($invocant, @message) = @_;
    warn @message;
}

## is_debug() - the LOG_LEVEL setting when it is true, otherwise 0.
sub is_debug {
    my ($invocant) = @_;
    return $invocant->{LOG_LEVEL} || 0;
}


1;