Mercurial > repos > mingchen0919 > split_multifasta
comparison split_multifasta.pl @ 0:efd5c022b54d draft
planemo upload
| author | mingchen0919 |
|---|---|
| date | Mon, 09 Apr 2018 12:27:49 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:efd5c022b54d |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 #BEGIN{foreach (@INC) {s/\/usr\/local\/packages/\/local\/platform/}}; | |
| 4 #use lib (@INC,$ENV{"PERL_MOD_DIR"}); | |
| 5 #no lib "$ENV{PERL_MOD_DIR}/i686-linux"; | |
| 6 #no lib "."; | |
| 7 | |
| 8 =head1 NAME | |
| 9 | |
| 10 split_multifasta.pl - split a single FASTA file containing multiple sequences into separate files. | |
| 11 | |
| 12 =head1 SYNOPSIS | |
| 13 | |
| 14 USAGE: split_multifasta.pl | |
| 15 --input_file=/path/to/some_file.fsa | |
| 16 --output_dir=/path/to/somedir | |
| 17 [ --output_list=/path/to/somefile.list | |
| 18 --output_subdir_size=1000 | |
| 19 --output_subdir_prefix=fasta | |
| 20 --seqs_per_file=1 | |
| 21 --compress_output=1 | |
| 22 ] | |
| 23 | |
| 24 | |
| 25 split_multifasta.pl --in snapdmel.aa --output_dir=./ --f=snaa --seqs_per_file=1000 | |
| 26 | |
| 27 =head1 OPTIONS | |
| 28 | |
| 29 B<--input_file,-i> | |
| 30 The input multi-fasta file to split. | |
| 31 | |
| 32 B<--output_dir,-o> | |
| 33 The directory to which the output files will be written. | |
| 34 | |
| 35 B<--output_list,-s> | |
| 36 Write a list file containing the paths of each of the regular output files. This may be useful | |
| 37 for later scripts that can accept a list as input. | |
| 38 | |
| 39 B<--output_file_prefix,-f> | |
| 40 If defined, each file created will have this string prepended to its name. This is ignored unless | |
| 41 writing multiple sequences to each output file using the --seqs_per_file option with a value greater | |
| 42 than 1, else each file created will just be a number. | |
| 43 | |
| 44 B<--output_subdir_size,-u> | |
| 45 If defined, this script will create numbered subdirectories in the output directory, each | |
| 46 containing this many sequences files. Once this limit is reached, another subdirectory | |
| 47 is created. | |
| 48 | |
| 49 B<--output_subdir_prefix,-p> | |
| 50 To be used along with --output_subdir_size, this allows more control of the names of the | |
| 51 subdirectories created. Rather than just incrementing numbers (like 10), each subdirectory | |
| 52 will be named with this prefix (like prefix10). | |
| 53 | |
| 54 B<--compress_output,-c> | |
| 55 Output fasta files will be gzipped when written. | |
| 56 | |
| 57 B<--debug,-d> | |
| 58 Debug level. Use a large number to turn on verbose debugging. | |
| 59 | |
| 60 B<--log,-l> | |
| 61 Log file | |
| 62 | |
| 63 B<--help,-h> | |
| 64 This help message | |
| 65 | |
| 66 =head1 DESCRIPTION | |
| 67 | |
| 68 This script is used to split a single FASTA file containing multiple sequences into separate | |
| 69 files containing one sequence each. | |
| 70 | |
| 71 =head1 INPUT | |
| 72 | |
| 73 The input is defined with --input_file and should be a single fasta file. File extensions are | |
| 74 ignored. When creating this multi-entry FASTA file, one should take care to make the first | |
| 75 *word* after the > symbol a unique value, as it will be used as the file name for that sequence. | |
| 76 For example: | |
| 77 | |
| 78 >gi53791237 Tragulus javanicus p97bcnt gene for p97Bcnt | |
| 79 ACAGGAGAAGAGACTGAAGAGACACGTTCAGGAGAAGAGCAAGAGAAGCCTAAAGAAATGCAAGAAGTTA | |
| 80 AACTCACCAAATCACTTGTTGAAGAAGTCAGGTAACATGACATTCACAAACTTCAAAACTAGTTCTTTAA | |
| 81 AAAGGAACATCTCTCTTTTAATATGTATGCATTATTAATTTATTTACTCATTGGCGTGGAGGAGGAAATG | |
| 82 | |
| 83 >gi15387669 Corynebacterium callunae pCC1 plasmid | |
| 84 ATGCATGCTAGTGTGGTGAGTATGAGCACACACATTCATGGGCACCGCCGGGGTGCAGGGGGGCTTGCCC | |
| 85 CTTGTCCATGCGGGGTGTGGGGCTTGCCCCGCCGATAGAGACCGGCCACCACCATGGCACCCGGTCGCGG | |
| 86 GGTGATCGGCCACCACCACCGCCCCCGGCCACTCTCCCCCTGTCTAGGCCATATTTCAGGCCGTCCACTG | |
| 87 | |
| 88 Whitespace is ignored within the input file. See the OUTPUT section for more on creation of | |
| 89 output files. | |
| 90 | |
| 91 =head1 OUTPUT | |
| 92 | |
| 93 The name of each output sequence file is pulled from the FASTA header of that sequence. The | |
| 94 first *word* after the > symbol will be used as the file name, along with the extension .fsa. | |
| 95 The word is defined as all the text after the > symbol up to the first whitespace. | |
| 96 | |
| 97 If the above example were your input file, two files would be created: | |
| 98 | |
| 99 gi53791237.fsa | |
| 100 gi15387669.fsa | |
| 101 | |
| 102 Any characters other than a-z A-Z 0-9 . _ - in the ID will be changed into an | |
| 103 underscore. This only occurs in the file name; the original FASTA header within the file | |
| 104 will be unmodified. | |
| 105 | |
| 106 You can pass a path to the optional --output_list to create a text file containing the full paths | |
| 107 to each of the FASTA files created by this script. | |
| 108 | |
| 109 Two other optional arguments, --output_subdir_size and --output_subdir_prefix, can be used | |
| 110 on input sets that are too large to write out to one directory. This depends on the limitations | |
| 111 of your file system, but you usually don't want 100,000 files written in the same directory. | |
| 112 | |
| 113 If you have an FASTA file containing 95000 sequences, and use the following option: | |
| 114 | |
| 115 --output_dir=/some/path | |
| 116 --output_subdir_size=30000 | |
| 117 | |
| 118 The following will be created: | |
| 119 | |
| 120 directory file count | |
| 121 --------------------------------- | |
| 122 /some/path/1/ 30000 | |
| 123 /some/path/2/ 30000 | |
| 124 /some/path/3/ 30000 | |
| 125 /some/path/4/ 5000 | |
| 126 | |
| 127 If you choose to create a list file (and you probably want to), it will contain these proper paths. | |
| 128 | |
| 129 You may not want the subdirectories to simply be numbers, as above, so you can use the | |
| 130 --output_subdir_prefix option. For example: | |
| 131 | |
| 132 --output_dir=/some/path | |
| 133 --output_subdir_size=30000 | |
| 134 --output_subdir_prefix=fasta | |
| 135 | |
| 136 The following will be created: | |
| 137 | |
| 138 directory file count | |
| 139 --------------------------------- | |
| 140 /some/path/fasta1/ 30000 | |
| 141 /some/path/fasta2/ 30000 | |
| 142 /some/path/fasta3/ 30000 | |
| 143 /some/path/fasta4/ 5000 | |
| 144 | |
| 145 Finally, you can write multiple sequences to each output file using the --seqs_per_file option, which | |
| 146 can be used along with --outupt_subdir_size and --output_subdir_prefix. The main difference to note | |
| 147 is that, if you use --seqs_per_file, the fasta file created will no longer be named using values | |
| 148 taken from the header, since it will contain multiple headers. Instead, the file will simply be | |
| 149 named using sequential numbers starting at 1 (like 1.fsa). For example: | |
| 150 | |
| 151 --output_dir=/some/path | |
| 152 --output_subdir_size=3000 | |
| 153 --output_subdir_prefix=fasta | |
| 154 --seqs_per_file=10 | |
| 155 | |
| 156 The following will be created: | |
| 157 | |
| 158 directory file count | |
| 159 --------------------------------- | |
| 160 /some/path/fasta1/ 3000 | |
| 161 /some/path/fasta2/ 3000 | |
| 162 /some/path/fasta3/ 3000 | |
| 163 /some/path/fasta4/ 500 | |
| 164 | |
| 165 =head1 CONTACT | |
| 166 | |
| 167 Joshua Orvis | |
| 168 jorvis@tigr.org | |
| 169 | |
| 170 =cut | |
| 171 | |
| 172 use strict; | |
| 173 use Getopt::Long; | |
| 174 # qw(:config no_ignore_case no_auto_abbrev pass_through); | |
| 175 use Pod::Usage; | |
| 176 # BEGIN { | |
| 177 # use Ergatis::Logger; | |
| 178 # } | |
| 179 | |
| 180 my %options = (); | |
| 181 my $results = GetOptions (\%options, | |
| 182 'input_file|i=s', | |
| 183 'output_dir|o=s', | |
| 184 'output_file_prefix|f=s', | |
| 185 'output_list|s=s', | |
| 186 'output_subdir_size|u=s', | |
| 187 'output_subdir_prefix|p=s', | |
| 188 'seqs_per_file|n|e=s', | |
| 189 'compress_output|c=s', | |
| 190 'log|l=s', | |
| 191 'debug=s', | |
| 192 'help|h') || pod2usage(); | |
| 193 | |
| 194 # my $logfile = $options{'log'} || Ergatis::Logger::get_default_logfilename(); | |
| 195 # my $logger = new Ergatis::Logger('LOG_FILE'=>$logfile, | |
| 196 # 'LOG_LEVEL'=>$options{'debug'}); | |
| 197 # $logger = $logger->get_logger(); | |
| 198 | |
| 199 | |
| 200 my $logfile = $options{'log'} || "log.file"; | |
| 201 my $logger = new logger('LOG_FILE'=>$logfile, | |
| 202 'LOG_LEVEL'=>$options{'debug'}); | |
| 203 | |
| 204 ## display documentation | |
| 205 if( $options{'help'} ){ | |
| 206 pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} ); | |
| 207 } | |
| 208 | |
| 209 ## make sure everything passed was peachy | |
| 210 &check_parameters(\%options); | |
| 211 | |
| 212 ## open the list file if one was passed | |
| 213 my $listfh; | |
| 214 if (defined $options{output_list}) { | |
| 215 open($listfh, ">$options{output_list}") || $logger->logdie("couldn't create $options{output_list} list file"); | |
| 216 } | |
| 217 | |
| 218 my $first = 1; | |
| 219 my $seq = ''; | |
| 220 my $header; | |
| 221 | |
| 222 my $sfh; | |
| 223 | |
| 224 ## load the sequence file | |
| 225 if ($options{'input_file'} =~ /\.(gz|gzip)$/) { | |
| 226 open ($sfh, "<:gzip", $options{'input_file'}) | |
| 227 || $logger->logdie("can't open sequence file:\n$!"); | |
| 228 } else { | |
| 229 open ($sfh, "<$options{'input_file'}") | |
| 230 || $logger->logdie("can't open sequence file:\n$!"); | |
| 231 } | |
| 232 | |
| 233 my $sub_dir = 1; | |
| 234 my $seq_file_count = 0; | |
| 235 | |
| 236 ## keep track of how many sequences are in the current output file | |
| 237 my $seqs_in_file = 0; | |
| 238 my $group_filename_prefix = 1; | |
| 239 | |
| 240 ## holds the output file handle | |
| 241 my $ofh; | |
| 242 | |
| 243 while (<$sfh>) { | |
| 244 ## if we find a header line ... | |
| 245 if (/^\>(.*)/) { | |
| 246 | |
| 247 ## write the previous sequence before continuing with this one | |
| 248 unless ($first) { | |
| 249 &writeSequence(\$header, \$seq); | |
| 250 | |
| 251 ## reset the sequence | |
| 252 $seq = ''; | |
| 253 } | |
| 254 | |
| 255 $first = 0; | |
| 256 $header = $1; | |
| 257 | |
| 258 ## else we've found a sequence line | |
| 259 } else { | |
| 260 ## skip it if it is just whitespace | |
| 261 next if (/^\s*$/); | |
| 262 | |
| 263 ## record this portion of the sequence | |
| 264 $seq .= $_; | |
| 265 } | |
| 266 } | |
| 267 | |
| 268 ## don't forget the last sequence | |
| 269 &writeSequence(\$header, \$seq); | |
| 270 | |
| 271 exit; | |
| 272 | |
| 273 sub check_parameters { | |
| 274 my $options = shift; | |
| 275 | |
| 276 ## make sure input_file and output_dir were passed | |
| 277 unless ( $options{input_file} && $options{output_dir} ) { | |
| 278 $logger->logdie("You must pass both --input_file and --output_dir"); | |
| 279 } | |
| 280 | |
| 281 ## make sure input_file exists | |
| 282 if (! -e $options{input_file} ) { | |
| 283 if ( -e "$options{input_file}.gz" ) { | |
| 284 $options{input_file} .= '.gz'; | |
| 285 } else { | |
| 286 $logger->logdie("the input file passed ($options{input_file}) cannot be read or does not exist"); | |
| 287 } | |
| 288 } | |
| 289 | |
| 290 ## make sure the output_dir exists | |
| 291 if (! -e "$options{output_dir}") { | |
| 292 $logger->logdie("the output directory passed could not be read or does not exist"); | |
| 293 } | |
| 294 | |
| 295 ## seqs_per_file, if passed, must be at least one | |
| 296 if (defined $options{seqs_per_file} && $options{seqs_per_file} < 1) { | |
| 297 $logger->logdie("seq_per_file setting cannot be less than one"); | |
| 298 } | |
| 299 | |
| 300 ## handle some defaults | |
| 301 $options{output_subdir_size} = 0 unless ($options{output_subdir_size}); | |
| 302 $options{output_subdir_prefix} = '' unless ($options{output_subdir_prefix}); | |
| 303 $options{seqs_per_file} = 1 unless ($options{seqs_per_file}); | |
| 304 $options{output_file_prefix} = '' unless ($options{output_file_prefix}); | |
| 305 } | |
| 306 | |
| 307 sub writeSequence { | |
| 308 my ($header, $seq) = @_; | |
| 309 | |
| 310 ## the id used to write the output file will be the first thing | |
| 311 ## in the header up to the first whitespace. get that. | |
| 312 $$header =~ /^(\S+)/ || $logger->logdie( "can't pull out an id on header $$header" ); | |
| 313 my $id = $1; | |
| 314 | |
| 315 ## because it is going to be the filename, we're going to take out the characters that are bad form to use | |
| 316 ## legal characters = a-z A-Z 0-9 - . _ | |
| 317 $id =~ s/[^a-z0-9\-_.]/_/gi; | |
| 318 | |
| 319 my $dirpath; | |
| 320 | |
| 321 ## if we're writing more than one sequence to a file, change the id from | |
| 322 ## fasta header to the current group file name | |
| 323 if ($options{seqs_per_file} > 1) { | |
| 324 $id = $group_filename_prefix; | |
| 325 | |
| 326 ## did the user ask for a file prefix? | |
| 327 if ( $options{output_file_prefix} ) { | |
| 328 $id = $options{output_file_prefix} . $id; | |
| 329 } | |
| 330 } | |
| 331 | |
| 332 | |
| 333 ## the path depends on whether we are using output subdirectories | |
| 334 if ($options{output_subdir_size}) { | |
| 335 $dirpath = "$options{'output_dir'}/$options{output_subdir_prefix}$sub_dir"; | |
| 336 } else { | |
| 337 $dirpath = "$options{'output_dir'}"; | |
| 338 } | |
| 339 | |
| 340 ## did the user ask for a file prefix? | |
| 341 my $filepath = "$dirpath/$id.fsa"; | |
| 342 | |
| 343 ## take any // out of the filepath | |
| 344 $filepath =~ s|/+|/|g; | |
| 345 | |
| 346 ## write the sequence | |
| 347 $logger->debug("Writing sequence to $filepath") if ($logger->is_debug()); | |
| 348 | |
| 349 ## open a new output file if we need to | |
| 350 ## if we're writing multiple sequences per file, we only open a new | |
| 351 ## one when $seqs_in_file = 0 (first sequence) | |
| 352 if ($seqs_in_file == 0) { | |
| 353 | |
| 354 ## if the directory we want to write to doesn't exist yet, create it | |
| 355 mkdir($dirpath) unless (-e $dirpath); | |
| 356 | |
| 357 | |
| 358 if ($options{'compress_output'}) { | |
| 359 open ($ofh, ">:gzip", $filepath.".gz") | |
| 360 || $logger->logdie("can't create '$filepath.gz':\n$!"); | |
| 361 } else { | |
| 362 open ($ofh, ">$filepath") || $logger->logdie("can't create '$filepath':\n$!"); | |
| 363 | |
| 364 } | |
| 365 $seq_file_count++; | |
| 366 | |
| 367 ## add the file we just wrote to the list, if we were asked to | |
| 368 if (defined $options{output_list}) { | |
| 369 print $listfh "$filepath\n"; | |
| 370 } | |
| 371 } | |
| 372 | |
| 373 ## if we're doing output subdirs and hit our size limit, increment to the next dir | |
| 374 if ($options{output_subdir_size} && $options{output_subdir_size} == $seq_file_count) { | |
| 375 $seq_file_count = 0; | |
| 376 $sub_dir++; | |
| 377 } | |
| 378 | |
| 379 ## write the sequence | |
| 380 print $ofh ">$$header\n$$seq\n"; | |
| 381 $seqs_in_file++; | |
| 382 | |
| 383 ## if we hit the limit of how many we want in each file, set the next file name and | |
| 384 ## reset the count of seqs within the file | |
| 385 if ($options{seqs_per_file} == $seqs_in_file) { | |
| 386 $seqs_in_file = 0; | |
| 387 $group_filename_prefix++; | |
| 388 } | |
| 389 } | |
| 390 | |
| 391 | |
| 392 package logger; | |
| 393 | |
| 394 sub new { | |
| 395 my $packname= shift; | |
| 396 my %args= @_; | |
| 397 my $self= \%args; | |
| 398 bless($self,$packname); | |
| 399 return $self; | |
| 400 } | |
| 401 | |
| 402 sub get_logger { | |
| 403 my $self= shift; | |
| 404 return $self; | |
| 405 } | |
| 406 | |
| 407 sub logdie { | |
| 408 my $self= shift; | |
| 409 die @_; | |
| 410 } | |
| 411 | |
| 412 sub debug { | |
| 413 my $self= shift; | |
| 414 warn @_; | |
| 415 } | |
| 416 | |
| 417 sub is_debug { | |
| 418 shift->{LOG_LEVEL} || 0; | |
| 419 } | |
| 420 | |
| 421 | |
| 422 1; |
