comparison splitbc.pl @ 0:da4101033e10 draft default tip

planemo upload
author oinizan
date Wed, 18 Oct 2017 05:30:40 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:da4101033e10
1 #!/usr/bin/perl
2
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools. fastx_barcode_splitter.pl
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
5 #
6 # This program is free software: you can redistribute it and/or modify
7 # it under the terms of the GNU Affero General Public License as
8 # published by the Free Software Foundation, either version 3 of the
9 # License, or (at your option) any later version.
10 #
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU Affero General Public License for more details.
15 #
16 # You should have received a copy of the GNU Affero General Public License
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 use strict;
19 use warnings;
20 use IO::Handle;
21 use IO::Zlib;
22 use PerlIO::gzip;
23 use Data::Dumper;
24 use Getopt::Long;
25 use Carp;
26 use Pod::Usage;
27
28 =pod
29
30 =head1 NAME
31
32 splitbc.pl
33
34 =head1 SYNOPSIS
35
36 splitbc.pl read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
37 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
38
39 =head1 DESCRIPTION
40
41 splitbc.pl classify fasta/fastq single or paired end reads in function of barcode forward or reverse in the first or both reads.
42 Also, whether reads are RAD sequence, it checks that barcode is followed by RAD tag.
43
44 =head1 OPTIONS
45
46 =over
47 =item B<--bcfile FILE>
48
49 Barcodes file name. (splitbc.pl --doc for details)
50
51 =item B<--prefix-r1 PREFIX>
52
53 File prefix. The sample name will be placed where the % is placed.
54
55 =item B<--prefix-r2 PREFIX>
56
57 File prefix. The sample name will be placed where the % is placed.
58
59 =item B<--bol>
60
61 Try to match barcodes at the BEGINNING of sequences.
62 (What biologists would call the 5' end, and programmers
63 would call index 0.)
64
65 =item B<--eol>
66
67 Try to match barcodes at the END of sequences.
68 (What biologists would call the 3' end, and programmers
69 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
70 NOTE: one of --bol, --eol must be specified, but not both.
71
72 =item B<--mismatches N>
73
74 Max. number of mismatches allowed. default is 1.
75
76 =item B<--exact>
77
78 Same as '--mismatches 0'. If both --exact and --mismatches are specified, '--exact' takes precedence.
79
80 =item B <--partial N>
81
82 Allow partial overlap of barcodes. (splitbc.pl --doc for details)
83 (Default is not partial matching)
84
85 item B<--trim>
86
87 Should the barecode be trimmed.
88
89 =item B<--trim2>
90
91 Shoud the read 2 be trimmed to have the same length as the read1
92 NOTE: one of --trim, --trim2 must be specified, but not both.
93
94 =item B<--no_adapt>
95
96 there is no adaptator (T or A ) between barcode and the sequence
97
98 =item B<--rad STR>
99
100 sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
101 NOTE: --rad implies --no_adapt
102
103 =item B<--radTAG STR>
104
105 sequence retain at the begining of the sequence after cliping.
106 NOTE: if define rad, radTAG must be defined to, and conversely
107 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
108
109 =item B<--TAG_mismatch N>
110
111 Max. number of mismatches allowed in the radTAG sequence. default is 0.
112
113 =item B<--quiet>
114
115 Don't print counts and summary at the end of the run.
116 (Default is to print.)
117
118 =item B<--debug>
119
120 Print lots of useless debug information to STDERR.
121
122 =item B<--help>
123
124 This helpful help screen.
125
126 =back
127
128 =head1 AUTHORS
129
130 Jérôme Mariette
131 modified by Maria Bernard
132
133 =head1 VERSION
134
135 1.1
136
137 =head1 DATE
138
139 modified 21/05/2013
140
141 =head1 KEYWORDS
142
143 Barcode RAD
144
145 =head1 EXAMPLE
146
147 (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file):
148 $0 cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
149 --prefix-r1 /tmp/bla_%.txt"
150
151 =head1 CHANGELOG
152
153 1.1: 18/10/2013 Maria
154 construct list of best barcode to deal with equal match. In this case reads will be written in "ambiguous" files
155
156 =cut
157
158 ##
159 ## This program splits a FASTQ/FASTA file into several smaller files,
160 ## Based on barcode matching.
161 ##
162 ## run with "--help" for usage information
163 ##
164 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008
165
166 # Forward declarations
167 sub load_barcode_file ($);
168 sub parse_command_line ;
169 sub match_sequences ;
170 sub mismatch_count($$) ;
171 sub print_results;
172 sub open_and_detect_input_format;
173 sub read_record;
174 sub write_record($);
175 sub usage();
176
177 # Global flags and arguments,
178 # Set by command line argumens
179 my $barcode_file ;
180 my $barcodes_at_eol = 0 ;
181 my $barcodes_at_bol = 0 ;
182 my $exact_match = 0 ;
183 my $allow_partial_overlap = 0;
184 my $allowed_mismatches = 1;
185 my $newfile_prefix_r1 = '';
186 my $newfile_prefix_r2 = '';
187 my $quiet = 0 ;
188 my $debug = 0 ;
189 my $trim = 0 ;
190 my $trim2 = 0 ;
191 my $adapt = 1 ;
192 my $no_adapt = 0 ;
193 my $rad;
194 my $radTAG;
195 my $TAG_mm=0;
196 my $fastq_format = 1;
197
198 # Global variables
199 # Populated by 'parse_command_line'
200 my $radTAG_length=0;
201 my $check_rad=0;
202
203 # Global variables
204 # Populated by 'create_output_files'
205 my %filenames;
206 my %files;
207 my %counts = ( 'unmatched' => 0 );
208 my $barcodes_length;
209 my @barcodes;
210 my $input_file_io;
211 my $input_file_io_r2;
212 my $paired = 0;
213 my $write_r2 = 0;
214
215 # The Four lines per record in FASTQ format.
216 # (when using FASTA format, only the first two are used)
217 my $seq_name;
218 my $seq_bases;
219 my $seq_name2;
220 my $seq_qualities;
221 my $seq_len;
222 my $seq_name_r2;
223 my $seq_bases_r2;
224 my $seq_name2_r2;
225 my $seq_qualities_r2;
226 my $seq_len_r2;
227
228 #
229 # Start of Program
230 #
231 parse_command_line ;
232
233 load_barcode_file ( $barcode_file ) ;
234
235 open_and_detect_input_format;
236
237 match_sequences ;
238
239 print_results unless $quiet;
240
241 #
242 # End of program
243 ########################################################################
244
245
246 sub parse_command_line {
247 my $help;
248 my $doc;
249 my $version;
250
251 pod2usage(1) if (scalar @ARGV==0);
252
253 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
254 "eol" => \$barcodes_at_eol,
255 "bol" => \$barcodes_at_bol,
256 "exact" => \$exact_match,
257 "prefix-r1=s" => \$newfile_prefix_r1,
258 "prefix-r2=s" => \$newfile_prefix_r2,
259 "quiet" => \$quiet,
260 "partial=i" => \$allow_partial_overlap,
261 "trim" => \$trim,
262 "trim2" => \$trim2,
263 "debug" => \$debug,
264 "mismatches=i" => \$allowed_mismatches,
265 "no_adapt"=> \$no_adapt,
266 "rad=s"=> \$rad,
267 "radTAG=s"=> \$radTAG,
268 "TAG_mismatch=i"=> \$TAG_mm,
269 "help" => \$help,
270 "doc" => \$doc,
271 "version" => \$version
272 ) ;
273
274 pod2usage(1) if ($help);
275 doc() if ($doc);
276 version() if ($version);
277
278 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
279 die "Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')\n" unless defined $newfile_prefix_r1;
280 # If a read2 file is provided
281 if ($#ARGV == 1) {
282 die "Error: prefix path/filename not specified (use '--prefix-r2 [PATH]%.fq')\n" unless defined $newfile_prefix_r2;
283 }
284
285 if ($barcodes_at_bol == $barcodes_at_eol) {
286 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
287 die "Error: must specify either --eol or --bol\n" ;
288 }
289
290 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
291
292 $allowed_mismatches = 0 if $exact_match;
293
294 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
295
296 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
297 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
298
299 $trim = 1 if $trim2 ;
300
301 if ((defined $rad && !defined $radTAG) || (!defined $rad && defined $radTAG)){
302 die "Error: you must defined --rad AND --radTAG option\n";
303 }
304 if ( $no_adapt){
305 $adapt=0;
306 }
307 if (defined $rad ){
308 chomp $rad;
309 chomp $radTAG;
310 die "Error: $rad does not ended by $radTAG\n" if ($rad !~ /$radTAG$/g);
311 $adapt=0;
312 $check_rad=1;
313 #~ warn $rad, " ",$radTAG;
314 $radTAG_length=length($radTAG);
315 }
316 exit unless $result;
317 }
318
319
320
321 #
322 # Read the barcode file
323 #
324 sub load_barcode_file ($) {
325 my $filename = shift or croak "Missing barcode file name";
326
327 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
328 while (<BCFILE>) {
329 next if m/^#/;
330 chomp $_;
331 my @arr = split(' ',$_);
332 my $ident = $arr[0];
333 my $barcode = uc($arr[1]);
334
335 # Sanity checks on the barcodes
336 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
337 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
338 unless $barcode =~ m/^[AGCT]+$/;
339
340 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
341 unless $ident =~ m/^\w+$/;
342
343 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
344 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
345 if length($barcode)<=$allowed_mismatches;
346
347 $barcodes_length = length($barcode) unless defined $barcodes_length;
348 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
349 unless $barcodes_length == length($barcode);
350
351 push @barcodes, [$ident, $barcode];
352
353 if ($allow_partial_overlap>0) {
354 foreach my $i (1 .. $allow_partial_overlap) {
355 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
356 push @barcodes, [$ident, $barcode];
357 }
358 }
359 }
360 close BCFILE;
361
362 if ($debug) {
363 print STDERR "barcode\tsequence\n";
364 foreach my $barcoderef (@barcodes) {
365 my ($ident, $seq) = @{$barcoderef};
366 print STDERR $ident,"\t", $seq ,"\t",length($seq),"\n";
367 }
368 }
369 }
370
371 # Create one output file for each barcode.
372 # (Also create a file for the dummy 'unmatched' barcode)
373 sub create_output_files {
374 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
375 $barcodes{'unmatched'} = 1 ;
376 $barcodes{'ambiguous'} = 1 ;
377
378 foreach my $ident (keys %barcodes) {
379
380 my $new_filename = $newfile_prefix_r1;
381 $new_filename =~ s/%/$ident/g;
382 $filenames{$ident} = $new_filename;
383 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
384 $files{$ident} = $file ;
385
386 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
387 my $new_filename = $newfile_prefix_r1;
388 $new_filename =~ s/%/${ident}_2rad/g;
389 $filenames{$ident."_2rad"} = $new_filename;
390 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
391 $files{$ident."_2rad"} = $file ;
392 }
393
394 if ($paired == 1) {
395 my $new_filename_r2 = $newfile_prefix_r2;
396 $new_filename_r2 =~ s/%/$ident/g;
397 $filenames{$ident."r2"} = $new_filename_r2;
398 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
399 $files{$ident."r2"} = $file ;
400
401 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
402 my $new_filename_r2 = $newfile_prefix_r2;
403 $new_filename_r2 =~ s/%/${ident}_2rad/g;
404 $filenames{$ident."_2radr2"} = $new_filename_r2;
405 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
406 $files{$ident."_2radr2"} = $file ;
407 }
408 }
409 }
410 }
411
412 sub reverse_complement {
413 my $dna = shift;
414
415 # reverse the DNA sequence
416 my $revcomp = reverse($dna);
417
418 # complement the reversed DNA sequence
419 $revcomp =~ tr/ACGTacgt/TGCAtgca/;
420 return $revcomp;
421 }
422
423 sub match_sequences {
424
425 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
426 $barcodes{'unmatched'} = 1 ;
427 $barcodes{'ambiguous'} = 1 ;
428
429 #reset counters
430 foreach my $ident ( keys %barcodes ) {
431 $counts{$ident} = 0;
432 }
433
434 create_output_files;
435
436 # Read file FASTQ file
437 # split accotding to barcodes
438 do {
439 chomp $seq_bases;
440 #~ print STDERR "sequence ".length($seq_bases)." $seq_bases: \n" if $debug;
441 if ($fastq_format){
442 chomp $seq_qualities;
443 }
444
445 if ($paired == 1) {
446 chomp $seq_bases_r2;
447 if ($fastq_format){
448 chomp $seq_qualities_r2;
449 }
450 }
451 my $best_barcode_mismatches_count = $barcodes_length;
452 my $best_barcode_ident = undef;
453 my $sequence_fragment;
454 my $sequence_fragment_radTAG;
455 my $start = 0;
456 my $cut_len = $seq_len;
457 my $start2 = 0;
458 my $cut_len2 = $seq_len_r2;
459 # warn $start , " " , $cut_len , " " , $start2 , " " , $cut_len2;
460 #Try all barcodes, find the one with the lowest mismatch count
461 if ($barcodes_at_bol) {
462 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
463 if (defined $rad)
464 {
465 $sequence_fragment_radTAG = substr $seq_bases, $barcodes_length, $radTAG_length;
466 }
467 if ($trim){
468 $start=$barcodes_length+$adapt;
469 $cut_len=$seq_len-$barcodes_length-$adapt;
470 if ($trim2) {
471 $start2 = 0 ;
472 $cut_len2=$cut_len;
473 }
474 }
475 } elsif($barcodes_at_eol) {
476 if ($paired != 1){
477 $sequence_fragment = substr $seq_bases, - $barcodes_length ;
478 if (defined $rad){
479 $sequence_fragment_radTAG = substr $seq_bases, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
480 }
481 if ($trim){
482 $start=0 ;
483 $cut_len=$seq_len-$barcodes_length-$adapt;
484 }
485 } else {
486 $sequence_fragment = substr $seq_bases_r2, - $barcodes_length;
487 if (defined $rad){
488 $sequence_fragment_radTAG = substr $seq_bases_r2, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
489 }
490 if ($trim){
491 $start2=0 ;
492 $cut_len2=$seq_len_r2-$barcodes_length-$adapt;
493 if ($trim2) {
494 $start = 0 ;
495 $cut_len=$cut_len2 ;
496 }
497 }
498 }
499 #~ warn "sequence fragment : $sequence_fragment\n" if $debug;
500 #~ $sequence_fragment = reverse_complement $sequence_fragment;
501 #~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug;
502 }
503 # warn $start," ", $cut_len, " ",$start2, " ",$cut_len2;
504
505 foreach my $barcoderef (@barcodes) {
506 my ($ident, $barcode) = @{$barcoderef};
507
508 # Get DNA fragment (in the length of the barcodes)
509 # The barcode will be tested only against this fragment
510 # (no point in testing the barcode against the whole sequence)
511 my $mm=$barcodes_length;
512
513 # check validity of RAD.
514 if (defined $rad){
515 my $mm_rad = mismatch_count($sequence_fragment_radTAG, $radTAG) ;
516 if ($mm_rad<= $TAG_mm){ $mm = mismatch_count($sequence_fragment, $barcode) ; }
517 }else {
518 $mm = mismatch_count($sequence_fragment, $barcode) ;
519 }
520
521 # if this is a partial match, add the non-overlap as a mismatch
522 # (partial barcodes are shorter than the length of the original barcodes)
523 $mm += ($barcodes_length - length($barcode));
524
525 if ( $mm < $best_barcode_mismatches_count ) {
526 $best_barcode_mismatches_count = $mm ;
527 $best_barcode_ident = $ident ;
528 }elsif($mm == $best_barcode_mismatches_count){
529 $best_barcode_ident = "ambiguous"
530 }
531 }
532
533 if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){
534 $seq_bases = substr $seq_bases, $start, $cut_len;
535 if ($fastq_format) {
536 $seq_qualities = substr $seq_qualities, $start, $cut_len;
537 }
538 if ($paired == 1 ) {
539 $seq_bases_r2 = substr $seq_bases_r2, $start2, $cut_len2;
540 if ($fastq_format) {
541 $seq_qualities_r2 = substr $seq_qualities_r2, $start2, $cut_len2;
542 }
543 }
544 }
545
546 if ( (!defined $best_barcode_ident) || ($best_barcode_mismatches_count>$allowed_mismatches)){
547 $best_barcode_ident = 'unmatched';
548 }
549
550 #get the file associated with the matched barcode.
551 #(note: there's also a file associated with 'unmatched' barcode)
552
553 if (defined $rad && $best_barcode_ident ne 'unmatched' && $best_barcode_ident ne 'ambiguous'){
554 my $nb_rad =()= ($seq_bases =~ /$rad/g);
555 #~ warn $seq_bases," ",$nb_rad;
556 $best_barcode_ident.='_2rad' if ($nb_rad > 0);
557 #~ warn $best_barcode_ident;
558 }
559
560 $counts{$best_barcode_ident}++;
561 my $file = $files{$best_barcode_ident};
562 #~ warn "$seq_name , $best_barcode_ident \n";
563 #~ warn $file ;
564 $write_r2 = 0;
565 write_record($file);
566 if ($paired == 1) {
567 #~ warn $best_barcode_ident."r2";
568 my $file = $files{$best_barcode_ident."r2"};
569 $write_r2 = 1;
570 #~ warn $best_barcode_ident."r2";
571 write_record($file);
572 }
573
574 } while ( read_record );
575 }
576
577 #Quickly calculate hamming distance between two strings
578 #
579 #NOTE: Strings must be same length.
580 # returns number of different characters.
581 #see http://www.perlmonks.org/?node_id=500235
582 sub mismatch_count($$) {
583 length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] )
584 }
585
586
587
588 sub print_results
589 {
590 print "Barcode\tCount\tLocation\n";
591 my $total = 0 ;
592 foreach my $ident (sort keys %counts) {
593 print $ident, "\t", $counts{$ident};
594 if ($paired) {
595 print "(*2)";
596 }
597 print "\t",$filenames{$ident};
598 if ($paired) {
599 print " & ".$filenames{$ident."r2"};
600 }
601 print "\n";
602 $total += $counts{$ident};
603 }
604 print "total\t",$total;
605 if ($paired) {
606 print "(*2)";
607 }
608 print "\n";
609 }
610
611
612 sub read_record
613 {
614 $seq_name = $input_file_io->getline();
615
616 return undef unless defined $seq_name; # End of file?
617
618 $seq_bases = $input_file_io->getline();
619 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
620 $seq_len = length($seq_bases)-1;
621 # If using FASTQ format, read two more lines
622 if ($fastq_format) {
623 $seq_name2 = $input_file_io->getline();
624 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
625
626 $seq_qualities = $input_file_io->getline();
627 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
628 }
629
630 if ($paired == 1) {
631 $seq_name_r2 = $input_file_io_r2->getline();
632
633 return undef unless defined $seq_name_r2; # End of file?
634
635 $seq_bases_r2 = $input_file_io_r2->getline();
636 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2;
637 $seq_len_r2 = length($seq_bases_r2)-1;
638 # If using FASTQ format, read two more lines
639 if ($fastq_format) {
640 $seq_name2_r2 = $input_file_io_r2->getline();
641 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2;
642
643 $seq_qualities_r2 = $input_file_io_r2->getline();
644 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2;
645 }
646 }
647
648 return 1;
649 }
650
651 sub write_record($)
652 {
653 if ($write_r2 == 1) {
654 my $file = shift;
655 croak "Bad file handle" unless defined $file;
656
657 print $file $seq_name_r2;
658 print $file $seq_bases_r2,"\n";
659
660 #if using FASTQ format, write two more lines
661 if ($fastq_format) {
662 print $file $seq_name2_r2;
663 print $file $seq_qualities_r2,"\n";
664 }
665 } else {
666 my $file = shift;
667 croak "Bad file handle $seq_name" unless defined $file;
668
669 print $file $seq_name;
670 print $file $seq_bases,"\n";
671
672 #if using FASTQ format, write two more lines
673 if ($fastq_format) {
674 print $file $seq_name2;
675 print $file $seq_qualities,"\n";
676 }
677 }
678 }
679
680 sub open_and_detect_input_format
681 {
682 # $input_file_io = new IO::Handle;
683 # die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
684
685 if ($ARGV[0] =~ /.gz$/){
686 die "Failed to open '$ARGV[0]' " unless open ($input_file_io, "<:gzip", $ARGV[0] ) ;
687 #~ $input_file_io = IO::Zlib->new();
688 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io->open($ARGV[0],"r");
689 }else{
690 $input_file_io= IO::Handle->new();
691 die "Failed to open '$ARGV[0]' " unless open($input_file_io , $ARGV[0]);
692 }
693
694
695 # If a read2 file is provided
696 if ($#ARGV == 1) {
697 if ($ARGV[1] =~ /.gz$/){
698 die "Failed to open '$ARGV[0]' " unless open ($input_file_io_r2 , "<:gzip", $ARGV[1] ) ;
699 #~ $input_file_io_r2 = IO::Zlib->new();
700 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io_r2->open($ARGV[1],"r");
701 }else{
702 $input_file_io_r2 = IO::Handle->new();
703 die "Failed to open '$ARGV[1]' " unless open($input_file_io_r2 , $ARGV[1]);
704 }
705 $paired = 1;
706 }
707
708 # Get the first characeter, and push it back
709 my $first_char = $input_file_io->getc();
710 $input_file_io->ungetc(ord $first_char);
711 $seq_name = $input_file_io->getline();
712 $seq_bases = $input_file_io->getline();
713 $seq_len = length($seq_bases)-1;
714 if ($paired == 1) {
715 $seq_name_r2 = $input_file_io_r2->getline();
716 $seq_bases_r2 = $input_file_io_r2->getline();
717 $seq_len_r2 = length($seq_bases_r2)-1;
718 }
719 if ($first_char eq '>') {
720 # FASTA format
721 $fastq_format = 0 ;
722 print STDERR "Detected FASTA format\n" if $debug;
723 } elsif ($first_char eq '@') {
724 # FASTQ format
725 $fastq_format = 1;
726 $seq_name2 = $input_file_io->getline();
727 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
728 $seq_qualities = $input_file_io->getline();
729 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
730 if ($paired == 1) {
731 $seq_name2_r2 = $input_file_io_r2->getline();
732 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2;
733 $seq_qualities_r2 = $input_file_io_r2->getline();
734 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2;
735 }
736 print STDERR "Detected FASTQ format\n" if $debug;
737 } else {
738 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
739 }
740 # warn "len1: ", $seq_len, "; len2: ", $seq_len_r2, " ;barcode len: ", $barcodes_length, "; no adapt: ", $adapt;
741 }
742
743 sub doc()
744 {
745 print<<EOF;
746 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
747
748 This program reads FASTA/FASTQ file and splits it into several smaller files,
749 Based on barcode matching.
750 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
751 Output files will be writen to disk.
752 Summary will be printed to STDOUT.
753
754 usage: $0 read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
755 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
756
757 Arguments:
758
759 --bcfile FILE - Barcodes file name. (see explanation below.)
760 --prefix-r1 PREFIX - File prefix. The sample name will be placed where the % is placed.
761 --prefix-r2 PREFIX - File prefix. The sample name will be placed where the % is placed.
762 --bol - Try to match barcodes at the BEGINNING of sequences.
763 (What biologists would call the 5' end, and programmers
764 would call index 0.)
765 --eol - Try to match barcodes at the END of sequences.
766 (What biologists would call the 3' end, and programmers
767 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
768 NOTE: one of --bol, --eol must be specified, but not both.
769 --mismatches N - Max. number of mismatches allowed. default is 1.
770 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
771 are specified, '--exact' takes precedence.
772 --partial N - Allow partial overlap of barcodes. (see explanation below.)
773 (Default is not partial matching)
774 --trim - Should the barecode be trimmed.
775 --trim2 - Shoud the read 2 be trimmed to have the same length as the read1
776 NOTE: one of --trim, --trim2 must be specified, but not both.
777 --no_adapt - there is no adaptator (T or A ) between barcode and the sequence
778 --rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
779 NOTE: --rad implies --no_adapt
780 --radTAG STR - sequence retain at the begining of the sequence after cliping.
781 NOTE: if define rad, radTAG must be defined to, and conversely
782 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
783 --TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0.
784 --quiet - Don't print counts and summary at the end of the run.
785 (Default is to print.)
786 --debug - Print lots of useless debug information to STDERR.
787 --help - This helpful help screen.
788
789 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
790 the barcodes file):
791
792 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
793 --prefix-r1 /tmp/bla_%.txt"
794
795 Barcode file format
796 -------------------
797 Barcode files are simple text files. Each line should contain an identifier
798 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
799 separated by a TAB character. Example:
800
801 #This line is a comment (starts with a 'number' sign)
802 BC1 GATCT
803 BC2 ATCGT
804 BC3 GTGAT
805 BC4 TGTCT
806
807 For each barcode, a new FASTQ file will be created (with the barcode's
808 identifier as part of the file name). Sequences matching the barcode
809 will be stored in the appropriate file.
810
811 Running the above example (assuming "mybarcodes.txt" contains the above
812 barcodes), will create the following files:
813 /tmp/bla_BC1.txt
814 /tmp/bla_BC2.txt
815 /tmp/bla_BC3.txt
816 /tmp/bla_BC4.txt
817 /tmp/bla_unmatched.txt
818 The 'unmatched' file will contain all sequences that didn't match any barcode.
819
820 Barcode matching
821 ----------------
822
823 ** Without partial matching:
824
825 Count mismatches between the FASTA/Q sequences and the barcodes.
826 The barcode which matched with the lowest mismatches count (providing the
827 count is small or equal to '--mismatches N') 'gets' the sequences.
828
829 Example (using the above barcodes):
830 Input Sequence:
831 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
832
833 Matching with '--bol --mismatches 1':
834 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
835 GATCT (1 mismatch, BC1)
836 ATCGT (4 mismatches, BC2)
837 GTGAT (3 mismatches, BC3)
838 TGTCT (3 mismatches, BC4)
839
840 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
841 If '--exact' or '--mismatches 0' were specified, this sequence would be
842 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
843 it is above the maximum allowed mismatches).
844
845 Matching with '--eol' (end of line) does the same, but from the other side
846 of the sequence.
847
848 ** With partial matching (very similar to indels):
849
850 Same as above, with the following addition: barcodes are also checked for
851 partial overlap (number of allowed non-overlapping bases is '--partial N').
852
853 Example:
854 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
855 (Same as above, but note the missing 'G' at the beginning.)
856
857 Matching (without partial overlapping) against BC1 yields 4 mismatches:
858 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
859 GATCT (4 mismatches)
860
861 Partial overlapping would also try the following match:
862 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
863 GATCT (1 mismatch)
864
865 Note: scoring counts a missing base as a mismatch, so the final
866 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
867 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
868 seqeunce will be classified as BC1.
869
870 EOF
871
872 exit 1;
873 }
874
875 sub version()
876 {
877 print STDOUT "1.0";
878 exit 0 ;
879 }