0
|
1 #!/usr/bin/perl
|
|
2
|
|
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools. fastx_barcode_splitter.pl
|
|
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu)
|
|
5 #
|
|
6 # This program is free software: you can redistribute it and/or modify
|
|
7 # it under the terms of the GNU Affero General Public License as
|
|
8 # published by the Free Software Foundation, either version 3 of the
|
|
9 # License, or (at your option) any later version.
|
|
10 #
|
|
11 # This program is distributed in the hope that it will be useful,
|
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
14 # GNU Affero General Public License for more details.
|
|
15 #
|
|
16 # You should have received a copy of the GNU Affero General Public License
|
|
17 # along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
18 use strict;
|
|
19 use warnings;
|
|
20 use IO::Handle;
|
|
21 use IO::Zlib;
|
|
22 use PerlIO::gzip;
|
|
23 use Data::Dumper;
|
|
24 use Getopt::Long;
|
|
25 use Carp;
|
|
26 use Pod::Usage;
|
|
27
|
|
28 =pod
|
|
29
|
|
30 =head1 NAME
|
|
31
|
|
32 splitbc.pl
|
|
33
|
|
34 =head1 SYNOPSIS
|
|
35
|
|
36 splitbc.pl read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
|
|
37 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
|
|
38
|
|
39 =head1 DESCRIPTION
|
|
40
|
|
41 splitbc.pl classify fasta/fastq single or paired end reads in function of barcode forward or reverse in the first or both reads.
|
|
42 Also, whether reads are RAD sequence, it checks that barcode is followed by RAD tag.
|
|
43
|
|
44 =head1 OPTIONS
|
|
45
|
|
46 =over
|
|
47 =item B<--bcfile FILE>
|
|
48
|
|
49 Barcodes file name. (splitbc.pl --doc for details)
|
|
50
|
|
51 =item B<--prefix-r1 PREFIX>
|
|
52
|
|
53 File prefix. The sample name will be placed where the % is placed.
|
|
54
|
|
55 =item B<--prefix-r2 PREFIX>
|
|
56
|
|
57 File prefix. The sample name will be placed where the % is placed.
|
|
58
|
|
59 =item B<--bol>
|
|
60
|
|
61 Try to match barcodes at the BEGINNING of sequences.
|
|
62 (What biologists would call the 5' end, and programmers
|
|
63 would call index 0.)
|
|
64
|
|
65 =item B<--eol>
|
|
66
|
|
67 Try to match barcodes at the END of sequences.
|
|
68 (What biologists would call the 3' end, and programmers
|
|
69 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
|
|
70 NOTE: one of --bol, --eol must be specified, but not both.
|
|
71
|
|
72 =item B<--mismatches N>
|
|
73
|
|
74 Max. number of mismatches allowed. default is 1.
|
|
75
|
|
76 =item B<--exact>
|
|
77
|
|
78 Same as '--mismatches 0'. If both --exact and --mismatches are specified, '--exact' takes precedence.
|
|
79
|
|
80 =item B <--partial N>
|
|
81
|
|
82 Allow partial overlap of barcodes. (splitbc.pl --doc for details)
|
|
83 (Default is not partial matching)
|
|
84
|
|
85 item B<--trim>
|
|
86
|
|
87 Should the barecode be trimmed.
|
|
88
|
|
89 =item B<--trim2>
|
|
90
|
|
91 Shoud the read 2 be trimmed to have the same length as the read1
|
|
92 NOTE: one of --trim, --trim2 must be specified, but not both.
|
|
93
|
|
94 =item B<--no_adapt>
|
|
95
|
|
96 there is no adaptator (T or A ) between barcode and the sequence
|
|
97
|
|
98 =item B<--rad STR>
|
|
99
|
|
100 sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
|
|
101 NOTE: --rad implies --no_adapt
|
|
102
|
|
103 =item B<--radTAG STR>
|
|
104
|
|
105 sequence retain at the begining of the sequence after cliping.
|
|
106 NOTE: if define rad, radTAG must be defined to, and conversely
|
|
107 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
|
|
108
|
|
109 =item B<--TAG_mismatch N>
|
|
110
|
|
111 Max. number of mismatches allowed in the radTAG sequence. default is 0.
|
|
112
|
|
113 =item B<--quiet>
|
|
114
|
|
115 Don't print counts and summary at the end of the run.
|
|
116 (Default is to print.)
|
|
117
|
|
118 =item B<--debug>
|
|
119
|
|
120 Print lots of useless debug information to STDERR.
|
|
121
|
|
122 =item B<--help>
|
|
123
|
|
124 This helpful help screen.
|
|
125
|
|
126 =back
|
|
127
|
|
128 =head1 AUTHORS
|
|
129
|
|
130 Jérôme Mariette
|
|
131 modified by Maria Bernard
|
|
132
|
|
133 =head1 VERSION
|
|
134
|
|
135 1.1
|
|
136
|
|
137 =head1 DATE
|
|
138
|
|
139 modified 21/05/2013
|
|
140
|
|
141 =head1 KEYWORDS
|
|
142
|
|
143 Barcode RAD
|
|
144
|
|
145 =head1 EXAMPLE
|
|
146
|
|
147 (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file):
|
|
148 $0 cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
|
|
149 --prefix-r1 /tmp/bla_%.txt"
|
|
150
|
|
151 =head1 CHANGELOG
|
|
152
|
|
153 1.1: 18/10/2013 Maria
|
|
154 construct list of best barcode to deal with equal match. In this case reads will be written in "ambiguous" files
|
|
155
|
|
156 =cut
|
|
157
|
|
158 ##
|
|
159 ## This program splits a FASTQ/FASTA file into several smaller files,
|
|
160 ## Based on barcode matching.
|
|
161 ##
|
|
162 ## run with "--help" for usage information
|
|
163 ##
|
|
164 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008
|
|
165
|
|
166 # Forward declarations
|
|
167 sub load_barcode_file ($);
|
|
168 sub parse_command_line ;
|
|
169 sub match_sequences ;
|
|
170 sub mismatch_count($$) ;
|
|
171 sub print_results;
|
|
172 sub open_and_detect_input_format;
|
|
173 sub read_record;
|
|
174 sub write_record($);
|
|
175 sub usage();
|
|
176
|
|
177 # Global flags and arguments,
|
|
178 # Set by command line argumens
|
|
179 my $barcode_file ;
|
|
180 my $barcodes_at_eol = 0 ;
|
|
181 my $barcodes_at_bol = 0 ;
|
|
182 my $exact_match = 0 ;
|
|
183 my $allow_partial_overlap = 0;
|
|
184 my $allowed_mismatches = 1;
|
|
185 my $newfile_prefix_r1 = '';
|
|
186 my $newfile_prefix_r2 = '';
|
|
187 my $quiet = 0 ;
|
|
188 my $debug = 0 ;
|
|
189 my $trim = 0 ;
|
|
190 my $trim2 = 0 ;
|
|
191 my $adapt = 1 ;
|
|
192 my $no_adapt = 0 ;
|
|
193 my $rad;
|
|
194 my $radTAG;
|
|
195 my $TAG_mm=0;
|
|
196 my $fastq_format = 1;
|
|
197
|
|
198 # Global variables
|
|
199 # Populated by 'parse_command_line'
|
|
200 my $radTAG_length=0;
|
|
201 my $check_rad=0;
|
|
202
|
|
203 # Global variables
|
|
204 # Populated by 'create_output_files'
|
|
205 my %filenames;
|
|
206 my %files;
|
|
207 my %counts = ( 'unmatched' => 0 );
|
|
208 my $barcodes_length;
|
|
209 my @barcodes;
|
|
210 my $input_file_io;
|
|
211 my $input_file_io_r2;
|
|
212 my $paired = 0;
|
|
213 my $write_r2 = 0;
|
|
214
|
|
215 # The Four lines per record in FASTQ format.
|
|
216 # (when using FASTA format, only the first two are used)
|
|
217 my $seq_name;
|
|
218 my $seq_bases;
|
|
219 my $seq_name2;
|
|
220 my $seq_qualities;
|
|
221 my $seq_len;
|
|
222 my $seq_name_r2;
|
|
223 my $seq_bases_r2;
|
|
224 my $seq_name2_r2;
|
|
225 my $seq_qualities_r2;
|
|
226 my $seq_len_r2;
|
|
227
|
|
228 #
|
|
229 # Start of Program
|
|
230 #
|
|
231 parse_command_line ;
|
|
232
|
|
233 load_barcode_file ( $barcode_file ) ;
|
|
234
|
|
235 open_and_detect_input_format;
|
|
236
|
|
237 match_sequences ;
|
|
238
|
|
239 print_results unless $quiet;
|
|
240
|
|
241 #
|
|
242 # End of program
|
|
243 ########################################################################
|
|
244
|
|
245
|
|
246 sub parse_command_line {
|
|
247 my $help;
|
|
248 my $doc;
|
|
249 my $version;
|
|
250
|
|
251 pod2usage(1) if (scalar @ARGV==0);
|
|
252
|
|
253 my $result = GetOptions ( "bcfile=s" => \$barcode_file,
|
|
254 "eol" => \$barcodes_at_eol,
|
|
255 "bol" => \$barcodes_at_bol,
|
|
256 "exact" => \$exact_match,
|
|
257 "prefix-r1=s" => \$newfile_prefix_r1,
|
|
258 "prefix-r2=s" => \$newfile_prefix_r2,
|
|
259 "quiet" => \$quiet,
|
|
260 "partial=i" => \$allow_partial_overlap,
|
|
261 "trim" => \$trim,
|
|
262 "trim2" => \$trim2,
|
|
263 "debug" => \$debug,
|
|
264 "mismatches=i" => \$allowed_mismatches,
|
|
265 "no_adapt"=> \$no_adapt,
|
|
266 "rad=s"=> \$rad,
|
|
267 "radTAG=s"=> \$radTAG,
|
|
268 "TAG_mismatch=i"=> \$TAG_mm,
|
|
269 "help" => \$help,
|
|
270 "doc" => \$doc,
|
|
271 "version" => \$version
|
|
272 ) ;
|
|
273
|
|
274 pod2usage(1) if ($help);
|
|
275 doc() if ($doc);
|
|
276 version() if ($version);
|
|
277
|
|
278 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file;
|
|
279 die "Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')\n" unless defined $newfile_prefix_r1;
|
|
280 # If a read2 file is provided
|
|
281 if ($#ARGV == 1) {
|
|
282 die "Error: prefix path/filename not specified (use '--prefix-r2 [PATH]%.fq')\n" unless defined $newfile_prefix_r2;
|
|
283 }
|
|
284
|
|
285 if ($barcodes_at_bol == $barcodes_at_eol) {
|
|
286 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol;
|
|
287 die "Error: must specify either --eol or --bol\n" ;
|
|
288 }
|
|
289
|
|
290 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0;
|
|
291
|
|
292 $allowed_mismatches = 0 if $exact_match;
|
|
293
|
|
294 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0);
|
|
295
|
|
296 die "Error: partial overlap value ($allow_partial_overlap) bigger than " .
|
|
297 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches);
|
|
298
|
|
299 $trim = 1 if $trim2 ;
|
|
300
|
|
301 if ((defined $rad && !defined $radTAG) || (!defined $rad && defined $radTAG)){
|
|
302 die "Error: you must defined --rad AND --radTAG option\n";
|
|
303 }
|
|
304 if ( $no_adapt){
|
|
305 $adapt=0;
|
|
306 }
|
|
307 if (defined $rad ){
|
|
308 chomp $rad;
|
|
309 chomp $radTAG;
|
|
310 die "Error: $rad does not ended by $radTAG\n" if ($rad !~ /$radTAG$/g);
|
|
311 $adapt=0;
|
|
312 $check_rad=1;
|
|
313 #~ warn $rad, " ",$radTAG;
|
|
314 $radTAG_length=length($radTAG);
|
|
315 }
|
|
316 exit unless $result;
|
|
317 }
|
|
318
|
|
319
|
|
320
|
|
321 #
|
|
322 # Read the barcode file
|
|
323 #
|
|
324 sub load_barcode_file ($) {
|
|
325 my $filename = shift or croak "Missing barcode file name";
|
|
326
|
|
327 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n";
|
|
328 while (<BCFILE>) {
|
|
329 next if m/^#/;
|
|
330 chomp $_;
|
|
331 my @arr = split(' ',$_);
|
|
332 my $ident = $arr[0];
|
|
333 my $barcode = uc($arr[1]);
|
|
334
|
|
335 # Sanity checks on the barcodes
|
|
336 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode;
|
|
337 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n"
|
|
338 unless $barcode =~ m/^[AGCT]+$/;
|
|
339
|
|
340 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n"
|
|
341 unless $ident =~ m/^\w+$/;
|
|
342
|
|
343 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " .
|
|
344 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n"
|
|
345 if length($barcode)<=$allowed_mismatches;
|
|
346
|
|
347 $barcodes_length = length($barcode) unless defined $barcodes_length;
|
|
348 die "Error: found barcodes in different lengths. this feature is not supported yet.\n"
|
|
349 unless $barcodes_length == length($barcode);
|
|
350
|
|
351 push @barcodes, [$ident, $barcode];
|
|
352
|
|
353 if ($allow_partial_overlap>0) {
|
|
354 foreach my $i (1 .. $allow_partial_overlap) {
|
|
355 substr $barcode, ($barcodes_at_bol)?0:-1, 1, '';
|
|
356 push @barcodes, [$ident, $barcode];
|
|
357 }
|
|
358 }
|
|
359 }
|
|
360 close BCFILE;
|
|
361
|
|
362 if ($debug) {
|
|
363 print STDERR "barcode\tsequence\n";
|
|
364 foreach my $barcoderef (@barcodes) {
|
|
365 my ($ident, $seq) = @{$barcoderef};
|
|
366 print STDERR $ident,"\t", $seq ,"\t",length($seq),"\n";
|
|
367 }
|
|
368 }
|
|
369 }
|
|
370
|
|
371 # Create one output file for each barcode.
|
|
372 # (Also create a file for the dummy 'unmatched' barcode)
|
|
373 sub create_output_files {
|
|
374 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
|
|
375 $barcodes{'unmatched'} = 1 ;
|
|
376 $barcodes{'ambiguous'} = 1 ;
|
|
377
|
|
378 foreach my $ident (keys %barcodes) {
|
|
379
|
|
380 my $new_filename = $newfile_prefix_r1;
|
|
381 $new_filename =~ s/%/$ident/g;
|
|
382 $filenames{$ident} = $new_filename;
|
|
383 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
|
|
384 $files{$ident} = $file ;
|
|
385
|
|
386 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
|
|
387 my $new_filename = $newfile_prefix_r1;
|
|
388 $new_filename =~ s/%/${ident}_2rad/g;
|
|
389 $filenames{$ident."_2rad"} = $new_filename;
|
|
390 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n";
|
|
391 $files{$ident."_2rad"} = $file ;
|
|
392 }
|
|
393
|
|
394 if ($paired == 1) {
|
|
395 my $new_filename_r2 = $newfile_prefix_r2;
|
|
396 $new_filename_r2 =~ s/%/$ident/g;
|
|
397 $filenames{$ident."r2"} = $new_filename_r2;
|
|
398 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
|
|
399 $files{$ident."r2"} = $file ;
|
|
400
|
|
401 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){
|
|
402 my $new_filename_r2 = $newfile_prefix_r2;
|
|
403 $new_filename_r2 =~ s/%/${ident}_2rad/g;
|
|
404 $filenames{$ident."_2radr2"} = $new_filename_r2;
|
|
405 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n";
|
|
406 $files{$ident."_2radr2"} = $file ;
|
|
407 }
|
|
408 }
|
|
409 }
|
|
410 }
|
|
411
|
|
412 sub reverse_complement {
|
|
413 my $dna = shift;
|
|
414
|
|
415 # reverse the DNA sequence
|
|
416 my $revcomp = reverse($dna);
|
|
417
|
|
418 # complement the reversed DNA sequence
|
|
419 $revcomp =~ tr/ACGTacgt/TGCAtgca/;
|
|
420 return $revcomp;
|
|
421 }
|
|
422
|
|
423 sub match_sequences {
|
|
424
|
|
425 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers;
|
|
426 $barcodes{'unmatched'} = 1 ;
|
|
427 $barcodes{'ambiguous'} = 1 ;
|
|
428
|
|
429 #reset counters
|
|
430 foreach my $ident ( keys %barcodes ) {
|
|
431 $counts{$ident} = 0;
|
|
432 }
|
|
433
|
|
434 create_output_files;
|
|
435
|
|
436 # Read file FASTQ file
|
|
437 # split accotding to barcodes
|
|
438 do {
|
|
439 chomp $seq_bases;
|
|
440 #~ print STDERR "sequence ".length($seq_bases)." $seq_bases: \n" if $debug;
|
|
441 if ($fastq_format){
|
|
442 chomp $seq_qualities;
|
|
443 }
|
|
444
|
|
445 if ($paired == 1) {
|
|
446 chomp $seq_bases_r2;
|
|
447 if ($fastq_format){
|
|
448 chomp $seq_qualities_r2;
|
|
449 }
|
|
450 }
|
|
451 my $best_barcode_mismatches_count = $barcodes_length;
|
|
452 my $best_barcode_ident = undef;
|
|
453 my $sequence_fragment;
|
|
454 my $sequence_fragment_radTAG;
|
|
455 my $start = 0;
|
|
456 my $cut_len = $seq_len;
|
|
457 my $start2 = 0;
|
|
458 my $cut_len2 = $seq_len_r2;
|
|
459 # warn $start , " " , $cut_len , " " , $start2 , " " , $cut_len2;
|
|
460 #Try all barcodes, find the one with the lowest mismatch count
|
|
461 if ($barcodes_at_bol) {
|
|
462 $sequence_fragment = substr $seq_bases, 0, $barcodes_length;
|
|
463 if (defined $rad)
|
|
464 {
|
|
465 $sequence_fragment_radTAG = substr $seq_bases, $barcodes_length, $radTAG_length;
|
|
466 }
|
|
467 if ($trim){
|
|
468 $start=$barcodes_length+$adapt;
|
|
469 $cut_len=$seq_len-$barcodes_length-$adapt;
|
|
470 if ($trim2) {
|
|
471 $start2 = 0 ;
|
|
472 $cut_len2=$cut_len;
|
|
473 }
|
|
474 }
|
|
475 } elsif($barcodes_at_eol) {
|
|
476 if ($paired != 1){
|
|
477 $sequence_fragment = substr $seq_bases, - $barcodes_length ;
|
|
478 if (defined $rad){
|
|
479 $sequence_fragment_radTAG = substr $seq_bases, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
|
|
480 }
|
|
481 if ($trim){
|
|
482 $start=0 ;
|
|
483 $cut_len=$seq_len-$barcodes_length-$adapt;
|
|
484 }
|
|
485 } else {
|
|
486 $sequence_fragment = substr $seq_bases_r2, - $barcodes_length;
|
|
487 if (defined $rad){
|
|
488 $sequence_fragment_radTAG = substr $seq_bases_r2, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length;
|
|
489 }
|
|
490 if ($trim){
|
|
491 $start2=0 ;
|
|
492 $cut_len2=$seq_len_r2-$barcodes_length-$adapt;
|
|
493 if ($trim2) {
|
|
494 $start = 0 ;
|
|
495 $cut_len=$cut_len2 ;
|
|
496 }
|
|
497 }
|
|
498 }
|
|
499 #~ warn "sequence fragment : $sequence_fragment\n" if $debug;
|
|
500 #~ $sequence_fragment = reverse_complement $sequence_fragment;
|
|
501 #~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug;
|
|
502 }
|
|
503 # warn $start," ", $cut_len, " ",$start2, " ",$cut_len2;
|
|
504
|
|
505 foreach my $barcoderef (@barcodes) {
|
|
506 my ($ident, $barcode) = @{$barcoderef};
|
|
507
|
|
508 # Get DNA fragment (in the length of the barcodes)
|
|
509 # The barcode will be tested only against this fragment
|
|
510 # (no point in testing the barcode against the whole sequence)
|
|
511 my $mm=$barcodes_length;
|
|
512
|
|
513 # check validity of RAD.
|
|
514 if (defined $rad){
|
|
515 my $mm_rad = mismatch_count($sequence_fragment_radTAG, $radTAG) ;
|
|
516 if ($mm_rad<= $TAG_mm){ $mm = mismatch_count($sequence_fragment, $barcode) ; }
|
|
517 }else {
|
|
518 $mm = mismatch_count($sequence_fragment, $barcode) ;
|
|
519 }
|
|
520
|
|
521 # if this is a partial match, add the non-overlap as a mismatch
|
|
522 # (partial barcodes are shorter than the length of the original barcodes)
|
|
523 $mm += ($barcodes_length - length($barcode));
|
|
524
|
|
525 if ( $mm < $best_barcode_mismatches_count ) {
|
|
526 $best_barcode_mismatches_count = $mm ;
|
|
527 $best_barcode_ident = $ident ;
|
|
528 }elsif($mm == $best_barcode_mismatches_count){
|
|
529 $best_barcode_ident = "ambiguous"
|
|
530 }
|
|
531 }
|
|
532
|
|
533 if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){
|
|
534 $seq_bases = substr $seq_bases, $start, $cut_len;
|
|
535 if ($fastq_format) {
|
|
536 $seq_qualities = substr $seq_qualities, $start, $cut_len;
|
|
537 }
|
|
538 if ($paired == 1 ) {
|
|
539 $seq_bases_r2 = substr $seq_bases_r2, $start2, $cut_len2;
|
|
540 if ($fastq_format) {
|
|
541 $seq_qualities_r2 = substr $seq_qualities_r2, $start2, $cut_len2;
|
|
542 }
|
|
543 }
|
|
544 }
|
|
545
|
|
546 if ( (!defined $best_barcode_ident) || ($best_barcode_mismatches_count>$allowed_mismatches)){
|
|
547 $best_barcode_ident = 'unmatched';
|
|
548 }
|
|
549
|
|
550 #get the file associated with the matched barcode.
|
|
551 #(note: there's also a file associated with 'unmatched' barcode)
|
|
552
|
|
553 if (defined $rad && $best_barcode_ident ne 'unmatched' && $best_barcode_ident ne 'ambiguous'){
|
|
554 my $nb_rad =()= ($seq_bases =~ /$rad/g);
|
|
555 #~ warn $seq_bases," ",$nb_rad;
|
|
556 $best_barcode_ident.='_2rad' if ($nb_rad > 0);
|
|
557 #~ warn $best_barcode_ident;
|
|
558 }
|
|
559
|
|
560 $counts{$best_barcode_ident}++;
|
|
561 my $file = $files{$best_barcode_ident};
|
|
562 #~ warn "$seq_name , $best_barcode_ident \n";
|
|
563 #~ warn $file ;
|
|
564 $write_r2 = 0;
|
|
565 write_record($file);
|
|
566 if ($paired == 1) {
|
|
567 #~ warn $best_barcode_ident."r2";
|
|
568 my $file = $files{$best_barcode_ident."r2"};
|
|
569 $write_r2 = 1;
|
|
570 #~ warn $best_barcode_ident."r2";
|
|
571 write_record($file);
|
|
572 }
|
|
573
|
|
574 } while ( read_record );
|
|
575 }
|
|
576
|
|
577 #Quickly calculate hamming distance between two strings
|
|
578 #
|
|
579 #NOTE: Strings must be same length.
|
|
580 # returns number of different characters.
|
|
581 #see http://www.perlmonks.org/?node_id=500235
|
|
582 sub mismatch_count($$) {
|
|
583 length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] )
|
|
584 }
|
|
585
|
|
586
|
|
587
|
|
588 sub print_results
|
|
589 {
|
|
590 print "Barcode\tCount\tLocation\n";
|
|
591 my $total = 0 ;
|
|
592 foreach my $ident (sort keys %counts) {
|
|
593 print $ident, "\t", $counts{$ident};
|
|
594 if ($paired) {
|
|
595 print "(*2)";
|
|
596 }
|
|
597 print "\t",$filenames{$ident};
|
|
598 if ($paired) {
|
|
599 print " & ".$filenames{$ident."r2"};
|
|
600 }
|
|
601 print "\n";
|
|
602 $total += $counts{$ident};
|
|
603 }
|
|
604 print "total\t",$total;
|
|
605 if ($paired) {
|
|
606 print "(*2)";
|
|
607 }
|
|
608 print "\n";
|
|
609 }
|
|
610
|
|
611
|
|
612 sub read_record
|
|
613 {
|
|
614 $seq_name = $input_file_io->getline();
|
|
615
|
|
616 return undef unless defined $seq_name; # End of file?
|
|
617
|
|
618 $seq_bases = $input_file_io->getline();
|
|
619 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases;
|
|
620 $seq_len = length($seq_bases)-1;
|
|
621 # If using FASTQ format, read two more lines
|
|
622 if ($fastq_format) {
|
|
623 $seq_name2 = $input_file_io->getline();
|
|
624 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
|
|
625
|
|
626 $seq_qualities = $input_file_io->getline();
|
|
627 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
|
|
628 }
|
|
629
|
|
630 if ($paired == 1) {
|
|
631 $seq_name_r2 = $input_file_io_r2->getline();
|
|
632
|
|
633 return undef unless defined $seq_name_r2; # End of file?
|
|
634
|
|
635 $seq_bases_r2 = $input_file_io_r2->getline();
|
|
636 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2;
|
|
637 $seq_len_r2 = length($seq_bases_r2)-1;
|
|
638 # If using FASTQ format, read two more lines
|
|
639 if ($fastq_format) {
|
|
640 $seq_name2_r2 = $input_file_io_r2->getline();
|
|
641 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2;
|
|
642
|
|
643 $seq_qualities_r2 = $input_file_io_r2->getline();
|
|
644 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2;
|
|
645 }
|
|
646 }
|
|
647
|
|
648 return 1;
|
|
649 }
|
|
650
|
|
651 sub write_record($)
|
|
652 {
|
|
653 if ($write_r2 == 1) {
|
|
654 my $file = shift;
|
|
655 croak "Bad file handle" unless defined $file;
|
|
656
|
|
657 print $file $seq_name_r2;
|
|
658 print $file $seq_bases_r2,"\n";
|
|
659
|
|
660 #if using FASTQ format, write two more lines
|
|
661 if ($fastq_format) {
|
|
662 print $file $seq_name2_r2;
|
|
663 print $file $seq_qualities_r2,"\n";
|
|
664 }
|
|
665 } else {
|
|
666 my $file = shift;
|
|
667 croak "Bad file handle $seq_name" unless defined $file;
|
|
668
|
|
669 print $file $seq_name;
|
|
670 print $file $seq_bases,"\n";
|
|
671
|
|
672 #if using FASTQ format, write two more lines
|
|
673 if ($fastq_format) {
|
|
674 print $file $seq_name2;
|
|
675 print $file $seq_qualities,"\n";
|
|
676 }
|
|
677 }
|
|
678 }
|
|
679
|
|
680 sub open_and_detect_input_format
|
|
681 {
|
|
682 # $input_file_io = new IO::Handle;
|
|
683 # die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r");
|
|
684
|
|
685 if ($ARGV[0] =~ /.gz$/){
|
|
686 die "Failed to open '$ARGV[0]' " unless open ($input_file_io, "<:gzip", $ARGV[0] ) ;
|
|
687 #~ $input_file_io = IO::Zlib->new();
|
|
688 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io->open($ARGV[0],"r");
|
|
689 }else{
|
|
690 $input_file_io= IO::Handle->new();
|
|
691 die "Failed to open '$ARGV[0]' " unless open($input_file_io , $ARGV[0]);
|
|
692 }
|
|
693
|
|
694
|
|
695 # If a read2 file is provided
|
|
696 if ($#ARGV == 1) {
|
|
697 if ($ARGV[1] =~ /.gz$/){
|
|
698 die "Failed to open '$ARGV[0]' " unless open ($input_file_io_r2 , "<:gzip", $ARGV[1] ) ;
|
|
699 #~ $input_file_io_r2 = IO::Zlib->new();
|
|
700 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io_r2->open($ARGV[1],"r");
|
|
701 }else{
|
|
702 $input_file_io_r2 = IO::Handle->new();
|
|
703 die "Failed to open '$ARGV[1]' " unless open($input_file_io_r2 , $ARGV[1]);
|
|
704 }
|
|
705 $paired = 1;
|
|
706 }
|
|
707
|
|
708 # Get the first characeter, and push it back
|
|
709 my $first_char = $input_file_io->getc();
|
|
710 $input_file_io->ungetc(ord $first_char);
|
|
711 $seq_name = $input_file_io->getline();
|
|
712 $seq_bases = $input_file_io->getline();
|
|
713 $seq_len = length($seq_bases)-1;
|
|
714 if ($paired == 1) {
|
|
715 $seq_name_r2 = $input_file_io_r2->getline();
|
|
716 $seq_bases_r2 = $input_file_io_r2->getline();
|
|
717 $seq_len_r2 = length($seq_bases_r2)-1;
|
|
718 }
|
|
719 if ($first_char eq '>') {
|
|
720 # FASTA format
|
|
721 $fastq_format = 0 ;
|
|
722 print STDERR "Detected FASTA format\n" if $debug;
|
|
723 } elsif ($first_char eq '@') {
|
|
724 # FASTQ format
|
|
725 $fastq_format = 1;
|
|
726 $seq_name2 = $input_file_io->getline();
|
|
727 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2;
|
|
728 $seq_qualities = $input_file_io->getline();
|
|
729 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities;
|
|
730 if ($paired == 1) {
|
|
731 $seq_name2_r2 = $input_file_io_r2->getline();
|
|
732 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2;
|
|
733 $seq_qualities_r2 = $input_file_io_r2->getline();
|
|
734 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2;
|
|
735 }
|
|
736 print STDERR "Detected FASTQ format\n" if $debug;
|
|
737 } else {
|
|
738 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n";
|
|
739 }
|
|
740 # warn "len1: ", $seq_len, "; len2: ", $seq_len_r2, " ;barcode len: ", $barcodes_length, "; no adapt: ", $adapt;
|
|
741 }
|
|
742
|
|
743 sub doc()
|
|
744 {
|
|
745 print<<EOF;
|
|
746 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008
|
|
747
|
|
748 This program reads FASTA/FASTQ file and splits it into several smaller files,
|
|
749 Based on barcode matching.
|
|
750 FASTA/FASTQ data is read from STDIN (format is auto-detected.)
|
|
751 Output files will be writen to disk.
|
|
752 Summary will be printed to STDOUT.
|
|
753
|
|
754 usage: $0 read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol]
|
|
755 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug]
|
|
756
|
|
757 Arguments:
|
|
758
|
|
759 --bcfile FILE - Barcodes file name. (see explanation below.)
|
|
760 --prefix-r1 PREFIX - File prefix. The sample name will be placed where the % is placed.
|
|
761 --prefix-r2 PREFIX - File prefix. The sample name will be placed where the % is placed.
|
|
762 --bol - Try to match barcodes at the BEGINNING of sequences.
|
|
763 (What biologists would call the 5' end, and programmers
|
|
764 would call index 0.)
|
|
765 --eol - Try to match barcodes at the END of sequences.
|
|
766 (What biologists would call the 3' end, and programmers
|
|
767 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed.
|
|
768 NOTE: one of --bol, --eol must be specified, but not both.
|
|
769 --mismatches N - Max. number of mismatches allowed. default is 1.
|
|
770 --exact - Same as '--mismatches 0'. If both --exact and --mismatches
|
|
771 are specified, '--exact' takes precedence.
|
|
772 --partial N - Allow partial overlap of barcodes. (see explanation below.)
|
|
773 (Default is not partial matching)
|
|
774 --trim - Should the barecode be trimmed.
|
|
775 --trim2 - Shoud the read 2 be trimmed to have the same length as the read1
|
|
776 NOTE: one of --trim, --trim2 must be specified, but not both.
|
|
777 --no_adapt - there is no adaptator (T or A ) between barcode and the sequence
|
|
778 --rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>.
|
|
779 NOTE: --rad implies --no_adapt
|
|
780 --radTAG STR - sequence retain at the begining of the sequence after cliping.
|
|
781 NOTE: if define rad, radTAG must be defined to, and conversely
|
|
782 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file.
|
|
783 --TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0.
|
|
784 --quiet - Don't print counts and summary at the end of the run.
|
|
785 (Default is to print.)
|
|
786 --debug - Print lots of useless debug information to STDERR.
|
|
787 --help - This helpful help screen.
|
|
788
|
|
789 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is
|
|
790 the barcodes file):
|
|
791
|
|
792 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\
|
|
793 --prefix-r1 /tmp/bla_%.txt"
|
|
794
|
|
795 Barcode file format
|
|
796 -------------------
|
|
797 Barcode files are simple text files. Each line should contain an identifier
|
|
798 (descriptive name for the barcode), and the barcode itself (A/C/G/T),
|
|
799 separated by a TAB character. Example:
|
|
800
|
|
801 #This line is a comment (starts with a 'number' sign)
|
|
802 BC1 GATCT
|
|
803 BC2 ATCGT
|
|
804 BC3 GTGAT
|
|
805 BC4 TGTCT
|
|
806
|
|
807 For each barcode, a new FASTQ file will be created (with the barcode's
|
|
808 identifier as part of the file name). Sequences matching the barcode
|
|
809 will be stored in the appropriate file.
|
|
810
|
|
811 Running the above example (assuming "mybarcodes.txt" contains the above
|
|
812 barcodes), will create the following files:
|
|
813 /tmp/bla_BC1.txt
|
|
814 /tmp/bla_BC2.txt
|
|
815 /tmp/bla_BC3.txt
|
|
816 /tmp/bla_BC4.txt
|
|
817 /tmp/bla_unmatched.txt
|
|
818 The 'unmatched' file will contain all sequences that didn't match any barcode.
|
|
819
|
|
820 Barcode matching
|
|
821 ----------------
|
|
822
|
|
823 ** Without partial matching:
|
|
824
|
|
825 Count mismatches between the FASTA/Q sequences and the barcodes.
|
|
826 The barcode which matched with the lowest mismatches count (providing the
|
|
827 count is small or equal to '--mismatches N') 'gets' the sequences.
|
|
828
|
|
829 Example (using the above barcodes):
|
|
830 Input Sequence:
|
|
831 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
832
|
|
833 Matching with '--bol --mismatches 1':
|
|
834 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
835 GATCT (1 mismatch, BC1)
|
|
836 ATCGT (4 mismatches, BC2)
|
|
837 GTGAT (3 mismatches, BC3)
|
|
838 TGTCT (3 mismatches, BC4)
|
|
839
|
|
840 This sequence will be classified as 'BC1' (it has the lowest mismatch count).
|
|
841 If '--exact' or '--mismatches 0' were specified, this sequence would be
|
|
842 classified as 'unmatched' (because, although BC1 had the lowest mismatch count,
|
|
843 it is above the maximum allowed mismatches).
|
|
844
|
|
845 Matching with '--eol' (end of line) does the same, but from the other side
|
|
846 of the sequence.
|
|
847
|
|
848 ** With partial matching (very similar to indels):
|
|
849
|
|
850 Same as above, with the following addition: barcodes are also checked for
|
|
851 partial overlap (number of allowed non-overlapping bases is '--partial N').
|
|
852
|
|
853 Example:
|
|
854 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
855 (Same as above, but note the missing 'G' at the beginning.)
|
|
856
|
|
857 Matching (without partial overlapping) against BC1 yields 4 mismatches:
|
|
858 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
859 GATCT (4 mismatches)
|
|
860
|
|
861 Partial overlapping would also try the following match:
|
|
862 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG
|
|
863 GATCT (1 mismatch)
|
|
864
|
|
865 Note: scoring counts a missing base as a mismatch, so the final
|
|
866 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch).
|
|
867 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this
|
|
868 seqeunce will be classified as BC1.
|
|
869
|
|
870 EOF
|
|
871
|
|
872 exit 1;
|
|
873 }
|
|
874
|
|
875 sub version()
|
|
876 {
|
|
877 print STDOUT "1.0";
|
|
878 exit 0 ;
|
|
879 }
|