Mercurial > repos > oinizan > demultiplex
comparison splitbc.pl @ 0:da4101033e10 draft default tip
planemo upload
author | oinizan |
---|---|
date | Wed, 18 Oct 2017 05:30:40 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:da4101033e10 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # FASTX-toolkit - FASTA/FASTQ preprocessing tools. fastx_barcode_splitter.pl | |
4 # Copyright (C) 2009 A. Gordon (gordon@cshl.edu) | |
5 # | |
6 # This program is free software: you can redistribute it and/or modify | |
7 # it under the terms of the GNU Affero General Public License as | |
8 # published by the Free Software Foundation, either version 3 of the | |
9 # License, or (at your option) any later version. | |
10 # | |
11 # This program is distributed in the hope that it will be useful, | |
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 # GNU Affero General Public License for more details. | |
15 # | |
16 # You should have received a copy of the GNU Affero General Public License | |
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
18 use strict; | |
19 use warnings; | |
20 use IO::Handle; | |
21 use IO::Zlib; | |
22 use PerlIO::gzip; | |
23 use Data::Dumper; | |
24 use Getopt::Long; | |
25 use Carp; | |
26 use Pod::Usage; | |
27 | |
28 =pod | |
29 | |
30 =head1 NAME | |
31 | |
32 splitbc.pl | |
33 | |
34 =head1 SYNOPSIS | |
35 | |
36 splitbc.pl read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol] | |
37 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug] | |
38 | |
39 =head1 DESCRIPTION | |
40 | |
41 splitbc.pl classify fasta/fastq single or paired end reads in function of barcode forward or reverse in the first or both reads. | |
42 Also, whether reads are RAD sequence, it checks that barcode is followed by RAD tag. | |
43 | |
44 =head1 OPTIONS | |
45 | |
46 =over | |
47 =item B<--bcfile FILE> | |
48 | |
49 Barcodes file name. (splitbc.pl --doc for details) | |
50 | |
51 =item B<--prefix-r1 PREFIX> | |
52 | |
53 File prefix. The sample name will be placed where the % is placed. | |
54 | |
55 =item B<--prefix-r2 PREFIX> | |
56 | |
57 File prefix. The sample name will be placed where the % is placed. | |
58 | |
59 =item B<--bol> | |
60 | |
61 Try to match barcodes at the BEGINNING of sequences. | |
62 (What biologists would call the 5' end, and programmers | |
63 would call index 0.) | |
64 | |
65 =item B<--eol> | |
66 | |
67 Try to match barcodes at the END of sequences. | |
68 (What biologists would call the 3' end, and programmers | |
69 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed. | |
70 NOTE: one of --bol, --eol must be specified, but not both. | |
71 | |
72 =item B<--mismatches N> | |
73 | |
74 Max. number of mismatches allowed. default is 1. | |
75 | |
76 =item B<--exact> | |
77 | |
78 Same as '--mismatches 0'. If both --exact and --mismatches are specified, '--exact' takes precedence. | |
79 | |
80 =item B <--partial N> | |
81 | |
82 Allow partial overlap of barcodes. (splitbc.pl --doc for details) | |
83 (Default is not partial matching) | |
84 | |
85 item B<--trim> | |
86 | |
87 Should the barecode be trimmed. | |
88 | |
89 =item B<--trim2> | |
90 | |
91 Shoud the read 2 be trimmed to have the same length as the read1 | |
92 NOTE: one of --trim, --trim2 must be specified, but not both. | |
93 | |
94 =item B<--no_adapt> | |
95 | |
96 there is no adaptator (T or A ) between barcode and the sequence | |
97 | |
98 =item B<--rad STR> | |
99 | |
100 sequence are RADSeq, barcode is immediately followed by rad sequence <STR>. | |
101 NOTE: --rad implies --no_adapt | |
102 | |
103 =item B<--radTAG STR> | |
104 | |
105 sequence retain at the begining of the sequence after cliping. | |
106 NOTE: if define rad, radTAG must be defined to, and conversely | |
107 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file. | |
108 | |
109 =item B<--TAG_mismatch N> | |
110 | |
111 Max. number of mismatches allowed in the radTAG sequence. default is 0. | |
112 | |
113 =item B<--quiet> | |
114 | |
115 Don't print counts and summary at the end of the run. | |
116 (Default is to print.) | |
117 | |
118 =item B<--debug> | |
119 | |
120 Print lots of useless debug information to STDERR. | |
121 | |
122 =item B<--help> | |
123 | |
124 This helpful help screen. | |
125 | |
126 =back | |
127 | |
128 =head1 AUTHORS | |
129 | |
130 Jérôme Mariette | |
131 modified by Maria Bernard | |
132 | |
133 =head1 VERSION | |
134 | |
135 1.1 | |
136 | |
137 =head1 DATE | |
138 | |
139 modified 21/05/2013 | |
140 | |
141 =head1 KEYWORDS | |
142 | |
143 Barcode RAD | |
144 | |
145 =head1 EXAMPLE | |
146 | |
147 (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file): | |
148 $0 cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ | |
149 --prefix-r1 /tmp/bla_%.txt" | |
150 | |
151 =head1 CHANGELOG | |
152 | |
153 1.1: 18/10/2013 Maria | |
154 construct list of best barcode to deal with equal match. In this case reads will be written in "ambiguous" files | |
155 | |
156 =cut | |
157 | |
158 ## | |
159 ## This program splits a FASTQ/FASTA file into several smaller files, | |
160 ## Based on barcode matching. | |
161 ## | |
162 ## run with "--help" for usage information | |
163 ## | |
164 ## Assaf Gordon <gordon@cshl.edu> , 11sep2008 | |
165 | |
166 # Forward declarations | |
167 sub load_barcode_file ($); | |
168 sub parse_command_line ; | |
169 sub match_sequences ; | |
170 sub mismatch_count($$) ; | |
171 sub print_results; | |
172 sub open_and_detect_input_format; | |
173 sub read_record; | |
174 sub write_record($); | |
175 sub usage(); | |
176 | |
177 # Global flags and arguments, | |
178 # Set by command line argumens | |
179 my $barcode_file ; | |
180 my $barcodes_at_eol = 0 ; | |
181 my $barcodes_at_bol = 0 ; | |
182 my $exact_match = 0 ; | |
183 my $allow_partial_overlap = 0; | |
184 my $allowed_mismatches = 1; | |
185 my $newfile_prefix_r1 = ''; | |
186 my $newfile_prefix_r2 = ''; | |
187 my $quiet = 0 ; | |
188 my $debug = 0 ; | |
189 my $trim = 0 ; | |
190 my $trim2 = 0 ; | |
191 my $adapt = 1 ; | |
192 my $no_adapt = 0 ; | |
193 my $rad; | |
194 my $radTAG; | |
195 my $TAG_mm=0; | |
196 my $fastq_format = 1; | |
197 | |
198 # Global variables | |
199 # Populated by 'parse_command_line' | |
200 my $radTAG_length=0; | |
201 my $check_rad=0; | |
202 | |
203 # Global variables | |
204 # Populated by 'create_output_files' | |
205 my %filenames; | |
206 my %files; | |
207 my %counts = ( 'unmatched' => 0 ); | |
208 my $barcodes_length; | |
209 my @barcodes; | |
210 my $input_file_io; | |
211 my $input_file_io_r2; | |
212 my $paired = 0; | |
213 my $write_r2 = 0; | |
214 | |
215 # The Four lines per record in FASTQ format. | |
216 # (when using FASTA format, only the first two are used) | |
217 my $seq_name; | |
218 my $seq_bases; | |
219 my $seq_name2; | |
220 my $seq_qualities; | |
221 my $seq_len; | |
222 my $seq_name_r2; | |
223 my $seq_bases_r2; | |
224 my $seq_name2_r2; | |
225 my $seq_qualities_r2; | |
226 my $seq_len_r2; | |
227 | |
228 # | |
229 # Start of Program | |
230 # | |
231 parse_command_line ; | |
232 | |
233 load_barcode_file ( $barcode_file ) ; | |
234 | |
235 open_and_detect_input_format; | |
236 | |
237 match_sequences ; | |
238 | |
239 print_results unless $quiet; | |
240 | |
241 # | |
242 # End of program | |
243 ######################################################################## | |
244 | |
245 | |
246 sub parse_command_line { | |
247 my $help; | |
248 my $doc; | |
249 my $version; | |
250 | |
251 pod2usage(1) if (scalar @ARGV==0); | |
252 | |
253 my $result = GetOptions ( "bcfile=s" => \$barcode_file, | |
254 "eol" => \$barcodes_at_eol, | |
255 "bol" => \$barcodes_at_bol, | |
256 "exact" => \$exact_match, | |
257 "prefix-r1=s" => \$newfile_prefix_r1, | |
258 "prefix-r2=s" => \$newfile_prefix_r2, | |
259 "quiet" => \$quiet, | |
260 "partial=i" => \$allow_partial_overlap, | |
261 "trim" => \$trim, | |
262 "trim2" => \$trim2, | |
263 "debug" => \$debug, | |
264 "mismatches=i" => \$allowed_mismatches, | |
265 "no_adapt"=> \$no_adapt, | |
266 "rad=s"=> \$rad, | |
267 "radTAG=s"=> \$radTAG, | |
268 "TAG_mismatch=i"=> \$TAG_mm, | |
269 "help" => \$help, | |
270 "doc" => \$doc, | |
271 "version" => \$version | |
272 ) ; | |
273 | |
274 pod2usage(1) if ($help); | |
275 doc() if ($doc); | |
276 version() if ($version); | |
277 | |
278 die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; | |
279 die "Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')\n" unless defined $newfile_prefix_r1; | |
280 # If a read2 file is provided | |
281 if ($#ARGV == 1) { | |
282 die "Error: prefix path/filename not specified (use '--prefix-r2 [PATH]%.fq')\n" unless defined $newfile_prefix_r2; | |
283 } | |
284 | |
285 if ($barcodes_at_bol == $barcodes_at_eol) { | |
286 die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; | |
287 die "Error: must specify either --eol or --bol\n" ; | |
288 } | |
289 | |
290 die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; | |
291 | |
292 $allowed_mismatches = 0 if $exact_match; | |
293 | |
294 die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); | |
295 | |
296 die "Error: partial overlap value ($allow_partial_overlap) bigger than " . | |
297 "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); | |
298 | |
299 $trim = 1 if $trim2 ; | |
300 | |
301 if ((defined $rad && !defined $radTAG) || (!defined $rad && defined $radTAG)){ | |
302 die "Error: you must defined --rad AND --radTAG option\n"; | |
303 } | |
304 if ( $no_adapt){ | |
305 $adapt=0; | |
306 } | |
307 if (defined $rad ){ | |
308 chomp $rad; | |
309 chomp $radTAG; | |
310 die "Error: $rad does not ended by $radTAG\n" if ($rad !~ /$radTAG$/g); | |
311 $adapt=0; | |
312 $check_rad=1; | |
313 #~ warn $rad, " ",$radTAG; | |
314 $radTAG_length=length($radTAG); | |
315 } | |
316 exit unless $result; | |
317 } | |
318 | |
319 | |
320 | |
321 # | |
322 # Read the barcode file | |
323 # | |
324 sub load_barcode_file ($) { | |
325 my $filename = shift or croak "Missing barcode file name"; | |
326 | |
327 open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; | |
328 while (<BCFILE>) { | |
329 next if m/^#/; | |
330 chomp $_; | |
331 my @arr = split(' ',$_); | |
332 my $ident = $arr[0]; | |
333 my $barcode = uc($arr[1]); | |
334 | |
335 # Sanity checks on the barcodes | |
336 die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; | |
337 die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" | |
338 unless $barcode =~ m/^[AGCT]+$/; | |
339 | |
340 die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" | |
341 unless $ident =~ m/^\w+$/; | |
342 | |
343 die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . | |
344 "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" | |
345 if length($barcode)<=$allowed_mismatches; | |
346 | |
347 $barcodes_length = length($barcode) unless defined $barcodes_length; | |
348 die "Error: found barcodes in different lengths. this feature is not supported yet.\n" | |
349 unless $barcodes_length == length($barcode); | |
350 | |
351 push @barcodes, [$ident, $barcode]; | |
352 | |
353 if ($allow_partial_overlap>0) { | |
354 foreach my $i (1 .. $allow_partial_overlap) { | |
355 substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; | |
356 push @barcodes, [$ident, $barcode]; | |
357 } | |
358 } | |
359 } | |
360 close BCFILE; | |
361 | |
362 if ($debug) { | |
363 print STDERR "barcode\tsequence\n"; | |
364 foreach my $barcoderef (@barcodes) { | |
365 my ($ident, $seq) = @{$barcoderef}; | |
366 print STDERR $ident,"\t", $seq ,"\t",length($seq),"\n"; | |
367 } | |
368 } | |
369 } | |
370 | |
371 # Create one output file for each barcode. | |
372 # (Also create a file for the dummy 'unmatched' barcode) | |
373 sub create_output_files { | |
374 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
375 $barcodes{'unmatched'} = 1 ; | |
376 $barcodes{'ambiguous'} = 1 ; | |
377 | |
378 foreach my $ident (keys %barcodes) { | |
379 | |
380 my $new_filename = $newfile_prefix_r1; | |
381 $new_filename =~ s/%/$ident/g; | |
382 $filenames{$ident} = $new_filename; | |
383 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; | |
384 $files{$ident} = $file ; | |
385 | |
386 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){ | |
387 my $new_filename = $newfile_prefix_r1; | |
388 $new_filename =~ s/%/${ident}_2rad/g; | |
389 $filenames{$ident."_2rad"} = $new_filename; | |
390 open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; | |
391 $files{$ident."_2rad"} = $file ; | |
392 } | |
393 | |
394 if ($paired == 1) { | |
395 my $new_filename_r2 = $newfile_prefix_r2; | |
396 $new_filename_r2 =~ s/%/$ident/g; | |
397 $filenames{$ident."r2"} = $new_filename_r2; | |
398 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n"; | |
399 $files{$ident."r2"} = $file ; | |
400 | |
401 if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){ | |
402 my $new_filename_r2 = $newfile_prefix_r2; | |
403 $new_filename_r2 =~ s/%/${ident}_2rad/g; | |
404 $filenames{$ident."_2radr2"} = $new_filename_r2; | |
405 open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n"; | |
406 $files{$ident."_2radr2"} = $file ; | |
407 } | |
408 } | |
409 } | |
410 } | |
411 | |
412 sub reverse_complement { | |
413 my $dna = shift; | |
414 | |
415 # reverse the DNA sequence | |
416 my $revcomp = reverse($dna); | |
417 | |
418 # complement the reversed DNA sequence | |
419 $revcomp =~ tr/ACGTacgt/TGCAtgca/; | |
420 return $revcomp; | |
421 } | |
422 | |
423 sub match_sequences { | |
424 | |
425 my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; | |
426 $barcodes{'unmatched'} = 1 ; | |
427 $barcodes{'ambiguous'} = 1 ; | |
428 | |
429 #reset counters | |
430 foreach my $ident ( keys %barcodes ) { | |
431 $counts{$ident} = 0; | |
432 } | |
433 | |
434 create_output_files; | |
435 | |
436 # Read file FASTQ file | |
437 # split accotding to barcodes | |
438 do { | |
439 chomp $seq_bases; | |
440 #~ print STDERR "sequence ".length($seq_bases)." $seq_bases: \n" if $debug; | |
441 if ($fastq_format){ | |
442 chomp $seq_qualities; | |
443 } | |
444 | |
445 if ($paired == 1) { | |
446 chomp $seq_bases_r2; | |
447 if ($fastq_format){ | |
448 chomp $seq_qualities_r2; | |
449 } | |
450 } | |
451 my $best_barcode_mismatches_count = $barcodes_length; | |
452 my $best_barcode_ident = undef; | |
453 my $sequence_fragment; | |
454 my $sequence_fragment_radTAG; | |
455 my $start = 0; | |
456 my $cut_len = $seq_len; | |
457 my $start2 = 0; | |
458 my $cut_len2 = $seq_len_r2; | |
459 # warn $start , " " , $cut_len , " " , $start2 , " " , $cut_len2; | |
460 #Try all barcodes, find the one with the lowest mismatch count | |
461 if ($barcodes_at_bol) { | |
462 $sequence_fragment = substr $seq_bases, 0, $barcodes_length; | |
463 if (defined $rad) | |
464 { | |
465 $sequence_fragment_radTAG = substr $seq_bases, $barcodes_length, $radTAG_length; | |
466 } | |
467 if ($trim){ | |
468 $start=$barcodes_length+$adapt; | |
469 $cut_len=$seq_len-$barcodes_length-$adapt; | |
470 if ($trim2) { | |
471 $start2 = 0 ; | |
472 $cut_len2=$cut_len; | |
473 } | |
474 } | |
475 } elsif($barcodes_at_eol) { | |
476 if ($paired != 1){ | |
477 $sequence_fragment = substr $seq_bases, - $barcodes_length ; | |
478 if (defined $rad){ | |
479 $sequence_fragment_radTAG = substr $seq_bases, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length; | |
480 } | |
481 if ($trim){ | |
482 $start=0 ; | |
483 $cut_len=$seq_len-$barcodes_length-$adapt; | |
484 } | |
485 } else { | |
486 $sequence_fragment = substr $seq_bases_r2, - $barcodes_length; | |
487 if (defined $rad){ | |
488 $sequence_fragment_radTAG = substr $seq_bases_r2, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length; | |
489 } | |
490 if ($trim){ | |
491 $start2=0 ; | |
492 $cut_len2=$seq_len_r2-$barcodes_length-$adapt; | |
493 if ($trim2) { | |
494 $start = 0 ; | |
495 $cut_len=$cut_len2 ; | |
496 } | |
497 } | |
498 } | |
499 #~ warn "sequence fragment : $sequence_fragment\n" if $debug; | |
500 #~ $sequence_fragment = reverse_complement $sequence_fragment; | |
501 #~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug; | |
502 } | |
503 # warn $start," ", $cut_len, " ",$start2, " ",$cut_len2; | |
504 | |
505 foreach my $barcoderef (@barcodes) { | |
506 my ($ident, $barcode) = @{$barcoderef}; | |
507 | |
508 # Get DNA fragment (in the length of the barcodes) | |
509 # The barcode will be tested only against this fragment | |
510 # (no point in testing the barcode against the whole sequence) | |
511 my $mm=$barcodes_length; | |
512 | |
513 # check validity of RAD. | |
514 if (defined $rad){ | |
515 my $mm_rad = mismatch_count($sequence_fragment_radTAG, $radTAG) ; | |
516 if ($mm_rad<= $TAG_mm){ $mm = mismatch_count($sequence_fragment, $barcode) ; } | |
517 }else { | |
518 $mm = mismatch_count($sequence_fragment, $barcode) ; | |
519 } | |
520 | |
521 # if this is a partial match, add the non-overlap as a mismatch | |
522 # (partial barcodes are shorter than the length of the original barcodes) | |
523 $mm += ($barcodes_length - length($barcode)); | |
524 | |
525 if ( $mm < $best_barcode_mismatches_count ) { | |
526 $best_barcode_mismatches_count = $mm ; | |
527 $best_barcode_ident = $ident ; | |
528 }elsif($mm == $best_barcode_mismatches_count){ | |
529 $best_barcode_ident = "ambiguous" | |
530 } | |
531 } | |
532 | |
533 if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){ | |
534 $seq_bases = substr $seq_bases, $start, $cut_len; | |
535 if ($fastq_format) { | |
536 $seq_qualities = substr $seq_qualities, $start, $cut_len; | |
537 } | |
538 if ($paired == 1 ) { | |
539 $seq_bases_r2 = substr $seq_bases_r2, $start2, $cut_len2; | |
540 if ($fastq_format) { | |
541 $seq_qualities_r2 = substr $seq_qualities_r2, $start2, $cut_len2; | |
542 } | |
543 } | |
544 } | |
545 | |
546 if ( (!defined $best_barcode_ident) || ($best_barcode_mismatches_count>$allowed_mismatches)){ | |
547 $best_barcode_ident = 'unmatched'; | |
548 } | |
549 | |
550 #get the file associated with the matched barcode. | |
551 #(note: there's also a file associated with 'unmatched' barcode) | |
552 | |
553 if (defined $rad && $best_barcode_ident ne 'unmatched' && $best_barcode_ident ne 'ambiguous'){ | |
554 my $nb_rad =()= ($seq_bases =~ /$rad/g); | |
555 #~ warn $seq_bases," ",$nb_rad; | |
556 $best_barcode_ident.='_2rad' if ($nb_rad > 0); | |
557 #~ warn $best_barcode_ident; | |
558 } | |
559 | |
560 $counts{$best_barcode_ident}++; | |
561 my $file = $files{$best_barcode_ident}; | |
562 #~ warn "$seq_name , $best_barcode_ident \n"; | |
563 #~ warn $file ; | |
564 $write_r2 = 0; | |
565 write_record($file); | |
566 if ($paired == 1) { | |
567 #~ warn $best_barcode_ident."r2"; | |
568 my $file = $files{$best_barcode_ident."r2"}; | |
569 $write_r2 = 1; | |
570 #~ warn $best_barcode_ident."r2"; | |
571 write_record($file); | |
572 } | |
573 | |
574 } while ( read_record ); | |
575 } | |
576 | |
577 #Quickly calculate hamming distance between two strings | |
578 # | |
579 #NOTE: Strings must be same length. | |
580 # returns number of different characters. | |
581 #see http://www.perlmonks.org/?node_id=500235 | |
582 sub mismatch_count($$) { | |
583 length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) | |
584 } | |
585 | |
586 | |
587 | |
588 sub print_results | |
589 { | |
590 print "Barcode\tCount\tLocation\n"; | |
591 my $total = 0 ; | |
592 foreach my $ident (sort keys %counts) { | |
593 print $ident, "\t", $counts{$ident}; | |
594 if ($paired) { | |
595 print "(*2)"; | |
596 } | |
597 print "\t",$filenames{$ident}; | |
598 if ($paired) { | |
599 print " & ".$filenames{$ident."r2"}; | |
600 } | |
601 print "\n"; | |
602 $total += $counts{$ident}; | |
603 } | |
604 print "total\t",$total; | |
605 if ($paired) { | |
606 print "(*2)"; | |
607 } | |
608 print "\n"; | |
609 } | |
610 | |
611 | |
612 sub read_record | |
613 { | |
614 $seq_name = $input_file_io->getline(); | |
615 | |
616 return undef unless defined $seq_name; # End of file? | |
617 | |
618 $seq_bases = $input_file_io->getline(); | |
619 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; | |
620 $seq_len = length($seq_bases)-1; | |
621 # If using FASTQ format, read two more lines | |
622 if ($fastq_format) { | |
623 $seq_name2 = $input_file_io->getline(); | |
624 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; | |
625 | |
626 $seq_qualities = $input_file_io->getline(); | |
627 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; | |
628 } | |
629 | |
630 if ($paired == 1) { | |
631 $seq_name_r2 = $input_file_io_r2->getline(); | |
632 | |
633 return undef unless defined $seq_name_r2; # End of file? | |
634 | |
635 $seq_bases_r2 = $input_file_io_r2->getline(); | |
636 die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2; | |
637 $seq_len_r2 = length($seq_bases_r2)-1; | |
638 # If using FASTQ format, read two more lines | |
639 if ($fastq_format) { | |
640 $seq_name2_r2 = $input_file_io_r2->getline(); | |
641 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2; | |
642 | |
643 $seq_qualities_r2 = $input_file_io_r2->getline(); | |
644 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2; | |
645 } | |
646 } | |
647 | |
648 return 1; | |
649 } | |
650 | |
651 sub write_record($) | |
652 { | |
653 if ($write_r2 == 1) { | |
654 my $file = shift; | |
655 croak "Bad file handle" unless defined $file; | |
656 | |
657 print $file $seq_name_r2; | |
658 print $file $seq_bases_r2,"\n"; | |
659 | |
660 #if using FASTQ format, write two more lines | |
661 if ($fastq_format) { | |
662 print $file $seq_name2_r2; | |
663 print $file $seq_qualities_r2,"\n"; | |
664 } | |
665 } else { | |
666 my $file = shift; | |
667 croak "Bad file handle $seq_name" unless defined $file; | |
668 | |
669 print $file $seq_name; | |
670 print $file $seq_bases,"\n"; | |
671 | |
672 #if using FASTQ format, write two more lines | |
673 if ($fastq_format) { | |
674 print $file $seq_name2; | |
675 print $file $seq_qualities,"\n"; | |
676 } | |
677 } | |
678 } | |
679 | |
680 sub open_and_detect_input_format | |
681 { | |
682 # $input_file_io = new IO::Handle; | |
683 # die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); | |
684 | |
685 if ($ARGV[0] =~ /.gz$/){ | |
686 die "Failed to open '$ARGV[0]' " unless open ($input_file_io, "<:gzip", $ARGV[0] ) ; | |
687 #~ $input_file_io = IO::Zlib->new(); | |
688 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io->open($ARGV[0],"r"); | |
689 }else{ | |
690 $input_file_io= IO::Handle->new(); | |
691 die "Failed to open '$ARGV[0]' " unless open($input_file_io , $ARGV[0]); | |
692 } | |
693 | |
694 | |
695 # If a read2 file is provided | |
696 if ($#ARGV == 1) { | |
697 if ($ARGV[1] =~ /.gz$/){ | |
698 die "Failed to open '$ARGV[0]' " unless open ($input_file_io_r2 , "<:gzip", $ARGV[1] ) ; | |
699 #~ $input_file_io_r2 = IO::Zlib->new(); | |
700 #~ die "Failed to open '$ARGV[0]' " unless $input_file_io_r2->open($ARGV[1],"r"); | |
701 }else{ | |
702 $input_file_io_r2 = IO::Handle->new(); | |
703 die "Failed to open '$ARGV[1]' " unless open($input_file_io_r2 , $ARGV[1]); | |
704 } | |
705 $paired = 1; | |
706 } | |
707 | |
708 # Get the first characeter, and push it back | |
709 my $first_char = $input_file_io->getc(); | |
710 $input_file_io->ungetc(ord $first_char); | |
711 $seq_name = $input_file_io->getline(); | |
712 $seq_bases = $input_file_io->getline(); | |
713 $seq_len = length($seq_bases)-1; | |
714 if ($paired == 1) { | |
715 $seq_name_r2 = $input_file_io_r2->getline(); | |
716 $seq_bases_r2 = $input_file_io_r2->getline(); | |
717 $seq_len_r2 = length($seq_bases_r2)-1; | |
718 } | |
719 if ($first_char eq '>') { | |
720 # FASTA format | |
721 $fastq_format = 0 ; | |
722 print STDERR "Detected FASTA format\n" if $debug; | |
723 } elsif ($first_char eq '@') { | |
724 # FASTQ format | |
725 $fastq_format = 1; | |
726 $seq_name2 = $input_file_io->getline(); | |
727 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; | |
728 $seq_qualities = $input_file_io->getline(); | |
729 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; | |
730 if ($paired == 1) { | |
731 $seq_name2_r2 = $input_file_io_r2->getline(); | |
732 die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2; | |
733 $seq_qualities_r2 = $input_file_io_r2->getline(); | |
734 die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2; | |
735 } | |
736 print STDERR "Detected FASTQ format\n" if $debug; | |
737 } else { | |
738 die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; | |
739 } | |
740 # warn "len1: ", $seq_len, "; len2: ", $seq_len_r2, " ;barcode len: ", $barcodes_length, "; no adapt: ", $adapt; | |
741 } | |
742 | |
743 sub doc() | |
744 { | |
745 print<<EOF; | |
746 Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 | |
747 | |
748 This program reads FASTA/FASTQ file and splits it into several smaller files, | |
749 Based on barcode matching. | |
750 FASTA/FASTQ data is read from STDIN (format is auto-detected.) | |
751 Output files will be writen to disk. | |
752 Summary will be printed to STDOUT. | |
753 | |
754 usage: $0 read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol] | |
755 [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug] | |
756 | |
757 Arguments: | |
758 | |
759 --bcfile FILE - Barcodes file name. (see explanation below.) | |
760 --prefix-r1 PREFIX - File prefix. The sample name will be placed where the % is placed. | |
761 --prefix-r2 PREFIX - File prefix. The sample name will be placed where the % is placed. | |
762 --bol - Try to match barcodes at the BEGINNING of sequences. | |
763 (What biologists would call the 5' end, and programmers | |
764 would call index 0.) | |
765 --eol - Try to match barcodes at the END of sequences. | |
766 (What biologists would call the 3' end, and programmers | |
767 would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed. | |
768 NOTE: one of --bol, --eol must be specified, but not both. | |
769 --mismatches N - Max. number of mismatches allowed. default is 1. | |
770 --exact - Same as '--mismatches 0'. If both --exact and --mismatches | |
771 are specified, '--exact' takes precedence. | |
772 --partial N - Allow partial overlap of barcodes. (see explanation below.) | |
773 (Default is not partial matching) | |
774 --trim - Should the barecode be trimmed. | |
775 --trim2 - Shoud the read 2 be trimmed to have the same length as the read1 | |
776 NOTE: one of --trim, --trim2 must be specified, but not both. | |
777 --no_adapt - there is no adaptator (T or A ) between barcode and the sequence | |
778 --rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>. | |
779 NOTE: --rad implies --no_adapt | |
780 --radTAG STR - sequence retain at the begining of the sequence after cliping. | |
781 NOTE: if define rad, radTAG must be defined to, and conversely | |
782 Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file. | |
783 --TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0. | |
784 --quiet - Don't print counts and summary at the end of the run. | |
785 (Default is to print.) | |
786 --debug - Print lots of useless debug information to STDERR. | |
787 --help - This helpful help screen. | |
788 | |
789 Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is | |
790 the barcodes file): | |
791 | |
792 \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ | |
793 --prefix-r1 /tmp/bla_%.txt" | |
794 | |
795 Barcode file format | |
796 ------------------- | |
797 Barcode files are simple text files. Each line should contain an identifier | |
798 (descriptive name for the barcode), and the barcode itself (A/C/G/T), | |
799 separated by a TAB character. Example: | |
800 | |
801 #This line is a comment (starts with a 'number' sign) | |
802 BC1 GATCT | |
803 BC2 ATCGT | |
804 BC3 GTGAT | |
805 BC4 TGTCT | |
806 | |
807 For each barcode, a new FASTQ file will be created (with the barcode's | |
808 identifier as part of the file name). Sequences matching the barcode | |
809 will be stored in the appropriate file. | |
810 | |
811 Running the above example (assuming "mybarcodes.txt" contains the above | |
812 barcodes), will create the following files: | |
813 /tmp/bla_BC1.txt | |
814 /tmp/bla_BC2.txt | |
815 /tmp/bla_BC3.txt | |
816 /tmp/bla_BC4.txt | |
817 /tmp/bla_unmatched.txt | |
818 The 'unmatched' file will contain all sequences that didn't match any barcode. | |
819 | |
820 Barcode matching | |
821 ---------------- | |
822 | |
823 ** Without partial matching: | |
824 | |
825 Count mismatches between the FASTA/Q sequences and the barcodes. | |
826 The barcode which matched with the lowest mismatches count (providing the | |
827 count is small or equal to '--mismatches N') 'gets' the sequences. | |
828 | |
829 Example (using the above barcodes): | |
830 Input Sequence: | |
831 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
832 | |
833 Matching with '--bol --mismatches 1': | |
834 GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
835 GATCT (1 mismatch, BC1) | |
836 ATCGT (4 mismatches, BC2) | |
837 GTGAT (3 mismatches, BC3) | |
838 TGTCT (3 mismatches, BC4) | |
839 | |
840 This sequence will be classified as 'BC1' (it has the lowest mismatch count). | |
841 If '--exact' or '--mismatches 0' were specified, this sequence would be | |
842 classified as 'unmatched' (because, although BC1 had the lowest mismatch count, | |
843 it is above the maximum allowed mismatches). | |
844 | |
845 Matching with '--eol' (end of line) does the same, but from the other side | |
846 of the sequence. | |
847 | |
848 ** With partial matching (very similar to indels): | |
849 | |
850 Same as above, with the following addition: barcodes are also checked for | |
851 partial overlap (number of allowed non-overlapping bases is '--partial N'). | |
852 | |
853 Example: | |
854 Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
855 (Same as above, but note the missing 'G' at the beginning.) | |
856 | |
857 Matching (without partial overlapping) against BC1 yields 4 mismatches: | |
858 ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
859 GATCT (4 mismatches) | |
860 | |
861 Partial overlapping would also try the following match: | |
862 -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG | |
863 GATCT (1 mismatch) | |
864 | |
865 Note: scoring counts a missing base as a mismatch, so the final | |
866 mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). | |
867 If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this | |
868 seqeunce will be classified as BC1. | |
869 | |
870 EOF | |
871 | |
872 exit 1; | |
873 } | |
874 | |
875 sub version() | |
876 { | |
877 print STDOUT "1.0"; | |
878 exit 0 ; | |
879 } |