Mercurial > repos > oinizan > demultiplex
diff splitbc.pl @ 0:da4101033e10 draft default tip
planemo upload
author | oinizan |
---|---|
date | Wed, 18 Oct 2017 05:30:40 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/splitbc.pl Wed Oct 18 05:30:40 2017 -0400 @@ -0,0 +1,879 @@ +#!/usr/bin/perl + +# FASTX-toolkit - FASTA/FASTQ preprocessing tools. fastx_barcode_splitter.pl +# Copyright (C) 2009 A. Gordon (gordon@cshl.edu) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +use strict; +use warnings; +use IO::Handle; +use IO::Zlib; +use PerlIO::gzip; +use Data::Dumper; +use Getopt::Long; +use Carp; +use Pod::Usage; + +=pod + +=head1 NAME + + splitbc.pl + +=head1 SYNOPSIS + + splitbc.pl read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol] + [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug] + +=head1 DESCRIPTION + + splitbc.pl classify fasta/fastq single or paired end reads in function of barcode forward or reverse in the first or both reads. + Also, whether reads are RAD sequence, it checks that barcode is followed by RAD tag. + +=head1 OPTIONS + +=over +=item B<--bcfile FILE> + + Barcodes file name. (splitbc.pl --doc for details) + +=item B<--prefix-r1 PREFIX> + + File prefix. The sample name will be placed where the % is placed. + +=item B<--prefix-r2 PREFIX> + + File prefix. The sample name will be placed where the % is placed. + +=item B<--bol> + + Try to match barcodes at the BEGINNING of sequences. + (What biologists would call the 5' end, and programmers + would call index 0.) + +=item B<--eol> + + Try to match barcodes at the END of sequences. + (What biologists would call the 3' end, and programmers + would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed. + NOTE: one of --bol, --eol must be specified, but not both. + +=item B<--mismatches N> + + Max. number of mismatches allowed. default is 1. + +=item B<--exact> + + Same as '--mismatches 0'. If both --exact and --mismatches are specified, '--exact' takes precedence. + +=item B <--partial N> + + Allow partial overlap of barcodes. (splitbc.pl --doc for details) + (Default is not partial matching) + +item B<--trim> + + Should the barecode be trimmed. + +=item B<--trim2> + + Shoud the read 2 be trimmed to have the same length as the read1 + NOTE: one of --trim, --trim2 must be specified, but not both. + +=item B<--no_adapt> + + there is no adaptator (T or A ) between barcode and the sequence + +=item B<--rad STR> + + sequence are RADSeq, barcode is immediately followed by rad sequence <STR>. + NOTE: --rad implies --no_adapt + +=item B<--radTAG STR> + + sequence retain at the begining of the sequence after cliping. + NOTE: if define rad, radTAG must be defined to, and conversely + Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file. + +=item B<--TAG_mismatch N> + + Max. number of mismatches allowed in the radTAG sequence. default is 0. + +=item B<--quiet> + + Don't print counts and summary at the end of the run. + (Default is to print.) + +=item B<--debug> + + Print lots of useless debug information to STDERR. + +=item B<--help> + + This helpful help screen. + +=back + +=head1 AUTHORS + + Jérôme Mariette + modified by Maria Bernard + +=head1 VERSION + + 1.1 + +=head1 DATE + + modified 21/05/2013 + +=head1 KEYWORDS + + Barcode RAD + +=head1 EXAMPLE + + (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is the barcodes file): + $0 cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ + --prefix-r1 /tmp/bla_%.txt" + +=head1 CHANGELOG + + 1.1: 18/10/2013 Maria + construct list of best barcode to deal with equal match. In this case reads will be written in "ambiguous" files + +=cut + +## +## This program splits a FASTQ/FASTA file into several smaller files, +## Based on barcode matching. +## +## run with "--help" for usage information +## +## Assaf Gordon <gordon@cshl.edu> , 11sep2008 + +# Forward declarations +sub load_barcode_file ($); +sub parse_command_line ; +sub match_sequences ; +sub mismatch_count($$) ; +sub print_results; +sub open_and_detect_input_format; +sub read_record; +sub write_record($); +sub usage(); + +# Global flags and arguments, +# Set by command line argumens +my $barcode_file ; +my $barcodes_at_eol = 0 ; +my $barcodes_at_bol = 0 ; +my $exact_match = 0 ; +my $allow_partial_overlap = 0; +my $allowed_mismatches = 1; +my $newfile_prefix_r1 = ''; +my $newfile_prefix_r2 = ''; +my $quiet = 0 ; +my $debug = 0 ; +my $trim = 0 ; +my $trim2 = 0 ; +my $adapt = 1 ; +my $no_adapt = 0 ; +my $rad; +my $radTAG; +my $TAG_mm=0; +my $fastq_format = 1; + +# Global variables +# Populated by 'parse_command_line' +my $radTAG_length=0; +my $check_rad=0; + +# Global variables +# Populated by 'create_output_files' +my %filenames; +my %files; +my %counts = ( 'unmatched' => 0 ); +my $barcodes_length; +my @barcodes; +my $input_file_io; +my $input_file_io_r2; +my $paired = 0; +my $write_r2 = 0; + +# The Four lines per record in FASTQ format. +# (when using FASTA format, only the first two are used) +my $seq_name; +my $seq_bases; +my $seq_name2; +my $seq_qualities; +my $seq_len; +my $seq_name_r2; +my $seq_bases_r2; +my $seq_name2_r2; +my $seq_qualities_r2; +my $seq_len_r2; + +# +# Start of Program +# +parse_command_line ; + +load_barcode_file ( $barcode_file ) ; + +open_and_detect_input_format; + +match_sequences ; + +print_results unless $quiet; + +# +# End of program +######################################################################## + + +sub parse_command_line { + my $help; + my $doc; + my $version; + + pod2usage(1) if (scalar @ARGV==0); + + my $result = GetOptions ( "bcfile=s" => \$barcode_file, + "eol" => \$barcodes_at_eol, + "bol" => \$barcodes_at_bol, + "exact" => \$exact_match, + "prefix-r1=s" => \$newfile_prefix_r1, + "prefix-r2=s" => \$newfile_prefix_r2, + "quiet" => \$quiet, + "partial=i" => \$allow_partial_overlap, + "trim" => \$trim, + "trim2" => \$trim2, + "debug" => \$debug, + "mismatches=i" => \$allowed_mismatches, + "no_adapt"=> \$no_adapt, + "rad=s"=> \$rad, + "radTAG=s"=> \$radTAG, + "TAG_mismatch=i"=> \$TAG_mm, + "help" => \$help, + "doc" => \$doc, + "version" => \$version + ) ; + + pod2usage(1) if ($help); + doc() if ($doc); + version() if ($version); + + die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; + die "Error: prefix path/filename not specified (use '--prefix-r1 [PATH]%.fq')\n" unless defined $newfile_prefix_r1; + # If a read2 file is provided + if ($#ARGV == 1) { + die "Error: prefix path/filename not specified (use '--prefix-r2 [PATH]%.fq')\n" unless defined $newfile_prefix_r2; + } + + if ($barcodes_at_bol == $barcodes_at_eol) { + die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; + die "Error: must specify either --eol or --bol\n" ; + } + + die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; + + $allowed_mismatches = 0 if $exact_match; + + die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); + + die "Error: partial overlap value ($allow_partial_overlap) bigger than " . + "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); + + $trim = 1 if $trim2 ; + + if ((defined $rad && !defined $radTAG) || (!defined $rad && defined $radTAG)){ + die "Error: you must defined --rad AND --radTAG option\n"; + } + if ( $no_adapt){ + $adapt=0; + } + if (defined $rad ){ + chomp $rad; + chomp $radTAG; + die "Error: $rad does not ended by $radTAG\n" if ($rad !~ /$radTAG$/g); + $adapt=0; + $check_rad=1; + #~ warn $rad, " ",$radTAG; + $radTAG_length=length($radTAG); + } + exit unless $result; +} + + + +# +# Read the barcode file +# +sub load_barcode_file ($) { + my $filename = shift or croak "Missing barcode file name"; + + open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; + while (<BCFILE>) { + next if m/^#/; + chomp $_; + my @arr = split(' ',$_); + my $ident = $arr[0]; + my $barcode = uc($arr[1]); + + # Sanity checks on the barcodes + die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; + die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" + unless $barcode =~ m/^[AGCT]+$/; + + die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" + unless $ident =~ m/^\w+$/; + + die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . + "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" + if length($barcode)<=$allowed_mismatches; + + $barcodes_length = length($barcode) unless defined $barcodes_length; + die "Error: found barcodes in different lengths. this feature is not supported yet.\n" + unless $barcodes_length == length($barcode); + + push @barcodes, [$ident, $barcode]; + + if ($allow_partial_overlap>0) { + foreach my $i (1 .. $allow_partial_overlap) { + substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; + push @barcodes, [$ident, $barcode]; + } + } + } + close BCFILE; + + if ($debug) { + print STDERR "barcode\tsequence\n"; + foreach my $barcoderef (@barcodes) { + my ($ident, $seq) = @{$barcoderef}; + print STDERR $ident,"\t", $seq ,"\t",length($seq),"\n"; + } + } +} + +# Create one output file for each barcode. +# (Also create a file for the dummy 'unmatched' barcode) +sub create_output_files { + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + $barcodes{'ambiguous'} = 1 ; + + foreach my $ident (keys %barcodes) { + + my $new_filename = $newfile_prefix_r1; + $new_filename =~ s/%/$ident/g; + $filenames{$ident} = $new_filename; + open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident} = $file ; + + if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){ + my $new_filename = $newfile_prefix_r1; + $new_filename =~ s/%/${ident}_2rad/g; + $filenames{$ident."_2rad"} = $new_filename; + open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident."_2rad"} = $file ; + } + + if ($paired == 1) { + my $new_filename_r2 = $newfile_prefix_r2; + $new_filename_r2 =~ s/%/$ident/g; + $filenames{$ident."r2"} = $new_filename_r2; + open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident."r2"} = $file ; + + if (defined $rad && $ident ne "unmatched" && $ident ne "ambiguous"){ + my $new_filename_r2 = $newfile_prefix_r2; + $new_filename_r2 =~ s/%/${ident}_2rad/g; + $filenames{$ident."_2radr2"} = $new_filename_r2; + open my $file, ">$new_filename_r2" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident."_2radr2"} = $file ; + } + } + } +} + +sub reverse_complement { + my $dna = shift; + + # reverse the DNA sequence + my $revcomp = reverse($dna); + + # complement the reversed DNA sequence + $revcomp =~ tr/ACGTacgt/TGCAtgca/; + return $revcomp; +} + +sub match_sequences { + + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + $barcodes{'ambiguous'} = 1 ; + + #reset counters + foreach my $ident ( keys %barcodes ) { + $counts{$ident} = 0; + } + + create_output_files; + + # Read file FASTQ file + # split accotding to barcodes + do { + chomp $seq_bases; + #~ print STDERR "sequence ".length($seq_bases)." $seq_bases: \n" if $debug; + if ($fastq_format){ + chomp $seq_qualities; + } + + if ($paired == 1) { + chomp $seq_bases_r2; + if ($fastq_format){ + chomp $seq_qualities_r2; + } + } + my $best_barcode_mismatches_count = $barcodes_length; + my $best_barcode_ident = undef; + my $sequence_fragment; + my $sequence_fragment_radTAG; + my $start = 0; + my $cut_len = $seq_len; + my $start2 = 0; + my $cut_len2 = $seq_len_r2; +# warn $start , " " , $cut_len , " " , $start2 , " " , $cut_len2; + #Try all barcodes, find the one with the lowest mismatch count + if ($barcodes_at_bol) { + $sequence_fragment = substr $seq_bases, 0, $barcodes_length; + if (defined $rad) + { + $sequence_fragment_radTAG = substr $seq_bases, $barcodes_length, $radTAG_length; + } + if ($trim){ + $start=$barcodes_length+$adapt; + $cut_len=$seq_len-$barcodes_length-$adapt; + if ($trim2) { + $start2 = 0 ; + $cut_len2=$cut_len; + } + } + } elsif($barcodes_at_eol) { + if ($paired != 1){ + $sequence_fragment = substr $seq_bases, - $barcodes_length ; + if (defined $rad){ + $sequence_fragment_radTAG = substr $seq_bases, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length; + } + if ($trim){ + $start=0 ; + $cut_len=$seq_len-$barcodes_length-$adapt; + } + } else { + $sequence_fragment = substr $seq_bases_r2, - $barcodes_length; + if (defined $rad){ + $sequence_fragment_radTAG = substr $seq_bases_r2, - ($barcodes_length+$adapt+$radTAG_length), $radTAG_length; + } + if ($trim){ + $start2=0 ; + $cut_len2=$seq_len_r2-$barcodes_length-$adapt; + if ($trim2) { + $start = 0 ; + $cut_len=$cut_len2 ; + } + } + } + #~ warn "sequence fragment : $sequence_fragment\n" if $debug; + #~ $sequence_fragment = reverse_complement $sequence_fragment; + #~ warn "sequence fragment revcomp: $sequence_fragment \n" if $debug; + } +# warn $start," ", $cut_len, " ",$start2, " ",$cut_len2; + + foreach my $barcoderef (@barcodes) { + my ($ident, $barcode) = @{$barcoderef}; + + # Get DNA fragment (in the length of the barcodes) + # The barcode will be tested only against this fragment + # (no point in testing the barcode against the whole sequence) + my $mm=$barcodes_length; + + # check validity of RAD. + if (defined $rad){ + my $mm_rad = mismatch_count($sequence_fragment_radTAG, $radTAG) ; + if ($mm_rad<= $TAG_mm){ $mm = mismatch_count($sequence_fragment, $barcode) ; } + }else { + $mm = mismatch_count($sequence_fragment, $barcode) ; + } + + # if this is a partial match, add the non-overlap as a mismatch + # (partial barcodes are shorter than the length of the original barcodes) + $mm += ($barcodes_length - length($barcode)); + + if ( $mm < $best_barcode_mismatches_count ) { + $best_barcode_mismatches_count = $mm ; + $best_barcode_ident = $ident ; + }elsif($mm == $best_barcode_mismatches_count){ + $best_barcode_ident = "ambiguous" + } + } + + if ($best_barcode_mismatches_count<=$allowed_mismatches && $best_barcode_ident ne 'ambiguous'){ + $seq_bases = substr $seq_bases, $start, $cut_len; + if ($fastq_format) { + $seq_qualities = substr $seq_qualities, $start, $cut_len; + } + if ($paired == 1 ) { + $seq_bases_r2 = substr $seq_bases_r2, $start2, $cut_len2; + if ($fastq_format) { + $seq_qualities_r2 = substr $seq_qualities_r2, $start2, $cut_len2; + } + } + } + + if ( (!defined $best_barcode_ident) || ($best_barcode_mismatches_count>$allowed_mismatches)){ + $best_barcode_ident = 'unmatched'; + } + + #get the file associated with the matched barcode. + #(note: there's also a file associated with 'unmatched' barcode) + + if (defined $rad && $best_barcode_ident ne 'unmatched' && $best_barcode_ident ne 'ambiguous'){ + my $nb_rad =()= ($seq_bases =~ /$rad/g); + #~ warn $seq_bases," ",$nb_rad; + $best_barcode_ident.='_2rad' if ($nb_rad > 0); + #~ warn $best_barcode_ident; + } + + $counts{$best_barcode_ident}++; + my $file = $files{$best_barcode_ident}; + #~ warn "$seq_name , $best_barcode_ident \n"; + #~ warn $file ; + $write_r2 = 0; + write_record($file); + if ($paired == 1) { + #~ warn $best_barcode_ident."r2"; + my $file = $files{$best_barcode_ident."r2"}; + $write_r2 = 1; + #~ warn $best_barcode_ident."r2"; + write_record($file); + } + + } while ( read_record ); +} + +#Quickly calculate hamming distance between two strings +# +#NOTE: Strings must be same length. +# returns number of different characters. +#see http://www.perlmonks.org/?node_id=500235 +sub mismatch_count($$) { + length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) +} + + + +sub print_results +{ + print "Barcode\tCount\tLocation\n"; + my $total = 0 ; + foreach my $ident (sort keys %counts) { + print $ident, "\t", $counts{$ident}; + if ($paired) { + print "(*2)"; + } + print "\t",$filenames{$ident}; + if ($paired) { + print " & ".$filenames{$ident."r2"}; + } + print "\n"; + $total += $counts{$ident}; + } + print "total\t",$total; + if ($paired) { + print "(*2)"; + } + print "\n"; +} + + +sub read_record +{ + $seq_name = $input_file_io->getline(); + + return undef unless defined $seq_name; # End of file? + + $seq_bases = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; + $seq_len = length($seq_bases)-1; + # If using FASTQ format, read two more lines + if ($fastq_format) { + $seq_name2 = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; + + $seq_qualities = $input_file_io->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; + } + + if ($paired == 1) { + $seq_name_r2 = $input_file_io_r2->getline(); + + return undef unless defined $seq_name_r2; # End of file? + + $seq_bases_r2 = $input_file_io_r2->getline(); + die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases_r2; + $seq_len_r2 = length($seq_bases_r2)-1; + # If using FASTQ format, read two more lines + if ($fastq_format) { + $seq_name2_r2 = $input_file_io_r2->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2; + + $seq_qualities_r2 = $input_file_io_r2->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2; + } + } + + return 1; +} + +sub write_record($) +{ + if ($write_r2 == 1) { + my $file = shift; + croak "Bad file handle" unless defined $file; + + print $file $seq_name_r2; + print $file $seq_bases_r2,"\n"; + + #if using FASTQ format, write two more lines + if ($fastq_format) { + print $file $seq_name2_r2; + print $file $seq_qualities_r2,"\n"; + } + } else { + my $file = shift; + croak "Bad file handle $seq_name" unless defined $file; + + print $file $seq_name; + print $file $seq_bases,"\n"; + + #if using FASTQ format, write two more lines + if ($fastq_format) { + print $file $seq_name2; + print $file $seq_qualities,"\n"; + } + } +} + +sub open_and_detect_input_format +{ +# $input_file_io = new IO::Handle; +# die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); + + if ($ARGV[0] =~ /.gz$/){ + die "Failed to open '$ARGV[0]' " unless open ($input_file_io, "<:gzip", $ARGV[0] ) ; + #~ $input_file_io = IO::Zlib->new(); + #~ die "Failed to open '$ARGV[0]' " unless $input_file_io->open($ARGV[0],"r"); + }else{ + $input_file_io= IO::Handle->new(); + die "Failed to open '$ARGV[0]' " unless open($input_file_io , $ARGV[0]); + } + + + # If a read2 file is provided + if ($#ARGV == 1) { + if ($ARGV[1] =~ /.gz$/){ + die "Failed to open '$ARGV[0]' " unless open ($input_file_io_r2 , "<:gzip", $ARGV[1] ) ; + #~ $input_file_io_r2 = IO::Zlib->new(); + #~ die "Failed to open '$ARGV[0]' " unless $input_file_io_r2->open($ARGV[1],"r"); + }else{ + $input_file_io_r2 = IO::Handle->new(); + die "Failed to open '$ARGV[1]' " unless open($input_file_io_r2 , $ARGV[1]); + } + $paired = 1; + } + + # Get the first characeter, and push it back + my $first_char = $input_file_io->getc(); + $input_file_io->ungetc(ord $first_char); + $seq_name = $input_file_io->getline(); + $seq_bases = $input_file_io->getline(); + $seq_len = length($seq_bases)-1; + if ($paired == 1) { + $seq_name_r2 = $input_file_io_r2->getline(); + $seq_bases_r2 = $input_file_io_r2->getline(); + $seq_len_r2 = length($seq_bases_r2)-1; + } + if ($first_char eq '>') { + # FASTA format + $fastq_format = 0 ; + print STDERR "Detected FASTA format\n" if $debug; + } elsif ($first_char eq '@') { + # FASTQ format + $fastq_format = 1; + $seq_name2 = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; + $seq_qualities = $input_file_io->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; + if ($paired == 1) { + $seq_name2_r2 = $input_file_io_r2->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2_r2; + $seq_qualities_r2 = $input_file_io_r2->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities_r2; + } + print STDERR "Detected FASTQ format\n" if $debug; + } else { + die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; + } +# warn "len1: ", $seq_len, "; len2: ", $seq_len_r2, " ;barcode len: ", $barcodes_length, "; no adapt: ", $adapt; +} + +sub doc() +{ +print<<EOF; +Barcode Splitter, by Assaf Gordon (gordon\@cshl.edu), 11sep2008 + +This program reads FASTA/FASTQ file and splits it into several smaller files, +Based on barcode matching. +FASTA/FASTQ data is read from STDIN (format is auto-detected.) +Output files will be writen to disk. +Summary will be printed to STDOUT. + +usage: $0 read_file_r1 [read_file_r2] --bcfile FILE --prefix-r1 r1.%.fq [--prefix-r2 r2.%.fq] --bol|--eol] + [--mismatches N] [--exact] [--partial N] [--trim|--trim2] [--no_adapt] [--rad STR --radTAG STR] [--help] [--quiet] [--debug] + +Arguments: + +--bcfile FILE - Barcodes file name. (see explanation below.) +--prefix-r1 PREFIX - File prefix. The sample name will be placed where the % is placed. +--prefix-r2 PREFIX - File prefix. The sample name will be placed where the % is placed. +--bol - Try to match barcodes at the BEGINNING of sequences. + (What biologists would call the 5' end, and programmers + would call index 0.) +--eol - Try to match barcodes at the END of sequences. + (What biologists would call the 3' end, and programmers + would call the end of the string.). Give barcode as they appear in the sequence file, so reverse complement if needed. + NOTE: one of --bol, --eol must be specified, but not both. +--mismatches N - Max. number of mismatches allowed. default is 1. +--exact - Same as '--mismatches 0'. If both --exact and --mismatches + are specified, '--exact' takes precedence. +--partial N - Allow partial overlap of barcodes. (see explanation below.) + (Default is not partial matching) +--trim - Should the barecode be trimmed. +--trim2 - Shoud the read 2 be trimmed to have the same length as the read1 + NOTE: one of --trim, --trim2 must be specified, but not both. +--no_adapt - there is no adaptator (T or A ) between barcode and the sequence +--rad STR - sequence are RADSeq, barcode is immediately followed by rad sequence <STR>. + NOTE: --rad implies --no_adapt +--radTAG STR - sequence retain at the begining of the sequence after cliping. + NOTE: if define rad, radTAG must be defined to, and conversely + Barcode will be splited if they are folowed by radTAG, otherwise they will be recorded in the unmatched file. +--TAG_mismatch N - Max. number of mismatches allowed in the radTAG sequence. default is 0. +--quiet - Don't print counts and summary at the end of the run. + (Default is to print.) +--debug - Print lots of useless debug information to STDERR. +--help - This helpful help screen. + +Example (Assuming 's_2_100.txt' is a FASTQ file, 'mybarcodes.txt' is +the barcodes file): + + \$ cat s_2_100.txt | $0 --bcfile mybarcodes.txt --bol --mismatches 2 \\ + --prefix-r1 /tmp/bla_%.txt" + +Barcode file format +------------------- +Barcode files are simple text files. Each line should contain an identifier +(descriptive name for the barcode), and the barcode itself (A/C/G/T), +separated by a TAB character. Example: + + #This line is a comment (starts with a 'number' sign) + BC1 GATCT + BC2 ATCGT + BC3 GTGAT + BC4 TGTCT + +For each barcode, a new FASTQ file will be created (with the barcode's +identifier as part of the file name). Sequences matching the barcode +will be stored in the appropriate file. + +Running the above example (assuming "mybarcodes.txt" contains the above +barcodes), will create the following files: + /tmp/bla_BC1.txt + /tmp/bla_BC2.txt + /tmp/bla_BC3.txt + /tmp/bla_BC4.txt + /tmp/bla_unmatched.txt +The 'unmatched' file will contain all sequences that didn't match any barcode. + +Barcode matching +---------------- + +** Without partial matching: + +Count mismatches between the FASTA/Q sequences and the barcodes. +The barcode which matched with the lowest mismatches count (providing the +count is small or equal to '--mismatches N') 'gets' the sequences. + +Example (using the above barcodes): +Input Sequence: + GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + +Matching with '--bol --mismatches 1': + GATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (1 mismatch, BC1) + ATCGT (4 mismatches, BC2) + GTGAT (3 mismatches, BC3) + TGTCT (3 mismatches, BC4) + +This sequence will be classified as 'BC1' (it has the lowest mismatch count). +If '--exact' or '--mismatches 0' were specified, this sequence would be +classified as 'unmatched' (because, although BC1 had the lowest mismatch count, +it is above the maximum allowed mismatches). + +Matching with '--eol' (end of line) does the same, but from the other side +of the sequence. + +** With partial matching (very similar to indels): + +Same as above, with the following addition: barcodes are also checked for +partial overlap (number of allowed non-overlapping bases is '--partial N'). + +Example: +Input sequence is ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG +(Same as above, but note the missing 'G' at the beginning.) + +Matching (without partial overlapping) against BC1 yields 4 mismatches: + ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (4 mismatches) + +Partial overlapping would also try the following match: + -ATTTACTATGTAAAGATAGAAGGAATAAGGTGAAG + GATCT (1 mismatch) + +Note: scoring counts a missing base as a mismatch, so the final +mismatch count is 2 (1 'real' mismatch, 1 'missing base' mismatch). +If running with '--mismatches 2' (meaning allowing upto 2 mismatches) - this +seqeunce will be classified as BC1. + +EOF + +exit 1; +} + +sub version() +{ + print STDOUT "1.0"; + exit 0 ; +}