$name demultiplex

# HG changeset patch # User davidvanzessen # Date 1399537079 14400 # Node ID 25cd500a0852fac7cf79ac3391baad7f659d2182 Uploaded diff -r 000000000000 -r 25cd500a0852 demultiplex.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/demultiplex.xml Thu May 08 04:17:59 2014 -0400 @@ -0,0 +1,187 @@ + + + + fastx_toolkit + + + r_wrapper.sh $input $out_file $out_file.files_path $EOL $mismatches $partial $input.name + #for $i, $b in enumerate($barcodes) + "$b.id" + "$b.mid" + #end for + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Splitting FASTA or FASTQ files, this tool uses sff2fastq (https://github.com/indraniel/sff2fastq) to extract a fastq file and fastx_barcode_splitter.pl (http://hannonlab.cshl.edu/fastx_toolkit/commandline.html) to demultiplex. + + diff -r 000000000000 -r 25cd500a0852 fastx_barcode_splitter.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastx_barcode_splitter.pl Thu May 08 04:17:59 2014 -0400 @@ -0,0 +1,472 @@ +#!/usr/bin/perl + +# FASTX-toolkit - FASTA/FASTQ preprocessing tools. +# Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +use strict; +use warnings; +use IO::Handle; +use Data::Dumper; +use Getopt::Long; +use Carp; + +## +## This program splits a FASTQ/FASTA file into several smaller files, +## Based on barcode matching. +## +## run with "--help" for usage information +## +## Assaf Gordon , 11sep2008 + +# Forward declarations +sub load_barcode_file ($); +sub parse_command_line ; +sub match_sequences ; +sub mismatch_count($$) ; +sub print_results; +sub open_and_detect_input_format; +sub read_record; +sub write_record($); +sub usage(); + +# Global flags and arguments, +# Set by command line argumens +my $barcode_file ; +my $barcodes_at_eol = 0 ; +my $barcodes_at_bol = 0 ; +my $exact_match = 0 ; +my $allow_partial_overlap = 0; +my $allowed_mismatches = 1; +my $newfile_suffix = ''; +my $newfile_prefix ; +my $quiet = 0 ; +my $debug = 0 ; +my $fastq_format = 1; + +# Global variables +# Populated by 'create_output_files' +my %filenames; +my %files; +my %counts = ( 'unmatched' => 0 ); +my $barcodes_length; +my @barcodes; +my $input_file_io; + + +# The Four lines per record in FASTQ format. +# (when using FASTA format, only the first two are used) +my $seq_name; +my $seq_bases; +my $seq_name2; +my $seq_qualities; + + +# +# Start of Program +# +parse_command_line ; + +load_barcode_file ( $barcode_file ) ; + +open_and_detect_input_format; + +match_sequences ; + +print_results unless $quiet; + +# +# End of program +# + + + + + + + + +sub parse_command_line { + my $help; + + usage() if (scalar @ARGV==0); + + my $result = GetOptions ( "bcfile=s" => \$barcode_file, + "eol" => \$barcodes_at_eol, + "bol" => \$barcodes_at_bol, + "exact" => \$exact_match, + "prefix=s" => \$newfile_prefix, + "suffix=s" => \$newfile_suffix, + "quiet" => \$quiet, + "partial=i" => \$allow_partial_overlap, + "debug" => \$debug, + "mismatches=i" => \$allowed_mismatches, + "help" => \$help + ) ; + + usage() if ($help); + + die "Error: barcode file not specified (use '--bcfile [FILENAME]')\n" unless defined $barcode_file; + die "Error: prefix path/filename not specified (use '--prefix [PATH]')\n" unless defined $newfile_prefix; + + if ($barcodes_at_bol == $barcodes_at_eol) { + die "Error: can't specify both --eol & --bol\n" if $barcodes_at_eol; + die "Error: must specify either --eol or --bol\n" ; + } + + die "Error: invalid for value partial matches (valid values are 0 or greater)\n" if $allow_partial_overlap<0; + + $allowed_mismatches = 0 if $exact_match; + + die "Error: invalid value for mismatches (valid values are 0 or more)\n" if ($allowed_mismatches<0); + + die "Error: partial overlap value ($allow_partial_overlap) bigger than " . + "max. allowed mismatches ($allowed_mismatches)\n" if ($allow_partial_overlap > $allowed_mismatches); + + + exit unless $result; +} + + + +# +# Read the barcode file +# +sub load_barcode_file ($) { + my $filename = shift or croak "Missing barcode file name"; + + open BCFILE,"<$filename" or die "Error: failed to open barcode file ($filename)\n"; + while () { + next if m/^#/; + chomp; + my ($ident, $barcode) = split ; + + $barcode = uc($barcode); + + # Sanity checks on the barcodes + die "Error: bad data at barcode file ($filename) line $.\n" unless defined $barcode; + die "Error: bad barcode value ($barcode) at barcode file ($filename) line $.\n" + unless $barcode =~ m/^[AGCT]+$/; + + die "Error: bad identifier value ($ident) at barcode file ($filename) line $. (must be alphanumeric)\n" + unless $ident =~ m/^\w+$/; + + die "Error: badcode($ident, $barcode) is shorter or equal to maximum number of " . + "mismatches ($allowed_mismatches). This makes no sense. Specify fewer mismatches.\n" + if length($barcode)<=$allowed_mismatches; + + $barcodes_length = length($barcode) unless defined $barcodes_length; + die "Error: found barcodes in different lengths. this feature is not supported yet.\n" + unless $barcodes_length == length($barcode); + + push @barcodes, [$ident, $barcode]; + + if ($allow_partial_overlap>0) { + foreach my $i (1 .. $allow_partial_overlap) { + substr $barcode, ($barcodes_at_bol)?0:-1, 1, ''; + push @barcodes, [$ident, $barcode]; + } + } + } + close BCFILE; + + if ($debug) { + print STDERR "barcode\tsequence\n"; + foreach my $barcoderef (@barcodes) { + my ($ident, $seq) = @{$barcoderef}; + print STDERR $ident,"\t", $seq ,"\n"; + } + } +} + +# Create one output file for each barcode. +# (Also create a file for the dummy 'unmatched' barcode) +sub create_output_files { + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + + foreach my $ident (keys %barcodes) { + my $new_filename = $newfile_prefix . $ident . $newfile_suffix; + $filenames{$ident} = $new_filename; + open my $file, ">$new_filename" or die "Error: failed to create output file ($new_filename)\n"; + $files{$ident} = $file ; + } +} + +sub match_sequences { + + my %barcodes = map { $_->[0] => 1 } @barcodes; #generate a uniq list of barcode identifiers; + $barcodes{'unmatched'} = 1 ; + + #reset counters + foreach my $ident ( keys %barcodes ) { + $counts{$ident} = 0; + } + + create_output_files; + + # Read file FASTQ file + # split accotding to barcodes + while ( read_record ) { + chomp $seq_bases; + + print STDERR "sequence $seq_bases: \n" if $debug; + + my $best_barcode_mismatches_count = $barcodes_length; + my $best_barcode_ident = undef; + + #Try all barcodes, find the one with the lowest mismatch count + foreach my $barcoderef (@barcodes) { + my ($ident, $barcode) = @{$barcoderef}; + + # Get DNA fragment (in the length of the barcodes) + # The barcode will be tested only against this fragment + # (no point in testing the barcode against the whole sequence) + my $sequence_fragment; + if ($barcodes_at_bol) { + $sequence_fragment = substr $seq_bases, 0, $barcodes_length; + } else { + $sequence_fragment = substr $seq_bases, - $barcodes_length; + } + + my $mm = mismatch_count($sequence_fragment, $barcode) ; + + # if this is a partial match, add the non-overlap as a mismatch + # (partial barcodes are shorter than the length of the original barcodes) + $mm += ($barcodes_length - length($barcode)); + + if ( $mm < $best_barcode_mismatches_count ) { + $best_barcode_mismatches_count = $mm ; + $best_barcode_ident = $ident ; + } + } + + $best_barcode_ident = 'unmatched' + if ( (!defined $best_barcode_ident) || $best_barcode_mismatches_count>$allowed_mismatches) ; + + print STDERR "sequence $seq_bases matched barcode: $best_barcode_ident\n" if $debug; + + $counts{$best_barcode_ident}++; + + #get the file associated with the matched barcode. + #(note: there's also a file associated with 'unmatched' barcode) + my $file = $files{$best_barcode_ident}; + + write_record($file); + } +} + +#Quickly calculate hamming distance between two strings +# +#NOTE: Strings must be same length. +# returns number of different characters. +#see http://www.perlmonks.org/?node_id=500235 +sub mismatch_count($$) { length( $_[ 0 ] ) - ( ( $_[ 0 ] ^ $_[ 1 ] ) =~ tr[\0][\0] ) } + + + +sub print_results +{ + print "Barcode\tCount\tLocation\n"; + my $total = 0 ; + foreach my $ident (sort keys %counts) { + print $ident, "\t", $counts{$ident},"\t",$filenames{$ident},"\n"; + $total += $counts{$ident}; + } + print "total\t",$total,"\n"; +} + + +sub read_record +{ + $seq_name = $input_file_io->getline(); + + return undef unless defined $seq_name; # End of file? + + $seq_bases = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequences\n" unless defined $seq_bases; + + # If using FASTQ format, read two more lines + if ($fastq_format) { + $seq_name2 = $input_file_io->getline(); + die "Error: bad input file, expecting line with sequence name2\n" unless defined $seq_name2; + + $seq_qualities = $input_file_io->getline(); + die "Error: bad input file, expecting line with quality scores\n" unless defined $seq_qualities; + } + return 1; +} + +sub write_record($) +{ + my $file = shift; + + croak "Bad file handle" unless defined $file; + + print $file $seq_name; + print $file $seq_bases,"\n"; + + #if using FASTQ format, write two more lines + if ($fastq_format) { + print $file $seq_name2; + print $file $seq_qualities; + } +} + +sub open_and_detect_input_format +{ + $input_file_io = new IO::Handle; + die "Failed to open STDIN " unless $input_file_io->fdopen(fileno(STDIN),"r"); + + # Get the first characeter, and push it back + my $first_char = $input_file_io->getc(); + $input_file_io->ungetc(ord $first_char); + + if ($first_char eq '>') { + # FASTA format + $fastq_format = 0 ; + print STDERR "Detected FASTA format\n" if $debug; + } elsif ($first_char eq '@') { + # FASTQ format + $fastq_format = 1; + print STDERR "Detected FASTQ format\n" if $debug; + } else { + die "Error: unknown file format. First character = '$first_char' (expecting > or \@)\n"; + } +} + +sub usage() +{ +print<> $outDir/barcodes.txt +done + +cd $outDir +echo "$3" +result=`$dir/sff2fastq $input | $dir/fastx_barcode_splitter.pl --bcfile $outDir/barcodes.txt --prefix "$prefix" --suffix ".fastq" --$EOL --mismatches $mismatches --partial $partial` +echo "$result" | tail -n +2 | sed 's/\t/,/g' > output.txt +echo "$name demultiplex" >> $output +ls +while IFS=, read barcode count location + do + if [ "total" == "$barcode" ] + then + echo "" >> $output + break + fi + file=$name"_"$barcode + cat $file.fastq | perl -e '$i=0;while(< >){if(/^\@/&&$i==0){s/^\@/\>/;print;}elsif($i==1){print;$i=-3}$i++;}' > $file.fasta + cat $file.fastq + echo "" >> $output +done < output.txt +echo "" >> $output diff -r 000000000000 -r 25cd500a0852 sff2fastq Binary file sff2fastq has changed diff -r 000000000000 -r 25cd500a0852 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Thu May 08 04:17:59 2014 -0400 @@ -0,0 +1,6 @@ + + + + + +

ID	Count	FASTQ	FASTA
$barcode	$count
$barcode	$count	$file.fastq	$file.fasta