Mercurial > repos > iuc > fastq_screen
changeset 0:8a8adbf98ecc draft
First upload
author | iuc |
---|---|
date | Fri, 16 May 2014 07:57:33 -0400 |
parents | |
children | 3480daf4ed27 |
files | bowtie2_indices.loc.sample fastq_screen fastq_screen.xml tool_data_table_conf.xml.sample tool_dependencies.xml |
diffstat | 5 files changed, 1338 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bowtie2_indices.loc.sample Fri May 16 07:57:33 2014 -0400 @@ -0,0 +1,37 @@ +#This is a sample file distributed with Galaxy that enables tools +#to use a directory of Bowtie2 indexed sequences data files. You will +#need to create these data files and then create a bowtie_indices.loc +#file similar to this one (store it in this directory) that points to +#the directories in which those files are stored. The bowtie2_indices.loc +#file has this format (longer white space characters are TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_base_path> +# +#So, for example, if you had hg18 indexed stored in +#/depot/data2/galaxy/bowtie2/hg18/, +#then the bowtie2_indices.loc entry would look like this: +# +#hg18 hg18 hg18 /depot/data2/galaxy/bowtie2/hg18/hg18 +# +#and your /depot/data2/galaxy/bowtie2/hg18/ directory +#would contain hg18.*.ebwt files: +# +#-rw-r--r-- 1 james universe 830134 2005-09-13 10:12 hg18.1.ebwt +#-rw-r--r-- 1 james universe 527388 2005-09-13 10:12 hg18.2.ebwt +#-rw-r--r-- 1 james universe 269808 2005-09-13 10:12 hg18.3.ebwt +#...etc... +# +#Your bowtie2_indices.loc file should include an entry per line for each +#index set you have stored. The "file" in the path does not actually +#exist, but it is the prefix for the actual index files. For example: +# +#hg18canon hg18 hg18 Canonical /depot/data2/galaxy/bowtie2/hg18/hg18canon +#hg18full hg18 hg18 Full /depot/data2/galaxy/bowtie2/hg18/hg18full +#/orig/path/hg19 hg19 hg19 /depot/data2/galaxy/bowtie2/hg19/hg19 +#...etc... +# +#Note that for backwards compatibility with workflows, the unique ID of +#an entry must be the path that was in the original loc file, because that +#is the value stored in the workflow for that parameter. That is why the +#hg19 entry above looks odd. New genomes can be better-looking. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastq_screen Fri May 16 07:57:33 2014 -0400 @@ -0,0 +1,1073 @@ +#!/usr/bin/perl +use warnings; +use strict; +use Getopt::Long; +use FindBin qw($RealBin); +use File::Copy; +use IO::Uncompress::Gunzip qw(gunzip $GunzipError); +use File::Temp qw/ tempfile tempdir /; + +our $VERSION = "0.4.2"; + +########################################################################### +########################################################################### +## ## +## Copyright 2013, Simon Andrews (simon.andrews@babraham.ac.uk) ## +## Mark Fiers (Plant & Food Research, NZ) ## +## Steven Wingett (steven.wingett@babraham.ac.uk) ## +## ## +## This program is free software: you can redistribute it and/or modify ## +## it under the terms of the GNU General Public License as published by ## +## the Free Software Foundation, either version 3 of the License, or ## +## (at your option) any later version. ## +## ## +## This program is distributed in the hope that it will be useful, ## +## but WITHOUT ANY WARRANTY; without even the implied warranty of ## +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## +## GNU General Public License for more details. ## +## ## +## You should have received a copy of the GNU General Public License ## +## along with this program. If not, see <http://www.gnu.org/licenses/>. ## +########################################################################### +########################################################################### + + +unless(@ARGV){ #Display a help message if no arguments specified + print while (<DATA>); + exit; +} + + +## Option variables +my $subset_count; +my $outdir; +my $illumina; +my $quiet; +my $help; +my $version; +my $paired; +my $threads; +my $color; +my $conf; +my $bowtie_opts; +my $bowtie2_opts; +my $nohits; +my $aligner; + +my $config_result = GetOptions("subset=i" => \$subset_count, + "outdir=s" => \$outdir, + "illumina1_3" => \$illumina, + "quiet" => \$quiet, + "help" => \$help, + "version" => \$version, + "paired" => \$paired, + "conf=s" => \$conf, + "color" => \$color, + "bowtie=s" => \$bowtie_opts, + "bowtie2=s" => \$bowtie2_opts, + "threads=i" => \$threads, + "nohits" => \$nohits, + "aligner=s" => \$aligner + ); + +die "Could not parse options" unless ($config_result); + +if ($help) { + print while (<DATA>); + exit; +} + +if ($version) { + print "fastq_screen v$VERSION\n"; + exit; +} + +if ($color) { + $color = '-C'; +} +else { + $color = ''; +} + +if(defined $aligner){ + if( ($aligner ne 'bowtie') and ($aligner ne 'bowtie2') ){ + die "Valid options for --aligner are 'bowtie' or 'bowtie2' only.\n"; + }elsif( ($aligner eq 'bowtie') and (defined $bowtie2_opts) ){ + die "Bowtie selected as the aligner yet bowtie2 options specified.\n"; + }elsif( ($aligner eq 'bowtie2') and (defined $bowtie_opts) ){ + die "Bowtie 2 selected as the aligner yet bowtie options specified.\n"; + } +} + +$bowtie_opts = '' unless ($bowtie_opts); # Get undef warning otherwise +$bowtie2_opts = '' unless ($bowtie2_opts); # Get undef warning otherwise + +# Configuration +my $number_of_threads = 1; +my $path_to_bowtie = 'bowtie'; +my $path_to_bowtie2 = 'bowtie2'; +my @libraries; +my %library_bowtie_version; # Records which library should be used with which version of bowtie (i.e bowtie1 or bowtie2) + +load_configuration($conf); + +# Override the configuration default if they've +# manually specified a number of threads to use +$number_of_threads = $threads if ($threads); + +unless (@libraries) { + die "No search libraries were configured"; +} + + +my @files = @ARGV; + +die "No files to process" unless (@files); + +my $index = 0; +while ($index <= $#files) { + if ($paired) { + process_file($files[$index],$files[$index+1]); + $index+=2; + } + else { + process_file($files[$index]); + $index++; + } +} + + + +sub process_file { + + my ($file,$file2) = @_; + + my %seq_genomes; # Stores the hits to each genome + my $readsprocessed = 0; # Stores the total number of sequences searched + + + if ($file2) { + warn "Processing $file and $file2\n" unless ($quiet); + } + else { + warn "Processing $file\n" unless ($quiet); + } + + # Check that we can actually find the files we're working with + unless (-e $file) { + warn "Couldn't locate file $file - skipping\n"; + return; + } + if ($file2) { + unless (-e $file2) { + warn "Couldn't locate file $file2 - skipping\n"; + return; + } + } + + # Work out the output file name we're going to use + my $outfile = $file; + $outfile =~ s/\.(txt|seq|fastq)$//i; + + # We can try to remove the end specific part of the name + # if they're using standard Illumina naming. It doesn't + # really matter if this fails + if ($paired) { + $outfile =~ s/s_(\d)_1_sequence/s_${1}_sequence/; + } + + $outfile .= "_screen.txt"; + + if ($outdir) { + $outfile = (split(/\//,$outfile))[-1]; + $outfile = $outdir."/".$outfile; + } + + if (-e $outfile) { + warn "Output file $outfile already exists - skipping\n"; + return; + } + + open (OUT,'>',$outfile) or do { + warn "Couldn't write to $outfile: $!"; + return; + }; + + #Print out the Version and other information to the output file + + if($subset_count){ + print OUT "#Fastq_screen version: $VERSION\t#Reads in subset: $subset_count\n"; + }else{ + print OUT "#Fastq_screen version: $VERSION\n"; + } + + #Print the headers to the output file + print OUT join ("\t",('Library','#Reads_processed','#Unmapped','%Unmapped','#One_hit_one_library','%One_hit_one_library','#Multiple_hits_one_library','%Multiple_hits_one_library','#One_hit_multiple_libraries','%One_hit_multiple_libraries','Multiple_hits_multiple_libraries','%Multiple_hits_multiple_libraries')),"\n"; + + my $temp_file; + my $temp_file2; + + my $read_length = get_read_length($file); + + if ($read_length < 0) { + warn "Failed to calculate read length from $file"; + return; + } + + if ($read_length < 20) { + warn "Ignoring reads shorter than 20bp\n"; + $read_length = 20; + } + + # We don't use a seed of >40 even if the reads are that long + $read_length = 40 if ($read_length > 40); + + + # Count the sequences in the file. + # We need to make a subset of these sequences + + # First we need to count how many sequences are in the original file + my $seqcount = 0; + + warn "Counting sequences in $file\n" unless ($quiet); + + #Check if the read file is compressed and open accordingly + if($file =~ /\.gz$/){ + open (IN,"zcat \'$file\' |") or do { + warn "Can't read $file: $!"; + return; + }; + }else{ + open (IN,$file) or do { + warn "Can't read $file: $!"; + return; + }; + } + + ++$seqcount while (<IN>); + + $seqcount = int($seqcount/4); + + if ($subset_count and $seqcount > $subset_count*2) { # We actually need to do the reduction + + $temp_file = $file; + $temp_file .= "_temp_subset.txt"; + + # Since we're writing we need to do it in + # the directory we know we're allowed to write + # to. + + if ($outdir) { + $temp_file = (split(/\//,$temp_file))[-1]; + $temp_file = $outdir."/".$temp_file; + } + + open (TEMP,'>',$temp_file) or do { + warn "Can't write temp subset file: $!"; + return; + }; + + my $interval = sprintf("%.0f",($seqcount/$subset_count)); + + warn "Making reduced seq file with ratio $interval:1\n" unless ($quiet); + + #Move back to the start of the file. The Seek function will not work if reading the file using zcat, so + #close and open the file to move back to the start. + close IN; + if($file =~ /\.gz$/){ + open (IN,"zcat \'$file\' |") or do { + warn "Can't read $file: $!"; + return; + }; + }else{ + open (IN,$file) or do { + warn "Can't read $file: $!"; + return; + }; + } + + my $current_count = 0; + while (<IN>) { + if (/^@/) { + my $record = $_; + $record .= scalar <IN>; + $record .= scalar <IN>; + $record .= scalar <IN>; + + if ($current_count % $interval == 0) { + print TEMP $record; + ++$readsprocessed; + } + ++$current_count; + } + } + + close (TEMP) or do { + warn "Can't write temp subset file: $!"; + return; + }; + + $file = $temp_file; + + if ($file2) { # We need to do the reduction in file 2 as well + + $temp_file2 = $file2; + $temp_file2 .= "_temp_subset.txt"; + + # Since we're writing we need to do it in + # the directory we know we're allowed to write + # to. + + if ($outdir) { + $temp_file2 = (split(/\//,$temp_file2))[-1]; + $temp_file2 = $outdir."/".$temp_file2; + } + + open (TEMP,'>',$temp_file2) or do { + warn "Can't write temp2 subset file: $!"; + return; + }; + + warn "Making reduced seq file 2 with ratio $interval:1\n" unless ($quiet); + + #Check if the read file is compressed and open accordingly + if($file2 =~ /\.gz$/){ + open (IN,"zcat \'$file2\' |") or do { + warn "Failed to read $file2: $!"; + unlink ($temp_file) or warn "Couldn't delete temp file $temp_file: $!"; + return; + }; + }else{ + open (IN,$file2) or do { + warn "Failed to read $file2: $!"; + unlink ($temp_file) or warn "Couldn't delete temp file $temp_file: $!"; + return; + }; + } + + my $current_count = 0; + while (<IN>) { + if (/^@/) { + my $record = $_; + $record .= scalar <IN>; + $record .= scalar <IN>; + $record .= scalar <IN>; + + print TEMP $record if ($current_count % $interval ==0); + ++$current_count; + } + } + + close (TEMP) or do { + warn "Can't write temp2 subset file: $!"; + return; + }; + + $file2 = $temp_file2; + } + } + else { + $readsprocessed = $seqcount; + if ($subset_count) { + warn "Not making subset of $subset_count since $seqcount actual seqs is too low or close enough\n" unless ($quiet); + } + #If no file reduction was made and the original sequence file(s) were zipped, then they need to be unzipped before + #sending to Bowtie. + if ($file =~ /\.gz$/) { + + my $temp_copied_file = $file; + $temp_copied_file .= "_temp_copied.gz"; + + # Since we're writing we need to do it in + # the directory we know we're allowed to write + # to. + if ($outdir) { + $temp_copied_file = (split(/\//,$temp_copied_file))[-1]; + $temp_copied_file = $outdir."/".$temp_copied_file; + } + + #Copy the file to the output directory then unzip + #Copying the file first avoids unzipping problems if the filename is a symbolic link + copy ("$file", "$temp_copied_file"); + gunzip $temp_copied_file => "$temp_copied_file".".txt" or die "gunzip failed: $GunzipError\n"; + $file = $temp_copied_file; + unlink $temp_copied_file; + $file = $temp_copied_file.".txt"; + $temp_file = $file; #Removes the unzipped file after Bowtie is finished mapping + } + + if(($file2) and ($file2 =~ /\.gz$/)) { + + my $temp_copied_file2 = $file2; + $temp_copied_file2 .= "_temp_copied.gz"; + + if ($outdir) { + $temp_copied_file2 = (split(/\//,$temp_copied_file2))[-1]; + $temp_copied_file2 = $outdir."/".$temp_copied_file2; + } + copy ("$file2", "$temp_copied_file2"); + gunzip $temp_copied_file2 => "$temp_copied_file2".".txt" or die "gunzip failed: $GunzipError\n"; + unlink $temp_copied_file2; + $file2 = $temp_copied_file2.".txt"; + $temp_file2 = $file2; #Removes the unzipped file after Bowtie is finished mapping + } + } + + + my $library_index = -1; # Make lists in the same order as @libraries + + foreach my $library (@libraries) { + #Write Bowtie/Bowtie2 Standard Error to a temporary output file + #Generate a random filename and place in $outdir (if specified) + my $error_fh; + my $error_filename; + if($outdir){ + ($error_fh, $error_filename) = tempfile( 'aligner_standard_error.XXXXXXXX', SUFFIX => '.txt', DIR => $outdir); + }else{ + ($error_fh, $error_filename) = tempfile( 'aligner_standard_error.XXXXXXXX', SUFFIX => '.txt'); + } + + if($file2){ + warn "Searching $file and $file2 against $library->[0]\n" unless ($quiet); + }else{ + warn "Searching $file against $library->[0]\n" unless ($quiet); + } + + my $illumina_flag = ''; + $illumina_flag = "--phred64-quals" if ($illumina); + + my $bowtie_command; + + + #Count the index of the library being used + $library_index++; + + + #Determine whether to execute bowtie1 or bowtie2, whether this is a paired-end or single-end, and execute accordingly + if($library_bowtie_version{$library->[1]} == 1 ){ + if ($file2) { + $bowtie_command = "$path_to_bowtie $bowtie_opts $illumina_flag $color -l $read_length -k 2 --chunkmbs 512 --maxins 1000 -p $number_of_threads $library->[1] -1 \'$file\' -2 \'$file2\' 2>$error_filename |"; + } + else { + $bowtie_command = "$path_to_bowtie $bowtie_opts $illumina_flag $color -l $read_length -k 2 --chunkmbs 512 -p $number_of_threads $library->[1] \'$file\' 2>$error_filename |"; + } + }else{ + if ($file2) { + $bowtie_command = "$path_to_bowtie2 $bowtie2_opts $illumina_flag -k 2 --very-fast-local --no-hd --no-unal --maxins 1000 -p $number_of_threads $library->[1] -1 \'$file\' -2 \'$file2\' 2>$error_filename|"; + } + else { + $bowtie_command = "$path_to_bowtie2 $bowtie2_opts $illumina_flag -k 2 --very-fast-local --no-hd --no-unal -p $number_of_threads $library->[1] \'$file\' 2>$error_filename |"; + } + } + + open (BOWTIE,$bowtie_command) or die "Failed to launch bowtie command '$bowtie_command': $!"; + + my $seqname_suffix = 1; #Explained below + + while (<BOWTIE>) { + + my $line = $_; + chomp $line; + + #Check output is in native original bowtie format or SAM format + unless( ($line =~ /^.+\t[\+\-]\t.+\t.+\t[TAGCN]+\t.+/) or ($line =~ /^.+\t\d+\t\S+\t\d+\t\d+\t\S+\t.+\t\d+\t\S+\t[ATCGN]+\t.+/) ){ + # This is a progress message coming from stderr + warn "Bowtie/Bowtie2 output not in correct format: $_\n"; + next; + } + + my ($seqname) = split(/\t/, $line); + + #Unlike the original bowtie native output format, bowtie2 removes the '/1' or '/2' from the end of a pair-end read (in fact bowtie adds the '/1' or '/2' + #if it is missing in the FASTQ file). This will cause counting problems. To remedy this add '/1' or '/2' or alternating alignments + if( ($library_bowtie_version{$library->[1]} == 2) and $file2 ){ #Bowtie2 used as aligner and a paired-end read + $seqname = $seqname.'/'.$seqname_suffix; + $seqname_suffix = abs($seqname_suffix - 3); #seqname_suffix alternates between 1 and 2 + } + + + # Initialise an array of zero values the same length as @libraries + unless (exists $seq_genomes{$seqname}){ + for my $position(0..$#libraries){ + $seq_genomes{$seqname}->[$position] = 0; + } + } + + + # If there is already a hit for this read against this library, set the hit score to 2 + # If this is the first hit, set the score to 1 + if($seq_genomes{$seqname}->[$library_index] == 1){ + $seq_genomes{$seqname}->[$library_index] = 2; + }else{ + $seq_genomes{$seqname}->[$library_index] = 1; + } + } + + + #Check the Standard Error output file and report any errors + #Bowtie reports the alignment summary to standard error, so ignore the alignment summary + while(<$error_fh>){ + unless (/^\#/ or /^Reported/ or /^No alignments/ or /reads; of these/ or /were unpaired; of these/ or /aligned/ or /overall alignment rate/ or /mates make up the pairs; of these/ or /were paired; of these/ or /----/) { + warn "Bowtie/Bowtie2 warning: $_"; + } + } + close $error_fh; + unlink $error_filename or die "Could not delete temporary Standard Error file '$error_filename' : $!"; + } + + + # Collate the hit results from the Bowtie searches. + #Each read/read pair can be classified as 1) maps to no libraries; + #2) maps uniquely to a specified library; 3) maps multiple times to a specified library, but to no others; + #4) maps once to a specified library and to at least one other library; 5) maps multiple + #times to a specified library and to at least one other library. + + my @one_hit_one_library; + my @one_hit_multiple_libraries; + my @multiple_hits_one_library; + my @multiple_hits_multiple_libraries; + + # Initialise thoses arrays with zero values, making them the same length as @libraries + for my $position(0..$#libraries){ + $one_hit_one_library[$position] = 0; + $one_hit_multiple_libraries[$position] = 0; + $multiple_hits_one_library[$position] = 0; + $multiple_hits_multiple_libraries[$position] = 0; + } + + foreach my $seq_id(keys %seq_genomes){ + my $numlib = 0; + + # Count up the number of libraries where this sequence + # had a hit + foreach (@{$seq_genomes{$seq_id}}) { + $numlib++ if ($_); + } + + #Identify the hits + # Classify the type of hit + if($numlib == 1){ + foreach my $index (0..$#{$seq_genomes{$seq_id}}){ + if($seq_genomes{$seq_id}->[$index] == 2){ + $multiple_hits_one_library[$index]++; #Maps multiple times to a specified library + }else{ + $one_hit_one_library[$index] += $seq_genomes{$seq_id}->[$index]; #Maps uniquely / does not map (i.e. add 0) + } + } + }else{ + foreach my $index (0..$#{$seq_genomes{$seq_id}}){ + if($seq_genomes{$seq_id}->[$index] == 2){ + $multiple_hits_multiple_libraries[$index]++; #Maps multiple times to a specified library, but to no others + }else{ + $one_hit_multiple_libraries[$index] += $seq_genomes{$seq_id}->[$index]; #Maps multiple times / does not map (i.e. add 0) + } + } + } + } + + # If we're doing a paired end search then all our counts will + # be double what they should be since we'll get hits for both + # of the sequences in the pair with different ids (bowtie1 will + # make them different even if they weren't originally). We + # therefore need to halve the counts before calculating percentages. + # The easiest way to do this is to double the expected total + # count + + $readsprocessed *= 2 if ($file2); + + # Summarise the counts and write the text report + foreach my $index (0..$#libraries){ + my $library = $libraries[$index]; + + my $percent_one_hit_one_library = ($one_hit_one_library[$index] / $readsprocessed) * 100; + $percent_one_hit_one_library = sprintf("%.2f",$percent_one_hit_one_library); + + my $percent_one_hit_multiple_libraries = ($one_hit_multiple_libraries[$index] / $readsprocessed) * 100; + $percent_one_hit_multiple_libraries = sprintf("%.2f",$percent_one_hit_multiple_libraries); + + my $percent_multiple_hits_one_library = ($multiple_hits_one_library[$index] / $readsprocessed) * 100; + $percent_multiple_hits_one_library = sprintf("%.2f", $percent_multiple_hits_one_library); + + my $percent_multiple_hits_multiple_libraries = ($multiple_hits_multiple_libraries[$index] / $readsprocessed) * 100; + $percent_multiple_hits_multiple_libraries = sprintf("%.2f", $percent_multiple_hits_multiple_libraries); + + my $percent_unmapped = 100 - $percent_one_hit_one_library - $percent_one_hit_multiple_libraries - $percent_multiple_hits_one_library - $percent_multiple_hits_multiple_libraries; + $percent_unmapped = sprintf("%.2f", $percent_unmapped); + + my $reads_unmapped = $readsprocessed - $one_hit_one_library[$index] - $one_hit_multiple_libraries[$index] - $multiple_hits_one_library[$index] - $multiple_hits_multiple_libraries[$index]; + + print OUT join("\t",($library->[0], $readsprocessed, $reads_unmapped, $percent_unmapped, $one_hit_one_library[$index], $percent_one_hit_one_library, $multiple_hits_one_library[$index], $percent_multiple_hits_one_library, $one_hit_multiple_libraries[$index], $percent_one_hit_multiple_libraries, $multiple_hits_multiple_libraries[$index], $percent_multiple_hits_multiple_libraries)),"\n"; + } + + #Calculate the number of reads that mapped to none of the libraries + my $percent_hit_no_libraries = (($readsprocessed - (scalar keys %seq_genomes)) / $readsprocessed) * 100; + $percent_hit_no_libraries = sprintf("%.2f",$percent_hit_no_libraries); + print OUT "\n\%Hit_no_libraries: $percent_hit_no_libraries\n"; + + close IN; + close OUT or die "Coudn't write to $outfile: $!"; + + + if($nohits){ + if($file2){ + find_no_hits(\%seq_genomes, $file, $file2, $outfile); + }else{ + find_no_hits(\%seq_genomes, $file, $outfile); + } + } + + unlink($temp_file) or warn "Unable to delete temp file '$temp_file'" if ($temp_file); + + unlink($temp_file2) or warn "Unable to delete temp file '$temp_file'" if ($temp_file2); + + + #Check whether the module GD::Graph is installed + eval{ + require GD::Graph::bars; + GD::Graph::pie->import(); + }; + + unless($@) { + make_graph($outfile); + }else{ + warn "Perl module GD::Graph::bars not installed, skipping charts\n"; + } +} + +print "Processing complete\n"; + + + +##################################################################################################### +#Subroutines +##################################################################################################### + + +sub load_configuration { + + # Find the config file + + my ($conf_file) = @_; + + # If they haven't specified a conf file then look + # in the directory containing the program. + $conf_file = "$RealBin/fastq_screen.conf" unless ($conf_file); + + unless (-e $conf_file) { + die "Couldn't find fastq_screen.conf at '$conf_file'"; + } + + warn "Reading configuration from '$conf_file'\n" unless ($quiet); + + open (CONF,$conf_file) or die "Can't read $conf_file : $!"; + + #Determine aligner to use from 1) paths specified and 2) databases if no aligner specified + if(!defined $aligner){ + my $bowtie1_path_exists = 0; + my $bowtie2_path_exists = 0; + my $bowtie1_database_exists = 0; + my $bowtie2_database_exists = 0; + + while (<CONF>) { + chomp; + s/^\s+//; + s/\s+$//; + + next if (/^\#/); + next unless ($_); + + my ($name) = split(/\s+/); + + if ($name eq 'BOWTIE'){ + if( defined ((split(/\s+/,$_,2))[1]) ){ + $bowtie1_path_exists = 1; + } + }elsif($name eq 'BOWTIE2'){ + if( defined ((split(/\s+/,$_,2))[1]) ){ + $bowtie2_path_exists = 1; + } + }elsif ($name eq 'DATABASE') { + my (undef, undef, undef, $bowtie_version) = split(/\s+/,$_,4); + + if(!defined $bowtie_version){ #Determine whether an index should be used with bowtie1 or bowtie2 + $bowtie1_database_exists = 1; + }elsif($bowtie_version =~ /bowtie2/i){ + $bowtie2_database_exists = 1; + + } + } + } + + #Is bowtie or bowtie2 in path? + unless(system "which bowtie >/dev/null 2>&1"){ + $bowtie1_path_exists = 1; + } + unless(system "which bowtie2 >/dev/null 2>&1"){ + $bowtie2_path_exists = 1; + } + + #Decide whether to proceed and display message + if($bowtie1_path_exists and $bowtie2_path_exists and $bowtie1_database_exists and $bowtie2_database_exists){ + $aligner = 'bowtie'; + warn "Aligner not specified (--aligner). Path to bowtie and bowtie2 both specified, bowtie and bowtie2 database both present: defaulting to bowtie\n"; + }elsif($bowtie1_path_exists and $bowtie1_database_exists){ + $aligner = 'bowtie'; + warn "Aligner not specified (--aligner). Path to bowtie specified and bowtie database(s) present: mapping with bowtie\n"; + }elsif($bowtie2_path_exists and $bowtie2_database_exists){ + $aligner = 'bowtie2'; + warn "Aligner not specified (--aligner). Path to bowtie2 specified and bowtie2 database(s) present: mapping with bowtie2\n"; + }else{ + die "Aligner not specified (--aligner). No path to bowtie or bowtie2, or, no bowtie or bowtie2 databases specified: lease adjust configuration file.\n"; + } + seek (CONF,0,0); #Return to start of conf file + } + + + while (<CONF>) { + chomp; + s/^\s+//; + s/\s+$//; + + next if (/^\#/); + next unless ($_); + + my ($name) = split(/\s+/); + + if ( ($name eq 'BOWTIE') and ($aligner eq 'bowtie') ) { + + $path_to_bowtie = (split(/\s+/,$_,2))[1]; + + die "Empty value set for BOWTIE config parameter" unless ($path_to_bowtie); + + warn "Using '$path_to_bowtie' as bowtie path\n" unless ($quiet); + } + + elsif( ($name eq 'BOWTIE2') and ($aligner eq 'bowtie2') ) { + $path_to_bowtie2 = (split(/\s+/,$_,2))[1]; + + die "Empty value set for BOWTIE2 config parameter" unless ($path_to_bowtie2); + + warn "Using '$path_to_bowtie2' as bowtie2 path\n" unless ($quiet); + } + + elsif ($name eq 'THREADS') { + $number_of_threads = (split(/\s+/))[1]; + unless ($number_of_threads =~ /^\d+$/) { + die "Invalid number of threads '$number_of_threads set in conf file"; + } + warn "Using $number_of_threads threads for searches\n" unless ($quiet); + } + + elsif ($name eq 'DATABASE') { + my (undef,$db_name,$db_path,$bowtie_version) = split(/\s+/,$_,4); + + if(!defined $bowtie_version){ #Determine whether an index should be used with bowtie1 or bowtie2 + $bowtie_version = 1; + }elsif($bowtie_version =~ /bowtie2/i){ + $bowtie_version = 2; + #Bowtie2 cannot process colorspace reads + if($color){ + die "Bowtie2 cannot process colorspace reads, in the configuration file please specify original Bowtie indices for '$db_name'\n"; + } + }elsif($bowtie_version =~ /bowtie/i){ + $bowtie_version = 1; + }else{ + die "Could not determine which bowtie version to use for: '$db_name\t$db_path\t$bowtie_version'\nPlease modify the configuraion file by entering after the path to the desired sequence aligner either i)'Bowtie2' or ii)leave blank to default to the original Bowtie\n"; + } + + if( ($bowtie_version == 1) and ($aligner eq 'bowtie2') ){ #Check the index is for the aligner to be used + next; + }elsif( ($bowtie_version == 2) and ($aligner eq 'bowtie') ){ + next; + } + + $library_bowtie_version{$db_path} = $bowtie_version; + + # Check to see that there's a bowtie or bowtie2 index at that location + if($bowtie_version == 1){ + unless (-e "$db_path.1.ebwt") { + warn "Skipping DATABASE '$db_name' since no bowtie index was found at '$db_path'\n"; + next; + } + }else{ + unless (-e "$db_path.1.bt2") { + warn "Skipping DATABASE '$db_name' since no bowtie index was found at '$db_path'\n"; + next; + } + } + + warn "Adding database $db_name\n" unless ($quiet); + push @libraries, [$db_name,$db_path]; + } + } + close CONF; +} + + + +sub get_read_length { + + my ($file) = @_; + + #Check if the read file is compressed and open accordingly + if($file =~ /\.gz$/){ + open (IN,"zcat \'$file\' |") or do { + warn "Failed to read $file: $!"; + return 0; + }; + }else{ + open (IN,$file) or do { + warn "Failed to read $file: $!"; + return 0; + }; + } + + my $shortest_length = -1; + + my $current_count = 0; + while (<IN>) { + if (/^@/) { + my $seq = scalar <IN>; + chomp $seq; + if ($shortest_length < 0 or length $seq < $shortest_length) { + $shortest_length = length $seq; + } + + # Skip the rest of the record + $_ = scalar <IN>; + $_ = scalar <IN>; + + ++$current_count; + last if ($current_count > 1000); + } + } + return $shortest_length; +} + + + +sub make_graph { + + my ($file) = @_; + + my $outfile = $file; + $outfile =~ s/\.txt$//; + + open (IN,$file) or die "Can't open data file '$file' to create graph"; + + my @data; + my $percent_no_hits; + + $_ = <IN>; #Header + $_ = <IN>; #Header + + while (<IN>) { + chomp; + + if(/^\s*$/){ + next; + } + + if(/^\%Hit_no_libraries: (\S+)/){ + $percent_no_hits = $1; + last; + } + + my ($name, undef, undef, $unmapped, undef, $percent_one_hit_one_library, undef, $percent_multiple_hits_one_library, undef, $percent_one_hit_multiple_libraries, undef, $percent_multiple_hits_multiple_libraries) = split(/\t/); + + push @{$data[0]},$name; + push @{$data[1]},$percent_one_hit_one_library; + push @{$data[2]},$percent_multiple_hits_one_library; + push @{$data[3]},$percent_one_hit_multiple_libraries; + push @{$data[4]},$percent_multiple_hits_multiple_libraries; + } + + close IN; + + #Add a separate bar for the reads that hit none of the libraries + push @{$data[0]},'No hits'; + push @{$data[1]},$percent_no_hits; + push @{$data[2]},'0'; + push @{$data[3]},'0'; + push @{$data[4]},'0'; + + # my $graph = GD::Graph::bars->new(75* @{$data[0]},350); + # too small - ross fix may 12 2014 + my $graph = GD::Graph::bars->new(1024,768); + + + $graph->set_legend('One hit\one library', 'Multiple hits\one library', 'One hit\multiple libraries','Multiple hits\multiple libraries'); + + $graph -> set ( + y_label => '% Mapped', + title => ((split(/\//,$outfile))[-1]), + y_max_value => 100, + bar_spacing => 15, + transparent => 0, + overwrite => 1, + cumulate => 1, + dclrs => [qw(lblue dblue lred dred)], + ); + + my $gd = $graph -> plot (\@data); + + + + $outfile .= ".png"; + + open (IMG,'>',$outfile) or die "Can't write graph to $outfile: $!"; + binmode IMG; + print IMG $gd->png(); + close IMG or die "Can't write graph to $outfile: $!"; +} + + + +sub find_no_hits{ + my $seq_genomes; + my $file; + my $file2; + my $no_hits_filename; + my @filenames = (); + my $count = 0; + + #Determine if we have a paired read + if(scalar @_ == 4){ + ($seq_genomes, $file, $file2, $no_hits_filename) = @_; + push(@filenames, ($file, $file2)); + }else{ + ($seq_genomes, $file, $no_hits_filename) = @_; + push(@filenames, $file); + } + + $no_hits_filename .= "_no_hits.txt"; + + foreach my $filename (@filenames){ + $count++; + + open(IN, $filename) or die "Can't read $filename : $!"; + + open(OUT, ">>$no_hits_filename") or die "Could not write to $no_hits_filename"; + + while(<IN>){ + if (/^@/) { + my $seq_id = $_; + chomp $seq_id; + $seq_id = substr($seq_id,1); #Remove the '@' symbol to match Bowtie output + my $rest_of_record = scalar <IN>; + $rest_of_record .= scalar <IN>; + $rest_of_record .= scalar <IN>; + + #Read files containing paired-end reads will sometimes name the reads according to whether they are the #1 or #2 mates + # by appending a /1 or /2 suffix to the read name. If no such suffix is present in Bowtie's input, the suffix will be added + #when Bowtie prints read names in alignments. + #Paired files without the /1, /2 convention will cause problems for fastq_screen, since the Bowte output read name will not match + #that in the input file. The code below corrects for this. + if($file2){ + my $mate = substr($seq_id, -2); + unless(($mate eq '/1') or ($mate eq '/2')){ + $seq_id .= '/'."$count"; + } + } + + unless(exists $$seq_genomes{$seq_id}){ + print OUT "@"."$seq_id"."\n"."$rest_of_record"; + } + } + } + close IN; + close OUT or die "Coudn't write to $no_hits_filename: $!"; + } +} + + + +__DATA__ + +Fastq Screen - Screen sequences against a panel of databases + +Synopsis + + fastq_screen [OPTION]... [FastQ FILE]... + +Function + + Fastq Screen is intended to be used as part of a QC pipeline. + It allows you to take a sequence dataset and search it + against a set of bowtie databases. It will then generate + both a text and a graphical summary of the results to see if + the sequence dataset contains the kind of sequences you expect + or not. + +Options + + --help -h Print program help and exit + + --subset Don't use the whole sequence file to search, but + create a temporary dataset of this size. The + dataset created will be of approximately (within + a factor of 2) of this size. If the real dataset + is smaller than twice the specified size then the + whole dataset will be used. Subsets will be taken + evenly from throughout the whole original dataset + + --paired Files are paired end. Files must be specified in + the correct order with pairs of files coming + immediately after one another. Results files will + be named after the first file in the pair if the + names differ between the two files. + + --outdir Specify a directory in which to save output files. + If no directory is specified then output files + are saved into the same directory as the input + file. + + --illumina1_3 Assume that the quality values are in encoded in + Illumina v1.3 format. Defaults to Sanger format + if this flag is not specified + + --quiet Supress all progress reports on stderr and only + report errors + + --version Print the program version and exit + + --threads Specify across how many threads bowtie will be + allowed to run. Overrides the default value set + in the conf file + + --conf Manually specify a location for the configuration + file to be used for this run. If not specified + then the file will be taken from the same directory + as the fastq_screen program + + --color FastQ files are in colorspace. This requires that + the libraries configures in the config file are + colorspace indices. + + --bowtie Specify extra parameters to be passed to bowtie. + These parameters should be quoted to clearly + delimit bowtie parameters from fastq_screen + parameters. You should not try to use this option + to override the normal search or reporting options + for bowtie which are set automatically but it might + be useful to allow reads to be trimmed before + alignment etc. + + --bowtie2 Specify extra parameters to be passed to bowtie 2. + These parameters should be quoted to clearly + delimit bowtie2 parameters from fastq_screen + parameters. You should not try to use this option + to override the normal search or reporting options + for bowtie which are set automatically but it might + be useful to allow reads to be trimmed before + alignment etc. + + --nohits Writes to a file the sequences that did not map to + any of the specified genome libraries. If the + subset option is also specified, only reads from + the temporary dataset that failed to align to the + reference genomes will be written to the output file. + + --aligner Specify the aligner to use for the mapping. Valid + arguments are 'bowtie' or 'bowtie2'. +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fastq_screen.xml Fri May 16 07:57:33 2014 -0400 @@ -0,0 +1,189 @@ +<tool id="fastq_screen" name="fastq_screen" version="0.4.2"> + <description>Screen for contamination</description> + <requirements> + <requirement type="package" version="0.4.2">fastq_screen</requirement> + <requirement type="package" version="2.1.0">bowtie2</requirement> + </requirements> + <command> + fastq_screen --aligner="bowtie2" --outdir="." --conf="$fastqrunconf" + #if $sampN > 0: + --subset "$sampN" + #end if + "$input1" + #if $singlePaired.sPaired == "paired": + "$input2" + #end if + ; mv *_screen.png ${outpng} ; mv *_screen.txt ${outtext} + </command> + + <stdio> + <regex match=".*" source="both" level="warning" description="fastqc_screen perl script output"/> + </stdio> + + <inputs> + <param name="jobName" type="text" size="120" value="fastq_screen" label="Job narrative (included in output names as a reminder)" + help="Only letters, numbers and underscores _ will be retained in this field"> + <sanitizer invalid_char=""> + <valid initial="string.letters,string.digits"><add value="_" /> </valid> + </sanitizer> + </param> + <param name="sampN" type="integer" size="20" value="500000" label="Sample this number of reads. Set to 0 or less to use all" + help="Time/precision trade off - fewer reads takes a little less time trading off precision of the estimates."/> + <conditional name="singlePaired"> + <param name="sPaired" type="select" label="Single ended or mate-pair ended reads in this library?"> + <option value="single" selected="true">Single-end</option> + <option value="paired">Paired-end</option> + </param> + <when value="single"> + <param format="fastqsanger,fastq" name="input1" type="data" label="RNA-Seq FASTQ file" help="Nucleotide-space: Must have Sanger-scaled quality values with ASCII offset 33"/> + </when> + <when value="paired"> + <param format="fastqsanger,fastq" name="input1" type="data" label="RNA-Seq FASTQ file, forward reads" help="Nucleotide-space: Must have Sanger-scaled quality values with ASCII offset 33" /> + <param format="fastqsanger,fastq" name="input2" type="data" label="RNA-Seq FASTQ file, reverse reads" help="Nucleotide-space: Must have Sanger-scaled quality values with ASCII offset 33" /> + </when> + </conditional> + + <!-- Genome source. --> + <repeat name="refGenomes" title="Installed organism reference sequences to check for alignment to your fastq" min="1" + help="For checking cell culture sequence for contamination, Mycoplasma Genitalium might be a good choice eg"> + <param name="ref" type="select" label="Bowtie2 reference genome"> + <options from_data_table="bowtie2_indexes"> + <filter type="sort_by" column="3"/> + <validator type="no_options" message="No indexes are available for bowtie2"/> + </options> + </param> + </repeat> + </inputs> + + <outputs> + <data format="tabular" name="outtext" label="${jobName}.xls"/> + <data format="png" name="outpng" label="${jobName}.png"/> + </outputs> + <configfiles> + <configfile name="fastqrunconf"> +###### autogenerated by fastq_screen.xml for fastq_screen run +BOWTIE2 /data/app/bin/bowtie2 +#for $refs in $refGenomes: +DATABASE $refs.ref.fields.value $refs.ref.fields.path BOWTIE2 +#end for + </configfile> + </configfiles> + +<help> + +**What it does** +This is a Galaxy wrapper exposing software from Babraham -fastq_screen_ +Designed to search sequence data in fastq files for matches to contaminants or to check the likely +species. +In QC checking, you can use it to look for (eg) sequence from contaminating mycoplasmae in cell cultures - it may be non-differential but it will be pro-inflammatory and, well, less than ideal. + +Here's the help from the perl script used by this wrapper: + +Fastq Screen - Screen sequences against a panel of databases + +Synopsis + + fastq_screen [OPTION]... [FastQ FILE]... + +Function + + Fastq Screen is intended to be used as part of a QC pipeline. + It allows you to take a sequence dataset and search it + against a set of bowtie databases. It will then generate + both a text and a graphical summary of the results to see if + the sequence dataset contains the kind of sequences you expect + or not. + +Options + + --help -h Print program help and exit + + --subset Don't use the whole sequence file to search, but + create a temporary dataset of this size. The + dataset created will be of approximately (within + a factor of 2) of this size. If the real dataset + is smaller than twice the specified size then the + whole dataset will be used. Subsets will be taken + evenly from throughout the whole original dataset + + --paired Files are paired end. Files must be specified in + the correct order with pairs of files coming + immediately after one another. Results files will + be named after the first file in the pair if the + names differ between the two files. + + --outdir Specify a directory in which to save output files. + If no directory is specified then output files + are saved into the same directory as the input + file. + + --illumina1_3 Assume that the quality values are in encoded in + Illumina v1.3 format. Defaults to Sanger format + if this flag is not specified + + --quiet Supress all progress reports on stderr and only + report errors + + --version Print the program version and exit + + --threads Specify across how many threads bowtie will be + allowed to run. Overrides the default value set + in the conf file + + --conf Manually specify a location for the configuration + file to be used for this run. If not specified + then the file will be taken from the same directory + as the fastq_screen program + + --color FastQ files are in colorspace. This requires that + the libraries configures in the config file are + colorspace indices. + + --bowtie Specify extra parameters to be passed to bowtie. + These parameters should be quoted to clearly + delimit bowtie parameters from fastq_screen + parameters. You should not try to use this option + to override the normal search or reporting options + for bowtie which are set automatically but it might + be useful to allow reads to be trimmed before + alignment etc. + + --bowtie2 Specify extra parameters to be passed to bowtie 2. + These parameters should be quoted to clearly + delimit bowtie2 parameters from fastq_screen + parameters. You should not try to use this option + to override the normal search or reporting options + for bowtie which are set automatically but it might + be useful to allow reads to be trimmed before + alignment etc. + + --nohits Writes to a file the sequences that did not map to + any of the specified genome libraries. If the + subset option is also specified, only reads from + the temporary dataset that failed to align to the + reference genomes will be written to the output file. + + --aligner Specify the aligner to use for the mapping. Valid + arguments are 'bowtie' or 'bowtie2'. + + +**Attributions** + +Note that each component has its own license. +Good luck with figuring out your obligations. + +fastq_screen - see the web site at Fastq_screen_ + +Galaxy_ (that's what you are using right now!) for gluing everything together + + +Code and documentation comprising this tool was written by Ross Lazarus and that part is Licensed_ the same way as other rgenetics artefacts + +.. _Fastq_screen: http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen + +.. _Galaxy: http://getgalaxy.org + +.. _Licensed: https://www.gnu.org/licenses/lgpl.html + +</help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Fri May 16 07:57:33 2014 -0400 @@ -0,0 +1,8 @@ +<!-- Use the file tool_data_table_conf.xml.oldlocstyle if you don't want to update your loc files as changed in revision 4550:535d276c92bc--> +<tables> + <!-- Locations of indexes in the Bowtie2 mapper format --> + <table name="bowtie2_indexes" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/bowtie2_indices.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Fri May 16 07:57:33 2014 -0400 @@ -0,0 +1,31 @@ +<?xml version="1.0"?> +<tool_dependency> + <package name="bowtie2" version="2.1.0"> + <repository changeset_revision="606d435a57a4" name="package_bowtie2_2_1_0" owner="devteam" toolshed="http://testtoolshed.g2.bx.psu.edu" /> + </package> + <package name="fastq_screen" version="0.4.2"> + <install version="1.0"> + <actions> + <action type="download_by_url">http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/fastq_screen_v0.4.2.tar.gz</action> + <action type="move_directory_files"> + <source_directory>.</source_directory> + <destination_directory>$INSTALL_DIR</destination_directory> + </action> + <action type="set_environment"> + <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR</environment_variable> + </action> + </actions> + </install> + <readme> +Galaxy wrapper for http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/ from Simon Andrews. +Useful if you want to (eg) check your RNA sequence from a cell culture for contamination from (eg) Mycoplasma Genitalium, or if you have sequence from an unknown organism :( +Permanently set to use your Galaxy bowtie2 indices - expects Bowtie2 to be installed so the Galaxy package_bowtie2 is installed if it's not already there. + +###WARNING### The fastq_screen perl script requires that the GD::Graph http://search.cpan.org/~bwarfield/GDGraph/Graph.pm module be installed. +Otherwise, NO plot will be created. This wrapper relies on the system perl - a proper Galaxy perl dependency is just too icky to think about. + +Send code if you think otherwise. + </readme> + </package> + +</tool_dependency>