Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm @ 3:d30fa12e4cc5 default tip
Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author | devteam <devteam@galaxyproject.org> |
---|---|
date | Mon, 13 Jan 2014 10:38:30 -0500 |
parents | 1f6dce3d34e0 |
children |
line wrap: on
line source
=pod =head1 NAME Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; =head1 DESCRIPTION 'SWEmbl' Is a base class for other classes dealing with SWEmbl It contains virtually nothing so it may disappear and just pass to Funcgen =cut package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; use warnings; use strict; use Bio::EnsEMBL::Funcgen::Utils::Helper; use Bio::EnsEMBL::DBSQL::DBAdaptor; use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; use Bio::EnsEMBL::Funcgen::InputSet; use Bio::EnsEMBL::Funcgen::DataSet; use Bio::EnsEMBL::Funcgen::FeatureSet; use Bio::EnsEMBL::Funcgen::AnnotatedFeature; use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen'); use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); use Data::Dumper; sub fetch_input { my $self = shift @_; $self->SUPER::fetch_input(); my $efgdba = $self->_efgdba; my $aa = $efgdba->get_AnalysisAdaptor; my $analysis = $self->param('analysis') || throw "Need to specify the analysis"; my $analysis_obj = $aa->fetch_by_logic_name($analysis); #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type... if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; } $self->_analysis($analysis_obj); $self->_feature_set_name($self->_set_name()."_".$analysis); my $cell_type = $self->_cell_type()->name; my $feature_type = $self->_feature_type()->name; my $file_type = $self->_file_type(); my $experiment_name = $self->_experiment_name(); #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed? #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id #my $input_file = $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz"; my $input_file = $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz"; $self->_input_file($input_file); my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name; my $input_dir = $self->param('input_dir') || $work_dir; $self->_input_dir($input_dir); my $skip_control = $self->_skip_control($self->param('skip_control')); if(!$skip_control){ #May also be passed as input_id, but then input_id may need to be TEXT #Control must be in same dir as input file... maybe change this... if(!$self->param('control_file')){ #For the moment the control file has to be the same file type as the input file, but does not need to be the case... #TODO Need to validate here if that's the case... my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'"; my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz"; $self->_control_file($control_file); } } return 1; } # Private function only to be called by subclasses of this class # prepares data to be used with SWEMBL... # sorts the input, removes mythochondria and unaligned reads... sub _preprocess_file{ #Consider using hash to process input my ($self, $input, $output, $file_type) = (shift, shift, shift, shift); #For the moment we always overwrite any existent file... #Maybe reuse previously cached files? How to check if they are corrupted? #Maybe create a -reuse flag? #running piped system commands is a potential source of untraceable errors!! #TODO try changing this... e.g. with Bio::DB::Sam ... ? my $command; if($file_type eq 'bed'){ $command = "gzip -dc ${input}"; #Remove mitochondria before SWEMBL warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria"; warn "Duplicates are not removed in Bed files while they are in sam files"; #This probably won't work for many BED files... e.g chrM or sam-like "beds"... $command = " | grep -v '^MT' "; $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output"; } elsif($file_type eq 'sam'){ #Manual Alternative to samtools #$command = "gzip -dc ".$self->param($parameter); #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file; #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps... #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now $command = "gzip -dc $input | "; #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches #TODO do this in a better, more generic way $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | "; $command .= "grep -v '^MT' | grep -v '^chrM' | "; #Remove unmapped reads... $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | "; # This piped sort is not working!! (this problem has been reported in the samtools mailing list) #TODO Check why this pipe in the sort is not working... #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ; #$command .= "samtools view -h - | gzip -c > $work_file"; # Create temp file for now and remove it when we figure out what's wrong with the piped sort! $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ; #Add a remove duplicates step (this is not supported with BED files for the moment) $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | "; $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output"; #Alternative with no rmdup... #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output"; $command .= " ; rm -f ${input}_tmp.bam"; } else{ throw("$file_type file format not supported"); } system($command) && throw("Failed processing $input with command $command"); return 1; } # Private function only to be called by subclasses of this class # gets the number of reads in a sam or bed file #sub _get_number_of_reads { # my ($self, $file, $file_type) = (shift, shift, shift); # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } # my $nbr_reads = 0; # #If needed, add an option to check if is zipped or not... # my $open_cmd = "gzip -dc $file |"; # open(FILE,$open_cmd); # while(<FILE>){ # if($file_type eq "sam"){ # next if /^\@SQ/; # }else { # next if /track name=/o; # } # $nbr_reads++; # } # close FILE; # return $nbr_reads; #} # Private function only to be called by subclasses of this class # gets the number of reads in a sam or bed file #sub _get_slices { # #NOT DONE!! # my ($self, $file, $file_type) = (shift, shift, shift); # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } # my $nbr_reads = 0; # #If needed, add an option to check if is zipped or not... # my $open_cmd = "gzip -dc $file |"; # open(FILE,$open_cmd); # while(<FILE>){ # if($file_type eq "sam"){ # next if /^@SQ/; # }else { # next if /track name=/o; # } # $nbr_reads++; # } # close FILE; # return $nbr_reads; #} #Private getter / setter to the Feature Set Name sub _feature_set_name { return $_[0]->_getter_setter('feature_set_name',$_[1]); } #Private getter / setter to the input folder sub _input_dir { return $_[0]->_getter_setter('input_dir',$_[1]); } #Private getter / setter to the Input subset (usually a file name) sub _input_file { return $_[0]->_getter_setter('input_file',$_[1]); } #Private getter / setter to the skip control option sub _skip_control { return $_[0]->_getter_setter('skip_control',$_[1]); } #Private getter / setter to the control file sub _control_file { return $_[0]->_getter_setter('control_file',$_[1]); } 1;