diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,219 @@
+=pod 
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
+
+=head1 DESCRIPTION
+
+'SWEmbl' Is a base class for other classes dealing with SWEmbl
+It contains virtually nothing so it may disappear and just pass to Funcgen
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
+
+use warnings;
+use strict;
+use Bio::EnsEMBL::Funcgen::Utils::Helper;
+use Bio::EnsEMBL::DBSQL::DBAdaptor;
+use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; 
+use Bio::EnsEMBL::Funcgen::InputSet;
+use Bio::EnsEMBL::Funcgen::DataSet;
+use Bio::EnsEMBL::Funcgen::FeatureSet;
+use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
+
+use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen');
+
+use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
+use Data::Dumper;
+
+sub fetch_input {   
+  my $self = shift @_;
+
+  $self->SUPER::fetch_input();
+
+  my $efgdba = $self->_efgdba;
+
+  my $aa = $efgdba->get_AnalysisAdaptor;
+  my $analysis = $self->param('analysis') || throw "Need to specify the analysis";
+  my $analysis_obj = $aa->fetch_by_logic_name($analysis);
+  #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type...
+  if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; }
+  $self->_analysis($analysis_obj);
+
+  $self->_feature_set_name($self->_set_name()."_".$analysis);
+
+  my $cell_type = $self->_cell_type()->name;
+  my $feature_type = $self->_feature_type()->name;  
+  my $file_type = $self->_file_type();
+  my $experiment_name = $self->_experiment_name();
+
+  #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed?
+  #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id
+  #my $input_file =  $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz";
+  my $input_file =  $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz";
+  $self->_input_file($input_file);
+  
+  my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name;
+  my $input_dir = $self->param('input_dir') || $work_dir;  
+  $self->_input_dir($input_dir);
+  
+  my $skip_control = $self->_skip_control($self->param('skip_control'));  
+  if(!$skip_control){
+    #May also be passed as input_id, but then input_id may need to be TEXT
+    #Control must be in same dir as input file... maybe change this...
+    if(!$self->param('control_file')){
+      #For the moment the control file has to be the same file type as the input file, but does not need to be the case...
+      #TODO Need to validate here if that's the case...
+      my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'";
+      my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz";
+      $self->_control_file($control_file);
+    }
+  }
+
+  return 1;
+}
+
+
+# Private function only to be called by subclasses of this class
+# prepares data to be used with SWEMBL...
+# sorts the input, removes mythochondria and unaligned reads...
+sub _preprocess_file{
+  
+  #Consider using hash to process input
+  my ($self, $input, $output, $file_type) = (shift, shift, shift, shift);
+
+  #For the moment we always overwrite any existent file...
+  #Maybe reuse previously cached files? How to check if they are corrupted?
+  #Maybe create a -reuse flag?
+  
+  #running piped system commands is a potential source of untraceable errors!!
+  #TODO try changing this... e.g. with Bio::DB::Sam ... ?
+  my $command;
+  if($file_type eq 'bed'){
+    $command = "gzip -dc ${input}";
+
+    #Remove mitochondria before SWEMBL 
+    warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria";
+    warn "Duplicates are not removed in Bed files while they are in sam files";
+    #This probably won't work for many BED files... e.g chrM or sam-like "beds"...
+    $command = " | grep -v '^MT' ";
+    $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output";
+
+  } elsif($file_type eq 'sam'){
+
+    #Manual Alternative to samtools
+    #$command = "gzip -dc ".$self->param($parameter);
+    #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file;
+
+    #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps...
+    #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now
+    $command = "gzip -dc $input | ";
+
+    #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches
+    #TODO do this in a better, more generic way
+    $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | ";
+    $command .= "grep -v '^MT' | grep -v '^chrM' | ";
+
+    #Remove unmapped reads... 
+    $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | ";
+    
+    # This piped sort is not working!! (this problem has been reported in the samtools mailing list)
+    #TODO Check why this pipe in the sort is not working...
+    #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ;
+    #$command .= "samtools view -h - | gzip -c > $work_file";
+
+    # Create temp file for now and remove it when we figure out what's wrong with the piped sort!
+    $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ;
+
+    #Add a remove duplicates step (this is not supported with BED files for the moment)
+    $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | ";
+    $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output";
+
+    #Alternative with no rmdup...
+    #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output";
+
+    $command .= " ; rm -f ${input}_tmp.bam";
+
+  }
+  else{
+    throw("$file_type file format not supported");
+  }
+
+  system($command) && throw("Failed processing $input with command $command");
+
+  return 1;
+}
+
+# Private function only to be called by subclasses of this class
+# gets the number of reads in a sam or bed file
+#sub _get_number_of_reads {
+#   my ($self, $file, $file_type) = (shift, shift, shift);
+#   if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
+#   my $nbr_reads = 0;
+#   #If needed, add an option to check if is zipped or not...
+#   my $open_cmd = "gzip -dc $file |";
+#   open(FILE,$open_cmd);
+#   while(<FILE>){
+#     if($file_type eq "sam"){
+#       next if /^\@SQ/;
+#     }else {
+#       next if /track name=/o;
+#     }
+#     $nbr_reads++;
+#   }
+#   close FILE;
+#   return $nbr_reads;
+#}
+
+# Private function only to be called by subclasses of this class
+# gets the number of reads in a sam or bed file
+#sub _get_slices {
+#  #NOT DONE!!
+#   my ($self, $file, $file_type) = (shift, shift, shift);
+#   if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
+#   my $nbr_reads = 0;
+#   #If needed, add an option to check if is zipped or not...
+#   my $open_cmd = "gzip -dc $file |";
+#   open(FILE,$open_cmd);
+#   while(<FILE>){
+#     if($file_type eq "sam"){
+#       next if /^@SQ/;
+#     }else {
+#       next if /track name=/o;
+#     }
+#     $nbr_reads++;
+#   }
+#   close FILE;
+#   return $nbr_reads;
+#}
+
+
+#Private getter / setter to the Feature Set Name
+sub _feature_set_name {
+  return $_[0]->_getter_setter('feature_set_name',$_[1]);
+}
+
+#Private getter / setter to the input folder
+sub _input_dir {
+  return $_[0]->_getter_setter('input_dir',$_[1]);
+}
+
+#Private getter / setter to the Input subset (usually a file name)
+sub _input_file {
+  return $_[0]->_getter_setter('input_file',$_[1]);
+}
+
+#Private getter / setter to the skip control option
+sub _skip_control {
+  return $_[0]->_getter_setter('skip_control',$_[1]);
+}
+
+#Private getter / setter to the control file
+sub _control_file {
+  return $_[0]->_getter_setter('control_file',$_[1]);
+}
+
+1;
+