Mercurial > repos > willmclaren > ensembl_vep

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSet.pm @ 0:21066c0abaf5 draft
Uploaded
author: willmclaren
date: Fri, 03 Aug 2012 10:04:48 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/InputSet.pm	Fri Aug 03 10:04:48 2012 -0400
@@ -0,0 +1,384 @@
+#
+# Ensembl module for Bio::EnsEMBL::Funcgen::InputSet
+#
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::InputSet - A module to represent InputSet object.
+ 
+
+=head1 SYNOPSIS
+
+use Bio::EnsEMBL::Funcgen::InputSet;
+
+#Create an InputSet
+
+my $inp_set = Bio::EnsEMBL::Funcgen::InputSet->new
+                 (
+	              -DBID            => $dbID,
+                  -ADAPTOR         => $self,
+                  -EXPERIMENT      => $exp,
+                  -FEATURE_TYPE => $ftype,
+                  -CELL_TYPE    => $ctype,
+                  -FORMAT       => 'READ_FORMAT',
+                  -VENDOR       => 'SOLEXA',
+                  -NAME         => 'ExpSet1',
+                  -REPLICATE    => 1,
+                 );
+
+# Add some InputSubsets
+
+$inp_set->add_new_subsets($subset_name, $
+
+
+
+
+=head1 DESCRIPTION
+
+An InputSet object provides a generic container for any non-array based feature import, 
+allowing tracking of file import via the status table and integration into Data and FeatureSets to
+provide traceability to the source experiment from a given FeatureSet.
+
+=cut
+
+use strict;
+use warnings;
+
+package Bio::EnsEMBL::Funcgen::InputSet;
+
+use Bio::EnsEMBL::Funcgen::InputSubset;
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate);
+use Bio::EnsEMBL::Funcgen::Set;
+use Bio::EnsEMBL::Analysis;
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Set);
+
+
+=head2 new
+
+
+
+  Example    : my $eset = Bio::EnsEMBL::Funcgen::InputSet->new(
+                                                                     -EXPERIMENT    => $exp,
+                                                                     -FEATURE_TYPE  => $ftype,
+                                                                     -CELL_TYPE     => $ctype,
+                                                                     -FORMAT        => 'READ_FORMAT',
+                                                                     -VENDOR        => 'SOLEXA',
+                                                                     -NAME          => 'ExpSet1',
+                                                                     -ANALYSIS      => $anal,
+                                                                     -FEATURE_CLASS => 'annotated',
+                                                                     );
+
+  Do we want to define subsets likes this or are we more likely to add them one by one?
+
+  Description: Constructor for InputSet objects.
+  Returntype : Bio::EnsEMBL::Funcgen::InputSet
+  Exceptions : Throws if no Experiment defined
+               Throws if CellType or FeatureType are not valid or stored
+  Caller     : General
+  Status     : At risk
+
+=cut
+
+sub new {
+  my $caller = shift;
+	
+  my $class = ref($caller) || $caller;
+	
+  #Add set_type here to overwrite default ref parsing in Set::set_type
+  #This need to stay like this until we patch the DB
+  my $self = $class->SUPER::new(@_);	
+ 
+  my ($exp, $format, $vendor, $rep)
+    = rearrange(['EXPERIMENT', 'FORMAT', 'VENDOR', 'REPLICATE'], @_);
+    
+  if (! (ref $exp && $exp->isa('Bio::EnsEMBL::Funcgen::Experiment') && $exp->dbID())){
+	throw('Must specify a valid stored Bio::EnsEMBL::Funcgen::Experiment');
+  }
+
+  
+  #These are set in Set, just validate here
+  throw ('Must provide a FeatureType') if(! defined $self->feature_type);
+  throw ('Must provide a CellType') if(! defined $self->cell_type);
+
+  my $type = $self->feature_class;
+
+  #Need to move these types to config
+
+  if(! ($type && grep /^${type}$/, ('annotated', 'result', 'segmentation'))){
+	throw("You must define a valid InputSet feature_class e.g. 'annotated' or 'result'");
+  }
+
+  if(($type eq 'result') &&
+	 ($format ne 'SEQUENCING')){
+	throw('InputSet does not yet support a result type InputSet which does not have the \'SEQUENCING\' format');
+	
+  }
+
+
+  #if(! defined $self->analysis){
+  ##default analysis hack for v47
+  ##Set directly to avoid dbID boolean check
+  #This is to support supporting_set cache in data_set?
+  $self->{'analysis'} = Bio::EnsEMBL::Analysis->new
+	(-logic_name => 'external',
+	 -id       => 0,#??someone needs to rewrite analysis
+	);
+  
+  #Change to direct setting for speed
+  $self->{format}     = $format;
+  $self->{vendor}     = $vendor;
+  $self->{replicate}  = $rep;
+  $self->{experiment} = $exp;
+  $self->{subsets}    = {};
+  
+  return $self;
+}
+
+
+=head2 add_new_subset
+
+  Arg [1]    : string - sub set name e.g. the file name (not path as we're restricted to 30 chars)
+  Arg [2]    : Bio::EnsEMBL::Funcgen::InputSubset - optional
+               If not defined will create a sparse InputSubset based on the name
+  Example    : $expset->add_new_subset($ss_name, $exp_subset);
+  Description: Adds input_subset
+  Returntype : none
+  Exceptions : Throws if set is already present
+               Throws if InputSubset is not valid or stored
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+#Do we still use the optional subset function?
+
+sub add_new_subset {
+  my ($self, $ss_name, $exp_sset) = @_;
+	
+  #Need to test $ss_name here
+  if(! ($ss_name && ref(\$ss_name) eq 'SCALAR')){#ref($exp_sset) would be 'REF'
+	throw('You must pass a InputSubset name');
+  }
+
+  if($self->get_subset_by_name($ss_name)){
+	throw("Subset $ss_name is already present in this InputSet, maybe you need to alter the filename?");
+  }
+
+  if(defined $exp_sset){
+
+	if(!(ref($exp_sset) && $exp_sset->isa('Bio::EnsEMBL::Funcgen::InputSubset') && $exp_sset->dbID())){
+	  throw('InputSubsets must be valid and stored');
+	}
+  }
+  else{
+	
+	$exp_sset = Bio::EnsEMBL::Funcgen::InputSubset->new(
+														-name => $ss_name,
+														-input_set => $self,
+													   );
+  }
+
+  $self->{subsets}{$ss_name} = $exp_sset;
+
+  return $self->{subsets}{$ss_name};
+}
+
+
+=head2 get_Experiment
+
+  Example    : my $exp = $exp_set->get_Experiment();
+  Description: Getter for the Experiment of this DataSet.
+  Returntype : Bio::EnsEMBL::Fuuncgen::Experiment
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub get_Experiment{  return $_[0]->{experiment}; }
+
+
+=head2 get_InputSubsets
+
+  Example    : my @subsets = @{$exp_set->get_InputSubsets()};
+  Description: Getter for the InputSubsets for this InputSet.
+  Returntype : Arrayref
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub get_InputSubsets{
+  my ($self)  = shift;
+
+  return [ values %{$self->{'subsets'}} ];
+}
+
+
+
+
+=head2 get_subset_by_name
+
+  Example    : my $subsets = $exp_set->get_subset_by_name('subset1');
+  Description: Getter for the subset of a given name for this InputSet.
+  Returntype : Bio::EnsEMBL::Funcgen::InputSubset
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub get_subset_by_name{
+  my ($self, $name) = @_;
+  return (exists $self->{'subsets'}{$name}) ? $self->{'subsets'}{$name} : undef;
+}
+
+
+=head2 get_subset_names
+
+  Example    : my @subset_names = @{$exp_set->get_subset_names()};
+  Description: Getter for the subset names for this InputSet.
+  Returntype : Arrayref
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub get_subset_names{
+  my ($self) = shift;
+  return [ keys %{$self->{'subsets'}} ];
+}
+
+
+
+
+=head2 vendor
+
+  Arg[1]     : String - vendor e.g. ILLUMINA
+  Example    : my $iset_vendor = $iset->vendor;
+  Description: Getter for the vendor attribute of this InputSet.
+  Returntype : String
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub vendor {  return $_[0]->{vendor}; }
+
+
+=head2 format
+
+  Arg[1]     : string - format i.e. product type/format
+  Example    : my $iset_format = $iset->format;
+  Description: Getter for the format attribute of this InputSet.
+  Returntype : String
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub format {  return $_[0]->{format}; }
+
+
+=head2 replicate
+
+  Arg[1]     : Integer - replicate 0 = merged or NA, >0 refers to individual replicate
+  Example    : if($iset->replicate){ #Do something replicate specific in here }
+  Description: Getter for the replicate attribute of this InputSet.
+  Returntype : Integer
+  Exceptions : None
+  Caller     : General
+  Status     : At Risk
+
+=cut
+
+sub replicate {  return $_[0]->{replicate}; }
+
+
+
+=head2 source_info
+
+  Example    : my $source_info = $input_set->source_info;
+  Description: Getter for the experiment source info i.e. [ $label, $url ]
+  Returntype : Listref
+  Exceptions : None
+  Caller     : General
+  Status     : At risk
+
+=cut
+
+#Currently handling redundant/absent InputSubset data
+
+sub source_info{
+  my $self = shift;
+
+  if(! defined $self->{source_info}){
+    #could have data_url as highest priority here
+    #but we need to ensure removal when adding archive ids
+    #so we link to the archive and not the old data url
+    
+    my $exp_group = $self->get_Experiment->experimental_group;
+    my %source_info; #Handles redundant InputSubsets
+    my ($proj_name, $proj_link, $source_label, $source_link);
+
+    if($exp_group->is_project){
+      $proj_name = $exp_group->name;
+      $proj_link = $exp_group->url;
+    }
+
+    foreach my $isset(@{$self->get_InputSubsets}){
+
+      if(defined $isset->archive_id ){
+        $source_label = $isset->archive_id;
+      
+        if(! exists $source_info{$source_label}){
+          $source_info{$source_label} = [$source_label, undef];
+          #source_link can is undef here as archive_id overrides display url
+          #undef links will automatically go to the SRA
+        }
+      }
+      elsif(defined $proj_name){
+        #$source_label = $self->experimental_group->name;
+        $source_link  = $isset->display_url || $proj_link;
+        
+        if(! exists $source_info{$source_link}){
+          $source_info{$source_link} = [$proj_name, $source_link];
+        }
+      }
+    }
+    
+    $self->{source_info} = [values %source_info];
+  }
+
+  return $self->{source_info};
+}
+
+
+
+1;
+
author	willmclaren
date	Fri, 03 Aug 2012 10:04:48 -0400
parents
children