diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/ArrayDesign.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,434 @@
+#
+# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
+#
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
+
+=head1 SYNOPSIS
+
+  my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign";
+  push @INC, $parser_type;
+  my $imp = $class->SUPER::new(@_);  my $imp = Bio::EnsEMBL::Funcgen::Importer->new(%params);
+
+  $imp->set_config();
+
+
+=head1 DESCRIPTION
+
+This is a definitions class which should not be instatiated directly, it 
+normally inherited from the Importer.  ArrayDesign contains meta data and methods 
+specific to handling array designs only (i.e. no experimental data), which have 
+been produced from the eFG array design software.
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign;
+
+use Bio::EnsEMBL::Funcgen::Array;
+use Bio::EnsEMBL::Funcgen::ProbeSet;
+use Bio::EnsEMBL::Funcgen::Probe;
+use Bio::EnsEMBL::Funcgen::ProbeFeature;
+use Bio::EnsEMBL::Funcgen::FeatureType;
+use Bio::EnsEMBL::Funcgen::ExperimentalChip;
+use Bio::EnsEMBL::Funcgen::ArrayChip;
+use Bio::EnsEMBL::Funcgen::Channel;
+use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
+use Bio::EnsEMBL::Funcgen::Utils::Helper;
+use strict;
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
+
+
+
+=head2 new
+
+  Example    : my $self = $class->SUPER::new(@_);
+  Description: Constructor method for ArrayDesign class
+  Returntype : Bio::EnsEMBL::Funcgen::Parsers::ArrayDesign
+  Exceptions : throws if Experiment name not defined or if caller is not Importer
+  Caller     : Bio::EnsEMBL::Funcgen::Importer
+  Status     : at risk
+
+=cut
+
+
+sub new{
+  my $caller = shift;
+
+  my $class = ref($caller) || $caller;
+  my $self = $class->SUPER::new();
+
+  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
+
+
+  $self->{'config'} =  
+    {(     
+      probe_data   => ["probe"],
+      prb_fields   => ['SEQ_ID',  'POSITION', 'LENGTH', 'PROBE_SEQUENCE', 'PROBE_ID', 'UNIQUENESS_SCORE', 'TM', 'MAS_CYCLES'],
+      notes_fields => ['DESIGN_ID', 'DESIGN_NAME', 'DESCRIPTION'],
+     )};
+
+  return $self;
+}
+     
+
+
+=head2 set_config
+
+  Example    : my $self->set_config;
+  Description: Sets attribute dependent config
+  Returntype : None
+  Exceptions : None
+  Caller     : Bio::EnsEMBL::Funcgen::Importer
+  Status     : at risk
+
+=cut
+
+sub set_config{
+  my ($self) = @_;
+  
+  #placeholder method
+  #set paths
+
+  return;
+}
+
+
+=head2 read_array_data
+
+  Example    : $imp->read_array_data();
+  Description: Parses NimbleGen style DesignNotes.txt format files to create and store new Arrays
+  Returntype : none
+  Exceptions : None
+  Caller     : general
+  Status     : At risk - Can this be generic? Can we force the creation of a DesignNotes file on other formats?
+
+=cut
+
+#this is currently OLIGO specific.
+
+sub read_array_data{
+  my ($self, $design_notes) = @_;
+
+  $self->log("Reading and importing array data");
+
+  throw('You need to pass the path to a DesignNotes.txt file') if ! defined $design_notes;
+  $self->{'design_notes'} = $design_notes;
+
+  my ($line, $array, $array_chip, @data, %hpos);
+  my $oa_adaptor = $self->db->get_ArrayAdaptor();
+  my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
+  my $fh = open_file("<", $self->{'design_notes'});
+  
+  while ($line = <$fh>){
+    
+    $line =~ s/\r*\n//;#chump
+    @data =  split/\t/o, $line;
+    
+    
+
+
+    #We need to have a DESIGN vendor type?
+    #also need to be able to set file path independently of config
+
+    if($. == 1){
+      %hpos = %{$self->set_header_hash(\@data, $self->get_config('notes_fields'))};
+      next;
+    }
+
+    ### CREATE AND STORE Array and ArrayChips  
+    if(! defined $array ){
+      #This is treating each array chip as a separate array, unless arrayset is defined
+      #AT present we have no way of differentiating between different array_chips on same array???!!!
+      #Need to add functionality afterwards to collate array_chips into single array
+           
+      #This will use a stored array if present
+
+      $array = Bio::EnsEMBL::Funcgen::Array->new
+	(
+	 -NAME        => $self->array_name() || $data[$hpos{'DESIGN_NAME'}],
+	 -FORMAT      => uc($self->format()),
+	 -VENDOR      => uc($self->vendor()),
+	 -TYPE        => 'OLIGO',
+	 -DESCRIPTION => $data[$hpos{'DESCRIPTION'}],#need to trim the array chip specific description here
+	);
+
+      ($array) = @{$oa_adaptor->store($array)};
+
+
+      $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
+							  -ARRAY_ID  => $array->dbID(),
+							  -NAME      => $data[$hpos{'DESIGN_NAME'}],
+							  -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
+							  #add description?
+							 );
+
+      #This will use a stored array_chip if present
+      ($array_chip) = @{$ac_adaptor->store($array_chip)};
+      $array->add_ArrayChip($array_chip);
+        
+    }
+    elsif((! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])) && ($self->array_set())){
+      
+      $self->log("Generating new ArrayChip(".$data[$hpos{'DESIGN_NAME'}].". for same Array ".$array->name()."\n");
+      
+      $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
+							  -ARRAY_ID  => $array->dbID(),
+							  -NAME        => $data[$hpos{'DESIGN_NAME'}],
+							  -DESIGN_ID => $data[$hpos{'DESIGN_ID'}],
+							 );
+      
+      ($array_chip) = @{$ac_adaptor->store($array_chip)};
+      $array->add_ArrayChip($array_chip);
+      
+    }
+    elsif(! $array->get_ArrayChip_by_design_id($data[$hpos{'DESIGN_ID'}])){
+      throw("Found experiment with more than one design without -array_set");
+    }
+  }
+  
+
+  $self->add_Array($array);
+
+  close($fh);
+  
+  return;
+
+}
+
+
+
+
+
+=head2 read_probe_data
+
+  Example    : $imp->read_probe_data();
+  Description: Parses and imports probes, probe sets and features for a given array design
+  Returntype : none
+  Exceptions : throws is not tiling format
+  Caller     : Importer
+  Status     : at risk
+
+=cut
+
+
+#Assumes one chip_design per experimental set.
+sub read_probe_data{
+  my ($self, $array_file) = @_;
+  
+  $self->log("Reading and importing probe data");
+
+
+  my ($fh, $line, @data, @log, %hpos, %probe_pos);#, %duplicate_probes);
+  my $aa = $self->db->get_AnalysisAdaptor();
+  my $manal = $aa->fetch_by_logic_name('MASCycles');
+  my $uanal = $aa->fetch_by_logic_name('UScore');
+  my $tmanal= $aa->fetch_by_logic_name('NimblegenTM');
+
+  $array_file ||= $self->array_file();
+  $self->log("Parsing ".$self->vendor()." probe data (".localtime().")");
+  throw("ArrayDesign only accomodates a tiling design with no feature/probesets") if ($self->format() ne 'TILED');
+
+  ### Read in
+  # eFG prb file, not chiip info yet so only one ArrayChip per design
+  # potential to have pos file here for probes built on generic slices of genome
+
+  #We need to handle different coord systems and possibly different assmemblies
+  my $slice_a = $self->db->get_SliceAdaptor();
+  my $cs = $self->db->get_FGCoordSystemAdaptor()->fetch_by_name_schema_build_version(
+										     'chromosome', 
+										     $self->db->_get_schema_build($self->db->dnadb())
+										    );
+
+
+  #sanity check we're only dealing with one array/chip
+  my @arrays = @{$self->arrays()};
+  if(scalar(@arrays) != 1){
+    throw("Array DESIGN imports only accomodate one Array per import, please check ".$self->{'design_notes'});
+  }
+
+  my @achips = @{$arrays[0]->get_ArrayChips()};
+  if(scalar(@achips) != 1){
+    throw("Array DESIGN imports only accomodates one ArrayChip per import, please check ".$self->{'design_notes'});
+  }
+
+  my $achip = $achips[0];
+
+  #foreach my $array(@{$self->arrays()}){
+
+    
+  #  foreach my $achip(@{$array->get_ArrayChips()}){
+
+  $self->log("Importing array design(".$achip->name().") from ".$array_file);
+
+
+  if($achip->has_status('IMPORTED')){
+    $self->log("Skipping fully imported ArrayChip:\t".$achip->design_id());
+    return;
+  }elsif($self->recovery()){
+    $self->log("Rolling back partially imported ArrayChip:\t".$achip->design_id());
+    $self->db->rollback_ArrayChip([$achip]);
+  }
+      
+  $self->log("Importing ArrayChip:".$achip->design_id());
+            
+  
+  #OPEN PROBE IN/OUT FILES
+  $fh = open_file("<", $array_file);
+  my $f_out = open_file(">", $self->get_dir("output")."/probe.".$achip->name()."fasta")	if($self->{'_dump_fasta'});
+  my ($op, $of, %pfs);
+
+  #should define mapping_method arg to allows this to be set to LiftOver/EnsemblMap
+  my $anal = $self->db->get_AnalysisAdaptor()->fetch_by_logic_name("TileMap");##???
+  my $strand = 0;	#default for TileMap, should be config hash?
+  my $fasta = "";
+       
+  while($line = <$fh>){
+    $line =~ s/\r*\n//;
+    @data =  split/\t/o, $line;
+    my $loc = "";
+      
+    #SEQ_ID  WINDOW_START    WINDOW_END      POSITION        LENGTH  PROBE_SEQUENCE  TM      UNIQUENESS_SCORE        MAS_CYCLES
+    #X       3000001 3000100 3000041 56      TGACATCTTCAGTTCTTTACATAGTTTTCATATTAGTCCTCTATCAGATGTGGAGT        73.09   132     15
+
+    
+    if ($. == 1){	
+      %hpos = %{$self->set_header_hash(\@data, $self->get_config('prb_fields'))};
+      next;
+    }
+		  
+	
+    #This assumes tiling format with no feature/probe sets
+
+		  
+    if(%pfs){
+      $self->store_set_probes_features($achip->dbID(), \%pfs);
+      undef %pfs;
+    }
+	  
+				
+    #PROBE
+    $op = Bio::EnsEMBL::Funcgen::Probe->new(
+					    -NAME          => $data[$hpos{'PROBE_ID'}],
+					    -LENGTH        => $data[$hpos{'LENGTH'}],
+					    -ARRAY         => $arrays[0],
+					    -ARRAY_CHIP_ID => $achip->dbID(),
+					    -CLASS         => 'DESIGN',
+					   );
+
+    $op->add_Analysis_score($manal, $data[$hpos{'MAS_CYCLES'}]);
+    $op->add_Analysis_score($tmanal, $data[$hpos{'TM'}]);
+    $op->add_Analysis_CoordSystem_score($uanal, $cs, $data[$hpos{'UNIQUENESS_SCORE'}]);	
+    
+    #would need to pass cs to store USCORE, CYCLES and TM are seq/anal dependent not cs
+    #options:
+    #associate cs dependent scores with features
+    #we are duplicating cs in probe_design table
+    
+    #do we need another object? ProbeDesign
+    #-cs would be empty for all but uscore
+    #-mas_cycles analysis_id
+    #-uscore analysis_id cs_id
+    #-tm analysis_id  (anal most likely wont change so highly redundant)
+    #this would produce 3 records for each probe, with cs being empty for two and anal being redundant for the other
+    #would however provide for extensible design attributes
+    #would only need one tm and mas_cycles for each probe irrespective of cs
+    #could have separate table probe_design_feature?
+    #mmm doesn't have location, just cs
+    #just have empty cs fields, calls by cs would have to be in ('cs_id', 'NULL')
+    #or just make uscore method dependent on cs.
+    #or split table?
+    #ProbeDesign::add_analysis_score
+    #ProbeDesign::add_coord_sys_analysis_score
+    
+    #can we add this directly to the probe?
+    #separate retrieval in ProbeAdaptor so we're not joining everytime
+    #this would require generic get_analysis/analysis_coord_system_attribute method
+    
+
+
+    %{$pfs{$data[$hpos{'PROBE_ID'}]}} = (
+					 probe => $op,
+					 features => [],
+					);
+	
+				
+    #PROBE FEATURE
+    if(!  $self->cache_slice($data[$hpos{'SEQ_ID'}])){
+      warn("Skipping non-standard probe chromosome");
+      undef %pfs;
+      next;
+    }
+
+    my $end = ($data[$hpos{'POSITION'}] + $data[$hpos{'LENGTH'}]);
+
+    if ($self->{'_dump_fasta'}){
+      $loc .= $data[$hpos{'SEQ_ID'}].":".$data[$hpos{'POSITION'}]."-${end};";
+    }	
+				
+	
+    $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new
+      (
+       -START         => $data[$hpos{'POSITION'}],
+       -END           => $end,
+       -STRAND        => $strand,
+       -SLICE         => $self->cache_slice($data[$hpos{'SEQ_ID'}]),
+       -ANALYSIS      => $anal,
+       -MISMATCHCOUNT => 0,
+       -CIGAR_LINE    => $data[$hpos{'LENGTH'}].'M',
+       -PROBE         => undef,#Need to update this in the store method
+      );
+				
+	
+    push @{$pfs{$data[$hpos{'PROBE_ID'}]}{'features'}}, $of;
+	
+    if($self->{'_dump_fasta'}){			
+      #filter controls/randoms?  Or would it be sensible to see where they map
+      #wrap seq here?
+      $fasta .= ">".$data[$hpos{'PROBE_ID'}]."\t".$data[$hpos{'CHROMOSOME'}].
+	"\t$loc\n".$data[$hpos{'PROBE_SEQUENCE'}]."\n";
+    }
+  }
+      
+  #need to store last data here
+  $self->store_set_probes_features($achip->dbID(), \%pfs);
+  $self->log(join("\n", @log));
+  $achip->adaptor->set_status("IMPORTED", $achip);
+  $self->log("ArrayChip:\t".$achip->design_id()." has been IMPORTED");
+  
+  if ($self->{'_dump_fasta'}){
+    print $f_out $fasta if($self->{'_dump_fasta'});
+    close($f_out);
+  }
+   
+  $self->log("Finished parsing probe data");
+  #Total probe_sets:\t$psid\n".
+  #	       "Total probes:\t$pid\nTotal probe_features:\t$fid");
+  
+  return;
+}
+
+
+1;