diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Sanger.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Sanger.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,577 @@
+#
+# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Sanger
+#
+
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=head1 NAME
+
+Bio::EnsEMBL::Funcgen::Parsers::Sanger
+
+=head1 SYNOPSIS
+
+  my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Sanger";
+  push @INC, $parser_type;
+  my $imp = $class->SUPER::new(@_);
+
+
+=head1 DESCRIPTION
+
+This is a definitions class which should not be instatiated directly, it 
+normally inherited from the Importer.  Sanger contains meta data and methods 
+specific to Sanger PCR arrays to aid parsing and importing of experimental data.
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::Parsers::Sanger;
+
+use Bio::EnsEMBL::Funcgen::Array;
+use Bio::EnsEMBL::Funcgen::ProbeSet;
+use Bio::EnsEMBL::Funcgen::Probe;
+use Bio::EnsEMBL::Funcgen::ProbeFeature;
+use Bio::EnsEMBL::Funcgen::FeatureType;
+use Bio::EnsEMBL::Funcgen::ExperimentalChip;
+use Bio::EnsEMBL::Funcgen::ArrayChip;
+use Bio::EnsEMBL::Funcgen::Channel;
+use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
+use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file);
+use Bio::EnsEMBL::Funcgen::Utils::Helper;
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+use strict;
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
+
+=head2 new
+
+  Example    : my $self = $class->SUPER::new(@_);
+  Description: Constructor method for Sanger class
+  Returntype : Bio::EnsEMBL::Funcgen::Parsers::Sanger
+  Exceptions : throws if Experiment name not defined or if caller is not Importer
+  Caller     : Bio::EnsEMBL::Funcgen::Importer
+  Status     : at risk
+
+=cut
+
+
+sub new{
+  my $caller = shift;
+
+  my $class = ref($caller) || $caller;
+  my $self = $class->SUPER::new();
+
+  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
+	
+  $self->{'config'} =   {(
+						#order of these data arrays is important!
+						array_data   => [],	#["array_chip"],
+						probe_data   => ["array_probe"],
+						results_data => ["and_import_result"],
+						#import_methods  => [],
+						#data paths here?
+						norm_method => undef,
+						#is this disabling -input_dir override option?
+					   )};
+
+
+
+  return $self;
+} 
+
+=head2 set_config
+
+  Example    : $imp->set_config();
+  Description: Sets a attribute dependent variables
+  Returntype : none
+  Exceptions : None
+  Caller     : Importer
+  Status     : At risk 
+
+=cut
+
+sub set_config{
+  my ($self) = @_;
+
+  #placeholder method for setting any attr dependant vars e.g. file paths etc.
+
+
+  return;
+}
+
+
+sub read_array_probe_data{
+  my ($self, $array_file) = @_;
+
+  warn("Remove hard coding for Sanger array import, and accomodate adf format");
+
+
+  $array_file ||= $self->array_file();
+  my ($line, $fh, @list, $array_file_format, $cmd);
+  my ($op, $of, $imported, $fimported, $fanal);
+  my $oa_adaptor = $self->db->get_ArrayAdaptor();
+  my $op_adaptor = $self->db->get_ProbeAdaptor();
+  my $of_adaptor = $self->db->get_ProbeFeatureAdaptor();
+  my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
+  my $ac_adaptor = $self->db->get_ArrayChipAdaptor();
+  my $slice_adaptor = $self->db->get_SliceAdaptor();
+  my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR")->dbID();
+  #have LiftOver? Could then use liftover in  pipeline to redo mappings
+
+  #store now checks whether already stored and updates array chips accordingly
+  my $array = Bio::EnsEMBL::Funcgen::Array->new
+    (
+     -NAME        => $self->array_name(),
+     -FORMAT      => uc($self->format()),
+     -VENDOR      => uc($self->vendor()),
+     -TYPE        => 'PCR',
+     -DESCRIPTION => "Sanger ENCODE PCR array 3.1.1",
+    );
+
+  ($array) = @{$oa_adaptor->store($array)};
+         
+  #This is treating each array chip as a separate array, unless arrayset is defined
+  #AT present we have no way of differentiating between different array_chips on same array???!!!
+  #Need to add functionality afterwards to collate array_chips into single array
+  my $array_chip = Bio::EnsEMBL::Funcgen::ArrayChip->new(
+														 -NAME      => $array->name(),
+														 -DESIGN_ID => $array->name(),
+														 -ARRAY_ID  =>$array->dbID(),
+														);
+
+  ($array_chip) = @{$ac_adaptor->store($array_chip)};
+  $array->add_ArrayChip($array_chip);
+  $self->add_Array($array);
+
+
+  #we also need to test wether the array as been imported as well as the mappings
+  #THis needs to use coord_sys-id not schema_build!!  Duplcaite entries for different schema_builds 
+  #with same assembly
+
+  my $dnadb_cs = $self->db->dnadb->get_CoordSystemAdaptor->fetch_by_name('chromosome');
+  my $fg_cs = $self->db->get_FGCoordSystemAdaptor->validate_and_store_coord_system($dnadb_cs);
+
+
+  #This fails if we're pointing to an old DB during the release cycle.  Will be fine if we manage to cs mapping dynamically
+
+
+  if ($array_chip->has_status('IMPORTED')) {
+    $imported = 1;
+    $self->log("Skipping ArrayChip probe import (".$array_chip->name().") already fully imported");
+
+	#need to build cache here, from file first else from DB????
+	#This is required for feature only imports
+	#as we won't have the probe dbID available
+
+	if(! $self->get_probe_cache_by_Array($array)){
+	  $self->get_probe_cache_by_Array($array, 1);
+	}
+
+
+
+  } elsif ($self->recovery()) {
+    $self->log("Rolling back partially imported ArrayChip:\t".$array_chip->name());
+    $self->db->rollback_ArrayChip([$array_chip]);	#This should really remove all CS imports too?
+  }
+
+
+  #should never really have CS imports if not IMPORTED
+  #there is however the potential to trash a lot of data if we were to remove the CS importes by mistake
+  #do we need to check whether any other sets are using the data?
+  #we have to check for result using relevant cs_id and cc_id
+  #no removal of probes is the key thing here as nothing is dependent on the feature_ids
+  #get all result sets by array chip?  or get all ExperimentalChips by array chip
+  #would have to be result set as we would find our own ecs.  May find our own rset
+  
+  
+  throw('This needs updating');
+
+  if ($array_chip->has_status('IMPORTED_CS_'.$fg_cs->dbID())) {
+    $fimported = 1;
+    $self->log("Skipping ArrayChip feature import (".$array_chip->name().") already fully imported for ".$self->data_version());
+  } elsif ($self->recovery()) {
+    $self->log("Rolling back partially imported ArrayChip features:\t".$array_chip->name());
+    $self->db->rollback_ArrayChip_features($array_chip, $fg_cs);
+  }
+
+
+  #need to check whether already imported on specified schema_build
+  #check for appropriate file given format in input dir or take path
+
+  #if (! $fimported) {#now need to do this irrespective of import status due to x y requirements
+  #need only do this once, i.e. if the cache isn't defined yet
+  #this is assuming cache will be built properly
+  #may cause problems if not cleaned up properly after use.
+
+  #ignore xy requirements for now, these should be associated with results file
+
+
+
+  #if (! defined $self->{'_probe_cache'}) {
+  if (! $fimported) {
+
+	
+
+	if (! $array_file) {
+
+	  if (! defined $self->get_dir('input')) {
+		throw("No input_dir defined, if you are running in a non Experiment context please use -array_file");
+      }
+      
+      #hacky ..do better?
+      for my $suffix ("gff", "adf") {
+		$cmd = $self->get_dir('input')."/".$self->array_name()."*".$suffix;
+		@list = `ls $cmd 2>/dev/null`;
+	
+		if ((scalar(@list) == 1) && 
+			($list[0] !~ /No such file or directory/o)) { ###this is only printed to STDERR?
+	  
+		  if (! defined $array_file) {
+			$array_file = $list[0];
+		  } else {
+			throw("Found more than one array file : $array_file\t$list[0]\nSpecify one with -array_file");
+		  }
+		}
+      }
+
+      throw("Cannot find array file. Specify one with -array_file") if (! defined $array_file);
+    }
+    
+    
+    if ($array_file =~ /gff/io) {
+      $array_file_format = "GFF";
+    } elsif ($array_file =~ /adf/io) {
+      $array_file_format = "ADF";
+      throw("Does not yet accomodate Sanger adf format");
+    } else {
+      throw("Could not determine array file format: $array_file");
+	}
+	
+
+	#if (! $fimported) {
+	$fanal = $self->db->get_AnalysisAdaptor->fetch_by_logic_name(($array_file_format eq "ADF") ? "VendorMap" : "LiftOver");
+	#}
+   
+	$self->log("Parsing ".$self->vendor()." array data (".localtime().")");
+	$fh = open_file($array_file);
+	my @lines = <$fh>;
+	close($fh);
+
+	
+
+	my ($chr, $start, $end, $strand, $pid);#, $x, $y, $meta_x, $meta_y, @xy);
+    
+	#avoid mutliple calls for same array
+	my $ac_dbid = $array->get_ArrayChip_by_design_id($array->name())->dbID();
+
+	#sort file to enable probe cache method for new feature imports
+	@lines = sort {(split/\t|\;/o, $a)[8] cmp (split/\t|\;/o, $b)[8]} @lines;
+
+	#This is not sorting properly!!
+
+	#my @tmp = map ((split/\t|\;/o, $_)[8], @lines);
+	#@tmp = sort @tmp;
+
+
+	#$self->log('Tmp sorted array is :\n'.join("\n", @tmp)."\n");
+
+
+
+
+	foreach $line(@lines) {
+	  $line =~ s/\r*\n//;
+      
+	  #($chr, $start, $end, $ratio, $pid) = split/\t/o, $line;
+	  #($chr, undef, undef, $start, $end, undef, $strand, undef, $pid, $x, $y, $meta_x, $meta_y) = split/\t|\;/o, $line;
+	  ($chr, undef, undef, $start, $end, undef, $strand, undef, $pid) = split/\t|\;/o, $line;
+	  
+
+	  if($self->ucsc_coords){
+		$start += 1;
+	  }
+
+
+	  #$meta_x =~ s/META_X=//;
+	  #$x =~ s/X=//;
+	  #$x = $x + (($meta_x -1)*26);
+	  #$meta_y =~ s/META_Y=//;
+	  #$y =~ s/Y=//;
+	  #$y = $y + (($meta_y -1)*25);
+	  $pid =~ s/reporter_id=//o;
+	  $chr  =~ s/chr//;
+	  $strand = ($strand eq "+") ? 0 : 1;
+	
+	  #Hack!!!!!!  This is still maintaining the probe entry (and result?)
+	  if (!  $self->cache_slice($chr)) {
+		warn("-- Skipping non standard probe (${pid}) with location:\t${chr}:${start}-${end}\n");
+		next;
+	  }
+
+
+	  #need to parse dependant on file format 
+	  #also need to account for duplicate probes on grid
+
+	  #need to test for imprted here for rebuilding the probe_info cache
+	  #this will result in always using first x y for the inital import (i.e. skip any probe already in cache)
+	  #or using last x y for previosuly imported as we can't check the cache as it will already be there
+	  #could check for x y
+	  #should always check x y as this will also implicitly check if it is in the cache
+	
+	  #if (! $self->get_probe_id_by_name($pid)) { #already present in cache
+	  #if(! (@xy = @{$self->get_probe_x_y_by_name($pid)})){
+	  
+		#can we not use store_set_probes_features
+		#would have to add x y to probe, which is not logical as probe can have many x y's
+		#keep like this and just change cache_probe_info
+	  
+	  if (! $imported) {
+		#when we utilise array coords, we need to look up probe cache and store again with new coords
+		#we're currently storing duplicates i.e. different ids with for same probe
+		#when we should be storing two records for the same probe/id
+		#the criteria for this will be different for each vendor, may have to check container etc for NimbleGen
+		
+		$op = Bio::EnsEMBL::Funcgen::Probe->new(
+												-NAME          => $pid,
+												-LENGTH        => ($end - $start),
+												-ARRAY         => $array,
+												-ARRAY_CHIP_ID => $ac_dbid,
+												-CLASS         => 'EXPERIMENTAL',
+											   );
+		
+		($op) = @{$op_adaptor->store($op)};
+		#$self->cache_probe_info($pid, $op->dbID, $x, $y);
+	  } else {
+		#update XY cache for previously imported array
+		#$self->cache_probe_info($pid, $self->get_probe_id_by_name($pid), $x, $y);
+	  }
+	  
+	  #if (! $fimported) {
+	  $of = Bio::EnsEMBL::Funcgen::ProbeFeature->new(
+													 -START         => $start,
+													 -END           => $end,
+													 -STRAND        => $strand,
+													 -SLICE         => $self->cache_slice($chr),
+													 -ANALYSIS      => $fanal,
+													 -MISMATCHCOUNT => 0,
+													 -PROBE_ID      => ($imported) ? 
+													 $self->get_probe_id_by_name_Array($pid, $array) : $op->dbID(),
+													);
+	  
+	  #get_probe_id will throw if not in cache, which means that we have an unimported probe 
+	  #for an ArrayChip which is flagged as imported, must have been omitted from the import deisgn
+	  #probably a manual fix required.  Can we log these and write an update/repair script.
+
+	  $of_adaptor->store($of);
+	  #}
+	
+	  #} else {
+	  #warn("Sanger does not accomodate on plate duplicates yet, result are not linked to X Y coords, using first coords for probe if present in results for $pid\n");ĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦĦ
+	  #}
+	}
+
+	$array_chip->adaptor->set_status('IMPORTED_CS_'.$fg_cs->dbID(), $array_chip) if ! $fimported;
+	$self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED_CS_".$fg_cs->dbID());
+	
+  }
+  
+  
+  
+  if (! $imported) {
+    $array_chip->adaptor->set_status('IMPORTED', $array_chip);
+    $self->log("ArrayChip:\t".$array_chip->design_id()." has been IMPORTED");
+	$self->resolve_probe_data();
+  }
+  
+  $self->log("Finished parsing ".$self->vendor()." array/probe data (".localtime().")");
+  #warn("Finished parsing ".$self->vendor()." array/probe data (".localtime().")");  
+  
+  return;
+}
+
+=head2 read_and_import_result_data
+
+  Example    : $imp->read_and_import_result_data();
+  Description: Parses and imports result for the sanger PCR array platform
+  Returntype : none
+  Exceptions : none
+  Caller     : Importer
+  Status     : At risk
+
+=cut
+
+sub read_and_import_result_data{
+  my $self = shift;
+
+  #change this to read_gff_chip_results
+  #as opposed to gff channel results
+  #This should also use the default logic names for the Vendor, or take a user defined list 
+  $self->log("Reading ".$self->vendor()." result data (".localtime().")");
+
+  my ($file, $chip_uid, $line, $echip);
+  my ($ratio, $pid, %chip_files, %roll_back);
+  my $of_adaptor = $self->db->get_ProbeFeatureAdaptor();
+  my $ec_adaptor = $self->db->get_ExperimentalChipAdaptor();
+  my $chan_adaptor = $self->db->get_ChannelAdaptor();
+  my $analysis = $self->db->get_AnalysisAdaptor->fetch_by_logic_name("SangerPCR");
+  my $result_adaptor = $self->db->get_ResultSetAdaptor();
+  #this is done to avoid having to self->array_name in loop, will make multiple array loop easier 
+  my $array = ${$self->arrays()}[0];
+
+  #This works a little differently as we're not parsing a meta file
+  #so the echips haven't been added yet.
+  #This is treating each array chip as a separate array, unless arrayset is defined
+  #AT present we have no way of differentiating between different array_chips on same array???!!!
+  #Need to add functionality afterwards to collate array_chips into single array
+
+  #First add the echips to the Experiment
+  
+  if (! @{$self->result_files()}) {
+    my $list = "ls ".$self->input_dir().'/[0-9]*-[0-9a-zA-Z]*\.all\.*';
+    my @rfiles = `$list`;
+    $self->result_files(\@rfiles);
+  }
+
+  
+  foreach $file(@{$self->result_files()}) {
+    chomp $file;
+    ($chip_uid = $file) =~ s/.*\///;
+    $chip_uid =~ s/\..*//;
+
+	$self->log("Found SANGER results file for $chip_uid:\t$file");
+    $chip_files{$chip_uid} = $file;
+    
+
+    $echip = $ec_adaptor->fetch_by_unique_id_vendor($chip_uid, 'SANGER');
+
+    #this should throw if not recovery
+    #Nee to check Nimbelgen methods
+
+    if ($echip) {
+  
+      if (! $self->recovery()) {
+		throw("ExperimentalChip(".$echip->unqiue_id().") already exists in the database\nMaybe you want to recover?");
+      }else{
+		#log pre-reg'd chips for rollback
+		$roll_back{$echip->dbID()} = 1;
+	  }
+    } else {
+
+      $echip =  Bio::EnsEMBL::Funcgen::ExperimentalChip->new
+		(
+		 -EXPERIMENT_ID  => $self->experiment->dbID(),
+		 -ARRAY_CHIP_ID  => $self->arrays->[0]->get_ArrayChip_by_design_id($array->name())->dbID(),
+		 -UNIQUE_ID      => $chip_uid,
+		);
+          
+      ($echip) = @{$ec_adaptor->store($echip)};	
+      $self->experiment->add_ExperimentalChip($echip); #if we need a contains method in  here , always add!!
+    }
+
+	#do we need DUMMY entries any more?
+
+    #sub this passing the echip?
+    foreach my $type ('DUMMY_TOTAL', 'DUMMY_EXPERIMENTAL') {
+
+      my $channel = $chan_adaptor->fetch_by_type_experimental_chip_id($type, $echip->dbID());
+      
+      if ($channel) {
+		if (! $self->recovery()) {
+		  throw("Channel(".$echip->unique_id().":$type) already exists in the database\nMaybe you want to recover?");
+		}
+      } else {
+
+		$channel =  Bio::EnsEMBL::Funcgen::Channel->new
+		  (
+		   -EXPERIMENTAL_CHIP_ID => $echip->dbID(),
+		   -TYPE                 => $type,
+		  );
+	
+		($channel) = @{$chan_adaptor->store($channel)};
+      }
+    }
+  }
+
+
+  
+  #Now get rset using experiment echips
+  my $rset = $self->get_import_ResultSet($analysis, 'experimental_chip');
+
+  if ($rset) {					#we have some new data
+
+    foreach my $echip (@{$self->experiment->get_ExperimentalChips()}) {
+      
+      if ($echip->has_status('IMPORTED_SangerPCR', $echip)) {
+		$self->log("ExperimentalChip(".$echip->unique_id().") has already been imported");
+      } else {
+
+		my $cc_id = $rset->get_chip_channel_id($echip->dbID());
+		
+		if ($self->recovery() && $roll_back{$echip->dbID()}){
+		  $self->log("Rolling back results for ExperimentalChip:\t".$echip->unique_id());
+		  $self->rollback_results($cc_id);
+		}
+
+		$self->log("Reading SANGER result file for ".$echip->unique_id().":\t".$chip_files{$echip->unique_id()});
+		$self->get_probe_cache_by_Array($array) || throw('Failed to reset probe cache handle');
+		my $fh = open_file($chip_files{$echip->unique_id()});
+		my @lines = <$fh>;
+		close($fh);
+
+		my $rfile_path = $self->get_dir("norm")."/result.SangerPCR.".$echip->unique_id().".txt";
+		my $rfile = open_file($rfile_path, '>');
+		my $r_string = "";
+		
+		
+		@lines = sort {(split/\t|\:/o, $a)[5] cmp (split/\t|\:/o, $b)[5]} @lines;
+
+		foreach my $line (@lines) {
+		  $line =~ s/\r*\n//o;
+		  
+		  ($ratio, undef, $pid) = (split/\t|\:/o, $line)[3..5];
+		  $pid =~ s/.*://o;
+		  
+		  $ratio = '\N' if $ratio eq 'NA'; #NULL is still useful info to store in result
+		  #my ($x, $y) = @{$self->get_probe_x_y_by_name($pid)};
+		  
+		  #this is throwing away the encode region which could be used for the probeset/family?	
+		  $r_string .= '\N'."\t".$self->get_probe_id_by_name_Array($pid, $array)."\t${ratio}\t${cc_id}\t".'\N'."\t".'\N'."\n";#${x}\t${y}\n";
+		}
+		
+		print $rfile $r_string;
+		close($rfile);
+
+		$self->log("Importing:\t$rfile_path");
+		$self->db->load_table_data("result",  $rfile_path);
+		$self->log("Finished importing:\t$rfile_path");
+		$echip->adaptor->set_status('IMPORTED_SangerPCR', $echip);
+	  }
+    }
+
+
+
+  } else {
+    $self->log("No new data, skipping result parse");
+  }
+
+  $self->log("Finished reading and importing ".$self->vendor()." result data (".localtime().")");
+  return;
+}
+
+
+
+1;