diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,246 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::Parsers::vista;
+
+use strict;
+
+# Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
+# e.g.
+#
+# >chr16:84987588-84988227 | element 1 | positive  | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
+# AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct
+# ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa
+# ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg
+# acagag
+
+use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
+
+
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_, type => 'Vista');
+
+  #Set default feature_type and feature_set config
+  $self->{static_config}{feature_types} = {(
+											'VISTA Target'   => {
+																 -name        => 'VISTA Target', 
+																 -class       => 'Search Region',
+																 -description => 'VISTA target region',
+																},
+											'VISTA Enhancer' => {
+																 -name        => 'VISTA Enhancer', 
+																 -class       => 'Enhancer',
+																 -description => 'Enhancer identified by positive VISTA assay',
+																},
+											'VISTA Target - Negative' => {
+																		  -name        => 'VISTA Target - Negative', 
+																		  -class => 'Search Region',
+																		  -description => 'Enhancer negative region identified by VISTA assay',
+																		 },
+										   )};
+  
+  $self->{static_config}{analyses} = {
+									  VISTA =>  { 
+												 -logic_name => 'VISTA',
+												 -description   => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)',
+												 -display_label => 'VISTA',
+												 -displayable   => 1,
+												},
+									 };
+
+  #This is used as the entry point to store/validate
+  #So all of the above needs to be referenced in here
+  $self->{static_config}{feature_sets} = {
+										  'VISTA enhancer set' => 
+										  {
+										   #Stored in this order 
+
+										   #Entries here are flexible
+										   #Can be omited if defined in feature_set
+										   #top level analyses/feature_types definition required if no DB defaults available
+										   #These can be a ref to the whole or subset of the top level analyses/feature_types hash
+										   #A key with an empty hash or undef(with or without a matching key in the top level analyses/feature_types hash
+										   
+										   #analyses      => $self->{static_config}{analyses},
+										   feature_types => $self->{static_config}{feature_types},
+
+										   #feature_type and analysis values must be string key to top level hash
+										   #This wont work for feature_types as they are not unique by name!!!!!!
+										   #This is why we have top level hash where we can define a unique compound key name
+
+										   feature_set   => 
+										   {
+											-feature_type      => 'VISTA Target',#feature_types config key name not object
+											-display_label     => 'VISTA Enhancers',
+											-description       => 'Experimentally validated enhancers',
+											-analysis          => 'VISTA',#analyses config key name not object
+										   },
+										  }
+										 };
+  
+  #$self->validate_and_store_feature_types;
+  $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
+  $self->set_feature_sets;
+
+  return $self;
+}
+
+
+
+# Parse file and return hashref containing:
+#
+# - arrayref of features
+# - arrayref of factors
+
+
+
+sub parse_and_load{
+  my ($self, $files, $old_assembly, $new_assembly) = @_;
+  
+  if(scalar(@$files) != 1){
+	throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));;
+  }
+  
+  my $file = $files->[0];
+  $self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file");
+
+  my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor;
+  my $dummy_analysis   = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection');  # this object is only used for projection
+  my $fset_config      = $self->{static_config}{feature_sets}{'VISTA enhancer set'};
+  my $feature_positive = $fset_config->{'feature_types'}{'VISTA Enhancer'};
+  my $feature_negative = $fset_config->{'feature_types'}{'VISTA Target - Negative'};
+  my $set              = $fset_config->{feature_set};
+
+  use Bio::EnsEMBL::Registry;
+  my %id_prefixes = (
+					 homo_sapiens => 'hs',
+					 mus_musculus => 'mm',
+					);
+  
+  my $species = Bio::EnsEMBL::Registry->get_alias($self->db->species);
+
+  if( (! defined $species) ||
+	  (! exists $id_prefixes{$species}) ){
+	throw("Failed to get a VISTA ID prefix for species alias:\t$species");
+  }
+
+  $species = $id_prefixes{$species};
+
+  ### Read file
+  open (FILE, "<$file") || die "Can't open $file";
+  my $cnt = 0;
+  my $skipped = 0;
+
+
+  while (<FILE>) {
+
+    next if ($_ !~ /^>/o); # only read headers
+
+    # OLD >chr16:84987588-84988227 | element 1 | positive  | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
+	# v66 >Mouse|chr12:112380949-112381824 | element 3 | positive  | neural tube[4/4] | hindbrain (rhombencephalon)[4/4] | forebrain[4/4]
+
+	#ID naming scheme change from LBNL-1 to hs1 or mm1
+	#But the flat file and url use two different naming schemes!
+	#VISTA URL is: where experiment id is element number and species_id 1 = human and 2 = mouse
+	#http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=1&organism_id=1
+
+	#Add links to cell_type for tissues in @expression_pattern?
+	#This would be vista specific cell_type_annotation? Or we could just have associated_cell_type? (without annotation)
+	#Just link for now
+
+    my (undef, $coords, $element, $posneg, @expression_patterns) = split /\s*\|\s*/o;#was \s+
+    # parse co-ordinates & id
+    my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o;
+    my ($element_number) = $element =~ /\s*element\s*(\d+)/o;
+
+    # seq_region ID and co-ordinates
+    my $chr_slice;
+
+    if ($old_assembly) {
+      $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly);
+    } else {
+      $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr);
+    }
+
+    if (!$chr_slice) {
+      warn "Can't get slice for chromosme $chr\n";
+      next;
+    }
+
+    my $seq_region_id = $chr_slice->get_seq_region_id;
+	throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id);
+
+    # Assume these are all on the positive strand? Is this correct?
+    my $strand = 1;
+
+
+	my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
+	  (
+	   -start         => $start,#is this in UCSC coords?
+	   -end           => $end,  #is this in UCSC coords?
+	   -strand        => $strand,
+	   -feature_type  => $posneg eq 'positive' ? $feature_positive : $feature_negative,
+	   -slice         => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly),
+	   -display_label => $species.$element_number,#"LBNL-$element_number",
+	   -feature_set   => $set,
+	  );
+	
+
+    # project if necessary
+    if ($new_assembly) {
+
+      $feature = $self->project_feature($feature, $dummy_analysis, $new_assembly);
+
+	  if(! defined $feature){
+		$skipped ++;
+		next;
+	  }
+    }
+
+	$cnt ++;
+	$extfeat_adaptor->store($feature);
+  }
+
+  close FILE;
+
+  $self->log('Parsed '.($cnt+$skipped).' features');
+  $self->log("Loaded $cnt features");
+  $self->log("Skipped $skipped features");
+
+  #Now set states
+  foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){
+	$set->adaptor->store_status($status, $set);
+  }
+  
+
+  return;
+
+}
+
+
+1;