Mercurial > repos > mahtabm > ensembl
view variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
line wrap: on
line source
=head1 LICENSE Copyright (c) 1999-2011 The European Bioinformatics Institute and Genome Research Limited. All rights reserved. This software is distributed under a modified Apache license. For license details, please see http://www.ensembl.org/info/about/code_licence.html =head1 CONTACT Please email comments or questions to the public Ensembl developers list at <ensembl-dev@ebi.ac.uk>. Questions may also be sent to the Ensembl help desk at <helpdesk@ensembl.org>. =cut package Bio::EnsEMBL::Funcgen::Parsers::vista; use strict; # Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1 # e.g. # # >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] # AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct # ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa # ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg # acagag use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; use Bio::EnsEMBL::Utils::Exception qw( throw ); use vars qw(@ISA); @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); sub new { my $caller = shift; my $class = ref($caller) || $caller; my $self = $class->SUPER::new(@_, type => 'Vista'); #Set default feature_type and feature_set config $self->{static_config}{feature_types} = {( 'VISTA Target' => { -name => 'VISTA Target', -class => 'Search Region', -description => 'VISTA target region', }, 'VISTA Enhancer' => { -name => 'VISTA Enhancer', -class => 'Enhancer', -description => 'Enhancer identified by positive VISTA assay', }, 'VISTA Target - Negative' => { -name => 'VISTA Target - Negative', -class => 'Search Region', -description => 'Enhancer negative region identified by VISTA assay', }, )}; $self->{static_config}{analyses} = { VISTA => { -logic_name => 'VISTA', -description => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)', -display_label => 'VISTA', -displayable => 1, }, }; #This is used as the entry point to store/validate #So all of the above needs to be referenced in here $self->{static_config}{feature_sets} = { 'VISTA enhancer set' => { #Stored in this order #Entries here are flexible #Can be omited if defined in feature_set #top level analyses/feature_types definition required if no DB defaults available #These can be a ref to the whole or subset of the top level analyses/feature_types hash #A key with an empty hash or undef(with or without a matching key in the top level analyses/feature_types hash #analyses => $self->{static_config}{analyses}, feature_types => $self->{static_config}{feature_types}, #feature_type and analysis values must be string key to top level hash #This wont work for feature_types as they are not unique by name!!!!!! #This is why we have top level hash where we can define a unique compound key name feature_set => { -feature_type => 'VISTA Target',#feature_types config key name not object -display_label => 'VISTA Enhancers', -description => 'Experimentally validated enhancers', -analysis => 'VISTA',#analyses config key name not object }, } }; #$self->validate_and_store_feature_types; $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); $self->set_feature_sets; return $self; } # Parse file and return hashref containing: # # - arrayref of features # - arrayref of factors sub parse_and_load{ my ($self, $files, $old_assembly, $new_assembly) = @_; if(scalar(@$files) != 1){ throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));; } my $file = $files->[0]; $self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file"); my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor; my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection'); # this object is only used for projection my $fset_config = $self->{static_config}{feature_sets}{'VISTA enhancer set'}; my $feature_positive = $fset_config->{'feature_types'}{'VISTA Enhancer'}; my $feature_negative = $fset_config->{'feature_types'}{'VISTA Target - Negative'}; my $set = $fset_config->{feature_set}; use Bio::EnsEMBL::Registry; my %id_prefixes = ( homo_sapiens => 'hs', mus_musculus => 'mm', ); my $species = Bio::EnsEMBL::Registry->get_alias($self->db->species); if( (! defined $species) || (! exists $id_prefixes{$species}) ){ throw("Failed to get a VISTA ID prefix for species alias:\t$species"); } $species = $id_prefixes{$species}; ### Read file open (FILE, "<$file") || die "Can't open $file"; my $cnt = 0; my $skipped = 0; while (<FILE>) { next if ($_ !~ /^>/o); # only read headers # OLD >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] # v66 >Mouse|chr12:112380949-112381824 | element 3 | positive | neural tube[4/4] | hindbrain (rhombencephalon)[4/4] | forebrain[4/4] #ID naming scheme change from LBNL-1 to hs1 or mm1 #But the flat file and url use two different naming schemes! #VISTA URL is: where experiment id is element number and species_id 1 = human and 2 = mouse #http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=1&organism_id=1 #Add links to cell_type for tissues in @expression_pattern? #This would be vista specific cell_type_annotation? Or we could just have associated_cell_type? (without annotation) #Just link for now my (undef, $coords, $element, $posneg, @expression_patterns) = split /\s*\|\s*/o;#was \s+ # parse co-ordinates & id my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o; my ($element_number) = $element =~ /\s*element\s*(\d+)/o; # seq_region ID and co-ordinates my $chr_slice; if ($old_assembly) { $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly); } else { $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr); } if (!$chr_slice) { warn "Can't get slice for chromosme $chr\n"; next; } my $seq_region_id = $chr_slice->get_seq_region_id; throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id); # Assume these are all on the positive strand? Is this correct? my $strand = 1; my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new ( -start => $start,#is this in UCSC coords? -end => $end, #is this in UCSC coords? -strand => $strand, -feature_type => $posneg eq 'positive' ? $feature_positive : $feature_negative, -slice => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly), -display_label => $species.$element_number,#"LBNL-$element_number", -feature_set => $set, ); # project if necessary if ($new_assembly) { $feature = $self->project_feature($feature, $dummy_analysis, $new_assembly); if(! defined $feature){ $skipped ++; next; } } $cnt ++; $extfeat_adaptor->store($feature); } close FILE; $self->log('Parsed '.($cnt+$skipped).' features'); $self->log("Loaded $cnt features"); $self->log("Skipped $skipped features"); #Now set states foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ $set->adaptor->store_status($status, $set); } return; } 1;