Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/vista.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,246 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::vista; + +use strict; + +# Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1 +# e.g. +# +# >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] +# AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct +# ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa +# ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg +# acagag + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::Utils::Exception qw( throw ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'Vista'); + + #Set default feature_type and feature_set config + $self->{static_config}{feature_types} = {( + 'VISTA Target' => { + -name => 'VISTA Target', + -class => 'Search Region', + -description => 'VISTA target region', + }, + 'VISTA Enhancer' => { + -name => 'VISTA Enhancer', + -class => 'Enhancer', + -description => 'Enhancer identified by positive VISTA assay', + }, + 'VISTA Target - Negative' => { + -name => 'VISTA Target - Negative', + -class => 'Search Region', + -description => 'Enhancer negative region identified by VISTA assay', + }, + )}; + + $self->{static_config}{analyses} = { + VISTA => { + -logic_name => 'VISTA', + -description => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)', + -display_label => 'VISTA', + -displayable => 1, + }, + }; + + #This is used as the entry point to store/validate + #So all of the above needs to be referenced in here + $self->{static_config}{feature_sets} = { + 'VISTA enhancer set' => + { + #Stored in this order + + #Entries here are flexible + #Can be omited if defined in feature_set + #top level analyses/feature_types definition required if no DB defaults available + #These can be a ref to the whole or subset of the top level analyses/feature_types hash + #A key with an empty hash or undef(with or without a matching key in the top level analyses/feature_types hash + + #analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + + #feature_type and analysis values must be string key to top level hash + #This wont work for feature_types as they are not unique by name!!!!!! + #This is why we have top level hash where we can define a unique compound key name + + feature_set => + { + -feature_type => 'VISTA Target',#feature_types config key name not object + -display_label => 'VISTA Enhancers', + -description => 'Experimentally validated enhancers', + -analysis => 'VISTA',#analyses config key name not object + }, + } + }; + + #$self->validate_and_store_feature_types; + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + + +sub parse_and_load{ + my ($self, $files, $old_assembly, $new_assembly) = @_; + + if(scalar(@$files) != 1){ + throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));; + } + + my $file = $files->[0]; + $self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file"); + + my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection'); # this object is only used for projection + my $fset_config = $self->{static_config}{feature_sets}{'VISTA enhancer set'}; + my $feature_positive = $fset_config->{'feature_types'}{'VISTA Enhancer'}; + my $feature_negative = $fset_config->{'feature_types'}{'VISTA Target - Negative'}; + my $set = $fset_config->{feature_set}; + + use Bio::EnsEMBL::Registry; + my %id_prefixes = ( + homo_sapiens => 'hs', + mus_musculus => 'mm', + ); + + my $species = Bio::EnsEMBL::Registry->get_alias($self->db->species); + + if( (! defined $species) || + (! exists $id_prefixes{$species}) ){ + throw("Failed to get a VISTA ID prefix for species alias:\t$species"); + } + + $species = $id_prefixes{$species}; + + ### Read file + open (FILE, "<$file") || die "Can't open $file"; + my $cnt = 0; + my $skipped = 0; + + + while (<FILE>) { + + next if ($_ !~ /^>/o); # only read headers + + # OLD >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12] + # v66 >Mouse|chr12:112380949-112381824 | element 3 | positive | neural tube[4/4] | hindbrain (rhombencephalon)[4/4] | forebrain[4/4] + + #ID naming scheme change from LBNL-1 to hs1 or mm1 + #But the flat file and url use two different naming schemes! + #VISTA URL is: where experiment id is element number and species_id 1 = human and 2 = mouse + #http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=1&organism_id=1 + + #Add links to cell_type for tissues in @expression_pattern? + #This would be vista specific cell_type_annotation? Or we could just have associated_cell_type? (without annotation) + #Just link for now + + my (undef, $coords, $element, $posneg, @expression_patterns) = split /\s*\|\s*/o;#was \s+ + # parse co-ordinates & id + my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o; + my ($element_number) = $element =~ /\s*element\s*(\d+)/o; + + # seq_region ID and co-ordinates + my $chr_slice; + + if ($old_assembly) { + $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly); + } else { + $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr); + } + + if (!$chr_slice) { + warn "Can't get slice for chromosme $chr\n"; + next; + } + + my $seq_region_id = $chr_slice->get_seq_region_id; + throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id); + + # Assume these are all on the positive strand? Is this correct? + my $strand = 1; + + + my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -start => $start,#is this in UCSC coords? + -end => $end, #is this in UCSC coords? + -strand => $strand, + -feature_type => $posneg eq 'positive' ? $feature_positive : $feature_negative, + -slice => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly), + -display_label => $species.$element_number,#"LBNL-$element_number", + -feature_set => $set, + ); + + + # project if necessary + if ($new_assembly) { + + $feature = $self->project_feature($feature, $dummy_analysis, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + } + + $cnt ++; + $extfeat_adaptor->store($feature); + } + + close FILE; + + $self->log('Parsed '.($cnt+$skipped).' features'); + $self->log("Loaded $cnt features"); + $self->log("Skipped $skipped features"); + + #Now set states + foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ + $set->adaptor->store_status($status, $set); + } + + + return; + +} + + +1;
