Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/cisred.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/cisred.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,487 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::cisred; + +use strict; + +use File::Basename; + +# To get files for CisRed data, download the following 2 files (e.g. via wget): +# +# http://www.cisred.org/content/databases_methods/human_2/data_files/motifs.txt +# +# http://www.cisred.org/content/databases_methods/human_2/data_files/search_regions.txt + + +#No longer valid urls, now use the following for ensembl formats for all species: +#http://www.bcgsc.ca/downloads/cisred/temp/cisRED4Ensembl/ +#naminf may not be obvious, may have to cross reference with above previous urls to get build info + +# Format of motifs.txt (note group_name often blank) + +#name chromosome start end strand group_name ensembl_gene +#craHsap1 1 168129978 168129997 -1 1 ENSG00000000457 +#craHsap2 1 168129772 168129781 -1 2 ENSG00000000457 +#craHsap3 1 168129745 168129756 -1 3 ENSG00000000457 +#craHsap4 1 168129746 168129753 -1 4 ENSG00000000457 +#craHsap5 1 168129745 168129752 -1 5 ENSG00000000457 +#craHsap6 1 168129741 168129757 -1 6 ENSG00000000457 + + +# Format of search_regions.txt +# name chromosome start end strand ensembl_gene_id +# 1 17 39822200 39824467 -1 ENSG00000005961 +# 8 17 23151483 23153621 -1 ENSG00000007171 +# 14 1 166434638 166437230 -1 ENSG00000007908 +# 19 1 23602820 23605631 -1 ENSG00000007968 + + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); + + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + + + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'cisRED'); + + #Set default feature_type and feature_set config + $self->{static_config}{feature_types} = + { + 'cisRED Search Region' => { + -name => 'cisRED Search Region', + -class => 'Search Region', + -description => 'cisRED search region', + }, + 'cisRED Motif' => { + -name => 'cisRED Motif', + -class => 'Regulatory Motif', + -description => 'cisRED atomic motif', + }, + }; + + $self->{static_config}{analyses} = + { + cisRED => { + -logic_name => 'cisRED', + -description => 'cisRED motif search (www.cisred.org)', + -display_label => 'cisRED', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets} = + { + 'cisRED search regions' => + { + analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + feature_set => { + #feature_type and analysis here are the keys from above + -feature_type => 'cisRED Search Region', + -display_label => 'cisRED searches', + -analysis => 'cisRED', + }, + xrefs => 1, + }, + + + 'cisRED motifs' => + { + analyses => $self->{static_config}{analyses}, + feature_types => $self->{static_config}{feature_types}, + feature_set => { + #feature_type and analysis here are the keys from above + -feature_type => 'cisRED Motif', + -analysis => 'cisRED', + }, + xrefs => 1, + }, + }; + + #$self->validate_and_store_feature_types; + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + +#To do +# 1 This needs to take both file names, motifs, then search regions. Like the Bed/GFF importers do. + + +sub parse_and_load { + my ($self, $files, $old_assembly, $new_assembly) = @_; + $self->log_header("Parsing cisRED data"); + + if(scalar(@$files) != 2){ + throw('You must currently define a motif and search file to load cisRED features from:\t'.join(' ', @$files)); + } + + + my $analysis_adaptor = $self->db->get_AnalysisAdaptor(); + #my %features_by_group; # name -> factor_id + my %groups; + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + #my $display_name_cache = $self->build_display_name_cache('gene'); + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'CisREDProjection'); + + # ---------------------------------------- + # We need a "blank" factor for those features which aren't assigned factors + # Done this way to maintain referential integrity + #my $blank_factor_id = $self->get_blank_factor_id($db_adaptor); + + #More validation of files here? + my ($motif_file) = grep(/motif/, @$files); + my ($search_file) = grep(/search/, @$files); + my $species = $self->db->species; + if(! $species){ + throw('Must define a species to define the external_db'); + } + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + + # Parse motifs.txt file + $self->log_header("Parsing cisRED motifs from $motif_file"); + my $skipped = 0; + my $skipped_xref = 0; + #my $coords_changed = 0; + my $cnt = 0; + my $set = $self->{static_config}{feature_sets}{'cisRED motifs'}{feature_set}; + + + + open (FILE, "<$motif_file") || die "Can't open $motif_file"; + <FILE>; # skip header + + while (<FILE>) { + next if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o); + chomp; + + #name chromosome start end strand group_name ensembl_gene + #craHsap1 1 168129978 168129997 - crtHsap40066,crtHsap40060 ENSG00000000457 + #craHsap2 1 168129772 168129781 - crtHsap40068,crtHsap40193,crtHsap40130 ENSG00000000457 + + #So we only ever have one atomic motif, which may belong to several groups + #Do not store atmoic motifs as feature types as this is the actual feature + #simply use the feature_set->feature_type and store the atmoic motif id as the name + + + my ($motif_name, $chromosome, $start, $end, $strand, $groups, $gene_id) = split/\t/o; + #($gene_id) = $gene_id =~ /(ENS.*G\d{11})/; + my @group_names = split/,/, $groups; + + #These are stranded features, so either - or +, never 0; + $strand = ($strand eq '-') ? -1 : 1; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $motif_name\n"; + $skipped++; + next; + } + + + #get feature_type first + + #we are not maintaining this link in the DB! + #Do we need another xref for this or a different table? + + + #if ($group_name && $group_name ne '' && $group_name !~ /\s/o) { +# +# if(! exists $features_by_group{$group_name}){ +# $features_by_group{$group_name} = $ftype_adaptor->fetch_by_name('crtHsap'.$group_name); +# +# if(! defined $features_by_group{$group_name}){ +# ($features_by_group{$group_name}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new +# ( +# -name => 'crtHsap'.$group_name, +# -class => 'Regulatory Motif', +# -description => 'cisRED group', +# ))}; +# } +# } +# } + #}else{ + # throw("Found cisRED feature $motif_name with no group_name, unable to defined feature_type"); + #} + + foreach my $group(@group_names){ + + next if exists $groups{$group}; + + #else store the new group as a feature_type and set $group to be the feature_type + ($group) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $group, + -class => 'Regulatory Motif', + -description => 'cisRED group', + ))}; + } + + + + #my $ftype = (defined $features_by_group{$group_name}) ? $features_by_group{$group_name} : $self->{'feature_sets'}{'cisRED group motifs'}->feature_type; + + + my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $motif_name, + -start => $start, + -end => $end, + -strand => $strand, + #-feature_type => $ftype, + -associated_feature_types => \@group_names, + -feature_set => $set, + -slice => $slice_cache{$chromosome}, + ); + + + + # project if necessary + if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + + } + + ($feature) = @{$extf_adaptor->store($feature)}; + $cnt++; + + + + #We don't care so much about loading features for retired Genes here + #as the Genes are only used to define the search regions + #Not a direct alignment as with the miRanda set + + #However, adding an xref will create dead link in the browser + + #Build Xref here + if (! $gene_id) { + warn("No xref available for motif $motif_name\n"); + $skipped_xref++; + next; + } + + my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + + #Handle release/version in xref version as stable_id version? + + my $dbentry = Bio::EnsEMBL::DBEntry->new + ( + -dbname => $species.'_core_Gene', + #-release => $self->db->_get_schema_build($self->db->dnadb), + #-release => '49_36b',#harcoded for human + -release => '49_37b', #hardcoded for mouse + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + -db_display_name => 'EnsemblGene', + -type => 'MISC',#this is external_db.type + -primary_id => $gene_id, + -display_id => $display_name, + -info_type => 'MISC', + -info_text => 'GENE', + -linkage_annotation => 'cisRED motif gene', + -analysis => $set->analysis, + #could have version here if we use the correct dnadb to build the cache + ); + $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag + } + + + close FILE; + + $self->log("Stored $cnt cisRED ExternalFeature motif"); + $self->log("Skipped $skipped cisRED ExternalFeature motif imports"); + $self->log("Skipped an additional $skipped_xref DBEntry imports"); + + #Now store states + foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){ + $set->adaptor->store_status($status, $set); + } + + + + # ---------------------------------------- + # Search regions + # read search_regions.txt from same location as $file + + #my $search_regions_file = dirname($file) . "/search_regions.txt"; + #my $search_file; + #($search_regions_file = $file) =~ s/motifs/searchregions/; + + $skipped = 0; + $cnt = 0; + $skipped_xref = 0; + $set = $self->{static_config}{feature_sets}{'cisRED search regions'}{feature_set}; + + $self->log_header("Parsing cisRED search regions from $search_file"); + open (SEARCH_REGIONS, "<$search_file") || die "Can't open $search_file"; + <SEARCH_REGIONS>; # skip header + + while (<SEARCH_REGIONS>) { + chomp; + my ($id, $chromosome, $start, $end, $strand, $gene_id) = split; + my $display_id = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + my $name = "CisRed_Search_$id"; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + }else{ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for search region $name\n"; + next; + } + + + + + + my $search_feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $name, + -start => $start, + -end => $end, + -strand => $strand, + -feature_set => $set, + -slice => $slice_cache{$chromosome}, + ); + + + + # project if necessary + if ($new_assembly) { + $search_feature = $self->project_feature($search_feature); + + if(! defined $search_feature){ + $skipped ++; + next; + } + } + + $extf_adaptor->store($search_feature); + $cnt++; + + #Build Xref here + #need to validate gene_id here!! + + if (! $gene_id) { + warn("Can't get internal ID for $gene_id\n"); + $skipped_xref++; + next; + } + + my $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $gene_id, 'gene'); + + my $dbentry = Bio::EnsEMBL::DBEntry->new + ( + -dbname => $species.'_core_Gene', + #-release => $self->db->dnadb->dbc->dbname, + -status => 'KNOWNXREF', + #-display_label_linkable => 1, + #-db_display_name => $self->db->dnadb->dbc->dbname, + -db_display_name => 'EnsemblGene', + -type => 'MISC', + -primary_id => $gene_id, + -display_id => $display_name, + -info_type => 'MISC', + -info_text => 'GENE', + -linkage_annotation => 'cisRED search region gene',#omit? + -analysis => $set->analysis, + #could have version here if we use the correct dnadb to build the cache + ); + $dbentry_adaptor->store($dbentry, $search_feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag + } + + close(SEARCH_REGIONS); + + + $self->log("Stored $cnt cisRED search region ExternalFeatures"); + $self->log("Skipped $skipped cisRED search region ExternalFeatures"); + $self->log("Skipped an additional $skipped_xref cisRED search region DBEntry imports"); + + #No MART_DISPLAYABLE here + $set->adaptor->store_status('DISPLAYABLE', $set); + + + #print "$coords_changed features had their co-ordinates changed as a result of assembly mapping.\n" if ($new_assembly); + + return; + +} + + + +1;
