Mercurial > repos > mahtabm > ensembl

diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/miranda.pm @ 0:1f6dce3d34e0
Uploaded
author: mahtabm
date: Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/miranda.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,556 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2012 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::Parsers::miranda;
+
+use strict;
+
+use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
+use Bio::EnsEMBL::DBEntry;
+use Bio::EnsEMBL::Funcgen::ExternalFeature;
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
+
+
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_, type => 'miRanda');
+
+  #Set default feature_type and feature_set config
+  $self->{static_config}{feature_types} = 
+	{
+	 'miRanda Target'   => {
+							-name        => 'miRanda Target',
+							-class       => 'RNA',
+							-description => 'miRanda microRNA target',
+						   },
+	};
+
+  $self->{static_config}{analyses} =
+	{
+	 miRanda => { 
+				 -logic_name    => 'miRanda',
+				 -description   => '<a href="http://www.ebi.ac.uk/enright-srv/microcosm/htdocs/targets/v5/consset.html">miRanda microRNA target predictions</a>',
+				 -display_label => 'miRanda Targets',
+				 -displayable   => 1,
+				},
+	};
+
+  $self->{static_config}{feature_sets}{'miRanda miRNA targets'} = 
+	{
+	 #analyses      => $self->{static_config}{analyses},
+	 #feature_types => $self->{static_config}{feature_types},
+	 feature_set => 
+	 {
+	  -feature_type      => 'miRanda Target',
+	  -display_name      => 'miRanda Targets',
+	  -description       => $self->{static_config}{analyses}{miRanda}{-description},
+	  -analysis          => 'miRanda',
+	 },
+	 xrefs => 1,
+   }; 
+
+
+  
+ 
+  #$self->validate_and_store_feature_types;
+  $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
+  $self->set_feature_sets;
+
+  return $self;
+}
+
+
+#TO DO
+# In loop logging should only be enable with verbose or change to debug?
+# Use count methods?
+# Sort input to enable cache clearing(caches max out at ~200MB so not essential)
+# Optimise slice cache testing(load only takes 8 min sso not essential)
+# Add verbose logging/debug to show reassigned xrefs?
+
+sub parse_and_load{
+  my ($self, $files, $old_assembly, $new_assembly) = @_;
+
+  #Add num files to config and check this in BaseImporter(generically)
+  if(scalar(@$files) != 1){
+	 throw('You must currently define a single file to load miRanda features from:\t'.join(' ', @$files));
+  }
+
+  my $file = $files->[0];
+  $self->log_header("Parsing miRanda data from:\t$file");
+
+  my $analysis_adaptor = $self->db->get_AnalysisAdaptor();
+  my $ftype_adaptor    = $self->db->get_FeatureTypeAdaptor();
+  my $extf_adaptor     = $self->db->get_ExternalFeatureAdaptor;
+  my $trans_adaptor    = $self->db->dnadb->get_TranscriptAdaptor;
+  my $dbentry_adaptor  = $self->db->get_DBEntryAdaptor; 
+  my $set              = $self->{static_config}{feature_sets}{'miRanda miRNA targets'}{feature_set};
+ 
+  my ($dbentry, $schema_build, %features_by_name, %slice_cache, $ens_display_name, %feature_cache);
+  my (@txs, @sid_dnames, %xref_cache, %seen_mifeat_trans, %skipped_features, %failed_reassignment);
+  my ($overlaps_3_utr);
+  # this object is only used for projection
+  my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'miRandaProjection');
+
+  #Some hairy counting to make sure the XREF reassignment
+  #is doing something sensible
+  #Can probably change all this to use the count methods
+  my $slice_skipped    = 0;
+  my $old_mid_skipped  = 0;
+  my $cnt              = 0;
+  my $proj_skipped     = 0;
+  my $seq_skipped      = 0;
+  my $row_cnt          = 0;
+  my $new_xrefs        = 0;
+  my $old_as_new_xrefs = 0;
+  my $failed_new_sids  = 0;
+  my $utr_failed_old_sids = 0;
+  my $no_proj_failed_old_sids = 0;
+  my $previously_skipped = 0;
+  my $with_overlapping_tx = 0;
+  my $retired_sids         = 0;
+  my $existing_sids    = 0;
+  my $total_xrefs = 0;
+  my $species          = $self->db->species;
+  my $analysis         = $set->analysis;
+
+
+  #Need to allow this to be passed
+  $schema_build ||= $self->db->_get_schema_build($self->db->dnadb);
+
+  if(! $species){
+	throw('Must define a species to define the external_db');
+  }
+  #Just to make sure we hav homo_sapiens and not Homo Sapiens
+  ($species = lc($species)) =~ s/ /_/;
+
+  my $edb_name        = $species.'_core_Transcript',
+
+
+
+  open (FILE, "<$file") || die "Can't open $file";
+
+  #We used to have redundant target features wrt xrefs
+	  #Similarity      mmu-miR-192     miRanda miRNA_target    5       127885168       127885188       -       .       15.2253 4.048320e-03    ENSMUST00000031367      Slc15a4
+	  #Similarity      mmu-miR-192     miRanda miRNA_target    5       127885168       127885188       -       .       15.2397 3.945600e-03    ENSMUST00000075376      Slc15a4
+	  
+  #Have now changed this to nr features with multiple xrefs
+  #Hence have had to remove transcript sid from diplay_label
+
+  #Several 'seen' caches have be implemented to prevent redundant storing
+  #Currently these span the whole data set, but could be cleaned after every feature
+  #if the input is sorted correctly
+
+  #Need to make sure we are counting correctly
+  #skipped counts will reflect lines in input, not nr features
+
+  
+  #Could add UnmappedObjects in here
+
+ LINE: while (<FILE>) {
+	next LINE if ($_ =~ /^\s*\#/o || $_ =~ /^\s*$/o);
+	$row_cnt ++;
+	#Added next for old miRbase IDs.
+
+	#Sanger
+	##GROUP SEQ     METHOD  FEATURE CHR     START   END     STRAND  PHASE   SCORE   PVALUE_OG       TRANSCRIPT_ID   EXTERNAL_NAME
+	#Similarity      mmu-miR-707     miRanda miRNA_target    2       120824620       120824640       +       .       15.3548 2.796540e-02    ENST00000295228 INHBB
+
+
+    my ($group, $id, $method, $feature, $chr, $start, $end, $strand, undef, undef, undef, $ens_id, $display_name) = split;
+	#We never use $diplay_name now, as it never match the transcript display name 
+	#which as the transcript number suffic attached to the gene display name
+	#e.g. BRCA2-001
+
+	#%seen_mifeat_trans handles redundancy between existant(i.e. not retired) sids in input
+	#file and those identified when trying to reannotated a retired sid
+	
+	#ALREADY ANNOTATED OR SKIPPED
+	if(exists  $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$ens_id} ){
+	  $self->log("Skipping previously reannotated miRNA target:\t".$id.':'.$chr.':'.$start.':'.$end.' - '.$ens_id);
+	  $old_as_new_xrefs ++;
+	}
+	elsif(exists $skipped_features{$id.':'.$chr.':'.$start.':'.$end}){
+	  $self->log("Skipping previous failed feature:\t".$id.':'.$chr.':'.$start.':'.$end."\t".
+				 $skipped_features{$id.':'.$chr.':'.$start.':'.$end});
+	  $previously_skipped++;
+	  #Could increment count in here to count old xrefs failed for each fail type
+	  next;
+	}
+
+
+	#Added next for old miRbase IDs.
+	if ( $id =~ /\*$/o ){
+	  $self->log("Skipping old miRbase ID:\t$id");
+
+	  $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Old invalid miRbase ID';
+	  $old_mid_skipped ++;
+
+	  #$old_xrefs_skipped ++;
+	  #Just want to make sure we have comparable
+	  #old skipped xrefs and re-annotated xrefs
+	  #so this is not useful here
+	  
+
+	  next LINE;
+	}
+	
+    $strand = ($strand =~ /\+/o) ? 1 : -1;
+
+	#Now moved transript xref info exclusively to xrefs
+    ##my $id = $ens_id =~ s/[\"\']//g;  # strip quotes
+	#my $id = $ens_id.':'.$seq;
+
+
+	#change this to only test once
+	#if exists and not defined then skip
+
+	if(! defined $slice_cache{$chr}){
+	
+	  #Was originally limiting to chromosome
+
+	  if($old_assembly){
+		$slice_cache{$chr} = $self->slice_adaptor->fetch_by_region(undef, 
+																	$chr, 
+																	undef, 
+																	undef, 
+																	undef, 
+																	$old_assembly);
+	  }else{
+		$slice_cache{$chr} = $self->slice_adaptor->fetch_by_region(undef, $chr);
+	  }
+
+	  if(! defined $slice_cache{$chr}){
+		warn "Can't get slice $chr for sequence $id\n";
+
+		$slice_skipped ++;
+		$skipped_features{$id.':'.$chr.':'.$start.':'.$end} = "Failed to fetch slice $chr";
+		
+		#Add UnmappedObject here?
+		next LINE;
+	  }
+	}
+
+	
+	#We can add coding xref to feature type based on the miRbase name
+	#.e.g hsa-mir-24-1
+	#However, this isn't stored as an xref
+	#It is stored in the gene.description
+	#e.g. hsa-mir-24-1 [Source:miRBase;Acc:MI0000080]
+	#Not easy to fetch as descriptions not indexed!
+	#
+
+	#Cache/store FeatureType
+
+	if(! exists $features_by_name{$id}){
+	  $features_by_name{$id} = $ftype_adaptor->fetch_by_name($id);
+	  
+	  if(! defined $features_by_name{$id}){
+		($features_by_name{$id}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
+															 (
+															  -name  => $id,
+															  -class => 'RNA',
+															  -description => $method.' '.$feature,
+															 ))};
+
+		#Need to add source gene xref here to enable target conequences implied by source variation
+
+	  }
+	}
+
+
+	#Make sure we have the target transcript before we store the feature
+	#Have to do this as we can't always run with the correct core DB
+	#as it may be too old. Hence we have to hard code the edb.release
+	
+	
+	##This should enever happen, as the search regions are defined by ens transcript
+	##i.e there is always a ensembl iD
+	
+	if (! $ens_id) {
+      warn("No xref available for miRNA $id\n");
+      $skipped_xref++;
+      next;
+    }
+   
+
+
+	if(exists $feature_cache{$id.':'.$chr.':'.$start.':'.$end}){
+	  $feature = $feature_cache{$id.':'.$chr.':'.$start.':'.$end};
+	}
+	else{
+
+	  $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
+		(
+		 -display_label => $id,
+		 -start         => $start,
+		 -end           => $end,
+		 -strand        => $strand,
+		 -feature_type  => $features_by_name{$id},
+		 -feature_set   => $set,
+		 -slice         => $slice_cache{$chr},
+		);
+
+	  # project if necessary
+	  if ($new_assembly) {
+		
+		$feature = $self->project_feature($feature, $new_assembly);
+		
+		if (! defined $feature) {
+		  $proj_skipped ++;
+		  $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Failed projection';
+		  next;
+		}		
+		
+		#This was failing as old assembly seqs are not stored, hence are returned as Ns
+		#if ($old_seq ne $feature->seq){
+		#  $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Projected sequence mismatch';
+		#  $seq_skipped++;
+		#  next;
+		#}
+
+		#Assembly mapping tends to ignore single seq mismatches, but does handle gaps.
+		#This is not generally a problemas a new assembly is normally a reshuffle of existing
+		#clones/contigs which will have exactly the same seq(actually there is a small amount of change)
+		#but this only affected a handful of transcripts
+		#old super(contigs) which have be integrated into a chromosome are not mapped
+		#so we will lose these
+		#the rest (retired clones/contigs and new clones/contigs) are aligned
+		#but again mismatches tend to be ignored.
+		
+		#There may be a tendency for this later case to contain more mismatches due to an entirely new
+		#clone seqeunce, hence let's filter/count these?
+
+		#Tested and import using the following hack using old 67 DB to test seq mismatches
+		#only 1 failed.
+		#my $old_slice = $v67_sa->fetch_by_region( undef, $chr, $start, $end, $strand);
+
+		#if ($old_slice->seq ne $feature->seq){
+		#  $skipped_features{$id.':'.$chr.':'.$start.':'.$end} = 'Projected sequence mismatch';
+		#  $seq_skipped++;
+		#  next LINE;
+		#}
+	  }
+	}
+
+
+
+	### DEFINE TRANSCRIPT XREF INFO
+
+	#Check transcript exists
+	$display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, $ens_id, 'transcript');
+	
+
+	# ATTEMPT TO REANNOTATE
+	if (! defined $display_name){ # Transcript does not exist in current release
+
+	  #Set to 0 as we are not storing this sid
+	  $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$ens_id} = 0;
+
+	  $self->log("$id $ens_id stable ID has been retired");
+	  $retired_sids ++;
+
+	  #Try and re-annotate on newer overlapping transcripts
+	  @txs = @{$trans_adaptor->fetch_all_by_Slice($feature->feature_Slice)};
+	  @sid_dnames = ();
+	  
+	  #COUNT unique miRNA features which have failed reannotated
+	  if(! exists $failed_reassignment{$id.':'.$chr.':'.$start.':'.$end}){
+		$failed_reassignment{$id.':'.$chr.':'.$start.':'.$end} = 1;
+	  }
+
+
+
+	  if (@txs) { #OVERLAPPING TRANSCRIPTS
+		$with_overlapping_tx ++;
+
+		foreach my $tx(@txs){
+
+		  #Check we have previously seen this xref
+		  if(! exists $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$tx->stable_id}){
+
+			 #Do UTR checking here
+			 $overlaps_3_utr = 0;
+
+			 #Transcript will always return wrt to feature_Slice of miRNA feature.
+			 #As miRNA are complimentary to the the mRNA, which is complimentary
+			 #to the sense strand of the gene, these should always be on the same strand.
+
+			 #Could add some more detailed strand/UTR fail counts in here
+			 #but leave for now
+			 #$same_strand = 1;
+
+			 if($tx->seq_region_strand != $strand){
+			   #$same_strand = 0;
+			 }
+			 elsif($strand == 1){
+			   
+			   if( ($end <= $tx->seq_region_end) &&
+				   ($start > $tx->coding_region_end) ){
+				 $overlaps_3_utr = 1;
+			   }
+
+			 }
+			 else{#Must be -1
+			   
+			   if( ($end < $tx->coding_region_start) &&
+				   ($start >= $tx->seq_region_start) ){
+				 $overlaps_3_utr = 1;
+			   }
+			 }
+			
+			 #could count no utr match here if we set same_strand boolean
+
+			 if($overlaps_3_utr){
+			   $display_name = $self->get_core_display_name_by_stable_id($self->db->dnadb, 
+																		 $tx->stable_id, 
+																		 'transcript');
+			   push @sid_dnames, [$tx->stable_id, $display_name];
+			   $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end}{$tx->stable_id} = 1;
+			   #want to count re-annotated xrefs here
+			   #These may have been represented in the original file
+			   $new_xrefs ++;
+			 }
+			 else{
+			   # FAILED annotate new sid
+			   # This currently includes +ve and -ve strand transcripts
+			   $failed_new_sids ++;
+			 }
+		   }
+		   
+		   if(! @sid_dnames){
+			 #FAILED TO REANNOTATE retired sid xref
+			 $utr_failed_old_sids ++;
+
+			 next LINE;
+		   }
+		 }
+	  }
+	  else{ #FAILED TO REANNOTATE XREF - NO OVERLAPPING TRANSCRIPTS
+		$no_proj_failed_old_sids ++;
+		next LINE;
+	  }
+
+
+	  #COUNT unique miRNA features which have failed reannotated
+	  if(@sid_dnames){
+		$failed_reassignment{$id.':'.$chr.':'.$start.':'.$end} = 0
+	  }
+
+	}
+	else { #Add xref for existing transcript	  
+	  $seen_mifeat_trans{$id.':'.$chr.':'.$start.':'.$end} = {$ens_id => 1};
+	  #$display_name = $ens_display_name;
+	  @sid_dnames = ([$ens_id, $display_name]);
+	  $existing_sids++;
+	}
+
+
+	#shouldn't need this if as we call next for everycase above
+	#if (@xref_details) {
+	#Only if we have target transcripts
+
+	# STORE FEATURE
+	if (! exists $feature_cache{$id.':'.$chr.':'.$start.':'.$end}){
+	  ($feature) = @{$extf_adaptor->store($feature)};
+	  $feature_cache{$id.':'.$chr.':'.$start.':'.$end} = $feature;
+	  $cnt++;
+	}
+
+ 
+	# STORE XREFS
+	foreach my $xref_info(@sid_dnames) {
+	  
+	  #Handle release/version in xref version as stable_id version?
+
+	  $dbentry = Bio::EnsEMBL::DBEntry->new
+		(
+		 -dbname                 => $edb_name,
+		 -release                => $schema_build,
+		 #-release                => '58_37k',#'46_36h', #Hard coded due to schema to old to use with API
+		 -status                 => 'KNOWNXREF',
+		 #-display_label_linkable => 1,
+		 -db_display_name        => 'EnsemblTranscript',
+		 -type                   => 'MISC',
+		 -primary_id             => $xref_info->[0],
+		 -display_id             => $xref_info->[1],
+		 -info_type              => 'MISC',
+		 -info_text              => 'TRANSCRIPT',
+		 -linkage_annotation     => 'miRanda target - negative influence',
+		 #could have version here if we use the correct dnadb to build the cache
+		 -analysis               =>  $analysis,
+		);
+	
+	  $dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1); #1 is ignore release flag  
+	  $total_xrefs++;
+	}
+  }
+
+  close FILE;
+  
+  #scalar context returns count
+  my $miRNA_failed_reassignment = grep/1/, values %failed_reassignment;
+  my $miRNA_skipped = $old_mid_skipped + $slice_skipped + 
+	$proj_skipped + $seq_skipped + $miRNA_failed_reassignment;
+ 
+  $self->log_header($set->name." Import Report");
+  $self->log(sprintf("%-090s", "miRNA feature:target pairs seen(i.e. input rows):").$row_cnt);
+  $self->log(sprintf("%-090s","Stored NR miRanda miRNA ExternalFeatures:").$cnt);
+  $self->log(sprintf("%-090s","Total skipped miRanda miRNA target features(inc reassigned):").$miRNA_skipped);
+  $self->log(sprintf("%-090s","Old miRbase IDs skipped").$old_mid_skipped);
+  $self->log(sprintf("%-090s","Skipped features on unknown slice:").$slice_skipped."\n");
+
+  if($new_assembly){
+	$self->log(sprintf("%-090s","Skipped due to failed assembly projection:").$proj_skipped);
+	$self->log(sprintf("%-090s","Skipped due to seq mismatch for assembly projection:")."$seq_skipped\n");
+  }
+
+  $self->log("The following numbers are counted from the mappable/valid miRNA target features");
+  $self->log(sprintf("%-090s", "Total stored Transcript xrefs(current and retired re-assigned):"). $total_xrefs);
+  $self->log(sprintf("%-090s", "Total current Transcript xrefs:"). $existing_sids);
+  $self->log(sprintf("%-090s", "Total skipped miRanda miRNA target xrefs:").($previously_skipped	+ $miRNA_skipped));
+  $self->log(sprintf("%-090s", "Retired Transcript xrefs:").$retired_sids);
+  $self->log(sprintf("%-090s", "Unique miRNA features which completely failed reassignment:").$miRNA_failed_reassignment);
+  $self->log(sprintf("%-090s", "Total new Xrefs assigned due to retired Transcript:").$new_xrefs); 
+  $self->log(sprintf("%-090s", "Previously assigned Transcript Xrefs skipped:").$old_as_new_xrefs);  
+  $self->log(sprintf("%-090s", "Retired Transcript Xrefs with no new overlapping Transcript:").$no_proj_failed_old_sids);
+  $self->log(sprintf("%-090s", "Retired Transcript Xrefs with new overlapping Transcript(s):").$with_overlapping_tx);
+  $self->log(sprintf("%-090s", "Retired Transcript Xrefs with new overlapping Transcript(s), all fail 3'UTR/strand test:").
+			 $utr_failed_old_sids);	
+  #This figure may include transcripts which would have failed in the original set!
+  $self->log(sprintf("%-090s","Total new Transcript xrefs considered which fail 3' UTR/strand test:"). $failed_new_sids);
+  $self->log(sprintf("%-090s","True new Xref assignments due to retired Transcripts:").($new_xrefs - $old_as_new_xrefs));
+
+  #We also want unique miRNA feature which we re-assigned
+  #Hard to calculate as the re-assignment might be to a current transcript in the input
+
+  #Now set states
+  foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){
+	$set->adaptor->store_status($status, $set);
+  }
+
+  return;
+}
+
+1;
author	mahtabm
date	Thu, 11 Apr 2013 02:01:53 -0400
parents
children