diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/redfly.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,430 @@
+=head1 LICENSE
+
+  Copyright (c) 1999-2011 The European Bioinformatics Institute and
+  Genome Research Limited.  All rights reserved.
+
+  This software is distributed under a modified Apache license.
+  For license details, please see
+
+    http://www.ensembl.org/info/about/code_licence.html
+
+=head1 CONTACT
+
+  Please email comments or questions to the public Ensembl
+  developers list at <ensembl-dev@ebi.ac.uk>.
+
+  Questions may also be sent to the Ensembl help desk at
+  <helpdesk@ensembl.org>.
+
+=cut
+
+package Bio::EnsEMBL::Funcgen::Parsers::redfly;
+
+use strict;
+
+use File::Basename;
+
+# To get files for REDfly, download the following 2 GFF3 files (e.g. via wget):
+#
+# http://redfly.ccr.buffalo.edu/datadumps/tbfs_dump.gff
+# http://redfly.ccr.buffalo.edu/datadumps/crm_dump.gff
+
+#contact?
+
+#TFBS
+#2L 	REDfly	regulatory_region	2456365	2456372	.	.	.	ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
+#2L 	REDfly	regulatory_region	2456352	2456369	.	.	.	ID="dl_dpp:REDFLY:TF000069"; Dbxref="Flybase:FBgn0000490", "PMID:8458580", "REDfly:645, "FlyBase:FBgn0000463"; Evidence="footprint/binding assay"; Factor="dl"; Target="dpp";
+
+#CRMs
+#2L 	REDfly	regulatory_region	2455781	2457764	.	.	.	ID="dpp_intron2"; Dbxref="Flybase:FBgn0000490", "PMID:8167377", "REDfly:247; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005304";
+#2L 	REDfly	regulatory_region	2445769	2446581	.	.	.	ID="dpp_dpp813"; Dbxref="Flybase:FBgn0000490", "PMID:7821226", "REDfly:246; Evidence="reporter construct (in vivo)"; Ontology_term="FBbt:00005653","FBbt:00001051";
+
+
+
+use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
+use Bio::EnsEMBL::DBEntry;
+use Bio::EnsEMBL::Funcgen::ExternalFeature;
+use Bio::EnsEMBL::Utils::Exception qw( throw );
+use Bio::EnsEMBL::Utils::Argument qw( rearrange );
+
+use vars qw(@ISA);
+@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
+
+
+
+
+
+sub new {
+  my $caller = shift;
+  my $class = ref($caller) || $caller;
+
+  my $self = $class->SUPER::new(@_, type => 'REDfly');
+
+  #Set default feature_type and feature_set config
+
+  #We need to capture version/release/data of external feature sets.
+  #This can be nested in the description?  Need to add description to feature_set?
+
+  $self->{static_config}{feature_types} = 
+	{
+	 'REDfly TFBS'   => {
+						 -name        => 'REDfly TFBS',
+						 -class       => 'Transcription Factor',
+						 -description => 'REDfly transciption factor binding site',
+						},
+	 'REDfly CRM' => {
+					  -name        => 'REDfly CRM',
+					  -class       => 'Regulatory Motif',
+					  -description => 'REDfly cis regulatory motif',
+					 },
+	};
+  
+  
+  $self->{static_config}{analyses} = 
+	{
+	 'REDfly TFBS' => { 
+					   -logic_name    => 'REDfly TFBS',
+					   -description   => 'REDfly transcription factor binding sites (http://redfly.ccr.buffalo.edu/)',
+					   -display_label => 'REDfly TFBS',
+					   -displayable   => 1,
+					  },
+	 
+	  'REDfly CRM' => { 
+					   -logic_name    => 'REDfly CRM',
+					   -description   => 'REDfly cis regulatory motif (http://redfly.ccr.buffalo.edu/)',
+					   -display_label => 'REDfly CRM',
+					   -displayable   => 1,
+					  },
+	};
+  
+  $self->{static_config}{feature_sets} = 
+	{
+	 'REDfly TFBSs' => {
+						feature_set =>
+						{
+						 -feature_type  => 'REDfly TFBS',
+						 -analysis      => 'REDfly TFBS',
+						},
+						xrefs => 1,
+					   },
+	 
+	 'REDfly CRMs' => {
+					   feature_set   =>
+					   {
+						-feature_type  => 'REDfly CRM',
+						-analysis      => 'REDfly CRM',
+					   },
+					   
+					   xrefs => 1,
+					  },
+	};
+ 
+
+  #Move xref flag here?
+  $self->{config} =  {
+					  'REDfly CRMs' => {
+										#file  => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_crm.gff',
+										gff_attrs => {
+													  'ID' => 1,
+													 },
+									   },
+					  
+					  'REDfly TFBSs' => {
+										 #file  => $ENV{'EFG_DATA'}.'/input/REDFLY/redfly_tfbs.gff',
+										 gff_attrs => {
+													   'ID' => 1,
+													   'Factor' => 1,
+													   'Target' => 1,
+													  },
+										 desc_suffix => ' binding site',
+										}
+					 };
+  
+  
+  $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
+  $self->set_feature_sets;
+ 
+  return $self;
+}
+
+
+
+
+
+# Parse file and return hashref containing:
+#
+# - arrayref of features
+# - arrayref of factors
+
+
+
+
+sub parse_and_load {
+  my ($self, $files, $old_assembly, $new_assembly) = @_;
+
+  if(scalar(@$files) != 2){
+	throw('You must currently define a crm and tfbs file to load redfly features from:\t'.join(' ', @$files));
+  }
+
+  #More validation of files here?
+  $self->{config}{'REDfly CRMs'}{file}  = grep(/crm/,  @$files);
+  $self->{config}{'REDfly TFBSs'}{file} = grep(/tfbs/, @$files);
+
+  my %slice_cache;
+  my $extf_adaptor  = $self->db->get_ExternalFeatureAdaptor;
+  my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
+  my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
+  # this object is only used for projection
+  my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'REDflyProjection');#do we need this?
+  my $species = $self->db->species;
+
+  if(! $species){
+	throw('Must define a species to define the external_db');
+  }
+
+  #Just to make sure we hav homo_sapiens and not Homo Sapiens
+  ($species = lc($species)) =~ s/ /_/;
+
+
+  foreach my $import_set(@{$self->import_sets}){
+	$self->log_header("Parsing $import_set data");
+
+	my %factor_cache; # name -> factor_id
+	my %target_cache;
+	my $config = $self->{'config'}{$import_set};
+	my $fset =  $self->{static_config}{feature_sets}{$import_set}{feature_set};
+	my %gff_attrs =  %{$config->{'gff_attrs'}};
+	
+	
+	# Parse motifs.txt file
+	my $file =  $config->{'file'};
+	my $skipped = 0;
+	my $factor_cnt = 0;
+	my $factor_xref_cnt = 0;
+	my $feature_cnt = 0;
+	my $feature_target_cnt = 0;
+	
+	open (FILE, "<$file") || die("Can't open $file\n$!\n");
+	<FILE>; # skip header
+
+	LINE: while (my $line = <FILE>) {
+	  next if ($line =~ /^\s*\#/o || $line =~ /^\s*$/o);
+	  chomp $line;
+	  my %attr_cache;#Can we move this outside the loop and rely on it being reset each time?
+
+
+	  #GFF3
+	  #Is this format valid, missing " after REDfly xref
+	  #2L 	REDfly	regulatory_region	2456365	2456372	.	.	.	ID="Unspecified_dpp:REDFLY:TF000068"; Dbxref="Flybase:FBgn0000490", "PMID:8543160", "REDfly:644, "FlyBase:"; Evidence="footprint/binding assay"; Factor="Unspecified"; Target="dpp";
+	  #seq_name, source, feature, start, end, score, strand, frame, [attrs]
+	  my ($chromosome, undef, $feature, $start, $end, undef, undef, undef, $attrs) = split /\t/o, $line;
+	  my @attrs = split/\;\s+/o, $attrs;
+
+
+	  #UCSC coords
+	  $start ++;
+	  $end ++;
+
+
+
+	  foreach my $gff_attr(keys %gff_attrs){
+
+		if(($attr_cache{$gff_attr}) = grep {/^${gff_attr}\=/} @attrs){
+		  $attr_cache{$gff_attr} =~ s/(^${gff_attr}\=\")(.*)(\")/$2/;
+		  
+		  #warn "attr cache is  $attr_cache{$gff_attr} ";
+	
+		}
+		else{
+		  warn "Skipping import, unable to find mandatory $gff_attr attribute in:\t$line";
+		  next LINE;
+		}
+	  }
+	
+   
+	  #For TFBS
+	  #Factor = coding gene name display_label
+	  #Target = Target gene?
+	  #Ignore other xrefs for name, just put ID in feature as display_label
+
+	  #These are mixed up! and where not getting any coding xrefs!
+
+
+	  #For CRM
+	  #Can we split the ID and have Reguatory XREF?
+	  #e.g.  ID="dpp_dpp813"; => dpp
+
+
+
+
+	  #This can be moved to the BaseExternalParser
+
+	  if(! exists $slice_cache{$chromosome}){
+	
+		if($old_assembly){
+		  $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', 
+																			$chromosome, 
+																			undef, 
+																			undef, 
+																			undef, 
+																			$old_assembly);
+		}else{
+		  $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
+		}
+	  }
+
+	  if(! defined  $slice_cache{$chromosome}){
+		warn "Can't get slice $chromosome for motif $attr_cache{'ID'};\n";
+		$skipped++;
+		next;
+	  }
+	
+
+	#get feature_type first
+
+	#we are not maintaining this link in the DB!
+	#Do we need another xref for this or a different table?
+
+	  my $feature_type;
+	  
+	  #TFBSs
+	  if(exists $attr_cache{'Factor'}){
+
+		if(! exists $factor_cache{$attr_cache{'Factor'}}){
+		  
+		  $factor_cache{$attr_cache{'Factor'}} = $ftype_adaptor->fetch_by_name($attr_cache{'Factor'});
+		  
+		  if(! defined $factor_cache{$attr_cache{'Factor'}}){
+		  
+			#Would need to add CODING DBEntry here!
+			#Will this work on a scalar ref to a hash?
+			my $desc = (exists $config->{'desc_suffix'}) ? $attr_cache{'Factor'}.$config->{'desc_suffix'} : undef;
+			
+			($factor_cache{$attr_cache{'Factor'}}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
+																			 (
+																			  -name  => $attr_cache{'Factor'},
+																			  -class => $fset->feature_type->class,
+																			  -description => $desc,
+																			 ))};
+			
+			$feature_type = $factor_cache{$attr_cache{'Factor'}};
+			$factor_cnt ++;
+			my $stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $attr_cache{'Factor'});
+			
+			#Handle release/version in xref version as stable_id version?
+			
+			if(! defined $stable_id){
+			  warn "Could not generate CODING xref for feature_type:\t". $attr_cache{'Factor'}."\n";
+			}else{
+			  #warn "got $stable_id for ".$attr_cache{'Factor'};
+			  my $dbentry = Bio::EnsEMBL::DBEntry->new(
+													   -dbname                 => $species.'_core_Gene',
+													   #-release                => $self->db->dnadb->dbc->dbname,
+													   -status                 => 'KNOWNXREF',#This is for the external DB
+													   #-display_label_linkable => 1,
+													   -#db_display_name        => $self->db->dnadb->dbc->dbname,
+													   -db_display_name        => 'EnsemblGene',
+													   -type                   => 'MISC',#Is for the external_db 
+													   -primary_id             => $stable_id,
+													   -display_id             => $attr_cache{'Factor'},
+													   -info_type              => 'MISC',
+													   -into_text              => 'GENE',
+													   -linkage_annotation     => 'REDfly Coding'
+													   -analysis               => $fset->analysis,
+
+													   #-description            => 'cisRED motif gene xref',#This is now generic and no longer resitricted to REDfly
+													   #could have version here if we use the correct dnadb to build the cache
+													  );
+			  
+			  $dbentry_adaptor->store($dbentry, $factor_cache{$attr_cache{'Factor'}}->dbID, 'FeatureType', 1);#1 is ignore release flag
+			  $factor_xref_cnt ++;
+			}
+		  }
+		}
+	  }
+	  else{
+		#CRMs
+		$feature_type = $fset->feature_type;
+	  }
+
+
+	  #Now build actual feature
+	  $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
+	  (
+	   -display_label => $attr_cache{'ID'},
+	   -start         => $start,
+	   -end           => $end,
+	   -strand        => 0,
+	   -feature_type  => $feature_type,
+	   -feature_set   => $fset,
+	   -slice         => $slice_cache{$chromosome},
+	  );
+
+	  # project if necessary
+	  if ($new_assembly) {
+		$feature = $self->project_feature($feature, $new_assembly);
+		
+		if(! defined $feature){
+		  $skipped ++;
+		  next;
+		}
+	  }
+	  
+	  ($feature) = @{$extf_adaptor->store($feature)};
+	  $feature_cnt++;
+
+	  my $target = (exists $attr_cache{'Target'}) ?  $attr_cache{'Target'} : (split/_/, $attr_cache{'ID'})[0];
+	  my $stable_id;
+
+	  if($target ne 'Unspecified'){
+		$stable_id = $self->get_core_stable_id_by_display_name($self->db->dnadb, $target);
+	  }
+
+
+	  if(! defined $stable_id){
+		warn "Could not generate TARGET xref for feature:\t". $attr_cache{'ID'}."\n" if $target ne 'Unspecified';
+	  }
+	  else{
+		#Handle release/version in xref version as stable_id version?		
+		my $dbentry = Bio::EnsEMBL::DBEntry->new(
+												 -dbname                 => $species.'_core_Gene',
+												 #-release                => $self->db->dnadb->dbc->dbname,
+												 -status                 => 'KNOWNXREF',
+												 #-display_label_linkable => 1,
+												 -#db_display_name        => $self->db->dnadb->dbc->dbname,
+												 -db_display_name        => 'EnsemblGene',
+												 -type                   => 'MISC',#
+												 -primary_id             => $stable_id,
+												 -display_id             => $target,
+												 -info_type              => 'MISC',
+												 -info_text              => 'GENE',
+												 -linkage_annotation     => $fset->feature_type->name.' Target',
+												 -analysis               => $fset->analysis,
+
+												 #could have version here if we use the correct dnadb to build the cache
+											);
+
+		$dbentry_adaptor->store($dbentry, $feature->dbID, 'ExternalFeature', 1);#1 is ignore release flag
+		
+		$feature_target_cnt ++;
+	  }
+	}
+	
+	close FILE;
+
+	$self->log("Loaded ".$fset->name);
+	$self->log("$factor_cnt feature types");
+	$self->log("$factor_xref_cnt feature type coding xrefs");
+	$self->log("$feature_cnt features");
+	$self->log("$feature_target_cnt feature target xrefs");
+	$self->log("Skipped $skipped features");
+
+  }
+
+  return;
+}
+
+
+
+
+1;