view variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
line wrap: on
line source

=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=cut

package Bio::EnsEMBL::Funcgen::Parsers::biotiffin;

use strict;

use File::Basename;

# To get files for bioTIFFIN, download the following GFF file (e.g. via wget):
#
#  http://td-blade.gurdon.cam.ac.uk/tad26/fly-tiffinScan-tiffin12.dm3.gff.gz

# Thomas Down <thomas.down@gurdon.cam.ac.uk>

# 
# 3R      MotifScanner    TIFDMEM0000001  936391  936401  0.0     +       0
# 3R      MotifScanner    TIFDMEM0000001  13455911        13455921        0.0     -       0
# 3R      MotifScanner    TIFDMEM0000001  17062830        17062840        0.0     +       0

use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
use Bio::EnsEMBL::DBEntry;
use Bio::EnsEMBL::Funcgen::ExternalFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw );
use Bio::EnsEMBL::Utils::Argument qw( rearrange );

use vars qw(@ISA);
@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);


sub new {
  my $caller = shift;
  my $class = ref($caller) || $caller;

  my $self = $class->SUPER::new(@_, type => 'BioTiffin');

  #Set default feature_type and feature_set config

  #We need to capture version/release/data of external feature sets.
  #This can be nested in the description?  Need to add description to feature_set?

  $self->{static_config}{feature_types} = 
	{
	 'BioTIFFIN Motif'   => {
							 name        => 'BioTIFFIN Motif',
							 class       => 'Regulatory Motif',
							 description => 'BioTIFFIN motif',
							}
	};
  
  $self->{static_config}{analyses} = 
	{
	 'BioTIFFIN Motif' => { 
						   -logic_name    => 'BioTIFFIN Motif',
						   -description   => 'BioTIFFIN regulatory motif database',
						   -display_label => 'BioTIFFIN motifs',
						   -displayable   => 1,
						  },
	};
  
  $self->{static_config}{feature_sets} = 
	{
	 'BioTIFFIN Motif' => 
	 {
	  feature_set => {
					  -feature_type => 'BioTIFFIN Motif',
					  -analysis     => 'BioTIFFIN Motif',
					 },
	  xrefs => 0,
	 }
	};
 

  #Move xref flag here?
  $self->{config} =  {
					  'BioTIFFIN Motif' => {
										file  => $ENV{'EFG_DATA'}.'/input/BioTIFFIN/fly-tiffinScan-tiffin12.dm3.gff',
										gff_attrs => {
													  'ID' => 1,
													 },
									   },					  
					 };
  
  $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
  $self->set_feature_sets;

  return $self;
}



# Parse file and return hashref containing:
#
# - arrayref of features
# - arrayref of factors




sub parse_and_load {
  my ($self, $files, $old_assembly, $new_assembly) = @_;

  if(scalar(@$files) != 1){
	throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));
  }


  my %slice_cache;
  my $extf_adaptor  = $self->db->get_ExternalFeatureAdaptor;
  my $dbentry_adaptor = $self->db->get_DBEntryAdaptor;
  my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor;
  # this object is only used for projection
  my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'BioTIFFINProjection');#do we need this?
  my $species = $self->db->species;
  if(! $species){
	throw('Must define a species to define the external_db');
  }
  #Just to make sure we hav homo_sapiens and not Homo Sapiens
  ($species = lc($species)) =~ s/ /_/;

  
  if(scalar @{$self->import_sets} != 1){
	throw('biotiffin parser currently only supports one import FeatureSet');
  }

  my ($import_set) = @{$self->import_sets};


  #foreach my $import_set(@{$self->import_sets}){
  $self->log_header("Parsing $import_set data");
  
  my %motif_cache; # name -> factor_id
  my $config = $self->{'config'}{$import_set};
  my $fset =  $self->{static_config}{feature_sets}{$import_set}{feature_set};
  my %gff_attrs =  %{$config->{'gff_attrs'}};
  
  
  # Parse motifs.txt file
  #my $file =  $config->{'file'};
  my $file = $files->[0];
  my $skipped = 0;
  my $motif_cnt = 0;
  my $factor_xref_cnt = 0;
  my $feature_cnt = 0;
  my $feature_target_cnt = 0;
	
  open (FILE, "<$file") || die("Can't open $file\n$!\n");

 LINE: while (my $line = <FILE>) {
	  chomp $line;
	  
	  #GFF3
		#3R      MotifScanner    TIFDMEM0000001  936391  936401  0.0     +       0
		#3R      MotifScanner    TIFDMEM0000001  13455911        13455921        0.0     -       0
    #3R      MotifScanner    TIFDMEM0000001  17062830        17062840        0.0     +       0
    #3R      MotifScanner    TIFDMEM0000001  17973965        17973975        0.0     +       0

	  #seq_name, source, feature, start, end, score, strand, frame, [attrs]
	  my ($chromosome, $program, $feature, $start, $end, $score, $strand, undef) = split /\t/o, $line;

	  if(! exists $slice_cache{$chromosome}){
				
				if($old_assembly){
						$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', 
																																							$chromosome, 
																																							undef, 
																																							undef, 
																																							undef, 
																																							$old_assembly);
				} else {
						$slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome);
				}
	  }
		
	  if(! defined  $slice_cache{$chromosome}){
				warn "Can't get slice $chromosome for motif $feature;\n";
				$skipped++;
				next;
	  }
				
		if(! exists $motif_cache{$feature}){
				
				$motif_cache{$feature} = $ftype_adaptor->fetch_by_name($feature);
				
				if(! defined $motif_cache{$feature}){
						
						($motif_cache{$feature}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new
																															 (
																																-name  => $feature,
																																-class => $fset->feature_type->class,
																																-description => $fset->feature_type->description,
																															 ))};						
						
						$motif_cnt ++;
				}
	  }
		
		my $feature_type = $motif_cache{$feature};
		
	  #Now build actual feature

	  $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
				(
				 -display_label => $feature,
				 -start         => $start,
				 -end           => $end,
				 -strand        => (($strand eq '+') ? 1 : -1),
				 -feature_type  => $feature_type,
				 -feature_set   => $fset,
				 -slice         => $slice_cache{$chromosome},
				);
		
		
	  # project if necessary
	  if ($new_assembly) {
				$feature = $self->project_feature($feature, $new_assembly);
		
				if(! defined $feature){
						$skipped ++;
						next;
				}
	  }
	  
	  ($feature) = @{$extf_adaptor->store($feature)};
	  $feature_cnt++;


	}
	
	close FILE;
	
	$self->log("Loaded ".$fset->name);
	$self->log("$motif_cnt feature types");
	$self->log("$feature_cnt features");
	$self->log("Skipped $skipped features");

#}

  return;
}

1;