Mercurial > repos > mahtabm > ensembl
diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm @ 0:1f6dce3d34e0
Uploaded
| author | mahtabm |
|---|---|
| date | Thu, 11 Apr 2013 02:01:53 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm Thu Apr 11 02:01:53 2013 -0400 @@ -0,0 +1,259 @@ +=head1 LICENSE + + Copyright (c) 1999-2011 The European Bioinformatics Institute and + Genome Research Limited. All rights reserved. + + This software is distributed under a modified Apache license. + For license details, please see + + http://www.ensembl.org/info/about/code_licence.html + +=head1 CONTACT + + Please email comments or questions to the public Ensembl + developers list at <ensembl-dev@ebi.ac.uk>. + + Questions may also be sent to the Ensembl help desk at + <helpdesk@ensembl.org>. + +=cut + +package Bio::EnsEMBL::Funcgen::Parsers::biotiffin; + +use strict; + +use File::Basename; + +# To get files for bioTIFFIN, download the following GFF file (e.g. via wget): +# +# http://td-blade.gurdon.cam.ac.uk/tad26/fly-tiffinScan-tiffin12.dm3.gff.gz + +# Thomas Down <thomas.down@gurdon.cam.ac.uk> + +# +# 3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 +# 3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 +# 3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 + +use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; +use Bio::EnsEMBL::DBEntry; +use Bio::EnsEMBL::Funcgen::ExternalFeature; +use Bio::EnsEMBL::Utils::Exception qw( throw ); +use Bio::EnsEMBL::Utils::Argument qw( rearrange ); + +use vars qw(@ISA); +@ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); + + +sub new { + my $caller = shift; + my $class = ref($caller) || $caller; + + my $self = $class->SUPER::new(@_, type => 'BioTiffin'); + + #Set default feature_type and feature_set config + + #We need to capture version/release/data of external feature sets. + #This can be nested in the description? Need to add description to feature_set? + + $self->{static_config}{feature_types} = + { + 'BioTIFFIN Motif' => { + name => 'BioTIFFIN Motif', + class => 'Regulatory Motif', + description => 'BioTIFFIN motif', + } + }; + + $self->{static_config}{analyses} = + { + 'BioTIFFIN Motif' => { + -logic_name => 'BioTIFFIN Motif', + -description => 'BioTIFFIN regulatory motif database', + -display_label => 'BioTIFFIN motifs', + -displayable => 1, + }, + }; + + $self->{static_config}{feature_sets} = + { + 'BioTIFFIN Motif' => + { + feature_set => { + -feature_type => 'BioTIFFIN Motif', + -analysis => 'BioTIFFIN Motif', + }, + xrefs => 0, + } + }; + + + #Move xref flag here? + $self->{config} = { + 'BioTIFFIN Motif' => { + file => $ENV{'EFG_DATA'}.'/input/BioTIFFIN/fly-tiffinScan-tiffin12.dm3.gff', + gff_attrs => { + 'ID' => 1, + }, + }, + }; + + $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); + $self->set_feature_sets; + + return $self; +} + + + +# Parse file and return hashref containing: +# +# - arrayref of features +# - arrayref of factors + + + + +sub parse_and_load { + my ($self, $files, $old_assembly, $new_assembly) = @_; + + if(scalar(@$files) != 1){ + throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files)); + } + + + my %slice_cache; + my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; + my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; + my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; + # this object is only used for projection + my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'BioTIFFINProjection');#do we need this? + my $species = $self->db->species; + if(! $species){ + throw('Must define a species to define the external_db'); + } + #Just to make sure we hav homo_sapiens and not Homo Sapiens + ($species = lc($species)) =~ s/ /_/; + + + if(scalar @{$self->import_sets} != 1){ + throw('biotiffin parser currently only supports one import FeatureSet'); + } + + my ($import_set) = @{$self->import_sets}; + + + #foreach my $import_set(@{$self->import_sets}){ + $self->log_header("Parsing $import_set data"); + + my %motif_cache; # name -> factor_id + my $config = $self->{'config'}{$import_set}; + my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set}; + my %gff_attrs = %{$config->{'gff_attrs'}}; + + + # Parse motifs.txt file + #my $file = $config->{'file'}; + my $file = $files->[0]; + my $skipped = 0; + my $motif_cnt = 0; + my $factor_xref_cnt = 0; + my $feature_cnt = 0; + my $feature_target_cnt = 0; + + open (FILE, "<$file") || die("Can't open $file\n$!\n"); + + LINE: while (my $line = <FILE>) { + chomp $line; + + #GFF3 + #3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 + #3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 + #3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 + #3R MotifScanner TIFDMEM0000001 17973965 17973975 0.0 + 0 + + #seq_name, source, feature, start, end, score, strand, frame, [attrs] + my ($chromosome, $program, $feature, $start, $end, $score, $strand, undef) = split /\t/o, $line; + + if(! exists $slice_cache{$chromosome}){ + + if($old_assembly){ + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', + $chromosome, + undef, + undef, + undef, + $old_assembly); + } else { + $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); + } + } + + if(! defined $slice_cache{$chromosome}){ + warn "Can't get slice $chromosome for motif $feature;\n"; + $skipped++; + next; + } + + if(! exists $motif_cache{$feature}){ + + $motif_cache{$feature} = $ftype_adaptor->fetch_by_name($feature); + + if(! defined $motif_cache{$feature}){ + + ($motif_cache{$feature}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new + ( + -name => $feature, + -class => $fset->feature_type->class, + -description => $fset->feature_type->description, + ))}; + + $motif_cnt ++; + } + } + + my $feature_type = $motif_cache{$feature}; + + #Now build actual feature + + $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new + ( + -display_label => $feature, + -start => $start, + -end => $end, + -strand => (($strand eq '+') ? 1 : -1), + -feature_type => $feature_type, + -feature_set => $fset, + -slice => $slice_cache{$chromosome}, + ); + + + # project if necessary + if ($new_assembly) { + $feature = $self->project_feature($feature, $new_assembly); + + if(! defined $feature){ + $skipped ++; + next; + } + } + + ($feature) = @{$extf_adaptor->store($feature)}; + $feature_cnt++; + + + } + + close FILE; + + $self->log("Loaded ".$fset->name); + $self->log("$motif_cnt feature types"); + $self->log("$feature_cnt features"); + $self->log("Skipped $skipped features"); + +#} + + return; +} + +1;
