#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

Copyright (c) 1999-2011 The European Bioinformatics Institute and
Genome Research Limited. All rights reserved.

This software is distributed under a modified Apache license.
For license details, please see

http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

Please email comments or questions to the public Ensembl
developers list at <ensembl-dev@ebi.ac.uk>.

Questions may also be sent to the Ensembl help desk at
<helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);


=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type i.e. the file format. The generic read_and_import_simple_data method assumes
a one line per feature format; other formats need their own read_and_import_format_data method,
which will need defining in the result_data config element. Features are stored either as
ResultFeature collections or AnnotatedFeatures, dependent on the input feature class.

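For example, a minimal format parser built on this class might look like the
following sketch. This is illustrative only; the package name, the field layout
and the feature_params keys are assumptions for the example, not part of a
fixed API contract:

  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;

  use strict;
  use base qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

  # Called for each data line by read_and_import_data.
  # Returns true if the line produced a feature.
  sub parse_line{
    my ($self, $line, $prepare) = @_;
    my ($chr, $start, $end, $strand) = split /\t/, $line;

    return 0 if ! defined $end; # skip malformed lines

    $self->{'_feature_params'} = {
      -START  => $start,
      -END    => $end,
      -STRAND => $self->set_strand($strand),
      # plus -SLICE, -FEATURE_SET, -ANALYSIS etc. as required by the feature class
    };

    $self->load_feature_and_xrefs if ! $prepare;
    return 1;
  }

  1;
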
=cut

# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;
use strict;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;


use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10

#use vars qw(@ISA);
#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);

my %valid_types = (
    result       => undef,
    annotated    => undef,
    segmentation => undef,
);

=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for the InputSet parser class
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : throws if caller is not an Importer
  Caller     : Bio::EnsEMBL::Funcgen::Parsers::Simple
  Status     : at risk

=cut


sub new{
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self = $class->SUPER::new(@_);

  # my $config_file;


  ($self->{'input_set_name'},
   $self->{'input_feature_class'},
   #$self->{'slices'},
   $self->{total_features},
   $self->{force}, #Is this generic enough to go in Importer? used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #only appropriate for result input_feature_class
   # $config_file, #User defined config hash file
  ) = rearrange(['input_set_name', 'input_feature_class',
                 'total_features', 'force', 'dbfile_data_root'], @_);


  #Could potentially take fields params directly to define a custom format
  #Take direct field mappings, plus special fields which need parsing differently
  #i.e. default is tab delimited, and GFF would define the Attrs field as a compound field and provide special parsing and field mapping


  throw("This is a skeleton class for Bio::EnsEMBL::Funcgen::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #can we omit these?
      array_data => [], #['experiment'],
      probe_data => [], #["probe"],
      norm_method => undef,
      #protocols => {()},
      'results_data' => ["and_import"],
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  #$self->{'counts'} = {};
  #$self->{'slices'} = [];
  #$self->{'seq_region_names'} = [];#Used for slice based import


  # USER CONFIG #
  #Here we need to read config based on an external file
  #Should do something similar to set_feature_sets
  #and validate_and_store_feature_types in BaseExternalParser
  #but we are using define and validate sets instead

  #BaseExternalParser and BaseImporter really need to be merged
  #After we have stripped out all the array/experiment specific stuff

  #Do dev here so we are not developing more stuff in the Importer which will need migrating
  #to the BaseImporter

  #if($config_file){
  #  my $config;

  #  $self->log("Reading config file:\t".$config_file);

  #  if(! ($config = do "$config_file")){
  #    throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
  #    throw("Couldn't do config:\t$config_file\n$!") if ! defined $config;
  #    throw("Couldn't compile config_file:\t$config_file") if ! $config;
  #  }

  #  #At least check it is a hash
  #  if(ref($config) ne 'HASH'){
  #    throw("Config file does not define a valid HASH:\t$config_file");
  #  }
  #
  #  $self->{user_config} = $config;
  #}
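
  # For illustration only: a user config file loaded via "do", as sketched above,
  # would simply be a Perl file whose final expression evaluates to a HASH ref.
  # The keys below are hypothetical, not a defined schema:
  #
  #   # my_import.conf
  #   {
  #     feature_types => { CTCF => { -name => 'CTCF', -class => 'Transcription Factor' } },
  #     analyses      => { my_analysis => { -logic_name => 'my_analysis' } },
  #   };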


  return $self;
}


sub output_file{
  my ($self, $output_file) = @_;

  $self->{'output_file'} = $output_file if $output_file;
  return $self->{'output_file'};
}

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}


=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut


sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to experiment name
  #But we would have to make the warning in define_and_validate_sets mention -input_set_name

  throw('Must provide an -input_set name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
    throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }


  if(! exists $valid_types{$self->input_feature_class}){
    throw("You must define a valid input_feature_class:\t".
          join(', ', keys %valid_types));
  }

  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';


  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector

  #some convenience methods
  my $adaptor_method = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
  $self->{'feature_adaptor'}   = $self->db->$adaptor_method;
  $self->{'dbentry_adaptor'}   = $self->db->get_DBEntryAdaptor;
  $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
  ##$self->{'slice_adaptor'}   = $self->db->dnadb->get_SliceAdaptor;

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name)

  #This currently breaks for no config imports
  #i.e. standard Bed import e.g. result_feature collections
  #segmentation imports use Bed and config
  #allow no config imports in BaseImporter?
  #or ultimately set the params as part of the user_config?

  return;
}


sub define_sets{
  my ($self) = @_;

  my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);

  if(! defined $eset){
    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $self->input_set_name(),
       -experiment    => $self->experiment(),
       -feature_type  => $self->feature_type(),
       -cell_type     => $self->cell_type(),
       -vendor        => $self->vendor(),
       -format        => $self->format(),
       -analysis      => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
      );
    ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set
  #This now needs to handle ResultSets based on InputSets


  my $dset = $self->define_and_validate_sets
    (
     -dbadaptor     => $self->db,
     -name          => $self->input_set_name, #or name?
     -feature_type  => $self->feature_type,
     -cell_type     => $self->cell_type,
     -analysis      => $self->feature_analysis,
     -feature_class => $self->input_feature_class,
     -description   => $self->feature_set_description,
     #-append => 1, #Omit append to ensure we only have this eset
     -recovery      => $self->recovery,
     -supporting_sets => [$eset],
     -slices        => $self->slices,
     #Can't set rollback here, as we don't know until we've validated the files
     #Can't validate the files until we have the sets.
     #So we're doing this manually in validate_files
    );

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully
  #However we already have IMPORTED on the InputSubSet
  #We should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status


  #define_and_validate_sets should also replicate ResultSets?
  #Questionable, mapped reads are never normalised across replicates
  #They are generally used as input for peak calling individually.
  #So files in this instance are expected to be separate parts of the same replicate
  #e.g. different chromosomes
  #Force one input file?
  #What if we want to link several assays(feature/cell_types) to the same experiment?

  $self->{'_data_set'} = $dset;

  return $dset;
}



#we have rollback functionality incorporated here

sub validate_files{
  my ($self, $prepare) = @_;

  #Get file
  if (! @{$self->result_files()}) {
    my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr
    my @rfiles = `$list`;
    $self->result_files(\@rfiles);
  }

  #We don't yet support multiple files
  if(scalar(@{$self->result_files()}) > 1){
    warn('Found more than one '.$self->vendor." file:\n".
         join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files(e.g. replicates).".
         " We need to resolve how we are going to handle replicates with random cluster IDs");
    #do we even need to?
  }

  #Here we are tracking the import of individual files by adding them as InputSubSets
  #Recovery would never know what to delete
  #So would need to delete all, hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts
  #To reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through
  #Or if we come across a rollback halfway through
  my (%new_data, $eset);
  my $dset = $self->data_set;

  if((scalar(@{$self->slices}) > 1) &&
     ! $prepare){
    throw('Validate files does not yet support multiple Slice rollback');
  }

  #This all assumes that there is only ever 1 InputSet

  if($self->input_feature_class eq 'result'){
    $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
  }
  else{#annotated/segmentation
    $eset = $dset->get_supporting_sets->[0];
  }


  #IMPORTED status here may prevent
  #further slice based imports
  #so we have to wait to set this until we know all the slices
  #are loaded, unless we store slice based IMPORTED states
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  foreach my $filepath( @{$self->result_files} ) {
    my ($filename, $sub_set);
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;
    $file_paths{$filename} = $filepath;
    $filename =~ s/^prepared\.// if $self->prepared; #reset filename to that originally used to create the InputSubsets

    $self->log('Validating '.$self->vendor." file:\t$filename");
    throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links

    if( $sub_set = $eset->get_subset_by_name($filename) ){
      #IMPORTED status here is just for the file
      #Any changes to analysis or coord_system should result in a different InputSubset(file)
      #Will only ever be imported into one Feature|ResultSet

      #Currently conflating recover_unimported and rollback
      #as they serve the same purpose until we implement InputSubset level recovery


      if( $sub_set->has_status('IMPORTED') ){
        $new_data{$filepath} = 0;
        $self->log("Found previously IMPORTED InputSubset:\t$filename");
      }
      else{
        $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
        push @rollback_sets, $sub_set;
      }
    }
    else{
      $self->log("Found new InputSubset:\t${filename}");
      throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
      $new_data{$filepath} = 1;
      $sub_set = $eset->add_new_subset($filename);
      $self->input_set_adaptor->store_InputSubsets([$sub_set]);
    }
  }


  #Does -recover allow a single extra new file to be added to an existing InputSet?

  if(@rollback_sets && #recoverable sets i.e. exist but not IMPORTED
     ( (! $self->recovery) && (! $self->rollback) ) ){
    throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
          "You must specify -recover or -rollback to perform a full rollback");
  }
  elsif(@rollback_sets && $self->recovery){
    #The warnings below were previously unreachable after the throw above
    #Change these to logger->warn
    $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined");
    $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n");
  }


  if($self->rollback){
    #Check we have all existing InputSubset files before we do a full rollback
    #Can probably remove this if we support InputSubset(file/slice) level rollback
    $self->log('Rolling back all InputSubsets');
    @rollback_sets = @{$eset->get_InputSubsets};

    foreach my $isset(@rollback_sets){

      if(! exists $file_paths{$isset->name}){
        throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
              "\nAborting rollback as data will be lost. Please specify all existing InputSubset file names");
      }
    }
  }


  foreach my $esset(@rollback_sets){
    #This needs to be mapped to the specified filepaths
    my $fp_key = $esset->name;
    $fp_key = 'prepared.'.$fp_key if $self->prepared;

    $new_data{$file_paths{$fp_key}} = 1;
    $self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
    $eset->adaptor->revoke_states($esset);

    if(! $prepare){
      #This was to avoid redundant rollback in the prepare step
      $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);

      if($self->input_feature_class eq 'result'){
        #Can we do this by slice for parallelisation?
        #This will only ever be a single ResultSet due to Helper::define_and_validate_sets
        #flags are rollback_results and force(as this won't be a direct input to the product feature set)
        $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
        #Do not have rollback_InputSet here as we may have parallel Slice based imports running
      }
      else{#annotated/segmentation
        $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
        $self->rollback_InputSet($eset);
        last;
      }
    }
  }

  return \%new_data;
}



sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed?
  #Rather than nothing passed at all?
  #Can't do this as this is the accessor
  #Need to split method

  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') );

  $self->{'_feature_separator'} = $separator;

}

# SIMPLE ACCESSORS
# Some of these can be called for each record
# Trim the access time as much as possible

sub input_feature_class{ return $_[0]->{'input_feature_class'}; }

sub input_set_name{      return $_[0]->{'input_set_name'}; } #Set in new

sub feature_adaptor{     return $_[0]->{'feature_adaptor'}; }

sub dbentry_adaptor{     return $_[0]->{'dbentry_adaptor'}; }

sub input_set_adaptor{   return $_[0]->{'input_set_adaptor'}; }

sub set{                 return $_[0]->{'set'}; } #Feature or Result, set in define_sets

##sub slice_adaptor{     return $_[0]->{'slice_adaptor'}; }

sub data_set{            return $_[0]->{'_data_set'}; }

sub feature_separator{   return $_[0]->{'_feature_separator'}; }

sub feature_params{      return $_[0]->{'_feature_params'}; }

sub dbentry_params{      return $_[0]->{'_dbentry_params'}; }

sub input_gzipped{       return $_[0]->{'input_gzipped'}; }


sub input_file_operator{
  my ($self, $op) = @_;
  #Should be set in format parser
  $self->{'input_file_operator'} = $op if defined $op;

  return $self->{'input_file_operator'};
}
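
# For example, a format parser handling gzipped input might set something like
# the following (hypothetical value; the operator is a sprintf-style open
# template consumed by EFGUtils::open_file, cf. the "| gzip -c > %s" operator
# used in read_and_import_data below):
#
#   $self->input_file_operator("gzip -dc %s |");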

# prepare boolean, simply stores the sets and preprocesses the file
# so we don't get each batch job trying to sort etc
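#
# A rough (hypothetical) driver sequence would therefore be:
#
#   $parser->read_and_import_data(1);  # prepare: define sets and sort/dump the file once
#   $parser->read_and_import_data();   # import: run in each batch job on the prepared file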


#Still need to implement prepare in other Parsers!!

sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");
  my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
    throw('prepare mode is only currently implemented for the Bed parser');
  }


  #Test for conflicting run modes
  if($prepare &&
     ($self->batch_job || $self->prepared)){
    #prepare should be called once by the runner, not in each batch_job
    #don't prepare if already prepared
    throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my $dset = $self->define_sets;

  #We also need to account for bsub'd slice based import
  #seq alignments loaded into a ResultSet
  #Cannot have 0 window for ChIP Seq alignments
  #As this would mean storing all the individual reads
  #Hence we need to remap to a new assm before we import!

  if($self->input_feature_class eq 'result'){
    $output_set = $dset->get_supporting_sets->[0];
    $eset       = $output_set->get_InputSets->[0];
    $self->result_set($output_set);#required for ResultFeature Collector and Bed Parser
  }
  else{#annotated/segmentation
    $output_set = $dset->product_FeatureSet;
    $eset       = $dset->get_supporting_sets->[0];
  }


  #If we can do these the other way around we can get define_sets to rollback the FeatureSet
  #Cyclical dependency for the sets :|
  my $new_data = $self->validate_files($prepare);
  my $seen_new_data = 0;

  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
    chomp $filepath;

    ($filename = $filepath) =~ s/.*\///;
    $self->input_file($filepath); #This is only used by the Collector::ResultFeature::reinitialise_input method

    if($new_data->{$filepath} ){ #This will currently autovivify!
      $seen_new_data = 1;
      $self->{'input_gzipped'} = &is_gzipped($filepath);

      $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');

      $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

      #We need to be able to optionally open a pipe to gzip | sort here
      #i.e. define open command
      $fh = open_file($filepath, $self->input_file_operator);

      #This may become way too large for some reads files
      #Currently no problems
      #This is not working as we are sorting the file!
      #$self->parse_header($fh) if $self->can('parse_header');

      #For result features sometimes we want to run
      #locally and just sort without dumping
      #i.e. if we are not a batch job
      #as there is no need to dump if it is a single process


      #Should this be prepared?

      if((($self->input_feature_class eq 'result') && ! $prepare)){
        #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr
        #

        #Use the ResultFeature Collector here
        #Omitting the 0 wsize
        #How are we going to omit 0 wsize when doing the fetch?
        #simply check table name in ResultSet?

        #Should we do this for multiple chrs?
        #or fail here
        # we need to pass self
        #for access to get_Features_by_Slice
        #which should be in the specific parser e.g Bed

        #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
        #Could really do with separating the pure file parsers from the importer code, so these can be reused
        #by other code. Then simply use the Bed import parser for specific import functions and as a wrapper to
        #the Bed file parser
        #So should really have
        #Parsers::File::Bed
        #and
        #Parsers::Import::Bed
        #This can probably wait until we update BioPerl and just grab the Bed parser from there?

        my $slices = $self->slices;

        #Should this be caught in new?
        if(! @$slices){
          throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
        }


        if(scalar(@$slices) > 1){
          throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
                ."Please submit these to the farm as single slice jobs");
        }

        #restrict to just 1 slice as we don't yet support disk seeking
        #if the slices are not in the same order as they appear in the file
        #also we want to parallelise this

        #Set as attr for parse_Features_by_Slice in format specific Parsers


        $self->file_handle(open_file($filepath, $self->input_file_operator));


        foreach my $slice(@$slices){
          $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self,
                                                                    (
                                                                     #Force needs reimplementing here?
                                                                     -force            => $self->{force},
                                                                     -dbfile_data_root => $self->{dbfile_data_root},
                                                                    ));
        }

        warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
        #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?



        warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

        #Closing the read end of a pipe before the process writing to it at the other end
        #is done writing results in the writer receiving a SIGPIPE. If the other end can't
        #handle that, be sure to read all the data before closing the pipe.
        #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
        #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
        #Or maybe an eval will catch the STDERR better?
        #sub catch_SIGPIPE{
        #  my $sig = shift @_;
        #  print " Caught SIGPIPE: $sig $1 \n";
        #  return;
        #
        #}
        #$SIG{PIPE} = \&catch_SIGPIPE;
        #my $retval = eval { no warnings 'all'; $fh->close };
        #if($@){
        #  warn "after eval with error $@\nretval is $retval";
        #}
        #Neither of these catch gzip: stdout: Broken pipe

        #IO::UnCompress::Gunzip?


        $fh->close;
      }
      else{


        #Revoke FeatureSet IMPORTED state here in case we fail halfway through
        $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));

        #What about IMPORTED_"CSVERSION"
        #This may leave us with an incomplete import which still has
        #an IMPORTED_CSVERSION state
        #We need to depend on IMPORTED for completeness of set
        #DAS currently only uses IMPORTED_CSVERSION
        #This is okayish but we also need to write HCs for any sets
        #which do not have IMPORTED state!
        my ($line, @outlines, $out_fh);


        if($prepare && ! $self->batch_job){
          #Assume we want gzipped output
          #filename is actually based on input, so may not have gz in the file name
          $out_fh = open_file($self->output_file, "| gzip -c > %s");
        }


        while(defined ($line=<$fh>)){
          #Generic line processing
          #Move these to parse_line?
          $line =~ s/\r*\n//o;
          next if $line =~ /^\#/;
          next if $line =~ /^$/;

          #This has now been simplified to the process_line method
          #my @fields = split/\t/o, $line;
          #start building parameters hash
          #foreach my $field_index(@{$self->get_field_indices}){
          #  my $field = $self->get_field_by_index($field_index);
          #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
          #}


          #We also need to enable different parse_line methods if we have different file formats
          #e.g. cisRED
          #Code refs?

          if($self->parse_line($line, $prepare)){
            $self->count('total parsed lines');

            #Cache or print to sorted file
            if($prepare && ! $self->batch_job){
              push @outlines, $line;

              #Flush the cache to the prepared file in batches,
              #without dropping the line which triggered the flush
              if(scalar(@outlines) > 1000){
                print $out_fh join("\n", @outlines)."\n";
                @outlines = ();
              }
            }
          }
        }

        close($fh);

        #Print the last of the sorted file
        if($prepare && ! $self->batch_job){
          print $out_fh join("\n", @outlines)."\n";
          close($out_fh);
          @outlines = ();
        }

        if(! $prepare){
          #Now we need to deal with anything left in the read cache
          $self->process_params if $self->can('process_params');

          #To speed things up we may need to also do file based import here with WRITE lock?
          #mysqlimport will write lock the table by default?

          #reset filename to that originally used to create the InputSubsets
          $filename =~ s/^prepared\.// if $self->prepared;

          my $sub_set = $eset->get_subset_by_name($filename);
          $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
        }
      }


      if($prepare){
        $self->log("Finished preparing import from:\t$filepath");
      }
      else{
        #Need to tweak this for slice based import
        $self->log('Finished importing '.$self->counts('features').' '.
                   $output_set->name." features from:\t$filepath");

      }


      #This currently fails here if the uncaught file sort was not successful

      foreach my $key (keys %{$self->counts}){
        $self->log("Count $key:\t".$self->counts($key));
      }
    }
  }

  #Here we should set IMPORTED on the FeatureSet
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table
  #this would be sketchy at best
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
  #This may already have IMPORTED status as we don't revoke the status whilst
  #updating to protect the feature set due to lack of supporting set tracking
  #see Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.
  #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job

  if(! $prepare){
    $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
    $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
    $self->log("Finished parsing and importing results");
  }

  return;
}


#Should be called from format parser e.g. BED, GFF, eQTL etc
#Why don't we pass feature_params and dbentry_params directly?

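# Hypothetical caller sketch (the parameter keys shown are illustrative
# assumptions, except feature_type, which is deleted below before the
# DBEntry is constructed):
#
#   $self->{'_feature_params'} = { -START => $start, -END => $end, ... };
#   push @{$self->{'_dbentry_params'}}, { -PRIMARY_ID => $xref_id, feature_type => 'AnnotatedFeature' };
#   my $feature = $self->load_feature_and_xrefs;
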
sub load_feature_and_xrefs{
  my $self = shift;

  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run and then
  #Need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature) = @{$self->feature_adaptor->store($feature)};
  $self->count('features');

  #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key?



  ##This needs to be handled in the caller as we are validating loci?
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non-standard chromosome: $chr";
  #}
  #else{
  #grab seq if dump fasta and available
  #my $seq;
  #if(exists $self->feature_params->{'sequence'}){
  #  $seq = $self->feature_params->{'sequence'};
  #  delete $self->feature_params->{'sequence'};
  #}
  #else{
  #  $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #}
  #}

  #dump fasta here
  #if ($self->dump_fasta){
  #  $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #}

  #Store the xrefs

  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is the ignore release flag
    #count here? no count in caller
  }


  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
      '1'  =>  1,
      '0'  =>  0,
      '-1' => -1,
      '+'  =>  1,
      '-'  => -1,
      '.'  =>  0,
  );

  if($strand){

    if(exists $strand_vals{$strand}){
      $ens_strand = $strand_vals{$strand};
    }
    else{
      throw("Could not identify strand value for $strand");
    }
  }

  return $ens_strand;
}
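
# For example (hypothetical calls):
#   $self->set_strand('+');   # returns  1
#   $self->set_strand('.');   # returns  0 (via the lookup)
#   $self->set_strand(undef); # returns  0 (default)
#   $self->set_strand('x');   # throws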

sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle


sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self

  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;