#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

  
=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type i.e. the file format.  The generic read_and_import_data method assumes
a one line per feature format; other formats need their own read_and_import_format_data method,
which will need defining in the results_data config element. Features are stored either as
ResultFeature collections or Annotated/SegmentationFeatures depending on the input feature class.
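
A minimal subclass sketch (illustrative only; MyFormat and its fields are
hypothetical, but parse_line is the hook invoked by read_and_import_data):

  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;
  use base qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

  sub parse_line {
    my ($self, $line, $prepare) = @_;
    my ($chr, $start, $end) = split /\t/, $line;
    #...populate $self->feature_params here, then store via
    #   $self->load_feature_and_xrefs unless in prepare mode
    return 1; #true return values count towards 'total parsed lines'
  }

  1;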

=cut

# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;
use strict;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;


use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10

#use vars qw(@ISA);
#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);

my %valid_types = (
				   result       => undef,
				   annotated    => undef,
				   segmentation => undef,
				  );
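#These types map to the Bio::EnsEMBL::Funcgen::<Ucfirst>Feature classes
#assigned in set_config e.g. 'annotated' => AnnotatedFeature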


=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for the InputSet parser class
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : throws if caller is not a Bio::EnsEMBL::Funcgen::Importer
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut


sub new{
  my $caller = shift;
  
  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@_);
  
  #  my $config_file;
  

  ($self->{'input_set_name'}, 
   $self->{'input_feature_class'}, 
   #$self->{'slices'}, 
   $self->{total_features}, 
   $self->{force},            #Is this generic enough to go in Importer? used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #only appropriate for result input_feature_class
 #  $config_file,              #User defined config hash file
  ) = rearrange(['input_set_name', 'input_feature_class', 
				 'total_features', 'force', 'dbfile_data_root'], @_);


  #Could potentially take fields params directly to define a custom format
  #Take direct field mappings, plus special fields which need parsing differently
  #i.e. default is tab delimited, and GFF would define Attrs field as compound field and provide special parsing and field mapping
  

  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly") 
	if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
  
  $self->{'config'} =  
	{(
	  #can we omit these?
      array_data   => [],#['experiment'],
      probe_data   => [],#["probe"],
      norm_method => undef,
	  #protocols => {()},
	  'results_data' => ["and_import"], 
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];
  
  #$self->{'counts'}   = {};
  #$self->{'slices'}   = [];
  #$self->{'seq_region_names'} = [];#Used for slice based import


   # USER CONFIG #
  #Here we need to read config based on external file
  #Should do something similar to set_feature_sets
  #and validate_and_store_feature_types in BaseExternalParser
  #but we are using define and validate sets instead

  #BaseExternalParser and BaseImporter really need to be merged
  #After we have stripped out all the array/experiment specific stuff

  #Do dev here so we are not developing more stuff in the Importer which will need migrating
  #to the BaseImporter

  #if($config_file){
#	my $config;

#	$self->log("Reading config file:\t".$config_file);

#	if(! ($config = do "$config_file")){
#	  throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
#	  throw("Couldn't do config:\t$config_file\n$!")          if ! defined $config;
#	  throw("Couldn't compile config_file:\t$config_file")    if ! $config;
#	}

#	#At least check it is hash
#	if(ref($config) ne 'HASH'){
#	  throw("Config file does not define a valid HASH:\t$config_file");
#	}
#	
#	$self->{user_config} = $config;	
#  }

 
  return $self;
}


sub output_file{
  my ($self, $output_file) = @_;

  $self->{'output_file'} = $output_file if $output_file;
  return $self->{'output_file'};
}

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}



=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut


sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to experiment name
  #But we would have to make warning in define_and_validate_sets mention -input_set_name

  throw('Must provide an -input_set name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
	throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }


  if(! exists $valid_types{$self->input_feature_class}){
	throw("You must define a valid input_feature_class:\t".
		  join(', ', keys %valid_types));
  }

  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';


  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector

  #some convenience methods
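  #e.g. an input_feature_class of 'annotated' gives get_AnnotatedFeatureAdaptor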
  my $adaptor_method           = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
  $self->{'feature_adaptor'}   =  $self->db->$adaptor_method;
  $self->{'dbentry_adaptor'}   = $self->db->get_DBEntryAdaptor;
  $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
  ##$self->{'slice_adaptor'}     = $self->db->dnadb->get_SliceAdaptor;

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name)

  #This currently breaks for no config imports
  #i.e. standard Bed import e.g. result_feature collections
  #segmentation imports use Bed and config
  #allow no config imports in BaseImporter?
  #or ultimately set the params as part of the user_config?

  return;
}


sub define_sets{
  my ($self) = @_;

  my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);
  
  if(! defined $eset){
    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name         => $self->input_set_name(),
       -experiment   => $self->experiment(),
       -feature_type => $self->feature_type(),
       -cell_type    => $self->cell_type(),
       -vendor       => $self->vendor(),
       -format       => $self->format(),
       -analysis     => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
      );
    ($eset)  = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set
  #This now needs to handle ResultSets based on InputSets


  my $dset = $self->define_and_validate_sets
	(
	 -dbadaptor    => $self->db,
	 -name         => $self->input_set_name,#or name?
	 -feature_type => $self->feature_type,
	 -cell_type    => $self->cell_type,
	 -analysis     => $self->feature_analysis,
	 -feature_class=> $self->input_feature_class, 
	 -description  => $self->feature_set_description,
	 #-append          => 1,#Omit append to ensure we only have this eset
	 -recovery     => $self->recovery,
	 -supporting_sets => [$eset],
	 -slices        => $self->slices,
	 #Can't set rollback here, as we don't know until we've validated the files
	 #Can't validate the files until we have the sets.
	 #So we're doing this manually in validate_files
	);

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully
  #However we already have IMPORTED on the InputSubSet
  #We should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status
 

  #define_and_validate_sets should also replicate ResultSets?
  #Questionable, mapped reads are never normalised across replicates
  #They are generally used as input for peak calling individually.
  #So files in this instance are expected to be separate parts of the same replicate
  #e.g. different chromosomes
  #Force one input file?
  #What if we want to link several assays(feature/cell_types) to the same experiment?

  $self->{'_data_set'} = $dset;
 
  return $dset;
}




#we have rollback functionality incorporated here
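#Returns a hashref keyed on input file path: 1 for files which still require
#import (new or rolled back), 0 for files whose InputSubset is already IMPORTED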

sub validate_files{
  my ($self, $prepare) = @_;

  #Get file
  if (! @{$self->result_files()}) {
	my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr
	my @rfiles = `$list`;
	$self->result_files(\@rfiles);
  }

  #We don't yet support multiple files
  if(scalar(@{$self->result_files()}) >1){
	warn('Found more than one '.$self->vendor." file:\n".
		 join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files (e.g. replicates).".
		 "  We need to resolve how we are going to handle replicates with random cluster IDs");
	#do we even need to?
  }
  
  #Here we are tracking the import of individual files by adding them as InputSubSets
  #Recovery would never know what to delete
  #So would need to delete all, hence no point in setting status?
  #We do not rollback IMPORTED data here.  This is done via separate scripts
  #To reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through
  #or if we come across a rollback halfway through
  my (%new_data, $eset);
  my $dset = $self->data_set;
   
  if((scalar(@{$self->slices}) > 1) &&
	 ! $prepare){
	throw('Validate files does not yet support multiple Slice rollback');
  }

  #This all assumes that there is only ever 1 InputSet
  
  if($self->input_feature_class eq 'result'){
	$eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
  }
  else{#annotated/segmentation
	$eset =  $dset->get_supporting_sets->[0];
  }


  #IMPORTED status here may prevent
  #further slice based imports
  #so we have to wait to set this until we know all the slices 
  #are loaded, unless we store slice based IMPORTED states
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  foreach my $filepath( @{$self->result_files} ) {
	my ($filename, $sub_set);
	chomp $filepath;
   	($filename = $filepath) =~ s/.*\///;
	$file_paths{$filename} = $filepath;			 
	$filename =~ s/^prepared\.// if $self->prepared; 	#reset filename to that originally used to create the Inputsubsets

	$self->log('Validating '.$self->vendor." file:\t$filename");
	throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links
		
	if( $sub_set = $eset->get_subset_by_name($filename) ){
	  #IMPORTED status here is just for the file
	  #Any changes to analysis or coord_system should result in different InputSubset(file)
	  #Will only ever be imported into one Feature|ResultSet

	  #Currently conflating recover_unimported and rollback
	  #as they serve the same purpose until we implement InputSubset level recovery

	
	  if( $sub_set->has_status('IMPORTED') ){
		$new_data{$filepath} = 0;
		$self->log("Found previously IMPORTED InputSubset:\t$filename");
	  } 
	  else{
		$self->log("Found existing InputSubset without IMPORTED status:\t$filename");
		push @rollback_sets, $sub_set;
	  }
	}
	else{
	  $self->log("Found new InputSubset:\t${filename}");
	  throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;	  
	  $new_data{$filepath} = 1;
	  $sub_set = $eset->add_new_subset($filename);
	  $self->input_set_adaptor->store_InputSubsets([$sub_set]);
	}
  }


  #Does -recover allow a single extra new file to be added to an existing InputSet?
 
  if(@rollback_sets){ #recoverable sets i.e. they exist but are not IMPORTED

	if((! $self->recovery) && (! $self->rollback)){
	  throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
			"You must specify -recover or -rollback to perform a full rollback");
	}
	elsif($self->recovery){
	  #Change these to logger->warn
	  $self->log("WARNING::\tCannot yet rollback just an InputSubset; rolling back the entire set, unless slices are defined");
	  $self->log("WARNING::\tThis may delete previously imported data which you are not re-importing..list?!!!\n");
	}
  }
  
  
  if($self->rollback){
	#Check we have all existing InputSubsets files before we do full rollback
	#Can probably remove this if we support InputSubset(file/slice) level rollback
	$self->log('Rolling back all InputSubsets');
	@rollback_sets = @{$eset->get_InputSubsets};

	foreach my $isset(@rollback_sets){
	  
	  if(! exists $file_paths{$isset->name}){
		throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
			  "\nAborting rollback as data will be lost. Please specifying all existing InputSubset file names");
	  }
	}
  }


  foreach my $esset(@rollback_sets){
	#This needs to be mapped to the specified filepaths
	my $fp_key = $esset->name;
	$fp_key = 'prepared.'.$fp_key if $self->prepared;

	$new_data{$file_paths{$fp_key}} = 1;
	$self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
	$eset->adaptor->revoke_states($esset);

	if(! $prepare){
	  #This was to avoid redundant rollback in prepare step
	  $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);
	  
	  if($self->input_feature_class eq 'result'){
		#Can we do this by slice for parallelisation?
		#This will only ever be a single ResultSet due to Helper::define_and_validate_sets
		#flags are rollback_results and force(as this won't be a direct input to the product feature set)
		$self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
		#Do not rollback_InputSet here as we may have parallel Slice based imports running
	  }
	  else{#annotated/segmentation
		$self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
		$self->rollback_InputSet($eset);
		last;
	  }			
	}
  }

  return \%new_data;
}




sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed?
  #Rather than nothing passed at all?
  #Can't do this as this is the accessor
  #Need to split method
 
  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') ); 

  $self->{'_feature_separator'} = $separator;

}

# SIMPLE ACCESSORS
# Some of these can be called for each record
# Trim the access time as much as possible

sub input_feature_class{ return $_[0]->{'input_feature_class'}; }

sub input_set_name{      return $_[0]->{'input_set_name'}; }  #Set in new

sub feature_adaptor{     return $_[0]->{'feature_adaptor'}; }

sub dbentry_adaptor{     return $_[0]->{'dbentry_adaptor'}; }

sub input_set_adaptor{   return $_[0]->{'input_set_adaptor'}; }

sub set{                 return $_[0]->{'set'}; } #Feature or Result, set in define_sets

##sub slice_adaptor{       return $_[0]->{'slice_adaptor'}; }

sub data_set{            return $_[0]->{'_data_set'}; }

sub feature_separator{   return $_[0]->{'_feature_separator'}; }

sub feature_params{      return $_[0]->{'_feature_params'}; }

sub dbentry_params{      return $_[0]->{'_dbentry_params'}; }

sub input_gzipped{       return $_[0]->{'input_gzipped'}; }


sub input_file_operator{
  my ($self, $op) = @_;
  #Should be set in format parser
  $self->{'input_file_operator'} = $op if defined $op;

  return $self->{'input_file_operator'};
}

# prepare boolean, simply stores the sets and preprocesses the file
# so we don't get each batch job trying to sort etc


#Still need to implement prepare in other Parsers!!
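#Sketch of the intended call pattern, inferred from the mode checks below:
#  $importer->read_and_import_data(1); #prepare, run once by the runner
#  $importer->read_and_import_data();  #import, run by each (batch) job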

sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");
  my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
	throw('prepare mode is only currently implemented for the Bed parser');
  }
  
 
  #Test for conflicting run modes
  if($prepare &&
	 ($self->batch_job || $self->prepared)){
	#prepare should be called once by the runner, not in each batch_job
	#don't prepare if already prepared
	throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my $dset = $self->define_sets;
  
  #We also need to account for bsub'd slice based import
  #seq alignments loaded into a ResultSet
  #Cannot have 0 window for ChIP Seq alignments
  #As this would mean storing all the individual reads
  #Hence we need to remap to a new assembly before we import!
    
  if($self->input_feature_class eq 'result'){
	$output_set = $dset->get_supporting_sets->[0];
	$eset       = $output_set->get_InputSets->[0];
	$self->result_set($output_set);#required for ResultFeature Collector and Bed Parser
  }
  else{#annotated/segmentation
	$output_set = $dset->product_FeatureSet;
	$eset       =  $dset->get_supporting_sets->[0]; 
  }

  
  #If we can do these the other way around we can get define_sets to rollback the FeatureSet
  #Cyclical dependency for the sets :|
  my $new_data = $self->validate_files($prepare);
  my $seen_new_data = 0;
 
  
  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
	chomp $filepath;
	
	($filename = $filepath) =~ s/.*\///;
	$self->input_file($filepath); #This is only used by Collector::ResultFeature::reinitialise_input method
	
	if($new_data->{$filepath} ){	#This will currently autovivify!
	  $seen_new_data = 1;
	  $self->{'input_gzipped'} = &is_gzipped($filepath);
	  
	  $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');
	  
	  $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

	  #We need to be able to optionally open a pipe to gzip | sort here
	  #i.e. define the open command
	  $fh = open_file($filepath, $self->input_file_operator);

	  #This may become way too large for some reads files
	  #Currently no problems
	  #This is not working as we are sorting the file!
	  #$self->parse_header($fh) if $self->can('parse_header');
	  
	  #For result features sometimes we want to run 
	  #locally and just sort without dumping
	  #i.e if we are not a batch job
	  #as there is no need to dump if it is a single process
	  

	  #Should this be prepared?

	  if((($self->input_feature_class eq 'result') && ! $prepare)){
		#(($self->input_feature_class eq 'result') && (! $self->batch_job))){   #Local run on just 1 chr
		#

		#Use the ResultFeature Collector here
		#Omitting the 0 wsize
		#How are we going to omit 0 wsize when doing the fetch?
		#simply check table name in ResultSet?

		#Should we do this for multiple chrs?
		#or fail here
		# we need to pass self
		#for access to get_Features_by_Slice
		#which should be in the specific parser e.g Bed

		#Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
		#Could really do with separating the pure file parsers from the importer code, so these can be reused
		#by other code. Then simply use Bed import parser for specific import functions and as wrapper to 
		#Bed file parser
		#So should really have
		#Parsers::File::Bed
		#and
		#Parsers::Import::Bed
		#This can probably wait until we update BioPerl and just grab the Bed parser from there?

		my $slices = $self->slices;

		#Should this be caught in new?
		if(! @$slices){
		  throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
		}
		

		if(scalar(@$slices) > 1){
		  throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
				."Please submit these to the farm as single slice jobs");
		}

		#restrict to just 1 slice as we don't yet support disk seeking
		#if the slices are not in the same order as they appear in the file
		#also we want to parallelise this
		
		#Set as attr for parse_Features_by_Slice in format specific Parsers

	
		$self->file_handle(open_file($filepath, $self->input_file_operator));
		
		
		foreach my $slice(@$slices){
		  $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self, 
																	(
																	 #Force needs reimplementing here?
																	 -force            => $self->{force},
																	 -dbfile_data_root => $self->{dbfile_data_root},
																	));
		}  

		warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
		#Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?

		

		warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

		#Closing the read end of a pipe before the process writing to it at the other end 
		#is done writing results in the writer receiving a SIGPIPE. If the other end can't 
		#handle that, be sure to read all the data before closing the pipe.
		#This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
		#$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
		#Or maybe an eval will catch the STDERR better?
		#sub catch_SIGPIPE{
		#  my $sig = shift @_;
		#  print " Caught SIGPIPE: $sig $1 \n";
		#  return;
		#  
		#}
		#$SIG{PIPE} = \&catch_SIGPIPE;
		#my $retval =  eval { no warnings 'all'; $fh->close };
		#if($@){
		#  warn "after eval with error $@\nretval is $retval";
		#}
		#Neither of these catch gzip: stdout: Broken pipe
		
		#IO::UnCompress::Gunzip?


		$fh->close;
	  }
	  else{
		  
	  
		#Revoke FeatureSet IMPORTED state here in case we fail halfway through
		$output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));

		#What about IMPORTED_"CSVERSION"
		#This may leave us with an incomplete import which still has
		#an IMPORTED_CSVERSION state
		#We need to depend on IMPORTED for completeness of set
		#DAS currently only uses IMPORTED_CSVERSION
		#This is okayish but we also need to write HCs for any sets 
		#which do not have IMPORTED state!
		my ($line, @outlines, $out_fh);

		
		if($prepare && ! $self->batch_job){
		  #Assume we want gzipped output
		  #filename is actually based on the input, so may not have .gz in the file name
		  $out_fh = open_file($self->output_file, "| gzip -c > %s");
		}
		

		while(defined ($line=<$fh>)){
		  #Generic line processing
		  #Move these to parse_line?
		  $line =~ s/\r*\n//o;
		  next if $line =~ /^\#/;	
		  next if $line =~ /^$/;

		  #This has now been simplified to process_line method
		  #my @fields = split/\t/o, $line;
		  #start building parameters hash
		  #foreach my $field_index(@{$self->get_field_indices}){
		  #  my $field = $self->get_field_by_index($field_index);
		  #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
		  #  }	


		  #We also need to enable different parse_line methods if we have different file formats
		  #e.g. cisRED
		  #Code refs?

		  
		  if($self->parse_line($line, $prepare)){
			$self->count('total parsed lines');

			#Cache or print to sorted file
			if($prepare && ! $self->batch_job){

			  #Push first so the current line is not dropped when we flush the cache
			  push @outlines, $line;

			  if(scalar(@outlines) > 1000){
				print $out_fh join("\n", @outlines)."\n";
				@outlines = ();
			  }
			}
		  }
		}
	 
		close($fh);

		#Print last of sorted file
		if($prepare && ! $self->batch_job){
		  print $out_fh join("\n", @outlines)."\n" if @outlines;
		  close($out_fh);
		  @outlines = ();
		}

		if(! $prepare){
		  #Now we need to deal with anything left in the read cache
		  $self->process_params if $self->can('process_params');
	  
		  #To speed things up we may need to also do file based import here with WRITE lock?
		  #mysqlimport will write lock the table by default?
		 		  
		  #reset filename to that originally used to create the Inputsubsets
		  $filename =~ s/^prepared\.// if $self->prepared;

		  my $sub_set = $eset->get_subset_by_name($filename);
		  $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
		}
	  }

	  
	  if($prepare){
		$self->log("Finished preparing import from:\t$filepath");
	  }
	  else{
		#Need to tweak this for slice based import
		$self->log('Finished importing '.$self->counts('features').' '.
				   $output_set->name." features from:\t$filepath");
		
	  }
	
	  
	  #This currently fails here if the uncaught file sort was not successful

	  foreach my $key (keys %{$self->counts}){
		$self->log("Count $key:\t".$self->counts($key));
	  }	  
	}
  }

  #Here we should set IMPORTED on the FeatureSet
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in feature table
  #this would be sketchy at best
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
  #This may already have IMPORTED status as we don't revoke the status whilst
  #updating, to protect the feature set due to lack of supporting set tracking
  #see Helper::define_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of fset status caveat.
  #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job

  if(! $prepare){
	$output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
	$self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
	$self->log("Finished parsing and importing results");
  }
    
  return;
}
  

#Should be called from format parser e.g. BED, GFF, eQTL etc
#Why don't we pass feature_params and dbentry_params directly?
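#The caches are expected to be pre-populated by the format parser along these
#lines (illustrative only; keys must match the feature class and
#Bio::EnsEMBL::DBEntry constructors):
#  $self->feature_params->{-FEATURE_TYPE} = $ftype;  #plus coordinates etc.
#  push @{$self->dbentry_params}, {feature_type => $object_type, %dbentry_args};
#where feature_type is stripped out and passed to DBEntryAdaptor::store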

sub load_feature_and_xrefs{
  my $self = shift;
		  
  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run and then
  #Need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature) = @{$self->feature_adaptor->store($feature)};
  $self->count('features');

  #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key?



  ##This needs to be handled in caller as we are validating loci?
  #if($self->ucsc_coords){
  #	$start += 1;
  #	$end   += 1;
  #  }
  
  #This needs to be put in a separate sub and called by the caller
  #if(!  $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, cound non standard chromosome: $chr";
  #}
  #else{
  #grab seq if dump fasta and available		  
  #my $seq;
  #if(exists $self->feature_params->{'sequence'}){
  #	  $seq = $self->feature_params->{'sequence'};
  #	  delete $self->feature_params->{'sequence'};
  #	}
  #	else{
  #	  $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #	}
  #  }
	  
  #dump fasta here
  #if ($self->dump_fasta){
  #	$self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #  }
  
  #Store the xrefs
 
  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
	my $ftype = $dbentry_hash->{feature_type};
	delete $dbentry_hash->{feature_type};

	my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
	$self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is ignore release flag
	#count here? no count in caller
  }

  
  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}

#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?
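#e.g. set_strand('+') and set_strand(1) return 1, set_strand('-') returns -1,
#while '.', '0' and undefined input all return 0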

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
					 '1'  => 1,
					 '0'  => 0,
					 '-1' => -1,
					 '+'  => 1,
					 '-'  => -1,
					 '.'  => 0,
					);

  if($strand){
	
	if(exists $strand_vals{$strand}){
	  $ens_strand = $strand_vals{$strand};
	}
	else{
	  throw("Could not identify strand value for $strand");
	}
  }

  return $ens_strand;
}

sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle


sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self
  
  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;