diff variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/InputSet.pm @ 0:1f6dce3d34e0
Uploaded

author: mahtabm
date:   Thu, 11 Apr 2013 02:01:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/InputSet.pm	Thu Apr 11 02:01:53 2013 -0400
@@ -0,0 +1,955 @@

#
# EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
#

=head1 LICENSE

  Copyright (c) 1999-2011 The European Bioinformatics Institute and
  Genome Research Limited.  All rights reserved.

  This software is distributed under a modified Apache license.
  For license details, please see

    http://www.ensembl.org/info/about/code_licence.html

=head1 CONTACT

  Please email comments or questions to the public Ensembl
  developers list at <ensembl-dev@ebi.ac.uk>.

  Questions may also be sent to the Ensembl help desk at
  <helpdesk@ensembl.org>.

=head1 NAME

Bio::EnsEMBL::Funcgen::Parsers::InputSet

=head1 SYNOPSIS

  use vars qw(@ISA);
  @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

=head1 DESCRIPTION

This is a base class to support simple file format parsers. For simple imports the vendor is
set to the parser type, i.e. the file format. The generic read_and_import_simple_data method assumes
a one-line-per-feature format; other formats need their own read_and_import_format_data method,
which will need defining in the result_data config element. Features are stored either as
ResultFeature collections or AnnotatedFeatures depending on the input feature class.

=cut
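
=head1 EXAMPLE

A minimal format subclass sketch. This is illustrative only: MyFormat and its
field handling are hypothetical, and a real parser (e.g. Bed) must also set
-SLICE (via cache_slice) and -FEATURE_SET before load_feature_and_xrefs is
called.

  package Bio::EnsEMBL::Funcgen::Parsers::MyFormat;
  use base qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);

  sub parse_line{
    my ($self, $line, $prepare) = @_;
    my ($chr, $start, $end, $strand) = split /\t/, $line;

    #Populate the cache read by load_feature_and_xrefs
    $self->{'_feature_params'} = {
      -START  => $start,
      -END    => $end,
      -STRAND => $self->set_strand($strand),
    };

    return 1; #line yielded a feature
  }

=cut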

# To do
# Add Parsers for BAM/SAM
# Rename to InputSet
# Handle mysqlimport for large data sets e.g. reads
# Incorporate collection code
# Implement matrix storage

package Bio::EnsEMBL::Funcgen::Parsers::InputSet;

use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
use Bio::EnsEMBL::Funcgen::SegmentationFeature;
use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
use Bio::EnsEMBL::Utils::Argument qw( rearrange );
#use Bio::EnsEMBL::Funcgen::Utils::Helper;
use strict;

#config stuff, move to BaseImporter?
use Bio::EnsEMBL::Analysis;
use Bio::EnsEMBL::Funcgen::FeatureType;

use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10

#use vars qw(@ISA);
#@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);

my %valid_types = (
  result       => undef,
  annotated    => undef,
  segmentation => undef,
);


=head2 new

  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for InputSet parser classes
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
  Exceptions : throws if caller is not an Importer
  Caller     : Bio::EnsEMBL::Funcgen::Parsers::Simple
  Status     : at risk

=cut


sub new{
  my $caller = shift;

  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@_);

  # my $config_file;

  ($self->{'input_set_name'},
   $self->{'input_feature_class'},
   #$self->{'slices'},
   $self->{total_features},
   $self->{force},            #Is this generic enough to go in Importer? Used by store_window_bins_by_Slice_Parser
   $self->{dbfile_data_root}, #Only appropriate for result input_feature_class
   # $config_file,            #User defined config hash file
  ) = rearrange(['input_set_name', 'input_feature_class',
                 'total_features', 'force', 'dbfile_data_root'], @_);

  #Could potentially take fields params directly to define a custom format.
  #Take direct field mappings, plus special fields which need parsing differently,
  #i.e. default is tab delimited, and GFF would define the Attrs field as a compound
  #field and provide special parsing and field mapping

  throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
    if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));

  $self->{'config'} =
    {(
      #can we omit these?
      array_data     => [],#['experiment'],
      probe_data     => [],#["probe"],
      norm_method    => undef,
      #protocols     => {()},
      'results_data' => ["and_import"],
     )};

  #set up feature params
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  #$self->{'counts'} = {};
  #$self->{'slices'} = [];
  #$self->{'seq_region_names'} = [];#Used for slice based import

  # USER CONFIG #
  #Here we need to read config based on an external file.
  #Should do something similar to set_feature_sets
  #and validate_and_store_feature_types in BaseExternalParser,
  #but we are using define and validate sets instead.

  #BaseExternalParser and BaseImporter really need to be merged
  #after we have stripped out all the array/experiment specific stuff.

  #Do dev here so we are not developing more stuff in the Importer which will need migrating
  #to the BaseImporter

  #if($config_file){
  #  my $config;
  #
  #  $self->log("Reading config file:\t".$config_file);
  #
  #  if(! ($config = do "$config_file")){
  #    throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
  #    throw("Couldn't do config:\t$config_file\n$!")          if ! defined $config;
  #    throw("Couldn't compile config_file:\t$config_file")    if ! $config;
  #  }
  #
  #  #At least check it is a hash
  #  if(ref($config) ne 'HASH'){
  #    throw("Config file does not define a valid HASH:\t$config_file");
  #  }
  #
  #  $self->{user_config} = $config;
  #}

  return $self;
}


sub output_file{
  my ($self, $output_file) = @_;

  $self->{'output_file'} = $output_file if $output_file;
  return $self->{'output_file'};
}

sub input_file{
  my ($self, $input_file) = @_;

  $self->{'input_file'} = $input_file if $input_file;
  return $self->{'input_file'};
}
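
# The constructor is normally reached via a vendor/format Importer subclass.
# A hedged sketch of the rearranged parameters (the set name and path are
# hypothetical values, not defaults):
#
#   my $parser = Bio::EnsEMBL::Funcgen::Parsers::Bed->new
#     (
#      -input_set_name      => 'ES_DNase1',  #hypothetical InputSet name
#      -input_feature_class => 'result',     #result|annotated|segmentation
#      -total_features      => 1e6,          #used when scaling collections
#      -force               => 1,
#      -dbfile_data_root    => '/data/efg',  #hypothetical path; result sets only
#     );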

=head2 set_config

  Example    : my $self->set_config;
  Description: Sets attribute dependent config
  Returntype : None
  Exceptions : None
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut


sub set_config{
  my $self = shift;

  #Move all this to new when we fix the inheritance in Importer

  #We could set input_set_name to the experiment name,
  #but we would have to make the warning in define_and_validate_sets mention -input_set_name

  throw('Must provide an -input_set name for a '.uc($self->vendor).' import')
    if ! defined $self->input_set_name();

  #Mandatory checks
  if(! defined $self->feature_analysis){
    throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
  }

  if(! exists $valid_types{$self->input_feature_class}){
    throw("You must define a valid input_feature_class:\t".
          join(', ', keys %valid_types));
  }

  $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';

  #We need to undef norm method as it has been set to the env var
  $self->{'config'}{'norm_method'} = undef;

  #dirs are not set in config to enable generic get_dir method access
  $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector

  #some convenience methods
  my $adaptor_method           = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
  $self->{'feature_adaptor'}   = $self->db->$adaptor_method;
  $self->{'dbentry_adaptor'}   = $self->db->get_DBEntryAdaptor;
  $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
  ##$self->{'slice_adaptor'}   = $self->db->dnadb->get_SliceAdaptor;

  #Validate slices
  $self->slices($self->{'slices'}) if defined $self->{'slices'};

  #Move to new when we sort out inheritance
  $self->validate_and_store_config([$self->name]);
  #Could use input_set_name here?
  #This was to support >1 input set per experiment (name)

  #This currently breaks for no-config imports,
  #i.e. standard Bed import e.g. result_feature collections.
  #Segmentation imports use Bed and config.
  #Allow no-config imports in BaseImporter?
  #Or ultimately set the params as part of the user_config?

  return;
}
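
# For example, the class and adaptor names derived in set_config follow
# directly from -input_feature_class:
#
#   'annotated'    -> Bio::EnsEMBL::Funcgen::AnnotatedFeature
#                     via $self->db->get_AnnotatedFeatureAdaptor
#   'segmentation' -> Bio::EnsEMBL::Funcgen::SegmentationFeature
#                     via $self->db->get_SegmentationFeatureAdaptor
#   'result'       -> Bio::EnsEMBL::Funcgen::ResultFeature
#                     via $self->db->get_ResultFeatureAdaptor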

sub define_sets{
  my ($self) = @_;

  my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);

  if(! defined $eset){
    $eset = Bio::EnsEMBL::Funcgen::InputSet->new
      (
       -name          => $self->input_set_name(),
       -experiment    => $self->experiment(),
       -feature_type  => $self->feature_type(),
       -cell_type     => $self->cell_type(),
       -vendor        => $self->vendor(),
       -format        => $self->format(),
       -analysis      => $self->feature_analysis,
       -feature_class => $self->input_feature_class,
      );
    ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
  }

  #Use define_and_validate with fetch/append as we may have a pre-existing set.
  #This now needs to handle ResultSets based on InputSets.

  my $dset = $self->define_and_validate_sets
    (
     -dbadaptor     => $self->db,
     -name          => $self->input_set_name,#or name?
     -feature_type  => $self->feature_type,
     -cell_type     => $self->cell_type,
     -analysis      => $self->feature_analysis,
     -feature_class => $self->input_feature_class,
     -description   => $self->feature_set_description,
     #-append       => 1,#Omit append to ensure we only have this eset
     -recovery      => $self->recovery,
     -supporting_sets => [$eset],
     -slices        => $self->slices,
     #Can't set rollback here, as we don't know until we've validated the files.
     #Can't validate the files until we have the sets.
     #So we're doing this manually in validate_files.
    );

  #We are now using IMPORTED to define whether a FeatureSet has been imported successfully.
  #However we already have IMPORTED on the InputSubset;
  #we should add it to FeatureSet to remain consistent.
  #See Helper::define_and_validate_sets for more notes on
  #potential problems with FeatureSet IMPORTED status.

  #Should define_and_validate_sets also replicate ResultSets?
  #Questionable, mapped reads are never normalised across replicates.
  #They are generally used as input for peak calling individually,
  #so files in this instance are expected to be separate parts of the same replicate,
  #e.g. different chromosomes.
  #Force one input file?
  #What if we want to link several assays (feature/cell_types) to the same experiment?

  $self->{'_data_set'} = $dset;

  return $dset;
}
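
# Downstream code retrieves the output and input sets from the returned
# DataSet (both patterns are used in read_and_import_data below):
#
#   my $dset = $self->define_sets;
#
#   #result class: the supporting set is a ResultSet wrapping the InputSet
#   my $rset = $dset->get_supporting_sets->[0];
#   my $eset = $rset->get_InputSets->[0];
#
#   #annotated/segmentation class: the product FeatureSet plus the InputSet
#   my $fset = $dset->product_FeatureSet;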

#We have rollback functionality incorporated here

sub validate_files{
  my ($self, $prepare) = @_;

  #Get file
  if (! @{$self->result_files()}) {
    my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr
    my @rfiles = `$list`;
    $self->result_files(\@rfiles);
  }

  #We don't yet support multiple files
  if(scalar(@{$self->result_files()}) > 1){
    warn('Found more than one '.$self->vendor." file:\n".
         join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files (e.g. replicates).".
         " We need to resolve how we are going to handle replicates with random cluster IDs");
    #do we even need to?
  }

  #Here we are tracking the import of individual files by adding them as InputSubsets.
  #Recovery would never know what to delete,
  #so we would need to delete all, hence no point in setting status?
  #We do not rollback IMPORTED data here. This is done via separate scripts
  #to reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
  #flag in the run script.

  ### VALIDATE FILES ###
  #We need to validate all the files first, so the import doesn't fall over half way through,
  #or if we come across a rollback halfway through.
  my (%new_data, $eset);
  my $dset = $self->data_set;

  if((scalar(@{$self->slices}) > 1) &&
     ! $prepare){
    throw('Validate files does not yet support multiple Slice rollback');
  }

  #This all assumes that there is only ever 1 InputSet

  if($self->input_feature_class eq 'result'){
    $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
  }
  else{#annotated/segmentation
    $eset = $dset->get_supporting_sets->[0];
  }

  #IMPORTED status here may prevent
  #further slice based imports,
  #so we have to wait to set this until we know all the slices
  #are loaded, unless we store slice based IMPORTED states.
  #We currently get around this by never setting IMPORTED for slice based jobs
  #and always rolling back by slice before import.

  #This loop supports multiple files
  my (@rollback_sets, %file_paths);
  my $auto_rollback = ($self->rollback) ? 0 : 1;

  foreach my $filepath( @{$self->result_files} ) {
    my ($filename, $sub_set);
    chomp $filepath;
    ($filename = $filepath) =~ s/.*\///;
    $file_paths{$filename} = $filepath;
    $filename =~ s/^prepared\.// if $self->prepared; #reset filename to that originally used to create the InputSubsets

    $self->log('Validating '.$self->vendor." file:\t$filename");
    throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links

    if( $sub_set = $eset->get_subset_by_name($filename) ){
      #IMPORTED status here is just for the file.
      #Any changes to analysis or coord_system should result in a different InputSubset(file).
      #Will only ever be imported into one Feature|ResultSet.

      #Currently conflating recover_unimported and rollback
      #as they serve the same purpose until we implement InputSubset level recovery.

      if( $sub_set->has_status('IMPORTED') ){
        $new_data{$filepath} = 0;
        $self->log("Found previously IMPORTED InputSubset:\t$filename");
      }
      else{
        $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
        push @rollback_sets, $sub_set;
      }
    }
    else{
      $self->log("Found new InputSubset:\t${filename}");
      throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
      $new_data{$filepath} = 1;
      $sub_set = $eset->add_new_subset($filename);
      $self->input_set_adaptor->store_InputSubsets([$sub_set]);
    }
  }

  #Does -recover allow a single extra new file to be added to an existing InputSet?

  if(@rollback_sets){ #recoverable sets, i.e. exist but are not IMPORTED

    if( (! $self->recovery) && (! $self->rollback) ){
      throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
            "You must specify -recover or -rollback to perform a full rollback");
    }

    if($self->recovery){
      #Change these to logger->warn
      $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined");
      $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n");
    }
  }

  if($self->rollback){
    #Check we have all existing InputSubset files before we do a full rollback.
    #Can probably remove this if we support InputSubset(file/slice) level rollback.
    $self->log('Rolling back all InputSubsets');
    @rollback_sets = @{$eset->get_InputSubsets};

    foreach my $isset(@rollback_sets){

      if(! exists $file_paths{$isset->name}){
        throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
              "\nAborting rollback as data will be lost. Please specify all existing InputSubset file names");
      }
    }
  }

  foreach my $esset(@rollback_sets){
    #This needs to be mapped to the specified filepaths
    my $fp_key = $esset->name;
    $fp_key    = 'prepared.'.$fp_key if $self->prepared;

    $new_data{$file_paths{$fp_key}} = 1;
    $self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
    $eset->adaptor->revoke_states($esset);

    if(! $prepare){
      #This was to avoid redundant rollback in the prepare step
      $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);

      if($self->input_feature_class eq 'result'){
        #Can we do this by slice for parallelisation?
        #This will only ever be a single ResultSet due to Helper::define_and_validate_sets.
        #Flags are rollback_results and force (as this won't be a direct input to the product feature set).
        $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
        #Do not have rollback_InputSet here as we may have parallel Slice based imports running.
      }
      else{#annotated/segmentation
        $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
        $self->rollback_InputSet($eset);
        last;
      }
    }
  }

  return \%new_data;
}
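
# validate_files returns a hashref mapping each file path to an import flag,
# e.g. (paths hypothetical):
#
#   {
#    '/data/input/ES_DNase1.bed' => 1, #new or rolled back, to be imported
#    '/data/input/ES_DNase2.bed' => 0, #previously IMPORTED, will be skipped
#   }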

sub set_feature_separator{
  my ($self, $separator) = @_;

  #How do we test if something undefined was passed,
  #rather than nothing passed at all?
  #Can't do this as this is the accessor.
  #Need to split the method.

  throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') );

  $self->{'_feature_separator'} = $separator;

}

# SIMPLE ACCESSORS
# Some of these can be called for each record
# Trim the access time as much as possible

sub input_feature_class{ return $_[0]->{'input_feature_class'}; }

sub input_set_name{      return $_[0]->{'input_set_name'}; } #Set in new

sub feature_adaptor{     return $_[0]->{'feature_adaptor'}; }

sub dbentry_adaptor{     return $_[0]->{'dbentry_adaptor'}; }

sub input_set_adaptor{   return $_[0]->{'input_set_adaptor'}; }

sub set{                 return $_[0]->{'set'}; } #Feature or Result set, set in define_sets

##sub slice_adaptor{     return $_[0]->{'slice_adaptor'}; }

sub data_set{            return $_[0]->{'_data_set'}; }

sub feature_separator{   return $_[0]->{'_feature_separator'}; }

sub feature_params{      return $_[0]->{'_feature_params'}; }

sub dbentry_params{      return $_[0]->{'_dbentry_params'}; }

sub input_gzipped{       return $_[0]->{'input_gzipped'}; }


sub input_file_operator{
  my ($self, $op) = @_;
  #Should be set in format parser
  $self->{'input_file_operator'} = $op if defined $op;

  return $self->{'input_file_operator'};
}
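
# input_file_operator is a sprintf-style template passed to open_file
# (compare the "| gzip -c > %s" operator used for prepared output below).
# A hedged example; format parsers set their own value:
#
#   $self->input_file_operator('gzip -dc %s |'); #stream gzipped input
#   my $fh = open_file($filepath, $self->input_file_operator);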

# prepare boolean, simply stores the sets and preprocesses the file
# so we don't get each batch job trying to sort etc

#Still need to implement prepare in other Parsers!!

sub read_and_import_data{
  my ($self, $prepare) = @_;

  my $action = ($prepare) ? 'preparing' : 'importing';

  $self->log("Reading and $action ".$self->vendor()." data");
  my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);

  if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
    throw('prepare mode is only currently implemented for the Bed parser');
  }

  #Test for conflicting run modes
  if($prepare &&
     ($self->batch_job || $self->prepared)){
    #prepare should be called once by the runner, not in each batch_job
    #don't prepare if already prepared
    throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
  }

  my $dset = $self->define_sets;

  #We also need to account for bsub'd slice based import:
  #seq alignments loaded into a ResultSet.
  #Cannot have 0 window for ChIP-Seq alignments,
  #as this would mean storing all the individual reads,
  #hence we need to remap to a new assm before we import!

  if($self->input_feature_class eq 'result'){
    $output_set = $dset->get_supporting_sets->[0];
    $eset       = $output_set->get_InputSets->[0];
    $self->result_set($output_set);#required for ResultFeature Collector and Bed Parser
  }
  else{#annotated/segmentation
    $output_set = $dset->product_FeatureSet;
    $eset       = $dset->get_supporting_sets->[0];
  }

  #If we can do these the other way around we can get define_sets to rollback the FeatureSet.
  #Cyclical dependency for the sets :|
  my $new_data      = $self->validate_files($prepare);
  my $seen_new_data = 0;

  ### READ AND IMPORT FILES ###
  foreach my $filepath(@{$self->result_files()}) {
    chomp $filepath;

    ($filename = $filepath) =~ s/.*\///;
    $self->input_file($filepath); #This is only used by the Collector::ResultFeature::reinitialise_input method

    if($new_data->{$filepath} ){ #This will currently autovivify!
      $seen_new_data = 1;
      $self->{'input_gzipped'} = &is_gzipped($filepath);

      $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');

      $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);

      #We need to be able to optionally open a pipe to gzip | sort here,
      #i.e. define the open command
      $fh = open_file($filepath, $self->input_file_operator);

      #This may become way too large for some reads files.
      #Currently no problems.
      #This is not working as we are sorting the file!
      #$self->parse_header($fh) if $self->can('parse_header');

      #For result features sometimes we want to run
      #locally and just sort without dumping,
      #i.e. if we are not a batch job,
      #as there is no need to dump if it is a single process.

      #Should this be prepared?

      if(($self->input_feature_class eq 'result') && ! $prepare){
        #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr

        #Use the ResultFeature Collector here,
        #omitting the 0 wsize.
        #How are we going to omit 0 wsize when doing the fetch?
        #Simply check the table name in ResultSet?

        #Should we do this for multiple chrs?
        #Or fail here?
        #We need to pass self
        #for access to get_Features_by_Slice,
        #which should be in the specific parser e.g. Bed.

        #Will this not clash with the standard ResultFeature::get_ResultFeatures_by_Slice?
        #Could really do with separating the pure file parsers from the importer code, so these can be reused
        #by other code. Then simply use the Bed import parser for specific import functions and as a wrapper to
        #the Bed file parser.
        #So should really have
        #Parsers::File::Bed
        #and
        #Parsers::Import::Bed
        #This can probably wait until we update BioPerl and just grab the Bed parser from there?

        my $slices = $self->slices;

        #Should this be caught in new?
        if(! @$slices){
          throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
        }

        if(scalar(@$slices) > 1){
          throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
                ."Please submit these to the farm as single slice jobs");
        }

        #Restrict to just 1 slice as we don't yet support disk seeking
        #if the slices are not in the same order as they appear in the file.
        #Also we want to parallelise this.

        #Set as attr for parse_Features_by_Slice in format specific Parsers

        $self->file_handle(open_file($filepath, $self->input_file_operator));

        foreach my $slice(@$slices){
          $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self,
                                                                    (
                                                                     #Force needs reimplementing here?
                                                                     -force            => $self->{force},
                                                                     -dbfile_data_root => $self->{dbfile_data_root},
                                                                    ));
        }

        warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
        #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?

        warn "Closing $filename\nDisregard the following 'Broken pipe' warning";

        #Closing the read end of a pipe before the process writing to it at the other end
        #is done writing results in the writer receiving a SIGPIPE. If the other end can't
        #handle that, be sure to read all the data before closing the pipe.
        #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
        #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
        #Or maybe an eval will catch the STDERR better?
        #sub catch_SIGPIPE{
        #  my $sig = shift @_;
        #  print " Caught SIGPIPE: $sig $1 \n";
        #  return;
        #}
        #$SIG{PIPE} = \&catch_SIGPIPE;
        #my $retval = eval { no warnings 'all'; $fh->close };
        #if($@){
        #  warn "after eval with error $@\nretval is $retval";
        #}
        #Neither of these catch gzip: stdout: Broken pipe

        #IO::UnCompress::Gunzip?

        $fh->close;
      }
      else{

        #Revoke FeatureSet IMPORTED state here in case we fail halfway through
        $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));

        #What about IMPORTED_"CSVERSION"?
        #This may leave us with an incomplete import which still has
        #an IMPORTED_CSVERSION state.
        #We need to depend on IMPORTED for completeness of the set.
        #DAS currently only uses IMPORTED_CSVERSION.
        #This is okay-ish but we also need to write HCs for any sets
        #which do not have the IMPORTED state!
        my ($line, @outlines, $out_fh);

        if($prepare && ! $self->batch_job){
          #Assume we want gzipped output;
          #filename is actually based on input, so may not have gz in the file name
          $out_fh = open_file($self->output_file, "| gzip -c > %s");
        }

        while(defined ($line=<$fh>)){
          #Generic line processing.
          #Move these to parse_line?
          $line =~ s/\r*\n//o;
          next if $line =~ /^\#/;
          next if $line =~ /^$/;

          #This has now been simplified to the process_line method
          #my @fields = split/\t/o, $line;
          #start building parameters hash
          #foreach my $field_index(@{$self->get_field_indices}){
          #  my $field = $self->get_field_by_index($field_index);
          #  $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
          #}

          #We also need to enable different parse_line methods if we have different files,
          #e.g. cisRED.
          #Code refs?

          if($self->parse_line($line, $prepare)){
            $self->count('total parsed lines');

            #Cache or print to sorted file
            if($prepare && ! $self->batch_job){

              if(scalar(@outlines) > 1000){
                print $out_fh join("\n", @outlines)."\n";
                @outlines = ();
              }
              else{
                push @outlines, $line;
              }
            }
          }
        }

        close($fh);

        #Print the last of the sorted file
        if($prepare && ! $self->batch_job){
          print $out_fh join("\n", @outlines)."\n";
          close($out_fh);
          @outlines = ();
        }

        if(! $prepare){
          #Now we need to deal with anything left in the read cache
          $self->process_params if $self->can('process_params');

          #To speed things up we may need to also do file based import here with WRITE lock?
          #mysqlimport will write lock the table by default?

          #reset filename to that originally used to create the InputSubsets
          $filename =~ s/^prepared\.// if $self->prepared;

          my $sub_set = $eset->get_subset_by_name($filename);
          $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
        }
      }

      if($prepare){
        $self->log("Finished preparing import from:\t$filepath");
      }
      else{
        #Need to tweak this for slice based import
        $self->log('Finished importing '.$self->counts('features').' '.
                   $output_set->name." features from:\t$filepath");
      }

      #This currently fails here if the uncaught file sort was not successful

      foreach my $key (keys %{$self->counts}){
        $self->log("Count $key:\t".$self->counts($key));
      }
    }
  }

  #Here we should set IMPORTED on the FeatureSet.
  #We could also log the first dbID of each feature in a subset to facilitate subset rollback
  #in the feature table. This would be sketchy at best:
  #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
  #This may already have IMPORTED status as we don't revoke the status whilst
  #updating, to protect the feature set due to the lack of supporting set tracking;
  #see Helper::defined_and_validate_sets for more notes.
  #Is there any point in setting it if we don't revoke it?
  #To allow consistent status handling across sets. Just need to be aware of the fset status caveat.
  #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job.

  if(! $prepare){
    $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
    $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
    $self->log("Finished parsing and importing results");
  }

  return;
}


#Should be called from a format parser e.g. BED, GFF, eQTL etc.
#Why don't we pass feature_params and dbentry_params directly?

sub load_feature_and_xrefs{
  my $self = shift;

  #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
  #This now only fails once on the first run and then
  #need to count based on feature_type?

  #new rather than new_fast here as we want to validate the import
  my $feature = $self->{feature_class}->new(%{$self->feature_params});
  ($feature)  = @{$self->feature_adaptor->store($feature)};
  $self->count('features');

  #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key?

  ##This needs to be handled in the caller as we are validating loci?
  #if($self->ucsc_coords){
  #  $start += 1;
  #  $end   += 1;
  #}

  #This needs to be put in a separate sub and called by the caller
  #if(! $self->cache_slice($chr)){
  #  warn "Skipping AnnotatedFeature import, found non-standard chromosome: $chr";
  #}
  #else{
  #  #grab seq if dump fasta and available
  #  my $seq;
  #  if(exists $self->feature_params->{'sequence'}){
  #    $seq = $self->feature_params->{'sequence'};
  #    delete $self->feature_params->{'sequence'};
  #  }
  #  else{
  #    $self->log('No fasta sequence available for '.$self->feature_params->display_label);
  #  }
  #}

  #dump fasta here
  #if ($self->dump_fasta){
  #  $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
  #}

  #Store the xrefs

  foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
    my $ftype = $dbentry_hash->{feature_type};
    delete $dbentry_hash->{feature_type};

    my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
    $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is the ignore release flag
    #count here? no, count in caller
  }

  #Clean data cache
  $self->{'_feature_params'} = {};
  $self->{'_dbentry_params'} = [];

  return $feature;
}
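
# A hedged sketch of the caches a format parser populates before calling
# load_feature_and_xrefs (the xref values are hypothetical):
#
#   $self->{'_feature_params'} = { -START => $start, -END => $end, ... };
#
#   push @{$self->{'_dbentry_params'}},
#     { feature_type => 'AnnotatedFeature',  #passed through to DBEntryAdaptor::store
#       -primary_id  => 'ENSG00000139618',   #hypothetical external ID
#       -dbname      => 'ensembl_gene' };    #hypothetical external db name
#
#   my $feature = $self->load_feature_and_xrefs;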

#This should really be handled in Bio::EnsEMBL::Feature?
#Move to Helper?

sub set_strand{
  my ($self, $strand) = @_;

  my $ens_strand = 0;

  my %strand_vals = (
                     '1'  =>  1,
                     '0'  =>  0,
                     '-1' => -1,
                     '+'  =>  1,
                     '-'  => -1,
                     '.'  =>  0,
                    );

  if($strand){

    if(exists $strand_vals{$strand}){
      $ens_strand = $strand_vals{$strand};
    }
    else{
      throw("Could not identify strand value for $strand");
    }
  }

  return $ens_strand;
}

sub total_features{
  my ($self, $total) = @_;

  $self->{'total_features'} = $total if defined $total;
  return $self->{'total_features'};
}

#Currently only required for Bed::parse_Features_by_Slice

#filehandle

sub file_handle{
  my ($self, $fh) = @_;

  $self->{'file_handle'} = $fh if defined $fh;
  return $self->{'file_handle'};
}

sub result_set{
  my ($self, $rset) = @_;

  #already tested/created by self

  $self->{'result_set'} = $rset if $rset;
  return $self->{'result_set'};
}

1;
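
# set_strand usage sketch, e.g. when mapping a BED strand column
# (return values follow the %strand_vals map above):
#
#   $self->set_strand('+'); # returns  1
#   $self->set_strand('-'); # returns -1
#   $self->set_strand('.'); # returns  0
#   $self->set_strand();    # undef/empty defaults to 0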