annotate variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/InputSet.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::InputSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 developers list at <ensembl-dev@ebi.ac.uk>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 Bio::EnsEMBL::Funcgen::Parsers::InputSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35 This is a base class to support simple file format parsers. For simple imports the vendor is
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 set to the parser type i.e. the file format. The generic read_and_import_simple_data assumes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 a one line per feature format; other formats need their own read_and_import_format_data method,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 which will need defining in the result_data config element. Features are stored either as
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 ResultFeature collections or AnnotatedFeatures, dependent on the input feature class.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 # To do
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 # Add Parsers for BAM/SAM
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 # Rename to InputSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 # Handle mysqlimport for large data sets e.g. reads
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47 # Incorporate collection code
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 # Implement matrix storage
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 package Bio::EnsEMBL::Funcgen::Parsers::InputSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 use Bio::EnsEMBL::Funcgen::SegmentationFeature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(species_chr_num open_file is_gzipped);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 #use Bio::EnsEMBL::Funcgen::Utils::Helper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 #config stuff, move to BaseImporter?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61 use Bio::EnsEMBL::Analysis;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 use Bio::EnsEMBL::Funcgen::FeatureType;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 use base qw(Bio::EnsEMBL::Funcgen::Parsers::BaseImporter); #@ISA change to parent with perl 5.10
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 #use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 #@ISA = qw(Bio::EnsEMBL::Funcgen::Utils::Helper);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 my %valid_types = (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 result => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 annotated => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 segmentation => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 =head2 new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 Example : my $self = $class->SUPER::new(@_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 Description: Constructor method for the InputSet parser class
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 Returntype : Bio::EnsEMBL::Funcgen::Parsers::InputSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 Exceptions : throws if caller is not Importer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 Caller : Bio::EnsEMBL::Funcgen::Parsers:Simple
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 Status : at risk
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
86 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
87
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
88
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
89 sub new{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
90 my $caller = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
91
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
92 my $class = ref($caller) || $caller;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
93 my $self = $class->SUPER::new(@_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
94
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
95 # my $config_file;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
96
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
97
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
98 ($self->{'input_set_name'},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
99 $self->{'input_feature_class'},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
100 #$self->{'slices'},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
101 $self->{total_features},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
102 $self->{force}, #Is this generic enough to go in Importer? used by store_window_bins_by_Slice_Parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
103 $self->{dbfile_data_root}, #only appropriate for result input_feature_class
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
104 # $config_file, #User defined config hash file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
105 ) = rearrange(['input_set_name', 'input_feature_class',
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
106 'total_features', 'force', 'dbfile_data_root'], @_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
107
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
108
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
109 #Could potentially take fields params directly to define a custom format
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
110 #Take direct field mappings, plus special fields which needs parsing differently
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
111 #i.e. default is tab delimited, and GFF would define Attrs field as compound field and provide special parsing and field mapping
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
112
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
113
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
114 throw("This is a skeleton class for Bio::EnsEMBL::Importer, should not be used directly")
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
115 if(! $self->isa("Bio::EnsEMBL::Funcgen::Importer"));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
116
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
117 $self->{'config'} =
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
118 {(
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
119 #can we omit these?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
120 array_data => [],#['experiment'],
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
121 probe_data => [],#["probe"],
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
122 norm_method => undef,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
123 #protocols => {()},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
124 'results_data' => ["and_import"],
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
125 )};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
126
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
127 #set up feature params
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
128 $self->{'_feature_params'} = {};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
129 $self->{'_dbentry_params'} = [];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131 #$self->{'counts'} = {};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
132 #$self->{'slices'} = [];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
133 #$self->{'seq_region_names'} = [];#Used for slice based import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
134
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
135
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
136 # USER CONFIG #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
137 #Here we need to read config based on external file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
138 #Should do something similar to set_feature_sets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
139 #and validate_and_store_feature_types in BaseExternalParser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
140 #but we are using define and validate sets instead
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
141
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
142 #BaseExternalParser and BaseImporter really need to be merged
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
143 #After we have stripped out all the array/experiment specific stuff
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
144
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
145 #Do dev here so we are not developing more stuff in the Importer which will need migrating
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
146 #to the BaseImporter
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
147
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
148 #if($config_file){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
149 # my $config;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
150
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
151 # $self->log("Reading config file:\t".$config_file);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
152
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153 # if(! ($config = do "$config_file")){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154 # throw("Couldn't parse config file:\t$config_file:\n$@") if $@;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155 # throw("Couldn't do config:\t$config_file\n$!") if ! defined $config;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
156 # throw("Couldn't compile config_file:\t$config_file") if ! $config;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
157 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
158
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
159 # #At least check it is hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
160 # if(ref($config) ne 'HASH'){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
161 # throw("Config file does not define a valid HASH:\t$config_file");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
162 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
163 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
164 # $self->{user_config} = $config;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
165 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
166
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
167
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
168 return $self;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
169 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
170
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
171
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
172 sub output_file{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
173 my ($self, $output_file) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
174
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
175 $self->{'output_file'} = $output_file if $output_file;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
176 return $self->{'output_file'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
177 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
178
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
179 sub input_file{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
180 my ($self, $input_file) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
181
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
182 $self->{'input_file'} = $input_file if $input_file;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
183 return $self->{'input_file'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
184 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
185
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
186
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
187
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
188 =head2 set_config
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
189
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
190 Example : $self->set_config;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
191 Description: Sets attribute dependent config
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
192 Returntype : None
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
193 Exceptions : None
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
194 Caller : Bio::EnsEMBL::Funcgen::Importer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
195 Status : at risk
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
196
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
197 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
198
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
199
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
200 sub set_config{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
201 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
202
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
203 #Move all this to new when we fix the inheritance in Importer
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
204
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
205 #We could set input_set_name to experiment name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
206 #But we would have to make warning in define_and_validate_sets mention -input_set_name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208 throw('Must provide an -input_set name for a '.uc($self->vendor).' import') if ! defined $self->input_set_name();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 #Mandatory checks
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 if(! defined $self->feature_analysis){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 throw('Must define a -feature_analysis parameter for '.uc($self->vendor).' imports');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 if(! exists $valid_types{$self->input_feature_class}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 throw("You must define a valid input_feature_class:\t".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 join(', ', keys %valid_types));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 $self->{'feature_class'} = 'Bio::EnsEMBL::Funcgen::'.ucfirst($self->input_feature_class).'Feature';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224 #We need to undef norm method as it has been set to the env var
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 $self->{'config'}{'norm_method'} = undef;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 #dirs are not set in config to enable generic get_dir method access
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 $self->{dbfile_data_root} ||= $self->get_dir('output');#Required for Collector
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 #some convenience methods
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 my $adaptor_method = 'get_'.ucfirst($self->input_feature_class).'FeatureAdaptor';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 $self->{'feature_adaptor'} = $self->db->$adaptor_method;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 $self->{'dbentry_adaptor'} = $self->db->get_DBEntryAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 $self->{'input_set_adaptor'} = $self->db->get_InputSetAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 ##$self->{'slice_adaptor'} = $self->db->dnadb->get_SliceAdaptor;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 #Validate slices
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 $self->slices($self->{'slices'}) if defined $self->{'slices'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240 #Move to new when we sort out inheritance
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 $self->validate_and_store_config([$self->name]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242 #Could use input_set_name here?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 #This was to support >1 input set per experiment (name)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 #This currently breaks for no config imports
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246 #i.e. standard Bed import e.g. result_feature collections
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 #segmentation imports use Bed and config
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248 #allow no config imports in BaseImporter?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249 #or ultimately set the params as part of the user_config?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251 return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
252 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
253
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
254
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
255 sub define_sets{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
256 my ($self) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
257
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
258 my $eset = $self->db->get_InputSetAdaptor->fetch_by_name($self->input_set_name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
259
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
260 if(! defined $eset){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
261 $eset = Bio::EnsEMBL::Funcgen::InputSet->new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
262 (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
263 -name => $self->input_set_name(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
264 -experiment => $self->experiment(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
265 -feature_type => $self->feature_type(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
266 -cell_type => $self->cell_type(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
267 -vendor => $self->vendor(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
268 -format => $self->format(),
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
269 -analysis => $self->feature_analysis,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
270 -feature_class => $self->input_feature_class,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
271 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
272 ($eset) = @{$self->db->get_InputSetAdaptor->store($eset)};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
273 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
274
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
275 #Use define_and_validate with fetch/append as we may have a pre-existing set
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
276 #This now needs to handle ResultSets based on InputSets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
277
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
278
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
279 my $dset = $self->define_and_validate_sets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
280 (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
281 -dbadaptor => $self->db,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
282 -name => $self->input_set_name,#or name?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
283 -feature_type => $self->feature_type,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
284 -cell_type => $self->cell_type,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
285 -analysis => $self->feature_analysis,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
286 -feature_class=> $self->input_feature_class,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
287 -description => $self->feature_set_description,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
288 #-append => 1,#Omit append to ensure we only have this eset
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
289 -recovery => $self->recovery,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
290 -supporting_sets => [$eset],
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
291 -slices => $self->slices,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
292 #Can't set rollback here, as we don't know until we've validated the files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
293 #Can't validate the files until we have the sets.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
294 #So we're doing this manually in validate_files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
295 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
296
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
297 #We are now using IMPORTED to define whether a FeatureSet has been imported successfully
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
298 #However we already have IMPORTED on the InputSubSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
299 #We should add it to FeatureSet to remain consistent.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
300 #See Helper::define_and_validate_sets for more notes on
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
301 #potential problems with FeatureSet IMPORTED status
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
302
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
303
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
304 #define_and_validate_sets should also replicate ResultSets?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
305 #Questionable, mapped reads are never normalised across replicates
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
306 #They are generally used as input for peak calling individually.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
307 #So files in this instance are expected to be separate parts of the same replicate
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
308 #e.g. different chromosomes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
309 #Force one input file?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
310 #What if we want to link several assays(feature/cell_types) to the same experiment?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
311
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
312 $self->{'_data_set'} = $dset;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
313
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
314 return $dset;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
315 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
316
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
317
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
318
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
319
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
320 #we have rollback functionality incorporated here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
321
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
322 sub validate_files{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
323 my ($self, $prepare) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
324
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
325 #Get file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
326 if (! @{$self->result_files()}) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
327 my $list = "ls ".$self->get_dir('input').'/'.$self->input_set_name().'*.';#.lc($self->vendor);#could use vendor here? Actually need suffix attr
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
328 my @rfiles = `$list`;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
329 $self->result_files(\@rfiles);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
330 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
331
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
332 #We don't yet support multiple files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
333 if(scalar(@{$self->result_files()}) >1){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
334 warn('Found more than one '.$self->vendor." file:\n".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
335 join("\n", @{$self->result_files()})."\nThe InputSet parser does not yet handle multiple input files(e.g. replicates).".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
336 " We need to resolve how we are going handle replicates with random cluster IDs");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
337 #do we even need to?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
338 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
339
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
340 #Here we are tracking the import of individual files by adding them as InputSubSets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
341 #Recovery would never know what to delete
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
342 #So would need to delete all, Hence no point in setting status?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343 #We do not rollback IMPORTED data here. This is done via separate scripts
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344 #To reduce the risk of accidentally deleting/overwriting data by leaving a stray -rollback
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345 #flag in the run script
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 ### VALIDATE FILES ###
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348 #We need to validate all the files first, so the import doesn't fall over half way through
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 #Or if we come across a rollback halfway through
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350 my (%new_data, $eset);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351 my $dset = $self->data_set;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
352
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
353 if((scalar(@{$self->slices}) > 1) &&
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
354 ! $prepare){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
355 throw('Validate files does not yet support multiple Slice rollback');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
356 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
357
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
358 #This all assumes that there is only ever 1 InputSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
359
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
360 if($self->input_feature_class eq 'result'){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
361 $eset = $dset->get_supporting_sets->[0]->get_InputSets->[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
362 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
363 else{#annotated/segmentation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
364 $eset = $dset->get_supporting_sets->[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
365 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
366
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
367
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
368 #IMPORTED status here may prevent
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
369 #further slice based imports
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
370 #so we have to wait to set this until we know all the slices
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
371 #are loaded, unless we store slice based IMPORTED states
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
372 #We currently get around this by never setting IMPORTED for slice based jobs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
373 #and always rolling back by slice before import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
374
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
375 #This loop supports multiple files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
376 my (@rollback_sets, %file_paths);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
377 my $auto_rollback = ($self->rollback) ? 0 : 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
378
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
379 foreach my $filepath( @{$self->result_files} ) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
380 my ($filename, $sub_set);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
381 chomp $filepath;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
382 ($filename = $filepath) =~ s/.*\///;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
383 $file_paths{$filename} = $filepath;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
384 $filename =~ s/^prepared\.// if $self->prepared; #reset filename to that originally used to create the Inputsubsets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
385
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
386 $self->log('Validating '.$self->vendor." file:\t$filename");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
387 throw("Cannot find ".$self->vendor." file:\t$filepath") if(! -e $filepath);#Can deal with links
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
388
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
389 if( $sub_set = $eset->get_subset_by_name($filename) ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
390 #IMPORTED status here is just for the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
391 #Any changes to analysis or coord_system should result in different InputSubset(file)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
392 #Will only ever be imported into one Feature|ResultSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
393
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
394 #Currently conflating recover_unimported and rollback
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
395 #as they serve the same purpose until we implement InputSubset level recovery
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
396
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
397
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
398 if( $sub_set->has_status('IMPORTED') ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
399 $new_data{$filepath} = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
400 $self->log("Found previously IMPORTED InputSubset:\t$filename");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
401 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
402 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
403 $self->log("Found existing InputSubset without IMPORTED status:\t$filename");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
404 push @rollback_sets, $sub_set;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
405 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
406 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
407 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
408 $self->log("Found new InputSubset:\t${filename}");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
409 throw("Should not have found new 'prepared' file:\t$filename") if $self->prepared;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
410 $new_data{$filepath} = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
411 $sub_set = $eset->add_new_subset($filename);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
412 $self->input_set_adaptor->store_InputSubsets([$sub_set]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
413 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
414 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
415
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
416
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
417 #Does -recover allow a single extra new file to be added to an existing InputSet?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
418
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
419 if(@rollback_sets && #recoverable sets i.e. exists but not IMPORTED
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
420 ( (! $self->recovery) && (! $self->rollback) ) ){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
421 throw("Found partially imported InputSubsets:\n\t".join("\n\t", (map $_->name, @rollback_sets))."\n".
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
422 "You must specify -recover or -rollback to perform a full rollback");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
423
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
424 if($self->recovery){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
425 #Change these to logger->warn
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
426 $self->log("WARNING::\tCannot yet rollback for just an InputSubset, rolling back entire set? Unless slices defined");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
427 $self->log("WARNING::\tThis may be deleting previously imported data which you are not re-importing..list?!!!\n");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
428 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
429 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
430
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
431
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
432 if($self->rollback){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
433 #Check we have all existing InputSubsets files before we do full rollback
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
434 #Can probably remove this if we support InputSubset(file/slice) level rollback
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
435 $self->log('Rolling back all InputSubsets');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
436 @rollback_sets = @{$eset->get_InputSubsets};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
437
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
438 foreach my $isset(@rollback_sets){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
439
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
440 if(! exists $file_paths{$isset->name}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
441 throw("You are attempting a multiple InputSubset rollback without specifying an existing InputSubset:\t".$isset->name.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
442 "\nAborting rollback as data will be lost. Please specifying all existing InputSubset file names");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
443 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
444 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
445 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
446
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
447
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
448 foreach my $esset(@rollback_sets){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
449 #This needs to be mapped to the specified filepaths
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
450 my $fp_key = $esset->name;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
451 $fp_key = 'prepared.'.$fp_key if $self->prepared;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
452
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
453 $new_data{$file_paths{$fp_key}} = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
454 $self->log("Revoking states for InputSubset:\t\t\t".$esset->name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
455 $eset->adaptor->revoke_states($esset);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
456
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
457 if(! $prepare){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
458 #This was to avoid redundant rollback in prepare step
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
459 $self->log("Rolling back InputSubset:\t\t\t\t".$esset->name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
460
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
461 if($self->input_feature_class eq 'result'){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
462 #Can we do this by slice for parallelisation?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
463 #This will only ever be a single ResultSet due to Helper::define_and_validate_sets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
464 #flags are rollback_results and force(as this won't be a direct input to the product feature set)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
465 $self->rollback_ResultSet($self->data_set->get_supporting_sets->[0], 1, $self->slices->[0], 1);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
466 #Do not have rollback_InputSet here as we may have parallel Slice based imports running
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
467 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
468 else{#annotated/segmentation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
469 $self->rollback_FeatureSet($self->data_set->product_FeatureSet, undef, $self->slices->[0]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
470 $self->rollback_InputSet($eset);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
471 last;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
472 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
473 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
474 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
475
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
476 return \%new_data;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
477 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
478
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
479
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
480
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
481
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
482 sub set_feature_separator{ #Setter: stores $separator under the '_feature_separator' key; throws if it is undef or ''
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
483 my ($self, $separator) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
484
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
485 #How do we test if something undefined was passed?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
486 #Rather than nothing passed at all?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
487 #Can't do this as this is the accessor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
488 #Need to split method (NOTE(review): a separate read-only feature_separator accessor exists in this file, so the split appears to be done - confirm and drop these notes)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
489
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
490 throw('Must provide a valid feature separator') if ( (! defined $separator) || ($separator eq '') ); #Guard: both undef and the empty string are rejected before anything is stored
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
491
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
492 $self->{'_feature_separator'} = $separator; #Read back through the feature_separator accessor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
493
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
494 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
495
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
496 # SIMPLE ACCESSORS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
497 # Some of these can be called for each record
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
498 # Trim the access time as much as possible
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
499
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
500 sub input_feature_class{ return $_[0]->{'input_feature_class'}; } #Getter only; callers in this module compare the value against 'result' to choose the ResultSet path over the annotated/segmentation path
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
501
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
502 sub input_set_name{ return $_[0]->{'input_set_name'}; } #Getter only. Set in new
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
503
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
504 sub feature_adaptor{ return $_[0]->{'feature_adaptor'}; } #Getter only; read_and_import_data calls store_window_bins_by_Slice_Parser on the returned adaptor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
505
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
506 sub dbentry_adaptor{ return $_[0]->{'dbentry_adaptor'}; } #Getter only for the 'dbentry_adaptor' attribute; any arguments are ignored
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
507
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
508 sub input_set_adaptor{ return $_[0]->{'input_set_adaptor'}; } #Getter only; the file validation code calls store_InputSubsets on the returned adaptor
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
509
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
510 sub set{ return $_[0]->{'set'}; } #Getter only. Feature or Result, set in define_sets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
511
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
512 ##sub slice_adaptor{ return $_[0]->{'slice_adaptor'}; }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
513
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
514 sub data_set{ return $_[0]->{'_data_set'}; } #Getter only; note the underscore-prefixed '_data_set' hash key
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
515
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
516 sub feature_separator{ return $_[0]->{'_feature_separator'}; } #Getter only; the value is written by set_feature_separator
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
517
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
518 sub feature_params{ return $_[0]->{'_feature_params'}; } #Getter only for the underscore-prefixed '_feature_params' key; any arguments are ignored
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
519
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
520 sub dbentry_params{ return $_[0]->{'_dbentry_params'}; } #Getter only for the underscore-prefixed '_dbentry_params' key; any arguments are ignored
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
521
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
522 sub input_gzipped{ return $_[0]->{'input_gzipped'}; } #Getter only; read_and_import_data assigns this key from is_gzipped($filepath) for each new file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
523
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
524
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
525 sub input_file_operator{ #Combined getter/setter for the 'input_file_operator' attribute
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
526 my ($self, $op) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
527 #Should be set in format parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
528 $self->{'input_file_operator'} = $op if defined $op; #Only overwrite when a defined value is passed, so a call without arguments acts as a pure getter
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
529
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
530 return $self->{'input_file_operator'}; #Current value; read_and_import_data passes it as the second argument to open_file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
531 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
532
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
533 # prepare boolean, simply stores the sets and preprocesses the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
534 # so we don't get each batch job trying to sort etc
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
535
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
536
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
537 #Still need to implement prepare in other Parsers!!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
538
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
539 sub read_and_import_data{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
540 my ($self, $prepare) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
541
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
542 my $action = ($prepare) ? 'preparing' : 'importing';
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
543
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
544 $self->log("Reading and $action ".$self->vendor()." data");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
545 my ($eset, $filename, $output_set, $fh, $f_out, %feature_params, @lines);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
546
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
547 if($prepare && ! $self->isa('Bio::EnsEMBL::Funcgen::Parsers::Bed')){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
548 throw('prepare mode is only currently implemented for the Bed parser');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
549 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
550
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
551
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
552 #Test for conflicting run modes
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
553 if($prepare &&
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
554 ($self->batch_job || $self->prepared)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
555 #prepare should be called once by the runner, not in each batch_job
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
556 #don't prepare if already prepared
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
557 throw('You cannot run read_and_import_data in prepare mode with a -batch_job or -prepared job');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
558 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
559
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
560 my $dset = $self->define_sets;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
561
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
562 #We also need to account for bsub'd slice based import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
563 #seq alignments loaded into a ResultSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
564 #Cannot have 0 window for ChIP Seq alignments
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
565 #As this would mean storing all the individual reads
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
566 #Hence we need to remap to a new assm before we import!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
567
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
568 if($self->input_feature_class eq 'result'){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
569 $output_set = $dset->get_supporting_sets->[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
570 $eset = $output_set->get_InputSets->[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
571 $self->result_set($output_set);#required for ResultFeature Collector and Bed Parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
572 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
573 else{#annotated/segmentation
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
574 $output_set = $dset->product_FeatureSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
575 $eset = $dset->get_supporting_sets->[0];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
576 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
577
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
578
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
579 #If we can do these the other way around we can get define_sets to rollback the FeatureSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
580 #Cyclical dependency for the sets :|
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
581 my $new_data = $self->validate_files($prepare);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
582 my $seen_new_data = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
583
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
584
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
585 ### READ AND IMPORT FILES ###
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
586 foreach my $filepath(@{$self->result_files()}) {
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
587 chomp $filepath;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
588
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
589 ($filename = $filepath) =~ s/.*\///;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
590 $self->input_file($filepath); #This is only used by Collector::ResultFeature::reinitialise_input method
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
591
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
592 if($new_data->{$filepath} ){ #This will currently autovivify!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
593 $seen_new_data = 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
594 $self->{'input_gzipped'} = &is_gzipped($filepath);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
595
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
596 $filepath = $self->pre_process_file($filepath, $prepare) if $self->can('pre_process_file');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
597
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
598 $self->log_header(ucfirst($action).' '.$self->vendor." file:\t".$filepath);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
599
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
600 #We need to be able to optionally open a pipe to gzip | sort here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
601 #i.e. define open command
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
602 $fh = open_file($filepath, $self->input_file_operator);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
603
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
604 #This may become way too large for some reads files
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
605 #Currently no problems
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
606 #This is not working as we are sorting the file!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
607 #$self->parse_header($fh) if $self->can('parse_header');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
608
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
609 #For result features some times we want to run
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
610 #locally and just sort without dumping
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
611 #i.e if we are not a batch job
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
612 #as there is no need to dump if it is a single process
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
613
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
614
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
615 #Should this be prepared?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
616
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
617 if((($self->input_feature_class eq 'result') && ! $prepare)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
618 #(($self->input_feature_class eq 'result') && (! $self->batch_job))){ #Local run on just 1 chr
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
619 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
620
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
621 #Use the ResultFeature Collector here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
622 #Omitting the 0 wsize
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
623 #How are we going to omit 0 wsize when doing the fetch?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
624 #simply check table name in ResultSet?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
625
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
626 #Should we do this for multiple chrs?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
627 #or fail here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
628 # we need to pass self
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
629 #for access to get_Features_by_Slice
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
630 #which should be in the specific parser e.g Bed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
631
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
632 #Will this not clash with standard ResultFeature::get_ResultFeatures_by_Slice?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
633 #Could really do with separating the pure file parsers from the importer code, so these can be reused
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
634 #by other code. Then simply use Bed import parser for specific import functions and as wrapper to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
635 #Bed file parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
636 #So should really have
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
637 #Parsers::File::Bed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
638 #and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
639 #Parsers::Import::Bed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
640 #This can probably wait until we update BioPerl and just grab the Bed parser from there?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
641
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
642 my $slices = $self->slices;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
643
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
644 #Should this be caught in new?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
645 if(! @$slices){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
646 throw("You must define a slice to generate ResultFeature Collections from InputSet:\t".$eset->name);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
647 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
648
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
649
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
650 if(scalar(@$slices) > 1){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
651 throw("InputSet parser does not yet support multi-Slice import for ResultFeature collections\n"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
652 ."Please submit these to the farm as single slice jobs");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
653 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
654
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
655 #restrict to just 1 slice as we don't yet support disk seeking
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
656 #if the slices are not in the same order as they appear in the file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
657 #also we want to parallelise this
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
658
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
659 #Set as attr for parse_Features_by_Slice in format specific Parsers
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
660
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
661
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
662 $self->file_handle(open_file($filepath, $self->input_file_operator));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
663
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
664
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
665 foreach my $slice(@$slices){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
666 $self->feature_adaptor->store_window_bins_by_Slice_Parser($slice, $self,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
667 (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
668 #Force needs reimplementing here?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
669 -force => $self->{force},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
670 -dbfile_data_root => $self->{dbfile_data_root},
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
671 ));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
672 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
673
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
674 warn "Need to update InputSubset status to IMPORTED after all slices have been loaded";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
675 #Do we even need to set RESULT_FEATURE_SET for input_set ResultFeatures?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
676
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
677
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
678
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
679 warn "Closing $filename\nDisregard the following 'Broken pipe' warning";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
680
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
681 #Closing the read end of a pipe before the process writing to it at the other end
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
682 #is done writing results in the writer receiving a SIGPIPE. If the other end can't
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
683 #handle that, be sure to read all the data before closing the pipe.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
684 #This suggests the gzip pipe has not finished reading, but it *should* be at the end of the file?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
685 #$SIG{PIPE} = 'IGNORE'; #Catch with subref and warn instead?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
686 #Or maybe an eval will catch the STDERR better?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
687 #sub catch_SIGPIPE{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
688 # my $sig = shift @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
689 # print " Caught SIGPIPE: $sig $1 \n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
690 # return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
691 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
692 #}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
693 #$SIG{PIPE} = \&catch_SIGPIPE;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
694 #my $retval = eval { no warnings 'all'; $fh->close };
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
695 #if($@){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
696 # warn "after eval with error $@\nretval is $retval";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
697 #}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
698 #Neither of these catch gzip: stdout: Broken pipe
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
699
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
700 #IO::UnCompress::Gunzip?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
701
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
702
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
703 $fh->close;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
704 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
705 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
706
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
707
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
708 #Revoke FeatureSet IMPORTED state here in case we fail halfway through
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
709 $output_set->adaptor->revoke_status('IMPORTED', $output_set) if ($output_set->has_status('IMPORTED') && (! $prepare));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
710
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
711 #What about IMPORTED_"CSVERSION"
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
712 #This may leave us with an incomplete import which still has
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
713 #an IMPORTED_CSVERSION state
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
714 #We need to depend on IMPORTED for completeness of set
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
715 #DAS currently only uses IMPORTED_CSVERSION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
716 #This is okayish but we also need to write HCs for any sets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
717 #which do not have IMPORTED state!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
718 my ($line, @outlines, $out_fh);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
719
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
720
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
721 if($prepare && ! $self->batch_job){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
722 #Assume we want gzipped output
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
723 #filename is actually based on input, so may not have gz in file name
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
724 $out_fh = open_file($self->output_file, "| gzip -c > %s");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
725 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
726
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
727
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
728 while(defined ($line=<$fh>)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
729 #Generic line processing
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
730 #Move these to parse_line?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
731 $line =~ s/\r*\n//o;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
732 next if $line =~ /^\#/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
733 next if $line =~ /^$/;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
734
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
735 #This has now been simplified to process_line method
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
736 #my @fields = split/\t/o, $line;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
737 #start building parameters hash
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
738 #foreach my $field_index(@{$self->get_field_indices}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
739 # my $field = $self->get_field_by_index($field_index);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
740 # $feature_params = ($field =~ /^-/) ? $fields[$field_index] : $self->$field($fields[$field_index]);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
741 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
742
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
743
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
744 #We also need to enable different parse line methods if we have different file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
745 #e.g. cisRED
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
746 #Code refs?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
747
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
748
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
749 if($self->parse_line($line, $prepare)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
750 $self->count('total parsed lines');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
751
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
752 #Cache or print to sorted file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
753 if($prepare && ! $self->batch_job){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
754
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
755 if(scalar(@outlines) >1000){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
756 print $out_fh join("\n", @outlines)."\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
757 @outlines = ();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
758 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
759 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
760 push @outlines, $line;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
761 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
762 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
763 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
764 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
765
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
766 close($fh);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
767
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
768 #Print last of sorted file
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
769 if($prepare && ! $self->batch_job){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
770 print $out_fh join("\n", @outlines)."\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
771 close($out_fh);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
772 @outlines = ();
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
773 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
774
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
775 if(! $prepare){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
776 #Now we need to deal with anything left in the read cache
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
777 $self->process_params if $self->can('process_params');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
778
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
779 #To speed things up we may need to also do file based import here with WRITE lock?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
780 #mysqlimport will write lock the table by default?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
781
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
782 #reset filename to that originally used to create the Inputsubsets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
783 $filename =~ s/^prepared\.// if $self->prepared;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
784
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
785 my $sub_set = $eset->get_subset_by_name($filename);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
786 $sub_set->adaptor->store_status('IMPORTED', $sub_set) if ! $self->batch_job;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
787 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
788 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
789
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
790
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
791 if($prepare){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
792 $self->log("Finished preparing import from:\t$filepath");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
793 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
794 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
795 #Need to tweak this for slice based import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
796 $self->log('Finished importing '.$self->counts('features').' '.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
797 $output_set->name." features from:\t$filepath");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
798
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
799 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
800
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
801
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
802 #This currently fails here if the uncaught file sort was not successful
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
803
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
804 foreach my $key (keys %{$self->counts}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
805 $self->log("Count $key:\t".$self->counts($key));
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
806 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
807 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
808 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
809
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
810 #Here we should set IMPORTED on the FeatureSet
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
811 #We could also log the first dbID of each feature in a subset to facilitate subset rollback
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
812 #in feature table
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
813 #this would be sketchy at best
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
814 #delete from annotated_feature where annotated_feature_id >= $first_subset_feature_id and feature_set_id=$feature_set_id
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
815 #This may already have IMPORTED status as we don't revoke the status whilst
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
816 #updating to protect the feature set due to lack of supportingset tracking
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
817 #see Helper::defined_and_validate_sets for more notes.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
818 #Is there any point in setting it if we don't revoke it?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
819 #To allow consistent status handling across sets. Just need to be aware of fset status caveat.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
820 #Also currently happens with ResultFeatures loaded by slice jobs, as this may already be set by a parallel job
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
821
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
822 if(! $prepare){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
823 $output_set->adaptor->set_imported_states_by_Set($output_set) if $seen_new_data && ! $self->batch_job;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
824 $self->log("No new data, skipping result parse") if ! grep /^1$/o, values %{$new_data};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
825 $self->log("Finished parsing and importing results");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
826 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
827
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
828 return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
829 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
830
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
831
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
832 #Should be called from format parser e.g. BED, GFF, eQTL etc
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
833 #Why don't we pass feature_params and dbentry_params directly?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
834
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
835 sub load_feature_and_xrefs{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
836 my $self = shift;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
837
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
838 #warn "Loading ".($self->{_counts}{features}+1).' '.$self->feature_params->{-FEATURE_TYPE}->name."\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
839 #This now only fails once on the first run and then
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
840 #Need to count based on feature_type?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
841
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
842 #new rather than new fast here as we want to validate the import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
843 my $feature = $self->{feature_class}->new(%{$self->feature_params});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
844 ($feature) = @{$self->feature_adaptor->store($feature)};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
845 $self->count('features');
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
846
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
847 #Add count based on FeatureType, should be ftype name and analysis to reflect unique ftype key?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
848
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
849
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
850
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
851 ##This needs to be handled in caller as we are validating loci?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
852 #if($self->ucsc_coords){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
853 # $start += 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
854 # $end += 1;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
855 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
856
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
857 #This needs to be put in a separate sub and called by the caller
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
858 #if(! $self->cache_slice($chr)){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
859 # warn "Skipping AnnotatedFeature import, cound non standard chromosome: $chr";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
860 #}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
861 #else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
862 #grab seq if dump fasta and available
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
863 #my $seq;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
864 #if(exists $self->feature_params->{'sequence'}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
865 # $seq = $self->feature_params->{'sequence'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
866 # delete $self->feature_params->{'sequence'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
867 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
868 # else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
869 # $self->log('No fasta sequence available for '.$self->feature_params->display_label);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
870 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
871 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
872
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
873 #dump fasta here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
874 #if ($self->dump_fasta){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
875 # $self->{'_fasta'} .= $self->generate_fasta_header($feature)."\n$seq\n";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
876 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
877
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
878 #Store the xrefs
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
879
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
880 foreach my $dbentry_hash(@{$self->{'_dbentry_params'}}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
881 my $ftype = $dbentry_hash->{feature_type};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
882 delete $dbentry_hash->{feature_type};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
883
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
884 my $dbentry = Bio::EnsEMBL::DBEntry->new(%{$dbentry_hash});
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
885 $self->dbentry_adaptor->store($dbentry, $feature->dbID, $ftype, 1);#1 is ignore release flag
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
886 #count here? no count in caller
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
887 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
888
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
889
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
890 #Clean data cache
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
891 $self->{'_feature_params'} = {};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
892 $self->{'_dbentry_params'} = [];
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
893
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
894 return $feature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
895 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
896
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
897 #This should really be handled in Bio::EnsEMBL::Feature?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
898 #Move to Helper?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
899
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
900 sub set_strand{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
901 my ($self, $strand) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
902
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
903 my $ens_strand = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
904
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
905 my %strand_vals = (
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
906 '1' => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
907 '0' => 0,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
908 '-1' => -1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
909 '+' => 1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
910 '-' => -1,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
911 '.' => 0,
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
912 );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
913
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
914 if($strand){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
915
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
916 if(exists $strand_vals{$strand}){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
917 $ens_strand = $strand_vals{$strand};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
918 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
919 else{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
920 throw("Could not identify strand value for $strand");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
921 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
922 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
923
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
924 return $ens_strand;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
925 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
926
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
927 sub total_features{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
928 my ($self, $total) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
929
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
930 $self->{'total_features'} = $total if defined $total;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
931 return $self->{'total_features'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
932 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
933
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
934 #Currently only required for Bed::parse_Features_by_Slice
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
935
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
936 #filehandle
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
937
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
938
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
939 sub file_handle{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
940 my ($self, $fh) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
941
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
942 $self->{'file_handle'} = $fh if defined $fh;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
943 return $self->{'file_handle'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
944 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
945
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
946 sub result_set{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
947 my ($self, $rset) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
948
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
949 #already tested/created by self
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
950
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
951 $self->{'result_set'} = $rset if $rset;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
952 return $self->{'result_set'};
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
953 }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
954
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
955 1;