annotate variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/Bed.pm @ 3:d30fa12e4cc5 default tip

Merge heads 2:a5976b2dce6f and 1:09613ce8151e which were created as a result of a recently fixed bug.
author devteam <devteam@galaxyproject.org>
date Mon, 13 Jan 2014 10:38:30 -0500
parents 1f6dce3d34e0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
1 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
2 # EnsEMBL module for Bio::EnsEMBL::Funcgen::Parsers::Bed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
3 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
4
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
5 =head1 LICENSE
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
6
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
7 Copyright (c) 1999-2011 The European Bioinformatics Institute and
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
8 Genome Research Limited. All rights reserved.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
9
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
10 This software is distributed under a modified Apache license.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
11 For license details, please see
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
12
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
13 http://www.ensembl.org/info/about/code_licence.html
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
14
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
15 =head1 CONTACT
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
16
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
17 Please email comments or questions to the public Ensembl
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
18 developers list at <ensembl-dev@ebi.ac.uk>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
19
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
20 Questions may also be sent to the Ensembl help desk at
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
21 <helpdesk@ensembl.org>.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
22
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
23 =head1 NAME
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
24
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
25 Bio::EnsEMBL::Funcgen::Parsers::Bed
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
26
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
27 =head1 SYNOPSIS
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
28
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
29 my $parser_type = "Bio::EnsEMBL::Funcgen::Parsers::Bed";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
30 push @INC, $parser_type;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
31 my $imp = $class->SUPER::new(@_);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
32
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
33
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
34 =head1 DESCRIPTION
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
35
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
36 This is a definitions class which should not be instantiated directly; it
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
37 is normally set by the Importer as the parent class. Bed contains meta
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
38 data and methods specific to data in bed format, to aid
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
39 parsing and importing of experimental data.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
40
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
41 =cut
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
42
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
43 #Import/Parser rework
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
44 #We now have potential to use indexed DBFile and Parsers
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
45 #Importer should become BaseImporter, inherited from InputSet/Nimblegen
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
46 #Have Bed(format) importer which sets the generic Bed(format) Parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
47
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
48 package Bio::EnsEMBL::Funcgen::Parsers::Bed;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
49
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
50 use Bio::EnsEMBL::Funcgen::Parsers::InputSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
51 use Bio::EnsEMBL::Funcgen::FeatureSet;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
52 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
53 use Bio::EnsEMBL::Utils::Exception qw( throw warning deprecate );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
54 use Bio::EnsEMBL::Funcgen::Utils::EFGUtils qw(open_file is_bed);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
55 use Bio::EnsEMBL::Utils::Argument qw( rearrange );
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
56 use Bio::EnsEMBL::Funcgen::Utils::Helper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
57 use File::Basename;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
58 use strict;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
59
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
60 use Data::Dumper;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
61
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
62 use vars qw(@ISA);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
63 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::InputSet);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
64
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
65 #To do
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
66 # Extend this so we can import features to annotated_feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
67 # and profiles/reads to result_feature
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
68 # This will replace individual tables for current bed das sources
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
69 # Altho' idea of separate tables for each set is not necessarily a bad one
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
70 # We would have to have extra table or fields details registration
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
71 # Easier admin i.e. drop tables
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
72 # What about partitioning? Irrelevant if we are moving to matrix
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
73 # Not easy to patch dynamically named tables? Can't assign values to user variable from query!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
74 # Would need to be a stored procedure
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
75 # Could have separate matrix files for result_features as we probably wouldn't want
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
76 # to do any cross set querying at this level.
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
77 # Could we also use this in the RunnableDBs?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
78 # Simply separate those methods required by both normal bed annotated_feature import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
79 # and RunnableDB based annotated_feature import
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
80 # Importing into result_feature requires a result_set which assumes a chip experiment
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
81 # We need to alter result_set such that it can be optionally associated with ExperimentalSets
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
82 # Or should we just go for a different table? read_feature?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
83 # NOTE: PARTITION by key may run into problems if different sets have different windows
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
84 # Would also need to modify ResultFeatureAdaptor?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
85
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
=head2 new

  Arg[0]     : hash containing optional attributes; all arguments are passed
               through unchanged to the parent constructor, e.g.
                 -bed_reads => 0|1, #Set input as read alignments rather than peak calls (default is 0)
  Example    : my $self = $class->SUPER::new(@_);
  Description: Constructor method for Bed class. Builds the object via the
               parent constructor (adding no_disconnect => 1 after the
               caller's arguments) and initialises the overhang feature cache.
  Returntype : Bio::EnsEMBL::Funcgen::Parsers::Bed
  Exceptions : throws if the constructed object is not a
               Bio::EnsEMBL::Funcgen::Importer.
               The parent constructor may throw as well (the previous
               documentation mentions an undefined Experiment name).
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk

=cut

sub new {
  my ($caller, @args) = @_;

  # Allow both class and instance invocation
  my $class = ref($caller) || $caller;
  my $self  = $class->SUPER::new(@args, no_disconnect => 1);

  # This parser is only meant to be used as a parent class of the Importer
  if (! $self->isa('Bio::EnsEMBL::Funcgen::Importer')) {
    throw('This is a skeleton class for Bio::EnsEMBL::Funcgen::Importer, should not be used directly');
  }

  #This was over-riding InputSet new config
  #$self->{'config'} =
  #  {(
  #    #order of these method arrays is important!
  #    #remove method arrays and execute serially?
  #    #Just move these to individual Parser register_experiment methods
  #    array_data   => [],#['experiment'],
  #    probe_data   => [],#["probe"],
  #    results_data => ["and_import"],
  #    norm_method  => undef,
  #
  #    #Need to make these definable?
  #    #have protocolfile arg and just parse tab2mage protocol section format
  #    #SEE NIMBLEGEN FOR EXAMPLE
  #    protocols => {()},
  #   )};

  # Cache for features which overhang the current slice
  # Move to InputSet? Maybe used by other formats
  $self->{'overhang_features'} = [];

  return $self;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
130
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
131
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
=head2 set_config

  Example    : $self->set_config;
  Description: Sets attribute dependent config. Currently this only
               delegates to the parent class implementation.
  Returntype : None
  Exceptions : None raised here
  Caller     : Bio::EnsEMBL::Funcgen::Importer
  Status     : at risk -remove

=cut

sub set_config {
  my ($self) = @_;

  # Directories are deliberately not set in the config, so that they stay
  # accessible through the generic get_dir method.
  # An output dir still needs defining if we are processing reads.
  $self->SUPER::set_config();

  return;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
153
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
154
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
155
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
=head2 pre_process_file

  Arg[1]     : String  - path of the input file
  Arg[2]     : Boolean - prepare flag. When true the input is always sorted
               and, for 'result' input, a prepared output file is defined.
  Example    : my $filepath = $self->pre_process_file($filepath, $prepare);
  Description: Checks the input is in bed format, sets the input file operator
               (a command template in which %s stands for the file path) used
               to decompress and/or sort the input, and defines the output
               file for prepared 'result' input if none is defined yet.
  Returntype : String - the unchanged input file path
  Exceptions : throws if the input file is not in bed format
  Caller     : NOTE(review): presumably the InputSet import code - confirm
  Status     : at risk

=cut

sub pre_process_file {
  my ($self, $filepath, $prepare) = @_;

  #Test file format
  throw("Input file is not bed format:\t$filepath") if ! is_bed($filepath);

  #separate sort keys stop lexical sorting of start/end
  #when faced with a non numerical seq_region_name
  my $sort = ($prepare || ! $self->prepared) ? 'sort -k1,1 -k2,2n -k3,3n ' : '';

  if ($self->input_gzipped) {
    # Decompress to STDOUT and optionally pipe through sort
    my $sort_stage = $sort ? "$sort|" : '';
    $self->input_file_operator("gzip -dc %s | $sort_stage ");
  }
  elsif ($sort) {
    #This is really only required for read alignments
    $self->input_file_operator("$sort %s |");
  }
  else {
    # Nothing to decompress or sort. An empty command here would leave
    # ' %s |' as the operator, i.e. the input file itself would be run as
    # the piped command, so stream it with cat instead.
    $self->input_file_operator('cat %s |');
  }

  if (! defined($self->output_file) && ($self->input_feature_class eq 'result')) {
    my ($name) = fileparse($filepath);

    # Only strip a trailing .gz suffix, never a '.gz' inside the name
    $name =~ s/\.gz\z// if $self->input_gzipped;

    if ($prepare) {
      #This will be filtered for seq_region_name
      $self->output_file($self->get_dir('output')."/prepared.${name}.gz");
    }

    # No output file is defined when not preparing:
    #Not currently used as we use direct import
    #via AnnotatedFeatures or ResultFeature Collections

    #output_file would only be used for DAS read mysqlimport loading
    #This could also be used by SAM/BAM so put in InputSet

    #Or do we need a file for each discrete set?
    #Each import constitutes one discrete data set
    #So this is okay for replicates in a result set
    #But not for different cell/feature types
    #Use one file
    #This is imposed case in InputSet::validate_files
    #We can't pipe this to gzip as we need to mysqlimport it, which does not support gzip?
    #gzip -dc file.sql | mysql would work but would be much slower due to individual inserts
    #$self->output_file($self->get_dir('output')."/result_feature.${name}.txt");
  }

  return $filepath;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
207
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
208
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
209 #sub parse_header{
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
210 # my ($self, $fh) = @_;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
211 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
212 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
213 # #This will not work on a sorted file, so would have to
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
214 # #store header and test match every line!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
215 # #just test for track name= for now
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
216 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
217 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
218 # warn "PARSING HEADER";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
219 # my $nr = 0;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
220 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
221 # for my $line(<$fh>){
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
222 # $nr++;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
223 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
224 # #my $nr = $fh->input_line_number();#This always return 3451? Length of file?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
225 # #This is not yet reliable here!!!!
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
226 # #Is this because of the gzip sort?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
227 # #So let's depend on count?
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
228 # #If we don't know when the header is supposed to finish (i.e. multi line header)
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
229 # #We will need to decrement the seek position somehow
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
230 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
231 # warn "INPUT LINE = $nr $line";
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
232 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
233 # #exit;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
234 # if ($nr == 1){#$INPUT_LINE_NUMBER;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
235 # #sanity check here
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
236 # return if($line =~ /track name=/o);
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
237 # $self->log(":: WARNING ::\tBED file does not appear to have valid header. First line($nr) will be treated as data:\t$line");
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
238 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
239 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
240 # exit;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
241 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
242 # }
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
243 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
244 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
245 # exit;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
246 #
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
247 # return;
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
248 #}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
249
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
250
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
251
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
=head2 parse_line

  Arg[1]     : String  - one line of the bed input
  Arg[2]     : Boolean - prepare flag. When true the line is only passed to
               cache_slice (with the flag set) and no feature is loaded.
  Example    : my $parsed = $self->parse_line($line, $prepare);
  Description: Parses a bed line into chr, start, end, name, score and strand.
               Unless preparing, stores the feature parameters in
               $self->{_feature_params} and calls load_feature_and_xrefs.
               For 'segmentation' input the bed name is mapped to a feature
               type via the user config, otherwise it is used as the display
               label.
  Returntype : Boolean - 0 if the line is a track header or no slice is
               available for its seq_region, 1 otherwise
  Exceptions : throws if a segmentation bed name is not defined in the
               feature_types config of the feature set
  Caller     : NOTE(review): presumably the InputSet import code - confirm
  Status     : at risk

=cut

sub parse_line {
  my ($self, $line, $prepare) = @_;

  #Need to handle header here for bed is always $.?
  #Also files which do not have chr prefix? i.e. Ensembl BED rather than
  #UCSC BED, which also has half open coords
  return 0 if $line =~ /track name=/;

  $line =~ s/\r*\n//; #also accounts for windows files

  # Only the first six fields are used. The list slice (rather than a direct
  # list assignment) keeps split free of an implicit LIMIT.
  # Should this not be \t?
  # Should we define minimum fields or microbed format with no name and just score?
  # Validate variable types here before we get a nasty error from bind_param?
  my ($chr, $start, $end, $name, $score, $strand) = (split /\s+/, $line)[0 .. 5];

  #Any more valid BED fields here?
  # thickStart  - The starting position at which the feature is drawn thickly (for example, the start codon in gene displays).
  # thickEnd    - The ending position at which the feature is drawn thickly (for example, the stop codon in gene displays).
  # itemRgb     - An RGB value of the form R,G,B (e.g. 255,0,0).
  # blockCount  - The number of blocks (exons) in the BED line.
  # blockSizes  - A comma-separated list of the block sizes. The number of items in this list should correspond to blockCount.
  # blockStarts - A comma-separated list of block starts, relative to chromStart. The number of items in this list should correspond to blockCount.

  #prepare counts total features for RPKM
  #This also filters slices for those defined
  my $slice = $self->cache_slice($chr, undef, $prepare);

  return 0 if ! $slice;
  return 1 if $prepare; # nothing to load when only preparing

  $strand = $self->set_strand($strand);
  $start += 1 if $self->ucsc_coords;

  # Set name dependently on input class
  my %name_param;

  if ($self->input_feature_class eq 'segmentation') {
    #Expand this into pluggable config
    #defined by field position against the input param name

    #Need to define ftype and analysis config outside
    #of this parser anyway

    #Can we use similar set up to external parser config
    #but extract actual config to separate file
    my $set_name = $self->name;

    if (! exists $self->{user_config}{feature_sets}{$set_name}{feature_types}{$name}) {
      #No need to test is valid as we have already done this
      #just need to make sure we don't initialise the hash key
      throw("Found segmentation BED name which is not defined in the feature_types".
            " config for your feature_set:\t$name");
    }

    $name_param{'-FEATURE_TYPE'} = $self->{user_config}{feature_sets}{$set_name}{feature_types}{$name};

    #DISPLAY_LABEL Let this get autogenerated?
  }
  else { #annotated
    $name_param{'-DISPLAY_LABEL'} = $name;
  }

  # NOTE(review): the slice is looked up again without the prepare flag, as
  # before; this could presumably reuse $slice - confirm against cache_slice
  $self->{_feature_params} = {
    -START       => $start,
    -END         => $end,
    -STRAND      => $strand,
    -SLICE       => $self->cache_slice($chr),
    -SCORE       => $score,
    -FEATURE_SET => $self->data_set->product_FeatureSet,
    %name_param,
  };

  $self->load_feature_and_xrefs;

  return 1;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
343
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
344
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
345 #For the purposes of creating ResultFeature Collections
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
346 #Dependency on creating features is overkill
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
347 #altho not critical as this is never used for display
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
348
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
349 #Should really move this to InputSet parser
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
350 #Altho this would require an extra method call per line to parse the record
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
351
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# parse_Features_by_Slice
#
# Reads BED records from $self->file_handle and returns an array ref of
# Bio::EnsEMBL::Funcgen::Collection::ResultFeature objects (built with
# new_fast) for every record overlapping the given slice.
#
# Arg [1]   : slice object providing strand, seq_region_name, start and end.
#             Must be forward strand (strand == 1).
# Returns   : array ref of features, with start/end expressed relative to
#             the slice start. Features cached as overhanging the previous
#             slice are transferred onto this slice and returned first.
# Throws    : if the slice is not forward strand, or if it does not start
#             immediately after the previously parsed slice on the same
#             seq_region.
#
# State kept on $self between calls:
#   last_slice        - the slice handled by the previous call
#   last_line         - the first record which did not belong to the previous
#                       slice; it is re-examined here before reading the file
#   overhang_features - features which extended past the end of the previous
#                       slice
#
# This method assumes that successive calls walk through a seq_region using
# adjacent slices, and that the file is ordered accordingly.

sub parse_Features_by_Slice{
  my ($self, $slice) = @_;

  #Slice should have been checked by now in caller
  if($slice->strand != 1){
    throw("Bed Parser does not support parsing features by non +ve/forward strand Slices\n".
          'This is to speed up generation of ResultFeature Collections for large sequencing data sets');
  }

  my $slice_chr       = $slice->seq_region_name;
  my $slice_end       = $slice->end;
  my $slice_start     = $slice->start;
  my $last_slice      = $self->last_slice;
  #With no previous slice, default the values so the adjacency test below passes
  my $last_slice_end  = ($last_slice) ? $last_slice->end : ($slice_start - 1);
  my $last_slice_name = ($last_slice) ? $last_slice->seq_region_name : $slice->seq_region_name;
  my $rset_id         = $self->result_set->dbID;

  if(! ($slice_start == ($last_slice_end + 1) &&
        ($slice->seq_region_name eq $last_slice_name))){
    #Need to reopen the file as we are doing a second pass over the same data
    #This is not guaranteed to work for re-reading sets of slices
    #This would also not be caught by this test

    #To be safe we need to reset the file handle from the caller context

    throw("Bed parser does not yet support parsing features from successive non-adjacent Slices\n".
          "Last slice end - Next slice start:\t$last_slice_name:${last_slice_end} - ".
          $slice->seq_region_name.':'.$slice_start);
  }

  my @features;

  #Deal with 5' overhang first
  #The cache may never have been populated, so default to an empty list
  foreach my $overhang(@{$self->overhang_features || []}){
    my $transferred = $overhang->transfer($slice);
    push @features, $transferred if $transferred;#This should always be true
  }

  $self->{'overhang_features'} = []; #reset overhang features

  my $fh       = $self->file_handle;
  my $seen_chr = 0;

  #Exactly one record is consumed per iteration: the cached line if there is
  #one, else the next line from the file. Reading in the loop condition as
  #well as in the loop body would silently discard every other record, and
  #would lose a further record every time parsing stopped.
 LINE: while(1){
    my $line;

    if($self->last_line){#Deal with previous line first
      $line = $self->last_line;
      $self->last_line('');
    }
    else{
      $line = <$fh>;
      last LINE if ! defined $line;#End of file
    }

    #Still need to chump here in case no other fields
    $line =~ s/\r*\n//o;#chump accounts for windows files

    if($line !~ /\S/){#Also catches whitespace only lines, which have no fields
      warn("Skipping empty line");
      next LINE;
    }

    #We could use a generic method to parse here
    #But it is small enough and simple enough to have twice
    my ($chr, $start, $end, $name, $score, $strand, @other_fields) = split/\s+/o, $line;#Should this not be \t?

    if($slice_chr eq $chr){#Skim through the file until we find the slice
      $seen_chr = 1;

      next LINE if $end < $slice_start;#feature lies before the slice

      if($start > $slice_end){#feature is past end of current slice
        $self->last_line($line);#But maybe part of next slice chunk
        last LINE;
      }

      #feature is on slice
      my $feature = Bio::EnsEMBL::Funcgen::Collection::ResultFeature->new_fast
        ({
          start         => ($start - $slice_start + 1),
          end           => ($end   - $slice_start + 1),
          strand        => $strand,
          scores        => [$score],
          result_set_id => $rset_id,
          window_size   => 0,#wsize
          slice         => $slice,
         });
      push @features, $feature;

      if($end > $slice_end){
        #Feature runs past the slice end, so keep it for the next slice too
        $self->overhang_features($feature);
      }
    }
    elsif($seen_chr){
      #We have reached the end of the chromosome!
      $self->last_line($line);#in case we are parsing slice serially
      last LINE;
    }
  }

  $self->last_slice($slice);
  #$self->log("Added logging of parsing (seen = $seen_chr) for memory footprinting through file", 'logmemflag');

  return \@features;
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
471
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
472 #Move these potentially generic methods to InputSet Parser for use by other Parsers
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
473
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
474
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# last_line
#
# Getter/setter for the record cached between parse_Features_by_Slice calls.
# Any defined argument, including the empty string, replaces the stored value.
# Calling with no argument or undef leaves it untouched.
# Returns the stored value.

sub last_line{
  my $self        = shift;
  my $cached_line = shift;

  if(defined $cached_line){
    $self->{'last_line'} = $cached_line;
  }

  return $self->{'last_line'};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
482
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# last_slice
#
# Getter/setter for the slice handled by the previous
# parse_Features_by_Slice call. Only a true argument replaces the stored
# value; false or missing arguments leave it untouched.
# Returns the stored value.

sub last_slice{
  my $self           = shift;
  my $previous_slice = shift;

  if($previous_slice){
    $self->{'last_slice'} = $previous_slice;
  }

  return $self->{'last_slice'};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
489
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
490
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
# overhang_features
#
# Appends a feature to the overhang cache when a true argument is passed.
# Returns the array ref held in $self->{'overhang_features'}, which is undef
# until something has been stored.

sub overhang_features{
  my $self     = shift;
  my $overhang = shift;

  if($overhang){
    push @{$self->{'overhang_features'}}, $overhang;
  }

  return $self->{'overhang_features'};
}
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
499
1f6dce3d34e0 Uploaded
mahtabm
parents:
diff changeset
500 1;