0
|
1 =head1 LICENSE
|
|
2
|
|
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and
|
|
4 Genome Research Limited. All rights reserved.
|
|
5
|
|
6 This software is distributed under a modified Apache license.
|
|
7 For license details, please see
|
|
8
|
|
9 http://www.ensembl.org/info/about/code_licence.html
|
|
10
|
|
11 =head1 CONTACT
|
|
12
|
|
13 Please email comments or questions to the public Ensembl
|
|
14 developers list at <ensembl-dev@ebi.ac.uk>.
|
|
15
|
|
16 Questions may also be sent to the Ensembl help desk at
|
|
17 <helpdesk@ensembl.org>.
|
|
18
|
|
19 =cut
|
|
20
|
|
21 package Bio::EnsEMBL::Funcgen::Parsers::vista;
|
|
22
|
|
23 use strict;
|
|
24
|
|
25 # Parse data from LBL enhancers, see http://enhancer.lbl.gov/cgi-bin/imagedb.pl?show=1;search.result=yes;form=search;search.form=no;action=search;search.sequence=1
|
|
26 # e.g.
|
|
27 #
|
|
28 # >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
|
|
29 # AACTGAAGGGACCCCGTTAGCATAtaaacaaaaggtggggggtagccccgagcctcttct
|
|
30 # ctgacagccagtggcggcagtgatgaatttgtgaagttatctaattttccactgttttaa
|
|
31 # ttagagacttgggctctgaggcctcgcagctggcttctttgtgctgtattctgttgcctg
|
|
32 # acagag
|
|
33
|
|
34 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser;
|
|
35 use Bio::EnsEMBL::Utils::Exception qw( throw );
|
|
36
|
|
37 use vars qw(@ISA);
|
|
38 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser);
|
|
39
|
|
40
|
|
41
|
|
42 sub new {
|
|
43 my $caller = shift;
|
|
44 my $class = ref($caller) || $caller;
|
|
45
|
|
46 my $self = $class->SUPER::new(@_, type => 'Vista');
|
|
47
|
|
48 #Set default feature_type and feature_set config
|
|
49 $self->{static_config}{feature_types} = {(
|
|
50 'VISTA Target' => {
|
|
51 -name => 'VISTA Target',
|
|
52 -class => 'Search Region',
|
|
53 -description => 'VISTA target region',
|
|
54 },
|
|
55 'VISTA Enhancer' => {
|
|
56 -name => 'VISTA Enhancer',
|
|
57 -class => 'Enhancer',
|
|
58 -description => 'Enhancer identified by positive VISTA assay',
|
|
59 },
|
|
60 'VISTA Target - Negative' => {
|
|
61 -name => 'VISTA Target - Negative',
|
|
62 -class => 'Search Region',
|
|
63 -description => 'Enhancer negative region identified by VISTA assay',
|
|
64 },
|
|
65 )};
|
|
66
|
|
67 $self->{static_config}{analyses} = {
|
|
68 VISTA => {
|
|
69 -logic_name => 'VISTA',
|
|
70 -description => 'VISTA Enhancer Assay (http://enhancer.lbl.gov/)',
|
|
71 -display_label => 'VISTA',
|
|
72 -displayable => 1,
|
|
73 },
|
|
74 };
|
|
75
|
|
76 #This is used as the entry point to store/validate
|
|
77 #So all of the above needs to be referenced in here
|
|
78 $self->{static_config}{feature_sets} = {
|
|
79 'VISTA enhancer set' =>
|
|
80 {
|
|
81 #Stored in this order
|
|
82
|
|
83 #Entries here are flexible
|
|
84 #Can be omited if defined in feature_set
|
|
85 #top level analyses/feature_types definition required if no DB defaults available
|
|
86 #These can be a ref to the whole or subset of the top level analyses/feature_types hash
|
|
87 #A key with an empty hash or undef(with or without a matching key in the top level analyses/feature_types hash
|
|
88
|
|
89 #analyses => $self->{static_config}{analyses},
|
|
90 feature_types => $self->{static_config}{feature_types},
|
|
91
|
|
92 #feature_type and analysis values must be string key to top level hash
|
|
93 #This wont work for feature_types as they are not unique by name!!!!!!
|
|
94 #This is why we have top level hash where we can define a unique compound key name
|
|
95
|
|
96 feature_set =>
|
|
97 {
|
|
98 -feature_type => 'VISTA Target',#feature_types config key name not object
|
|
99 -display_label => 'VISTA Enhancers',
|
|
100 -description => 'Experimentally validated enhancers',
|
|
101 -analysis => 'VISTA',#analyses config key name not object
|
|
102 },
|
|
103 }
|
|
104 };
|
|
105
|
|
106 #$self->validate_and_store_feature_types;
|
|
107 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]);
|
|
108 $self->set_feature_sets;
|
|
109
|
|
110 return $self;
|
|
111 }
|
|
112
|
|
113
|
|
114
|
|
115 # Parse file and return hashref containing:
|
|
116 #
|
|
117 # - arrayref of features
|
|
118 # - arrayref of factors
|
|
119
|
|
120
|
|
121
|
|
122 sub parse_and_load{
|
|
123 my ($self, $files, $old_assembly, $new_assembly) = @_;
|
|
124
|
|
125 if(scalar(@$files) != 1){
|
|
126 throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files));;
|
|
127 }
|
|
128
|
|
129 my $file = $files->[0];
|
|
130 $self->log_header("Parsing and loading LBNL VISTA enhancer data from:\t$file");
|
|
131
|
|
132 my $extfeat_adaptor = $self->db->get_ExternalFeatureAdaptor;
|
|
133 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'EnhancerProjection'); # this object is only used for projection
|
|
134 my $fset_config = $self->{static_config}{feature_sets}{'VISTA enhancer set'};
|
|
135 my $feature_positive = $fset_config->{'feature_types'}{'VISTA Enhancer'};
|
|
136 my $feature_negative = $fset_config->{'feature_types'}{'VISTA Target - Negative'};
|
|
137 my $set = $fset_config->{feature_set};
|
|
138
|
|
139 use Bio::EnsEMBL::Registry;
|
|
140 my %id_prefixes = (
|
|
141 homo_sapiens => 'hs',
|
|
142 mus_musculus => 'mm',
|
|
143 );
|
|
144
|
|
145 my $species = Bio::EnsEMBL::Registry->get_alias($self->db->species);
|
|
146
|
|
147 if( (! defined $species) ||
|
|
148 (! exists $id_prefixes{$species}) ){
|
|
149 throw("Failed to get a VISTA ID prefix for species alias:\t$species");
|
|
150 }
|
|
151
|
|
152 $species = $id_prefixes{$species};
|
|
153
|
|
154 ### Read file
|
|
155 open (FILE, "<$file") || die "Can't open $file";
|
|
156 my $cnt = 0;
|
|
157 my $skipped = 0;
|
|
158
|
|
159
|
|
160 while (<FILE>) {
|
|
161
|
|
162 next if ($_ !~ /^>/o); # only read headers
|
|
163
|
|
164 # OLD >chr16:84987588-84988227 | element 1 | positive | neural tube[12/12] | hindbrain (rhombencephalon)[12/12] | limb[3/12] | cranial nerve[8/12]
|
|
165 # v66 >Mouse|chr12:112380949-112381824 | element 3 | positive | neural tube[4/4] | hindbrain (rhombencephalon)[4/4] | forebrain[4/4]
|
|
166
|
|
167 #ID naming scheme change from LBNL-1 to hs1 or mm1
|
|
168 #But the flat file and url use two different naming schemes!
|
|
169 #VISTA URL is: where experiment id is element number and species_id 1 = human and 2 = mouse
|
|
170 #http://enhancer.lbl.gov/cgi-bin/imagedb3.pl?form=presentation&show=1&experiment_id=1&organism_id=1
|
|
171
|
|
172 #Add links to cell_type for tissues in @expression_pattern?
|
|
173 #This would be vista specific cell_type_annotation? Or we could just have associated_cell_type? (without annotation)
|
|
174 #Just link for now
|
|
175
|
|
176 my (undef, $coords, $element, $posneg, @expression_patterns) = split /\s*\|\s*/o;#was \s+
|
|
177 # parse co-ordinates & id
|
|
178 my ($chr, $start, $end) = $coords =~ /chr([^:]+):(\d+)-(\d+)/o;
|
|
179 my ($element_number) = $element =~ /\s*element\s*(\d+)/o;
|
|
180
|
|
181 # seq_region ID and co-ordinates
|
|
182 my $chr_slice;
|
|
183
|
|
184 if ($old_assembly) {
|
|
185 $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, undef, $old_assembly);
|
|
186 } else {
|
|
187 $chr_slice = $self->slice_adaptor->fetch_by_region('chromosome', $chr);
|
|
188 }
|
|
189
|
|
190 if (!$chr_slice) {
|
|
191 warn "Can't get slice for chromosme $chr\n";
|
|
192 next;
|
|
193 }
|
|
194
|
|
195 my $seq_region_id = $chr_slice->get_seq_region_id;
|
|
196 throw("Can't get seq_region_id for chromosome $chr") if (!$seq_region_id);
|
|
197
|
|
198 # Assume these are all on the positive strand? Is this correct?
|
|
199 my $strand = 1;
|
|
200
|
|
201
|
|
202 my $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new
|
|
203 (
|
|
204 -start => $start,#is this in UCSC coords?
|
|
205 -end => $end, #is this in UCSC coords?
|
|
206 -strand => $strand,
|
|
207 -feature_type => $posneg eq 'positive' ? $feature_positive : $feature_negative,
|
|
208 -slice => $self->slice_adaptor->fetch_by_region('chromosome', $chr, undef, undef, $strand, $old_assembly),
|
|
209 -display_label => $species.$element_number,#"LBNL-$element_number",
|
|
210 -feature_set => $set,
|
|
211 );
|
|
212
|
|
213
|
|
214 # project if necessary
|
|
215 if ($new_assembly) {
|
|
216
|
|
217 $feature = $self->project_feature($feature, $dummy_analysis, $new_assembly);
|
|
218
|
|
219 if(! defined $feature){
|
|
220 $skipped ++;
|
|
221 next;
|
|
222 }
|
|
223 }
|
|
224
|
|
225 $cnt ++;
|
|
226 $extfeat_adaptor->store($feature);
|
|
227 }
|
|
228
|
|
229 close FILE;
|
|
230
|
|
231 $self->log('Parsed '.($cnt+$skipped).' features');
|
|
232 $self->log("Loaded $cnt features");
|
|
233 $self->log("Skipped $skipped features");
|
|
234
|
|
235 #Now set states
|
|
236 foreach my $status(qw(DISPLAYABLE MART_DISPLAYABLE)){
|
|
237 $set->adaptor->store_status($status, $set);
|
|
238 }
|
|
239
|
|
240
|
|
241 return;
|
|
242
|
|
243 }
|
|
244
|
|
245
|
|
246 1;
|