Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/Parsers/biotiffin.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =head1 LICENSE | |
2 | |
3 Copyright (c) 1999-2011 The European Bioinformatics Institute and | |
4 Genome Research Limited. All rights reserved. | |
5 | |
6 This software is distributed under a modified Apache license. | |
7 For license details, please see | |
8 | |
9 http://www.ensembl.org/info/about/code_licence.html | |
10 | |
11 =head1 CONTACT | |
12 | |
13 Please email comments or questions to the public Ensembl | |
14 developers list at <ensembl-dev@ebi.ac.uk>. | |
15 | |
16 Questions may also be sent to the Ensembl help desk at | |
17 <helpdesk@ensembl.org>. | |
18 | |
19 =cut | |
20 | |
21 package Bio::EnsEMBL::Funcgen::Parsers::biotiffin; | |
22 | |
23 use strict; | |
24 | |
25 use File::Basename; | |
26 | |
27 # To get files for bioTIFFIN, download the following GFF file (e.g. via wget): | |
28 # | |
29 # http://td-blade.gurdon.cam.ac.uk/tad26/fly-tiffinScan-tiffin12.dm3.gff.gz | |
30 | |
31 # Thomas Down <thomas.down@gurdon.cam.ac.uk> | |
32 | |
33 # | |
34 # 3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 | |
35 # 3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 | |
36 # 3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 | |
37 | |
38 use Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser; | |
39 use Bio::EnsEMBL::DBEntry; | |
40 use Bio::EnsEMBL::Funcgen::ExternalFeature; | |
41 use Bio::EnsEMBL::Utils::Exception qw( throw ); | |
42 use Bio::EnsEMBL::Utils::Argument qw( rearrange ); | |
43 | |
44 use vars qw(@ISA); | |
45 @ISA = qw(Bio::EnsEMBL::Funcgen::Parsers::BaseExternalParser); | |
46 | |
47 | |
48 sub new { | |
49 my $caller = shift; | |
50 my $class = ref($caller) || $caller; | |
51 | |
52 my $self = $class->SUPER::new(@_, type => 'BioTiffin'); | |
53 | |
54 #Set default feature_type and feature_set config | |
55 | |
56 #We need to capture version/release/data of external feature sets. | |
57 #This can be nested in the description? Need to add description to feature_set? | |
58 | |
59 $self->{static_config}{feature_types} = | |
60 { | |
61 'BioTIFFIN Motif' => { | |
62 name => 'BioTIFFIN Motif', | |
63 class => 'Regulatory Motif', | |
64 description => 'BioTIFFIN motif', | |
65 } | |
66 }; | |
67 | |
68 $self->{static_config}{analyses} = | |
69 { | |
70 'BioTIFFIN Motif' => { | |
71 -logic_name => 'BioTIFFIN Motif', | |
72 -description => 'BioTIFFIN regulatory motif database', | |
73 -display_label => 'BioTIFFIN motifs', | |
74 -displayable => 1, | |
75 }, | |
76 }; | |
77 | |
78 $self->{static_config}{feature_sets} = | |
79 { | |
80 'BioTIFFIN Motif' => | |
81 { | |
82 feature_set => { | |
83 -feature_type => 'BioTIFFIN Motif', | |
84 -analysis => 'BioTIFFIN Motif', | |
85 }, | |
86 xrefs => 0, | |
87 } | |
88 }; | |
89 | |
90 | |
91 #Move xref flag here? | |
92 $self->{config} = { | |
93 'BioTIFFIN Motif' => { | |
94 file => $ENV{'EFG_DATA'}.'/input/BioTIFFIN/fly-tiffinScan-tiffin12.dm3.gff', | |
95 gff_attrs => { | |
96 'ID' => 1, | |
97 }, | |
98 }, | |
99 }; | |
100 | |
101 $self->validate_and_store_config([keys %{$self->{static_config}{feature_sets}}]); | |
102 $self->set_feature_sets; | |
103 | |
104 return $self; | |
105 } | |
106 | |
107 | |
108 | |
109 # Parse file and return hashref containing: | |
110 # | |
111 # - arrayref of features | |
112 # - arrayref of factors | |
113 | |
114 | |
115 | |
116 | |
117 sub parse_and_load { | |
118 my ($self, $files, $old_assembly, $new_assembly) = @_; | |
119 | |
120 if(scalar(@$files) != 1){ | |
121 throw('You must provide a unique file path to load VISTA features from:\t'.join(' ', @$files)); | |
122 } | |
123 | |
124 | |
125 my %slice_cache; | |
126 my $extf_adaptor = $self->db->get_ExternalFeatureAdaptor; | |
127 my $dbentry_adaptor = $self->db->get_DBEntryAdaptor; | |
128 my $ftype_adaptor = $self->db->get_FeatureTypeAdaptor; | |
129 # this object is only used for projection | |
130 my $dummy_analysis = new Bio::EnsEMBL::Analysis(-logic_name => 'BioTIFFINProjection');#do we need this? | |
131 my $species = $self->db->species; | |
132 if(! $species){ | |
133 throw('Must define a species to define the external_db'); | |
134 } | |
135 #Just to make sure we hav homo_sapiens and not Homo Sapiens | |
136 ($species = lc($species)) =~ s/ /_/; | |
137 | |
138 | |
139 if(scalar @{$self->import_sets} != 1){ | |
140 throw('biotiffin parser currently only supports one import FeatureSet'); | |
141 } | |
142 | |
143 my ($import_set) = @{$self->import_sets}; | |
144 | |
145 | |
146 #foreach my $import_set(@{$self->import_sets}){ | |
147 $self->log_header("Parsing $import_set data"); | |
148 | |
149 my %motif_cache; # name -> factor_id | |
150 my $config = $self->{'config'}{$import_set}; | |
151 my $fset = $self->{static_config}{feature_sets}{$import_set}{feature_set}; | |
152 my %gff_attrs = %{$config->{'gff_attrs'}}; | |
153 | |
154 | |
155 # Parse motifs.txt file | |
156 #my $file = $config->{'file'}; | |
157 my $file = $files->[0]; | |
158 my $skipped = 0; | |
159 my $motif_cnt = 0; | |
160 my $factor_xref_cnt = 0; | |
161 my $feature_cnt = 0; | |
162 my $feature_target_cnt = 0; | |
163 | |
164 open (FILE, "<$file") || die("Can't open $file\n$!\n"); | |
165 | |
166 LINE: while (my $line = <FILE>) { | |
167 chomp $line; | |
168 | |
169 #GFF3 | |
170 #3R MotifScanner TIFDMEM0000001 936391 936401 0.0 + 0 | |
171 #3R MotifScanner TIFDMEM0000001 13455911 13455921 0.0 - 0 | |
172 #3R MotifScanner TIFDMEM0000001 17062830 17062840 0.0 + 0 | |
173 #3R MotifScanner TIFDMEM0000001 17973965 17973975 0.0 + 0 | |
174 | |
175 #seq_name, source, feature, start, end, score, strand, frame, [attrs] | |
176 my ($chromosome, $program, $feature, $start, $end, $score, $strand, undef) = split /\t/o, $line; | |
177 | |
178 if(! exists $slice_cache{$chromosome}){ | |
179 | |
180 if($old_assembly){ | |
181 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', | |
182 $chromosome, | |
183 undef, | |
184 undef, | |
185 undef, | |
186 $old_assembly); | |
187 } else { | |
188 $slice_cache{$chromosome} = $self->slice_adaptor->fetch_by_region('chromosome', $chromosome); | |
189 } | |
190 } | |
191 | |
192 if(! defined $slice_cache{$chromosome}){ | |
193 warn "Can't get slice $chromosome for motif $feature;\n"; | |
194 $skipped++; | |
195 next; | |
196 } | |
197 | |
198 if(! exists $motif_cache{$feature}){ | |
199 | |
200 $motif_cache{$feature} = $ftype_adaptor->fetch_by_name($feature); | |
201 | |
202 if(! defined $motif_cache{$feature}){ | |
203 | |
204 ($motif_cache{$feature}) = @{$ftype_adaptor->store(Bio::EnsEMBL::Funcgen::FeatureType->new | |
205 ( | |
206 -name => $feature, | |
207 -class => $fset->feature_type->class, | |
208 -description => $fset->feature_type->description, | |
209 ))}; | |
210 | |
211 $motif_cnt ++; | |
212 } | |
213 } | |
214 | |
215 my $feature_type = $motif_cache{$feature}; | |
216 | |
217 #Now build actual feature | |
218 | |
219 $feature = Bio::EnsEMBL::Funcgen::ExternalFeature->new | |
220 ( | |
221 -display_label => $feature, | |
222 -start => $start, | |
223 -end => $end, | |
224 -strand => (($strand eq '+') ? 1 : -1), | |
225 -feature_type => $feature_type, | |
226 -feature_set => $fset, | |
227 -slice => $slice_cache{$chromosome}, | |
228 ); | |
229 | |
230 | |
231 # project if necessary | |
232 if ($new_assembly) { | |
233 $feature = $self->project_feature($feature, $new_assembly); | |
234 | |
235 if(! defined $feature){ | |
236 $skipped ++; | |
237 next; | |
238 } | |
239 } | |
240 | |
241 ($feature) = @{$extf_adaptor->store($feature)}; | |
242 $feature_cnt++; | |
243 | |
244 | |
245 } | |
246 | |
247 close FILE; | |
248 | |
249 $self->log("Loaded ".$fset->name); | |
250 $self->log("$motif_cnt feature types"); | |
251 $self->log("$feature_cnt features"); | |
252 $self->log("Skipped $skipped features"); | |
253 | |
254 #} | |
255 | |
256 return; | |
257 } | |
258 | |
259 1; |