annotate variant_effect_predictor/.#variant_effect_predictor.pl.1.3 @ 0:21066c0abaf5 draft

Uploaded
author willmclaren
date Fri, 03 Aug 2012 10:04:48 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1 #!/usr/bin/perl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
3 =head1 LICENSE
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
4
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
5 Copyright (c) 1999-2011 The European Bioinformatics Institute and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
6 Genome Research Limited. All rights reserved.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
7
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
8 This software is distributed under a modified Apache license.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
9 For license details, please see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
10
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
11 http://www.ensembl.org/info/about/code_licence.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
12
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
13 =head1 CONTACT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
14
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
15 Please email comments or questions to the public Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
16 developers list at <dev@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
17
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
18 Questions may also be sent to the Ensembl help desk at
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
19 <helpdesk@ensembl.org>.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
20
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
21 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
22
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
23 =head1 NAME
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
24
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
25 Variant Effect Predictor - a script to predict the consequences of genomic variants
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
26
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
27 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
28
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
29 Version 2.2
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
30
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
31 by Will McLaren (wm2@ebi.ac.uk)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
32 =cut
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
33
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
34 use strict;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
35 use Getopt::Long;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
36 use FileHandle;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
37 use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
38 use Bio::EnsEMBL::Variation::Utils::VEP qw(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
39 parse_line
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
40 vf_to_consequences
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
41 validate_vf
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
42 load_dumped_adaptor_cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
43 dump_adaptor_cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
44 get_all_consequences
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
45 get_slice
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
46 build_full_cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
47 read_cache_info
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
48 get_time
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
49 debug
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
50 @OUTPUT_COLS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
51 @REG_FEAT_TYPES
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
52 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
53
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# script-wide state: the release number interpolated into the banner text
my $VERSION = '2.2';

# unbuffer the currently selected output handle so progress bars
# are drawn as soon as they are printed
$| = 1;

# build the configuration hash from the command line; the argument count
# is handed over so that an empty command line can be detected
my $config = configure(scalar @ARGV);

# hand the finished configuration to the main processing routine
main($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
65
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
66 # this is the main sub-routine - it needs the configured $config hash
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
sub main {
    # Drives one run of the script.
    #
    # Reads the input line by line from $config->{in_file_handle}, turns
    # each line into one or more variation features via parse_line(), and
    # then, per feature:
    #   * skips it if validate_vf() rejects it,
    #   * attaches a slice when one is required (see below),
    #   * gives it a default name if it has none,
    #   * converts it (convert mode), buffers it (whole-genome mode) or
    #     prints its consequences immediately (all other cases).
    #
    # Arg: $config - configuration hash ref as built by configure().
    #      $config->{line_number} is reset to 0 and incremented per line.
    my $config = shift;

    debug("Starting...") unless defined $config->{quiet};

    # caches handed to get_all_consequences() on every buffer flush
    my $tr_cache = {};
    my $rf_cache = {};

    # slices already fetched, keyed by chromosome name, so that each
    # one is only requested once
    my %slice_cache = ();

    my @vfs;
    my $vf_count       = 0;
    my $total_vf_count = 0;
    my $in_file_handle = $config->{in_file_handle};

    # initialize line number in config
    $config->{line_number} = 0;

    # read the file; the line is kept in a named lexical rather than the
    # global $_ because several external subs are called while it is live
    LINE:
    while (defined(my $line = <$in_file_handle>)) {
        chomp $line;

        $config->{line_number}++;

        # header line?
        next LINE if $line =~ /^\#/;

        # some lines (pileup) may actually parse out into more than one variant
        VF:
        foreach my $vf (@{ parse_line($config, $line) }) {

            # validate the VF
            next VF unless validate_vf($config, $vf);

            # now get the slice
            if (!defined($vf->{slice})) {
                my $slice;

                # don't get slices if we're using cache
                # we can steal them from transcript objects later
                my $need_slice =
                       (!defined($config->{cache}) && !defined($config->{whole_genome}))
                    || defined($config->{check_ref})
                    || defined($config->{convert});

                if ($need_slice) {

                    # check if we have fetched this slice already
                    if (defined $slice_cache{ $vf->{chr} }) {
                        $slice = $slice_cache{ $vf->{chr} };
                    }

                    # if not create a new one
                    else {
                        $slice = get_slice($config, $vf->{chr});

                        # if failed, warn and skip this variant
                        if (!defined($slice)) {
                            warn("WARNING: Could not fetch slice named ".$vf->{chr}." on line ".$config->{line_number}."\n") unless defined $config->{quiet};
                            next VF;
                        }

                        # remember it for later variants on the same chromosome
                        $slice_cache{ $vf->{chr} } = $slice;
                    }
                }

                # may legitimately stay undefined when no slice was needed
                $vf->{slice} = $slice;
            }

            # make a name if one doesn't exist
            $vf->{variation_name} ||= $vf->{chr}.'_'.$vf->{start}.'_'.$vf->{allele_string};

            # jump out to convert here
            if (defined($config->{convert})) {
                convert_vf($config, $vf);
                next VF;
            }

            if (defined $config->{whole_genome}) {
                push @vfs, $vf;
                $vf_count++;
                $total_vf_count++;

                # buffer full: analyse everything collected so far
                if ($vf_count == $config->{buffer_size}) {
                    _flush_vf_buffer($config, \@vfs, $vf_count, $total_vf_count, $tr_cache, $rf_cache);

                    @vfs      = ();
                    $vf_count = 0;
                }
            }
            else {
                foreach my $consequence (@{ vf_to_consequences($config, $vf) }) {
                    print_line($config, $consequence);
                }

                $vf_count++;
                $total_vf_count++;

                # progress message on every count that ends in a zero
                debug("Processed $vf_count variants") if $vf_count =~ /0$/ && defined($config->{verbose});
            }
        }
    }

    # if in whole-genome mode, finish off the rest of the buffer
    if (defined $config->{whole_genome} && scalar @vfs) {
        _flush_vf_buffer($config, \@vfs, $vf_count, $total_vf_count, $tr_cache, $rf_cache);
    }

    # optional SQL statement counter; the package variable is only
    # defined if something outside this sub has set it
    if (defined($config->{count_queries}) && !defined($config->{quiet})) {
        my $query_count = defined($Bio::EnsEMBL::DBSQL::StatementHandle::count_queries)
            ? $Bio::EnsEMBL::DBSQL::StatementHandle::count_queries
            : 'unknown number of';

        debug("Executed ", $query_count, " SQL statements");
    }

    debug("Finished!") unless defined $config->{quiet};
}

# Private helper for main(): reports the buffer size, prints one output
# line per consequence returned by get_all_consequences() for the buffered
# variants, then reports the running total. Emptying the buffer and
# resetting the counter is left to the caller.
sub _flush_vf_buffer {
    my ($config, $vfs, $vf_count, $total_vf_count, $tr_cache, $rf_cache) = @_;

    debug("Read $vf_count variants into buffer") unless defined($config->{quiet});

    foreach my $consequence (@{ get_all_consequences($config, $vfs, $tr_cache, $rf_cache) }) {
        print_line($config, $consequence);
    }

    debug("Processed $total_vf_count total variants") unless defined($config->{quiet});

    return;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
179
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
180 # sets up configuration hash that is used throughout the script
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
181 sub configure {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
182 my $args = shift;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
183
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
184 my $config = {};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
185
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
186 GetOptions(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
187 $config,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
188 'help', # displays help message
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
189
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
190 # input options,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
191 'config=s', # config file name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
192 'input_file=s', # input file name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
193 'format=s', # input file format
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
194
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
195 # DB options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
196 'species=s', # species e.g. human, homo_sapiens
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
197 'registry=s', # registry file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
198 'host=s', # database host
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
199 'port=s', # database port
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
200 'user=s', # database user name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
201 'password=s', # database password
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
202 'db_version=i', # Ensembl database version to use e.g. 62
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
203 'genomes', # automatically sets DB params for e!Genomes
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
204 'refseq', # use otherfeatures RefSeq DB instead of Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
205 #'no_disconnect', # disables disconnect_when_inactive
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
206
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
207 # runtime options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
208 'most_severe', # only return most severe consequence
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
209 'summary', # only return one line per variation with all consquence types
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
210 'per_gene', # only return most severe per gene
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
211 'buffer_size=i', # number of variations to read in before analysis
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
212 'chunk_size=s', # size in bases of "chunks" used in internal hash structure
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
213 'failed=i', # include failed variations when finding existing
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
214 'no_whole_genome', # disables now default whole-genome mode
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
215 'whole_genome', # proxy for whole genome mode - now just warns user
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
216 'gp', # read coords from GP part of INFO column in VCF (probably only relevant to 1KG)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
217 'chr=s', # analyse only these chromosomes, e.g. 1-5,10,MT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
218 'check_ref', # check supplied reference allele against DB
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
219 'check_existing', # find existing co-located variations
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
220 'check_alleles', # only attribute co-located if alleles are the same
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
221 'check_frequency', # enable frequency checking
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
222 'freq_filter=s', # exclude or include
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
223 'freq_freq=f', # frequency to filter on
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
224 'freq_gt_lt=s', # gt or lt (greater than or less than)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
225 'freq_pop=s', # population to filter on
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
226
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
227 # verbosity options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
228 'verbose', # print out a bit more info while running
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
229 'quiet', # print nothing to STDOUT (unless using -o stdout)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
230 'no_progress', # don't display progress bars
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
231
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
232 # output options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
233 'output_file=s', # output file name
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
234 'force_overwrite', # force overwrite of output file if already exists
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
235 'terms=s', # consequence terms to use e.g. NCBI, SO
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
236 'coding_only', # only return results for consequences in coding regions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
237 'canonical', # indicates if transcript is canonical
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
238 'protein', # add e! protein ID to extra column
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
239 'hgnc', # add HGNC gene ID to extra column
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
240 'hgvs', # add HGVS names to extra column
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
241 'sift=s', # SIFT predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
242 'polyphen=s', # PolyPhen predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
243 'condel=s', # Condel predictions
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
244 'gene', # force gene column to be populated (disabled by default, enabled when using cache)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
245 'regulatory', # enable regulatory stuff
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
246 'convert=s', # convert input to another format (doesn't run VEP)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
247 'no_intergenic', # don't print out INTERGENIC consequences
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
248 'gvf', # produce gvf output
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
249
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
250 # cache stuff
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
251 'cache', # use cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
252 'write_cache', # enables writing to the cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
253 'build=s', # builds cache from DB from scratch; arg is either all (all top-level seqs) or a list of chrs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
254 'prefetch', # prefetch exons, translation, introns, codon table etc for each transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
255 'strip', # strips adaptors etc from objects before caching them
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
256 'rebuild=s', # rebuilds cache by reading in existing then redumping - probably don't need to use this any more
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
257 'dir=s', # dir where cache is found (defaults to $HOME/.vep/)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
258 'cache_region_size=i', # size of region in bases for each cache file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
259 'no_slice_cache', # tell API not to cache features on slice
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
260 'standalone', # standalone mode uses minimal set of modules installed in same dir, no DB connection
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
261 'skip_db_check', # don't compare DB parameters with cached
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
262 'compress=s', # by default we use zcat to decompress; user may want to specify gzcat or "gzip -dc"
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
263
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
264 # debug
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
265 'cluck', # these two need some mods to Bio::EnsEMBL::DBSQL::StatementHandle to work. Clucks callback trace and SQL
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
266 'count_queries', # counts SQL queries executed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
267 'admin', # allows me to build off public hosts
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
268 'debug', # print out debug info
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
269 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
270
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
271 # print usage message if requested or no args supplied
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
272 if(defined($config->{help}) || !$args) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
273 &usage;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
274 exit(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
275 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
276
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
277 # config file?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
278 if(defined $config->{config}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
279
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
280 open CONFIG, $config->{config} or die "ERROR: Could not open config file \"".$config->{config}."\"\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
281
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
282 while(<CONFIG>) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
283 next if /^\#/;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
284 my ($key, $value) = split /\s+|\=/;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
285 $key =~ s/^\-//g;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
286 $config->{$key} = $value unless defined $config->{$key};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
287 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
288
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
289 close CONFIG;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
290 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
291
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
292 # can't be both quiet and verbose
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
293 die "ERROR: Can't be both quiet and verbose!" if defined($config->{quiet}) && defined($config->{verbose});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
294
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
295 # check file format
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
296 if(defined $config->{format}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
297 die "ERROR: Unrecognised input format specified \"".$config->{format}."\"\n" unless $config->{format} =~ /pileup|vcf|guess|hgvs|ensembl|id/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
298 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
299
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
300 # check convert format
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
301 if(defined $config->{convert}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
302 die "ERROR: Unrecognised output format for conversion specified \"".$config->{convert}."\"\n" unless $config->{convert} =~ /vcf|ensembl|pileup/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
303 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
304
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
305 # connection settings for Ensembl Genomes
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
306 if($config->{genomes}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
307 $config->{host} ||= 'mysql.ebi.ac.uk';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
308 $config->{port} ||= 4157;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
309 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
310
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
311 # connection settings for main Ensembl
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
312 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
313 $config->{species} ||= "homo_sapiens";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
314 $config->{host} ||= 'ensembldb.ensembl.org';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
315 $config->{port} ||= 5306;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
316 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
317
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
318 # refseq or core?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
319 if(defined($config->{refseq})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
320 die "ERROR: SIFT, PolyPhen and Condel predictions not available fore RefSeq transcripts" if defined $config->{sift} || defined $config->{polyphen} || defined $config->{condel};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
321
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
322 $config->{core_type} = 'otherfeatures';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
323 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
324 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
325 $config->{core_type} = 'core';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
326 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
327
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
328 # output term
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
329 if(defined $config->{terms}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
330 die "ERROR: Unrecognised consequence term type specified \"".$config->{terms}."\" - must be one of ensembl, so, ncbi\n" unless $config->{terms} =~ /ensembl|display|so|ncbi/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
331 if($config->{terms} =~ /ensembl|display/i) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
332 $config->{terms} = 'display';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
333 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
334 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
335 $config->{terms} = uc($config->{terms});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
336 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
337 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
338
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
339 # check nsSNP tools
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
340 foreach my $tool(grep {defined $config->{lc($_)}} qw(SIFT PolyPhen Condel)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
341 die "ERROR: Unrecognised option for $tool \"", $config->{lc($tool)}, "\" - must be one of p (prediction), s (score) or b (both)\n" unless $config->{lc($tool)} =~ /^(s|p|b)/;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
342
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
343 die "ERROR: $tool not available for this species\n" unless $config->{species} =~ /human|homo/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
344
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
345 die "ERROR: $tool not available in standalone mode\n" if defined($config->{standalone});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
346
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
347 # use V2 of the Condel algorithm, possibly gives fewer false positives
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
348 if($tool eq 'Condel' && $config->{lc($tool)} =~ /1$/) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
349 $Bio::EnsEMBL::Variation::Utils::Condel::USE_V2 = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
350 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
351 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
352
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
353 # force quiet if outputting to STDOUT
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
354 if(defined($config->{output_file}) && $config->{output_file} =~ /stdout/i) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
355 delete $config->{verbose} if defined($config->{verbose});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
356 $config->{quiet} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
357 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
358
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
359 # summarise options if verbose
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
360 if(defined $config->{verbose}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
361 my $header =<<INTRO;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
362 #----------------------------------#
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
363 # ENSEMBL VARIANT EFFECT PREDICTOR #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
364 #----------------------------------#
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
365
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
366 version $VERSION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
367
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
368 By Will McLaren (wm2\@ebi.ac.uk)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
369
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
370 Configuration options:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
371
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
372 INTRO
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
373 print $header;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
374
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
375 my $max_length = (sort {$a <=> $b} map {length($_)} keys %$config)[-1];
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
376
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
377 foreach my $key(sort keys %$config) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
378 print $key.(' ' x (($max_length - length($key)) + 4)).$config->{$key}."\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
379 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
380
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
381 print "\n".("-" x 20)."\n\n";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
382 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
383
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
384 # set defaults
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
385 $config->{user} ||= 'anonymous';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
386 $config->{buffer_size} ||= 5000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
387 $config->{chunk_size} ||= '50kb';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
388 $config->{output_file} ||= "variant_effect_output.txt";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
389 $config->{tmpdir} ||= '/tmp';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
390 $config->{format} ||= 'guess';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
391 $config->{terms} ||= 'display';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
392 $config->{gene} ||= 1 unless defined($config->{whole_genome});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
393 $config->{cache_region_size} ||= 1000000;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
394 $config->{dir} ||= join '/', ($ENV{'HOME'}, '.vep');
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
395 $config->{compress} ||= 'zcat';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
396
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
397 # frequency filtering
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
398 if(defined($config->{check_frequency})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
399 foreach my $flag(qw(freq_freq freq_filter freq_pop freq_gt_lt)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
400 die "ERROR: To use --check_frequency you must also specify flag --$flag" unless defined $config->{$flag};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
401 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
402
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
403 # need to set check_existing
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
404 $config->{check_existing} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
405 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
406
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
407 $config->{check_existing} = 1 if defined $config->{check_alleles};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
408
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
409 # warn users still using whole_genome flag
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
410 if(defined($config->{whole_genome})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
411 debug("INFO: Whole-genome mode is now the default run-mode for the script. To disable it, use --no_whole_genome") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
412 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
413
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
414 $config->{whole_genome} = 1 unless defined $config->{no_whole_genome};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
415 $config->{include_failed} = 1 unless defined $config->{include_failed};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
416 $config->{chunk_size} =~ s/mb?/000000/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
417 $config->{chunk_size} =~ s/kb?/000/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
418 $config->{cache_region_size} =~ s/mb?/000000/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
419 $config->{cache_region_size} =~ s/kb?/000/i;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
420
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
421 # cluck and display executed SQL?
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
422 $Bio::EnsEMBL::DBSQL::StatementHandle::cluck = 1 if defined($config->{cluck});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
423
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
424 # standalone needs cache, can't use HGVS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
425 if(defined($config->{standalone})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
426 $config->{cache} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
427
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
428 die("ERROR: Cannot generate HGVS coordinates in standalone mode") if defined($config->{hgvs});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
429 die("ERROR: Cannot use HGVS as input in standalone mode") if $config->{format} eq 'hgvs';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
430 die("ERROR: Cannot use variant identifiers as input in standalone mode") if $config->{format} eq 'id';
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
431 die("ERROR: Cannot do frequency filtering in standalone mode") if defined($config->{check_frequency});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
432 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
433
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
434 # write_cache needs cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
435 $config->{cache} = 1 if defined $config->{write_cache};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
436
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
437 # no_slice_cache, prefetch and whole_genome have to be on to use cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
438 if(defined($config->{cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
439 $config->{prefetch} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
440 $config->{no_slice_cache} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
441 $config->{whole_genome} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
442 $config->{strip} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
443 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
444
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
445 $config->{build} = $config->{rebuild} if defined($config->{rebuild});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
446
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
447 # force options for full build
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
448 if(defined($config->{build})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
449 $config->{prefetch} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
450 $config->{gene} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
451 $config->{hgnc} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
452 $config->{no_slice_cache} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
453 $config->{cache} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
454 $config->{strip} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
455 $config->{write_cache} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
456 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
457
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
458 # connect to databases
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
459 $config->{reg} = &connect_to_dbs($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
460
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
461 # complete dir with species name and db_version
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
462 $config->{dir} .= '/'.(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
463 join '/', (
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
464 defined($config->{standalone}) ? $config->{species} : ($config->{reg}->get_alias($config->{species}) || $config->{species}),
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
465 $config->{db_version} || $config->{reg}->software_version
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
466 )
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
467 );
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
468
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
469 if(defined($config->{cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
470 # read cache info
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
471 if(read_cache_info($config)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
472 debug("Read existing cache info") unless defined $config->{quiet};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
473 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
474 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
475
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
476 # include regulatory modules if requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
477 if(defined($config->{regulatory})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
478 # do the use statements here so that users don't have to have the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
479 # funcgen API installed to use the rest of the script
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
480 eval q{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
481 use Bio::EnsEMBL::Funcgen::DBSQL::RegulatoryFeatureAdaptor;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
482 use Bio::EnsEMBL::Funcgen::DBSQL::MotifFeatureAdaptor;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
483 use Bio::EnsEMBL::Funcgen::MotifFeature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
484 use Bio::EnsEMBL::Funcgen::RegulatoryFeature;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
485 use Bio::EnsEMBL::Funcgen::BindingMatrix;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
486 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
487
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
488 if($@) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
489 die("ERROR: Ensembl Funcgen API must be installed to use --regulatory\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
490 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
491 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
492
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
493 # warn user cache directory doesn't exist
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
494 if(!-e $config->{dir}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
495
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
496 # if using write_cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
497 if(defined($config->{write_cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
498 debug("INFO: Cache directory ", $config->{dir}, " not found - it will be created") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
499 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
500
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
501 # want to read cache, not found
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
502 elsif(defined($config->{cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
503 die("ERROR: Cache directory ", $config->{dir}, " not found");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
504 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
505 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
506
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
507 # suppress warnings that the FeatureAdaptors emit if using no_slice_cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
508 Bio::EnsEMBL::Utils::Exception::verbose(1999) if defined($config->{no_slice_cache});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
509
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
510 # get adaptors
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
511 if(defined($config->{cache}) && !defined($config->{write_cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
512
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
513 # try and load adaptors from cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
514 if(!&load_dumped_adaptor_cache($config)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
515 &get_adaptors($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
516 &dump_adaptor_cache($config) if defined($config->{write_cache});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
517 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
518
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
519 # check cached adaptors match DB params
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
520 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
521 my $dbc = $config->{sa}->{dbc};
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
522
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
523 my $ok = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
524
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
525 if($dbc->{_host} ne $config->{host}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
526
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
527 # ens-livemirror, useastdb and ensembldb should all have identical DBs
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
528 unless(
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
529 (
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
530 $dbc->{_host} eq 'ens-livemirror'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
531 || $dbc->{_host} eq 'ensembldb.ensembl.org'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
532 || $dbc->{_host} eq 'useastdb.ensembl.org'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
533 ) && (
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
534 $config->{host} eq 'ens-livemirror'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
535 || $config->{host} eq 'ensembldb.ensembl.org'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
536 || $config->{host} eq 'useastdb.ensembl.org'
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
537 )
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
538 ) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
539 $ok = 0;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
540 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
541
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
542 # but we still need to reconnect
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
543 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, " - reconnecting to host") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
544
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
545 &get_adaptors($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
546 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
547
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
548 if(!$ok) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
549 if(defined($config->{skip_db_check})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
550 debug("INFO: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}) unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
551 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
552 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
553 die "ERROR: Defined host ", $config->{host}, " is different from cached ", $dbc->{_host}, ". If you are sure this is OK, rerun with -skip_db_check flag set";
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
554 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
555 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
556 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
557 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
558 else {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
559 &get_adaptors($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
560 &dump_adaptor_cache($config) if defined($config->{write_cache})
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
561 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
562
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
563 # reg adaptors (only fetches if not retrieved from cache already)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
564 &get_reg_adaptors($config) if defined($config->{regulatory});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
565
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
566 # get terminal width for progress bars
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
567 unless(defined($config->{quiet})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
568 my $width;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
569
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
570 # module may not be installed
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
571 eval q{
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
572 use Term::ReadKey;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
573 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
574
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
575 if(!$@) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
576 my ($w, $h);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
577
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
578 # module may be installed, but GetTerminalSize() can still die, so trap that too
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
579 eval {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
580 ($w, $h) = GetTerminalSize();
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
581 };
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
582
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
583 $width = $w if defined $w;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
584 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
585
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
586 $width ||= 60;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
587 $width -= 12;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
588 $config->{terminal_width} = $width;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
589 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
590
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
591 # jump out to build cache if requested
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
592 if(defined($config->{build})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
593
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
594 if($config->{host} =~ /^(ensembl|useast)db\.ensembl\.org$/ && !defined($config->{admin})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
595 die("ERROR: Cannot build cache using public database server ", $config->{host}, "\n");
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
596 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
597
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
598 # build the cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
599 debug("Building cache for ".$config->{species}) unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
600 build_full_cache($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
601
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
602 # exit script
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
603 debug("Finished building cache") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
604 exit(0);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
605 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
606
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
607 # warn user DB will be used for SIFT/PolyPhen/Condel/HGVS/frequency
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
608 if(defined($config->{cache})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
609
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
610 # these two def depend on DB
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
611 foreach my $param(grep {defined $config->{$_}} qw(hgvs check_frequency)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
612 debug("INFO: Database will be accessed when using --$param") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
613 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
614
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
615 # as does using HGVS or IDs as input
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
616 debug("INFO: Database will be accessed when using --format ", $config->{format}) if ($config->{format} eq 'id' || $config->{format} eq 'hgvs') && !defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
617
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
618 # the rest may be in the cache
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
619 foreach my $param(grep {defined $config->{$_}} qw(sift polyphen condel regulatory)) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
620 next if defined($config->{'cache_'.$param});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
621 debug("INFO: Database will be accessed when using --$param; consider using the complete cache containing $param data (see documentation for details)") unless defined($config->{quiet});
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
622 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
623 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
624
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
625 # get list of chrs if supplied
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
626 if(defined($config->{chr})) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
627 my %chrs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
628
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
629 foreach my $val(split /\,/, $config->{chr}) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
630 my @nnn = split /\-/, $val;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
631
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
632 foreach my $chr($nnn[0]..$nnn[-1]) {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
633 $chrs{$chr} = 1;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
634 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
635 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
636
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
637 $config->{chr} = \%chrs;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
638 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
639
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
640 # get input file handle
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
641 $config->{in_file_handle} = &get_in_file_handle($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
642
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
643 # configure output file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
644 $config->{out_file_handle} = &get_out_file_handle($config);
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
645
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
646 return $config;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
647 }
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
648
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# Populate the Ensembl registry and hand it back to the caller.
#
# Argument:
#   $config - hashref of run-time options. Keys read here: standalone,
#             registry, quiet, verbose, no_slice_cache, host, user,
#             password, port, db_version, species.
#
# Returns:
#   The string 'Bio::EnsEMBL::Registry' (the registry is used as a class,
#   so all calls below are class-method calls on that name).
#
# When $config->{standalone} is defined, nothing is loaded and no
# connection is attempted; the class name is returned straight away.
sub connect_to_dbs {
    my ($config) = @_;

    my $registry = 'Bio::EnsEMBL::Registry';

    # standalone mode: skip every DB-related step
    return $registry if defined $config->{standalone};

    if (defined $config->{registry}) {

        # DB options come from a user-supplied registry file
        debug("Loading DB config from registry file ", $config->{registry})
            unless defined $config->{quiet};

        $registry->load_all(
            $config->{registry},
            $config->{verbose},
            undef,
            $config->{no_slice_cache}
        );
    }
    else {

        # no registry file: connect to the DB server directly.
        # The species is only passed on when it matches the
        # "word_word" pattern; otherwise undef is passed.
        my $species_filter = $config->{species} =~ /^[a-z]+\_[a-z]+/i
            ? $config->{species}
            : undef;

        $registry->load_registry_from_db(
            -host       => $config->{host},
            -user       => $config->{user},
            -pass       => $config->{password},
            -port       => $config->{port},
            -db_version => $config->{db_version},
            -species    => $species_filter,
            -verbose    => $config->{verbose},
            -no_cache   => $config->{no_slice_cache},
        );
    }

    # any failure of this call is deliberately ignored
    eval { $registry->set_reconnect_when_lost() };

    # the version report below is only produced in verbose mode
    return $registry unless defined $config->{verbose};

    # fetch the meta container adaptors so the schema versions can be shown
    my $core_meta      = $registry->get_adaptor($config->{species}, 'core', 'metacontainer');
    my $variation_meta = $registry->get_adaptor($config->{species}, 'variation', 'metacontainer');

    if ($core_meta && $variation_meta) {
        debug(
            "Connected to core version ", $core_meta->get_schema_version, " database ",
            "and variation version ", $variation_meta->get_schema_version, " database"
        );
    }

    return $registry;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
700
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# get adaptors from DB
#
# Fetches the adaptors the script needs from the registry in $config->{reg}
# and stores them back into $config:
#   vfa, tva, pfpma, va      - from the 'variation' group
#   sa, ga, ta, mca, csa     - from the group named by $config->{core_type}
#
# Dies if $config->{reg} is not defined, or if no slice adaptor could be
# obtained (i.e. there is no usable core database).
sub get_adaptors {
    my ($config) = @_;

    die "ERROR: No registry" unless defined $config->{reg};

    # config key => adaptor type; pairs are kept in an array so the fetch
    # order is fixed
    my @variation_adaptors = (
        [ vfa   => 'variationfeature' ],
        [ tva   => 'transcriptvariation' ],
        [ pfpma => 'proteinfunctionpredictionmatrix' ],
        [ va    => 'variation' ],
    );

    for my $pair (@variation_adaptors) {
        my ($key, $type) = @$pair;
        $config->{$key} = $config->{reg}->get_adaptor($config->{species}, 'variation', $type);
    }

    if(defined($config->{vfa})) {

        # pass the include_failed setting on, but only when the DB object
        # is present and actually supports the method
        if(defined($config->{vfa}->db) && $config->{vfa}->db->can('include_failed_variations')) {
            $config->{vfa}->db->include_failed_variations($config->{include_failed});
        }
    }

    # get fake ones for species with no var DB
    else {
        $config->{vfa} = Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor->new_fake($config->{species});
        $config->{tva} = Bio::EnsEMBL::Variation::DBSQL::TranscriptVariationAdaptor->new_fake($config->{species});
    }

    my @core_adaptors = (
        [ sa  => 'slice' ],
        [ ga  => 'gene' ],
        [ ta  => 'transcript' ],
        [ mca => 'metacontainer' ],
        [ csa => 'coordsystem' ],
    );

    for my $pair (@core_adaptors) {
        my ($key, $type) = @$pair;
        $config->{$key} = $config->{reg}->get_adaptor($config->{species}, $config->{core_type}, $type);
    }

    # cache schema version
    if(defined $config->{mca}) {
        $config->{mca}->get_schema_version;
    }

    # check we got slice adaptor - can't continue without a core DB
    die("ERROR: Could not connect to core database\n") unless defined $config->{sa};
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
733
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# gets regulatory adaptors
#
# For each feature type listed in @REG_FEAT_TYPES, stores the matching
# 'funcgen' adaptor in $config->{<type>_adaptor} unless one is already set.
# If any adaptor cannot be obtained, $config->{regulatory} is deleted and
# the remaining types are not attempted.
sub get_reg_adaptors {
    my ($config) = @_;

    for my $feature_type (@REG_FEAT_TYPES) {
        my $slot = $feature_type.'_adaptor';

        # keep an adaptor that has already been set up
        next if defined($config->{$slot});

        my $found = $config->{reg}->get_adaptor($config->{species}, 'funcgen', $feature_type);

        # no adaptor for this type: drop the regulatory setting and stop
        unless(defined($found)) {
            delete $config->{regulatory};
            last;
        }

        $config->{$slot} = $found;
    }
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
751
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# gets file handle for input
#
# Reads $config->{input_file}; when it is defined the file must exist.
#   - names ending in ".gz" are read through the command in
#     $config->{compress} via a pipe
#   - any other name is opened directly for reading
# When no input file is configured the string 'STDIN' is returned in place
# of a FileHandle object, and a notice is printed via debug() unless
# $config->{quiet} is defined.
#
# Returns the handle (or 'STDIN'); dies if the file is missing or cannot
# be opened.
sub get_in_file_handle {
    my $config = shift;

    # define the filehandle to read input from
    my $in_file_handle = FileHandle->new;

    if(defined($config->{input_file})) {
        my $input_file = $config->{input_file};

        # check defined input file exists
        die("ERROR: Could not find input file ", $input_file, "\n") unless -e $input_file;

        if($input_file =~ /\.gz$/) {

            # the pipe command is run by the shell, so single-quote the
            # path (escaping embedded single quotes) to keep spaces and
            # shell metacharacters in the file name literal
            (my $quoted_file = $input_file) =~ s/'/'\\''/g;

            $in_file_handle->open($config->{compress}." '".$quoted_file."' |") or die("ERROR: Could not read from input file ", $input_file, "\n");
        }
        else {

            # explicit read mode: the name is never parsed for mode
            # characters and surrounding whitespace is preserved
            $in_file_handle->open($input_file, '<') or die("ERROR: Could not read from input file ", $input_file, "\n");
        }
    }

    # no file specified - try to read data off command line
    else {
        $in_file_handle = 'STDIN';
        debug("Reading input from STDIN (or maybe you forgot to specify an input file?)...") unless defined $config->{quiet};
    }

    return $in_file_handle;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
780
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# gets file handle for output and adds header
#
# Reads from $config: output_file, force_overwrite, convert, gvf, mca,
# cache, dir and reg.
#   - dies if the output file already exists and force_overwrite is not
#     defined
#   - an output_file matching /stdout/i writes to STDOUT instead of a file
#   - with convert defined, only a VCF column header is written (and only
#     when convert matches /vcf/i)
#   - with gvf defined, no header is written
#   - otherwise the "##" header block and the "#"-prefixed column names
#     from @OUTPUT_COLS are written
#
# Returns the handle; dies if the output file cannot be opened.
sub get_out_file_handle {
    my $config = shift;

    # define filehandle to write to
    my $out_file_handle = FileHandle->new;

    # check if file exists
    if(-e $config->{output_file} && !defined($config->{force_overwrite})) {
        die("ERROR: Output file ", $config->{output_file}, " already exists. Specify a different output file with --output_file or overwrite existing file with --force_overwrite\n");
    }

    if($config->{output_file} =~ /stdout/i) {
        $out_file_handle = *STDOUT;
    }
    else {
        $out_file_handle->open(">".$config->{output_file}) or die("ERROR: Could not write to output file ", $config->{output_file}, "\n");
    }

    # file conversion, don't want to add normal headers
    if(defined($config->{convert})) {

        # header for VCF
        if($config->{convert} =~ /vcf/i) {
            print $out_file_handle join "\t", (
                '#CHROM',
                'POS',
                'ID',
                'REF',
                'ALT',
                'QUAL',
                'FILTER',
                'INFO'
            );
            print $out_file_handle "\n";
        }

        return $out_file_handle;
    }

    # GVF output, no header
    elsif(defined($config->{gvf})) {
        return $out_file_handle;
    }

    # make header
    my $time = get_time();

    # declared unconditionally and defaulted to '' so the append and the
    # interpolation below never operate on an undefined value
    my $db_string = '';
    if(defined $config->{mca}) {
        $db_string = $config->{mca}->dbc->dbname." on ".$config->{mca}->dbc->host;
    }
    $db_string .= "\n## Using cache in ".$config->{dir} if defined($config->{cache});

    # '?' stands in when there is no meta container or it reports no version
    my $schema_version = defined $config->{mca} ? $config->{mca}->get_schema_version : undef;
    my $version_string =
        "Using API version ".$config->{reg}->software_version.
        ", DB version ".($schema_version ? $schema_version : '?');

    my $header =<<HEAD;
## ENSEMBL VARIANT EFFECT PREDICTOR v$VERSION
## Output produced at $time
## Connected to $db_string
## $version_string
## Extra column keys:
## CANONICAL : Indicates if transcript is canonical for this gene
## HGNC : HGNC gene identifier
## ENSP : Ensembl protein identifer
## HGVSc : HGVS coding sequence name
## HGVSp : HGVS protein sequence name
## SIFT : SIFT prediction
## PolyPhen : PolyPhen prediction
## Condel : Condel SIFT/PolyPhen consensus prediction
## MATRIX : The source and identifier of a transcription factor binding profile aligned at this position
## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of a transcription factor binding profile
HEAD

    # add headers
    print $out_file_handle $header;

    # add column headers
    print $out_file_handle '#', (join "\t", @OUTPUT_COLS);
    print $out_file_handle "\n";

    return $out_file_handle;
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
860
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# convert a variation feature to a line of output
#
# Looks up the sub named 'convert_to_' followed by the lower-cased value of
# $config->{convert}, calls it with ($config, $vf) and expects an array
# reference of fields back. A non-empty result is written tab-separated,
# followed by a newline, to $config->{out_file_handle}; an empty result
# writes nothing.
sub convert_vf {
    my ($config, $vf) = @_;

    # resolve the converter sub by name
    my $converter_name = 'convert_to_'.lc($config->{convert});
    my $converter = \&{$converter_name};

    my $fields = $converter->($config, $vf);
    my $out = $config->{out_file_handle};

    if(scalar @$fields) {
        print {$out} join("\t", @$fields);
        print {$out} "\n";
    }
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
877
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# converts to Ensembl format
#
# Returns an array reference holding, in order: the chromosome
# ($vf->{chr} if true, otherwise $vf->seq_region_name), start, end,
# allele string, strand and variation name of $vf.
# $config is accepted but not used here.
sub convert_to_ensembl {
    my ($config, $vf) = @_;

    return [
        ($vf->{chr} || $vf->seq_region_name),
        map { $vf->$_ } qw(start end allele_string strand variation_name)
    ];
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
892
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# converts to VCF format
#
# Returns an array reference of the eight VCF columns (CHROM, POS, ID, REF,
# ALT, QUAL, FILTER, INFO) for $vf; the last three are always '.'.
#
# The allele string is split on '/', and '-' characters are removed from
# every allele. If the alleles then differ in length, each allele is
# prefixed with the base before the variant and POS is start - 1;
# otherwise the alleles are used as they are and POS is start.
#
# NOTE(review): $vf->strand is not consulted here, so alleles are emitted
# exactly as given in the allele string - confirm that is intended for
# reverse-strand input.
sub convert_to_vcf {
    my ($config, $vf) = @_;

    my @alleles = split /\//, $vf->allele_string;

    # strip '-' from each allele and note which lengths occur
    my %length_seen;
    for my $idx (0..$#alleles) {
        $alleles[$idx] =~ s/\-//g;
        $length_seen{ length($alleles[$idx]) } = 1;
    }

    # balanced sub
    if(scalar(keys %length_seen) <= 1) {
        my @record = (
            ($vf->{chr} || $vf->seq_region_name),
            $vf->start,
            $vf->variation_name,
        );
        my $ref_allele = shift @alleles;

        return [ @record, $ref_allele, join(",", @alleles), '.', '.', '.' ];
    }

    # in/del/unbalanced

    # we need the ref base before the variation
    # default to N in case we can't get it
    my $prev_base = 'N';

    # the base is only looked up from the slice when no cache is configured
    if(!defined($config->{cache})) {
        my $slice = $vf->slice->sub_Slice($vf->start - 1, $vf->start - 1);
        $prev_base = $slice->seq if defined($slice);
    }

    # alleles already have their '-' removed above
    @alleles = map { $prev_base.$_ } @alleles;

    my @record = (
        ($vf->{chr} || $vf->seq_region_name),
        $vf->start - 1,
        $vf->variation_name,
    );
    my $ref_allele = shift @alleles;

    return [ @record, $ref_allele, join(",", @alleles), '.', '.', '.' ];
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
947
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
948
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# converts to pileup format
#
# Returns an array reference of pileup fields for $vf:
#   - balanced substitution: [ chr, start, first allele, remaining alleles
#     joined with '/' ]
#   - insertion or deletion (allele string contains '-'):
#     [ chr, start - 1, '*', alleles joined with '/' ], each allele
#     prefixed with '+' for an insertion or '-' for a deletion
#   - alleles of different length with no '-' in the allele string: an
#     empty array reference, after a warning unless $config->{quiet} is
#     defined
#
# chr is $vf->{chr} if true, otherwise $vf->seq_region_name.
sub convert_to_pileup {
    my $config = shift;
    my $vf = shift;

    # look for imbalance in the allele string
    my %allele_lengths;
    my @alleles = split /\//, $vf->allele_string;

    # $allele aliases the array element, so this also strips '-' from
    # @alleles itself: a pure '-' allele becomes the empty string
    foreach my $allele(@alleles) {
        $allele =~ s/\-//g;
        $allele_lengths{length($allele)} = 1;
    }

    # balanced sub
    if(scalar(keys %allele_lengths) <= 1) {
        return [
            $vf->{chr} || $vf->seq_region_name,
            $vf->start,
            shift @alleles,
            (join "/", @alleles),
        ];
    }

    # unbalanced without a '-' allele cannot be expressed here
    unless($vf->allele_string =~ /\-/) {
        warn "WARNING: Unable to convert variant to pileup format on line number ", $config->{line_number} unless defined($config->{quiet});
        return [];
    }

    # in/del

    # insertion? the first allele was '-' if it is empty after the
    # stripping above (comparing against '-' here could never match)
    my $sign = '-';
    if($alleles[0] eq '') {
        shift @alleles;
        $sign = '+';
    }

    # drop the empty (originally '-') alleles and sign the rest
    @alleles = map { $sign.$_ } grep { $_ ne '' } @alleles;

    return [
        $vf->{chr} || $vf->seq_region_name,
        $vf->start - 1,
        '*',
        (join "/", @alleles),
    ];
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1014
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
# prints a line of output from the hash
#
# Writes one line, followed by a newline, to $config->{out_file_handle}.
#   - undefined $line: nothing is written
#   - hash reference: $line->{Extra} is replaced in place by its
#     "key=value" pairs joined with ';', then the values for @OUTPUT_COLS
#     are joined with tabs; any false value is written as '-'
#   - anything else (gvf): written as given
sub print_line {
    my ($config, $line) = @_;
    return unless defined($line);

    # gvf: the line is already a finished string
    my $output = $line;

    # normal
    if(ref($line) eq 'HASH') {
        my $extra = $line->{Extra} || {};

        my @pairs;
        foreach my $key(keys %$extra) {
            push @pairs, $key.'='.$extra->{$key};
        }

        # note: this overwrites the caller's Extra entry with the string
        $line->{Extra} = join ';', @pairs;

        my @columns;
        foreach my $col(@OUTPUT_COLS) {
            push @columns, ($line->{$col} || '-');
        }
        $output = join "\t", @columns;
    }

    print { $config->{out_file_handle} } "$output\n";
}
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1037
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1038 # outputs usage message
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1039 sub usage {
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1040 my $usage =<<END;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1041 #----------------------------------#
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1042 # ENSEMBL VARIANT EFFECT PREDICTOR #
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1043 #----------------------------------#
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1044
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1045 version $VERSION
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1046
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1047 By Will McLaren (wm2\@ebi.ac.uk)
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1048
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1049 http://www.ensembl.org/info/docs/variation/vep/vep_script.html
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1050
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1051 Usage:
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1052 perl variant_effect_predictor.pl [arguments]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1053
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1054 Options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1055 =======
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1056
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1057 --help Display this message and quit
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1058 --verbose Display verbose output as the script runs [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1059 --quiet Suppress status and warning messages [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1060 --no_progress Suppress progress bars [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1061
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1062 --config Load configuration from file. Any command line options
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1063 specified overwrite those in the file [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1064
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1065 -i | --input_file Input file - if not specified, reads from STDIN. Files
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1066 may be gzip compressed.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1067 --format Specify input file format - one of "ensembl", "pileup",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1068 "vcf", "hgvs", "id" or "guess" to try and work out format.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1069 -o | --output_file Output file. Write to STDOUT by specifying -o STDOUT - this
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1070 will force --quiet [default: "variant_effect_output.txt"]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1071 --force_overwrite Force overwriting of output file [default: quit if file
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1072 exists]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1073
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1074 --species [species] Species to use [default: "human"]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1075
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1076 -t | --terms Type of consequence terms to output - one of "ensembl", "SO",
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1077 "NCBI" [default: ensembl]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1078
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1079 --sift=[p|s|b] Add SIFT [p]rediction, [s]core or [b]oth [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1080 --polyphen=[p|s|b] Add PolyPhen [p]rediction, [s]core or [b]oth [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1081 --condel=[p|s|b] Add Condel SIFT/PolyPhen consensus [p]rediction, [s]core or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1082 [b]oth. Add 1 (e.g. b1) to option to use old Condel algorithm
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1083 [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1084
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1085 NB: SIFT, PolyPhen and Condel predictions are currently available for human only
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1086
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1087 --regulatory Look for overlaps with regulatory regions. The script can
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1088 also call if a variant falls in a high information position
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1089 within a transcription factor binding site. Output lines have
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1090 a Feature type of RegulatoryFeature or MotifFeature
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1091 [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1092
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1093 NB: Regulatory consequences are currently available for human and mouse only
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1094
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1095 --hgnc If specified, HGNC gene identifiers are output alongside the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1096 Ensembl Gene identifier [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1097 --hgvs Output HGVS identifiers (coding and protein). Requires database
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1098 connection [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1099 --protein Output Ensembl protein identifier [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1100 --gene Force output of Ensembl gene identifier - disabled by default
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1101 unless using --cache or --no_whole_genome [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1102 --canonical Indicate if the transcript for this consequence is the canonical
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1103 transcript for this gene [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1104
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1105 --coding_only Only return consequences that fall in the coding region of
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1106 transcripts [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1107 --most_severe Output only the most severe consequence per variation.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1108 Transcript-specific columns will be left blank. [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1109 --summary Output only a comma-separated list of all consequences per
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1110 variation. Transcript-specific columns will be left blank.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1111 [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1112 --per_gene Output only the most severe consequence per gene. Where more
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1113 than one transcript has the same consequence, the transcript
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1114 chosen is arbitrary. [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1115
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1116
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1117 --check_ref If specified, checks supplied reference allele against stored
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1118 entry in Ensembl Core database [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1119 --check_existing If specified, checks for existing co-located variations in the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1120 Ensembl Variation database [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1121 --check_alleles If specified, the alleles of existing co-located variations
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1122 are compared to the input; an existing variation will only
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1123 be reported if no novel allele is in the input (strand is
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1124 accounted for) [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1125
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1126 --no_intergenic Excludes intergenic consequences from the output [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1127
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1128 --check_frequency Turns on frequency filtering. Use this to include or exclude
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1129 variants based on the frequency of co-located existing
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1130 variants in the Ensembl Variation database. You must also
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1131 specify all of the following --freq flags [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1132 --freq_pop [pop] Name of the population to use e.g. hapmap_ceu for CEU HapMap,
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1133 1kg_yri for YRI 1000 genomes. See documentation for more
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1134 details
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1135 --freq_freq [freq] Frequency to use in filter. Must be a number between 0 and 0.5
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1136 --freq_gt_lt [gt|lt] Specify whether the frequency should be greater than (gt) or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1137 less than (lt) --freq_freq
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1138 --freq_filter Specify whether variants that pass the above should be included
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1139 [exclude|include] or excluded from analysis
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1140
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1141 --chr [list] Select a subset of chromosomes to analyse from your file. Any
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1142 data not on these chromosomes in the input will be skipped. The
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1143 list can be comma separated, with "-" characters representing
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1144 a range e.g. 1-5,8,15,X [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1145 --gp If specified, tries to read GRCh37 position from GP field in the
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1146 INFO column of a VCF file. Only applies when VCF is the input
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1147 format and human is the species [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1148
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1149 --convert Convert the input file to the output format specified.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1150 [ensembl|vcf|pileup] Converted output is written to the file specified in
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1151 --output_file. No consequence calculation is carried out when
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1152 doing file conversion. [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1153
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1154 --refseq Use the otherfeatures database to retrieve transcripts - this
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1155 database contains RefSeq transcripts (as well as CCDS and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1156 Ensembl EST alignments) [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1157 --host Manually define database host [default: "ensembldb.ensembl.org"]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1158 -u | --user Database username [default: "anonymous"]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1159 --port Database port [default: 5306]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1160 --password Database password [default: no password]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1161 --genomes Sets DB connection params for Ensembl Genomes [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1162 --registry Registry file to use; defines DB connections [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1163 Defining a registry file overrides above connection settings.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1164 --db_version=[number] Force script to load DBs from a specific Ensembl version. Not
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1165 advised due to likely incompatibilities between API and DB
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1166
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1167 --no_whole_genome Run in old-style, non-whole genome mode [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1168 --buffer_size Sets the number of variants sent in each batch [default: 5000]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1169 Increasing buffer size can retrieve results more quickly
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1170 but requires more memory. Only applies to whole genome mode.
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1171
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1172 --cache Enables read-only use of cache [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1173 --dir [directory] Specify the base cache directory to use [default: "\$HOME/.vep/"]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1174 --write_cache Enable writing to cache [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1175 --build [all|list] Build a complete cache for the selected species. Build for all
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1176 chromosomes with --build all, or a list of chromosomes (see
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1177 --chr). DO NOT USE WHEN CONNECTED TO PUBLIC DB SERVERS AS THIS
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1178 VIOLATES OUR FAIR USAGE POLICY [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1179
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1180 --compress Specify utility to decompress cache files - may be "gzcat" or
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1181 "gzip -dc". Only use if default does not work [default: zcat]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1182
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1183 --skip_db_check ADVANCED! Force the script to use a cache built from a different
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1184 database than specified with --host. Only use this if you are
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1185 sure the hosts are compatible (e.g. ensembldb.ensembl.org and
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1186 useastdb.ensembl.org) [default: off]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1187 --cache_region_size ADVANCED! The size in base-pairs of the region covered by one
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1188 file in the cache. [default: 1MB]
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1189 END
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1190
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1191 print $usage;
21066c0abaf5 Uploaded
willmclaren
parents:
diff changeset
1192 }