comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm @ 0:1f6dce3d34e0

Uploaded
author mahtabm
date Thu, 11 Apr 2013 02:01:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:1f6dce3d34e0
1 =pod
2
3 =head1 NAME
4
5 Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
6
7 =head1 DESCRIPTION
8
9 'SWEmbl' Is a base class for other classes dealing with SWEmbl
10 It contains virtually nothing so it may disappear and just pass to Funcgen
11
12 =cut
13
14 package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
15
16 use warnings;
17 use strict;
18 use Bio::EnsEMBL::Funcgen::Utils::Helper;
19 use Bio::EnsEMBL::DBSQL::DBAdaptor;
20 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
21 use Bio::EnsEMBL::Funcgen::InputSet;
22 use Bio::EnsEMBL::Funcgen::DataSet;
23 use Bio::EnsEMBL::Funcgen::FeatureSet;
24 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
25
26 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen');
27
28 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
29 use Data::Dumper;
30
31 sub fetch_input {
32 my $self = shift @_;
33
34 $self->SUPER::fetch_input();
35
36 my $efgdba = $self->_efgdba;
37
38 my $aa = $efgdba->get_AnalysisAdaptor;
39 my $analysis = $self->param('analysis') || throw "Need to specify the analysis";
40 my $analysis_obj = $aa->fetch_by_logic_name($analysis);
41 #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type...
42 if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; }
43 $self->_analysis($analysis_obj);
44
45 $self->_feature_set_name($self->_set_name()."_".$analysis);
46
47 my $cell_type = $self->_cell_type()->name;
48 my $feature_type = $self->_feature_type()->name;
49 my $file_type = $self->_file_type();
50 my $experiment_name = $self->_experiment_name();
51
52 #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed?
53 #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id
54 #my $input_file = $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz";
55 my $input_file = $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz";
56 $self->_input_file($input_file);
57
58 my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name;
59 my $input_dir = $self->param('input_dir') || $work_dir;
60 $self->_input_dir($input_dir);
61
62 my $skip_control = $self->_skip_control($self->param('skip_control'));
63 if(!$skip_control){
64 #May also be passed as input_id, but then input_id may need to be TEXT
65 #Control must be in same dir as input file... maybe change this...
66 if(!$self->param('control_file')){
67 #For the moment the control file has to be the same file type as the input file, but does not need to be the case...
68 #TODO Need to validate here if that's the case...
69 my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'";
70 my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz";
71 $self->_control_file($control_file);
72 }
73 }
74
75 return 1;
76 }
77
78
79 # Private function only to be called by subclasses of this class
80 # prepares data to be used with SWEMBL...
81 # sorts the input, removes mythochondria and unaligned reads...
82 sub _preprocess_file{
83
84 #Consider using hash to process input
85 my ($self, $input, $output, $file_type) = (shift, shift, shift, shift);
86
87 #For the moment we always overwrite any existent file...
88 #Maybe reuse previously cached files? How to check if they are corrupted?
89 #Maybe create a -reuse flag?
90
91 #running piped system commands is a potential source of untraceable errors!!
92 #TODO try changing this... e.g. with Bio::DB::Sam ... ?
93 my $command;
94 if($file_type eq 'bed'){
95 $command = "gzip -dc ${input}";
96
97 #Remove mitochondria before SWEMBL
98 warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria";
99 warn "Duplicates are not removed in Bed files while they are in sam files";
100 #This probably won't work for many BED files... e.g chrM or sam-like "beds"...
101 $command = " | grep -v '^MT' ";
102 $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output";
103
104 } elsif($file_type eq 'sam'){
105
106 #Manual Alternative to samtools
107 #$command = "gzip -dc ".$self->param($parameter);
108 #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file;
109
110 #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps...
111 #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now
112 $command = "gzip -dc $input | ";
113
114 #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches
115 #TODO do this in a better, more generic way
116 $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | ";
117 $command .= "grep -v '^MT' | grep -v '^chrM' | ";
118
119 #Remove unmapped reads...
120 $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | ";
121
122 # This piped sort is not working!! (this problem has been reported in the samtools mailing list)
123 #TODO Check why this pipe in the sort is not working...
124 #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ;
125 #$command .= "samtools view -h - | gzip -c > $work_file";
126
127 # Create temp file for now and remove it when we figure out what's wrong with the piped sort!
128 $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ;
129
130 #Add a remove duplicates step (this is not supported with BED files for the moment)
131 $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | ";
132 $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output";
133
134 #Alternative with no rmdup...
135 #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output";
136
137 $command .= " ; rm -f ${input}_tmp.bam";
138
139 }
140 else{
141 throw("$file_type file format not supported");
142 }
143
144 system($command) && throw("Failed processing $input with command $command");
145
146 return 1;
147 }
148
149 # Private function only to be called by subclasses of this class
150 # gets the number of reads in a sam or bed file
151 #sub _get_number_of_reads {
152 # my ($self, $file, $file_type) = (shift, shift, shift);
153 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
154 # my $nbr_reads = 0;
155 # #If needed, add an option to check if is zipped or not...
156 # my $open_cmd = "gzip -dc $file |";
157 # open(FILE,$open_cmd);
158 # while(<FILE>){
159 # if($file_type eq "sam"){
160 # next if /^\@SQ/;
161 # }else {
162 # next if /track name=/o;
163 # }
164 # $nbr_reads++;
165 # }
166 # close FILE;
167 # return $nbr_reads;
168 #}
169
170 # Private function only to be called by subclasses of this class
171 # gets the number of reads in a sam or bed file
172 #sub _get_slices {
173 # #NOT DONE!!
174 # my ($self, $file, $file_type) = (shift, shift, shift);
175 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
176 # my $nbr_reads = 0;
177 # #If needed, add an option to check if is zipped or not...
178 # my $open_cmd = "gzip -dc $file |";
179 # open(FILE,$open_cmd);
180 # while(<FILE>){
181 # if($file_type eq "sam"){
182 # next if /^@SQ/;
183 # }else {
184 # next if /track name=/o;
185 # }
186 # $nbr_reads++;
187 # }
188 # close FILE;
189 # return $nbr_reads;
190 #}
191
192
193 #Private getter / setter to the Feature Set Name
194 sub _feature_set_name {
195 return $_[0]->_getter_setter('feature_set_name',$_[1]);
196 }
197
198 #Private getter / setter to the input folder
199 sub _input_dir {
200 return $_[0]->_getter_setter('input_dir',$_[1]);
201 }
202
203 #Private getter / setter to the Input subset (usually a file name)
204 sub _input_file {
205 return $_[0]->_getter_setter('input_file',$_[1]);
206 }
207
208 #Private getter / setter to the skip control option
209 sub _skip_control {
210 return $_[0]->_getter_setter('skip_control',$_[1]);
211 }
212
213 #Private getter / setter to the control file
214 sub _control_file {
215 return $_[0]->_getter_setter('control_file',$_[1]);
216 }
217
218 1;
219