0
|
1 =pod
|
|
2
|
|
3 =head1 NAME
|
|
4
|
|
5 Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
|
|
6
|
|
7 =head1 DESCRIPTION
|
|
8
|
|
9 'SWEmbl' Is a base class for other classes dealing with SWEmbl
|
|
10 It contains virtually nothing so it may disappear and just pass to Funcgen
|
|
11
|
|
12 =cut
|
|
13
|
|
14 package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl;
|
|
15
|
|
16 use warnings;
|
|
17 use strict;
|
|
18 use Bio::EnsEMBL::Funcgen::Utils::Helper;
|
|
19 use Bio::EnsEMBL::DBSQL::DBAdaptor;
|
|
20 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor;
|
|
21 use Bio::EnsEMBL::Funcgen::InputSet;
|
|
22 use Bio::EnsEMBL::Funcgen::DataSet;
|
|
23 use Bio::EnsEMBL::Funcgen::FeatureSet;
|
|
24 use Bio::EnsEMBL::Funcgen::AnnotatedFeature;
|
|
25
|
|
26 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen');
|
|
27
|
|
28 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump);
|
|
29 use Data::Dumper;
|
|
30
|
|
31 sub fetch_input {
|
|
32 my $self = shift @_;
|
|
33
|
|
34 $self->SUPER::fetch_input();
|
|
35
|
|
36 my $efgdba = $self->_efgdba;
|
|
37
|
|
38 my $aa = $efgdba->get_AnalysisAdaptor;
|
|
39 my $analysis = $self->param('analysis') || throw "Need to specify the analysis";
|
|
40 my $analysis_obj = $aa->fetch_by_logic_name($analysis);
|
|
41 #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type...
|
|
42 if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; }
|
|
43 $self->_analysis($analysis_obj);
|
|
44
|
|
45 $self->_feature_set_name($self->_set_name()."_".$analysis);
|
|
46
|
|
47 my $cell_type = $self->_cell_type()->name;
|
|
48 my $feature_type = $self->_feature_type()->name;
|
|
49 my $file_type = $self->_file_type();
|
|
50 my $experiment_name = $self->_experiment_name();
|
|
51
|
|
52 #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed?
|
|
53 #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id
|
|
54 #my $input_file = $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz";
|
|
55 my $input_file = $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz";
|
|
56 $self->_input_file($input_file);
|
|
57
|
|
58 my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name;
|
|
59 my $input_dir = $self->param('input_dir') || $work_dir;
|
|
60 $self->_input_dir($input_dir);
|
|
61
|
|
62 my $skip_control = $self->_skip_control($self->param('skip_control'));
|
|
63 if(!$skip_control){
|
|
64 #May also be passed as input_id, but then input_id may need to be TEXT
|
|
65 #Control must be in same dir as input file... maybe change this...
|
|
66 if(!$self->param('control_file')){
|
|
67 #For the moment the control file has to be the same file type as the input file, but does not need to be the case...
|
|
68 #TODO Need to validate here if that's the case...
|
|
69 my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'";
|
|
70 my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz";
|
|
71 $self->_control_file($control_file);
|
|
72 }
|
|
73 }
|
|
74
|
|
75 return 1;
|
|
76 }
|
|
77
|
|
78
|
|
79 # Private function only to be called by subclasses of this class
|
|
80 # prepares data to be used with SWEMBL...
|
|
81 # sorts the input, removes mythochondria and unaligned reads...
|
|
82 sub _preprocess_file{
|
|
83
|
|
84 #Consider using hash to process input
|
|
85 my ($self, $input, $output, $file_type) = (shift, shift, shift, shift);
|
|
86
|
|
87 #For the moment we always overwrite any existent file...
|
|
88 #Maybe reuse previously cached files? How to check if they are corrupted?
|
|
89 #Maybe create a -reuse flag?
|
|
90
|
|
91 #running piped system commands is a potential source of untraceable errors!!
|
|
92 #TODO try changing this... e.g. with Bio::DB::Sam ... ?
|
|
93 my $command;
|
|
94 if($file_type eq 'bed'){
|
|
95 $command = "gzip -dc ${input}";
|
|
96
|
|
97 #Remove mitochondria before SWEMBL
|
|
98 warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria";
|
|
99 warn "Duplicates are not removed in Bed files while they are in sam files";
|
|
100 #This probably won't work for many BED files... e.g chrM or sam-like "beds"...
|
|
101 $command = " | grep -v '^MT' ";
|
|
102 $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output";
|
|
103
|
|
104 } elsif($file_type eq 'sam'){
|
|
105
|
|
106 #Manual Alternative to samtools
|
|
107 #$command = "gzip -dc ".$self->param($parameter);
|
|
108 #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file;
|
|
109
|
|
110 #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps...
|
|
111 #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now
|
|
112 $command = "gzip -dc $input | ";
|
|
113
|
|
114 #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches
|
|
115 #TODO do this in a better, more generic way
|
|
116 $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | ";
|
|
117 $command .= "grep -v '^MT' | grep -v '^chrM' | ";
|
|
118
|
|
119 #Remove unmapped reads...
|
|
120 $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | ";
|
|
121
|
|
122 # This piped sort is not working!! (this problem has been reported in the samtools mailing list)
|
|
123 #TODO Check why this pipe in the sort is not working...
|
|
124 #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ;
|
|
125 #$command .= "samtools view -h - | gzip -c > $work_file";
|
|
126
|
|
127 # Create temp file for now and remove it when we figure out what's wrong with the piped sort!
|
|
128 $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ;
|
|
129
|
|
130 #Add a remove duplicates step (this is not supported with BED files for the moment)
|
|
131 $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | ";
|
|
132 $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output";
|
|
133
|
|
134 #Alternative with no rmdup...
|
|
135 #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output";
|
|
136
|
|
137 $command .= " ; rm -f ${input}_tmp.bam";
|
|
138
|
|
139 }
|
|
140 else{
|
|
141 throw("$file_type file format not supported");
|
|
142 }
|
|
143
|
|
144 system($command) && throw("Failed processing $input with command $command");
|
|
145
|
|
146 return 1;
|
|
147 }
|
|
148
|
|
149 # Private function only to be called by subclasses of this class
|
|
150 # gets the number of reads in a sam or bed file
|
|
151 #sub _get_number_of_reads {
|
|
152 # my ($self, $file, $file_type) = (shift, shift, shift);
|
|
153 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
|
|
154 # my $nbr_reads = 0;
|
|
155 # #If needed, add an option to check if is zipped or not...
|
|
156 # my $open_cmd = "gzip -dc $file |";
|
|
157 # open(FILE,$open_cmd);
|
|
158 # while(<FILE>){
|
|
159 # if($file_type eq "sam"){
|
|
160 # next if /^\@SQ/;
|
|
161 # }else {
|
|
162 # next if /track name=/o;
|
|
163 # }
|
|
164 # $nbr_reads++;
|
|
165 # }
|
|
166 # close FILE;
|
|
167 # return $nbr_reads;
|
|
168 #}
|
|
169
|
|
170 # Private function only to be called by subclasses of this class
|
|
171 # gets the number of reads in a sam or bed file
|
|
172 #sub _get_slices {
|
|
173 # #NOT DONE!!
|
|
174 # my ($self, $file, $file_type) = (shift, shift, shift);
|
|
175 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; }
|
|
176 # my $nbr_reads = 0;
|
|
177 # #If needed, add an option to check if is zipped or not...
|
|
178 # my $open_cmd = "gzip -dc $file |";
|
|
179 # open(FILE,$open_cmd);
|
|
180 # while(<FILE>){
|
|
181 # if($file_type eq "sam"){
|
|
182 # next if /^@SQ/;
|
|
183 # }else {
|
|
184 # next if /track name=/o;
|
|
185 # }
|
|
186 # $nbr_reads++;
|
|
187 # }
|
|
188 # close FILE;
|
|
189 # return $nbr_reads;
|
|
190 #}
|
|
191
|
|
192
|
|
193 #Private getter / setter to the Feature Set Name
|
|
194 sub _feature_set_name {
|
|
195 return $_[0]->_getter_setter('feature_set_name',$_[1]);
|
|
196 }
|
|
197
|
|
198 #Private getter / setter to the input folder
|
|
199 sub _input_dir {
|
|
200 return $_[0]->_getter_setter('input_dir',$_[1]);
|
|
201 }
|
|
202
|
|
203 #Private getter / setter to the Input subset (usually a file name)
|
|
204 sub _input_file {
|
|
205 return $_[0]->_getter_setter('input_file',$_[1]);
|
|
206 }
|
|
207
|
|
208 #Private getter / setter to the skip control option
|
|
209 sub _skip_control {
|
|
210 return $_[0]->_getter_setter('skip_control',$_[1]);
|
|
211 }
|
|
212
|
|
213 #Private getter / setter to the control file
|
|
214 sub _control_file {
|
|
215 return $_[0]->_getter_setter('control_file',$_[1]);
|
|
216 }
|
|
217
|
|
218 1;
|
|
219
|