Mercurial > repos > mahtabm > ensembl
comparison variant_effect_predictor/Bio/EnsEMBL/Funcgen/RunnableDB/SWEmbl.pm @ 0:1f6dce3d34e0
Uploaded
author | mahtabm |
---|---|
date | Thu, 11 Apr 2013 02:01:53 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1f6dce3d34e0 |
---|---|
1 =pod | |
2 | |
3 =head1 NAME | |
4 | |
5 Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; | |
6 | |
7 =head1 DESCRIPTION | |
8 | |
9 'SWEmbl' Is a base class for other classes dealing with SWEmbl | |
10 It contains virtually nothing so it may disappear and just pass to Funcgen | |
11 | |
12 =cut | |
13 | |
14 package Bio::EnsEMBL::Funcgen::RunnableDB::SWEmbl; | |
15 | |
16 use warnings; | |
17 use strict; | |
18 use Bio::EnsEMBL::Funcgen::Utils::Helper; | |
19 use Bio::EnsEMBL::DBSQL::DBAdaptor; | |
20 use Bio::EnsEMBL::Funcgen::DBSQL::DBAdaptor; | |
21 use Bio::EnsEMBL::Funcgen::InputSet; | |
22 use Bio::EnsEMBL::Funcgen::DataSet; | |
23 use Bio::EnsEMBL::Funcgen::FeatureSet; | |
24 use Bio::EnsEMBL::Funcgen::AnnotatedFeature; | |
25 | |
26 use base ('Bio::EnsEMBL::Funcgen::RunnableDB::Funcgen'); | |
27 | |
28 use Bio::EnsEMBL::Utils::Exception qw(throw warning stack_trace_dump); | |
29 use Data::Dumper; | |
30 | |
31 sub fetch_input { | |
32 my $self = shift @_; | |
33 | |
34 $self->SUPER::fetch_input(); | |
35 | |
36 my $efgdba = $self->_efgdba; | |
37 | |
38 my $aa = $efgdba->get_AnalysisAdaptor; | |
39 my $analysis = $self->param('analysis') || throw "Need to specify the analysis"; | |
40 my $analysis_obj = $aa->fetch_by_logic_name($analysis); | |
41 #Here we do not create or modify the analysis on the fly!! - similarly as cell or feature type... | |
42 if(!$analysis_obj){ throw "Analysis $analysis is not in the database"; } | |
43 $self->_analysis($analysis_obj); | |
44 | |
45 $self->_feature_set_name($self->_set_name()."_".$analysis); | |
46 | |
47 my $cell_type = $self->_cell_type()->name; | |
48 my $feature_type = $self->_feature_type()->name; | |
49 my $file_type = $self->_file_type(); | |
50 my $experiment_name = $self->_experiment_name(); | |
51 | |
52 #TODO Change this to accept samse and sampe and others?? - add an extra parameter when needed? | |
53 #Also not necessarily .gz ... maybe force the use of the parameter 'data_file' in the input_id | |
54 #my $input_file = $self->param('data_file') || $cell_type."_".$feature_type.".samse.".$file_type.".gz"; | |
55 my $input_file = $self->param('data_file') || $self->_set_name().".samse.".$file_type.".gz"; | |
56 $self->_input_file($input_file); | |
57 | |
58 my $work_dir = $self->_work_dir."/alignments/".$self->_species()."/".$self->_assembly()."/".$experiment_name; | |
59 my $input_dir = $self->param('input_dir') || $work_dir; | |
60 $self->_input_dir($input_dir); | |
61 | |
62 my $skip_control = $self->_skip_control($self->param('skip_control')); | |
63 if(!$skip_control){ | |
64 #May also be passed as input_id, but then input_id may need to be TEXT | |
65 #Control must be in same dir as input file... maybe change this... | |
66 if(!$self->param('control_file')){ | |
67 #For the moment the control file has to be the same file type as the input file, but does not need to be the case... | |
68 #TODO Need to validate here if that's the case... | |
69 my $control_feature = $self->param('control_feature') || throw "Need to define 'control_feature'"; | |
70 my $control_file = $cell_type."_".$control_feature."_".$experiment_name.".samse.".$file_type.".gz"; | |
71 $self->_control_file($control_file); | |
72 } | |
73 } | |
74 | |
75 return 1; | |
76 } | |
77 | |
78 | |
79 # Private function only to be called by subclasses of this class | |
80 # prepares data to be used with SWEMBL... | |
81 # sorts the input, removes mythochondria and unaligned reads... | |
82 sub _preprocess_file{ | |
83 | |
84 #Consider using hash to process input | |
85 my ($self, $input, $output, $file_type) = (shift, shift, shift, shift); | |
86 | |
87 #For the moment we always overwrite any existent file... | |
88 #Maybe reuse previously cached files? How to check if they are corrupted? | |
89 #Maybe create a -reuse flag? | |
90 | |
91 #running piped system commands is a potential source of untraceable errors!! | |
92 #TODO try changing this... e.g. with Bio::DB::Sam ... ? | |
93 my $command; | |
94 if($file_type eq 'bed'){ | |
95 $command = "gzip -dc ${input}"; | |
96 | |
97 #Remove mitochondria before SWEMBL | |
98 warn "Excluding mytochondria reads before passing to SWEMBL. Make sure Bed file has MT for mytochondria"; | |
99 warn "Duplicates are not removed in Bed files while they are in sam files"; | |
100 #This probably won't work for many BED files... e.g chrM or sam-like "beds"... | |
101 $command = " | grep -v '^MT' "; | |
102 $command .= " | sort -k1,1 -k2,2n -k3,3n | gzip -c > $output"; | |
103 | |
104 } elsif($file_type eq 'sam'){ | |
105 | |
106 #Manual Alternative to samtools | |
107 #$command = "gzip -dc ".$self->param($parameter); | |
108 #$command .= ' | grep -vE \'^@\' | grep -vE "^[^[:space:]]+[[:blank:]]4[[:blank:]].*$" | sort -k3,3 -k4,4n | gzip -c > '.$work_file; | |
109 | |
110 #Sometimes the input sam file may have incorrect headings which will mess up subsequent steps... | |
111 #$command = "gzip -dc $input | grep -v '^\@' | "; # This is not likely to happen now | |
112 $command = "gzip -dc $input | "; | |
113 | |
114 #PREPROCESSING... remove mitochondria before SWEMBL : we need to cater for different approaches | |
115 #TODO do this in a better, more generic way | |
116 $command .= "grep -vE '^[^[:space:]]+[[:blank:]][^[:space:]]+[[:blank:]][^[:space:]]+\:[^[:space:]]+\:MT\:' | "; | |
117 $command .= "grep -v '^MT' | grep -v '^chrM' | "; | |
118 | |
119 #Remove unmapped reads... | |
120 $command .= $self->_bin_dir()."/samtools view -uSh -t ".$self->_sam_header()." -F 4 - | "; | |
121 | |
122 # This piped sort is not working!! (this problem has been reported in the samtools mailing list) | |
123 #TODO Check why this pipe in the sort is not working... | |
124 #$command .= "samtools sort -o - ".$self->param($parameter)."_tmp | " ; | |
125 #$command .= "samtools view -h - | gzip -c > $work_file"; | |
126 | |
127 # Create temp file for now and remove it when we figure out what's wrong with the piped sort! | |
128 $command .= $self->_bin_dir()."/samtools sort - ${input}_tmp ; " ; | |
129 | |
130 #Add a remove duplicates step (this is not supported with BED files for the moment) | |
131 $command .= $self->_bin_dir()."/samtools rmdup -s ${input}_tmp.bam - | "; | |
132 $command .= $self->_bin_dir()."/samtools view -h - | gzip -c > $output"; | |
133 | |
134 #Alternative with no rmdup... | |
135 #$command .= $self->_bin_dir()."/samtools view -h ${input}_tmp.bam | gzip -c > $output"; | |
136 | |
137 $command .= " ; rm -f ${input}_tmp.bam"; | |
138 | |
139 } | |
140 else{ | |
141 throw("$file_type file format not supported"); | |
142 } | |
143 | |
144 system($command) && throw("Failed processing $input with command $command"); | |
145 | |
146 return 1; | |
147 } | |
148 | |
149 # Private function only to be called by subclasses of this class | |
150 # gets the number of reads in a sam or bed file | |
151 #sub _get_number_of_reads { | |
152 # my ($self, $file, $file_type) = (shift, shift, shift); | |
153 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } | |
154 # my $nbr_reads = 0; | |
155 # #If needed, add an option to check if is zipped or not... | |
156 # my $open_cmd = "gzip -dc $file |"; | |
157 # open(FILE,$open_cmd); | |
158 # while(<FILE>){ | |
159 # if($file_type eq "sam"){ | |
160 # next if /^\@SQ/; | |
161 # }else { | |
162 # next if /track name=/o; | |
163 # } | |
164 # $nbr_reads++; | |
165 # } | |
166 # close FILE; | |
167 # return $nbr_reads; | |
168 #} | |
169 | |
170 # Private function only to be called by subclasses of this class | |
171 # gets the number of reads in a sam or bed file | |
172 #sub _get_slices { | |
173 # #NOT DONE!! | |
174 # my ($self, $file, $file_type) = (shift, shift, shift); | |
175 # if(($file_type ne "bed") && ($file_type ne "sam")){ throw "Only bed and sam file types supported"; } | |
176 # my $nbr_reads = 0; | |
177 # #If needed, add an option to check if is zipped or not... | |
178 # my $open_cmd = "gzip -dc $file |"; | |
179 # open(FILE,$open_cmd); | |
180 # while(<FILE>){ | |
181 # if($file_type eq "sam"){ | |
182 # next if /^@SQ/; | |
183 # }else { | |
184 # next if /track name=/o; | |
185 # } | |
186 # $nbr_reads++; | |
187 # } | |
188 # close FILE; | |
189 # return $nbr_reads; | |
190 #} | |
191 | |
192 | |
193 #Private getter / setter to the Feature Set Name | |
194 sub _feature_set_name { | |
195 return $_[0]->_getter_setter('feature_set_name',$_[1]); | |
196 } | |
197 | |
198 #Private getter / setter to the input folder | |
199 sub _input_dir { | |
200 return $_[0]->_getter_setter('input_dir',$_[1]); | |
201 } | |
202 | |
203 #Private getter / setter to the Input subset (usually a file name) | |
204 sub _input_file { | |
205 return $_[0]->_getter_setter('input_file',$_[1]); | |
206 } | |
207 | |
208 #Private getter / setter to the skip control option | |
209 sub _skip_control { | |
210 return $_[0]->_getter_setter('skip_control',$_[1]); | |
211 } | |
212 | |
213 #Private getter / setter to the control file | |
214 sub _control_file { | |
215 return $_[0]->_getter_setter('control_file',$_[1]); | |
216 } | |
217 | |
218 1; | |
219 |