Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_predict_wrapper.py @ 1:adcc4c457c3c draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit f3fb925b83a4982e0cf9a0c11ff93ecbb8e4e6d5"
author | bgruening |
---|---|
date | Wed, 22 Jan 2020 15:14:12 +0000 |
parents | |
children | 4cebb3439e1a |
comparison
equal
deleted
inserted
replaced
0:574b10dd07ef | 1:adcc4c457c3c |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import subprocess | |
4 import argparse | |
5 import shutil | |
6 import gplib | |
7 import gzip | |
8 import sys | |
9 import os | |
10 | |
11 | |
12 """ | |
13 | |
14 TOOL DEPENDENCIES | |
15 ================= | |
16 | |
17 GraphProt 1.1.7 | |
18 Best install via: | |
19 https://anaconda.org/bioconda/graphprot | |
20 Tested with: miniconda3, conda 4.7.12 | |
21 | |
22 | |
23 Script: What's my job this time, master? | |
24 Author: It'll be a tough one. | |
25 Script: I take this as a given. | |
26 Author: Oh yeah? | |
27 Script: ... I'm ready. | |
28 | |
29 | |
30 OUTPUT FILES | |
31 ============ | |
32 | |
33 data_id.avg_profile | |
34 data_id.avg_profile.peaks.bed | |
35 --conf-out | |
36 data_id.avg_profile.p50.peaks.bed | |
37 --gen-site-bed | |
38 data_id.avg_profile.genomic_peaks.bed | |
39 --conf-out --gen-site-bed | |
40 data_id.avg_profile.p50.genomic_peaks.bed | |
41 --ws-pred | |
42 data_id.predictions | |
43 --ws-pred --conf-out | |
44 data_id.predictions | |
45 data_id.p50.predictions | |
46 | |
47 | |
48 EXAMPLE CALLS | |
49 ============= | |
50 | |
51 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gp-output | |
52 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed | |
53 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --gen-site-bed gp_data/test10_predict.bed --conf-out | |
54 python graphprot_predict_wrapper.py --model test2.model --params test2.params --fasta gp_data/test10_predict.fa --data-id test2pred --conf-out --ws-pred | |
55 | |
56 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest | |
57 | |
58 python graphprot_predict_wrapper.py --model test-data/test.model --params test-data/test.params --fasta test-data/test_predict.fa --data-id predtest --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | |
59 | |
60 python graphprot_predict_wrapper.py --data-id GraphProt --fasta test-data/test_predict.fa --model test-data/test.model --params test-data/test.params --gen-site-bed test-data/test_predict.bed --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | |
61 | |
62 | |
63 pwd && python '/home/uhlm/Dokumente/Projekte/GraphProt_galaxy_new/galaxytools/tools/rna_tools/graphprot/graphprot_predict_wrapper.py' --data-id GraphProt --fasta /tmp/tmpmuslpc1h/files/0/8/c/dataset_08c48d88-e3b5-423b-acf6-bf89b8c60660.dat --model /tmp/tmpmuslpc1h/files/e/6/4/dataset_e6471bb4-e74c-4372-bc49-656f900e7191.dat --params /tmp/tmpmuslpc1h/files/b/6/5/dataset_b65e8cf4-d3e6-429e-8d57-1d401adf4b3c.dat --gen-site-bed /tmp/tmpmuslpc1h/files/5/1/a/dataset_51a38b65-5943-472d-853e-5d845fa8ac3e.dat --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | |
64 | |
65 | |
66 """ | |
67 | |
68 ################################################################################ | |
69 | |
def setup_argument_parser() -> argparse.ArgumentParser:
    """Build the command line parser for the GraphProt predict wrapper.

    The parser is created with ``add_help=False`` so that ``-h/--help`` can
    be registered explicitly inside the "OPTIONAL ARGUMENTS" group.

    Returns:
        argparse.ArgumentParser: Parser defining the required options
        ``--fasta``, ``--model``, ``--params`` and ``--data-id`` and the
        optional options ``--ws-pred``, ``--sc-thr``, ``--max-merge-dist``,
        ``--gen-site-bed``, ``--conf-out``, ``--gp-output`` and
        ``--ap-extlr``.
    """
    help_description = """
    Galaxy wrapper script for GraphProt (-action predict and -action
    predict_profile) to compute whole site or position-wise scores for input
    FASTA sequences.
    By default, profile predictions are calculated, followed by average
    profile computation and peak region extraction from average profiles.
    If --ws-pred is set, whole site score predictions on input sequences
    will be run instead.
    If --conf-out is set, sites or peak regions with a score >= the median
    score of positive training sites will be output.
    If --gen-site-bed .bed file is provided, peak regions will be output
    with genomic coordinates too.

    """
    # MetavarTypeHelpFormatter shows the type name as metavar, so every
    # value-taking option below must declare an explicit ``type``.
    p = argparse.ArgumentParser(
        add_help=False,
        prog="graphprot_predict_wrapper.py",
        description=help_description,
        formatter_class=argparse.MetavarTypeHelpFormatter,
    )

    # Argument groups.
    p_man = p.add_argument_group("REQUIRED ARGUMENTS")
    p_opt = p.add_argument_group("OPTIONAL ARGUMENTS")

    # Values accepted by both --max-merge-dist and --ap-extlr (0..10).
    zero_to_ten = list(range(11))

    # Help flag (registered manually because add_help=False).
    p_opt.add_argument("-h", "--help",
                       action="help",
                       help="Print help message")

    # Required arguments.
    p_man.add_argument("--fasta",
                       dest="in_fa",
                       type=str,
                       required=True,
                       help="Sequences .fa file to predict on (option -fasta)")
    p_man.add_argument("--model",
                       dest="in_model",
                       type=str,
                       required=True,
                       help="GraphProt model file to use for predictions (option -model)")
    p_man.add_argument("--params",
                       dest="in_params",
                       type=str,
                       required=True,
                       help="Parameter file for given model")
    p_man.add_argument("--data-id",
                       dest="data_id",
                       type=str,
                       required=True,
                       help="Data ID (option -prefix)")

    # Mode switch: whole site prediction instead of profile prediction.
    p_opt.add_argument("--ws-pred",
                       dest="ws_pred",
                       default=False,
                       action="store_true",
                       help="Run a whole site prediction instead of calculating profiles (default: false)")

    # Additional arguments.
    p_opt.add_argument("--sc-thr",
                       dest="score_thr",
                       type=float,
                       # Float default so the attribute type does not depend
                       # on whether the option was given (0.0 == 0).
                       default=0.0,
                       help="Score threshold for extracting average profile peak regions (default: 0)")
    p_opt.add_argument("--max-merge-dist",
                       dest="max_merge_dist",
                       type=int,
                       default=0,
                       choices=zero_to_ten,
                       help="Maximum merge distance for nearby peak regions (default: report all non-overlapping regions)")
    p_opt.add_argument("--gen-site-bed",
                       dest="genomic_sites_bed",
                       type=str,
                       help=".bed file specifying the genomic regions of the input .fa sequences. Corrupt .bed information will be punished (default: false)")
    p_opt.add_argument("--conf-out",
                       dest="conf_out",
                       default=False,
                       action="store_true",
                       help="Output filtered peak regions BED file or predictions file (if --ws-pred) using the median positive training site score for filtering (default: false)")
    p_opt.add_argument("--gp-output",
                       dest="gp_output",
                       default=False,
                       action="store_true",
                       help="Print output produced by GraphProt (default: false)")
    p_opt.add_argument("--ap-extlr",
                       dest="ap_extlr",
                       type=int,
                       default=5,
                       choices=zero_to_ten,
                       help="Define average profile up- and downstream extension to produce the average profile. The mean over small sequence windows (window length = --ap-extlr*2 + 1) is used to get position scores, thus the average profile is more smooth than the initial profile output by GraphProt (default: 5)")
    return p
159 | |
160 | |
161 ################################################################################ | |
162 | |
if __name__ == '__main__':
    # Script entry point: validate the inputs, run GraphProt.pl in whole
    # site (--ws-pred) or profile mode, then post-process its output files
    # via gplib. All output files are named after --data-id.

    # Local import: shlex is only needed to quote values placed into the
    # GraphProt.pl shell command lines below.
    import shlex

    # Setup argparse and read in command line arguments.
    parser = setup_argument_parser()
    args = parser.parse_args()

    # ------------------------------------------------------------------
    # Do all sorts of sanity checking.
    # ------------------------------------------------------------------

    # Check for Linux.
    assert "linux" in sys.platform, "please use Linux"
    # Check tool availability.
    assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH"
    # Check file inputs.
    assert os.path.exists(args.in_fa), \
        "input .fa file \"%s\" not found" % (args.in_fa)
    assert os.path.exists(args.in_model), \
        "input .model file \"%s\" not found" % (args.in_model)
    assert os.path.exists(args.in_params), \
        "input .params file \"%s\" not found" % (args.in_params)
    # Count .fa entries.
    c_in_fa = gplib.count_fasta_headers(args.in_fa)
    assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa)
    print("# input .fa sequences:   %i" % (c_in_fa))
    # Read in FASTA sequences to check for uppercase sequences.
    seqs_dic = gplib.read_fasta_into_dic(args.in_fa)
    c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic)
    assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. Please change sequences to uppercase (keep in mind GraphProt only scores uppercase regions (according to its viewpoint concept))"
    if not args.ws_pred:
        # Profile mode: lowercase nucleotides are rejected outright.
        c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic)
        assert not c_lc_nt, "lowercase nucleotides not allowed in profile predictions, since GraphProt only scores uppercase regions (according to its viewpoint concept))"
    # Check .bed.
    if args.genomic_sites_bed:
        assert os.path.exists(args.genomic_sites_bed), \
            "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed)
        # Check .bed for content.
        assert gplib.count_file_rows(args.genomic_sites_bed), \
            "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed)
        # Check .bed for 6-column format.
        assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \
            "genomic .bed file \"%s\" appears to not be in 6-column .bed format" % (args.genomic_sites_bed)
        # Check for unique column 4 IDs.
        assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \
            "genomic .bed file \"%s\" column 4 IDs not unique" % (args.genomic_sites_bed)
        # Read in .bed regions, compare to FASTA sequences
        # (compare IDs + lengths).
        seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic)
        reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed)
        for seq_id, seq_l in seq_len_dic.items():
            # The message needs two %s placeholders; without them the "%"
            # operator itself raises TypeError and hides the real problem.
            assert seq_id in reg_len_dic, \
                "sequence ID \"%s\" missing in input .bed \"%s\"" % (seq_id, args.genomic_sites_bed)
            reg_l = reg_len_dic[seq_id]
            assert seq_l == reg_l, \
                "sequence length differs from .bed region length (%i != %i)" % (seq_l, reg_l)
    # Read in model parameters.
    param_dic = gplib.graphprot_get_param_dic(args.in_params)
    # Create GraphProt parameter string.
    param_string = gplib.graphprot_get_param_string(args.in_params)

    # User supplied values are shell-quoted because subprocess.getoutput()
    # runs the command through a shell. param_string is inserted verbatim:
    # it is expected to hold several separate GraphProt options.
    q_data_id = shlex.quote(args.data_id)
    q_in_fa = shlex.quote(args.in_fa)
    q_in_model = shlex.quote(args.in_model)

    # ------------------------------------------------------------------
    # Run predictions.
    # ------------------------------------------------------------------

    if args.ws_pred:
        # Do whole site prediction.
        print("Starting whole site predictions on input .fa file (-action predict) ... ")
        check_cmd = "GraphProt.pl -action predict -prefix " + q_data_id + " -fasta " + q_in_fa + " " + param_string + " -model " + q_in_model
        output = subprocess.getoutput(check_cmd)
        assert output, \
            "the following call of GraphProt.pl produced no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        ws_predictions_file = args.data_id + ".predictions"
        assert os.path.exists(ws_predictions_file), \
            "Whole site prediction output .predictions file \"%s\" not found" % (ws_predictions_file)
        if args.conf_out:
            # Filter by pos_train_ws_pred_median median.
            assert "pos_train_ws_pred_median" in param_dic, \
                "whole site top scores median information missing in .params file"
            pos_train_ws_pred_median = float(param_dic["pos_train_ws_pred_median"])
            # Filtered file.
            filt_ws_predictions_file = args.data_id + ".p50.predictions"
            print("Extracting p50 sites from whole site predictions (score threshold = %f) ... " % (pos_train_ws_pred_median))
            gplib.graphprot_filter_predictions_file(ws_predictions_file,
                                                    filt_ws_predictions_file,
                                                    sc_thr=pos_train_ws_pred_median)
    else:
        # Do profile prediction.
        print("Starting profile predictions on input .fa file (-action predict_profile) ... ")
        check_cmd = "GraphProt.pl -action predict_profile -prefix " + q_data_id + " -fasta " + q_in_fa + " " + param_string + " -model " + q_in_model
        output = subprocess.getoutput(check_cmd)
        assert output, \
            "the following call of GraphProt.pl produced no output:\n%s" % (check_cmd)
        if args.gp_output:
            print(output)
        profile_predictions_file = args.data_id + ".profile"
        assert os.path.exists(profile_predictions_file), \
            "Profile prediction output .profile file \"%s\" not found" % (profile_predictions_file)

        # Profile prediction output files.
        avg_prof_file = args.data_id + ".avg_profile"
        avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed"
        avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed"
        avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed"
        avg_prof_gen_peaks_p50_file = args.data_id + ".avg_profile.p50.genomic_peaks.bed"

        # Get sequence IDs in order from input .fa file.
        seq_ids_list = gplib.fasta_read_in_ids(args.in_fa)
        # Calculate average profiles.
        print("Getting average profile from profile (extlr for smoothing: %i) ... " % (args.ap_extlr))
        gplib.graphprot_profile_calculate_avg_profile(profile_predictions_file,
                                                      avg_prof_file,
                                                      ap_extlr=args.ap_extlr,
                                                      seq_ids_list=seq_ids_list,
                                                      method=2)
        # Extract peak regions on sequences using the --sc-thr threshold;
        # the message reports the threshold that is actually applied.
        print("Extracting peak regions from average profile (score threshold = %s) ... " % (args.score_thr))
        gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
                                                     avg_prof_peaks_file,
                                                     max_merge_dist=args.max_merge_dist,
                                                     sc_thr=args.score_thr)
        # Convert peaks to genomic coordinates.
        if args.genomic_sites_bed:
            print("Converting peak regions to genomic coordinates ... ")
            # NOTE(review): print_rows=False is passed only here and not in
            # the p50 conversion below - confirm this asymmetry is intended.
            gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file,
                                             avg_prof_gen_peaks_file,
                                             print_rows=False,
                                             genomic_sites_bed=args.genomic_sites_bed)
        # Extract peak regions with threshold score p50.
        if args.conf_out:
            # The .params key depends on the smoothing extension in use.
            sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr)
            assert sc_id in param_dic, \
                "average profile extlr %i median information missing in .params file" % (args.ap_extlr)
            p50_sc_thr = float(param_dic[sc_id])
            print("Extracting p50 peak regions from average profile (score threshold = %f) ... " % (p50_sc_thr))
            gplib.graphprot_profile_extract_peak_regions(avg_prof_file,
                                                         avg_prof_peaks_p50_file,
                                                         max_merge_dist=args.max_merge_dist,
                                                         sc_thr=p50_sc_thr)
            # Convert peaks to genomic coordinates.
            if args.genomic_sites_bed:
                print("Converting p50 peak regions to genomic coordinates ... ")
                gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file,
                                                 avg_prof_gen_peaks_p50_file,
                                                 genomic_sites_bed=args.genomic_sites_bed)
    # Done.
    print("Script: I'm done.")
    print("Author: ... ")
296 print("Author: ... ") | |
297 | |
298 |