Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_predict_wrapper.py @ 5:58ebf089377e draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 902e994cb04db968ce797afa676f5fa6512ab6d3
| author | bgruening |
|---|---|
| date | Tue, 06 Aug 2024 14:54:58 +0000 |
| parents | 2d6de5b769e6 |
| children |
comparison
equal
deleted
inserted
replaced
| 4:2d6de5b769e6 | 5:58ebf089377e |
|---|---|
| 4 import os | 4 import os |
| 5 import subprocess | 5 import subprocess |
| 6 import sys | 6 import sys |
| 7 | 7 |
| 8 import gplib | 8 import gplib |
| 9 | |
| 10 | 9 |
| 11 """ | 10 """ |
| 12 | 11 |
| 13 TOOL DEPENDENCIES | 12 TOOL DEPENDENCIES |
| 14 ================= | 13 ================= |
| 79 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 | 78 --sc-thr 0.0 --max-merge-dist 0 --conf-out --ap-extlr 5 |
| 80 | 79 |
| 81 """ | 80 """ |
| 82 | 81 |
| 83 | 82 |
| 84 ############################################################################### | 83 ####################################################################### |
| 84 | |
| 85 | 85 |
| 86 def setup_argument_parser(): | 86 def setup_argument_parser(): |
| 87 """Setup argparse parser.""" | 87 """Setup argparse parser.""" |
| 88 help_description = """ | 88 help_description = """ |
| 89 Galaxy wrapper script for GraphProt (-action predict and -action | 89 Galaxy wrapper script for GraphProt (-action predict and -action |
| 98 If --gen-site-bed .bed file is provided, peak regions will be output | 98 If --gen-site-bed .bed file is provided, peak regions will be output |
| 99 with genomic coordinates too. | 99 with genomic coordinates too. |
| 100 | 100 |
| 101 """ | 101 """ |
| 102 # Define argument parser. | 102 # Define argument parser. |
| 103 p = ap.ArgumentParser(add_help=False, | 103 p = ap.ArgumentParser( |
| 104 prog="graphprot_predict_wrapper.py", | 104 add_help=False, |
| 105 description=help_description, | 105 prog="graphprot_predict_wrapper.py", |
| 106 formatter_class=ap.MetavarTypeHelpFormatter) | 106 description=help_description, |
| 107 formatter_class=ap.MetavarTypeHelpFormatter, | |
| 108 ) | |
| 107 | 109 |
| 108 # Argument groups. | 110 # Argument groups. |
| 109 p_man = p.add_argument_group("REQUIRED ARGUMENTS") | 111 p_man = p.add_argument_group("REQUIRED ARGUMENTS") |
| 110 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") | 112 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") |
| 111 | 113 |
| 112 # Required arguments. | 114 # Required arguments. |
| 113 p_opt.add_argument("-h", "--help", | 115 p_opt.add_argument("-h", "--help", action="help", help="Print help message") |
| 114 action="help", | 116 p_man.add_argument( |
| 115 help="Print help message") | 117 "--fasta", |
| 116 p_man.add_argument("--fasta", | 118 dest="in_fa", |
| 117 dest="in_fa", | 119 type=str, |
| 118 type=str, | 120 required=True, |
| 119 required=True, | 121 help="Sequences .fa file to predict" " on (option -fasta)", |
| 120 help="Sequences .fa file to predict" | 122 ) |
| 121 " on (option -fasta)") | 123 p_man.add_argument( |
| 122 p_man.add_argument("--model", | 124 "--model", |
| 123 dest="in_model", | 125 dest="in_model", |
| 124 type=str, | 126 type=str, |
| 125 required=True, | 127 required=True, |
| 126 help="GraphProt model file to use for predictions" | 128 help="GraphProt model file to use for predictions" " (option -model)", |
| 127 " (option -model)") | 129 ) |
| 128 p_man.add_argument("--params", | 130 p_man.add_argument( |
| 129 dest="in_params", | 131 "--params", |
| 130 type=str, | 132 dest="in_params", |
| 131 required=True, | 133 type=str, |
| 132 help="Parameter file for given model") | 134 required=True, |
| 133 p_man.add_argument("--data-id", | 135 help="Parameter file for given model", |
| 134 dest="data_id", | 136 ) |
| 135 type=str, | 137 p_man.add_argument( |
| 136 required=True, | 138 "--data-id", |
| 137 help="Data ID (option -prefix)") | 139 dest="data_id", |
| 140 type=str, | |
| 141 required=True, | |
| 142 help="Data ID (option -prefix)", | |
| 143 ) | |
| 138 # ---> I'm a conditional argument <--- | 144 # ---> I'm a conditional argument <--- |
| 139 p_opt.add_argument("--ws-pred", | 145 p_opt.add_argument( |
| 140 dest="ws_pred", | 146 "--ws-pred", |
| 141 default=False, | 147 dest="ws_pred", |
| 142 action="store_true", | 148 default=False, |
| 143 help="Run a whole site prediction instead " | 149 action="store_true", |
| 144 "of calculating profiles (default: false)") | 150 help="Run a whole site prediction instead " |
| 151 "of calculating profiles (default: false)", | |
| 152 ) | |
| 145 # Additional arguments. | 153 # Additional arguments. |
| 146 p_opt.add_argument("--sc-thr", | 154 p_opt.add_argument( |
| 147 dest="score_thr", | 155 "--sc-thr", |
| 148 type=float, | 156 dest="score_thr", |
| 149 default=0, | 157 type=float, |
| 150 help="Score threshold for extracting " | 158 default=0, |
| 151 "average profile peak regions (default: 0)") | 159 help="Score threshold for extracting " |
| 152 p_opt.add_argument("--max-merge-dist", | 160 "average profile peak regions (default: 0)", |
| 153 dest="max_merge_dist", | 161 ) |
| 154 type=int, | 162 p_opt.add_argument( |
| 155 default=0, | 163 "--max-merge-dist", |
| 156 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | 164 dest="max_merge_dist", |
| 157 help="Maximum merge distance for nearby peak regions" | 165 type=int, |
| 158 " (default: report all non-overlapping regions)") | 166 default=0, |
| 159 p_opt.add_argument("--gen-site-bed", | 167 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], |
| 160 dest="genomic_sites_bed", | 168 help="Maximum merge distance for nearby peak regions" |
| 161 type=str, | 169 " (default: report all non-overlapping regions)", |
| 162 help=".bed file specifying the genomic regions of " | 170 ) |
| 163 "the input .fa sequences. Corrupt .bed " | 171 p_opt.add_argument( |
| 164 "information will be punished (default: false)") | 172 "--gen-site-bed", |
| 165 p_opt.add_argument("--conf-out", | 173 dest="genomic_sites_bed", |
| 166 dest="conf_out", | 174 type=str, |
| 167 default=False, | 175 help=".bed file specifying the genomic regions of " |
| 168 action="store_true", | 176 "the input .fa sequences. Corrupt .bed " |
| 169 help="Output filtered peak regions BED file or " | 177 "information will be punished (default: false)", |
| 170 "predictions file (if --ws-pred) using the median " | 178 ) |
| 171 "positive training site score for filtering " | 179 p_opt.add_argument( |
| 172 "(default: false)") | 180 "--conf-out", |
| 173 p_opt.add_argument("--gp-output", | 181 dest="conf_out", |
| 174 dest="gp_output", | 182 default=False, |
| 175 default=False, | 183 action="store_true", |
| 176 action="store_true", | 184 help="Output filtered peak regions BED file or " |
| 177 help="Print output produced by GraphProt " | 185 "predictions file (if --ws-pred) using the median " |
| 178 "(default: false)") | 186 "positive training site score for filtering " |
| 179 p_opt.add_argument("--ap-extlr", | 187 "(default: false)", |
| 180 dest="ap_extlr", | 188 ) |
| 181 type=int, | 189 p_opt.add_argument( |
| 182 default=5, | 190 "--gp-output", |
| 183 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | 191 dest="gp_output", |
| 184 help="Define average profile up- and downstream " | 192 default=False, |
| 185 "extension to produce the average profile. " | 193 action="store_true", |
| 186 "The mean over small sequence windows " | 194 help="Print output produced by GraphProt " "(default: false)", |
| 187 "(window length = --ap-extlr*2 + 1) is used to " | 195 ) |
| 188 "get position scores, thus the average profile " | 196 p_opt.add_argument( |
| 189 "is more smooth than the initial profile output " | 197 "--ap-extlr", |
| 190 "by GraphProt (default: 5)") | 198 dest="ap_extlr", |
| 199 type=int, | |
| 200 default=5, | |
| 201 choices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], | |
| 202 help="Define average profile up- and downstream " | |
| 203 "extension to produce the average profile. " | |
| 204 "The mean over small sequence windows " | |
| 205 "(window length = --ap-extlr*2 + 1) is used to " | |
| 206 "get position scores, thus the average profile " | |
| 207 "is more smooth than the initial profile output " | |
| 208 "by GraphProt (default: 5)", | |
| 209 ) | |
| 191 return p | 210 return p |
| 192 | 211 |
| 193 | 212 |
| 194 ############################################################################### | 213 ####################################################################### |
| 195 | 214 |
| 196 if __name__ == '__main__': | 215 if __name__ == "__main__": |
| 197 | 216 |
| 198 # Setup argparse. | 217 # Setup argparse. |
| 199 parser = setup_argument_parser() | 218 parser = setup_argument_parser() |
| 200 # Read in command line arguments. | 219 # Read in command line arguments. |
| 201 args = parser.parse_args() | 220 args = parser.parse_args() |
| 207 # Check for Linux. | 226 # Check for Linux. |
| 208 assert "linux" in sys.platform, "please use Linux" | 227 assert "linux" in sys.platform, "please use Linux" |
| 209 # Check tool availability. | 228 # Check tool availability. |
| 210 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" | 229 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" |
| 211 # Check file inputs. | 230 # Check file inputs. |
| 212 assert os.path.exists(args.in_fa), \ | 231 assert os.path.exists(args.in_fa), 'input .fa file "%s" not found' % (args.in_fa) |
| 213 "input .fa file \"%s\" not found" % (args.in_fa) | 232 assert os.path.exists(args.in_model), 'input .model file "%s" not found' % ( |
| 214 assert os.path.exists(args.in_model), \ | 233 args.in_model |
| 215 "input .model file \"%s\" not found" % (args.in_model) | 234 ) |
| 216 assert os.path.exists(args.in_params), \ | 235 assert os.path.exists(args.in_params), 'input .params file "%s" not found' % ( |
| 217 "input .params file \"%s\" not found" % (args.in_params) | 236 args.in_params |
| 237 ) | |
| 218 # Count .fa entries. | 238 # Count .fa entries. |
| 219 c_in_fa = gplib.count_fasta_headers(args.in_fa) | 239 c_in_fa = gplib.count_fasta_headers(args.in_fa) |
| 220 assert c_in_fa, "input .fa file \"%s\" no headers found" % (args.in_fa) | 240 assert c_in_fa, 'input .fa file "%s" no headers found' % (args.in_fa) |
| 221 print("# input .fa sequences: %i" % (c_in_fa)) | 241 print("# input .fa sequences: %i" % (c_in_fa)) |
| 222 # Read in FASTA sequences to check for uppercase sequences. | 242 # Read in FASTA sequences to check for uppercase sequences. |
| 223 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) | 243 seqs_dic = gplib.read_fasta_into_dic(args.in_fa) |
| 224 # Check for lowercase only sequences, which cause GP to crash. | 244 # Check for lowercase only sequences, which cause GP to crash. |
| 225 error_mess = "input sequences encountered containing "\ | 245 error_mess = ( |
| 226 "only lowercase characters or lowercase characters in between "\ | 246 "input sequences encountered containing " |
| 227 "uppercase characters. Please provide either all uppercase "\ | 247 "only lowercase characters or lowercase characters in between " |
| 228 "sequences or sequences containing uppercase regions surrounded "\ | 248 "uppercase characters. Please provide either all uppercase " |
| 229 "by lowercase context regions for structure calculation (see "\ | 249 "sequences or sequences containing uppercase regions surrounded " |
| 230 "viewpoint concept in original GraphProt publication "\ | 250 "by lowercase context regions for structure calculation (see " |
| 251 "viewpoint concept in original GraphProt publication " | |
| 231 "for more details)" | 252 "for more details)" |
| 253 ) | |
| 232 if args.ws_pred: | 254 if args.ws_pred: |
| 233 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 255 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
| 234 assert not bad_ids, "%s" % (error_mess) | 256 assert not bad_ids, "%s" % (error_mess) |
| 235 | 257 |
| 236 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) | 258 c_uc_nt = gplib.seqs_dic_count_uc_nts(seqs_dic) |
| 237 assert c_uc_nt, "no uppercase nucleotides in input .fa sequences. "\ | 259 assert c_uc_nt, ( |
| 238 "Please change sequences to uppercase (keep in mind "\ | 260 "no uppercase nucleotides in input .fa sequences. " |
| 239 "GraphProt only scores uppercase regions (according "\ | 261 "Please change sequences to uppercase (keep in mind " |
| 240 "to its viewpoint concept)" | 262 "GraphProt only scores uppercase regions (according " |
| 263 "to its viewpoint concept)" | |
| 264 ) | |
| 241 if not args.ws_pred: | 265 if not args.ws_pred: |
| 242 # Check for lowercase sequences. | 266 # Check for lowercase sequences. |
| 243 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) | 267 c_lc_nt = gplib.seqs_dic_count_lc_nts(seqs_dic) |
| 244 assert not c_lc_nt, "lowercase nucleotides in input .fa not allowed"\ | 268 assert not c_lc_nt, ( |
| 245 " in profile predictions, since GraphProt only scores "\ | 269 "lowercase nucleotides in input .fa not allowed" |
| 270 " in profile predictions, since GraphProt only scores " | |
| 246 "uppercase regions (according to its viewpoint concept)" | 271 "uppercase regions (according to its viewpoint concept)" |
| 272 ) | |
| 247 # Check .bed. | 273 # Check .bed. |
| 248 if args.genomic_sites_bed: | 274 if args.genomic_sites_bed: |
| 249 # An array of checks, marvelous. | 275 # An array of checks, marvelous. |
| 250 assert \ | 276 assert os.path.exists( |
| 251 os.path.exists(args.genomic_sites_bed), \ | 277 args.genomic_sites_bed |
| 252 "genomic .bed file \"%s\" not found" % (args.genomic_sites_bed) | 278 ), 'genomic .bed file "%s" not found' % (args.genomic_sites_bed) |
| 253 # Check .bed for content. | 279 # Check .bed for content. |
| 254 assert gplib.count_file_rows(args.genomic_sites_bed), \ | 280 assert gplib.count_file_rows( |
| 255 "genomic .bed file \"%s\" is empty" % (args.genomic_sites_bed) | 281 args.genomic_sites_bed |
| 282 ), 'genomic .bed file "%s" is empty' % (args.genomic_sites_bed) | |
| 256 # Check .bed for 6-column format. | 283 # Check .bed for 6-column format. |
| 257 assert gplib.bed_check_six_col_format(args.genomic_sites_bed), \ | 284 assert gplib.bed_check_six_col_format( |
| 258 "genomic .bed file \"%s\" appears to not "\ | 285 args.genomic_sites_bed |
| 259 "be in 6-column .bed format" % (args.genomic_sites_bed) | 286 ), 'genomic .bed file "%s" appears to not ' "be in 6-column .bed format" % ( |
| 287 args.genomic_sites_bed | |
| 288 ) | |
| 260 # Check for unique column 4 IDs. | 289 # Check for unique column 4 IDs. |
| 261 assert gplib.bed_check_unique_ids(args.genomic_sites_bed), \ | 290 assert gplib.bed_check_unique_ids( |
| 262 "genomic .bed file \"%s\" column 4 IDs not unique" % \ | 291 args.genomic_sites_bed |
| 263 (args.genomic_sites_bed) | 292 ), 'genomic .bed file "%s" column 4 IDs not unique' % (args.genomic_sites_bed) |
| 264 # Read in .bed regions, compare to FASTA sequences. | 293 # Read in .bed regions, compare to FASTA sequences. |
| 265 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) | 294 seq_len_dic = gplib.get_seq_lengths_from_seqs_dic(seqs_dic) |
| 266 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) | 295 reg_len_dic = gplib.bed_get_region_lengths(args.genomic_sites_bed) |
| 267 for seq_id in seq_len_dic: | 296 for seq_id in seq_len_dic: |
| 268 seq_l = seq_len_dic[seq_id] | 297 seq_l = seq_len_dic[seq_id] |
| 269 assert seq_id in reg_len_dic, \ | 298 assert ( |
| 270 "sequence ID \"%s\" missing in input .bed \"%s\"" % \ | 299 seq_id in reg_len_dic |
| 271 (seq_id, args.genomic_sites_bed) | 300 ), 'sequence ID "%s" missing in input .bed "%s"' % ( |
| 301 seq_id, | |
| 302 args.genomic_sites_bed, | |
| 303 ) | |
| 272 reg_l = reg_len_dic[seq_id] | 304 reg_l = reg_len_dic[seq_id] |
| 273 assert seq_l == reg_l, \ | 305 assert ( |
| 274 "sequence length differs from .bed region length (%i != %i)" \ | 306 seq_l == reg_l |
| 275 % (seq_l, reg_l) | 307 ), "sequence length differs from .bed region length (%i != %i)" % ( |
| 308 seq_l, | |
| 309 reg_l, | |
| 310 ) | |
| 276 # Read in model parameters. | 311 # Read in model parameters. |
| 277 param_dic = gplib.graphprot_get_param_dic(args.in_params) | 312 param_dic = gplib.graphprot_get_param_dic(args.in_params) |
| 278 # Create GraphProt parameter string. | 313 # Create GraphProt parameter string. |
| 279 param_string = gplib.graphprot_get_param_string(args.in_params) | 314 param_string = gplib.graphprot_get_param_string(args.in_params) |
| 280 | 315 |
| 282 Run predictions. | 317 Run predictions. |
| 283 | 318 |
| 284 """ | 319 """ |
| 285 if args.ws_pred: | 320 if args.ws_pred: |
| 286 # Do whole site prediction. | 321 # Do whole site prediction. |
| 287 print("Starting whole site predictions on input .fa file" | 322 print( |
| 288 " (-action predict) ... ") | 323 "Starting whole site predictions on input .fa file" |
| 289 check_cmd = "GraphProt.pl -action predict -prefix " \ | 324 " (-action predict) ... " |
| 290 + args.data_id + " -fasta " + args.in_fa + " " \ | 325 ) |
| 291 + param_string + " -model " + args.in_model | 326 check_cmd = ( |
| 327 "GraphProt.pl -action predict -prefix " | |
| 328 + args.data_id | |
| 329 + " -fasta " | |
| 330 + args.in_fa | |
| 331 + " " | |
| 332 + param_string | |
| 333 + " -model " | |
| 334 + args.in_model | |
| 335 ) | |
| 292 output = subprocess.getoutput(check_cmd) | 336 output = subprocess.getoutput(check_cmd) |
| 293 assert output, "the following call of GraphProt.pl "\ | 337 assert ( |
| 294 "produced no output:\n%s" % (check_cmd) | 338 output |
| 339 ), "the following call of GraphProt.pl " "produced no output:\n%s" % (check_cmd) | |
| 295 if args.gp_output: | 340 if args.gp_output: |
| 296 print(output) | 341 print(output) |
| 297 ws_predictions_file = args.data_id + ".predictions" | 342 ws_predictions_file = args.data_id + ".predictions" |
| 298 assert os.path.exists(ws_predictions_file), \ | 343 assert os.path.exists( |
| 299 "Whole site prediction output .predictions file \"%s\" not found" \ | 344 ws_predictions_file |
| 300 % (ws_predictions_file) | 345 ), 'Whole site prediction output .predictions file "%s" not found' % ( |
| 346 ws_predictions_file | |
| 347 ) | |
| 301 if args.conf_out: | 348 if args.conf_out: |
| 302 # Filter by pos_train_ws_pred_median median. | 349 # Filter by pos_train_ws_pred_median median. |
| 303 assert "pos_train_ws_pred_median" in param_dic, \ | 350 assert "pos_train_ws_pred_median" in param_dic, ( |
| 304 "whole site top scores median information "\ | 351 "whole site top scores median information " "missing in .params file" |
| 305 "missing in .params file" | 352 ) |
| 306 pos_tr_ws_pred_med = \ | 353 pos_tr_ws_pred_med = float(param_dic["pos_train_ws_pred_median"]) |
| 307 float(param_dic["pos_train_ws_pred_median"]) | |
| 308 # Filtered file. | 354 # Filtered file. |
| 309 filt_ws_predictions_file = args.data_id + ".p50.predictions" | 355 filt_ws_predictions_file = args.data_id + ".p50.predictions" |
| 310 print("Extracting p50 sites from whole site predictions" | 356 print( |
| 311 " (score threshold = %f) ... " % (pos_tr_ws_pred_med)) | 357 "Extracting p50 sites from whole site predictions" |
| 312 gplib.graphprot_filter_predictions_file(ws_predictions_file, | 358 " (score threshold = %f) ... " % (pos_tr_ws_pred_med) |
| 313 filt_ws_predictions_file, | 359 ) |
| 314 sc_thr=pos_tr_ws_pred_med) | 360 gplib.graphprot_filter_predictions_file( |
| 361 ws_predictions_file, filt_ws_predictions_file, sc_thr=pos_tr_ws_pred_med | |
| 362 ) | |
| 315 else: | 363 else: |
| 316 # Do profile prediction. | 364 # Do profile prediction. |
| 317 print("Starting profile predictions on on input .fa file " | 365 print( |
| 318 "(-action predict_profile) ... ") | 366 "Starting profile predictions on on input .fa file " |
| 319 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ | 367 "(-action predict_profile) ... " |
| 320 + args.data_id + " -fasta " + args.in_fa + " " + \ | 368 ) |
| 321 param_string + " -model " + args.in_model | 369 check_cmd = ( |
| 370 "GraphProt.pl -action predict_profile -prefix " | |
| 371 + args.data_id | |
| 372 + " -fasta " | |
| 373 + args.in_fa | |
| 374 + " " | |
| 375 + param_string | |
| 376 + " -model " | |
| 377 + args.in_model | |
| 378 ) | |
| 322 output = subprocess.getoutput(check_cmd) | 379 output = subprocess.getoutput(check_cmd) |
| 323 assert output, "the following call of GraphProt.pl produced "\ | 380 assert ( |
| 324 "no output:\n%s" % (check_cmd) | 381 output |
| 382 ), "the following call of GraphProt.pl produced " "no output:\n%s" % (check_cmd) | |
| 325 if args.gp_output: | 383 if args.gp_output: |
| 326 print(output) | 384 print(output) |
| 327 profile_predictions_file = args.data_id + ".profile" | 385 profile_predictions_file = args.data_id + ".profile" |
| 328 assert os.path.exists(profile_predictions_file), \ | 386 assert os.path.exists( |
| 329 "Profile prediction output .profile file \"%s\" not found" % \ | 387 profile_predictions_file |
| 330 (profile_predictions_file) | 388 ), 'Profile prediction output .profile file "%s" not found' % ( |
| 389 profile_predictions_file | |
| 390 ) | |
| 331 | 391 |
| 332 # Profile prediction output files. | 392 # Profile prediction output files. |
| 333 avg_prof_file = args.data_id + ".avg_profile" | 393 avg_prof_file = args.data_id + ".avg_profile" |
| 334 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" | 394 avg_prof_peaks_file = args.data_id + ".avg_profile.peaks.bed" |
| 335 avg_prof_gen_peaks_file = args.data_id + \ | 395 avg_prof_gen_peaks_file = args.data_id + ".avg_profile.genomic_peaks.bed" |
| 336 ".avg_profile.genomic_peaks.bed" | 396 avg_prof_peaks_p50_file = args.data_id + ".avg_profile.p50.peaks.bed" |
| 337 avg_prof_peaks_p50_file = args.data_id + \ | 397 avg_prof_gen_peaks_p50_file = ( |
| 338 ".avg_profile.p50.peaks.bed" | 398 args.data_id + ".avg_profile.p50.genomic_peaks.bed" |
| 339 avg_prof_gen_peaks_p50_file = args.data_id + \ | 399 ) |
| 340 ".avg_profile.p50.genomic_peaks.bed" | |
| 341 | 400 |
| 342 # Get sequence IDs in order from input .fa file. | 401 # Get sequence IDs in order from input .fa file. |
| 343 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) | 402 seq_ids_list = gplib.fasta_read_in_ids(args.in_fa) |
| 344 # Calculate average profiles. | 403 # Calculate average profiles. |
| 345 print("Getting average profile from profile " | 404 print( |
| 346 "(extlr for smoothing: %i) ... " % (args.ap_extlr)) | 405 "Getting average profile from profile " |
| 347 gplib.graphprot_profile_calc_avg_profile(profile_predictions_file, | 406 "(extlr for smoothing: %i) ... " % (args.ap_extlr) |
| 348 avg_prof_file, | 407 ) |
| 349 ap_extlr=args.ap_extlr, | 408 gplib.graphprot_profile_calc_avg_profile( |
| 350 seq_ids_list=seq_ids_list, | 409 profile_predictions_file, |
| 351 method=2) | 410 avg_prof_file, |
| 411 ap_extlr=args.ap_extlr, | |
| 412 seq_ids_list=seq_ids_list, | |
| 413 method=2, | |
| 414 ) | |
| 352 # Extract peak regions on sequences with threshold score 0. | 415 # Extract peak regions on sequences with threshold score 0. |
| 353 print("Extracting peak regions from average profile " | 416 print( |
| 354 "(score threshold = 0) ... ") | 417 "Extracting peak regions from average profile " "(score threshold = 0) ... " |
| 418 ) | |
| 355 killpep8 = args.max_merge_dist | 419 killpep8 = args.max_merge_dist |
| 356 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, | 420 gplib.graphprot_profile_extract_peak_regions( |
| 357 avg_prof_peaks_file, | 421 avg_prof_file, |
| 358 max_merge_dist=killpep8, | 422 avg_prof_peaks_file, |
| 359 sc_thr=args.score_thr) | 423 max_merge_dist=killpep8, |
| 424 sc_thr=args.score_thr, | |
| 425 ) | |
| 360 # Convert peaks to genomic coordinates. | 426 # Convert peaks to genomic coordinates. |
| 361 if args.genomic_sites_bed: | 427 if args.genomic_sites_bed: |
| 362 print("Converting peak regions to genomic coordinates ... ") | 428 print("Converting peak regions to genomic coordinates ... ") |
| 363 killit = args.genomic_sites_bed | 429 killit = args.genomic_sites_bed |
| 364 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_file, | 430 gplib.bed_peaks_to_genomic_peaks( |
| 365 avg_prof_gen_peaks_file, | 431 avg_prof_peaks_file, |
| 366 print_rows=False, | 432 avg_prof_gen_peaks_file, |
| 367 genomic_sites_bed=killit) | 433 print_rows=False, |
| 434 genomic_sites_bed=killit, | |
| 435 ) | |
| 368 # Extract peak regions with threshold score p50. | 436 # Extract peak regions with threshold score p50. |
| 369 if args.conf_out: | 437 if args.conf_out: |
| 370 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) | 438 sc_id = "pos_train_avg_profile_median_%i" % (args.ap_extlr) |
| 371 # Filter by pos_tr_ws_pred_med median. | 439 # Filter by pos_tr_ws_pred_med median. |
| 372 assert sc_id in param_dic, "average profile extlr %i median "\ | 440 assert sc_id in param_dic, ( |
| 441 "average profile extlr %i median " | |
| 373 "information missing in .params file" % (args.ap_extlr) | 442 "information missing in .params file" % (args.ap_extlr) |
| 443 ) | |
| 374 p50_sc_thr = float(param_dic[sc_id]) | 444 p50_sc_thr = float(param_dic[sc_id]) |
| 375 print("Extracting p50 peak regions from average profile " | 445 print( |
| 376 "(score threshold = %f) ... " % (p50_sc_thr)) | 446 "Extracting p50 peak regions from average profile " |
| 447 "(score threshold = %f) ... " % (p50_sc_thr) | |
| 448 ) | |
| 377 despair = avg_prof_peaks_p50_file | 449 despair = avg_prof_peaks_p50_file |
| 378 pain = args.max_merge_dist | 450 pain = args.max_merge_dist |
| 379 gplib.graphprot_profile_extract_peak_regions(avg_prof_file, | 451 gplib.graphprot_profile_extract_peak_regions( |
| 380 despair, | 452 avg_prof_file, despair, max_merge_dist=pain, sc_thr=p50_sc_thr |
| 381 max_merge_dist=pain, | 453 ) |
| 382 sc_thr=p50_sc_thr) | |
| 383 # Convert peaks to genomic coordinates. | 454 # Convert peaks to genomic coordinates. |
| 384 if args.genomic_sites_bed: | 455 if args.genomic_sites_bed: |
| 385 print("Converting p50 peak regions to " | 456 print("Converting p50 peak regions to " "genomic coordinates ... ") |
| 386 "genomic coordinates ... ") | |
| 387 madness = args.genomic_sites_bed | 457 madness = args.genomic_sites_bed |
| 388 gplib.bed_peaks_to_genomic_peaks(avg_prof_peaks_p50_file, | 458 gplib.bed_peaks_to_genomic_peaks( |
| 389 avg_prof_gen_peaks_p50_file, | 459 avg_prof_peaks_p50_file, |
| 390 genomic_sites_bed=madness) | 460 avg_prof_gen_peaks_p50_file, |
| 461 genomic_sites_bed=madness, | |
| 462 ) | |
| 391 # Done. | 463 # Done. |
| 392 print("Script: I'm done.") | 464 print("Script: I'm done.") |
| 393 print("Author: ... ") | 465 print("Author: ... ") |
