Mercurial > repos > rnateam > graphprot_predict_profile
comparison graphprot_train_wrapper.py @ 5:58ebf089377e draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit 902e994cb04db968ce797afa676f5fa6512ab6d3
| author | bgruening |
|---|---|
| date | Tue, 06 Aug 2024 14:54:58 +0000 |
| parents | 9a83a84a25a7 |
| children |
comparison
equal
deleted
inserted
replaced
| 4:2d6de5b769e6 | 5:58ebf089377e |
|---|---|
| 4 import os | 4 import os |
| 5 import subprocess | 5 import subprocess |
| 6 import sys | 6 import sys |
| 7 | 7 |
| 8 import gplib | 8 import gplib |
| 9 | |
| 10 | 9 |
| 11 """ | 10 """ |
| 12 | 11 |
| 13 TOOL DEPENDENCIES | 12 TOOL DEPENDENCIES |
| 14 ================= | 13 ================= |
| 60 | 59 |
| 61 | 60 |
| 62 """ | 61 """ |
| 63 | 62 |
| 64 | 63 |
| 65 ############################################################################### | 64 ####################################################################### |
| 65 | |
| 66 | 66 |
| 67 def setup_argument_parser(): | 67 def setup_argument_parser(): |
| 68 """Setup argparse parser.""" | 68 """Setup argparse parser.""" |
| 69 help_description = """ | 69 help_description = """ |
| 70 Galaxy wrapper script for GraphProt to train a GraphProt model on | 70 Galaxy wrapper script for GraphProt to train a GraphProt model on |
| 82 score out of these to store in .params file, using it later | 82 score out of these to store in .params file, using it later |
| 83 for outputting binding sites or peaks with higher confidence. | 83 for outputting binding sites or peaks with higher confidence. |
| 84 | 84 |
| 85 """ | 85 """ |
| 86 # Define argument parser. | 86 # Define argument parser. |
| 87 p = ap.ArgumentParser(add_help=False, | 87 p = ap.ArgumentParser( |
| 88 prog="graphprot_train_wrapper.py", | 88 add_help=False, |
| 89 description=help_description, | 89 prog="graphprot_train_wrapper.py", |
| 90 formatter_class=ap.MetavarTypeHelpFormatter) | 90 description=help_description, |
| 91 formatter_class=ap.MetavarTypeHelpFormatter, | |
| 92 ) | |
| 91 | 93 |
| 92 # Argument groups. | 94 # Argument groups. |
| 93 p_man = p.add_argument_group("REQUIRED ARGUMENTS") | 95 p_man = p.add_argument_group("REQUIRED ARGUMENTS") |
| 94 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") | 96 p_opt = p.add_argument_group("OPTIONAL ARGUMENTS") |
| 95 | 97 |
| 96 # Required arguments. | 98 # Required arguments. |
| 97 p_opt.add_argument("-h", "--help", | 99 p_opt.add_argument("-h", "--help", action="help", help="Print help message") |
| 98 action="help", | 100 p_man.add_argument( |
| 99 help="Print help message") | 101 "--pos", |
| 100 p_man.add_argument("--pos", | 102 dest="in_pos_fa", |
| 101 dest="in_pos_fa", | 103 type=str, |
| 102 type=str, | 104 required=True, |
| 103 required=True, | 105 help="Positive (= binding site) sequences .fa file " |
| 104 help="Positive (= binding site) sequences .fa file " | 106 "for model training (option -fasta)", |
| 105 "for model training (option -fasta)") | 107 ) |
| 106 p_man.add_argument("--neg", | 108 p_man.add_argument( |
| 107 dest="in_neg_fa", | 109 "--neg", |
| 108 type=str, | 110 dest="in_neg_fa", |
| 109 required=True, | 111 type=str, |
| 110 help="Negative sequences .fa file for model " | 112 required=True, |
| 111 "training (option -negfasta)") | 113 help="Negative sequences .fa file for model " "training (option -negfasta)", |
| 112 p_man.add_argument("--data-id", | 114 ) |
| 113 dest="data_id", | 115 p_man.add_argument( |
| 114 type=str, | 116 "--data-id", |
| 115 required=True, | 117 dest="data_id", |
| 116 help="Data ID (option -prefix)") | 118 type=str, |
| 119 required=True, | |
| 120 help="Data ID (option -prefix)", | |
| 121 ) | |
| 117 # Additional arguments. | 122 # Additional arguments. |
| 118 p_opt.add_argument("--opt-set-size", | 123 p_opt.add_argument( |
| 119 dest="opt_set_size", | 124 "--opt-set-size", |
| 120 type=int, | 125 dest="opt_set_size", |
| 121 default=500, | 126 type=int, |
| 122 help="Hyperparameter optimization set size (taken " | 127 default=500, |
| 123 "away from both --pos and --neg) (default: 500)") | 128 help="Hyperparameter optimization set size (taken " |
| 124 p_opt.add_argument("--opt-pos", | 129 "away from both --pos and --neg) (default: 500)", |
| 125 dest="opt_pos_fa", | 130 ) |
| 126 type=str, | 131 p_opt.add_argument( |
| 127 help="Positive (= binding site) sequences .fa file " | 132 "--opt-pos", |
| 128 "for hyperparameter optimization (default: take " | 133 dest="opt_pos_fa", |
| 129 "--opt-set-size from --pos)") | 134 type=str, |
| 130 p_opt.add_argument("--opt-neg", | 135 help="Positive (= binding site) sequences .fa file " |
| 131 dest="opt_neg_fa", | 136 "for hyperparameter optimization (default: take " |
| 132 type=str, | 137 "--opt-set-size from --pos)", |
| 133 help="Negative sequences .fa file for hyperparameter " | 138 ) |
| 134 "optimization (default: take --opt-set-size " | 139 p_opt.add_argument( |
| 135 "from --neg)") | 140 "--opt-neg", |
| 136 p_opt.add_argument("--min-train", | 141 dest="opt_neg_fa", |
| 137 dest="min_train", | 142 type=str, |
| 138 type=int, | 143 help="Negative sequences .fa file for hyperparameter " |
| 139 default=500, | 144 "optimization (default: take --opt-set-size " |
| 140 help="Minimum amount of training sites demanded " | 145 "from --neg)", |
| 141 "(default: 500)") | 146 ) |
| 142 p_opt.add_argument("--disable-cv", | 147 p_opt.add_argument( |
| 143 dest="disable_cv", | 148 "--min-train", |
| 144 default=False, | 149 dest="min_train", |
| 145 action="store_true", | 150 type=int, |
| 146 help="Disable cross validation step (default: false)") | 151 default=500, |
| 147 p_opt.add_argument("--disable-motifs", | 152 help="Minimum amount of training sites demanded " "(default: 500)", |
| 148 dest="disable_motifs", | 153 ) |
| 149 default=False, | 154 p_opt.add_argument( |
| 150 action="store_true", | 155 "--disable-cv", |
| 151 help="Disable motif generation step (default: false)") | 156 dest="disable_cv", |
| 152 p_opt.add_argument("--gp-output", | 157 default=False, |
| 153 dest="gp_output", | 158 action="store_true", |
| 154 default=False, | 159 help="Disable cross validation step (default: false)", |
| 155 action="store_true", | 160 ) |
| 156 help="Print output produced by GraphProt " | 161 p_opt.add_argument( |
| 157 "(default: false)") | 162 "--disable-motifs", |
| 158 p_opt.add_argument("--str-model", | 163 dest="disable_motifs", |
| 159 dest="train_str_model", | 164 default=False, |
| 160 default=False, | 165 action="store_true", |
| 161 action="store_true", | 166 help="Disable motif generation step (default: false)", |
| 162 help="Train a structure model (default: train " | 167 ) |
| 163 "a sequence model)") | 168 p_opt.add_argument( |
| 169 "--gp-output", | |
| 170 dest="gp_output", | |
| 171 default=False, | |
| 172 action="store_true", | |
| 173 help="Print output produced by GraphProt " "(default: false)", | |
| 174 ) | |
| 175 p_opt.add_argument( | |
| 176 "--str-model", | |
| 177 dest="train_str_model", | |
| 178 default=False, | |
| 179 action="store_true", | |
| 180 help="Train a structure model (default: train " "a sequence model)", | |
| 181 ) | |
| 164 return p | 182 return p |
| 165 | 183 |
| 166 | 184 |
| 167 ############################################################################### | 185 ####################################################################### |
| 168 | 186 |
| 169 if __name__ == '__main__': | 187 if __name__ == "__main__": |
| 170 | 188 |
| 171 # Setup argparse. | 189 # Setup argparse. |
| 172 parser = setup_argument_parser() | 190 parser = setup_argument_parser() |
| 173 # Read in command line arguments. | 191 # Read in command line arguments. |
| 174 args = parser.parse_args() | 192 args = parser.parse_args() |
| 180 # Check for Linux. | 198 # Check for Linux. |
| 181 assert "linux" in sys.platform, "please use Linux" | 199 assert "linux" in sys.platform, "please use Linux" |
| 182 # Check tool availability. | 200 # Check tool availability. |
| 183 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" | 201 assert gplib.is_tool("GraphProt.pl"), "GraphProt.pl not in PATH" |
| 184 # Check file inputs. | 202 # Check file inputs. |
| 185 assert os.path.exists(args.in_pos_fa), \ | 203 assert os.path.exists(args.in_pos_fa), 'positives .fa file "%s" not found' % ( |
| 186 "positives .fa file \"%s\" not found" % (args.in_pos_fa) | 204 args.in_pos_fa |
| 187 assert os.path.exists(args.in_neg_fa), \ | 205 ) |
| 188 "negatives .fa file \"%s\" not found" % (args.in_neg_fa) | 206 assert os.path.exists(args.in_neg_fa), 'negatives .fa file "%s" not found' % ( |
| 207 args.in_neg_fa | |
| 208 ) | |
| 189 # Count .fa entries. | 209 # Count .fa entries. |
| 190 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) | 210 c_pos_fa = gplib.count_fasta_headers(args.in_pos_fa) |
| 191 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) | 211 c_neg_fa = gplib.count_fasta_headers(args.in_neg_fa) |
| 192 assert c_pos_fa, "positives .fa file \"%s\" no headers found" % \ | 212 assert c_pos_fa, 'positives .fa file "%s" no headers found' % (args.in_pos_fa) |
| 193 (args.in_pos_fa) | 213 assert c_neg_fa, 'negatives .fa file "%s" no headers found' % (args.in_neg_fa) |
| 194 assert c_neg_fa, "negatives .fa file \"%s\" no headers found" % \ | |
| 195 (args.in_neg_fa) | |
| 196 print("# positive .fa sequences: %i" % (c_pos_fa)) | 214 print("# positive .fa sequences: %i" % (c_pos_fa)) |
| 197 print("# negative .fa sequences: %i" % (c_neg_fa)) | 215 print("# negative .fa sequences: %i" % (c_neg_fa)) |
| 198 # Check additional files. | 216 # Check additional files. |
| 199 if args.opt_pos_fa: | 217 if args.opt_pos_fa: |
| 200 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" | 218 assert args.opt_neg_fa, "--opt-pos but no --opt-neg given" |
| 201 if args.opt_neg_fa: | 219 if args.opt_neg_fa: |
| 202 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" | 220 assert args.opt_pos_fa, "--opt-neg but no --opt-pos given" |
| 203 # Check for lowercase only sequences, which cause GP to crash. | 221 # Check for lowercase only sequences, which cause GP to crash. |
| 204 error_mess = "input sequences encountered containing "\ | 222 error_mess = ( |
| 205 "only lowercase characters or lowercase characters in between "\ | 223 "input sequences encountered containing " |
| 206 "uppercase characters. Please provide either all uppercase "\ | 224 "only lowercase characters or lowercase characters in between " |
| 207 "sequences or sequences containing uppercase regions surrounded "\ | 225 "uppercase characters. Please provide either all uppercase " |
| 208 "by lowercase context regions for structure calculation (see "\ | 226 "sequences or sequences containing uppercase regions surrounded " |
| 209 "viewpoint concept in original GraphProt publication "\ | 227 "by lowercase context regions for structure calculation (see " |
| 228 "viewpoint concept in original GraphProt publication " | |
| 210 "for more details)" | 229 "for more details)" |
| 230 ) | |
| 211 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) | 231 seqs_dic = gplib.read_fasta_into_dic(args.in_pos_fa) |
| 212 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 232 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
| 213 assert not bad_ids, "%s" % (error_mess) | 233 assert not bad_ids, "%s" % (error_mess) |
| 214 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa) | 234 seqs_dic = gplib.read_fasta_into_dic(args.in_neg_fa) |
| 215 bad_ids = gplib.check_seqs_dic_format(seqs_dic) | 235 bad_ids = gplib.check_seqs_dic_format(seqs_dic) |
| 225 | 245 |
| 226 # If parop .fa files given. | 246 # If parop .fa files given. |
| 227 if args.opt_pos_fa and args.opt_neg_fa: | 247 if args.opt_pos_fa and args.opt_neg_fa: |
| 228 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) | 248 c_parop_pos_fa = gplib.count_fasta_headers(args.opt_pos_fa) |
| 229 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) | 249 c_parop_neg_fa = gplib.count_fasta_headers(args.opt_neg_fa) |
| 230 assert c_parop_pos_fa, "--opt-pos .fa file \"%s\" no headers found" \ | 250 assert c_parop_pos_fa, '--opt-pos .fa file "%s" no headers found' % ( |
| 231 % (args.opt_pos_fa) | 251 args.opt_pos_fa |
| 232 assert c_parop_neg_fa, "--opt-neg .fa file \"%s\" no headers found" \ | 252 ) |
| 233 % (args.opt_neg_fa) | 253 assert c_parop_neg_fa, '--opt-neg .fa file "%s" no headers found' % ( |
| 254 args.opt_neg_fa | |
| 255 ) | |
| 234 # Less than 500 for training?? You gotta be kidding. | 256 # Less than 500 for training?? You gotta be kidding. |
| 235 assert c_pos_fa >= args.min_train, \ | 257 assert c_pos_fa >= args.min_train, ( |
| 236 "--pos for training < %i, please provide more (try at least "\ | 258 "--pos for training < %i, please provide more (try at least " |
| 237 "> 1000, the more the better)" % (args.min_train) | 259 "> 1000, the more the better)" % (args.min_train) |
| 238 assert c_neg_fa >= args.min_train, \ | 260 ) |
| 239 "--neg for training < %i, please provide more (try at least "\ | 261 assert c_neg_fa >= args.min_train, ( |
| 262 "--neg for training < %i, please provide more (try at least " | |
| 240 "> 1000, the more the better)" % (args.min_train) | 263 "> 1000, the more the better)" % (args.min_train) |
| 264 ) | |
| 241 # Looking closer at ratios. | 265 # Looking closer at ratios. |
| 242 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa | 266 pos_neg_ratio = c_parop_pos_fa / c_parop_neg_fa |
| 243 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: | 267 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: |
| 244 assert 0, "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 "\ | 268 assert 0, ( |
| 245 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ | 269 "ratio of --opt-pos to --opt-neg < 0.8 or > 1.25 " |
| 246 "identical numbers (keep in mind that performance measures "\ | 270 "(ratio = %f). Try to keep ratio closer to 1 or better use " |
| 247 "such as accuracy or AUROC are not suitable for imbalanced "\ | 271 "identical numbers (keep in mind that performance measures " |
| 272 "such as accuracy or AUROC are not suitable for imbalanced " | |
| 248 " datasets!)" % (pos_neg_ratio) | 273 " datasets!)" % (pos_neg_ratio) |
| 274 ) | |
| 249 else: | 275 else: |
| 250 # Define some minimum amount of training sites for the sake of sanity. | 276 # Define some minimum amount of training sites for the sake of sanity. |
| 251 c_pos_train = c_pos_fa - args.opt_set_size | 277 c_pos_train = c_pos_fa - args.opt_set_size |
| 252 c_neg_train = c_neg_fa - args.opt_set_size | 278 c_neg_train = c_neg_fa - args.opt_set_size |
| 253 # Start complaining. | 279 # Start complaining. |
| 254 assert c_pos_fa >= args.opt_set_size, \ | 280 assert ( |
| 255 "# positives < --opt-set-size (%i < %i)" \ | 281 c_pos_fa >= args.opt_set_size |
| 256 % (c_pos_fa, args.opt_set_size) | 282 ), "# positives < --opt-set-size (%i < %i)" % (c_pos_fa, args.opt_set_size) |
| 257 assert c_neg_fa >= args.opt_set_size, \ | 283 assert ( |
| 258 "# negatives < --opt-set-size (%i < %i)" \ | 284 c_neg_fa >= args.opt_set_size |
| 259 % (c_neg_fa, args.opt_set_size) | 285 ), "# negatives < --opt-set-size (%i < %i)" % (c_neg_fa, args.opt_set_size) |
| 260 assert c_pos_train >= args.opt_set_size, \ | 286 assert ( |
| 261 "# positives remaining for training < --opt-set-size "\ | 287 c_pos_train >= args.opt_set_size |
| 262 "(%i < %i)" % (c_pos_train, args.opt_set_size) | 288 ), "# positives remaining for training < --opt-set-size " "(%i < %i)" % ( |
| 263 assert c_neg_train >= args.opt_set_size, "# negatives remaining "\ | 289 c_pos_train, |
| 264 "for training < --opt-set-size (%i < %i)" \ | 290 args.opt_set_size, |
| 265 % (c_neg_train, args.opt_set_size) | 291 ) |
| 292 assert ( | |
| 293 c_neg_train >= args.opt_set_size | |
| 294 ), "# negatives remaining " "for training < --opt-set-size (%i < %i)" % ( | |
| 295 c_neg_train, | |
| 296 args.opt_set_size, | |
| 297 ) | |
| 266 # Less than 500?? You gotta be kidding. | 298 # Less than 500?? You gotta be kidding. |
| 267 assert c_pos_train >= args.min_train, \ | 299 assert c_pos_train >= args.min_train, ( |
| 268 "# positives remaining for training < %i, please provide more "\ | 300 "# positives remaining for training < %i, please provide more " |
| 269 " (try at least > 1000, the more the better)" % (args.min_train) | 301 " (try at least > 1000, the more the better)" % (args.min_train) |
| 270 assert c_neg_train >= args.min_train, \ | 302 ) |
| 271 "# negatives remaining for training < %i, please provide more "\ | 303 assert c_neg_train >= args.min_train, ( |
| 304 "# negatives remaining for training < %i, please provide more " | |
| 272 "(try at least > 1000, the more the better)" % (args.min_train) | 305 "(try at least > 1000, the more the better)" % (args.min_train) |
| 306 ) | |
| 273 # Looking closer at ratios. | 307 # Looking closer at ratios. |
| 274 pos_neg_ratio = c_pos_train / c_neg_train | 308 pos_neg_ratio = c_pos_train / c_neg_train |
| 275 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: | 309 if pos_neg_ratio < 0.8 or pos_neg_ratio > 1.25: |
| 276 assert 0, "ratio of --pos to --neg < 0.8 or > 1.25 "\ | 310 assert 0, ( |
| 277 "(ratio = %f). Try to keep ratio closer to 1 or better use "\ | 311 "ratio of --pos to --neg < 0.8 or > 1.25 " |
| 278 "identical numbers (keep in mind that performance measures "\ | 312 "(ratio = %f). Try to keep ratio closer to 1 or better use " |
| 279 "such as accuracy or AUROC are not suitable for imbalanced "\ | 313 "identical numbers (keep in mind that performance measures " |
| 314 "such as accuracy or AUROC are not suitable for imbalanced " | |
| 280 "datasets!)" % (pos_neg_ratio) | 315 "datasets!)" % (pos_neg_ratio) |
| 316 ) | |
| 281 | 317 |
| 282 """ | 318 """ |
| 283 Generate parop + train .fa output files for hyperparameter | 319 Generate parop + train .fa output files for hyperparameter |
| 284 optimization + training. | 320 optimization + training. |
| 285 | 321 |
| 297 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) | 333 gplib.make_file_copy(args.opt_neg_fa, neg_parop_fa) |
| 298 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) | 334 gplib.make_file_copy(args.in_pos_fa, pos_train_fa) |
| 299 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) | 335 gplib.make_file_copy(args.in_neg_fa, neg_train_fa) |
| 300 else: | 336 else: |
| 301 # Generate parop + train .fa files from input .fa files. | 337 # Generate parop + train .fa files from input .fa files. |
| 302 gplib.split_fasta_into_test_train_files(args.in_pos_fa, pos_parop_fa, | 338 gplib.split_fasta_into_test_train_files( |
| 303 pos_train_fa, | 339 args.in_pos_fa, pos_parop_fa, pos_train_fa, test_size=args.opt_set_size |
| 304 test_size=args.opt_set_size) | 340 ) |
| 305 gplib.split_fasta_into_test_train_files(args.in_neg_fa, neg_parop_fa, | 341 gplib.split_fasta_into_test_train_files( |
| 306 neg_train_fa, | 342 args.in_neg_fa, neg_parop_fa, neg_train_fa, test_size=args.opt_set_size |
| 307 test_size=args.opt_set_size) | 343 ) |
| 308 | 344 |
| 309 """ | 345 """ |
| 310 Do the hyperparameter optimization. | 346 Do the hyperparameter optimization. |
| 311 | 347 |
| 312 """ | 348 """ |
| 313 print("Starting hyperparameter optimization (-action ls) ... ") | 349 print("Starting hyperparameter optimization (-action ls) ... ") |
| 314 check_cmd = "GraphProt.pl -action ls -prefix " + args.data_id + \ | 350 check_cmd = ( |
| 315 " -fasta " + pos_parop_fa + " -negfasta " + neg_parop_fa | 351 "GraphProt.pl -action ls -prefix " |
| 352 + args.data_id | |
| 353 + " -fasta " | |
| 354 + pos_parop_fa | |
| 355 + " -negfasta " | |
| 356 + neg_parop_fa | |
| 357 ) | |
| 316 # If sequence model should be trained (default). | 358 # If sequence model should be trained (default). |
| 317 if not args.train_str_model: | 359 if not args.train_str_model: |
| 318 check_cmd += " -onlyseq" | 360 check_cmd += " -onlyseq" |
| 319 print(check_cmd) | 361 print(check_cmd) |
| 320 output = subprocess.getoutput(check_cmd) | 362 output = subprocess.getoutput(check_cmd) |
| 321 params_file = args.data_id + ".params" | 363 params_file = args.data_id + ".params" |
| 322 assert os.path.exists(params_file), "Hyperparameter optimization output "\ | 364 assert os.path.exists( |
| 323 " .params file \"%s\" not found" % (params_file) | 365 params_file |
| 366 ), "Hyperparameter optimization output " ' .params file "%s" not found' % ( | |
| 367 params_file | |
| 368 ) | |
| 324 # Add model type to params file. | 369 # Add model type to params file. |
| 325 if args.train_str_model: | 370 if args.train_str_model: |
| 326 gplib.echo_add_to_file("model_type: structure", params_file) | 371 gplib.echo_add_to_file("model_type: structure", params_file) |
| 327 else: | 372 else: |
| 328 gplib.echo_add_to_file("model_type: sequence", params_file) | 373 gplib.echo_add_to_file("model_type: sequence", params_file) |
| 332 """ | 377 """ |
| 333 Do the model training. (Yowza!) | 378 Do the model training. (Yowza!) |
| 334 | 379 |
| 335 """ | 380 """ |
| 336 print("Starting model training (-action train) ... ") | 381 print("Starting model training (-action train) ... ") |
| 337 check_cmd = "GraphProt.pl -action train -prefix " + args.data_id \ | 382 check_cmd = ( |
| 338 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 383 "GraphProt.pl -action train -prefix " |
| 339 + " " + param_string | 384 + args.data_id |
| 385 + " -fasta " | |
| 386 + pos_train_fa | |
| 387 + " -negfasta " | |
| 388 + neg_train_fa | |
| 389 + " " | |
| 390 + param_string | |
| 391 ) | |
| 340 print(check_cmd) | 392 print(check_cmd) |
| 341 output = subprocess.getoutput(check_cmd) | 393 output = subprocess.getoutput(check_cmd) |
| 342 assert output, \ | 394 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
| 343 "The following call of GraphProt.pl produced no output:\n%s" \ | 395 check_cmd |
| 344 % (check_cmd) | 396 ) |
| 345 if args.gp_output: | 397 if args.gp_output: |
| 346 print(output) | 398 print(output) |
| 347 model_file = args.data_id + ".model" | 399 model_file = args.data_id + ".model" |
| 348 assert os.path.exists(model_file), \ | 400 assert os.path.exists(model_file), 'Training output .model file "%s" not found' % ( |
| 349 "Training output .model file \"%s\" not found" % (model_file) | 401 model_file |
| 402 ) | |
| 350 | 403 |
| 351 """ | 404 """ |
| 352 Do the 10-fold cross validation. | 405 Do the 10-fold cross validation. |
| 353 | 406 |
| 354 """ | 407 """ |
| 355 if not args.disable_cv: | 408 if not args.disable_cv: |
| 356 print("Starting 10-fold cross validation (-action cv) ... ") | 409 print("Starting 10-fold cross validation (-action cv) ... ") |
| 357 check_cmd = "GraphProt.pl -action cv -prefix " + args.data_id \ | 410 check_cmd = ( |
| 358 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 411 "GraphProt.pl -action cv -prefix " |
| 359 + " " + param_string + " -model " + model_file | 412 + args.data_id |
| 413 + " -fasta " | |
| 414 + pos_train_fa | |
| 415 + " -negfasta " | |
| 416 + neg_train_fa | |
| 417 + " " | |
| 418 + param_string | |
| 419 + " -model " | |
| 420 + model_file | |
| 421 ) | |
| 360 print(check_cmd) | 422 print(check_cmd) |
| 361 output = subprocess.getoutput(check_cmd) | 423 output = subprocess.getoutput(check_cmd) |
| 362 assert output, \ | 424 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
| 363 "The following call of GraphProt.pl produced no output:\n%s" \ | 425 check_cmd |
| 364 % (check_cmd) | 426 ) |
| 365 if args.gp_output: | 427 if args.gp_output: |
| 366 print(output) | 428 print(output) |
| 367 cv_results_file = args.data_id + ".cv_results" | 429 cv_results_file = args.data_id + ".cv_results" |
| 368 assert os.path.exists(cv_results_file), \ | 430 assert os.path.exists( |
| 369 "CV output .cv_results file \"%s\" not found" % (cv_results_file) | 431 cv_results_file |
| 432 ), 'CV output .cv_results file "%s" not found' % (cv_results_file) | |
| 370 | 433 |
| 371 """ | 434 """ |
| 372 Do the motif generation. | 435 Do the motif generation. |
| 373 | 436 |
| 374 """ | 437 """ |
| 375 if not args.disable_motifs: | 438 if not args.disable_motifs: |
| 376 print("Starting motif generation (-action motif) ... ") | 439 print("Starting motif generation (-action motif) ... ") |
| 377 check_cmd = "GraphProt.pl -action motif -prefix " + args.data_id \ | 440 check_cmd = ( |
| 378 + " -fasta " + pos_train_fa + " -negfasta " + neg_train_fa \ | 441 "GraphProt.pl -action motif -prefix " |
| 379 + " " + param_string + " -model " + model_file | 442 + args.data_id |
| 443 + " -fasta " | |
| 444 + pos_train_fa | |
| 445 + " -negfasta " | |
| 446 + neg_train_fa | |
| 447 + " " | |
| 448 + param_string | |
| 449 + " -model " | |
| 450 + model_file | |
| 451 ) | |
| 380 print(check_cmd) | 452 print(check_cmd) |
| 381 output = subprocess.getoutput(check_cmd) | 453 output = subprocess.getoutput(check_cmd) |
| 382 assert output, \ | 454 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
| 383 "The following call of GraphProt.pl produced no output:\n%s" \ | 455 check_cmd |
| 384 % (check_cmd) | 456 ) |
| 385 if args.gp_output: | 457 if args.gp_output: |
| 386 print(output) | 458 print(output) |
| 387 seq_motif_file = args.data_id + ".sequence_motif" | 459 seq_motif_file = args.data_id + ".sequence_motif" |
| 388 seq_motif_png_file = args.data_id + ".sequence_motif.png" | 460 seq_motif_png_file = args.data_id + ".sequence_motif.png" |
| 389 assert os.path.exists(seq_motif_file), \ | 461 assert os.path.exists( |
| 390 "Motif output .sequence_motif file \"%s\" not found" \ | 462 seq_motif_file |
| 391 % (seq_motif_file) | 463 ), 'Motif output .sequence_motif file "%s" not found' % (seq_motif_file) |
| 392 assert os.path.exists(seq_motif_png_file), \ | 464 assert os.path.exists( |
| 393 "Motif output .sequence_motif.png file \"%s\" not found" \ | 465 seq_motif_png_file |
| 394 % (seq_motif_png_file) | 466 ), 'Motif output .sequence_motif.png file "%s" not found' % (seq_motif_png_file) |
| 395 if args.train_str_model: | 467 if args.train_str_model: |
| 396 str_motif_file = args.data_id + ".structure_motif" | 468 str_motif_file = args.data_id + ".structure_motif" |
| 397 str_motif_png_file = args.data_id + ".structure_motif.png" | 469 str_motif_png_file = args.data_id + ".structure_motif.png" |
| 398 assert os.path.exists(str_motif_file), \ | 470 assert os.path.exists( |
| 399 "Motif output .structure_motif file \"%s\" not found" \ | 471 str_motif_file |
| 400 % (str_motif_file) | 472 ), 'Motif output .structure_motif file "%s" not found' % (str_motif_file) |
| 401 assert os.path.exists(str_motif_png_file), \ | 473 assert os.path.exists( |
| 402 "Motif output .structure_motif.png file \"%s\" not found" \ | 474 str_motif_png_file |
| 403 % (str_motif_png_file) | 475 ), 'Motif output .structure_motif.png file "%s" not found' % ( |
| 476 str_motif_png_file | |
| 477 ) | |
| 404 | 478 |
| 405 """ | 479 """ |
| 406 Do whole site predictions on positive training set. | 480 Do whole site predictions on positive training set. |
| 407 | 481 |
| 408 """ | 482 """ |
| 409 print("Starting whole site predictions on positive training set " | 483 print( |
| 410 " (-action predict) ... ") | 484 "Starting whole site predictions on positive training set " |
| 411 check_cmd = "GraphProt.pl -action predict -prefix " + args.data_id \ | 485 " (-action predict) ... " |
| 412 + " -fasta " + pos_train_fa + " " + param_string \ | 486 ) |
| 413 + " -model " + model_file | 487 check_cmd = ( |
| 488 "GraphProt.pl -action predict -prefix " | |
| 489 + args.data_id | |
| 490 + " -fasta " | |
| 491 + pos_train_fa | |
| 492 + " " | |
| 493 + param_string | |
| 494 + " -model " | |
| 495 + model_file | |
| 496 ) | |
| 414 print(check_cmd) | 497 print(check_cmd) |
| 415 output = subprocess.getoutput(check_cmd) | 498 output = subprocess.getoutput(check_cmd) |
| 416 assert output, \ | 499 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
| 417 "The following call of GraphProt.pl produced no output:\n%s" \ | 500 check_cmd |
| 418 % (check_cmd) | 501 ) |
| 419 if args.gp_output: | 502 if args.gp_output: |
| 420 print(output) | 503 print(output) |
| 421 ws_predictions_file = args.data_id + ".predictions" | 504 ws_predictions_file = args.data_id + ".predictions" |
| 422 assert os.path.exists(ws_predictions_file), \ | 505 assert os.path.exists( |
| 423 "Whole site prediction output .predictions file \"%s\" not found" \ | 506 ws_predictions_file |
| 424 % (ws_predictions_file) | 507 ), 'Whole site prediction output .predictions file "%s" not found' % ( |
| 508 ws_predictions_file | |
| 509 ) | |
| 425 | 510 |
| 426 """ | 511 """ |
| 427 Do profile predictions on positive training set. | 512 Do profile predictions on positive training set. |
| 428 | 513 |
| 429 """ | 514 """ |
| 430 print("Starting profile predictions on positive training set " | 515 print( |
| 431 "-action predict_profile) ... ") | 516 "Starting profile predictions on positive training set " |
| 432 check_cmd = "GraphProt.pl -action predict_profile -prefix " \ | 517 "-action predict_profile) ... " |
| 433 + args.data_id + " -fasta " + pos_train_fa + " " \ | 518 ) |
| 434 + param_string + " -model " + model_file | 519 check_cmd = ( |
| 520 "GraphProt.pl -action predict_profile -prefix " | |
| 521 + args.data_id | |
| 522 + " -fasta " | |
| 523 + pos_train_fa | |
| 524 + " " | |
| 525 + param_string | |
| 526 + " -model " | |
| 527 + model_file | |
| 528 ) | |
| 435 print(check_cmd) | 529 print(check_cmd) |
| 436 output = subprocess.getoutput(check_cmd) | 530 output = subprocess.getoutput(check_cmd) |
| 437 assert output, \ | 531 assert output, "The following call of GraphProt.pl produced no output:\n%s" % ( |
| 438 "The following call of GraphProt.pl produced no output:\n%s" \ | 532 check_cmd |
| 439 % (check_cmd) | 533 ) |
| 440 if args.gp_output: | 534 if args.gp_output: |
| 441 print(output) | 535 print(output) |
| 442 profile_predictions_file = args.data_id + ".profile" | 536 profile_predictions_file = args.data_id + ".profile" |
| 443 assert os.path.exists(profile_predictions_file), \ | 537 assert os.path.exists( |
| 444 "Profile prediction output .profile file \"%s\" not found" \ | 538 profile_predictions_file |
| 445 % (profile_predictions_file) | 539 ), 'Profile prediction output .profile file "%s" not found' % ( |
| 540 profile_predictions_file | |
| 541 ) | |
| 446 | 542 |
| 447 """ | 543 """ |
| 448 Get 50 % score (median) for .predictions and .profile file. | 544 Get 50 % score (median) for .predictions and .profile file. |
| 449 For .profile, first extract for each site the maximum score, and then | 545 For .profile, first extract for each site the maximum score, and then |
| 450 from the list of maximum site scores get the median. | 546 from the list of maximum site scores get the median. |
| 452 | 548 |
| 453 """ | 549 """ |
| 454 print("Getting .profile and .predictions median scores ... ") | 550 print("Getting .profile and .predictions median scores ... ") |
| 455 | 551 |
| 456 # Whole site scores median. | 552 # Whole site scores median. |
| 457 ws_pred_median = \ | 553 ws_pred_median = gplib.graphprot_predictions_get_median(ws_predictions_file) |
| 458 gplib.graphprot_predictions_get_median(ws_predictions_file) | |
| 459 # Profile top site scores median. | 554 # Profile top site scores median. |
| 460 profile_median = \ | 555 profile_median = gplib.graphprot_profile_get_tsm( |
| 461 gplib.graphprot_profile_get_tsm(profile_predictions_file, | 556 profile_predictions_file, profile_type="profile" |
| 462 profile_type="profile") | 557 ) |
| 463 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) | 558 ws_pred_string = "pos_train_ws_pred_median: %f" % (ws_pred_median) |
| 464 profile_string = "pos_train_profile_median: %f" % (profile_median) | 559 profile_string = "pos_train_profile_median: %f" % (profile_median) |
| 465 gplib.echo_add_to_file(ws_pred_string, params_file) | 560 gplib.echo_add_to_file(ws_pred_string, params_file) |
| 466 gplib.echo_add_to_file(profile_string, params_file) | 561 gplib.echo_add_to_file(profile_string, params_file) |
| 467 # Average profile top site scores median for extlr 1 to 10. | 562 # Average profile top site scores median for extlr 1 to 10. |
| 468 for i in range(10): | 563 for i in range(10): |
| 469 i += 1 | 564 i += 1 |
| 470 avg_profile_median = \ | 565 avg_profile_median = gplib.graphprot_profile_get_tsm( |
| 471 gplib.graphprot_profile_get_tsm(profile_predictions_file, | 566 profile_predictions_file, profile_type="avg_profile", avg_profile_extlr=i |
| 472 profile_type="avg_profile", | 567 ) |
| 473 avg_profile_extlr=i) | 568 |
| 474 | 569 avg_profile_string = "pos_train_avg_profile_median_%i: %f" % ( |
| 475 avg_profile_string = "pos_train_avg_profile_median_%i: %f" \ | 570 i, |
| 476 % (i, avg_profile_median) | 571 avg_profile_median, |
| 572 ) | |
| 477 gplib.echo_add_to_file(avg_profile_string, params_file) | 573 gplib.echo_add_to_file(avg_profile_string, params_file) |
| 478 | 574 |
| 479 print("Script: I'm done.") | 575 print("Script: I'm done.") |
| 480 print("Author: Good. Now go back to your file system directory.") | 576 print("Author: Good. Now go back to your file system directory.") |
| 481 print("Script: Ok.") | 577 print("Script: Ok.") |
