comparison gplib.py @ 3:9a83a84a25a7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author: bgruening
date: Wed, 27 Jan 2021 19:27:12 +0000
parents: adcc4c457c3c
children: 58ebf089377e
2:4cebb3439e1a (old) | 3:9a83a84a25a7 (new)
1 | 1 |
2 import gzip | |
3 import random | |
4 import re | |
5 import statistics | |
6 import subprocess | |
2 from distutils.spawn import find_executable | 7 from distutils.spawn import find_executable |
3 import subprocess | 8 |
4 import statistics | |
5 import random | |
6 import gzip | |
7 import uuid | |
8 import sys | |
9 import re | |
10 import os | |
11 | 9 |
12 """ | 10 """ |
13 | 11 |
14 Run doctests: | 12 Run doctests: |
15 | 13 |
17 | 15 |
18 | 16 |
19 """ | 17 """ |
20 | 18 |
21 | 19 |
22 ################################################################################ | 20 ############################################################################### |
23 | 21 |
24 def graphprot_predictions_get_median(predictions_file): | 22 def graphprot_predictions_get_median(predictions_file): |
25 """ | 23 """ |
26 Given a GraphProt .predictions file, read in site scores and return | 24 Given a GraphProt .predictions file, read in site scores and return |
27 the median value. | 25 the median value. |
28 | 26 |
29 >>> test_file = "test-data/test.predictions" | 27 >>> test_file = "test-data/test.predictions" |
30 >>> graphprot_predictions_get_median(test_file) | 28 >>> graphprot_predictions_get_median(test_file) |
31 0.571673 | 29 0.571673 |
41 f.close() | 39 f.close() |
42 # Return the median. | 40 # Return the median. |
43 return statistics.median(sc_list) | 41 return statistics.median(sc_list) |
44 | 42 |
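A minimal standalone sketch of what the elided reading loop presumably does (the score is assumed to sit in column 3 of the tab-separated .predictions file, the same column graphprot_filter_predictions_file below treats as the site score); the expected median comes from the doctest above:

    import statistics

    sc_list = []
    with open("test-data/test.predictions") as f:
        for line in f:
            cols = line.strip().split("\t")
            sc_list.append(float(cols[2]))
    print(statistics.median(sc_list))  # 0.571673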
45 | 43 |
46 ################################################################################ | 44 ############################################################################### |
47 | 45 |
48 def graphprot_profile_get_top_scores_median(profile_file, | 46 def graphprot_profile_get_tsm(profile_file, |
49 profile_type="profile", | 47 profile_type="profile", |
50 avg_profile_extlr=5): | 48 avg_profile_extlr=5): |
51 | 49 |
52 """ | 50 """ |
53 Given a GraphProt .profile file, extract for each site (identified by | 51 Given a GraphProt .profile file, extract for each site (identified by |
54 column 1 ID) the top (= highest) score. Then return the median of these | 52 column 1 ID) the top (= highest) score. Then return the median of these |
55 top scores. | 53 top scores. |
56 | 54 |
57 profile_type can be either "profile" or "avg_profile". | 55 profile_type can be either "profile" or "avg_profile". |
58 "avg_profile means that the position-wise scores will first get smoothed | 56 "avg_profile means that the position-wise scores will first get smoothed |
59 out by calculating for each position a new score through taking a | 57 out by calculating for each position a new score through taking a |
60 sequence window -avg_profile_extlr to +avg_profile_extlr of the position | 58 sequence window -avg_profile_extlr to +avg_profile_extlr of the position |
61 and calculate the mean score over this window and assign it to the position. | 59 and calculate the mean score over this window and assign it to the |
62 After that, the maximum score of each site is chosen, and the median over | 60 position. After that, the maximum score of each site is chosen, and the |
63 all maximum scores is returned. | 61 median over all maximum scores is returned. |
64 "profile" leaves the position-wise scores as they are, directly extracting | 62 "profile" leaves the position-wise scores as they are, directly extracting |
65 the maximum for each site and then reporting the median. | 63 the maximum for each site and then reporting the median. |
66 | 64 |
67 >>> test_file = "test-data/test.profile" | 65 >>> test_file = "test-data/test.profile" |
68 >>> graphprot_profile_get_top_scores_median(test_file) | 66 >>> graphprot_profile_get_tsm(test_file) |
69 3.2 | 67 3.2 |
70 | 68 |
71 """ | 69 """ |
72 # Dictionary of lists, with list of scores (value) for each site (key). | 70 # Dictionary of lists, with list of scores (value) for each site (key). |
73 lists_dic = {} | 71 lists_dic = {} |
88 if profile_type == "profile": | 86 if profile_type == "profile": |
89 max_sc = max(lists_dic[seq_id]) | 87 max_sc = max(lists_dic[seq_id]) |
90 max_list.append(max_sc) | 88 max_list.append(max_sc) |
91 elif profile_type == "avg_profile": | 89 elif profile_type == "avg_profile": |
92 # Convert profile score list to average profile scores list. | 90 # Convert profile score list to average profile scores list. |
93 aps_list = list_moving_window_average_values(lists_dic[seq_id], | 91 aps_list = \ |
94 win_extlr=avg_profile_extlr) | 92 list_moving_window_average_values(lists_dic[seq_id], |
93 win_extlr=avg_profile_extlr) | |
95 max_sc = max(aps_list) | 94 max_sc = max(aps_list) |
96 max_list.append(max_sc) | 95 max_list.append(max_sc) |
97 else: | 96 else: |
98 assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type) | 97 assert 0, "invalid profile_type argument given: \"%s\"" \ |
98 % (profile_type) | |
99 # Return the median. | 99 # Return the median. |
100 return statistics.median(max_list) | 100 return statistics.median(max_list) |
101 | 101 |
102 | 102 |
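A short usage sketch (assuming gplib is importable as a module); the raw-profile value comes from the doctest above, while the avg_profile value depends on the smoothing window:

    from gplib import graphprot_profile_get_tsm

    # Median of per-site maximum scores on the raw profile:
    print(graphprot_profile_get_tsm("test-data/test.profile"))  # 3.2

    # Same, but smoothing each site's scores first (window +/- 5 nt):
    tsm_avg = graphprot_profile_get_tsm("test-data/test.profile",
                                        profile_type="avg_profile",
                                        avg_profile_extlr=5)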
103 ################################################################################ | 103 ############################################################################### |
104 | 104 |
105 def list_moving_window_average_values(in_list, | 105 def list_moving_window_average_values(in_list, |
106 win_extlr=5, | 106 win_extlr=5, |
107 method=1): | 107 method=1): |
108 """ | 108 """ |
109 Take a list of numeric values, and calculate for each position a new value, | 109 Take a list of numeric values, and calculate for each position a new value, |
110 by taking the mean value of the window of positions from -win_extlr to | 110 by taking the mean value of the window of positions from -win_extlr to |
111 +win_extlr around it. If full extension is not possible (at list ends), it just | 111 +win_extlr around it. If full extension is not possible (at list ends), it just |
112 takes what it gets. | 112 takes what it gets. |
113 Two implementations of the task are given, chosen by method=1 or method=2. | 113 Two implementations of the task are given, chosen by method=1 or method=2. |
114 | 114 |
115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] | 115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] |
116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) | 116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) |
140 e = i + win_extlr + 1 | 140 e = i + win_extlr + 1 |
141 if s < 0: | 141 if s < 0: |
142 s = 0 | 142 s = 0 |
143 if e > l_list: | 143 if e > l_list: |
144 e = l_list | 144 e = l_list |
145 l = e-s | 145 ln = e - s |
146 sc_sum = 0 | 146 sc_sum = 0 |
147 for j in range(l): | 147 for j in range(ln): |
148 sc_sum += in_list[s+j] | 148 sc_sum += in_list[s + j] |
149 new_list[i] = sc_sum / l | 149 new_list[i] = sc_sum / ln |
150 else: | 150 else: |
151 assert 0, "invalid method ID given (%i)" %(method) | 151 assert 0, "invalid method ID given (%i)" % (method) |
152 return new_list | 152 return new_list |
153 | 153 |
154 | 154 |
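A worked example of the window truncation at the list ends, using the doctest input above: with test_list = [2, 3, 5, 8, 4, 3, 7, 1] and win_extlr=2, position 0 can only extend to the right, so its window is [2, 3, 5] and its new value is 10/3 ≈ 3.33; position 3 gets the full window [3, 5, 8, 4, 3], giving 23/5 = 4.6; position 7 gets [3, 7, 1], giving 11/3 ≈ 3.67.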
155 ################################################################################ | 155 ############################################################################### |
156 | 156 |
157 def echo_add_to_file(echo_string, out_file): | 157 def echo_add_to_file(echo_string, out_file): |
158 """ | 158 """ |
159 Add a string to a file using the echo command. | 159 Add a string to a file using the echo command. |
160 | 160 |
162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) | 162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) |
163 output = subprocess.getoutput(check_cmd) | 163 output = subprocess.getoutput(check_cmd) |
164 error = False | 164 error = False |
165 if output: | 165 if output: |
166 error = True | 166 error = True |
167 assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output) | 167 assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output) |
168 | 168 |
169 | 169 |
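Note that the echo route spawns a shell and would mangle strings containing double quotes; a pure-Python sketch with the same append-plus-newline effect (not part of the module):

    def append_string_to_file(add_string, out_file):
        # Same effect as: echo "string" >> out_file
        with open(out_file, "a") as f:
            f.write(add_string + "\n")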
170 ################################################################################ | 170 ############################################################################### |
171 | 171 |
172 def is_tool(name): | 172 def is_tool(name): |
173 """Check whether tool "name" is in PATH.""" | 173 """Check whether tool "name" is in PATH.""" |
174 return find_executable(name) is not None | 174 return find_executable(name) is not None |
175 | 175 |
176 | 176 |
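distutils is deprecated (PEP 632) and removed in Python 3.12; shutil.which from the standard library is a drop-in replacement for this check:

    import shutil

    def is_tool(name):
        """Check whether tool "name" is in PATH."""
        return shutil.which(name) is not None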
177 ################################################################################ | 177 ############################################################################### |
178 | 178 |
179 def count_fasta_headers(fasta_file): | 179 def count_fasta_headers(fasta_file): |
180 """ | 180 """ |
181 Count number of FASTA headers in fasta_file using grep. | 181 Count number of FASTA headers in fasta_file using grep. |
182 | 182 |
192 output = subprocess.getoutput(check_cmd) | 192 output = subprocess.getoutput(check_cmd) |
193 row_count = int(output.strip()) | 193 row_count = int(output.strip()) |
194 return row_count | 194 return row_count |
195 | 195 |
196 | 196 |
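The elided body shells out (presumably to grep); a subprocess-free sketch that counts the same thing:

    def count_fasta_headers_py(fasta_file):
        # Count lines starting with ">" (one per FASTA record).
        with open(fasta_file) as f:
            return sum(1 for line in f if line.startswith(">"))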
197 ################################################################################ | 197 ############################################################################### |
198 | 198 |
199 def make_file_copy(in_file, out_file): | 199 def make_file_copy(in_file, out_file): |
200 """ | 200 """ |
201 Make a file copy by copying in_file to out_file. | 201 Make a file copy by copying in_file to out_file. |
202 | 202 |
203 """ | 203 """ |
204 check_cmd = "cat " + in_file + " > " + out_file | 204 check_cmd = "cat " + in_file + " > " + out_file |
205 assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd) | 205 assert in_file != out_file, \ |
206 "cat does not like to cat file into same file (%s)" % (check_cmd) | |
206 output = subprocess.getoutput(check_cmd) | 207 output = subprocess.getoutput(check_cmd) |
207 error = False | 208 error = False |
208 if output: | 209 if output: |
209 error = True | 210 error = True |
210 assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output) | 211 assert not error, \ |
211 | 212 "cat did not like your input (in_file: %s, out_file: %s):\n%s" \ |
212 | 213 % (in_file, out_file, output) |
213 ################################################################################ | 214 |
214 | 215 |
215 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, | 216 ############################################################################### |
217 | |
218 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, | |
216 test_size=500): | 219 test_size=500): |
217 """ | 220 """ |
218 Split in_fasta .fa file into two files (e.g. test, train). | 221 Split in_fasta .fa file into two files (e.g. test, train). |
219 | 222 |
220 """ | 223 """ |
234 c_out += 1 | 237 c_out += 1 |
235 TESTOUT.close() | 238 TESTOUT.close() |
236 TRAINOUT.close() | 239 TRAINOUT.close() |
237 | 240 |
238 | 241 |
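A hypothetical call, assuming the split is sequential (the c_out counter in the body suggests the first test_size sequences go to the test file): with test-data/test.fa holding two sequences, test_size=1 would write seq1 to the test file and seq2 to the training file:

    split_fasta_into_test_train_files("test-data/test.fa",
                                      "test_split.fa",
                                      "train_split.fa",
                                      test_size=1)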
239 ################################################################################ | 242 ############################################################################### |
243 | |
244 def check_seqs_dic_format(seqs_dic): | |
245 """ | |
246 Check sequence dictionary for lowercase-only sequences or sequences |
247 which have lowercase nts in between uppercase nts. |
248 Return the suspicious IDs as a list, or an empty list if there |
249 are no hits. |
250 | |
251 >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"} | |
252 >>> check_seqs_dic_format(seqs_dic) | |
253 ['id2', 'id3'] | |
254 >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"} | |
255 >>> check_seqs_dic_format(seqs_dic) | |
256 [] | |
257 | |
258 """ | |
259 assert seqs_dic, "given seqs_dic empty" | |
260 bad_seq_ids = [] | |
261 for seq_id in seqs_dic: | |
262 seq = seqs_dic[seq_id] | |
263 if re.search("^[acgtun]+$", seq): | |
264 bad_seq_ids.append(seq_id) | |
265 if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq): | |
266 bad_seq_ids.append(seq_id) | |
267 return bad_seq_ids | |
268 | |
269 | |
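The two regular expressions divide the work: ^[acgtun]+$ flags sequences that are lowercase throughout (id2 in the doctest), while [ACGTUN][acgtun]+[ACGTUN] flags a lowercase run enclosed by uppercase nts on both sides (the "UaU" in id3). A sequence like "acgAUaa" is not flagged by either pattern, because its lowercase tail is never followed by an uppercase nt again.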
270 ############################################################################### | |
240 | 271 |
241 def read_fasta_into_dic(fasta_file, | 272 def read_fasta_into_dic(fasta_file, |
242 seqs_dic=False, | 273 seqs_dic=False, |
243 ids_dic=False, | 274 ids_dic=False, |
244 read_dna=False, | 275 read_dna=False, |
276 short_ensembl=False, | |
245 reject_lc=False, | 277 reject_lc=False, |
246 convert_to_uc=False, | 278 convert_to_uc=False, |
247 skip_n_seqs=True): | 279 skip_n_seqs=True): |
248 """ | 280 """ |
249 Read in FASTA sequences, convert to RNA, store in dictionary | 281 Read in FASTA sequences, convert to RNA, store in dictionary |
250 and return dictionary. | 282 and return dictionary. |
251 | 283 |
252 >>> test_fasta = "test-data/test.fa" | 284 >>> test_fasta = "test-data/test.fa" |
253 >>> read_fasta_into_dic(test_fasta) | 285 >>> read_fasta_into_dic(test_fasta) |
254 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} | 286 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} |
255 >>> test_fasta = "test-data/test2.fa" | 287 >>> test_fasta = "test-data/test2.fa" |
256 >>> read_fasta_into_dic(test_fasta) | 288 >>> read_fasta_into_dic(test_fasta) |
257 {} | 289 {} |
258 >>> test_fasta = "test-data/test.ensembl.fa" | 290 >>> test_fasta = "test-data/test.ensembl.fa" |
259 >>> read_fasta_into_dic(test_fasta, read_dna=True) | 291 >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True) |
260 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} | 292 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} |
293 >>> test_fasta = "test-data/test4.fa" | |
294 >>> read_fasta_into_dic(test_fasta) | |
295 {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'} | |
261 | 296 |
262 """ | 297 """ |
263 if not seqs_dic: | 298 if not seqs_dic: |
264 seqs_dic = {} | 299 seqs_dic = {} |
265 seq_id = "" | 300 seq_id = "" |
266 seq = "" | 301 seq = "" |
267 | 302 |
268 # Go through FASTA file, extract sequences. | 303 # Go through FASTA file, extract sequences. |
269 if re.search(".+\.gz$", fasta_file): | 304 if re.search(r".+\.gz$", fasta_file): |
270 f = gzip.open(fasta_file, 'rt') | 305 f = gzip.open(fasta_file, 'rt') |
271 else: | 306 else: |
272 f = open(fasta_file, "r") | 307 f = open(fasta_file, "r") |
273 for line in f: | 308 for line in f: |
274 if re.search(">.+", line): | 309 if re.search(">.+", line): |
275 m = re.search(">(.+)", line) | 310 m = re.search(">(.+)", line) |
276 seq_id = m.group(1) | 311 seq_id = m.group(1) |
277 # If there is a ".", take only first part of header. | 312 # If there is a ".", take only first part of header. |
278 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." | 313 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." |
279 if re.search(".+\..+", seq_id): | 314 if short_ensembl: |
280 m = re.search("(.+?)\..+", seq_id) | 315 if re.search(r".+\..+", seq_id): |
281 seq_id = m.group(1) | 316 m = re.search(r"(.+?)\..+", seq_id) |
282 assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file) | 317 seq_id = m.group(1) |
318 assert seq_id not in seqs_dic, \ | |
319 "non-unique FASTA header \"%s\" in \"%s\"" \ | |
320 % (seq_id, fasta_file) | |
283 if ids_dic: | 321 if ids_dic: |
284 if seq_id in ids_dic: | 322 if seq_id in ids_dic: |
285 seqs_dic[seq_id] = "" | 323 seqs_dic[seq_id] = "" |
286 else: | 324 else: |
287 seqs_dic[seq_id] = "" | 325 seqs_dic[seq_id] = "" |
288 elif re.search("[ACGTUN]+", line, re.I): | 326 elif re.search("[ACGTUN]+", line, re.I): |
289 if seq_id in seqs_dic: | 327 if seq_id in seqs_dic: |
290 m = re.search("([ACGTUN]+)", line, re.I) | 328 m = re.search("([ACGTUN]+)", line, re.I) |
291 seq = m.group(1) | 329 seq = m.group(1) |
292 if reject_lc: | 330 if reject_lc: |
293 assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id) | 331 assert \ |
332 not re.search("[a-z]", seq), \ | |
333 "lc char detected in seq \"%i\" (reject_lc=True)" \ | |
334 % (seq_id) | |
294 if convert_to_uc: | 335 if convert_to_uc: |
295 seq = seq.upper() | 336 seq = seq.upper() |
296 # If sequences with N nucleotides should be skipped. | 337 # If sequences with N nucleotides should be skipped. |
297 if skip_n_seqs: | 338 if skip_n_seqs: |
298 if "n" in m.group(1) or "N" in m.group(1): | 339 if "n" in m.group(1) or "N" in m.group(1): |
299 print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id)) | 340 print("WARNING: \"%s\" contains N. Discarding " |
341 "sequence ... " % (seq_id)) | |
300 del seqs_dic[seq_id] | 342 del seqs_dic[seq_id] |
301 continue | 343 continue |
302 # Convert to RNA, concatenate sequence. | 344 # Convert to RNA, concatenate sequence. |
303 if read_dna: | 345 if read_dna: |
304 seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t") | 346 seqs_dic[seq_id] += \ |
347 m.group(1).replace("U", "T").replace("u", "t") | |
305 else: | 348 else: |
306 seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u") | 349 seqs_dic[seq_id] += \ |
350 m.group(1).replace("T", "U").replace("t", "u") | |
307 f.close() | 351 f.close() |
308 return seqs_dic | 352 return seqs_dic |
309 | 353 |
310 | 354 |
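Two usage sketches, based on the doctests above and the keyword arguments in the signature:

    # Plain read (T converted to U):
    seqs_dic = read_fasta_into_dic("test-data/test.fa")
    # {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}

    # Keep only selected IDs and uppercase the sequences:
    seqs_dic = read_fasta_into_dic("test-data/test.fa",
                                   ids_dic={"seq1": 1},
                                   convert_to_uc=True)
    # {'seq1': 'ACGUACGUACGU'}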
311 ################################################################################ | 355 ############################################################################### |
312 | 356 |
313 def random_order_dic_keys_into_list(in_dic): | 357 def random_order_dic_keys_into_list(in_dic): |
314 """ | 358 """ |
315 Read in dictionary keys and return them as a list in random order. | 359 Read in dictionary keys and return them as a list in random order. |
316 | 360 |
320 id_list.append(key) | 364 id_list.append(key) |
321 random.shuffle(id_list) | 365 random.shuffle(id_list) |
322 return id_list | 366 return id_list |
323 | 367 |
324 | 368 |
325 ################################################################################ | 369 ############################################################################### |
326 | 370 |
327 def graphprot_get_param_string(params_file): | 371 def graphprot_get_param_string(params_file): |
328 """ | 372 """ |
329 Get parameter string from GraphProt .params file. | 373 Get parameter string from GraphProt .params file. |
330 | 374 |
346 continue | 390 continue |
347 if par == "model_type": | 391 if par == "model_type": |
348 if setting == "sequence": | 392 if setting == "sequence": |
349 param_string += "-onlyseq " | 393 param_string += "-onlyseq " |
350 else: | 394 else: |
351 param_string += "-%s %s " %(par, setting) | 395 param_string += "-%s %s " % (par, setting) |
352 else: | 396 else: |
353 assert 0, "pattern matching failed for string \"%s\"" %(param) | 397 assert 0, "pattern matching failed for string \"%s\"" % (param) |
354 return param_string | 398 return param_string |
355 | 399 |
356 | 400 |
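As an illustration (assuming the elided parsing reads "parameter: value" lines, with the continue above presumably skipping the pos_train_* bookkeeping entries): a .params file setting epochs to 20, R to 1, D to 3 and model_type to sequence would produce the string "-epochs 20 -R 1 -D 3 -onlyseq ".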
357 ################################################################################ | 401 ############################################################################### |
358 | 402 |
359 def seqs_dic_count_uc_nts(seqs_dic): | 403 def seqs_dic_count_uc_nts(seqs_dic): |
360 """ | 404 """ |
361 Count number of uppercase nucleotides in sequences stored in sequence | 405 Count number of uppercase nucleotides in sequences stored in sequence |
362 dictionary. | 406 dictionary. |
363 | 407 |
364 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} | 408 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} |
365 >>> seqs_dic_count_uc_nts(seqs_dic) | 409 >>> seqs_dic_count_uc_nts(seqs_dic) |
366 6 | 410 6 |
367 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} | 411 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} |
368 >>> seqs_dic_count_uc_nts(seqs_dic) | 412 >>> seqs_dic_count_uc_nts(seqs_dic) |
374 for seq_id in seqs_dic: | 418 for seq_id in seqs_dic: |
375 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) | 419 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) |
376 return c_uc | 420 return c_uc |
377 | 421 |
378 | 422 |
379 ################################################################################ | 423 ############################################################################### |
380 | 424 |
381 def seqs_dic_count_lc_nts(seqs_dic): | 425 def seqs_dic_count_lc_nts(seqs_dic): |
382 """ | 426 """ |
383 Count number of lowercase nucleotides in sequences stored in sequence | 427 Count number of lowercase nucleotides in sequences stored in sequence |
384 dictionary. | 428 dictionary. |
385 | 429 |
386 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} | 430 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} |
387 >>> seqs_dic_count_lc_nts(seqs_dic) | 431 >>> seqs_dic_count_lc_nts(seqs_dic) |
388 10 | 432 10 |
389 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} | 433 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} |
390 >>> seqs_dic_count_lc_nts(seqs_dic) | 434 >>> seqs_dic_count_lc_nts(seqs_dic) |
396 for seq_id in seqs_dic: | 440 for seq_id in seqs_dic: |
397 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) | 441 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) |
398 return c_uc | 442 return c_uc |
399 | 443 |
400 | 444 |
401 ################################################################################ | 445 ############################################################################### |
402 | 446 |
403 def count_file_rows(in_file): | 447 def count_file_rows(in_file): |
404 """ | 448 """ |
405 Count number of file rows for given input file. | 449 Count number of file rows for given input file. |
406 | 450 |
407 >>> test_file = "test-data/test1.bed" | 451 >>> test_file = "test-data/test1.bed" |
408 >>> count_file_rows(test_file) | 452 >>> count_file_rows(test_file) |
409 7 | 453 7 |
410 >>> test_file = "test-data/empty_file" | 454 >>> test_file = "test-data/empty_file" |
411 >>> count_file_rows(test_file) | 455 >>> count_file_rows(test_file) |
416 output = subprocess.getoutput(check_cmd) | 460 output = subprocess.getoutput(check_cmd) |
417 row_count = int(output.strip()) | 461 row_count = int(output.strip()) |
418 return row_count | 462 return row_count |
419 | 463 |
420 | 464 |
421 ################################################################################ | 465 ############################################################################### |
422 | 466 |
423 def bed_check_six_col_format(bed_file): | 467 def bed_check_six_col_format(bed_file): |
424 """ | 468 """ |
425 Check whether given .bed file has 6 columns. | 469 Check whether given .bed file has 6 columns. |
426 | 470 |
442 break | 486 break |
443 f.closed | 487 f.closed |
444 return six_col_format | 488 return six_col_format |
445 | 489 |
446 | 490 |
447 ################################################################################ | 491 ############################################################################### |
448 | 492 |
449 def bed_check_unique_ids(bed_file): | 493 def bed_check_unique_ids(bed_file): |
450 """ | 494 """ |
451 Check whether .bed file (6 column format with IDs in column 4) | 495 Check whether .bed file (6 column format with IDs in column 4) |
452 has unique column 4 IDs. | 496 has unique column 4 IDs. |
453 | 497 |
454 >>> test_bed = "test-data/test1.bed" | 498 >>> test_bed = "test-data/test1.bed" |
455 >>> bed_check_unique_ids(test_bed) | 499 >>> bed_check_unique_ids(test_bed) |
456 True | 500 True |
457 >>> test_bed = "test-data/test2.bed" | 501 >>> test_bed = "test-data/test2.bed" |
458 >>> bed_check_unique_ids(test_bed) | 502 >>> bed_check_unique_ids(test_bed) |
466 return False | 510 return False |
467 else: | 511 else: |
468 return True | 512 return True |
469 | 513 |
470 | 514 |
471 ################################################################################ | 515 ############################################################################### |
472 | 516 |
473 def get_seq_lengths_from_seqs_dic(seqs_dic): | 517 def get_seq_lengths_from_seqs_dic(seqs_dic): |
474 """ | 518 """ |
475 Given a dictionary of sequences, return dictionary of sequence lengths. | 519 Given a dictionary of sequences, return dictionary of sequence lengths. |
476 Mapping is sequence ID -> sequence length. | 520 Mapping is sequence ID -> sequence length. |
481 seq_l = len(seqs_dic[seq_id]) | 525 seq_l = len(seqs_dic[seq_id]) |
482 seq_len_dic[seq_id] = seq_l | 526 seq_len_dic[seq_id] = seq_l |
483 return seq_len_dic | 527 return seq_len_dic |
484 | 528 |
485 | 529 |
486 ################################################################################ | 530 ############################################################################### |
487 | 531 |
488 def bed_get_region_lengths(bed_file): | 532 def bed_get_region_lengths(bed_file): |
489 """ | 533 """ |
490 Read in .bed file, store and return region lengths in dictionary. | 534 Read in .bed file, store and return region lengths in dictionary. |
491 key : region ID (.bed col4) | 535 key : region ID (.bed col4) |
497 | 541 |
498 """ | 542 """ |
499 id2len_dic = {} | 543 id2len_dic = {} |
500 with open(bed_file) as f: | 544 with open(bed_file) as f: |
501 for line in f: | 545 for line in f: |
502 row = line.strip() | |
503 cols = line.strip().split("\t") | 546 cols = line.strip().split("\t") |
504 site_s = int(cols[1]) | 547 site_s = int(cols[1]) |
505 site_e = int(cols[2]) | 548 site_e = int(cols[2]) |
506 site_id = cols[3] | 549 site_id = cols[3] |
507 site_l = site_e - site_s | 550 site_l = site_e - site_s |
508 assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file) | 551 assert site_id \ |
552 not in id2len_dic, \ | |
553 "column 4 IDs not unique in given .bed file \"%s\"" \ | |
554 % (bed_file) | |
509 id2len_dic[site_id] = site_l | 555 id2len_dic[site_id] = site_l |
510 f.closed | 556 f.closed |
511 assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed) | 557 assert id2len_dic, \ |
558 "No IDs read into dic (input file \"%s\" empty or malformatted?)" \ | |
559 % (bed_file) | |
512 return id2len_dic | 560 return id2len_dic |
513 | 561 |
514 | 562 |
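For example, a .bed row "chr1  100  200  site1  0  +" yields id2len_dic["site1"] = 200 - 100 = 100, since .bed starts are 0-based and ends are 1-based (half-open intervals).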
515 ################################################################################ | 563 ############################################################################### |
516 | 564 |
517 def graphprot_get_param_dic(params_file): | 565 def graphprot_get_param_dic(params_file): |
518 """ | 566 """ |
519 Read in GraphProt .params file and store in dictionary. | 567 Read in GraphProt .params file and store in dictionary. |
520 key = parameter | 568 key = parameter |
521 value = parameter value | 569 value = parameter value |
522 | 570 |
523 >>> params_file = "test-data/test.params" | 571 >>> params_file = "test-data/test.params" |
524 >>> graphprot_get_param_dic(params_file) | 572 >>> graphprot_get_param_dic(params_file) |
525 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'} | 573 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \ |
574 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \ | |
575 'pos_train_profile_median': '5.039610', \ | |
576 'pos_train_avg_profile_median_1': '4.236340', \ | |
577 'pos_train_avg_profile_median_2': '3.868431', \ | |
578 'pos_train_avg_profile_median_3': '3.331277', \ | |
579 'pos_train_avg_profile_median_4': '2.998667', \ | |
580 'pos_train_avg_profile_median_5': '2.829782', \ | |
581 'pos_train_avg_profile_median_6': '2.626623', \ | |
582 'pos_train_avg_profile_median_7': '2.447083', \ | |
583 'pos_train_avg_profile_median_8': '2.349919', \ | |
584 'pos_train_avg_profile_median_9': '2.239829', \ | |
585 'pos_train_avg_profile_median_10': '2.161676'} | |
526 | 586 |
527 """ | 587 """ |
528 param_dic = {} | 588 param_dic = {} |
529 with open(params_file) as f: | 589 with open(params_file) as f: |
530 for line in f: | 590 for line in f: |
537 param_dic[par] = setting | 597 param_dic[par] = setting |
538 f.close() | 598 f.close() |
539 return param_dic | 599 return param_dic |
540 | 600 |
541 | 601 |
542 ################################################################################ | 602 ############################################################################### |
543 | 603 |
544 def graphprot_filter_predictions_file(in_file, out_file, | 604 def graphprot_filter_predictions_file(in_file, out_file, |
545 sc_thr=0): | 605 sc_thr=0): |
546 """ | 606 """ |
547 Filter GraphProt .predictions file by given score threshold sc_thr. | 607 Filter GraphProt .predictions file by given score threshold sc_thr. |
552 row = line.strip() | 612 row = line.strip() |
553 cols = line.strip().split("\t") | 613 cols = line.strip().split("\t") |
554 score = float(cols[2]) | 614 score = float(cols[2]) |
555 if score < sc_thr: | 615 if score < sc_thr: |
556 continue | 616 continue |
557 OUTPRED.write("%s\n" %(row)) | 617 OUTPRED.write("%s\n" % (row)) |
558 f.close() | 618 f.close() |
559 OUTPRED.close() | 619 OUTPRED.close() |
560 | 620 |
561 | 621 |
562 ################################################################################ | 622 ############################################################################### |
563 | 623 |
564 def fasta_read_in_ids(fasta_file): | 624 def fasta_read_in_ids(fasta_file): |
565 """ | 625 """ |
566 Given a .fa file, read in header IDs in the order they appear in the file, | 626 Given a .fa file, read in header IDs in the order they appear in the file, |
567 and store in list. | 627 and store in list. |
568 | 628 |
569 >>> test_file = "test-data/test3.fa" | 629 >>> test_file = "test-data/test3.fa" |
570 >>> fasta_read_in_ids(test_file) | 630 >>> fasta_read_in_ids(test_file) |
571 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] | 631 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] |
580 ids_list.append(seq_id) | 640 ids_list.append(seq_id) |
581 f.close() | 641 f.close() |
582 return ids_list | 642 return ids_list |
583 | 643 |
584 | 644 |
585 ################################################################################ | 645 ############################################################################### |
586 | 646 |
587 def graphprot_profile_calculate_avg_profile(in_file, out_file, | 647 def graphprot_profile_calc_avg_profile(in_file, out_file, |
588 ap_extlr=5, | 648 ap_extlr=5, |
589 seq_ids_list=False, | 649 seq_ids_list=False, |
590 method=1): | 650 method=1): |
591 """ | 651 """ |
592 Given a GraphProt .profile file, calculate average profiles and output | 652 Given a GraphProt .profile file, calculate average profiles and output |
593 average profile file. | 653 average profile file. |
594 Average profile means that the position-wise scores will get smoothed | 654 Average profile means that the position-wise scores will get smoothed |
595 out by calculating for each position a new score, taking a sequence | 655 out by calculating for each position a new score, taking a sequence |
596 window -ap_extlr to +ap_extlr relative to the position | 656 window -ap_extlr to +ap_extlr relative to the position |
597 and calculating the mean score over this window. The mean score then | 657 and calculating the mean score over this window. The mean score then |
598 becomes the new average profile score at this position. | 658 becomes the new average profile score at this position. |
599 Two different implementations of the task are given: | 659 Two different implementations of the task are given: |
600 method=1 (new python implementation, slower + more memory but easy to read) | 660 method=1 (new python implementation, slower + more memory but easy to read) |
601 method=2 (old perl implementation, faster and less memory but more code) | 661 method=2 (old perl implementation, faster and less memory but more code) |
602 | 662 |
603 >>> in_file = "test-data/test2.profile" | 663 >>> in_file = "test-data/test2.profile" |
604 >>> out_file1 = "test-data/test2_1.avg_profile" | 664 >>> out_file1 = "test-data/test2_1.avg_profile" |
605 >>> out_file2 = "test-data/test2_2.avg_profile" | 665 >>> out_file2 = "test-data/test2_2.avg_profile" |
606 >>> out_file4 = "test-data/test2_3.avg_profile" | 666 >>> out_file4 = "test-data/test2_3.avg_profile" |
607 >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1) | 667 >>> graphprot_profile_calc_avg_profile(in_file, \ |
608 >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2) | 668 out_file1, ap_extlr=2, method=1) |
669 >>> graphprot_profile_calc_avg_profile(in_file, \ | |
670 out_file2, ap_extlr=2, method=2) | |
609 >>> diff_two_files_identical(out_file1, out_file2) | 671 >>> diff_two_files_identical(out_file1, out_file2) |
610 True | 672 True |
611 >>> test_list = ["s1", "s2", "s3", "s4"] | 673 >>> test_list = ["s1", "s2", "s3", "s4"] |
612 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" | 674 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" |
613 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" | 675 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" |
614 >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list) | 676 >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \ |
677 ap_extlr=2, method=1, seq_ids_list=test_list) | |
615 >>> diff_two_files_identical(out_file3_exp, out_file3) | 678 >>> diff_two_files_identical(out_file3_exp, out_file3) |
616 True | 679 True |
617 | 680 |
618 """ | 681 """ |
619 if method == 1: | 682 if method == 1: |
622 site_starts_dic = {} | 685 site_starts_dic = {} |
623 with open(in_file) as f: | 686 with open(in_file) as f: |
624 for line in f: | 687 for line in f: |
625 cols = line.strip().split("\t") | 688 cols = line.strip().split("\t") |
626 site_id = int(cols[0]) | 689 site_id = int(cols[0]) |
627 pos = int(cols[1]) # 0-based. | 690 pos = int(cols[1]) # 0-based. |
628 score = float(cols[2]) | 691 score = float(cols[2]) |
629 # Store first position of site. | 692 # Store first position of site. |
630 if site_id not in site_starts_dic: | 693 if site_id not in site_starts_dic: |
631 site_starts_dic[site_id] = pos | 694 site_starts_dic[site_id] = pos |
632 if site_id in lists_dic: | 695 if site_id in lists_dic: |
633 lists_dic[site_id].append(score) | 696 lists_dic[site_id].append(score) |
634 else: | 697 else: |
635 lists_dic[site_id] = [] | 698 lists_dic[site_id] = [] |
636 lists_dic[site_id].append(score) | 699 lists_dic[site_id].append(score) |
637 f.close() | 700 f.close() |
638 # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs). | 701 # Check number of IDs (# FASTA IDs has to be same as # site IDs). |
639 if seq_ids_list: | 702 if seq_ids_list: |
640 c_seq_ids = len(seq_ids_list) | 703 c_seq_ids = len(seq_ids_list) |
641 c_site_ids = len(site_starts_dic) | 704 c_site_ids = len(site_starts_dic) |
642 assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids) | 705 assert c_seq_ids == c_site_ids, \ |
706 "# sequence IDs != # site IDs (%i != %i)" \ | |
707 % (c_seq_ids, c_site_ids) | |
643 OUTPROF = open(out_file, "w") | 708 OUTPROF = open(out_file, "w") |
644 # For each site, calculate average profile scores list. | 709 # For each site, calculate average profile scores list. |
645 max_list = [] | |
646 for site_id in lists_dic: | 710 for site_id in lists_dic: |
647 # Convert profile score list to average profile scores list. | 711 # Convert profile score list to average profile scores list. |
648 aps_list = list_moving_window_average_values(lists_dic[site_id], | 712 aps_list = list_moving_window_average_values(lists_dic[site_id], |
649 win_extlr=ap_extlr) | 713 win_extlr=ap_extlr) |
650 start_pos = site_starts_dic[site_id] | 714 start_pos = site_starts_dic[site_id] |
651 # Get original FASTA sequence ID. | 715 # Get original FASTA sequence ID. |
652 if seq_ids_list: | 716 if seq_ids_list: |
653 site_id = seq_ids_list[site_id] | 717 site_id = seq_ids_list[site_id] |
654 for i, sc in enumerate(aps_list): | 718 for i, sc in enumerate(aps_list): |
655 pos = i + start_pos + 1 # make 1-based. | 719 pos = i + start_pos + 1 # make 1-based. |
656 OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc)) | 720 OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc)) |
657 OUTPROF.close() | 721 OUTPROF.close() |
658 elif method == 2: | 722 elif method == 2: |
659 OUTPROF = open(out_file, "w") | 723 OUTPROF = open(out_file, "w") |
660 # Old site ID. | 724 # Old site ID. |
661 old_id = "" | 725 old_id = "" |
666 site_starts_dic = {} | 730 site_starts_dic = {} |
667 with open(in_file) as f: | 731 with open(in_file) as f: |
668 for line in f: | 732 for line in f: |
669 cols = line.strip().split("\t") | 733 cols = line.strip().split("\t") |
670 cur_id = int(cols[0]) | 734 cur_id = int(cols[0]) |
671 pos = int(cols[1]) # 0-based. | 735 pos = int(cols[1]) # 0-based. |
672 score = float(cols[2]) | 736 score = float(cols[2]) |
673 # Store first position of site. | 737 # Store first position of site. |
674 if cur_id not in site_starts_dic: | 738 if cur_id not in site_starts_dic: |
675 site_starts_dic[cur_id] = pos | 739 site_starts_dic[cur_id] = pos |
676 # Case: new site (new column 1 ID). | 740 # Case: new site (new column 1 ID). |
677 if cur_id != old_id: | 741 if cur_id != old_id: |
678 # Process old id scores. | 742 # Process old id scores. |
679 if scores_list: | 743 if scores_list: |
680 aps_list = list_moving_window_average_values(scores_list, | 744 aps_list = \ |
681 win_extlr=ap_extlr) | 745 list_moving_window_average_values( |
746 scores_list, | |
747 win_extlr=ap_extlr) | |
682 start_pos = site_starts_dic[old_id] | 748 start_pos = site_starts_dic[old_id] |
683 seq_id = old_id | 749 seq_id = old_id |
684 # Get original FASTA sequence ID. | 750 # Get original FASTA sequence ID. |
685 if seq_ids_list: | 751 if seq_ids_list: |
686 seq_id = seq_ids_list[old_id] | 752 seq_id = seq_ids_list[old_id] |
687 for i, sc in enumerate(aps_list): | 753 for i, sc in enumerate(aps_list): |
688 pos = i + start_pos + 1 # make 1-based. | 754 pos = i + start_pos + 1 # make 1-based. |
689 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 755 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
690 # Reset list. | 756 # Reset list. |
691 scores_list = [] | 757 scores_list = [] |
692 old_id = cur_id | 758 old_id = cur_id |
693 scores_list.append(score) | 759 scores_list.append(score) |
694 else: | 760 else: |
703 seq_id = old_id | 769 seq_id = old_id |
704 # Get original FASTA sequence ID. | 770 # Get original FASTA sequence ID. |
705 if seq_ids_list: | 771 if seq_ids_list: |
706 seq_id = seq_ids_list[old_id] | 772 seq_id = seq_ids_list[old_id] |
707 for i, sc in enumerate(aps_list): | 773 for i, sc in enumerate(aps_list): |
708 pos = i + start_pos + 1 # make 1-based. | 774 pos = i + start_pos + 1 # make 1-based. |
709 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 775 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
710 OUTPROF.close() | 776 OUTPROF.close() |
711 | 777 |
712 | 778 |
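A worked example of the output format: for a .profile with one site and the rows "1 0 0.5", "1 1 1.5", "1 2 2.5" (0-based positions), ap_extlr=1 yields the smoothed scores [1.0, 1.5, 2.0], written out tab-separated as "1 1 1.000000", "1 2 1.500000", "1 3 2.000000" with positions now 1-based, as in the write statements above.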
713 ################################################################################ | 779 ############################################################################### |
714 | 780 |
715 def graphprot_profile_extract_peak_regions(in_file, out_file, | 781 def graphprot_profile_extract_peak_regions(in_file, out_file, |
716 max_merge_dist=0, | 782 max_merge_dist=0, |
717 sc_thr=0): | 783 sc_thr=0): |
718 """ | 784 """ |
719 Extract peak regions from GraphProt .profile file. | 785 Extract peak regions from GraphProt .profile file. |
720 Store the peak regions (defined as regions with scores >= sc_thr) | 786 Store the peak regions (defined as regions with scores >= sc_thr) |
721 to out_file in 6-column .bed format. | 787 to out_file in 6-column .bed format. |
722 | 788 |
723 TODO: | 789 TODO: |
724 Add option for genomic coordinates input (+ - polarity support). | 790 Add option for genomic coordinates input (+ - polarity support). |
725 Output genomic regions instead of sequence regions. | 791 Output genomic regions instead of sequence regions. |
733 >>> diff_two_files_identical(out_file, exp_file) | 799 >>> diff_two_files_identical(out_file, exp_file) |
734 True | 800 True |
735 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) | 801 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) |
736 >>> diff_two_files_identical(out_file, empty_file) | 802 >>> diff_two_files_identical(out_file, empty_file) |
737 True | 803 True |
738 >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2) | 804 >>> graphprot_profile_extract_peak_regions(in_file, out_file, \ |
805 max_merge_dist=2) | |
739 >>> diff_two_files_identical(out_file, exp2_file) | 806 >>> diff_two_files_identical(out_file, exp2_file) |
740 True | 807 True |
741 | 808 |
742 """ | 809 """ |
743 | 810 |
751 site_starts_dic = {} | 818 site_starts_dic = {} |
752 with open(in_file) as f: | 819 with open(in_file) as f: |
753 for line in f: | 820 for line in f: |
754 cols = line.strip().split("\t") | 821 cols = line.strip().split("\t") |
755 cur_id = cols[0] | 822 cur_id = cols[0] |
756 pos = int(cols[1]) # 0-based. | 823 pos = int(cols[1]) # 0-based. |
757 score = float(cols[2]) | 824 score = float(cols[2]) |
758 # Store first position of site. | 825 # Store first position of site. |
759 if cur_id not in site_starts_dic: | 826 if cur_id not in site_starts_dic: |
760 # If first position != zero, we assume positions are 1-based. | 827 # If first position != zero, we assume positions are 1-based. |
761 if pos != 0: | 828 if pos != 0: |
766 # Case: new site (new column 1 ID). | 833 # Case: new site (new column 1 ID). |
767 if cur_id != old_id: | 834 if cur_id != old_id: |
768 # Process old id scores. | 835 # Process old id scores. |
769 if scores_list: | 836 if scores_list: |
770 # Extract peaks from region. | 837 # Extract peaks from region. |
771 peak_list = list_extract_peaks(scores_list, | 838 peak_list = \ |
772 max_merge_dist=max_merge_dist, | 839 list_extract_peaks(scores_list, |
773 coords="bed", | 840 max_merge_dist=max_merge_dist, |
774 sc_thr=sc_thr) | 841 coords="bed", |
842 sc_thr=sc_thr) | |
775 start_pos = site_starts_dic[old_id] | 843 start_pos = site_starts_dic[old_id] |
776 # Print out peaks in .bed format. | 844 # Print out peaks in .bed format. |
777 for l in peak_list: | 845 for ln in peak_list: |
778 peak_s = start_pos + l[0] | 846 peak_s = start_pos + ln[0] |
779 peak_e = start_pos + l[1] | 847 peak_e = start_pos + ln[1] |
780 site_id = "%s,%i" %(old_id, l[2]) | 848 site_id = "%s,%i" % (old_id, ln[2]) |
781 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 849 OUTPEAKS.write("%s\t%i\t%i" |
850 "\t%s\t%f\t+\n" | |
851 % (old_id, peak_s, | |
852 peak_e, site_id, ln[3])) | |
782 # Reset list. | 853 # Reset list. |
783 scores_list = [] | 854 scores_list = [] |
784 old_id = cur_id | 855 old_id = cur_id |
785 scores_list.append(score) | 856 scores_list.append(score) |
786 else: | 857 else: |
788 scores_list.append(score) | 859 scores_list.append(score) |
789 f.close() | 860 f.close() |
790 # Process last block. | 861 # Process last block. |
791 if scores_list: | 862 if scores_list: |
792 # Extract peaks from region. | 863 # Extract peaks from region. |
793 peak_list = list_extract_peaks(scores_list, | 864 peak_list = list_extract_peaks(scores_list, |
794 max_merge_dist=max_merge_dist, | 865 max_merge_dist=max_merge_dist, |
795 coords="bed", | 866 coords="bed", |
796 sc_thr=sc_thr) | 867 sc_thr=sc_thr) |
797 start_pos = site_starts_dic[old_id] | 868 start_pos = site_starts_dic[old_id] |
798 # Print out peaks in .bed format. | 869 # Print out peaks in .bed format. |
799 for l in peak_list: | 870 for ln in peak_list: |
800 peak_s = start_pos + l[0] | 871 peak_s = start_pos + ln[0] |
801 peak_e = start_pos + l[1] | 872 peak_e = start_pos + ln[1] |
802 site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based. | 873 site_id = "%s,%i" % (old_id, ln[2]) # best score also 1-based. |
803 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 874 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" |
875 % (old_id, peak_s, peak_e, site_id, ln[3])) | |
804 OUTPEAKS.close() | 876 OUTPEAKS.close() |
805 | 877 |
806 | 878 |
807 ################################################################################ | 879 ############################################################################### |
808 | 880 |
809 def list_extract_peaks(in_list, | 881 def list_extract_peaks(in_list, |
810 max_merge_dist=0, | 882 max_merge_dist=0, |
811 coords="list", | 883 coords="list", |
812 sc_thr=0): | 884 sc_thr=0): |
813 """ | 885 """ |
814 Extract peak regions from list. | 886 Extract peak regions from list. |
815 A peak region is defined as a region with scores >= the score threshold. | 887 A peak region is defined as a region with scores >= the score threshold. |
816 | 888 |
817 coords=bed : peak start 0-based, peak end 1-based. | 889 coords=bed : peak start 0-based, peak end 1-based. |
818 coords=list : peak start 0-based, peak end 0-based. | 890 coords=list : peak start 0-based, peak end 0-based. |
819 | 891 |
820 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] | 892 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] |
821 >>> list_extract_peaks(test_list) | 893 >>> list_extract_peaks(test_list) |
822 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] | 894 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] |
823 >>> list_extract_peaks(test_list, sc_thr=2) | 895 >>> list_extract_peaks(test_list, sc_thr=2) |
824 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] | 896 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] |
860 pr_top_pos = i | 932 pr_top_pos = i |
861 else: | 933 else: |
862 # Before was peak region? | 934 # Before was peak region? |
863 if inside: | 935 if inside: |
864 # Store peak region. | 936 # Store peak region. |
865 #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc) | |
866 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] | 937 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] |
867 peak_list.append(peak_infos) | 938 peak_list.append(peak_infos) |
868 inside = False | 939 inside = False |
869 pr_top_pos = 0 | 940 pr_top_pos = 0 |
870 pr_top_sc = -100000 | 941 pr_top_sc = -100000 |
896 new_top_pos = peak_list[i][2] | 967 new_top_pos = peak_list[i][2] |
897 new_top_sc = peak_list[i][3] | 968 new_top_sc = peak_list[i][3] |
898 if peak_list[i][3] < peak_list[j][3]: | 969 if peak_list[i][3] < peak_list[j][3]: |
899 new_top_pos = peak_list[j][2] | 970 new_top_pos = peak_list[j][2] |
900 new_top_sc = peak_list[j][3] | 971 new_top_sc = peak_list[j][3] |
901 new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc] | 972 new_peak = [peak_list[i][0], peak_list[j][1], |
973 new_top_pos, new_top_sc] | |
902 # If two peaks were merged. | 974 # If two peaks were merged. |
903 if new_peak: | 975 if new_peak: |
904 merged_peak_list.append(new_peak) | 976 merged_peak_list.append(new_peak) |
905 added_peaks_dic[i] = 1 | 977 added_peaks_dic[i] = 1 |
906 added_peaks_dic[j] = 1 | 978 added_peaks_dic[j] = 1 |
913 peaks_merged = False | 985 peaks_merged = False |
914 # If peak coordinates should be in .bed format, make peak ends 1-based. | 986 # If peak coordinates should be in .bed format, make peak ends 1-based. |
915 if coords == "bed": | 987 if coords == "bed": |
916 for i in range(len(peak_list)): | 988 for i in range(len(peak_list)): |
917 peak_list[i][1] += 1 | 989 peak_list[i][1] += 1 |
918 peak_list[i][2] += 1 # 1-base best score position too. | 990 peak_list[i][2] += 1 # 1-base best score position too. |
919 return peak_list | 991 return peak_list |
920 | 992 |
921 | 993 |
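To unpack the doctest above: each returned peak is [start, end, top_pos, top_sc]. In test_list, the scores at indices 1 to 4 are >= 0 and the maximum 4.5 sits at index 3, hence [1, 4, 3, 4.5]; with coords="bed", the end and the top position are shifted by one, so the same peak becomes [1, 5, 4, 4.5].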
922 ################################################################################ | 994 ############################################################################### |
923 | 995 |
924 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False): | 996 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, |
925 """ | 997 print_rows=False): |
926 Given a .bed file of sequence peak regions (possible coordinates from | 998 """ |
999 Given a .bed file of sequence peak regions (possible coordinates from | |
927 0 to the sequence length), convert peak coordinates to genomic coordinates. | 1000 0 to the sequence length), convert peak coordinates to genomic coordinates. |
928 Do this by taking genomic regions of sequences as input. | 1001 Do this by taking genomic regions of sequences as input. |
929 | 1002 |
930 >>> test_in = "test-data/test.peaks.bed" | 1003 >>> test_in = "test-data/test.peaks.bed" |
931 >>> test_exp = "test-data/test_exp.peaks.bed" | 1004 >>> test_exp = "test-data/test_exp.peaks.bed" |
942 with open(genomic_sites_bed) as f: | 1015 with open(genomic_sites_bed) as f: |
943 for line in f: | 1016 for line in f: |
944 row = line.strip() | 1017 row = line.strip() |
945 cols = line.strip().split("\t") | 1018 cols = line.strip().split("\t") |
946 site_id = cols[3] | 1019 site_id = cols[3] |
947 assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed) | 1020 assert site_id \ |
1021 not in id2row_dic, \ | |
1022 "column 4 IDs not unique in given .bed file \"%s\"" \ | |
1023 % (genomic_sites_bed) | |
948 id2row_dic[site_id] = row | 1024 id2row_dic[site_id] = row |
949 f.close() | 1025 f.close() |
950 | 1026 |
951 # Read in peaks file and convert coordinates. | 1027 # Read in peaks file and convert coordinates. |
952 OUTPEAKS = open(genomic_peak_file, "w") | 1028 OUTPEAKS = open(genomic_peak_file, "w") |
956 site_id = cols[0] | 1032 site_id = cols[0] |
957 site_s = int(cols[1]) | 1033 site_s = int(cols[1]) |
958 site_e = int(cols[2]) | 1034 site_e = int(cols[2]) |
959 site_id2 = cols[3] | 1035 site_id2 = cols[3] |
960 site_sc = float(cols[4]) | 1036 site_sc = float(cols[4]) |
961 assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2) | 1037 assert re.search(".+,.+", site_id2), \ |
962 m = re.search(".+,(\d+)", site_id2) | 1038 "regular expression failed for ID \"%s\"" % (site_id2) |
963 sc_pos = int(m.group(1)) # 1-based. | 1039 m = re.search(r".+,(\d+)", site_id2) |
964 assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id) | 1040 sc_pos = int(m.group(1)) # 1-based. |
1041 assert site_id in id2row_dic, \ | |
1042 "site ID \"%s\" not found in genomic sites dictionary" \ | |
1043 % (site_id) | |
965 row = id2row_dic[site_id] | 1044 row = id2row_dic[site_id] |
966 rowl = row.split("\t") | 1045 rowl = row.split("\t") |
967 gen_chr = rowl[0] | 1046 gen_chr = rowl[0] |
968 gen_s = int(rowl[1]) | 1047 gen_s = int(rowl[1]) |
969 gen_e = int(rowl[2]) | 1048 gen_e = int(rowl[2]) |
972 new_e = site_e + gen_s | 1051 new_e = site_e + gen_s |
973 new_sc_pos = sc_pos + gen_s | 1052 new_sc_pos = sc_pos + gen_s |
974 if gen_pol == "-": | 1053 if gen_pol == "-": |
975 new_s = gen_e - site_e | 1054 new_s = gen_e - site_e |
976 new_e = gen_e - site_s | 1055 new_e = gen_e - site_s |
977 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. | 1056 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. |
978 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol) | 1057 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \ |
979 OUTPEAKS.write("%s\n" %(new_row)) | 1058 % (gen_chr, new_s, new_e, |
1059 site_id, new_sc_pos, site_sc, gen_pol) | |
1060 OUTPEAKS.write("%s\n" % (new_row)) | |
980 if print_rows: | 1061 if print_rows: |
981 print(new_row) | 1062 print(new_row) |
982 OUTPEAKS.close() | 1063 OUTPEAKS.close() |
983 | 1064 |
984 | 1065 |
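A worked minus-strand example of the conversion above: for a site at chr1:100-200(-) and a sequence peak with site_s = 10, site_e = 20 and sc_pos = 15, the genomic peak is new_s = 200 - 20 = 180, new_e = 200 - 10 = 190 and new_sc_pos = 200 - 15 + 1 = 186; the interval stays in .bed convention (0-based start, 1-based end) while the best-score position remains 1-based.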
985 ################################################################################ | 1066 ############################################################################### |
986 | 1067 |
987 def diff_two_files_identical(file1, file2): | 1068 def diff_two_files_identical(file1, file2): |
988 """ | 1069 """ |
989 Check whether two files are identical. Return true if diff reports no | 1070 Check whether two files are identical. Return true if diff reports no |
990 differences. | 1071 differences. |
991 | 1072 |
992 >>> file1 = "test-data/file1" | 1073 >>> file1 = "test-data/file1" |
993 >>> file2 = "test-data/file2" | 1074 >>> file2 = "test-data/file2" |
994 >>> diff_two_files_identical(file1, file2) | 1075 >>> diff_two_files_identical(file1, file2) |
995 True | 1076 True |
996 >>> file1 = "test-data/test1.bed" | 1077 >>> file1 = "test-data/test1.bed" |
1004 if output: | 1085 if output: |
1005 same = False | 1086 same = False |
1006 return same | 1087 return same |
1007 | 1088 |
1008 | 1089 |
1009 ################################################################################ | 1090 ############################################################################### |
1010 | |
1011 |