comparison gplib.py @ 3:9a83a84a25a7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/graphprot commit efcac98677c3ea9039c1c61eaa9e58f78287ccb3"
author: bgruening
date: Wed, 27 Jan 2021 19:27:12 +0000
parents: adcc4c457c3c
children: 58ebf089377e
2:4cebb3439e1a (old) | 3:9a83a84a25a7 (new)
1 | 1 |
2 import gzip | |
3 import random | |
4 import re | |
5 import statistics | |
6 import subprocess | |
2 from distutils.spawn import find_executable | 7 from distutils.spawn import find_executable |
3 import subprocess | 8 |
4 import statistics | |
5 import random | |
6 import gzip | |
7 import uuid | |
8 import sys | |
9 import re | |
10 import os | |
11 | 9 |
12 """ | 10 """ |
13 | 11 |
14 Run doctests: | 12 Run doctests: |
15 | 13 |
17 | 15 |
18 | 16 |
19 """ | 17 """ |
20 | 18 |
21 | 19 |
22 ################################################################################ | 20 ############################################################################### |
23 | 21 |
24 def graphprot_predictions_get_median(predictions_file): | 22 def graphprot_predictions_get_median(predictions_file): |
25 """ | 23 """ |
26 Given a GraphProt .predictions file, read in site scores and return | 24 Given a GraphProt .predictions file, read in site scores and return |
27 the median value. | 25 the median value. |
28 | 26 |
29 >>> test_file = "test-data/test.predictions" | 27 >>> test_file = "test-data/test.predictions" |
30 >>> graphprot_predictions_get_median(test_file) | 28 >>> graphprot_predictions_get_median(test_file) |
31 0.571673 | 29 0.571673 |
41 f.close() | 39 f.close() |
42 # Return the median. | 40 # Return the median. |
43 return statistics.median(sc_list) | 41 return statistics.median(sc_list) |
44 | 42 |
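A minimal standalone sketch of what the elided reading loop presumably does (the score is assumed to sit in column 3 of the tab-separated .predictions file, the same column graphprot_filter_predictions_file below treats as the site score); the expected median comes from the doctest above:

    import statistics

    sc_list = []
    with open("test-data/test.predictions") as f:
        for line in f:
            cols = line.strip().split("\t")
            sc_list.append(float(cols[2]))
    print(statistics.median(sc_list))  # 0.571673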
45 | 43 |
46 ################################################################################ | 44 ############################################################################### |
47 | 45 |
48 def graphprot_profile_get_top_scores_median(profile_file, | 46 def graphprot_profile_get_tsm(profile_file, |
49 profile_type="profile", | 47 profile_type="profile", |
50 avg_profile_extlr=5): | 48 avg_profile_extlr=5): |
51 | 49 |
52 """ | 50 """ |
53 Given a GraphProt .profile file, extract for each site (identified by | 51 Given a GraphProt .profile file, extract for each site (identified by |
54 column 1 ID) the top (= highest) score. Then return the median of these | 52 column 1 ID) the top (= highest) score. Then return the median of these |
55 top scores. | 53 top scores. |
56 | 54 |
57 profile_type can be either "profile" or "avg_profile". | 55 profile_type can be either "profile" or "avg_profile". |
58 "avg_profile means that the position-wise scores will first get smoothed | 56 "avg_profile means that the position-wise scores will first get smoothed |
59 out by calculating for each position a new score through taking a | 57 out by calculating for each position a new score through taking a |
60 sequence window -avg_profile_extlr to +avg_profile_extlr of the position | 58 sequence window -avg_profile_extlr to +avg_profile_extlr of the position |
61 and calculate the mean score over this window and assign it to the position. | 59 and calculate the mean score over this window and assign it to the |
62 After that, the maximum score of each site is chosen, and the median over | 60 position. After that, the maximum score of each site is chosen, and the |
63 all maximum scores is returned. | 61 median over all maximum scores is returned. |
64 "profile" leaves the position-wise scores as they are, directly extracting | 62 "profile" leaves the position-wise scores as they are, directly extracting |
65 the maximum for each site and then reporting the median. | 63 the maximum for each site and then reporting the median. |
66 | 64 |
67 >>> test_file = "test-data/test.profile" | 65 >>> test_file = "test-data/test.profile" |
68 >>> graphprot_profile_get_top_scores_median(test_file) | 66 >>> graphprot_profile_get_tsm(test_file) |
69 3.2 | 67 3.2 |
70 | 68 |
71 """ | 69 """ |
72 # Dictionary of lists, with list of scores (value) for each site (key). | 70 # Dictionary of lists, with list of scores (value) for each site (key). |
73 lists_dic = {} | 71 lists_dic = {} |
88 if profile_type == "profile": | 86 if profile_type == "profile": |
89 max_sc = max(lists_dic[seq_id]) | 87 max_sc = max(lists_dic[seq_id]) |
90 max_list.append(max_sc) | 88 max_list.append(max_sc) |
91 elif profile_type == "avg_profile": | 89 elif profile_type == "avg_profile": |
92 # Convert profile score list to average profile scores list. | 90 # Convert profile score list to average profile scores list. |
93 aps_list = list_moving_window_average_values(lists_dic[seq_id], | 91 aps_list = \ |
94 win_extlr=avg_profile_extlr) | 92 list_moving_window_average_values(lists_dic[seq_id], |
93 win_extlr=avg_profile_extlr) | |
95 max_sc = max(aps_list) | 94 max_sc = max(aps_list) |
96 max_list.append(max_sc) | 95 max_list.append(max_sc) |
97 else: | 96 else: |
98 assert 0, "invalid profile_type argument given: \"%s\"" %(profile_type) | 97 assert 0, "invalid profile_type argument given: \"%s\"" \ |
98 % (profile_type) | |
99 # Return the median. | 99 # Return the median. |
100 return statistics.median(max_list) | 100 return statistics.median(max_list) |
101 | 101 |
102 | 102 |
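A short usage sketch (assuming gplib is importable as a module); the raw-profile value comes from the doctest above, while the avg_profile value depends on the smoothing window:

    from gplib import graphprot_profile_get_tsm

    # Median of per-site maximum scores on the raw profile:
    print(graphprot_profile_get_tsm("test-data/test.profile"))  # 3.2

    # Same, but smoothing each site's scores first (window +/- 5 nt):
    tsm_avg = graphprot_profile_get_tsm("test-data/test.profile",
                                        profile_type="avg_profile",
                                        avg_profile_extlr=5)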
103 ################################################################################ | 103 ############################################################################### |
104 | 104 |
105 def list_moving_window_average_values(in_list, | 105 def list_moving_window_average_values(in_list, |
106 win_extlr=5, | 106 win_extlr=5, |
107 method=1): | 107 method=1): |
108 """ | 108 """ |
109 Take a list of numeric values, and calculate for each position a new value, | 109 Take a list of numeric values, and calculate for each position a new value, |
110 by taking the mean value of the window of positions from -win_extlr to | 110 by taking the mean value of the window of positions from -win_extlr to |
111 +win_extlr around it. If full extension is not possible (at list ends), it just | 111 +win_extlr around it. If full extension is not possible (at list ends), it just |
112 takes what it gets. | 112 takes what it gets. |
113 Two implementations of the task are given, chosen by method=1 or method=2. | 113 Two implementations of the task are given, chosen by method=1 or method=2. |
114 | 114 |
115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] | 115 >>> test_list = [2, 3, 5, 8, 4, 3, 7, 1] |
116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) | 116 >>> list_moving_window_average_values(test_list, win_extlr=2, method=1) |
140 e = i + win_extlr + 1 | 140 e = i + win_extlr + 1 |
141 if s < 0: | 141 if s < 0: |
142 s = 0 | 142 s = 0 |
143 if e > l_list: | 143 if e > l_list: |
144 e = l_list | 144 e = l_list |
145 l = e-s | 145 ln = e - s |
146 sc_sum = 0 | 146 sc_sum = 0 |
147 for j in range(l): | 147 for j in range(ln): |
148 sc_sum += in_list[s+j] | 148 sc_sum += in_list[s + j] |
149 new_list[i] = sc_sum / l | 149 new_list[i] = sc_sum / ln |
150 else: | 150 else: |
151 assert 0, "invalid method ID given (%i)" %(method) | 151 assert 0, "invalid method ID given (%i)" % (method) |
152 return new_list | 152 return new_list |
153 | 153 |
154 | 154 |
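A worked example of the window truncation at the list ends, using the doctest input above: with test_list = [2, 3, 5, 8, 4, 3, 7, 1] and win_extlr=2, position 0 can only extend to the right, so its window is [2, 3, 5] and its new value is 10/3 ≈ 3.33; position 3 gets the full window [3, 5, 8, 4, 3], giving 23/5 = 4.6; position 7 gets [3, 7, 1], giving 11/3 ≈ 3.67.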
155 ################################################################################ | 155 ############################################################################### |
156 | 156 |
157 def echo_add_to_file(echo_string, out_file): | 157 def echo_add_to_file(echo_string, out_file): |
158 """ | 158 """ |
159 Add a string to a file using the echo command. | 159 Add a string to a file using the echo command. |
160 | 160 |
162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) | 162 check_cmd = 'echo "%s" >> %s' % (echo_string, out_file) |
163 output = subprocess.getoutput(check_cmd) | 163 output = subprocess.getoutput(check_cmd) |
164 error = False | 164 error = False |
165 if output: | 165 if output: |
166 error = True | 166 error = True |
167 assert error == False, "echo is complaining:\n%s\n%s" %(check_cmd, output) | 167 assert not error, "echo is complaining:\n%s\n%s" % (check_cmd, output) |
168 | 168 |
169 | 169 |
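Note that the echo route spawns a shell and would mangle strings containing double quotes; a pure-Python sketch with the same append-plus-newline effect (not part of the module):

    def append_string_to_file(add_string, out_file):
        # Same effect as: echo "string" >> out_file
        with open(out_file, "a") as f:
            f.write(add_string + "\n")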
170 ################################################################################ | 170 ############################################################################### |
171 | 171 |
172 def is_tool(name): | 172 def is_tool(name): |
173 """Check whether tool "name" is in PATH.""" | 173 """Check whether tool "name" is in PATH.""" |
174 return find_executable(name) is not None | 174 return find_executable(name) is not None |
175 | 175 |
176 | 176 |
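distutils is deprecated (PEP 632) and removed in Python 3.12; shutil.which from the standard library is a drop-in replacement for this check:

    import shutil

    def is_tool(name):
        """Check whether tool "name" is in PATH."""
        return shutil.which(name) is not None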
177 ################################################################################ | 177 ############################################################################### |
178 | 178 |
179 def count_fasta_headers(fasta_file): | 179 def count_fasta_headers(fasta_file): |
180 """ | 180 """ |
181 Count number of FASTA headers in fasta_file using grep. | 181 Count number of FASTA headers in fasta_file using grep. |
182 | 182 |
192 output = subprocess.getoutput(check_cmd) | 192 output = subprocess.getoutput(check_cmd) |
193 row_count = int(output.strip()) | 193 row_count = int(output.strip()) |
194 return row_count | 194 return row_count |
195 | 195 |
196 | 196 |
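The elided body shells out (presumably to grep); a subprocess-free sketch that counts the same thing:

    def count_fasta_headers_py(fasta_file):
        # Count lines starting with ">" (one per FASTA record).
        with open(fasta_file) as f:
            return sum(1 for line in f if line.startswith(">"))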
197 ################################################################################ | 197 ############################################################################### |
198 | 198 |
199 def make_file_copy(in_file, out_file): | 199 def make_file_copy(in_file, out_file): |
200 """ | 200 """ |
201 Make a file copy by copying in_file to out_file. | 201 Make a file copy by copying in_file to out_file. |
202 | 202 |
203 """ | 203 """ |
204 check_cmd = "cat " + in_file + " > " + out_file | 204 check_cmd = "cat " + in_file + " > " + out_file |
205 assert in_file != out_file, "cat does not like to cat file into same file (%s)" %(check_cmd) | 205 assert in_file != out_file, \ |
206 "cat does not like to cat file into same file (%s)" % (check_cmd) | |
206 output = subprocess.getoutput(check_cmd) | 207 output = subprocess.getoutput(check_cmd) |
207 error = False | 208 error = False |
208 if output: | 209 if output: |
209 error = True | 210 error = True |
210 assert error == False, "cat did not like your input (in_file: %s, out_file: %s):\n%s" %(in_file, out_file, output) | 211 assert not error, \ |
211 | 212 "cat did not like your input (in_file: %s, out_file: %s):\n%s" \ |
212 | 213 % (in_file, out_file, output) |
213 ################################################################################ | 214 |
214 | 215 |
215 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, | 216 ############################################################################### |
217 | |
218 def split_fasta_into_test_train_files(in_fasta, test_out_fa, train_out_fa, | |
216 test_size=500): | 219 test_size=500): |
217 """ | 220 """ |
218 Split in_fasta .fa file into two files (e.g. test, train). | 221 Split in_fasta .fa file into two files (e.g. test, train). |
219 | 222 |
220 """ | 223 """ |
234 c_out += 1 | 237 c_out += 1 |
235 TESTOUT.close() | 238 TESTOUT.close() |
236 TRAINOUT.close() | 239 TRAINOUT.close() |
237 | 240 |
238 | 241 |
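A hypothetical call, assuming the split is sequential (the c_out counter in the body suggests the first test_size sequences go to the test file): with test-data/test.fa holding two sequences, test_size=1 would write seq1 to the test file and seq2 to the training file:

    split_fasta_into_test_train_files("test-data/test.fa",
                                      "test_split.fa",
                                      "train_split.fa",
                                      test_size=1)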
239 ################################################################################ | 242 ############################################################################### |
243 | |
244 def check_seqs_dic_format(seqs_dic): | |
245 """ | |
246 Check sequence dictionary for lowercase-only sequences or sequences |
247 which have lowercase nts in between uppercase nts. |
248 Return the suspicious IDs as a list, or an empty list if there |
249 are no hits. |
250 | |
251 >>> seqs_dic = {"id1" : "acguACGU", "id2" : "acgua", "id3" : "acgUUaUcc"} | |
252 >>> check_seqs_dic_format(seqs_dic) | |
253 ['id2', 'id3'] | |
254 >>> seqs_dic = {"id1" : "acgAUaa", "id2" : "ACGUACUA"} | |
255 >>> check_seqs_dic_format(seqs_dic) | |
256 [] | |
257 | |
258 """ | |
259 assert seqs_dic, "given seqs_dic empty" | |
260 bad_seq_ids = [] | |
261 for seq_id in seqs_dic: | |
262 seq = seqs_dic[seq_id] | |
263 if re.search("^[acgtun]+$", seq): | |
264 bad_seq_ids.append(seq_id) | |
265 if re.search("[ACGTUN][acgtun]+[ACGTUN]", seq): | |
266 bad_seq_ids.append(seq_id) | |
267 return bad_seq_ids | |
268 | |
269 | |
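The two regular expressions divide the work: ^[acgtun]+$ flags sequences that are lowercase throughout (id2 in the doctest), while [ACGTUN][acgtun]+[ACGTUN] flags a lowercase run enclosed by uppercase nts on both sides (the "UaU" in id3). A sequence like "acgAUaa" is not flagged by either pattern, because its lowercase tail is never followed by an uppercase nt again.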
270 ############################################################################### | |
240 | 271 |
241 def read_fasta_into_dic(fasta_file, | 272 def read_fasta_into_dic(fasta_file, |
242 seqs_dic=False, | 273 seqs_dic=False, |
243 ids_dic=False, | 274 ids_dic=False, |
244 read_dna=False, | 275 read_dna=False, |
276 short_ensembl=False, | |
245 reject_lc=False, | 277 reject_lc=False, |
246 convert_to_uc=False, | 278 convert_to_uc=False, |
247 skip_n_seqs=True): | 279 skip_n_seqs=True): |
248 """ | 280 """ |
249 Read in FASTA sequences, convert to RNA, store in dictionary | 281 Read in FASTA sequences, convert to RNA, store in dictionary |
250 and return dictionary. | 282 and return dictionary. |
251 | 283 |
252 >>> test_fasta = "test-data/test.fa" | 284 >>> test_fasta = "test-data/test.fa" |
253 >>> read_fasta_into_dic(test_fasta) | 285 >>> read_fasta_into_dic(test_fasta) |
254 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} | 286 {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'} |
255 >>> test_fasta = "test-data/test2.fa" | 287 >>> test_fasta = "test-data/test2.fa" |
256 >>> read_fasta_into_dic(test_fasta) | 288 >>> read_fasta_into_dic(test_fasta) |
257 {} | 289 {} |
258 >>> test_fasta = "test-data/test.ensembl.fa" | 290 >>> test_fasta = "test-data/test.ensembl.fa" |
259 >>> read_fasta_into_dic(test_fasta, read_dna=True) | 291 >>> read_fasta_into_dic(test_fasta, read_dna=True, short_ensembl=True) |
260 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} | 292 {'ENST00000415118': 'GAAATAGT', 'ENST00000448914': 'ACTGGGGGATACGAAAA'} |
293 >>> test_fasta = "test-data/test4.fa" | |
294 >>> read_fasta_into_dic(test_fasta) | |
295 {'1': 'gccuAUGUuuua', '2': 'cugaAACUaugu'} | |
261 | 296 |
262 """ | 297 """ |
263 if not seqs_dic: | 298 if not seqs_dic: |
264 seqs_dic = {} | 299 seqs_dic = {} |
265 seq_id = "" | 300 seq_id = "" |
266 seq = "" | 301 seq = "" |
267 | 302 |
268 # Go through FASTA file, extract sequences. | 303 # Go through FASTA file, extract sequences. |
269 if re.search(".+\.gz$", fasta_file): | 304 if re.search(r".+\.gz$", fasta_file): |
270 f = gzip.open(fasta_file, 'rt') | 305 f = gzip.open(fasta_file, 'rt') |
271 else: | 306 else: |
272 f = open(fasta_file, "r") | 307 f = open(fasta_file, "r") |
273 for line in f: | 308 for line in f: |
274 if re.search(">.+", line): | 309 if re.search(">.+", line): |
275 m = re.search(">(.+)", line) | 310 m = re.search(">(.+)", line) |
276 seq_id = m.group(1) | 311 seq_id = m.group(1) |
277 # If there is a ".", take only first part of header. | 312 # If there is a ".", take only first part of header. |
278 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." | 313 # This assumes ENSEMBL header format ">ENST00000631435.1 cdna ..." |
279 if re.search(".+\..+", seq_id): | 314 if short_ensembl: |
280 m = re.search("(.+?)\..+", seq_id) | 315 if re.search(r".+\..+", seq_id): |
281 seq_id = m.group(1) | 316 m = re.search(r"(.+?)\..+", seq_id) |
282 assert seq_id not in seqs_dic, "non-unique FASTA header \"%s\" in \"%s\"" % (seq_id, fasta_file) | 317 seq_id = m.group(1) |
318 assert seq_id not in seqs_dic, \ | |
319 "non-unique FASTA header \"%s\" in \"%s\"" \ | |
320 % (seq_id, fasta_file) | |
283 if ids_dic: | 321 if ids_dic: |
284 if seq_id in ids_dic: | 322 if seq_id in ids_dic: |
285 seqs_dic[seq_id] = "" | 323 seqs_dic[seq_id] = "" |
286 else: | 324 else: |
287 seqs_dic[seq_id] = "" | 325 seqs_dic[seq_id] = "" |
288 elif re.search("[ACGTUN]+", line, re.I): | 326 elif re.search("[ACGTUN]+", line, re.I): |
289 if seq_id in seqs_dic: | 327 if seq_id in seqs_dic: |
290 m = re.search("([ACGTUN]+)", line, re.I) | 328 m = re.search("([ACGTUN]+)", line, re.I) |
291 seq = m.group(1) | 329 seq = m.group(1) |
292 if reject_lc: | 330 if reject_lc: |
293 assert not re.search("[a-z]", seq), "lowercase characters detected in sequence \"%i\" (reject_lc=True)" %(seq_id) | 331 assert \ |
332 not re.search("[a-z]", seq), \ | |
333 "lc char detected in seq \"%i\" (reject_lc=True)" \ | |
334 % (seq_id) | |
294 if convert_to_uc: | 335 if convert_to_uc: |
295 seq = seq.upper() | 336 seq = seq.upper() |
296 # If sequences with N nucleotides should be skipped. | 337 # If sequences with N nucleotides should be skipped. |
297 if skip_n_seqs: | 338 if skip_n_seqs: |
298 if "n" in m.group(1) or "N" in m.group(1): | 339 if "n" in m.group(1) or "N" in m.group(1): |
299 print ("WARNING: \"%s\" contains N nucleotides. Discarding sequence ... " % (seq_id)) | 340 print("WARNING: \"%s\" contains N. Discarding " |
341 "sequence ... " % (seq_id)) | |
300 del seqs_dic[seq_id] | 342 del seqs_dic[seq_id] |
301 continue | 343 continue |
302 # Convert to RNA, concatenate sequence. | 344 # Convert to RNA, concatenate sequence. |
303 if read_dna: | 345 if read_dna: |
304 seqs_dic[seq_id] += m.group(1).replace("U","T").replace("u","t") | 346 seqs_dic[seq_id] += \ |
347 m.group(1).replace("U", "T").replace("u", "t") | |
305 else: | 348 else: |
306 seqs_dic[seq_id] += m.group(1).replace("T","U").replace("t","u") | 349 seqs_dic[seq_id] += \ |
350 m.group(1).replace("T", "U").replace("t", "u") | |
307 f.close() | 351 f.close() |
308 return seqs_dic | 352 return seqs_dic |
309 | 353 |
310 | 354 |
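Two usage sketches, based on the doctests above and the keyword arguments in the signature:

    # Plain read (T converted to U):
    seqs_dic = read_fasta_into_dic("test-data/test.fa")
    # {'seq1': 'acguACGUacgu', 'seq2': 'ugcaUGCAugcaACGUacgu'}

    # Keep only selected IDs and uppercase the sequences:
    seqs_dic = read_fasta_into_dic("test-data/test.fa",
                                   ids_dic={"seq1": 1},
                                   convert_to_uc=True)
    # {'seq1': 'ACGUACGUACGU'}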
311 ################################################################################ | 355 ############################################################################### |
312 | 356 |
313 def random_order_dic_keys_into_list(in_dic): | 357 def random_order_dic_keys_into_list(in_dic): |
314 """ | 358 """ |
315 Read in dictionary keys and return them as a list in random order. | 359 Read in dictionary keys and return them as a list in random order. |
316 | 360 |
320 id_list.append(key) | 364 id_list.append(key) |
321 random.shuffle(id_list) | 365 random.shuffle(id_list) |
322 return id_list | 366 return id_list |
323 | 367 |
324 | 368 |
325 ################################################################################ | 369 ############################################################################### |
326 | 370 |
327 def graphprot_get_param_string(params_file): | 371 def graphprot_get_param_string(params_file): |
328 """ | 372 """ |
329 Get parameter string from GraphProt .params file. | 373 Get parameter string from GraphProt .params file. |
330 | 374 |
346 continue | 390 continue |
347 if par == "model_type": | 391 if par == "model_type": |
348 if setting == "sequence": | 392 if setting == "sequence": |
349 param_string += "-onlyseq " | 393 param_string += "-onlyseq " |
350 else: | 394 else: |
351 param_string += "-%s %s " %(par, setting) | 395 param_string += "-%s %s " % (par, setting) |
352 else: | 396 else: |
353 assert 0, "pattern matching failed for string \"%s\"" %(param) | 397 assert 0, "pattern matching failed for string \"%s\"" % (param) |
354 return param_string | 398 return param_string |
355 | 399 |
356 | 400 |
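As an illustration (assuming the elided parsing reads "parameter: value" lines, with the continue above presumably skipping the pos_train_* bookkeeping entries): a .params file setting epochs to 20, R to 1, D to 3 and model_type to sequence would produce the string "-epochs 20 -R 1 -D 3 -onlyseq ".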
357 ################################################################################ | 401 ############################################################################### |
358 | 402 |
359 def seqs_dic_count_uc_nts(seqs_dic): | 403 def seqs_dic_count_uc_nts(seqs_dic): |
360 """ | 404 """ |
361 Count number of uppercase nucleotides in sequences stored in sequence | 405 Count number of uppercase nucleotides in sequences stored in sequence |
362 dictionary. | 406 dictionary. |
363 | 407 |
364 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} | 408 >>> seqs_dic = {'seq1': "acgtACGTacgt", 'seq2': 'acgtACacgt'} |
365 >>> seqs_dic_count_uc_nts(seqs_dic) | 409 >>> seqs_dic_count_uc_nts(seqs_dic) |
366 6 | 410 6 |
367 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} | 411 >>> seqs_dic = {'seq1': "acgtacgt", 'seq2': 'acgtacgt'} |
368 >>> seqs_dic_count_uc_nts(seqs_dic) | 412 >>> seqs_dic_count_uc_nts(seqs_dic) |
374 for seq_id in seqs_dic: | 418 for seq_id in seqs_dic: |
375 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) | 419 c_uc += len(re.findall(r'[A-Z]', seqs_dic[seq_id])) |
376 return c_uc | 420 return c_uc |
377 | 421 |
378 | 422 |
379 ################################################################################ | 423 ############################################################################### |
380 | 424 |
381 def seqs_dic_count_lc_nts(seqs_dic): | 425 def seqs_dic_count_lc_nts(seqs_dic): |
382 """ | 426 """ |
383 Count number of lowercase nucleotides in sequences stored in sequence | 427 Count number of lowercase nucleotides in sequences stored in sequence |
384 dictionary. | 428 dictionary. |
385 | 429 |
386 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} | 430 >>> seqs_dic = {'seq1': "gtACGTac", 'seq2': 'cgtACacg'} |
387 >>> seqs_dic_count_lc_nts(seqs_dic) | 431 >>> seqs_dic_count_lc_nts(seqs_dic) |
388 10 | 432 10 |
389 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} | 433 >>> seqs_dic = {'seq1': "ACGT", 'seq2': 'ACGTAC'} |
390 >>> seqs_dic_count_lc_nts(seqs_dic) | 434 >>> seqs_dic_count_lc_nts(seqs_dic) |
396 for seq_id in seqs_dic: | 440 for seq_id in seqs_dic: |
397 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) | 441 c_uc += len(re.findall(r'[a-z]', seqs_dic[seq_id])) |
398 return c_uc | 442 return c_uc |
399 | 443 |
400 | 444 |
401 ################################################################################ | 445 ############################################################################### |
402 | 446 |
403 def count_file_rows(in_file): | 447 def count_file_rows(in_file): |
404 """ | 448 """ |
405 Count number of file rows for given input file. | 449 Count number of file rows for given input file. |
406 | 450 |
407 >>> test_file = "test-data/test1.bed" | 451 >>> test_file = "test-data/test1.bed" |
408 >>> count_file_rows(test_file) | 452 >>> count_file_rows(test_file) |
409 7 | 453 7 |
410 >>> test_file = "test-data/empty_file" | 454 >>> test_file = "test-data/empty_file" |
411 >>> count_file_rows(test_file) | 455 >>> count_file_rows(test_file) |
416 output = subprocess.getoutput(check_cmd) | 460 output = subprocess.getoutput(check_cmd) |
417 row_count = int(output.strip()) | 461 row_count = int(output.strip()) |
418 return row_count | 462 return row_count |
419 | 463 |
420 | 464 |
421 ################################################################################ | 465 ############################################################################### |
422 | 466 |
423 def bed_check_six_col_format(bed_file): | 467 def bed_check_six_col_format(bed_file): |
424 """ | 468 """ |
425 Check whether given .bed file has 6 columns. | 469 Check whether given .bed file has 6 columns. |
426 | 470 |
442 break | 486 break |
443 f.closed | 487 f.closed |
444 return six_col_format | 488 return six_col_format |
445 | 489 |
446 | 490 |
447 ################################################################################ | 491 ############################################################################### |
448 | 492 |
449 def bed_check_unique_ids(bed_file): | 493 def bed_check_unique_ids(bed_file): |
450 """ | 494 """ |
451 Check whether .bed file (6 column format with IDs in column 4) | 495 Check whether .bed file (6 column format with IDs in column 4) |
452 has unique column 4 IDs. | 496 has unique column 4 IDs. |
453 | 497 |
454 >>> test_bed = "test-data/test1.bed" | 498 >>> test_bed = "test-data/test1.bed" |
455 >>> bed_check_unique_ids(test_bed) | 499 >>> bed_check_unique_ids(test_bed) |
456 True | 500 True |
457 >>> test_bed = "test-data/test2.bed" | 501 >>> test_bed = "test-data/test2.bed" |
458 >>> bed_check_unique_ids(test_bed) | 502 >>> bed_check_unique_ids(test_bed) |
466 return False | 510 return False |
467 else: | 511 else: |
468 return True | 512 return True |
469 | 513 |
470 | 514 |
471 ################################################################################ | 515 ############################################################################### |
472 | 516 |
473 def get_seq_lengths_from_seqs_dic(seqs_dic): | 517 def get_seq_lengths_from_seqs_dic(seqs_dic): |
474 """ | 518 """ |
475 Given a dictionary of sequences, return dictionary of sequence lengths. | 519 Given a dictionary of sequences, return dictionary of sequence lengths. |
476 Mapping is sequence ID -> sequence length. | 520 Mapping is sequence ID -> sequence length. |
481 seq_l = len(seqs_dic[seq_id]) | 525 seq_l = len(seqs_dic[seq_id]) |
482 seq_len_dic[seq_id] = seq_l | 526 seq_len_dic[seq_id] = seq_l |
483 return seq_len_dic | 527 return seq_len_dic |
484 | 528 |
485 | 529 |
486 ################################################################################ | 530 ############################################################################### |
487 | 531 |
488 def bed_get_region_lengths(bed_file): | 532 def bed_get_region_lengths(bed_file): |
489 """ | 533 """ |
490 Read in .bed file, store and return region lengths in dictionary. | 534 Read in .bed file, store and return region lengths in dictionary. |
491 key : region ID (.bed col4) | 535 key : region ID (.bed col4) |
497 | 541 |
498 """ | 542 """ |
499 id2len_dic = {} | 543 id2len_dic = {} |
500 with open(bed_file) as f: | 544 with open(bed_file) as f: |
501 for line in f: | 545 for line in f: |
502 row = line.strip() | |
503 cols = line.strip().split("\t") | 546 cols = line.strip().split("\t") |
504 site_s = int(cols[1]) | 547 site_s = int(cols[1]) |
505 site_e = int(cols[2]) | 548 site_e = int(cols[2]) |
506 site_id = cols[3] | 549 site_id = cols[3] |
507 site_l = site_e - site_s | 550 site_l = site_e - site_s |
508 assert site_id not in id2len_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(bed_file) | 551 assert site_id \ |
552 not in id2len_dic, \ | |
553 "column 4 IDs not unique in given .bed file \"%s\"" \ | |
554 % (bed_file) | |
509 id2len_dic[site_id] = site_l | 555 id2len_dic[site_id] = site_l |
510 f.closed | 556 f.closed |
511 assert id2len_dic, "No IDs read into dictionary (input file \"%s\" empty or malformatted?)" % (in_bed) | 557 assert id2len_dic, \ |
558 "No IDs read into dic (input file \"%s\" empty or malformatted?)" \ | |
559 % (bed_file) | |
512 return id2len_dic | 560 return id2len_dic |
513 | 561 |
514 | 562 |
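For example, a .bed row "chr1  100  200  site1  0  +" yields id2len_dic["site1"] = 200 - 100 = 100, since .bed starts are 0-based and ends are 1-based (half-open intervals).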
515 ################################################################################ | 563 ############################################################################### |
516 | 564 |
517 def graphprot_get_param_dic(params_file): | 565 def graphprot_get_param_dic(params_file): |
518 """ | 566 """ |
519 Read in GraphProt .params file and store in dictionary. | 567 Read in GraphProt .params file and store in dictionary. |
520 key = parameter | 568 key = parameter |
521 value = parameter value | 569 value = parameter value |
522 | 570 |
523 >>> params_file = "test-data/test.params" | 571 >>> params_file = "test-data/test.params" |
524 >>> graphprot_get_param_dic(params_file) | 572 >>> graphprot_get_param_dic(params_file) |
525 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', 'pos_train_profile_median': '5.039610', 'pos_train_avg_profile_median_1': '4.236340', 'pos_train_avg_profile_median_2': '3.868431', 'pos_train_avg_profile_median_3': '3.331277', 'pos_train_avg_profile_median_4': '2.998667', 'pos_train_avg_profile_median_5': '2.829782', 'pos_train_avg_profile_median_6': '2.626623', 'pos_train_avg_profile_median_7': '2.447083', 'pos_train_avg_profile_median_8': '2.349919', 'pos_train_avg_profile_median_9': '2.239829', 'pos_train_avg_profile_median_10': '2.161676'} | 573 {'epochs': '20', 'lambda': '0.01', 'R': '1', 'D': '3', 'bitsize': '14', \ |
574 'model_type': 'sequence', 'pos_train_ws_pred_median': '0.760321', \ | |
575 'pos_train_profile_median': '5.039610', \ | |
576 'pos_train_avg_profile_median_1': '4.236340', \ | |
577 'pos_train_avg_profile_median_2': '3.868431', \ | |
578 'pos_train_avg_profile_median_3': '3.331277', \ | |
579 'pos_train_avg_profile_median_4': '2.998667', \ | |
580 'pos_train_avg_profile_median_5': '2.829782', \ | |
581 'pos_train_avg_profile_median_6': '2.626623', \ | |
582 'pos_train_avg_profile_median_7': '2.447083', \ | |
583 'pos_train_avg_profile_median_8': '2.349919', \ | |
584 'pos_train_avg_profile_median_9': '2.239829', \ | |
585 'pos_train_avg_profile_median_10': '2.161676'} | |
526 | 586 |
527 """ | 587 """ |
528 param_dic = {} | 588 param_dic = {} |
529 with open(params_file) as f: | 589 with open(params_file) as f: |
530 for line in f: | 590 for line in f: |
537 param_dic[par] = setting | 597 param_dic[par] = setting |
538 f.close() | 598 f.close() |
539 return param_dic | 599 return param_dic |
540 | 600 |
541 | 601 |
542 ################################################################################ | 602 ############################################################################### |
543 | 603 |
544 def graphprot_filter_predictions_file(in_file, out_file, | 604 def graphprot_filter_predictions_file(in_file, out_file, |
545 sc_thr=0): | 605 sc_thr=0): |
546 """ | 606 """ |
547 Filter GraphProt .predictions file by given score threshold sc_thr. | 607 Filter GraphProt .predictions file by given score threshold sc_thr. |
552 row = line.strip() | 612 row = line.strip() |
553 cols = line.strip().split("\t") | 613 cols = line.strip().split("\t") |
554 score = float(cols[2]) | 614 score = float(cols[2]) |
555 if score < sc_thr: | 615 if score < sc_thr: |
556 continue | 616 continue |
557 OUTPRED.write("%s\n" %(row)) | 617 OUTPRED.write("%s\n" % (row)) |
558 f.close() | 618 f.close() |
559 OUTPRED.close() | 619 OUTPRED.close() |
560 | 620 |
561 | 621 |
562 ################################################################################ | 622 ############################################################################### |
563 | 623 |
564 def fasta_read_in_ids(fasta_file): | 624 def fasta_read_in_ids(fasta_file): |
565 """ | 625 """ |
566 Given a .fa file, read in header IDs in the order they appear in the file, | 626 Given a .fa file, read in header IDs in the order they appear in the file, |
567 and store in list. | 627 and store in list. |
568 | 628 |
569 >>> test_file = "test-data/test3.fa" | 629 >>> test_file = "test-data/test3.fa" |
570 >>> fasta_read_in_ids(test_file) | 630 >>> fasta_read_in_ids(test_file) |
571 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] | 631 ['SERBP1_K562_rep01_544', 'SERBP1_K562_rep02_709', 'SERBP1_K562_rep01_316'] |
580 ids_list.append(seq_id) | 640 ids_list.append(seq_id) |
581 f.close() | 641 f.close() |
582 return ids_list | 642 return ids_list |
583 | 643 |
584 | 644 |
585 ################################################################################ | 645 ############################################################################### |
586 | 646 |
587 def graphprot_profile_calculate_avg_profile(in_file, out_file, | 647 def graphprot_profile_calc_avg_profile(in_file, out_file, |
588 ap_extlr=5, | 648 ap_extlr=5, |
589 seq_ids_list=False, | 649 seq_ids_list=False, |
590 method=1): | 650 method=1): |
591 """ | 651 """ |
592 Given a GraphProt .profile file, calculate average profiles and output | 652 Given a GraphProt .profile file, calculate average profiles and output |
593 average profile file. | 653 average profile file. |
594 Average profile means that the position-wise scores will get smoothed | 654 Average profile means that the position-wise scores will get smoothed |
595 out by calculating for each position a new score, taking a sequence | 655 out by calculating for each position a new score, taking a sequence |
596 window -ap_extlr to +ap_extlr relative to the position | 656 window -ap_extlr to +ap_extlr relative to the position |
597 and calculating the mean score over this window. The mean score then | 657 and calculating the mean score over this window. The mean score then |
598 becomes the new average profile score at this position. | 658 becomes the new average profile score at this position. |
599 Two different implementations of the task are given: | 659 Two different implementations of the task are given: |
600 method=1 (new python implementation, slower + more memory but easy to read) | 660 method=1 (new python implementation, slower + more memory but easy to read) |
601 method=2 (old perl implementation, faster and less memory but more code) | 661 method=2 (old perl implementation, faster and less memory but more code) |
602 | 662 |
603 >>> in_file = "test-data/test2.profile" | 663 >>> in_file = "test-data/test2.profile" |
604 >>> out_file1 = "test-data/test2_1.avg_profile" | 664 >>> out_file1 = "test-data/test2_1.avg_profile" |
605 >>> out_file2 = "test-data/test2_2.avg_profile" | 665 >>> out_file2 = "test-data/test2_2.avg_profile" |
606 >>> out_file4 = "test-data/test2_3.avg_profile" | 666 >>> out_file4 = "test-data/test2_3.avg_profile" |
607 >>> graphprot_profile_calculate_avg_profile(in_file, out_file1, ap_extlr=2, method=1) | 667 >>> graphprot_profile_calc_avg_profile(in_file, \ |
608 >>> graphprot_profile_calculate_avg_profile(in_file, out_file2, ap_extlr=2, method=2) | 668 out_file1, ap_extlr=2, method=1) |
669 >>> graphprot_profile_calc_avg_profile(in_file, \ | |
670 out_file2, ap_extlr=2, method=2) | |
609 >>> diff_two_files_identical(out_file1, out_file2) | 671 >>> diff_two_files_identical(out_file1, out_file2) |
610 True | 672 True |
611 >>> test_list = ["s1", "s2", "s3", "s4"] | 673 >>> test_list = ["s1", "s2", "s3", "s4"] |
612 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" | 674 >>> out_file3_exp = "test-data/test3_added_ids_exp.avg_profile" |
613 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" | 675 >>> out_file3 = "test-data/test3_added_ids_out.avg_profile" |
614 >>> graphprot_profile_calculate_avg_profile(in_file, out_file3, ap_extlr=2, method=1, seq_ids_list=test_list) | 676 >>> graphprot_profile_calc_avg_profile(in_file, out_file3, \ |
677 ap_extlr=2, method=1, seq_ids_list=test_list) | |
615 >>> diff_two_files_identical(out_file3_exp, out_file3) | 678 >>> diff_two_files_identical(out_file3_exp, out_file3) |
616 True | 679 True |
617 | 680 |
618 """ | 681 """ |
619 if method == 1: | 682 if method == 1: |
622 site_starts_dic = {} | 685 site_starts_dic = {} |
623 with open(in_file) as f: | 686 with open(in_file) as f: |
624 for line in f: | 687 for line in f: |
625 cols = line.strip().split("\t") | 688 cols = line.strip().split("\t") |
626 site_id = int(cols[0]) | 689 site_id = int(cols[0]) |
627 pos = int(cols[1]) # 0-based. | 690 pos = int(cols[1]) # 0-based. |
628 score = float(cols[2]) | 691 score = float(cols[2]) |
629 # Store first position of site. | 692 # Store first position of site. |
630 if site_id not in site_starts_dic: | 693 if site_id not in site_starts_dic: |
631 site_starts_dic[site_id] = pos | 694 site_starts_dic[site_id] = pos |
632 if site_id in lists_dic: | 695 if site_id in lists_dic: |
633 lists_dic[site_id].append(score) | 696 lists_dic[site_id].append(score) |
634 else: | 697 else: |
635 lists_dic[site_id] = [] | 698 lists_dic[site_id] = [] |
636 lists_dic[site_id].append(score) | 699 lists_dic[site_id].append(score) |
637 f.close() | 700 f.close() |
638 # Check number of IDs (# FASTA sequence IDs has to be same as # site IDs). | 701 # Check number of IDs (# FASTA IDs has to be same as # site IDs). |
639 if seq_ids_list: | 702 if seq_ids_list: |
640 c_seq_ids = len(seq_ids_list) | 703 c_seq_ids = len(seq_ids_list) |
641 c_site_ids = len(site_starts_dic) | 704 c_site_ids = len(site_starts_dic) |
642 assert c_seq_ids == c_site_ids, "# sequence IDs != # site IDs (%i != %i)" %(c_seq_ids, c_site_ids) | 705 assert c_seq_ids == c_site_ids, \ |
706 "# sequence IDs != # site IDs (%i != %i)" \ | |
707 % (c_seq_ids, c_site_ids) | |
643 OUTPROF = open(out_file, "w") | 708 OUTPROF = open(out_file, "w") |
644 # For each site, calculate average profile scores list. | 709 # For each site, calculate average profile scores list. |
645 max_list = [] | |
646 for site_id in lists_dic: | 710 for site_id in lists_dic: |
647 # Convert profile score list to average profile scores list. | 711 # Convert profile score list to average profile scores list. |
648 aps_list = list_moving_window_average_values(lists_dic[site_id], | 712 aps_list = list_moving_window_average_values(lists_dic[site_id], |
649 win_extlr=ap_extlr) | 713 win_extlr=ap_extlr) |
650 start_pos = site_starts_dic[site_id] | 714 start_pos = site_starts_dic[site_id] |
651 # Get original FASTA sequence ID. | 715 # Get original FASTA sequence ID. |
652 if seq_ids_list: | 716 if seq_ids_list: |
653 site_id = seq_ids_list[site_id] | 717 site_id = seq_ids_list[site_id] |
654 for i, sc in enumerate(aps_list): | 718 for i, sc in enumerate(aps_list): |
655 pos = i + start_pos + 1 # make 1-based. | 719 pos = i + start_pos + 1 # make 1-based. |
656 OUTPROF.write("%s\t%i\t%f\n" %(site_id, pos, sc)) | 720 OUTPROF.write("%s\t%i\t%f\n" % (site_id, pos, sc)) |
657 OUTPROF.close() | 721 OUTPROF.close() |
658 elif method == 2: | 722 elif method == 2: |
659 OUTPROF = open(out_file, "w") | 723 OUTPROF = open(out_file, "w") |
660 # Old site ID. | 724 # Old site ID. |
661 old_id = "" | 725 old_id = "" |
666 site_starts_dic = {} | 730 site_starts_dic = {} |
667 with open(in_file) as f: | 731 with open(in_file) as f: |
668 for line in f: | 732 for line in f: |
669 cols = line.strip().split("\t") | 733 cols = line.strip().split("\t") |
670 cur_id = int(cols[0]) | 734 cur_id = int(cols[0]) |
671 pos = int(cols[1]) # 0-based. | 735 pos = int(cols[1]) # 0-based. |
672 score = float(cols[2]) | 736 score = float(cols[2]) |
673 # Store first position of site. | 737 # Store first position of site. |
674 if cur_id not in site_starts_dic: | 738 if cur_id not in site_starts_dic: |
675 site_starts_dic[cur_id] = pos | 739 site_starts_dic[cur_id] = pos |
676 # Case: new site (new column 1 ID). | 740 # Case: new site (new column 1 ID). |
677 if cur_id != old_id: | 741 if cur_id != old_id: |
678 # Process old id scores. | 742 # Process old id scores. |
679 if scores_list: | 743 if scores_list: |
680 aps_list = list_moving_window_average_values(scores_list, | 744 aps_list = \ |
681 win_extlr=ap_extlr) | 745 list_moving_window_average_values( |
746 scores_list, | |
747 win_extlr=ap_extlr) | |
682 start_pos = site_starts_dic[old_id] | 748 start_pos = site_starts_dic[old_id] |
683 seq_id = old_id | 749 seq_id = old_id |
684 # Get original FASTA sequence ID. | 750 # Get original FASTA sequence ID. |
685 if seq_ids_list: | 751 if seq_ids_list: |
686 seq_id = seq_ids_list[old_id] | 752 seq_id = seq_ids_list[old_id] |
687 for i, sc in enumerate(aps_list): | 753 for i, sc in enumerate(aps_list): |
688 pos = i + start_pos + 1 # make 1-based. | 754 pos = i + start_pos + 1 # make 1-based. |
689 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 755 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
690 # Reset list. | 756 # Reset list. |
691 scores_list = [] | 757 scores_list = [] |
692 old_id = cur_id | 758 old_id = cur_id |
693 scores_list.append(score) | 759 scores_list.append(score) |
694 else: | 760 else: |
703 seq_id = old_id | 769 seq_id = old_id |
704 # Get original FASTA sequence ID. | 770 # Get original FASTA sequence ID. |
705 if seq_ids_list: | 771 if seq_ids_list: |
706 seq_id = seq_ids_list[old_id] | 772 seq_id = seq_ids_list[old_id] |
707 for i, sc in enumerate(aps_list): | 773 for i, sc in enumerate(aps_list): |
708 pos = i + start_pos + 1 # make 1-based. | 774 pos = i + start_pos + 1 # make 1-based. |
709 OUTPROF.write("%s\t%i\t%f\n" %(seq_id, pos, sc)) | 775 OUTPROF.write("%s\t%i\t%f\n" % (seq_id, pos, sc)) |
710 OUTPROF.close() | 776 OUTPROF.close() |
711 | 777 |
712 | 778 |
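A worked example of the output format: for a .profile with one site and the rows "1 0 0.5", "1 1 1.5", "1 2 2.5" (0-based positions), ap_extlr=1 yields the smoothed scores [1.0, 1.5, 2.0], written out tab-separated as "1 1 1.000000", "1 2 1.500000", "1 3 2.000000" with positions now 1-based, as in the write statements above.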
713 ################################################################################ | 779 ############################################################################### |
714 | 780 |
715 def graphprot_profile_extract_peak_regions(in_file, out_file, | 781 def graphprot_profile_extract_peak_regions(in_file, out_file, |
716 max_merge_dist=0, | 782 max_merge_dist=0, |
717 sc_thr=0): | 783 sc_thr=0): |
718 """ | 784 """ |
719 Extract peak regions from GraphProt .profile file. | 785 Extract peak regions from GraphProt .profile file. |
720 Store the peak regions (defined as regions with scores >= sc_thr) | 786 Store the peak regions (defined as regions with scores >= sc_thr) |
721 to out_file in 6-column .bed format. | 787 to out_file in 6-column .bed format. |
722 | 788 |
723 TODO: | 789 TODO: |
724 Add option for genomic coordinates input (+ - polarity support). | 790 Add option for genomic coordinates input (+ - polarity support). |
725 Output genomic regions instead of sequence regions. | 791 Output genomic regions instead of sequence regions. |
733 >>> diff_two_files_identical(out_file, exp_file) | 799 >>> diff_two_files_identical(out_file, exp_file) |
734 True | 800 True |
735 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) | 801 >>> graphprot_profile_extract_peak_regions(in_file, out_file, sc_thr=10) |
736 >>> diff_two_files_identical(out_file, empty_file) | 802 >>> diff_two_files_identical(out_file, empty_file) |
737 True | 803 True |
738 >>> graphprot_profile_extract_peak_regions(in_file, out_file, max_merge_dist=2) | 804 >>> graphprot_profile_extract_peak_regions(in_file, out_file, \ |
805 max_merge_dist=2) | |
739 >>> diff_two_files_identical(out_file, exp2_file) | 806 >>> diff_two_files_identical(out_file, exp2_file) |
740 True | 807 True |
741 | 808 |
742 """ | 809 """ |
743 | 810 |
751 site_starts_dic = {} | 818 site_starts_dic = {} |
752 with open(in_file) as f: | 819 with open(in_file) as f: |
753 for line in f: | 820 for line in f: |
754 cols = line.strip().split("\t") | 821 cols = line.strip().split("\t") |
755 cur_id = cols[0] | 822 cur_id = cols[0] |
756 pos = int(cols[1]) # 0-based. | 823 pos = int(cols[1]) # 0-based. |
757 score = float(cols[2]) | 824 score = float(cols[2]) |
758 # Store first position of site. | 825 # Store first position of site. |
759 if cur_id not in site_starts_dic: | 826 if cur_id not in site_starts_dic: |
760 # If first position != zero, we assume positions are 1-based. | 827 # If first position != zero, we assume positions are 1-based. |
761 if pos != 0: | 828 if pos != 0: |
766 # Case: new site (new column 1 ID). | 833 # Case: new site (new column 1 ID). |
767 if cur_id != old_id: | 834 if cur_id != old_id: |
768 # Process old id scores. | 835 # Process old id scores. |
769 if scores_list: | 836 if scores_list: |
770 # Extract peaks from region. | 837 # Extract peaks from region. |
771 peak_list = list_extract_peaks(scores_list, | 838 peak_list = \ |
772 max_merge_dist=max_merge_dist, | 839 list_extract_peaks(scores_list, |
773 coords="bed", | 840 max_merge_dist=max_merge_dist, |
774 sc_thr=sc_thr) | 841 coords="bed", |
842 sc_thr=sc_thr) | |
775 start_pos = site_starts_dic[old_id] | 843 start_pos = site_starts_dic[old_id] |
776 # Print out peaks in .bed format. | 844 # Print out peaks in .bed format. |
777 for l in peak_list: | 845 for ln in peak_list: |
778 peak_s = start_pos + l[0] | 846 peak_s = start_pos + ln[0] |
779 peak_e = start_pos + l[1] | 847 peak_e = start_pos + ln[1] |
780 site_id = "%s,%i" %(old_id, l[2]) | 848 site_id = "%s,%i" % (old_id, ln[2]) |
781 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 849 OUTPEAKS.write("%s\t%i\t%i" |
850 "\t%s\t%f\t+\n" | |
851 % (old_id, peak_s, | |
852 peak_e, site_id, ln[3])) | |
782 # Reset list. | 853 # Reset list. |
783 scores_list = [] | 854 scores_list = [] |
784 old_id = cur_id | 855 old_id = cur_id |
785 scores_list.append(score) | 856 scores_list.append(score) |
786 else: | 857 else: |
788 scores_list.append(score) | 859 scores_list.append(score) |
789 f.close() | 860 f.close() |
790 # Process last block. | 861 # Process last block. |
791 if scores_list: | 862 if scores_list: |
792 # Extract peaks from region. | 863 # Extract peaks from region. |
793 peak_list = list_extract_peaks(scores_list, | 864 peak_list = list_extract_peaks(scores_list, |
794 max_merge_dist=max_merge_dist, | 865 max_merge_dist=max_merge_dist, |
795 coords="bed", | 866 coords="bed", |
796 sc_thr=sc_thr) | 867 sc_thr=sc_thr) |
797 start_pos = site_starts_dic[old_id] | 868 start_pos = site_starts_dic[old_id] |
798 # Print out peaks in .bed format. | 869 # Print out peaks in .bed format. |
799 for l in peak_list: | 870 for ln in peak_list: |
800 peak_s = start_pos + l[0] | 871 peak_s = start_pos + ln[0] |
801 peak_e = start_pos + l[1] | 872 peak_e = start_pos + ln[1] |
802 site_id = "%s,%i" %(old_id, l[2]) # best score also 1-based. | 873 site_id = "%s,%i" % (old_id, ln[2]) # best score also 1-based. |
803 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" %(old_id, peak_s, peak_e, site_id, l[3])) | 874 OUTPEAKS.write("%s\t%i\t%i\t%s\t%f\t+\n" |
875 % (old_id, peak_s, peak_e, site_id, ln[3])) | |
804 OUTPEAKS.close() | 876 OUTPEAKS.close() |
805 | 877 |
806 | 878 |
807 ################################################################################ | 879 ############################################################################### |
808 | 880 |
809 def list_extract_peaks(in_list, | 881 def list_extract_peaks(in_list, |
810 max_merge_dist=0, | 882 max_merge_dist=0, |
811 coords="list", | 883 coords="list", |
812 sc_thr=0): | 884 sc_thr=0): |
813 """ | 885 """ |
814 Extract peak regions from list. | 886 Extract peak regions from list. |
815 A peak region is defined as a region with scores >= the score threshold. | 887 A peak region is defined as a region with scores >= the score threshold. |
816 | 888 |
817 coords=bed : peak start 0-based, peak end 1-based. | 889 coords=bed : peak start 0-based, peak end 1-based. |
818 coords=list : peak start 0-based, peak end 0-based. | 890 coords=list : peak start 0-based, peak end 0-based. |
819 | 891 |
820 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] | 892 >>> test_list = [-1, 0, 2, 4.5, 1, -1, 5, 6.5] |
821 >>> list_extract_peaks(test_list) | 893 >>> list_extract_peaks(test_list) |
822 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] | 894 [[1, 4, 3, 4.5], [6, 7, 7, 6.5]] |
823 >>> list_extract_peaks(test_list, sc_thr=2) | 895 >>> list_extract_peaks(test_list, sc_thr=2) |
824 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] | 896 [[2, 3, 3, 4.5], [6, 7, 7, 6.5]] |
860 pr_top_pos = i | 932 pr_top_pos = i |
861 else: | 933 else: |
862 # Before was peak region? | 934 # Before was peak region? |
863 if inside: | 935 if inside: |
864 # Store peak region. | 936 # Store peak region. |
865 #peak_infos = "%i,%i,%i,%f" %(pr_s, pr_e, pr_top_pos, pr_top_sc) | |
866 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] | 937 peak_infos = [pr_s, pr_e, pr_top_pos, pr_top_sc] |
867 peak_list.append(peak_infos) | 938 peak_list.append(peak_infos) |
868 inside = False | 939 inside = False |
869 pr_top_pos = 0 | 940 pr_top_pos = 0 |
870 pr_top_sc = -100000 | 941 pr_top_sc = -100000 |
896 new_top_pos = peak_list[i][2] | 967 new_top_pos = peak_list[i][2] |
897 new_top_sc = peak_list[i][3] | 968 new_top_sc = peak_list[i][3] |
898 if peak_list[i][3] < peak_list[j][3]: | 969 if peak_list[i][3] < peak_list[j][3]: |
899 new_top_pos = peak_list[j][2] | 970 new_top_pos = peak_list[j][2] |
900 new_top_sc = peak_list[j][3] | 971 new_top_sc = peak_list[j][3] |
901 new_peak = [peak_list[i][0], peak_list[j][1], new_top_pos, new_top_sc] | 972 new_peak = [peak_list[i][0], peak_list[j][1], |
973 new_top_pos, new_top_sc] | |
902 # If two peaks were merged. | 974 # If two peaks were merged. |
903 if new_peak: | 975 if new_peak: |
904 merged_peak_list.append(new_peak) | 976 merged_peak_list.append(new_peak) |
905 added_peaks_dic[i] = 1 | 977 added_peaks_dic[i] = 1 |
906 added_peaks_dic[j] = 1 | 978 added_peaks_dic[j] = 1 |
913 peaks_merged = False | 985 peaks_merged = False |
914 # If peak coordinates should be in .bed format, make peak ends 1-based. | 986 # If peak coordinates should be in .bed format, make peak ends 1-based. |
915 if coords == "bed": | 987 if coords == "bed": |
916 for i in range(len(peak_list)): | 988 for i in range(len(peak_list)): |
917 peak_list[i][1] += 1 | 989 peak_list[i][1] += 1 |
918 peak_list[i][2] += 1 # 1-base best score position too. | 990 peak_list[i][2] += 1 # 1-base best score position too. |
919 return peak_list | 991 return peak_list |
920 | 992 |
921 | 993 |
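To unpack the doctest above: each returned peak is [start, end, top_pos, top_sc]. In test_list, the scores at indices 1 to 4 are >= 0 and the maximum 4.5 sits at index 3, hence [1, 4, 3, 4.5]; with coords="bed", the end and the top position are shifted by one, so the same peak becomes [1, 5, 4, 4.5].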
922 ################################################################################ | 994 ############################################################################### |
923 | 995 |
924 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, print_rows=False): | 996 def bed_peaks_to_genomic_peaks(peak_file, genomic_peak_file, genomic_sites_bed, |
925 """ | 997 print_rows=False): |
926 Given a .bed file of sequence peak regions (possible coordinates from | 998 """ |
999 Given a .bed file of sequence peak regions (possible coordinates from | |
927 0 to the sequence length), convert peak coordinates to genomic coordinates. | 1000 0 to the sequence length), convert peak coordinates to genomic coordinates. |
928 Do this by taking genomic regions of sequences as input. | 1001 Do this by taking genomic regions of sequences as input. |
929 | 1002 |
930 >>> test_in = "test-data/test.peaks.bed" | 1003 >>> test_in = "test-data/test.peaks.bed" |
931 >>> test_exp = "test-data/test_exp.peaks.bed" | 1004 >>> test_exp = "test-data/test_exp.peaks.bed" |
942 with open(genomic_sites_bed) as f: | 1015 with open(genomic_sites_bed) as f: |
943 for line in f: | 1016 for line in f: |
944 row = line.strip() | 1017 row = line.strip() |
945 cols = line.strip().split("\t") | 1018 cols = line.strip().split("\t") |
946 site_id = cols[3] | 1019 site_id = cols[3] |
947 assert site_id not in id2row_dic, "column 4 IDs not unique in given .bed file \"%s\"" %(args.genomic_sites_bed) | 1020 assert site_id \ |
1021 not in id2row_dic, \ | |
1022 "column 4 IDs not unique in given .bed file \"%s\"" \ | |
1023 % (genomic_sites_bed) | |
948 id2row_dic[site_id] = row | 1024 id2row_dic[site_id] = row |
949 f.close() | 1025 f.close() |
950 | 1026 |
951 # Read in peaks file and convert coordinates. | 1027 # Read in peaks file and convert coordinates. |
952 OUTPEAKS = open(genomic_peak_file, "w") | 1028 OUTPEAKS = open(genomic_peak_file, "w") |
956 site_id = cols[0] | 1032 site_id = cols[0] |
957 site_s = int(cols[1]) | 1033 site_s = int(cols[1]) |
958 site_e = int(cols[2]) | 1034 site_e = int(cols[2]) |
959 site_id2 = cols[3] | 1035 site_id2 = cols[3] |
960 site_sc = float(cols[4]) | 1036 site_sc = float(cols[4]) |
961 assert re.search(".+,.+", site_id2), "regular expression failed for ID \"%s\"" %(site_id2) | 1037 assert re.search(".+,.+", site_id2), \ |
962 m = re.search(".+,(\d+)", site_id2) | 1038 "regular expression failed for ID \"%s\"" % (site_id2) |
963 sc_pos = int(m.group(1)) # 1-based. | 1039 m = re.search(r".+,(\d+)", site_id2) |
964 assert site_id in id2row_dic, "site ID \"%s\" not found in genomic sites dictionary" %(site_id) | 1040 sc_pos = int(m.group(1)) # 1-based. |
1041 assert site_id in id2row_dic, \ | |
1042 "site ID \"%s\" not found in genomic sites dictionary" \ | |
1043 % (site_id) | |
965 row = id2row_dic[site_id] | 1044 row = id2row_dic[site_id] |
966 rowl = row.split("\t") | 1045 rowl = row.split("\t") |
967 gen_chr = rowl[0] | 1046 gen_chr = rowl[0] |
968 gen_s = int(rowl[1]) | 1047 gen_s = int(rowl[1]) |
969 gen_e = int(rowl[2]) | 1048 gen_e = int(rowl[2]) |
972 new_e = site_e + gen_s | 1051 new_e = site_e + gen_s |
973 new_sc_pos = sc_pos + gen_s | 1052 new_sc_pos = sc_pos + gen_s |
974 if gen_pol == "-": | 1053 if gen_pol == "-": |
975 new_s = gen_e - site_e | 1054 new_s = gen_e - site_e |
976 new_e = gen_e - site_s | 1055 new_e = gen_e - site_s |
977 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. | 1056 new_sc_pos = gen_e - sc_pos + 1 # keep 1-based. |
978 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" %(gen_chr, new_s, new_e, site_id, new_sc_pos, site_sc, gen_pol) | 1057 new_row = "%s\t%i\t%i\t%s,%i\t%f\t%s" \ |
979 OUTPEAKS.write("%s\n" %(new_row)) | 1058 % (gen_chr, new_s, new_e, |
1059 site_id, new_sc_pos, site_sc, gen_pol) | |
1060 OUTPEAKS.write("%s\n" % (new_row)) | |
980 if print_rows: | 1061 if print_rows: |
981 print(new_row) | 1062 print(new_row) |
982 OUTPEAKS.close() | 1063 OUTPEAKS.close() |
983 | 1064 |
984 | 1065 |
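A worked minus-strand example of the conversion above: for a site at chr1:100-200(-) and a sequence peak with site_s = 10, site_e = 20 and sc_pos = 15, the genomic peak is new_s = 200 - 20 = 180, new_e = 200 - 10 = 190 and new_sc_pos = 200 - 15 + 1 = 186; the interval stays in .bed convention (0-based start, 1-based end) while the best-score position remains 1-based.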
985 ################################################################################ | 1066 ############################################################################### |
986 | 1067 |
987 def diff_two_files_identical(file1, file2): | 1068 def diff_two_files_identical(file1, file2): |
988 """ | 1069 """ |
989 Check whether two files are identical. Return true if diff reports no | 1070 Check whether two files are identical. Return true if diff reports no |
990 differences. | 1071 differences. |
991 | 1072 |
992 >>> file1 = "test-data/file1" | 1073 >>> file1 = "test-data/file1" |
993 >>> file2 = "test-data/file2" | 1074 >>> file2 = "test-data/file2" |
994 >>> diff_two_files_identical(file1, file2) | 1075 >>> diff_two_files_identical(file1, file2) |
995 True | 1076 True |
996 >>> file1 = "test-data/test1.bed" | 1077 >>> file1 = "test-data/test1.bed" |
1004 if output: | 1085 if output: |
1005 same = False | 1086 same = False |
1006 return same | 1087 return same |
1007 | 1088 |
1008 | 1089 |
1009 ################################################################################ | 1090 ############################################################################### |
1010 | |
1011 |