annotate remove_fake_cut_sites.py @ 3:4522bc2f7cca draft

"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author bgruening
date Tue, 25 May 2021 20:11:49 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
1 import re
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
2 import sys
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
3
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
4 from Bio import SeqIO
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
5 from Bio.Seq import Seq
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
6
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
7
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
def main():
    """Mask fake cut sites in a FASTA file and log what was changed.

    Command-line arguments (read from ``sys.argv``):
        1. path of the input FASTA file
        2. path of the output FASTA file
        3. path of the log file

    Each record's sequence is upper-cased. Every listed cut-site motif, in
    forward and reverse-complement orientation, that is directly flanked by
    ``N`` on both sides is overwritten with ``N``. The masked record is
    written to the output FASTA file.

    The log file receives one line per record that had at least one motif
    masked, and one line per record that still contains short (1-10 base)
    non-``N`` islands enclosed by ``N`` after masking.

    Raises:
        IndexError: if fewer than three command-line arguments are given.
        OSError: if any of the three files cannot be opened.
    """
    fasta_file = sys.argv[1]
    output_file = sys.argv[2]
    log_file = sys.argv[3]

    # Number of N bases required on each side of a motif.
    n_flank_length = 1
    cut_sites = [
        Seq("CTTAAG"),
        Seq("CTTCTCG"),
        Seq("GCTCTTC"),
        Seq("CCTCAGC"),
        Seq("GAATGC"),
        Seq("GCAATG"),
        Seq("ATCGAT"),
        Seq("CACGAG"),
    ]

    # Build every matcher once instead of once per record.  The flanks are
    # expressed as lookbehind/lookahead so they are not consumed by a match:
    # two sites sharing a single flanking N ("NCTTAAGNCTTAAGN") are then both
    # masked, which a consuming "N<motif>N" pattern would miss.
    flank = "N" * n_flank_length
    maskers = []
    seen_motifs = set()
    for cut_site in cut_sites:
        for oriented_site in (cut_site, cut_site.reverse_complement()):
            motif = str(oriented_site)
            if motif in seen_motifs:
                # Palindromic sites equal their own reverse complement; a
                # second pass over the same motif could never match again.
                continue
            seen_motifs.add(motif)
            pattern = re.compile(
                "(?<=" + flank + ")" + re.escape(motif) + "(?=" + flank + ")"
            )
            maskers.append((pattern, "N" * len(motif)))

    # Short non-N islands enclosed by N; lookaround lets neighbouring islands
    # that share an N each be counted.
    island_pattern = re.compile("(?<=N)[^N]{1,10}(?=N)")

    # Outputs are opened first (as before) and closed even if parsing fails.
    with open(output_file, "w") as output_handle, open(log_file, "w") as log_handle:
        with open(fasta_file, "r") as fasta_input_handle:
            for record in SeqIO.parse(fasta_input_handle, "fasta"):
                # Work on one upper-cased string so that the masking done
                # for one motif is visible to every later motif.
                sequence = str(record.seq.upper())
                change_count = 0
                for pattern, replacement in maskers:
                    sequence, changes = pattern.subn(replacement, sequence)
                    change_count += changes
                record.seq = Seq(sequence)

                if change_count > 0:
                    log_handle.write(
                        " ".join([record.id, ":", str(change_count), "changes\n"])
                    )
                SeqIO.write([record], output_handle, "fasta")

                # Finally, count what still looks like a fake cut site.
                possible_fake_cut_sites = island_pattern.findall(sequence)
                if possible_fake_cut_sites:
                    log_handle.write(
                        " ".join(
                            [
                                record.id,
                                ":",
                                str(len(possible_fake_cut_sites)),
                                "possible non-standard fake cut sites\n",
                            ]
                        )
                    )
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
81
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
82
4522bc2f7cca "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()