comparison qc.py @ 0:2aa014ad54bc draft

"planemo upload for repository https://github.com/SANBI-SA/tools-sanbi-uwc/tree/master/tools/qc commit 54450d56a66bd4b01a17c8ec8aa3e649e3e4749f-dirty"
author sanbi-uwc
date Tue, 30 Mar 2021 15:00:18 +0000
parents
children 878f18249f76
comparison
equal deleted inserted replaced
-1:000000000000 0:2aa014ad54bc
1 #!/usr/bin/env python3
2
3 from Bio import SeqIO
4 import csv
5 import subprocess
6 import pandas as pd
7 import matplotlib.pyplot as plt
8 import shlex
9
10 """
11 This script can incorporate as many QC checks as required
12 as long as it outputs a csv file containing a final column
13 headed with 'qc_pass' and rows for each sample indcating
14 'TRUE' if the overall QC check has passed or 'FALSE' if not.
15 """
16
17 def make_qc_plot(depth_pos, n_density, samplename, window=200):
18 depth_df = pd.DataFrame( { 'position' : [pos[1] for pos in depth_pos], 'depth' : [dep[2] for dep in depth_pos] } )
19 depth_df['depth_moving_average'] = depth_df.iloc[:,1].rolling(window=window).mean()
20
21 n_df = pd.DataFrame( { 'position' : [pos[0] for pos in n_density], 'n_density' : [dens[1] for dens in n_density] } )
22
23 fig, ax1 = plt.subplots()
24 ax2 = ax1.twinx()
25
26 ax1.set_xlabel('Position')
27
28 ax1.set_ylabel('Depth', color = 'g')
29 ax1.set_ylim(top=10**5, bottom=1)
30 ax1.set_yscale('log')
31 ax1.plot(depth_df['depth_moving_average'], color = 'g')
32
33 ax2.set_ylabel('N density', color = 'r')
34 ax2.plot(n_df['n_density'], color = 'r')
35 ax2.set_ylim(top=1)
36
37 plt.title(samplename)
38 plt.savefig(samplename + '.depth.png')
39
40 def read_depth_file(bamfile):
41 p = subprocess.Popen(['samtools', 'depth', '-a', '-d', '0', bamfile],
42 stdout=subprocess.PIPE)
43 out, err = p.communicate()
44 counter = 0
45
46 pos_depth = []
47 for ln in out.decode('utf-8').split("\n"):
48 if ln:
49 pos_depth.append(ln.split("\t"))
50
51 return pos_depth
52
53
54 def get_covered_pos(pos_depth, min_depth):
55 counter = 0
56 for contig, pos,depth in pos_depth:
57 if int(depth) >= min_depth:
58 counter = counter + 1
59
60 return counter
61
62 def get_N_positions(fasta):
63 n_pos = [i for i, letter in enumerate(fasta.seq.lower()) if letter == 'n']
64
65 return n_pos
66
67 def get_pct_N_bases(fasta):
68
69 count_N = len(get_N_positions(fasta))
70
71 pct_N_bases = count_N / len(fasta.seq) * 100
72
73 return pct_N_bases
74
75 def get_largest_N_gap(fasta):
76 n_pos = get_N_positions(fasta)
77
78 n_pos = [0] + n_pos + [len(fasta.seq)]
79
80 n_gaps = [j-i for i, j in zip(n_pos[:-1], n_pos[1:])]
81
82 return sorted(n_gaps)[-1]
83
84 def get_ref_length(ref):
85 record = SeqIO.read(ref, "fasta")
86 return len(record.seq)
87
88
89 def sliding_window_N_density(sequence, window=10):
90
91 sliding_window_n_density = []
92 for i in range(0, len(sequence.seq), 1):
93 window_mid = i + ( window / 2)
94 window_seq = sequence.seq[i:i+window]
95 n_count = window_seq.lower().count('n')
96 n_density = n_count / window
97
98 sliding_window_n_density.append( [ window_mid, n_density ] )
99
100 return sliding_window_n_density
101
102 def get_num_reads(bamfile):
103
104 st_filter = '0x900'
105 command = 'samtools view -c -F{} {}'.format(st_filter, bamfile)
106 what = shlex.split(command)
107
108 return subprocess.check_output(what).decode().strip()
109
110 def go(args):
111 if args.illumina:
112 depth = 10
113 elif args.nanopore:
114 depth = 20
115
116 ## Depth calcs
117 ref_length = get_ref_length(args.ref)
118 depth_pos = read_depth_file(args.bam)
119
120 depth_covered_bases = get_covered_pos(depth_pos, depth)
121
122 pct_covered_bases = depth_covered_bases / ref_length * 100
123
124 ## Number of aligned reads calculaton
125 num_reads = get_num_reads(args.bam)
126
127 # Unknown base calcs
128 fasta = SeqIO.read(args.fasta, "fasta")
129
130 pct_N_bases = 0
131 largest_N_gap = 0
132 qc_pass = "FALSE"
133
134 if len(fasta.seq) != 0:
135
136 pct_N_bases = get_pct_N_bases(fasta)
137 largest_N_gap = get_largest_N_gap(fasta)
138
139 # QC PASS / FAIL
140 if largest_N_gap >= 10000 or pct_N_bases < 50.0:
141 qc_pass = "TRUE"
142
143
144 qc_line = { 'sample_name' : args.sample,
145 'pct_N_bases' : "{:.2f}".format(pct_N_bases),
146 'pct_covered_bases' : "{:.2f}".format(pct_covered_bases),
147 'longest_no_N_run' : largest_N_gap,
148 'num_aligned_reads' : num_reads,
149 'fasta': args.fasta,
150 'bam' : args.bam,
151 'qc_pass' : qc_pass}
152
153
154 with open(args.outfile, 'w') as csvfile:
155 header = qc_line.keys()
156 writer = csv.DictWriter(csvfile, fieldnames=header)
157 writer.writeheader()
158 writer.writerow(qc_line)
159
160 N_density = sliding_window_N_density(fasta)
161 make_qc_plot(depth_pos, N_density, args.sample)
162
163 def main():
164 import argparse
165
166 parser = argparse.ArgumentParser()
167 group = parser.add_mutually_exclusive_group(required=True)
168 group.add_argument('--nanopore', action='store_true')
169 group.add_argument('--illumina', action='store_true')
170 parser.add_argument('--outfile', required=True)
171 parser.add_argument('--sample', required=True)
172 parser.add_argument('--ref', required=True)
173 parser.add_argument('--bam', required=True)
174 parser.add_argument('--fasta', required=True)
175
176 args = parser.parse_args()
177 go(args)
178
179 if __name__ == "__main__":
180 main()