Mercurial > repos > sanbi-uwc > qc
comparison qc.py @ 0:2aa014ad54bc draft
"planemo upload for repository https://github.com/SANBI-SA/tools-sanbi-uwc/tree/master/tools/qc commit 54450d56a66bd4b01a17c8ec8aa3e649e3e4749f-dirty"
author | sanbi-uwc |
---|---|
date | Tue, 30 Mar 2021 15:00:18 +0000 |
parents | |
children | 878f18249f76 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2aa014ad54bc |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 from Bio import SeqIO | |
4 import csv | |
5 import subprocess | |
6 import pandas as pd | |
7 import matplotlib.pyplot as plt | |
8 import shlex | |
9 | |
10 """ | |
11 This script can incorporate as many QC checks as required | |
12 as long as it outputs a csv file containing a final column | |
13 headed with 'qc_pass' and rows for each sample indcating | |
14 'TRUE' if the overall QC check has passed or 'FALSE' if not. | |
15 """ | |
16 | |
17 def make_qc_plot(depth_pos, n_density, samplename, window=200): | |
18 depth_df = pd.DataFrame( { 'position' : [pos[1] for pos in depth_pos], 'depth' : [dep[2] for dep in depth_pos] } ) | |
19 depth_df['depth_moving_average'] = depth_df.iloc[:,1].rolling(window=window).mean() | |
20 | |
21 n_df = pd.DataFrame( { 'position' : [pos[0] for pos in n_density], 'n_density' : [dens[1] for dens in n_density] } ) | |
22 | |
23 fig, ax1 = plt.subplots() | |
24 ax2 = ax1.twinx() | |
25 | |
26 ax1.set_xlabel('Position') | |
27 | |
28 ax1.set_ylabel('Depth', color = 'g') | |
29 ax1.set_ylim(top=10**5, bottom=1) | |
30 ax1.set_yscale('log') | |
31 ax1.plot(depth_df['depth_moving_average'], color = 'g') | |
32 | |
33 ax2.set_ylabel('N density', color = 'r') | |
34 ax2.plot(n_df['n_density'], color = 'r') | |
35 ax2.set_ylim(top=1) | |
36 | |
37 plt.title(samplename) | |
38 plt.savefig(samplename + '.depth.png') | |
39 | |
40 def read_depth_file(bamfile): | |
41 p = subprocess.Popen(['samtools', 'depth', '-a', '-d', '0', bamfile], | |
42 stdout=subprocess.PIPE) | |
43 out, err = p.communicate() | |
44 counter = 0 | |
45 | |
46 pos_depth = [] | |
47 for ln in out.decode('utf-8').split("\n"): | |
48 if ln: | |
49 pos_depth.append(ln.split("\t")) | |
50 | |
51 return pos_depth | |
52 | |
53 | |
54 def get_covered_pos(pos_depth, min_depth): | |
55 counter = 0 | |
56 for contig, pos,depth in pos_depth: | |
57 if int(depth) >= min_depth: | |
58 counter = counter + 1 | |
59 | |
60 return counter | |
61 | |
62 def get_N_positions(fasta): | |
63 n_pos = [i for i, letter in enumerate(fasta.seq.lower()) if letter == 'n'] | |
64 | |
65 return n_pos | |
66 | |
67 def get_pct_N_bases(fasta): | |
68 | |
69 count_N = len(get_N_positions(fasta)) | |
70 | |
71 pct_N_bases = count_N / len(fasta.seq) * 100 | |
72 | |
73 return pct_N_bases | |
74 | |
75 def get_largest_N_gap(fasta): | |
76 n_pos = get_N_positions(fasta) | |
77 | |
78 n_pos = [0] + n_pos + [len(fasta.seq)] | |
79 | |
80 n_gaps = [j-i for i, j in zip(n_pos[:-1], n_pos[1:])] | |
81 | |
82 return sorted(n_gaps)[-1] | |
83 | |
84 def get_ref_length(ref): | |
85 record = SeqIO.read(ref, "fasta") | |
86 return len(record.seq) | |
87 | |
88 | |
89 def sliding_window_N_density(sequence, window=10): | |
90 | |
91 sliding_window_n_density = [] | |
92 for i in range(0, len(sequence.seq), 1): | |
93 window_mid = i + ( window / 2) | |
94 window_seq = sequence.seq[i:i+window] | |
95 n_count = window_seq.lower().count('n') | |
96 n_density = n_count / window | |
97 | |
98 sliding_window_n_density.append( [ window_mid, n_density ] ) | |
99 | |
100 return sliding_window_n_density | |
101 | |
102 def get_num_reads(bamfile): | |
103 | |
104 st_filter = '0x900' | |
105 command = 'samtools view -c -F{} {}'.format(st_filter, bamfile) | |
106 what = shlex.split(command) | |
107 | |
108 return subprocess.check_output(what).decode().strip() | |
109 | |
110 def go(args): | |
111 if args.illumina: | |
112 depth = 10 | |
113 elif args.nanopore: | |
114 depth = 20 | |
115 | |
116 ## Depth calcs | |
117 ref_length = get_ref_length(args.ref) | |
118 depth_pos = read_depth_file(args.bam) | |
119 | |
120 depth_covered_bases = get_covered_pos(depth_pos, depth) | |
121 | |
122 pct_covered_bases = depth_covered_bases / ref_length * 100 | |
123 | |
124 ## Number of aligned reads calculaton | |
125 num_reads = get_num_reads(args.bam) | |
126 | |
127 # Unknown base calcs | |
128 fasta = SeqIO.read(args.fasta, "fasta") | |
129 | |
130 pct_N_bases = 0 | |
131 largest_N_gap = 0 | |
132 qc_pass = "FALSE" | |
133 | |
134 if len(fasta.seq) != 0: | |
135 | |
136 pct_N_bases = get_pct_N_bases(fasta) | |
137 largest_N_gap = get_largest_N_gap(fasta) | |
138 | |
139 # QC PASS / FAIL | |
140 if largest_N_gap >= 10000 or pct_N_bases < 50.0: | |
141 qc_pass = "TRUE" | |
142 | |
143 | |
144 qc_line = { 'sample_name' : args.sample, | |
145 'pct_N_bases' : "{:.2f}".format(pct_N_bases), | |
146 'pct_covered_bases' : "{:.2f}".format(pct_covered_bases), | |
147 'longest_no_N_run' : largest_N_gap, | |
148 'num_aligned_reads' : num_reads, | |
149 'fasta': args.fasta, | |
150 'bam' : args.bam, | |
151 'qc_pass' : qc_pass} | |
152 | |
153 | |
154 with open(args.outfile, 'w') as csvfile: | |
155 header = qc_line.keys() | |
156 writer = csv.DictWriter(csvfile, fieldnames=header) | |
157 writer.writeheader() | |
158 writer.writerow(qc_line) | |
159 | |
160 N_density = sliding_window_N_density(fasta) | |
161 make_qc_plot(depth_pos, N_density, args.sample) | |
162 | |
163 def main(): | |
164 import argparse | |
165 | |
166 parser = argparse.ArgumentParser() | |
167 group = parser.add_mutually_exclusive_group(required=True) | |
168 group.add_argument('--nanopore', action='store_true') | |
169 group.add_argument('--illumina', action='store_true') | |
170 parser.add_argument('--outfile', required=True) | |
171 parser.add_argument('--sample', required=True) | |
172 parser.add_argument('--ref', required=True) | |
173 parser.add_argument('--bam', required=True) | |
174 parser.add_argument('--fasta', required=True) | |
175 | |
176 args = parser.parse_args() | |
177 go(args) | |
178 | |
179 if __name__ == "__main__": | |
180 main() |