Mercurial > repos > rlegendre > ribo_tools
comparison kmer_analysis.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | da126b91f9ea |
children | 7c944fd9907e |
comparison
equal
deleted
inserted
replaced
9:d7739f797a26 | 10:707807fee542 |
---|---|
12 #from matplotlib import pyplot as pl | 12 #from matplotlib import pyplot as pl |
13 import matplotlib | 13 import matplotlib |
14 matplotlib.use('Agg') | 14 matplotlib.use('Agg') |
15 import matplotlib.pyplot as pl | 15 import matplotlib.pyplot as pl |
16 from numpy import arange | 16 from numpy import arange |
17 from collections import OrderedDict | |
17 import ribo_functions | 18 import ribo_functions |
18 from collections import OrderedDict | |
19 #import cPickle | 19 #import cPickle |
20 ## suppress matplotlib warnings | |
21 import warnings | |
22 warnings.filterwarnings('ignore') | |
23 | |
20 | 24 |
21 total_mapped_read = 0 | 25 total_mapped_read = 0 |
22 | 26 |
23 | 27 |
24 def stop_err( msg ): | 28 def stop_err( msg ): |
25 sys.stderr.write( "%s\n" % msg ) | 29 sys.stderr.write( "%s\n" % msg ) |
26 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) | 30 sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) |
27 sys.exit() | 31 sys.exit() |
28 | 32 |
29 | 33 |
30 | 34 def split_bam(bamfile,tmpdir): |
35 ''' | |
36 split bam by chromosome and write sam file in tmpdir | |
37 ''' | |
38 try: | |
39 #get header | |
40 results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) | |
41 header = results.split('\n') | |
42 | |
43 #define genome size | |
44 genome = [] | |
45 for line in header: | |
46 result = re.search('SN', line) | |
47 if result : | |
48 #print line | |
49 feat = line.split('\t') | |
50 chrom = re.split(":", feat[1]) | |
51 #print feat[1] | |
52 genome.append(chrom[1]) | |
53 | |
54 #split sam by chrom | |
55 n = 0 | |
56 for chrm in genome: | |
57 with open(tmpdir+'/'+chrm+'.sam', 'w') as f : | |
58 #write header correctly for each chromosome | |
59 f.write(header[0]+'\n') | |
60 expr = re.compile(chrm+'\t') | |
61 el =[elem for elem in header if expr.search(elem)][0] | |
62 f.write(el+'\n') | |
63 f.write(header[-2]+'\n') | |
64 #write all reads for each chromosome | |
65 reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) | |
66 f.write(reads) | |
67 # calculate number of reads | |
68 n += reads.count(chrm) | |
69 | |
70 sys.stdout.write("%d reads are presents in your bam file\n" % n) | |
71 | |
72 except Exception, e: | |
73 stop_err( 'Error during bam file splitting : ' + str( e ) ) | |
74 | |
75 | |
76 | |
31 def get_first_base(tmpdir, kmer): | 77 def get_first_base(tmpdir, kmer): |
32 ''' | 78 ''' |
33 write footprint coverage file for each sam file in tmpdir and get kmer distribution | 79 write footprint coverage file for each sam file in tmpdir and get kmer distribution |
34 ''' | 80 ''' |
35 global total_mapped_read | 81 global total_mapped_read |
45 ##get chromosome name | 91 ##get chromosome name |
46 chrom = samfile.split(".sam")[0] | 92 chrom = samfile.split(".sam")[0] |
47 | 93 |
48 for line in sam: | 94 for line in sam: |
49 #initialize dictionnary | 95 #initialize dictionnary |
50 if '@SQ' in line : | 96 if '@SQ\tSN:' in line : |
51 size = int(line.split('LN:')[1]) | 97 size = int(line.split('LN:')[1]) |
52 genomeF = [0]*size | 98 genomeF = [0]*size |
53 genomeR = [0]*size | 99 genomeR = [0]*size |
54 # define multiple reads keys from mapper | 100 # define multiple reads keys from mapper |
55 elif '@PG' in line : | 101 elif '@PG\tID' in line : |
56 if 'bowtie' in line: | 102 if 'bowtie' in line: |
57 multi_tag = "XS:i:" | 103 multi_tag = "XS:i:" |
58 elif 'bwa' in line: | 104 elif 'bwa' in line: |
59 multi_tag = "XT:A:R" | 105 multi_tag = "XT:A:R" |
106 #elif 'TopHat' in line: | |
107 # multi_tag = "NH:i:1" | |
60 else : | 108 else : |
61 stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") | 109 stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") |
62 | 110 |
63 # get footprint | 111 # get footprint |
64 elif re.search('^[^@].+', line) : | 112 elif re.search('^[^@].+', line) : |
65 len_read = len(line.split('\t')[9]) | 113 len_read = len(line.split('\t')[9]) |
66 ##full kmer dict | 114 ##full kmer dict |
126 chrom = "" # initializing chromosome | 174 chrom = "" # initializing chromosome |
127 nb_gene = 0 # number of analysed genes | 175 nb_gene = 0 # number of analysed genes |
128 whole_phasing = [0,0,0] | 176 whole_phasing = [0,0,0] |
129 for gene in GFF['order']: | 177 for gene in GFF['order']: |
130 ## maybe no start position in GTF file so we must to check and replace | 178 ## maybe no start position in GTF file so we must to check and replace |
179 exon_number = GFF[gene]['exon_number'] | |
131 try : GFF[gene]['start'] | 180 try : GFF[gene]['start'] |
132 except : | 181 except : |
133 if GFF[gene]['strand'] == '+' : | 182 if GFF[gene]['strand'] == '+' : |
134 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] | 183 GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] |
135 else : | 184 else : |
136 GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] | 185 GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] |
137 ## also for stop coordinates | 186 ## also for stop coordinates |
138 try : GFF[gene]['stop'] | 187 try : GFF[gene]['stop'] |
139 except : | 188 except : |
140 exon_number = GFF[gene]['exon_number'] | |
141 if GFF[gene]['strand'] == '+' : | 189 if GFF[gene]['strand'] == '+' : |
142 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] | 190 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] |
143 | 191 |
144 else : | 192 else : |
145 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] | 193 GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] |
146 cov = [] | 194 cov = [] |
147 ##first chromosome : we open corresponding file | 195 ##first chromosome : we open corresponding file |
148 if chrom == "" : | 196 try: |
149 chrom = GFF[gene]['chrom'] | 197 if chrom == "" : |
150 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : | 198 chrom = GFF[gene]['chrom'] |
151 data = f.readlines() | 199 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : |
152 ##if we change chrosomosome | 200 data = f.readlines() |
153 elif chrom != GFF[gene]['chrom'] : | 201 ##if we change chromosome |
154 chrom = GFF[gene]['chrom'] | 202 elif chrom != GFF[gene]['chrom'] : |
155 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : | 203 chrom = GFF[gene]['chrom'] |
156 data = f.readlines() | 204 with open(tmpdir+"/assoCov_"+chrom+".txt") as f : |
157 | 205 data = f.readlines() |
206 except IOError : | |
207 print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" | |
208 | |
209 | |
158 ## if a gene without intron : | 210 ## if a gene without intron : |
159 if GFF[gene]['exon_number'] == 1: | 211 if GFF[gene]['exon_number'] == 1: |
160 | 212 |
161 ## get coverage for each gene | 213 ## get coverage for each gene |
162 if GFF[gene]['strand'] == "+": | 214 if GFF[gene]['strand'] == "+": |
170 ## For each gene, get coverage and sum of exon size | 222 ## For each gene, get coverage and sum of exon size |
171 if GFF[gene]['strand'] == "+": | 223 if GFF[gene]['strand'] == "+": |
172 | 224 |
173 for exon in range(1,GFF[gene]['exon_number']+1) : | 225 for exon in range(1,GFF[gene]['exon_number']+1) : |
174 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): | 226 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): |
175 if i <= GFF[gene]['stop'] : | 227 #if i <= GFF[gene]['stop'] : |
176 cov.append(int((data[i].rstrip()).split("\t")[0])) | 228 cov.append(int((data[i].rstrip()).split("\t")[0])) |
177 else : | 229 else : |
178 | 230 |
179 for exon in range(1,GFF[gene]['exon_number']+1) : | 231 for exon in range(1,GFF[gene]['exon_number']+1) : |
180 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): | 232 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): |
181 if i <= GFF[gene]['start'] : | 233 #if i <= GFF[gene]['start'] : |
182 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) | 234 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) |
183 cov.reverse() | 235 cov.reverse() |
184 | 236 |
185 len_cov = len(cov) | 237 len_cov = len(cov) |
186 prop = [0,0,0] | 238 prop = [0,0,0] |
187 for nuc in range(0,len_cov-2,3) : | 239 for nuc in range(0,len_cov-2,3) : |
216 pl.xlabel('kmer value', **axis_font) | 268 pl.xlabel('kmer value', **axis_font) |
217 pl.ylabel('Number of reads', **axis_font) | 269 pl.ylabel('Number of reads', **axis_font) |
218 pl.title('Number of reads for each k-mer') | 270 pl.title('Number of reads for each k-mer') |
219 pl.xticks(index + bar_width, label, **axis_font) | 271 pl.xticks(index + bar_width, label, **axis_font) |
220 #pl.show() | 272 #pl.show() |
273 fig.subplots_adjust() | |
221 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) | 274 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) |
222 pl.clf() | 275 pl.clf() |
223 | 276 |
224 for key, phase in results.iteritems() : | 277 for key, phase in results.iteritems() : |
225 fig = pl.figure(num=1) | 278 fig = pl.figure(num=1) |
229 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) | 282 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) |
230 pl.xlabel('Frame in gene', **axis_font) | 283 pl.xlabel('Frame in gene', **axis_font) |
231 pl.ylabel('Percent of read', **axis_font) | 284 pl.ylabel('Percent of read', **axis_font) |
232 pl.title('Proportion of reads in each frame for '+str(key)+'-mer') | 285 pl.title('Proportion of reads in each frame for '+str(key)+'-mer') |
233 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) | 286 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) |
234 pl.tight_layout() | 287 #pl.tight_layout() |
235 pl.ylim(0,100) | 288 pl.ylim(0,100) |
289 fig.subplots_adjust() | |
236 pl.draw() | 290 pl.draw() |
237 #pl.show() | 291 pl.show() |
238 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) | 292 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) |
239 pl.clf() | 293 pl.clf() |
240 | 294 |
241 kmer_summary = '' | 295 kmer_summary = '' |
242 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) | 296 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) |
286 | 340 |
287 def __main__(): | 341 def __main__(): |
288 | 342 |
289 #Parse command line options | 343 #Parse command line options |
290 parser = optparse.OptionParser() | 344 parser = optparse.OptionParser() |
291 parser.add_option("-g", "--gff", dest="gfffile", type= "string", | 345 parser.add_option("-g", "--gff", dest="gff", type= "string", |
292 help="GFF annotation file", metavar="FILE") | 346 help="GFF annotation file", metavar="FILE") |
293 | 347 |
294 parser.add_option("-b", "--bam", dest="bamfile", type= "string", | 348 parser.add_option("-b", "--bam", dest="bamfile", type= "string", |
295 help="Bam Ribo-Seq alignments ", metavar="FILE") | 349 help="Bam Ribo-Seq alignments ", metavar="FILE") |
296 | 350 |
323 cmd = "samtools index %s " % (options.bamfile) | 377 cmd = "samtools index %s " % (options.bamfile) |
324 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) | 378 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) |
325 returncode = proc.wait() | 379 returncode = proc.wait() |
326 | 380 |
327 tmpdir = tempfile.mkdtemp() | 381 tmpdir = tempfile.mkdtemp() |
328 GFF = ribo_functions.store_gff(options.gfffile) | 382 ## identify GFF or GTF format from 9th column |
383 with open (options.gff,"r") as gffile : | |
384 for line in gffile : | |
385 if '#' in line : | |
386 ## skip header | |
387 gffile.next() | |
388 elif 'gene_id' in line : | |
389 ## launch gtf reader : | |
390 GFF = ribo_functions.store_gtf(options.gff) | |
391 break | |
392 elif 'ID=' in line : | |
393 ## launch gff reader | |
394 GFF = ribo_functions.store_gff(options.gff) | |
395 break | |
396 else : | |
397 stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) | |
398 #GFF = store_gff(options.gff) | |
399 #GFF = ribo_functions.store_gtf(options.gff) | |
400 ## check gff reading | |
401 if not GFF['order'] : | |
402 stop_err( 'Incorrect GFF file' + str( e ) ) | |
403 | |
329 ## split bam | 404 ## split bam |
330 ribo_functions.split_bam(options.bamfile,tmpdir) | 405 split_bam(options.bamfile,tmpdir) |
331 ################################### | 406 ################################### |
332 ## First analysis with 28mer : | 407 ## First analysis with 28mer : |
333 ################################### | 408 ################################### |
334 ## compute coverage and distribution kmer | 409 ## compute coverage and distribution kmer |
335 kmer = get_first_base(tmpdir, 28) | 410 kmer = get_first_base(tmpdir, 28) |
349 if kmer[keys] > 100 : | 424 if kmer[keys] > 100 : |
350 ## compute coverage and distribution kmer | 425 ## compute coverage and distribution kmer |
351 tmp = get_first_base(tmpdir, keys) | 426 tmp = get_first_base(tmpdir, keys) |
352 ## compute phasing | 427 ## compute phasing |
353 whole_phasing = frame_analysis(tmpdir,GFF) | 428 whole_phasing = frame_analysis(tmpdir,GFF) |
429 | |
354 results[keys] = whole_phasing | 430 results[keys] = whole_phasing |
355 ## get report | 431 ## get report |
356 make_report(options.html_file, options.dirout, kmer, results) | 432 make_report(options.html_file, options.dirout, kmer, results) |
357 #======================================================================= | 433 #======================================================================= |
358 # ############ | 434 # ############ |