# HG changeset patch # User rlegendre # Date 1413817613 14400 # Node ID da126b91f9ea2a6ff13861cc7f32928e4d309caa # Parent 1fbddace2db6af1a8ed0b6c1b5cccb4316091b4b Uploaded diff -r 1fbddace2db6 -r da126b91f9ea kmer_analysis.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kmer_analysis.py Mon Oct 20 11:06:53 2014 -0400 @@ -0,0 +1,379 @@ +#!/usr/bin/env python2.7.3 +#-*- coding: utf-8 -*- +''' + Created on Jun. 2014 + @author: rachel legendre + @copyright: rachel.legendre@igmors.u-psud.fr + @license: GPL v3 +''' + +import os, sys, time, optparse, re, urllib, subprocess, tempfile, commands +import pysam +#from matplotlib import pyplot as pl +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as pl +from numpy import arange +import ribo_functions +from collections import OrderedDict +#import cPickle + +total_mapped_read = 0 + + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) + sys.exit() + + + +def get_first_base(tmpdir, kmer): + ''' + write footprint coverage file for each sam file in tmpdir and get kmer distribution + ''' + global total_mapped_read + + ## init kmer dict + KMER = OrderedDict({}) + + try: + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') + ##write coverage for each sam file in tmpdir + for samfile in file_array: + with open(tmpdir+'/'+samfile, 'r') as sam : + ##get chromosome name + chrom = samfile.split(".sam")[0] + + for line in sam: + #initialize dictionnary + if '@SQ' in line : + size = int(line.split('LN:')[1]) + genomeF = [0]*size + genomeR = [0]*size + # define multiple reads keys from mapper + elif '@PG' in line : + if 'bowtie' in line: + multi_tag = "XS:i:" + elif 'bwa' in line: + multi_tag = "XT:A:R" + else : + stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + + # get footprint + elif re.search('^[^@].+', line) : + len_read = len(line.split('\t')[9]) + ##full kmer dict + if KMER.has_key(len_read): + KMER[len_read] += 1 + else : + KMER[len_read] = 1 + + #print line.rstrip() + read_pos = int(line.split('\t')[3]) + read_sens = int(line.split('\t')[1]) + #len_read = len(line.split('\t')[9]) + if len_read == kmer and multi_tag not in line: + ###if line.split('\t')[5] == '28M' : + total_mapped_read +=1 + #if it's a forward read + if read_sens == 0 : + #get 5' base + genomeF[read_pos] += 1 + #if it's a reverse read + elif read_sens == 16 : + #get 5' base + read_pos += (len_read-1) + genomeR[read_pos] += 1 + + #try: + #write coverage in files + with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov : + for i in range(0,size): + cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') + #except Exception,e: + # stop_err( 'Error during coverage file writting : ' + str( e ) ) + + #sys.stdout.write("%d reads are in your bam file\n" % total_mapped_read) + + return KMER + + except Exception, e: + stop_err( 'Error during footprinting : ' + str( e ) ) + + +def __percent__(prop): + + if sum(prop) != 0 : + perc = [0,0,0] + if prop[0] != 0 : + perc[0] = int("{0:.0f}".format((prop[0]*100.0)/sum(prop))) + if prop[1] != 0 : + perc[1] = int("{0:.0f}".format((prop[1]*100.0)/sum(prop))) + if prop[2] != 0 : + perc[2] = int("{0:.0f}".format((prop[2]*100.0)/sum(prop))) + return perc + else : + return prop + + +def frame_analysis(tmpdir,GFF): + ''' + This function take footprint and annotation (gff) for analyse reading frame in each gene + ''' + global total_mapped_read + try: + chrom = "" # initializing chromosome + nb_gene = 0 # number of analysed genes + whole_phasing = [0,0,0] + for gene in GFF['order']: + ## maybe no start position in GTF file so we must to check and replace + try : GFF[gene]['start'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] + ## also for stop coordinates + try : GFF[gene]['stop'] + except : + exon_number = GFF[gene]['exon_number'] + if GFF[gene]['strand'] == '+' : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + + else : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] + cov = [] + ##first chromosome : we open corresponding file + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chrosomosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + + ## if a gene without intron : + if GFF[gene]['exon_number'] == 1: + + ## get coverage for each gene + if GFF[gene]['strand'] == "+": + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + for i in range(GFF[gene]['exon'][1]['start'],GFF[gene]['exon'][1]['stop']+1): + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + else : + ## For each gene, get coverage and sum of exon size + if GFF[gene]['strand'] == "+": + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + if i <= GFF[gene]['stop'] : + cov.append(int((data[i].rstrip()).split("\t")[0])) + else : + + for exon in range(1,GFF[gene]['exon_number']+1) : + for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): + if i <= GFF[gene]['start'] : + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + cov.reverse() + + len_cov = len(cov) + prop = [0,0,0] + for nuc in range(0,len_cov-2,3) : + ## Calculate proportion + prop[0] += cov[nuc] + prop[1] += cov[nuc+1] + prop[2] += cov[nuc+2] + whole_phasing = (map(lambda (x, y): x + y, zip(whole_phasing, prop))) + + whole_phasing = __percent__(whole_phasing) + #sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) + return whole_phasing + + except Exception, e: + stop_err( 'Error during frame analysis : ' + str( e ) ) + + + +def make_report(html_file, dirout, kmer, results) : + + array = sorted(kmer.items(), key=lambda x: x[0]) + values = [] + label = [] + for x,y in array : + values.append(y) + label.append(x) + index = arange(len(label)) + bar_width = 0.35 + axis_font = {'size':'10'} + fig, ax = pl.subplots() + pl.bar(index ,values, color='LightsteelBlue') + pl.xlabel('kmer value', **axis_font) + pl.ylabel('Number of reads', **axis_font) + pl.title('Number of reads for each k-mer') + pl.xticks(index + bar_width, label, **axis_font) + #pl.show() + pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) + pl.clf() + + for key, phase in results.iteritems() : + fig = pl.figure(num=1) + frame = phase + index = arange(3) + bar_width = 0.5 + pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue']) + pl.xlabel('Frame in gene', **axis_font) + pl.ylabel('Percent of read', **axis_font) + pl.title('Proportion of reads in each frame for '+str(key)+'-mer') + pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) + pl.tight_layout() + pl.ylim(0,100) + pl.draw() + #pl.show() + pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) + pl.clf() + + kmer_summary = '' + kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0])) + for key, number in kmer_sorted.iteritems() : + if number > 100 : + kmer_summary += '
+