Mercurial > repos > rlegendre > ribo_tools
diff kmer_analysis.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | da126b91f9ea |
children | 7c944fd9907e |
line wrap: on
line diff
--- a/kmer_analysis.py Tue Dec 09 09:43:07 2014 -0500 +++ b/kmer_analysis.py Thu Jan 22 14:34:38 2015 +0100 @@ -14,9 +14,13 @@ matplotlib.use('Agg') import matplotlib.pyplot as pl from numpy import arange +from collections import OrderedDict import ribo_functions -from collections import OrderedDict #import cPickle +## suppress matplotlib warnings +import warnings +warnings.filterwarnings('ignore') + total_mapped_read = 0 @@ -25,9 +29,51 @@ sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) sys.exit() - + + +def split_bam(bamfile,tmpdir): + ''' + split bam by chromosome and write sam file in tmpdir + ''' + try: + #get header + results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) + header = results.split('\n') + + #define genome size + genome = [] + for line in header: + result = re.search('SN', line) + if result : + #print line + feat = line.split('\t') + chrom = re.split(":", feat[1]) + #print feat[1] + genome.append(chrom[1]) + + #split sam by chrom + n = 0 + for chrm in genome: + with open(tmpdir+'/'+chrm+'.sam', 'w') as f : + #write header correctly for each chromosome + f.write(header[0]+'\n') + expr = re.compile(chrm+'\t') + el =[elem for elem in header if expr.search(elem)][0] + f.write(el+'\n') + f.write(header[-2]+'\n') + #write all reads for each chromosome + reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) + f.write(reads) + # calculate number of reads + n += reads.count(chrm) + sys.stdout.write("%d reads are presents in your bam file\n" % n) + + except Exception, e: + stop_err( 'Error during bam file splitting : ' + str( e ) ) + + def get_first_base(tmpdir, kmer): ''' write footprint coverage file for each sam file in tmpdir and get kmer distribution @@ -47,18 +93,20 @@ for line in sam: #initialize dictionnary - if '@SQ' in line : + if '@SQ\tSN:' in line : size = int(line.split('LN:')[1]) genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif '@PG' in line : + elif '@PG\tID' in line : if 'bowtie' in line: multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" + #elif 'TopHat' in line: + # multi_tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -128,33 +176,37 @@ whole_phasing = [0,0,0] for gene in GFF['order']: ## maybe no start position in GTF file so we must to check and replace + exon_number = GFF[gene]['exon_number'] try : GFF[gene]['start'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : - GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] + GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] ## also for stop coordinates try : GFF[gene]['stop'] except : - exon_number = GFF[gene]['exon_number'] if GFF[gene]['strand'] == '+' : GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] else : - GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] + GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] cov = [] ##first chromosome : we open corresponding file - if chrom == "" : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() - ##if we change chrosomosome - elif chrom != GFF[gene]['chrom'] : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() - + try: + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chromosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + except IOError : + print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" + + ## if a gene without intron : if GFF[gene]['exon_number'] == 1: @@ -172,14 +224,14 @@ for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - if i <= GFF[gene]['stop'] : - cov.append(int((data[i].rstrip()).split("\t")[0])) + #if i <= GFF[gene]['stop'] : + cov.append(int((data[i].rstrip()).split("\t")[0])) else : for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - if i <= GFF[gene]['start'] : - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + #if i <= GFF[gene]['start'] : + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) cov.reverse() len_cov = len(cov) @@ -218,6 +270,7 @@ pl.title('Number of reads for each k-mer') pl.xticks(index + bar_width, label, **axis_font) #pl.show() + fig.subplots_adjust() pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) pl.clf() @@ -231,10 +284,11 @@ pl.ylabel('Percent of read', **axis_font) pl.title('Proportion of reads in each frame for '+str(key)+'-mer') pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) - pl.tight_layout() + #pl.tight_layout() pl.ylim(0,100) + fig.subplots_adjust() pl.draw() - #pl.show() + pl.show() pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) pl.clf() @@ -288,7 +342,7 @@ #Parse command line options parser = optparse.OptionParser() - parser.add_option("-g", "--gff", dest="gfffile", type= "string", + parser.add_option("-g", "--gff", dest="gff", type= "string", help="GFF annotation file", metavar="FILE") parser.add_option("-b", "--bam", dest="bamfile", type= "string", @@ -325,9 +379,30 @@ returncode = proc.wait() tmpdir = tempfile.mkdtemp() - GFF = ribo_functions.store_gff(options.gfffile) + ## identify GFF or GTF format from 9th column + with open (options.gff,"r") as gffile : + for line in gffile : + if '#' in line : + ## skip header + gffile.next() + elif 'gene_id' in line : + ## launch gtf reader : + GFF = ribo_functions.store_gtf(options.gff) + break + elif 'ID=' in line : + ## launch gff reader + GFF = ribo_functions.store_gff(options.gff) + break + else : + stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + if not GFF['order'] : + stop_err( 'Incorrect GFF file' + str( e ) ) + ## split bam - ribo_functions.split_bam(options.bamfile,tmpdir) + split_bam(options.bamfile,tmpdir) ################################### ## First analysis with 28mer : ################################### @@ -351,6 +426,7 @@ tmp = get_first_base(tmpdir, keys) ## compute phasing whole_phasing = frame_analysis(tmpdir,GFF) + results[keys] = whole_phasing ## get report make_report(options.html_file, options.dirout, kmer, results)