Mercurial > repos > rlegendre > ribo_tools
diff get_codon_frequency.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | b8c070add3b7 |
children | 7c944fd9907e |
line wrap: on
line diff
--- a/get_codon_frequency.py Tue Dec 09 09:43:07 2014 -0500 +++ b/get_codon_frequency.py Thu Jan 22 14:34:38 2015 +0100 @@ -25,8 +25,10 @@ import csv from scipy import stats from collections import OrderedDict +import ribo_functions +import HTSeq # #libraries for debugg -# import pdb +import pdb # import cPickle def stop_err(msg): @@ -34,7 +36,6 @@ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() - def store_gff(gff): ''' parse and store gff file in a dictionnary @@ -98,8 +99,7 @@ #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified - - + def init_codon_dict(): @@ -115,17 +115,28 @@ ''' try: codon = init_codon_dict() - - for chrom in GFF.iterkeys(): - for gene in GFF[chrom] : + for feature in GFF : + if feature.type == 'gene' : codon_dict = init_codon_dict() - start = GFF[chrom][gene]['start'] - stop = GFF[chrom][gene]['stop'] + chrom = feature.iv.chrom + start = feature.iv.start + stop = feature.iv.end region = chrom + ':' + str(start) + '-' + str(stop) + + ## DEPRECATED + #for chrom in GFF.iterkeys(): + #for gene in GFF[chrom] : + # codon_dict = init_codon_dict() + #start = GFF[chrom][gene]['start'] + #print start + #stop = GFF[chrom][gene]['stop'] + #print stop + #region = chrom + ':' + str(start) + '-' + str(stop) + ####### # #get all reads in this gene reads = subprocess.check_output(["samtools", "view", bamfile, region]) head = subprocess.check_output(["samtools", "view", "-H", bamfile]) - read_tab = reads.split('\n') + read_tab = reads.split('\n') for read in read_tab: # # search mapper for eliminate multiple alignements if 'bowtie' in head: @@ -136,7 +147,6 @@ stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") if len(read) == 0: continue - len_read = len(read.split('\t')[9]) # if it's read of good length if len_read == kmer and multi_tag not in read: @@ -179,6 +189,8 @@ stop_err('Error during codon usage calcul: ' + str(e)) + + ''' http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html ''' @@ -491,7 +503,7 @@ # # plot amino acid profile : fig = pl.figure(num=1) - width = .35 + width = .45 ax = fig.add_subplot(111) ind = arange(21) pl.xlim(0, 21) @@ -503,7 +515,7 @@ ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) #for x, y, z in zip(ind, max_val, aa_name): # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) - axis_font = {'size':'16'} + axis_font = {'size':'10'} pl.xticks(ind + width, aa_name,**axis_font) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) @@ -513,7 +525,7 @@ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) ax.set_xlabel('Amino Acids', **axis_font) handles, labels = ax.get_legend_handles_labels() - font_prop = font_manager.FontProperties(size=12) + font_prop = font_manager.FontProperties(size=8) ax.legend(handles, labels, prop=font_prop) pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) pl.clf() @@ -534,8 +546,9 @@ max_val.append(max(cond1_norm[i], cond2_norm[i])) # plot result - fig = pl.figure(figsize=(24, 10), num=1) - width = .50 + fig = pl.figure(figsize=(30, 10), num=1) + #fig = pl.figure(num=1) + width = .40 ind = arange(len(codon_sorted)) ax = fig.add_subplot(111) pl.xlim(0, len(codon_sorted) + 1) @@ -625,10 +638,7 @@ # raise Exception def __main__(): - ''' - python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2 - python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2 - ''' + # Parse command line options parser = optparse.OptionParser() @@ -648,7 +658,7 @@ help="Name of second condition", metavar="STR") parser.add_option("-k", "--kmer", dest="kmer", type="int", - help="Longer of your phasing reads", metavar="INT") + help="Length of your phasing reads", metavar="INT") # parser.add_option("-l", "--list", dest="list_cod", type= "string", # help="list of codons to compare to other", metavar="STR") @@ -692,13 +702,34 @@ if not colors.is_color_like(options.color2) : stop_err( options.color2+' is not a proper color' ) - GFF = store_gff(options.gff) + ## identify GFF or GTF format from 9th column + #with open (options.gff,"r") as gffile : + # for line in gffile : + # if '#' in line : + # ## skip header + # gffile.next() + # elif 'gene_id' in line : + # ## launch gtf reader : + # GFF = ribo_functions.store_gtf(options.gff) + # break + # elif 'ID=' in line : + # ## launch gff reader + # GFF = ribo_functions.store_gff(options.gff) + # break + # else : + # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + #if not GFF['order'] : + # stop_err( 'Incorrect GFF file' + str( e ) ) #### NOT USE IN FINAL VERSION # # get codon list # codons = options.list_cod.upper().split(',') # check_codons_list(codons) - + GFF = HTSeq.GFF_Reader(options.gff) # # get html file and directory : (html, html_dir) = options.dirout.split(',') if os.path.exists(html_dir):