ribo_tools: get_codon_frequency.py comparison

comparison get_codon_frequency.py @ 13:7c944fd9907e draft

release 2

author	rlegendre
date	Thu, 09 Apr 2015 11:35:48 -0400
parents	707807fee542
children	344bacf6acb8

comparison

equal deleted inserted replaced

-:ee3a3435ce43
+:7c944fd9907e
 from scipy import stats
 from collections import OrderedDict
 import ribo_functions
 import HTSeq
 # #libraries for debugg
-import pdb
+#import pdb
 # import cPickle
 def stop_err(msg):
 sys.stderr.write("%s\n" % msg)
 sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
 sys.exit()
-def store_gff(gff):
-'''
-parse and store gff file in a dictionnary
-'''
-try:
-GFF = OrderedDict({})
-with open(gff, 'r') as f_gff :
-# GFF['order'] = []
-for line in f_gff:
-# # switch commented lines
-head = line.split("#")[0]
-if head != "" :
-feature = (line.split('\t')[8]).split(';')
-chrom = line.split('\t')[0]
-if chrom not in GFF :
-GFF[chrom] = {}
-# first line is already gene line :
-if line.split('\t')[2] == 'gene' :
-gene = feature[0].replace("ID=", "")
-if re.search('gene', feature[2]) :
-Name = feature[2].replace("gene=", "")
-else :
-Name = "Unknown"
-# #get annotation
-note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-note = urllib.unquote(str(note)).replace("\n", "")
-# # store gene information
-# GFF['order'].append(gene)
-GFF[chrom][gene] = {}
-GFF[chrom][gene]['chrom'] = line.split('\t')[0]
-GFF[chrom][gene]['start'] = int(line.split('\t')[3])
-GFF[chrom][gene]['stop'] = int(line.split('\t')[4])
-GFF[chrom][gene]['strand'] = line.split('\t')[6]
-GFF[chrom][gene]['name'] = Name
-GFF[chrom][gene]['note'] = note
-GFF[chrom][gene]['exon'] = {}
-GFF[chrom][gene]['exon_number'] = 0
-# print Name
-elif line.split('\t')[2] == 'CDS' :
-gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0])
-if GFF[chrom].has_key(gene) :
-GFF[chrom][gene]['exon_number'] += 1
-exon_number = GFF[chrom][gene]['exon_number']
-GFF[chrom][gene]['exon'][exon_number] = {}
-GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-# # if there is a five prim UTR intron, we change start of gene
-elif line.split('\t')[2] == 'five_prime_UTR_intron' :
-if GFF[chrom][gene]['strand'] == "+" :
-GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start']
-else :
-GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop']
-return GFF
-except Exception, e:
-stop_err('Error during gff storage : ' + str(e))
-#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
-#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
-#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
-#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
-#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
-#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 def init_codon_dict():
 Codon_dict = OrderedDict([('AAA', 0), ('AAC', 0), ('AAG', 0), ('AAT', 0), ('ACA', 0), ('ACC', 0), ('ACG', 0), ('ACT', 0), ('AGA', 0), ('AGC', 0), ('AGG', 0), ('AGT', 0), ('ATA', 0), ('ATC', 0), ('ATG', 0), ('ATT', 0), ('CAA', 0), ('CAC', 0), ('CAG', 0), ('CAT', 0), ('CCA', 0), ('CCC', 0), ('CCG', 0), ('CCT', 0), ('CGA', 0), ('CGC', 0), ('CGG', 0), ('CGT', 0), ('CTA', 0), ('CTC', 0), ('CTG', 0), ('CTT', 0), ('GAA', 0), ('GAC', 0), ('GAG', 0), ('GAT', 0), ('GCA', 0), ('GCC', 0), ('GCG', 0), ('GCT', 0), ('GGA', 0), ('GGC', 0), ('GGG', 0), ('GGT', 0), ('GTA', 0), ('GTC', 0), ('GTG', 0), ('GTT', 0), ('TAA', 0), ('TAC', 0), ('TAG', 0), ('TAT', 0), ('TCA', 0), ('TCC', 0), ('TCG', 0), ('TCT', 0), ('TGA', 0), ('TGC', 0), ('TGG', 0), ('TGT', 0), ('TTA', 0), ('TTC', 0), ('TTG', 0), ('TTT', 0)])
 return Codon_dict
 Read GFF dict and get gene codon usage.
 Return dict of codons usage
 '''
 try:
 codon = init_codon_dict()
+multi_tag = "XS:i:" ## bowtie Tag
+tag = "IH:i:1" ## RUM tag
 for feature in GFF :
 if feature.type == 'gene' :
 codon_dict = init_codon_dict()
 chrom = feature.iv.chrom
 start = feature.iv.start
 stop = feature.iv.end
-region = chrom + ':' + str(start) + '-' + str(stop)
+if start+50 < stop-50 :
+region = chrom + ':' + str(start+50) + '-' + str(stop-50)
+else :
+break
 ## DEPRECATED
 #for chrom in GFF.iterkeys():
 #for gene in GFF[chrom] :
 # codon_dict = init_codon_dict()
 # # search mapper for eliminate multiple alignements
 if 'bowtie' in head:
 multi_tag = "XS:i:"
 elif 'bwa' in  head:
 multi_tag = "XT:A:R"
+elif 'TopHat' in  head:
+tag = "NH:i:1"
 else :
-stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
+stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")
 if len(read) == 0:
 continue
 len_read = len(read.split('\t')[9])
 # if it's read of good length
-if len_read == kmer and multi_tag not in read:
+if len_read == kmer and (tag in line or multi_tag not in line):
 feat = read.split('\t')
 seq = feat[9]
 # if it's a reverse read
 if feat[1] == '16' :
 if site == "A" :
 else :
 # #get site-E
 cod = seq[a_pos-6:a_pos-3]
 if codon_dict.has_key(cod) :
 codon_dict[cod] += 1
+del(read)
 # # add in global dict
 for cod, count in codon_dict.iteritems() :
 codon[cod] += count
 return codon
 for z in AA.itervalues():
 cond1_aa.append(z[0])
 cond2_aa.append(z[1])
 max_valaa.append(max(z))
 # # plot amino acid profile :
-fig = pl.figure(figsize=(24, 10), num=1)
+fig = pl.figure(figsize=(30, 10), num=1)
 width = .50
 ax = fig.add_subplot(111)
 ax.xaxis.set_ticks([])
 ind = arange(21)
 pl.xlim(0, 21)
 out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
 # plot result
-fig = pl.figure(figsize=(24, 10), num=1)
+fig = pl.figure(figsize=(30, 10), num=1)
-width = .50
+width = .40
 ind = arange(len(codon_sorted))
 ax = fig.add_subplot(111)
 pl.xlim(0, len(codon_sorted) + 1)
 ax.spines['right'].set_color('none')
 ax.spines['top'].set_color('none')
 for i in cond1:
 # # max value for each codon
 max_val.append(max(cond1_norm[i], cond2_norm[i]))
 # plot result
-fig = pl.figure(figsize=(30, 10), num=1)
+fig = pl.figure(figsize=(40, 10), num=1)
 #fig = pl.figure(num=1)
 width = .40
 ind = arange(len(codon_sorted))
 ax = fig.add_subplot(111)
 pl.xlim(0, len(codon_sorted) + 1)
 ax.spines['left'].set_smart_bounds(True)
 ax.yaxis.set_ticks_position('left')
 ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1)
 ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2)
 for x, y, z in zip(ind, max_val, codon_sorted):
-ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8)
+ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8)
 ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)')
 ax.set_xlabel('Codons')
 handles, labels = ax.get_legend_handles_labels()
 ax.legend(handles, labels)
 pl.savefig(dirout + '/hist_codons.png', format="png", dpi=340)
 help="Name for first condition", metavar="STR")
 parser.add_option("-C", "--cond2", dest="c2", type="string",
 help="Name of second condition", metavar="STR")
-parser.add_option("-k", "--kmer", dest="kmer", type="int",
+parser.add_option("-k", "--kmer", dest="kmer", type="int",  default = 28 ,
 help="Length of your phasing reads", metavar="INT")
 #     parser.add_option("-l", "--list", dest="list_cod", type= "string",
 #                   help="list of codons to compare to other", metavar="STR")
 help="write report to FILE", metavar="FILE")
 parser.add_option("-d", "--dirout", dest="dirout", type="string",
 help="write report to PNG files", metavar="FILE")
-parser.add_option("-a", "--asite", dest="asite", type="int",
+parser.add_option("-a", "--asite", dest="asite", type="int",  default = 15 ,
-help="Off-set from the 5'end of the footprint to the A-site", metavar="INT")
+help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT")
-parser.add_option("-s", "--site", dest="site", type="string",
+parser.add_option("-s", "--site", dest="site", type="string",  default = "A" ,
-help="Script can compute in site A, P or E", metavar="A|P|E")
+help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E")
-parser.add_option("-r", "--rep", dest="rep", type="string",
+parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" ,
 help="if replicate or not", metavar="yes|no")
-parser.add_option("-x", "--hex_col1", dest="color1", type= "string",
+parser.add_option("-x", "--hex_col1", dest="color1", type= "string",  default = "SkyBlue" ,
 help="Color for first condition", metavar="STR")
-parser.add_option("-X", "--hex_col2", dest="color2", type= "string",
+parser.add_option("-X", "--hex_col2", dest="color2", type= "string",  default = "Plum" ,
 help="Color for second condition", metavar="STR")
 parser.add_option("-q", "--quiet",
 action="store_false", dest="verbose", default=True,
 help="don't print status messages to stdout")
 if not colors.is_color_like(options.color1) :
 stop_err( options.color1+' is not a proper color' )
 if not colors.is_color_like(options.color2) :
 stop_err( options.color2+' is not a proper color' )
-## identify GFF or GTF format from 9th column
-#with open (options.gff,"r") as gffile :
-#    for line in gffile :
-#        if '#' in line :
-#            ## skip header
-#            gffile.next()
-#       elif 'gene_id' in line :
-#            ## launch gtf reader :
-#            GFF = ribo_functions.store_gtf(options.gff)
-#            break
-#        elif 'ID=' in line :
-#            ## launch gff reader
-#            GFF = ribo_functions.store_gff(options.gff)
-#            break
-#        else :
-#            stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
-#GFF = store_gff(options.gff)
-#GFF = ribo_functions.store_gtf(options.gff)
-## check gff reading
-#if not GFF['order'] :
-#   stop_err( 'Incorrect GFF file' + str( e ) )
 #### NOT USE IN FINAL VERSION
 # # get codon list
 # codons = options.list_cod.upper().split(',')
 # check_codons_list(codons)

Mercurial > repos > rlegendre > ribo_tools

comparison get_codon_frequency.py @ 13:7c944fd9907e draft