Mercurial > repos > rlegendre > ribo_tools
diff get_codon_frequency.py @ 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | 707807fee542 |
children | 344bacf6acb8 |
line wrap: on
line diff
--- a/get_codon_frequency.py Fri Jan 23 03:31:37 2015 -0500 +++ b/get_codon_frequency.py Thu Apr 09 11:35:48 2015 -0400 @@ -28,7 +28,7 @@ import ribo_functions import HTSeq # #libraries for debugg -import pdb +#import pdb # import cPickle def stop_err(msg): @@ -36,70 +36,6 @@ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = OrderedDict({}) - with open(gff, 'r') as f_gff : - # GFF['order'] = [] - for line in f_gff: - # # switch commented lines - head = line.split("#")[0] - if head != "" : - feature = (line.split('\t')[8]).split(';') - chrom = line.split('\t')[0] - if chrom not in GFF : - GFF[chrom] = {} - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=", "") - if re.search('gene', feature[2]) : - Name = feature[2].replace("gene=", "") - else : - Name = "Unknown" - # #get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n", "") - # # store gene information - # GFF['order'].append(gene) - GFF[chrom][gene] = {} - GFF[chrom][gene]['chrom'] = line.split('\t')[0] - GFF[chrom][gene]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['stop'] = int(line.split('\t')[4]) - GFF[chrom][gene]['strand'] = line.split('\t')[6] - GFF[chrom][gene]['name'] = Name - GFF[chrom][gene]['note'] = note - GFF[chrom][gene]['exon'] = {} - GFF[chrom][gene]['exon_number'] = 0 - # print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r"Parent\=(.+)_mRNA", r"\1", feature[0]) - if GFF[chrom].has_key(gene) : - GFF[chrom][gene]['exon_number'] += 1 - exon_number = GFF[chrom][gene]['exon_number'] - GFF[chrom][gene]['exon'][exon_number] = {} - GFF[chrom][gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[chrom][gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[chrom][gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - # # if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[chrom][gene]['strand'] == "+" : - GFF[chrom][gene]['start'] = GFF[chrom][gene]['exon'][1]['start'] - else : - GFF[chrom][gene]['stop'] = GFF[chrom][gene]['exon'][exon_number]['stop'] - return GFF - except Exception, e: - stop_err('Error during gff storage : ' + str(e)) - -#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 -#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra -#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr -#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified -#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified -#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified - def init_codon_dict(): @@ -115,13 +51,19 @@ ''' try: codon = init_codon_dict() + multi_tag = "XS:i:" ## bowtie Tag + tag = "IH:i:1" ## RUM tag + for feature in GFF : if feature.type == 'gene' : codon_dict = init_codon_dict() chrom = feature.iv.chrom start = feature.iv.start stop = feature.iv.end - region = chrom + ':' + str(start) + '-' + str(stop) + if start+50 < stop-50 : + region = chrom + ':' + str(start+50) + '-' + str(stop-50) + else : + break ## DEPRECATED #for chrom in GFF.iterkeys(): @@ -143,13 +85,16 @@ multi_tag = "XS:i:" elif 'bwa' in head: multi_tag = "XT:A:R" + elif 'TopHat' in head: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + if len(read) == 0: continue len_read = len(read.split('\t')[9]) # if it's read of good length - if len_read == kmer and multi_tag not in read: + if len_read == kmer and (tag in line or multi_tag not in line): feat = read.split('\t') seq = feat[9] # if it's a reverse read @@ -179,6 +124,7 @@ cod = seq[a_pos-6:a_pos-3] if codon_dict.has_key(cod) : codon_dict[cod] += 1 + del(read) # # add in global dict for cod, count in codon_dict.iteritems() : codon[cod] += count @@ -400,7 +346,7 @@ cond2_aa.append(z[1]) max_valaa.append(max(z)) # # plot amino acid profile : - fig = pl.figure(figsize=(24, 10), num=1) + fig = pl.figure(figsize=(30, 10), num=1) width = .50 ax = fig.add_subplot(111) ax.xaxis.set_ticks([]) @@ -440,8 +386,8 @@ # plot result - fig = pl.figure(figsize=(24, 10), num=1) - width = .50 + fig = pl.figure(figsize=(30, 10), num=1) + width = .40 ind = arange(len(codon_sorted)) ax = fig.add_subplot(111) pl.xlim(0, len(codon_sorted) + 1) @@ -546,7 +492,7 @@ max_val.append(max(cond1_norm[i], cond2_norm[i])) # plot result - fig = pl.figure(figsize=(30, 10), num=1) + fig = pl.figure(figsize=(40, 10), num=1) #fig = pl.figure(num=1) width = .40 ind = arange(len(codon_sorted)) @@ -560,7 +506,7 @@ ax.bar(ind, list(cond1_norm.itervalues()), width, facecolor=color1, label=c1) ax.bar(ind + width, list(cond2_norm.itervalues()), width, facecolor=color2, label=c2) for x, y, z in zip(ind, max_val, codon_sorted): - ax.text(x + width, y + 0.02, '%s' % z, ha='center', va='bottom', fontsize=8) + ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=8) ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)') ax.set_xlabel('Codons') handles, labels = ax.get_legend_handles_labels() @@ -657,7 +603,7 @@ parser.add_option("-C", "--cond2", dest="c2", type="string", help="Name of second condition", metavar="STR") - parser.add_option("-k", "--kmer", dest="kmer", type="int", + parser.add_option("-k", "--kmer", dest="kmer", type="int", default = 28 , help="Length of your phasing reads", metavar="INT") # parser.add_option("-l", "--list", dest="list_cod", type= "string", @@ -669,19 +615,19 @@ parser.add_option("-d", "--dirout", dest="dirout", type="string", help="write report to PNG files", metavar="FILE") - parser.add_option("-a", "--asite", dest="asite", type="int", - help="Off-set from the 5'end of the footprint to the A-site", metavar="INT") + parser.add_option("-a", "--asite", dest="asite", type="int", default = 15 , + help="Off-set from the 5'end of the footprint to the A-site (default is 15)", metavar="INT") - parser.add_option("-s", "--site", dest="site", type="string", - help="Script can compute in site A, P or E", metavar="A|P|E") + parser.add_option("-s", "--site", dest="site", type="string", default = "A" , + help="Script can compute in site A, P or E (default is A-site)", metavar="A|P|E") - parser.add_option("-r", "--rep", dest="rep", type="string", + parser.add_option("-r", "--rep", dest="rep", type="string", default = "no" , help="if replicate or not", metavar="yes|no") - parser.add_option("-x", "--hex_col1", dest="color1", type= "string", + parser.add_option("-x", "--hex_col1", dest="color1", type= "string", default = "SkyBlue" , help="Color for first condition", metavar="STR") - parser.add_option("-X", "--hex_col2", dest="color2", type= "string", + parser.add_option("-X", "--hex_col2", dest="color2", type= "string", default = "Plum" , help="Color for second condition", metavar="STR") parser.add_option("-q", "--quiet", @@ -702,28 +648,6 @@ if not colors.is_color_like(options.color2) : stop_err( options.color2+' is not a proper color' ) - ## identify GFF or GTF format from 9th column - #with open (options.gff,"r") as gffile : - # for line in gffile : - # if '#' in line : - # ## skip header - # gffile.next() - # elif 'gene_id' in line : - # ## launch gtf reader : - # GFF = ribo_functions.store_gtf(options.gff) - # break - # elif 'ID=' in line : - # ## launch gff reader - # GFF = ribo_functions.store_gff(options.gff) - # break - # else : - # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) - - #GFF = store_gff(options.gff) - #GFF = ribo_functions.store_gtf(options.gff) - ## check gff reading - #if not GFF['order'] : - # stop_err( 'Incorrect GFF file' + str( e ) ) #### NOT USE IN FINAL VERSION # # get codon list