Mercurial > repos > rlegendre > ribo_tools
changeset 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | d7739f797a26 |
children | 313b8f7d2a92 |
files | get_codon_frequency.py kmer_analysis.py metagene_frameshift_analysis.py metagene_readthrough.py ribo_functions.py |
diffstat | 5 files changed, 370 insertions(+), 259 deletions(-) [+] |
line wrap: on
line diff
--- a/get_codon_frequency.py Tue Dec 09 09:43:07 2014 -0500 +++ b/get_codon_frequency.py Thu Jan 22 14:34:38 2015 +0100 @@ -25,8 +25,10 @@ import csv from scipy import stats from collections import OrderedDict +import ribo_functions +import HTSeq # #libraries for debugg -# import pdb +import pdb # import cPickle def stop_err(msg): @@ -34,7 +36,6 @@ sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() - def store_gff(gff): ''' parse and store gff file in a dictionnary @@ -98,8 +99,7 @@ #om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified #chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified #chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified - - + def init_codon_dict(): @@ -115,17 +115,28 @@ ''' try: codon = init_codon_dict() - - for chrom in GFF.iterkeys(): - for gene in GFF[chrom] : + for feature in GFF : + if feature.type == 'gene' : codon_dict = init_codon_dict() - start = GFF[chrom][gene]['start'] - stop = GFF[chrom][gene]['stop'] + chrom = feature.iv.chrom + start = feature.iv.start + stop = feature.iv.end region = chrom + ':' + str(start) + '-' + str(stop) + + ## DEPRECATED + #for chrom in GFF.iterkeys(): + #for gene in GFF[chrom] : + # codon_dict = init_codon_dict() + #start = GFF[chrom][gene]['start'] + #print start + #stop = GFF[chrom][gene]['stop'] + #print stop + #region = chrom + ':' + str(start) + '-' + str(stop) + ####### # #get all reads in this gene reads = subprocess.check_output(["samtools", "view", bamfile, region]) head = subprocess.check_output(["samtools", "view", "-H", bamfile]) - read_tab = reads.split('\n') + read_tab = reads.split('\n') for read in read_tab: # # search mapper for eliminate multiple alignements if 'bowtie' in head: @@ -136,7 +147,6 @@ stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") if len(read) == 0: continue - len_read = len(read.split('\t')[9]) # if it's read of good length if len_read == kmer and multi_tag not in read: @@ -179,6 +189,8 @@ stop_err('Error during codon usage calcul: ' + str(e)) + + ''' http://pyinsci.blogspot.fr/2009/09/violin-plot-with-matplotlib.html ''' @@ -491,7 +503,7 @@ # # plot amino acid profile : fig = pl.figure(num=1) - width = .35 + width = .45 ax = fig.add_subplot(111) ind = arange(21) pl.xlim(0, 21) @@ -503,7 +515,7 @@ ax.bar(ind + width, cond2_aa, width, facecolor=color2, label=c2) #for x, y, z in zip(ind, max_val, aa_name): # ax.text(x + width, y + 0.2, '%s' % z, ha='center', va='bottom', fontsize=14) - axis_font = {'size':'16'} + axis_font = {'size':'10'} pl.xticks(ind + width, aa_name,**axis_font) ax.spines['right'].set_visible(False) ax.spines['top'].set_visible(False) @@ -513,7 +525,7 @@ ax.set_ylabel('Ribosome Occupancy (percent of normalized reads)',**axis_font) ax.set_xlabel('Amino Acids', **axis_font) handles, labels = ax.get_legend_handles_labels() - font_prop = font_manager.FontProperties(size=12) + font_prop = font_manager.FontProperties(size=8) ax.legend(handles, labels, prop=font_prop) pl.savefig(dirout + '/hist_amino_acid.png', format="png", dpi=340) pl.clf() @@ -534,8 +546,9 @@ max_val.append(max(cond1_norm[i], cond2_norm[i])) # plot result - fig = pl.figure(figsize=(24, 10), num=1) - width = .50 + fig = pl.figure(figsize=(30, 10), num=1) + #fig = pl.figure(num=1) + width = .40 ind = arange(len(codon_sorted)) ax = fig.add_subplot(111) pl.xlim(0, len(codon_sorted) + 1) @@ -625,10 +638,7 @@ # raise Exception def __main__(): - ''' - python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -1 psiM1_sorted.bam,psiM2_sorted.bam -2 psiP1_sorted.bam,psiP2_sorted.bam -c psiM -C psiP -l TAG,TAA,TGA -r yes -o psi_count -d psi.html,html_dir > log2 - python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/get_codon_frequency.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -g Saccer3.gff -t tAI.csv -c psiM -C psiP -1 RPF_psi-_28sorted.bam -2 RPF_psi+_28sorted.bam -l TAG,TAA,TGA -n Stop Codon -r no -o psi_count -d psi.html,html_dir > log2 - ''' + # Parse command line options parser = optparse.OptionParser() @@ -648,7 +658,7 @@ help="Name of second condition", metavar="STR") parser.add_option("-k", "--kmer", dest="kmer", type="int", - help="Longer of your phasing reads", metavar="INT") + help="Length of your phasing reads", metavar="INT") # parser.add_option("-l", "--list", dest="list_cod", type= "string", # help="list of codons to compare to other", metavar="STR") @@ -692,13 +702,34 @@ if not colors.is_color_like(options.color2) : stop_err( options.color2+' is not a proper color' ) - GFF = store_gff(options.gff) + ## identify GFF or GTF format from 9th column + #with open (options.gff,"r") as gffile : + # for line in gffile : + # if '#' in line : + # ## skip header + # gffile.next() + # elif 'gene_id' in line : + # ## launch gtf reader : + # GFF = ribo_functions.store_gtf(options.gff) + # break + # elif 'ID=' in line : + # ## launch gff reader + # GFF = ribo_functions.store_gff(options.gff) + # break + # else : + # stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + #if not GFF['order'] : + # stop_err( 'Incorrect GFF file' + str( e ) ) #### NOT USE IN FINAL VERSION # # get codon list # codons = options.list_cod.upper().split(',') # check_codons_list(codons) - + GFF = HTSeq.GFF_Reader(options.gff) # # get html file and directory : (html, html_dir) = options.dirout.split(',') if os.path.exists(html_dir):
--- a/kmer_analysis.py Tue Dec 09 09:43:07 2014 -0500 +++ b/kmer_analysis.py Thu Jan 22 14:34:38 2015 +0100 @@ -14,9 +14,13 @@ matplotlib.use('Agg') import matplotlib.pyplot as pl from numpy import arange +from collections import OrderedDict import ribo_functions -from collections import OrderedDict #import cPickle +## suppress matplotlib warnings +import warnings +warnings.filterwarnings('ignore') + total_mapped_read = 0 @@ -25,9 +29,51 @@ sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) ) sys.exit() - + + +def split_bam(bamfile,tmpdir): + ''' + split bam by chromosome and write sam file in tmpdir + ''' + try: + #get header + results = subprocess.check_output(['samtools', 'view', '-H',bamfile]) + header = results.split('\n') + + #define genome size + genome = [] + for line in header: + result = re.search('SN', line) + if result : + #print line + feat = line.split('\t') + chrom = re.split(":", feat[1]) + #print feat[1] + genome.append(chrom[1]) + + #split sam by chrom + n = 0 + for chrm in genome: + with open(tmpdir+'/'+chrm+'.sam', 'w') as f : + #write header correctly for each chromosome + f.write(header[0]+'\n') + expr = re.compile(chrm+'\t') + el =[elem for elem in header if expr.search(elem)][0] + f.write(el+'\n') + f.write(header[-2]+'\n') + #write all reads for each chromosome + reads = subprocess.check_output(["samtools", "view", bamfile, chrm]) + f.write(reads) + # calculate number of reads + n += reads.count(chrm) + sys.stdout.write("%d reads are presents in your bam file\n" % n) + + except Exception, e: + stop_err( 'Error during bam file splitting : ' + str( e ) ) + + def get_first_base(tmpdir, kmer): ''' write footprint coverage file for each sam file in tmpdir and get kmer distribution @@ -47,18 +93,20 @@ for line in sam: #initialize dictionnary - if '@SQ' in line : + if '@SQ\tSN:' in line : size = int(line.split('LN:')[1]) genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif '@PG' in line : + elif '@PG\tID' in line : if 'bowtie' in line: multi_tag = "XS:i:" elif 'bwa' in line: multi_tag = "XT:A:R" + #elif 'TopHat' in line: + # multi_tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping") # get footprint elif re.search('^[^@].+', line) : @@ -128,33 +176,37 @@ whole_phasing = [0,0,0] for gene in GFF['order']: ## maybe no start position in GTF file so we must to check and replace + exon_number = GFF[gene]['exon_number'] try : GFF[gene]['start'] except : if GFF[gene]['strand'] == '+' : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : - GFF[gene]['start'] = GFF[gene]['exon'][1]['stop'] + GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] ## also for stop coordinates try : GFF[gene]['stop'] except : - exon_number = GFF[gene]['exon_number'] if GFF[gene]['strand'] == '+' : GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] else : - GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start'] + GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] cov = [] ##first chromosome : we open corresponding file - if chrom == "" : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() - ##if we change chrosomosome - elif chrom != GFF[gene]['chrom'] : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() - + try: + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chromosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + except IOError : + print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" + + ## if a gene without intron : if GFF[gene]['exon_number'] == 1: @@ -172,14 +224,14 @@ for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - if i <= GFF[gene]['stop'] : - cov.append(int((data[i].rstrip()).split("\t")[0])) + #if i <= GFF[gene]['stop'] : + cov.append(int((data[i].rstrip()).split("\t")[0])) else : for exon in range(1,GFF[gene]['exon_number']+1) : for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1): - if i <= GFF[gene]['start'] : - cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) + #if i <= GFF[gene]['start'] : + cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-",""))) cov.reverse() len_cov = len(cov) @@ -218,6 +270,7 @@ pl.title('Number of reads for each k-mer') pl.xticks(index + bar_width, label, **axis_font) #pl.show() + fig.subplots_adjust() pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640) pl.clf() @@ -231,10 +284,11 @@ pl.ylabel('Percent of read', **axis_font) pl.title('Proportion of reads in each frame for '+str(key)+'-mer') pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font) - pl.tight_layout() + #pl.tight_layout() pl.ylim(0,100) + fig.subplots_adjust() pl.draw() - #pl.show() + pl.show() pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300) pl.clf() @@ -288,7 +342,7 @@ #Parse command line options parser = optparse.OptionParser() - parser.add_option("-g", "--gff", dest="gfffile", type= "string", + parser.add_option("-g", "--gff", dest="gff", type= "string", help="GFF annotation file", metavar="FILE") parser.add_option("-b", "--bam", dest="bamfile", type= "string", @@ -325,9 +379,30 @@ returncode = proc.wait() tmpdir = tempfile.mkdtemp() - GFF = ribo_functions.store_gff(options.gfffile) + ## identify GFF or GTF format from 9th column + with open (options.gff,"r") as gffile : + for line in gffile : + if '#' in line : + ## skip header + gffile.next() + elif 'gene_id' in line : + ## launch gtf reader : + GFF = ribo_functions.store_gtf(options.gff) + break + elif 'ID=' in line : + ## launch gff reader + GFF = ribo_functions.store_gff(options.gff) + break + else : + stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + if not GFF['order'] : + stop_err( 'Incorrect GFF file' + str( e ) ) + ## split bam - ribo_functions.split_bam(options.bamfile,tmpdir) + split_bam(options.bamfile,tmpdir) ################################### ## First analysis with 28mer : ################################### @@ -351,6 +426,7 @@ tmp = get_first_base(tmpdir, keys) ## compute phasing whole_phasing = frame_analysis(tmpdir,GFF) + results[keys] = whole_phasing ## get report make_report(options.html_file, options.dirout, kmer, results)
--- a/metagene_frameshift_analysis.py Tue Dec 09 09:43:07 2014 -0500 +++ b/metagene_frameshift_analysis.py Thu Jan 22 14:34:38 2015 +0100 @@ -11,15 +11,16 @@ import os, sys, optparse, tempfile, subprocess, re, shutil, commands, urllib, time, csv import math from numpy import arange -#from matplotlib import pyplot as pl -import matplotlib -matplotlib.use('Agg') -import matplotlib.pyplot as pl +from matplotlib import pyplot as pl +#import matplotlib +#matplotlib.use('Agg') +#import matplotlib.pyplot as pl import itertools from Bio import SeqIO from Bio.Seq import Seq from Bio.Alphabet import IUPAC from PIL import Image +import ribo_functions ##libraries for debugg #import cPickle #import pdb @@ -81,12 +82,15 @@ -def get_first_base(tmpdir, kmer): +def get_first_base(tmpdir, kmer, frame): ''' write footprint coverage file for each sam file in tmpdir ''' global total_mapped_read try: + ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint) + p_site_pos = 16-frame + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') ##write coverage for each sam file in tmpdir for samfile in file_array: @@ -123,12 +127,14 @@ #if it's a forward read if read_sens == 0 : #get P-site : start read + 12 nt - read_pos += 15 + #read_pos += 15 + read_pos += p_site_pos genomeF[read_pos] += 1 #if it's a reverse read elif read_sens == 16 : #get P-site - read_pos += 12 + #read_pos += 12 + read_pos += (len_read-1) - p_site_pos genomeR[read_pos] += 1 #try: @@ -145,74 +151,6 @@ stop_err( 'Error during footprinting : ' + str( e ) ) - - - -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = {} - with open(gff, 'r') as f_gff : - GFF['order'] = [] - for line in f_gff: - ## switch commented lines - line = line.split("#")[0] - if line != "" : - feature = (line.split('\t')[8]).split(';') - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") - else : - Name = "Unknown" - ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n","") - ## store gene information - GFF['order'].append(gene) - GFF[gene] = {} - GFF[gene]['chrom'] = line.split('\t')[0] - GFF[gene]['start'] = int(line.split('\t')[3]) - GFF[gene]['stop'] = int(line.split('\t')[4]) - GFF[gene]['strand'] = line.split('\t')[6] - GFF[gene]['name'] = Name - GFF[gene]['note'] = note - GFF[gene]['exon'] = {} - GFF[gene]['exon_number'] = 0 - #print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) - if GFF.has_key(gene) : - GFF[gene]['exon_number'] += 1 - exon_number = GFF[gene]['exon_number'] - GFF[gene]['exon'][exon_number] = {} - GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - - ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[gene]['strand'] == "+" : - GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] - else : - GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] - return GFF - except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) - - -#chrI SGD gene 87286 87752 . + . ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689 -#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra -#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr -#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified -#chrI SGD CDS 87286 87387 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified -#chrI SGD CDS 87501 87752 . + 0 Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified -#chrI SGD intron 87388 87500 . + . Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified - - def __percent__(prop): if sum(prop) != 0 : @@ -228,7 +166,7 @@ return prop -def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file): +def frame_analysis(tmpdir,subfolder,cutoff,GFF,box_file, orf_length): ''' This function take footprint and annotation (gff) for analyse reading frame in each gene ''' @@ -237,6 +175,7 @@ try: chrom = "" # initializing chromosome nb_gene = 0 # number of analysed genes + mean_orf = 0 myheader = ['Gene','Name','Total_proportion','Dual_coding','RPKM','Segment_RPKM','Note'] with open(subfolder+'/dual_coding.tab',"w") as out : @@ -247,15 +186,18 @@ for gene in GFF['order']: cov = [] ##first chromosome : we open corresponding file - if chrom == "" : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() - ##if we change chrosomosome - elif chrom != GFF[gene]['chrom'] : - chrom = GFF[gene]['chrom'] - with open(tmpdir+"/assoCov_"+chrom+".txt") as f : - data = f.readlines() + try: + if chrom == "" : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + ##if we change chromosome + elif chrom != GFF[gene]['chrom'] : + chrom = GFF[gene]['chrom'] + with open(tmpdir+"/assoCov_"+chrom+".txt") as f : + data = f.readlines() + except IOError : + print tmpdir+"/assoCov_"+chrom+".txt doesn't exist" ## if a gene without intron : if GFF[gene]['exon_number'] == 1: @@ -283,13 +225,20 @@ cov.reverse() len_cov = len(cov) - #segmentation by gene size - if len_cov < 1000 : - nb_segment = 3 - elif len_cov > 1000 and len_cov < 2000: - nb_segment = 6 - else : - nb_segment = 9 + ## segmentation by orf_length + nb_segment = int(len_cov/orf_length) + ## too short segment gestion + if len_cov < orf_length : + nb_segment = 1 + +# #segmentation by gene size +# if len_cov < 1000 : +# nb_segment = 3 +# elif len_cov > 1000 and len_cov < 2000: +# nb_segment = 6 +# else : +# nb_segment = 9 + ## number of aa in gene : nb_aa = len_cov/3 len_frag = len_cov/nb_segment @@ -302,6 +251,8 @@ GFF[gene]['dual_coding'] = '-' ''' segmentation et total proportions ''' if sum(cov) > MIN_COV_REQUIREMENT: + ## compute mean of ORF length + mean_orf += len_cov nb_gene += 1 for nuc in range(0,len_cov-2,3) : window += 1 @@ -356,6 +307,7 @@ whole_phasing = __percent__(whole_phasing) sys.stdout.write("Proportion of reads in each frame :\n%s\n" % whole_phasing) + sys.stdout.write("Mean length of ORF in your organisme : %s\n" % int(mean_orf/nb_gene)) sys.stdout.write("%d genes are used for frame analysis\n" % nb_gene) ##produce boxplot @@ -363,8 +315,9 @@ pl.ylabel('Total number of footprints (percent)') pl.draw() pl.savefig(subfolder+'/boxplot.png',format='png', dpi=640) - pl.savefig(box_file, format='png', dpi=640) - pl.clf() + #pl.savefig(box_file, format='png', dpi=640) + + return GFF except Exception, e: @@ -790,13 +743,9 @@ def __main__(): - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/BY_project/BY_BAM/LMA830_sort.bam -o /data/BY_project/dual_coding_lma.csv -d /data/BY_project/LMA_dual/index.html,/data/BY_project/LMA_dual -c 60 - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_28mer.bam -o /home/rlegendre/Documents/PDE2S/dual_coding_pde2s.csv -d /home/rlegendre/Documents/PDE2S/index.html,/home/rlegendre/Documents/PDE2S/PDE2S_dual -c 60 - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_frameshift_analysis.py -i /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/RPF_psi+_28sorted.bam -o /home/rlegendre/Documents/test_rpkmlesszero.tab -d /home/rlegendre/Documents/index.html,/home/rlegendre/Documents/plouf -c 60 - #Parse command line options parser = optparse.OptionParser() - parser.add_option("-i", "--input", dest="gfffile", type= "string", + parser.add_option("-g", "--gff", dest="gff", type= "string", help="GFF annotation file", metavar="FILE") parser.add_option("-b", "--bam", dest="bamfile", type= "string", @@ -809,13 +758,19 @@ help="write report in this html file and in associated directory", metavar="STR,STR") parser.add_option("-x", "--boxplot", dest="box", type= "string", - help="report boxplot in this file", metavar="STR,STR") + help="report boxplot in this file", metavar="STR") parser.add_option("-c", "--cutoff", dest="cutoff", type= "int", help="Integer value for selecting all genes that have less than 60 % (default) of reads in coding frame ", metavar="INT") parser.add_option("-k", "--kmer", dest="kmer", type= "int", - help="Longer of your phasing reads", metavar="INT") + help="Longer of your phasing reads", metavar="INT") + + parser.add_option("-o", "--orf_length", dest="orf_length", type= "int", + help="Mean of ORF's length for segmentation", metavar="INT") + + parser.add_option("-p", "--frame", dest="frame", type="int", + help="Script can compute in frame 1, 2 or 3", metavar="1|2|3") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, @@ -835,11 +790,41 @@ cmd = "samtools index %s " % (options.bamfile) proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) returncode = proc.wait() - + + if os.path.exists(subfolder): + raise + try: + os.mkdir(subfolder) + except: + raise + + ## check frame arg + if options.frame not in [1,2,3]: + stop_err( 'Please enter a good value for frame argument : must be 1, 2 or 3' ) + ##RUN analysis split_bam(options.bamfile,tmp_dir) - get_first_base (tmp_dir,options.kmer) - GFF = store_gff(options.gfffile) + get_first_base (tmp_dir,options.kmer,options.frame) + ## identify GFF or GTF format from 9th column + with open (options.gff,"r") as gffile : + for line in gffile : + if '#' in line : + ## skip header + gffile.next() + elif 'gene_id' in line : + ## launch gtf reader : + GFF = ribo_functions.store_gtf(options.gff) + break + elif 'ID=' in line : + ## launch gff reader + GFF = ribo_functions.store_gff(options.gff) + break + else : + stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + #GFF = store_gff(options.gfffile) + ## check gff reading + if not GFF['order'] : + stop_err( 'Incorrect GFF file' + str( e ) ) # #### cPickles for Test #### #if os.path.isfile("/home/rlegendre/Documents/FrameShift/OutOfFrameWithIntron/pyt_dict") : @@ -855,13 +840,7 @@ # #### cPickles for Test #### - if os.path.exists(subfolder): - raise - try: - os.mkdir(subfolder) - except: - raise - GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box) + GFF = frame_analysis (tmp_dir,subfolder,options.cutoff,GFF, options.box, options.orf_length) plot_subprofile (GFF,subfolder,options.fasta) write_html_page(html_file,subfolder,GFF)
--- a/metagene_readthrough.py Tue Dec 09 09:43:07 2014 -0500 +++ b/metagene_readthrough.py Thu Jan 22 14:34:38 2015 +0100 @@ -21,70 +21,17 @@ from numpy import arange from matplotlib import ticker as t from PIL import Image +import ribo_functions +## suppress matplotlib warnings +import warnings + + def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() - -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = {} - with open(gff, 'r') as f_gff : - GFF['order'] = [] - for line in f_gff: - ## switch commented lines - line = line.split("#")[0] - if line != "" : - feature = (line.split('\t')[8]).split(';') - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") - else : - Name = "Unknown" - ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = unquote(str(note)).replace("\n","") - ## store gene information - GFF['order'].append(gene) - GFF[gene] = {} - GFF[gene]['chrom'] = line.split('\t')[0] - GFF[gene]['start'] = int(line.split('\t')[3]) - GFF[gene]['stop'] = int(line.split('\t')[4]) - GFF[gene]['strand'] = line.split('\t')[6] - GFF[gene]['name'] = Name - GFF[gene]['note'] = note - GFF[gene]['exon'] = {} - GFF[gene]['exon_number'] = 0 - #print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) - if GFF.has_key(gene) : - GFF[gene]['exon_number'] += 1 - exon_number = GFF[gene]['exon_number'] - GFF[gene]['exon'][exon_number] = {} - GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - - ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[gene]['strand'] == "+" : - GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] - else : - GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] - return GFF - except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) - - - def compute_rpkm(length,count_gene,count_tot) : @@ -135,7 +82,7 @@ ranges, this in one more than the coordinate of the last base that is considered part of the sequence. strand: The strand, as a single character, '+' or '-'. '.' indicates - that the strand is irrelavant. (Alternatively, pass a Strand object.) + that the strand is irrelevant. (Alternatively, pass a Strand object.) length: The length of the interval, i.e., end - start start_d: The "directional start" position. This is the position of the first base of the interval, taking the strand into account. Hence, @@ -145,14 +92,14 @@ strand=='-1', it is start-1. ''' -def check_overlapping(gff_reader,chrm,start,stop,strand): +def check_overlapping(gff_reader,chrm,start,stop,strand, name): #### probleme avec les genes completement inclu... iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) for feature in gff_reader: if feature.type == "gene" : - if feature.iv.overlaps(iv2) : + if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name : ## if its a reverse gene, we replace start of extension by start of previous gene if strand == '-' : return (feature.iv.end+3,stop) @@ -200,6 +147,9 @@ def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : + + ### ignore matplotlib warnings for galaxy + warnings.filterwarnings('ignore') try: strand = gene['strand'] len_gene = gene['stop']-gene['start'] @@ -446,13 +396,13 @@ im = Image.open(infile) im.thumbnail(size, Image.ANTIALIAS) im.save(dirout+"_thumbnail.png", "PNG") - + warnings.resetwarnings() except Exception, e: stop_err( 'Error during gene plotting : ' + str( e ) ) -def compute_analysis(bam, GFF, fasta, gff, dirout) : +def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : try: background_file = dirout+"/background_sequence.fasta" @@ -479,20 +429,46 @@ writer = csv.writer(out,delimiter='\t') writer.writerow(myheader) for gene in GFF['order'] : + #print GFF[gene] + ## maybe no start position in GTF file so we must to check and replace + exon_number = GFF[gene]['exon_number'] + try : GFF[gene]['start'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] + ## also for stop coordinates + try : GFF[gene]['stop'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + + else : + GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] + + indic = "" # compute rpkm of CDS : len_cds = GFF[gene]['stop']-GFF[gene]['start'] count_cds = 0 - ### count method of pysam doesn't strand information - if GFF[gene]['strand'] == '+' : - for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : - if not track.is_reverse : - count_cds += 1 - else : - for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : - if track.is_reverse : - count_cds += 1 - rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + rpkm_cds = 0 + count_cds = 0 + try : + ### count method of pysam doesn't strand information + if GFF[gene]['strand'] == '+' : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : + if not track.is_reverse : + count_cds += 1 + else : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : + if track.is_reverse : + count_cds += 1 + rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + except ValueError: + ## genere warning about gtf coordinates + #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) + pass ## Only if gene is translate : if rpkm_cds > 0 and count_cds > 128: ## search footprint in UTR3 @@ -520,7 +496,8 @@ count += 1 ## get sequence of extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) - + + ## if we have coverage after cds stop codon if count > 10 : res = find_stop(seq) @@ -530,9 +507,11 @@ ''' ## check if next in-frame codon is far than 90 nt extension : if GFF[gene]['strand'] == '+' : - pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name']) start_extension = pos[0]-1 stop_extension = pos[1] + #start_extension = GFF[gene]['stop'] + #stop_extension = GFF[gene]['stop']+extend seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) #print gene,count,pos,'\n',seq @@ -547,9 +526,11 @@ #print res #stop_extension = start_extension + res +3 else : - pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name']) start_extension = pos[0] stop_extension = pos[1]+1 + #start_extension = GFF[gene]['start']-extend + #stop_extension = GFF[gene]['start'] seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) #print gene,count,pos,'\n',seq @@ -970,11 +951,6 @@ def __main__(): - ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/ - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans - - #Parse command line options parser = optparse.OptionParser() parser.add_option("-g", "--gff", dest="gff", type= "string", @@ -987,7 +963,10 @@ help="Bam Ribo-Seq alignments ", metavar="FILE") parser.add_option("-d", "--dirout", dest="dirout", type= "string", - help="write report in this html file and in associated directory", metavar="STR,STR") + help="write report in this html file and in associated directory", metavar="STR,STR") + + parser.add_option("-e", "--extend", dest="extend", type= "int", + help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, @@ -1004,10 +983,30 @@ os.mkdir(subfolder) except: raise - - GFF = store_gff(options.gff) + ## identify GFF or GTF format from 9th column + with open (options.gff,"r") as gffile : + for line in gffile : + if '#' in line : + ## skip header + gffile.next() + elif 'gene_id' in line : + ## launch gtf reader : + GFF = ribo_functions.store_gtf(options.gff) + break + elif 'ID=' in line : + ## launch gff reader + GFF = ribo_functions.store_gff(options.gff) + break + else : + stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + if not GFF['order'] : + stop_err( 'Incorrect GFF file' + str( e ) ) clean_file = cleaning_bam(options.bamfile) - compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder) + compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) if os.path.exists( clean_file ): os.remove( clean_file )
--- a/ribo_functions.py Tue Dec 09 09:43:07 2014 -0500 +++ b/ribo_functions.py Thu Jan 22 14:34:38 2015 +0100 @@ -8,6 +8,8 @@ ''' import sys, subprocess, re, commands, time, urllib +from copy import copy + def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) @@ -154,7 +156,7 @@ GFF[gene]['exon_number'] = 0 #print Name elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) + gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -184,7 +186,7 @@ def store_gtf(gff): ''' - parse and store gtf file in a dictionnary (DEPRECATED) + parse and store gtf file in a dictionnary ''' try: GFF = {} @@ -195,11 +197,11 @@ line = line.split("#")[0] if line != "" : # first line is already gene line : - if line.split('\t')[1] == 'protein_coding' : + if 'protein_coding' in line : ##get name - gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() - Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip() - if line.split('\t')[2] == 'exon' : + gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() + Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() + if line.split('\t')[2] == 'CDS' : ##if its first time we get this gene if gene not in GFF.keys() : ## store gene information @@ -207,35 +209,41 @@ GFF[gene] = {} GFF[gene]['chrom'] = line.split('\t')[0] GFF[gene]['strand'] = line.split('\t')[6] + GFF[gene]['start'] = int(line.split('\t')[3]) + GFF[gene]['stop'] = int(line.split('\t')[4]) GFF[gene]['name'] = Name + GFF[gene]['note'] = "" GFF[gene]['exon_number'] = 1 GFF[gene]['exon'] = {} - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + ## some exons are non codant + exon_number = 1 GFF[gene]['exon'][exon_number] = {} GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) else : ## we add exon - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + exon_number += 1 GFF[gene]['exon_number'] = exon_number GFF[gene]['exon'][exon_number] = {} GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - elif line.split('\t')[2] == 'CDS' : - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #elif line.split('\t')[2] == 'CDS' : + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] elif line.split('\t')[2] == 'start_codon' : if GFF[gene]['strand'] == '-' : - GFF[gene]['start'] = int(line.split('\t')[4]) + GFF[gene]['stop'] = int(line.split('\t')[4]) else : GFF[gene]['start'] = int(line.split('\t')[3]) elif line.split('\t')[2] == 'stop_codon' : if GFF[gene]['strand'] == '-' : - GFF[gene]['stop'] = int(line.split('\t')[3]) + GFF[gene]['start'] = int(line.split('\t')[3]) else : GFF[gene]['stop'] = int(line.split('\t')[4]) - return GFF + return __reverse_coordinates__(GFF) except Exception,e: stop_err( 'Error during gff storage : ' + str( e ) ) @@ -252,8 +260,26 @@ ## protein_id "YDL083C"; ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " ##RPS16B"; +def __reverse_coordinates__(GFF): + + for gene in GFF['order']: + ## for reverse gene + if GFF[gene]['strand'] == "-": + ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates + if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 : + tmp = copy(GFF[gene]['exon']) + exon_number = GFF[gene]['exon_number'] + rev_index = exon_number+1 + for z in range(1,exon_number+1): + rev_index -= 1 + GFF[gene]['exon'][z] = tmp[rev_index] + ## check start + if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']: + GFF[gene]['exon'][1]['start'] = GFF[gene]['start'] + return GFF + def cleaning_bam(bam): '''