Mercurial > repos > rlegendre > ribo_tools
diff metagene_readthrough.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | 29c9c86e17e1 |
children | 7c944fd9907e |
line wrap: on
line diff
--- a/metagene_readthrough.py Tue Dec 09 09:43:07 2014 -0500 +++ b/metagene_readthrough.py Thu Jan 22 14:34:38 2015 +0100 @@ -21,70 +21,17 @@ from numpy import arange from matplotlib import ticker as t from PIL import Image +import ribo_functions +## suppress matplotlib warnings +import warnings + + def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time()))) sys.exit() - -def store_gff(gff): - ''' - parse and store gff file in a dictionnary - ''' - try: - GFF = {} - with open(gff, 'r') as f_gff : - GFF['order'] = [] - for line in f_gff: - ## switch commented lines - line = line.split("#")[0] - if line != "" : - feature = (line.split('\t')[8]).split(';') - # first line is already gene line : - if line.split('\t')[2] == 'gene' : - gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") - else : - Name = "Unknown" - ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = unquote(str(note)).replace("\n","") - ## store gene information - GFF['order'].append(gene) - GFF[gene] = {} - GFF[gene]['chrom'] = line.split('\t')[0] - GFF[gene]['start'] = int(line.split('\t')[3]) - GFF[gene]['stop'] = int(line.split('\t')[4]) - GFF[gene]['strand'] = line.split('\t')[6] - GFF[gene]['name'] = Name - GFF[gene]['note'] = note - GFF[gene]['exon'] = {} - GFF[gene]['exon_number'] = 0 - #print Name - elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) - if GFF.has_key(gene) : - GFF[gene]['exon_number'] += 1 - exon_number = GFF[gene]['exon_number'] - GFF[gene]['exon'][exon_number] = {} - GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] - GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) - GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - - ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : - if GFF[gene]['strand'] == "+" : - GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] - else : - GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] - return GFF - except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) - - - def compute_rpkm(length,count_gene,count_tot) : @@ -135,7 +82,7 @@ ranges, this in one more than the coordinate of the last base that is considered part of the sequence. strand: The strand, as a single character, '+' or '-'. '.' indicates - that the strand is irrelavant. (Alternatively, pass a Strand object.) + that the strand is irrelevant. (Alternatively, pass a Strand object.) length: The length of the interval, i.e., end - start start_d: The "directional start" position. This is the position of the first base of the interval, taking the strand into account. Hence, @@ -145,14 +92,14 @@ strand=='-1', it is start-1. ''' -def check_overlapping(gff_reader,chrm,start,stop,strand): +def check_overlapping(gff_reader,chrm,start,stop,strand, name): #### probleme avec les genes completement inclu... iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand) for feature in gff_reader: if feature.type == "gene" : - if feature.iv.overlaps(iv2) : + if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name : ## if its a reverse gene, we replace start of extension by start of previous gene if strand == '-' : return (feature.iv.end+3,stop) @@ -200,6 +147,9 @@ def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) : + + ### ignore matplotlib warnings for galaxy + warnings.filterwarnings('ignore') try: strand = gene['strand'] len_gene = gene['stop']-gene['start'] @@ -446,13 +396,13 @@ im = Image.open(infile) im.thumbnail(size, Image.ANTIALIAS) im.save(dirout+"_thumbnail.png", "PNG") - + warnings.resetwarnings() except Exception, e: stop_err( 'Error during gene plotting : ' + str( e ) ) -def compute_analysis(bam, GFF, fasta, gff, dirout) : +def compute_analysis(bam, GFF, fasta, gff, dirout, extend) : try: background_file = dirout+"/background_sequence.fasta" @@ -479,20 +429,46 @@ writer = csv.writer(out,delimiter='\t') writer.writerow(myheader) for gene in GFF['order'] : + #print GFF[gene] + ## maybe no start position in GTF file so we must to check and replace + exon_number = GFF[gene]['exon_number'] + try : GFF[gene]['start'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] + else : + GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop'] + ## also for stop coordinates + try : GFF[gene]['stop'] + except : + if GFF[gene]['strand'] == '+' : + GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop'] + + else : + GFF[gene]['stop'] = GFF[gene]['exon'][1]['start'] + + indic = "" # compute rpkm of CDS : len_cds = GFF[gene]['stop']-GFF[gene]['start'] count_cds = 0 - ### count method of pysam doesn't strand information - if GFF[gene]['strand'] == '+' : - for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : - if not track.is_reverse : - count_cds += 1 - else : - for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : - if track.is_reverse : - count_cds += 1 - rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + rpkm_cds = 0 + count_cds = 0 + try : + ### count method of pysam doesn't strand information + if GFF[gene]['strand'] == '+' : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) : + if not track.is_reverse : + count_cds += 1 + else : + for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) : + if track.is_reverse : + count_cds += 1 + rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot) + except ValueError: + ## genere warning about gtf coordinates + #warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" ) + pass ## Only if gene is translate : if rpkm_cds > 0 and count_cds > 128: ## search footprint in UTR3 @@ -520,7 +496,8 @@ count += 1 ## get sequence of extension seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) - + + ## if we have coverage after cds stop codon if count > 10 : res = find_stop(seq) @@ -530,9 +507,11 @@ ''' ## check if next in-frame codon is far than 90 nt extension : if GFF[gene]['strand'] == '+' : - pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+') + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name']) start_extension = pos[0]-1 stop_extension = pos[1] + #start_extension = GFF[gene]['stop'] + #stop_extension = GFF[gene]['stop']+extend seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension]) #print gene,count,pos,'\n',seq @@ -547,9 +526,11 @@ #print res #stop_extension = start_extension + res +3 else : - pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-') + pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name']) start_extension = pos[0] stop_extension = pos[1]+1 + #start_extension = GFF[gene]['start']-extend + #stop_extension = GFF[gene]['start'] seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement()) #print gene,count,pos,'\n',seq @@ -970,11 +951,6 @@ def __main__(): - ## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/ - #python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans - - #Parse command line options parser = optparse.OptionParser() parser.add_option("-g", "--gff", dest="gff", type= "string", @@ -987,7 +963,10 @@ help="Bam Ribo-Seq alignments ", metavar="FILE") parser.add_option("-d", "--dirout", dest="dirout", type= "string", - help="write report in this html file and in associated directory", metavar="STR,STR") + help="write report in this html file and in associated directory", metavar="STR,STR") + + parser.add_option("-e", "--extend", dest="extend", type= "int", + help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT") parser.add_option("-q", "--quiet", action="store_false", dest="verbose", default=True, @@ -1004,10 +983,30 @@ os.mkdir(subfolder) except: raise - - GFF = store_gff(options.gff) + ## identify GFF or GTF format from 9th column + with open (options.gff,"r") as gffile : + for line in gffile : + if '#' in line : + ## skip header + gffile.next() + elif 'gene_id' in line : + ## launch gtf reader : + GFF = ribo_functions.store_gtf(options.gff) + break + elif 'ID=' in line : + ## launch gff reader + GFF = ribo_functions.store_gff(options.gff) + break + else : + stop_err( 'Please check your annotation file is in correct format, GFF or GTF' ) + + #GFF = store_gff(options.gff) + #GFF = ribo_functions.store_gtf(options.gff) + ## check gff reading + if not GFF['order'] : + stop_err( 'Incorrect GFF file' + str( e ) ) clean_file = cleaning_bam(options.bamfile) - compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder) + compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend) if os.path.exists( clean_file ): os.remove( clean_file )