ribo_tools: metagene_readthrough.py comparison

comparison metagene_readthrough.py @ 10:707807fee542

(none)

author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	29c9c86e17e1
children	7c944fd9907e

comparison

equal deleted inserted replaced

-:d7739f797a26
+:707807fee542
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from numpy import arange
 from matplotlib import ticker as t
 from PIL import Image
+import ribo_functions
+## suppress matplotlib warnings
+import warnings
 def stop_err( msg ):
 sys.stderr.write( "%s\n" % msg )
 sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
 sys.exit()
-def store_gff(gff):
-'''
-parse and store gff file in a dictionnary
-'''
-try:
-GFF = {}
-with open(gff, 'r') as f_gff :
-GFF['order'] = []
-for line in f_gff:
-## switch commented lines
-line = line.split("#")[0]
-if line != "" :
-feature = (line.split('\t')[8]).split(';')
-# first line is already gene line :
-if line.split('\t')[2] == 'gene' :
-gene = feature[0].replace("ID=","")
-if re.search('gene',feature[2]) :
-Name = feature[2].replace("gene=","")
-else :
-Name = "Unknown"
-##get annotation
-note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-note = unquote(str(note)).replace("\n","")
-## store gene information
-GFF['order'].append(gene)
-GFF[gene] = {}
-GFF[gene]['chrom'] = line.split('\t')[0]
-GFF[gene]['start'] = int(line.split('\t')[3])
-GFF[gene]['stop'] = int(line.split('\t')[4])
-GFF[gene]['strand'] = line.split('\t')[6]
-GFF[gene]['name'] = Name
-GFF[gene]['note'] = note
-GFF[gene]['exon'] = {}
-GFF[gene]['exon_number'] = 0
-#print Name
-elif line.split('\t')[2] == 'CDS' :
-gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
-if GFF.has_key(gene) :
-GFF[gene]['exon_number'] += 1
-exon_number = GFF[gene]['exon_number']
-GFF[gene]['exon'][exon_number] = {}
-GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
-GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
-GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
-## if there is a five prim UTR intron, we change start of gene
-elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
-if GFF[gene]['strand'] == "+" :
-GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
-else :
-GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
-return GFF
-except Exception,e:
-stop_err( 'Error during gff storage : ' + str( e ) )
 def compute_rpkm(length,count_gene,count_tot) :
 try :
 rpkm = "{0:.4f}".format(count_gene*1000000.0/(count_tot*length))
 Note that all positions should be given as 0-based value!
 end: The end of the interval. Following Python convention for
 ranges, this in one more than the coordinate of the last base
 that is considered part of the sequence.
 strand: The strand, as a single character, '+' or '-'. '.' indicates
-that the strand is irrelavant. (Alternatively, pass a Strand object.)
+that the strand is irrelevant. (Alternatively, pass a Strand object.)
 length: The length of the interval, i.e., end - start
 start_d: The "directional start" position. This is the position of the
 first base of the interval, taking the strand into account. Hence,
 this is the same as 'start' except when strand == '-', in which
 case it is end-1.
 end_d: The "directional end": Usually, the same as 'end', but for
 strand=='-1', it is start-1.
 '''
-def check_overlapping(gff_reader,chrm,start,stop,strand):
+def check_overlapping(gff_reader,chrm,start,stop,strand, name):
 #### probleme avec les genes completement inclu...
 iv2 = HTSeq.GenomicInterval(chrm,start,stop,strand)
 for feature in gff_reader:
 if feature.type == "gene" :
-if feature.iv.overlaps(iv2) :
+if feature.iv.overlaps(iv2) and feature.attr.get('gene_name') != name :
 ## if its a reverse gene, we replace start of extension by start of previous gene
 if strand == '-' :
 return (feature.iv.end+3,stop)
 ## else we replace stop of extension by start of following gene
 else :
 else :
 return True
 def plot_gene ( aln, gene, start_extension, stop_extension, dirout ) :
+### ignore matplotlib warnings for galaxy
+warnings.filterwarnings('ignore')
 try:
 strand = gene['strand']
 len_gene = gene['stop']-gene['start']
 if strand is "-" :
 len_ext = gene['stop']-start_extension
 infile = dirout+'.png'
 size = 128, 128
 im = Image.open(infile)
 im.thumbnail(size, Image.ANTIALIAS)
 im.save(dirout+"_thumbnail.png", "PNG")
+warnings.resetwarnings()
 except Exception, e:
 stop_err( 'Error during gene plotting : ' + str( e ) )
-def compute_analysis(bam, GFF, fasta, gff, dirout) :
+def compute_analysis(bam, GFF, fasta, gff, dirout, extend) :
 try:
 background_file = dirout+"/background_sequence.fasta"
 file_back = open(background_file, 'w')
 file_context = open(dirout+"/stop_context.fasta", 'w')
 with open(dirout+"/readthrough_result.csv","w") as out :
 myheader = ['Gene','Name', 'FAIL', 'Stop context','chrom','start extension','stop extension','length extension','RPKM CDS', 'RPKM extension','ratio','Annotation','sequence']
 writer = csv.writer(out,delimiter='\t')
 writer.writerow(myheader)
 for gene in GFF['order'] :
+#print GFF[gene]
+## maybe no start position in GTF file so we must to check and replace
+exon_number = GFF[gene]['exon_number']
+try : GFF[gene]['start']
+except :
+if GFF[gene]['strand'] == '+' :
+GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+else :
+GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
+## also for stop coordinates
+try : GFF[gene]['stop']
+except :
+if GFF[gene]['strand'] == '+' :
+GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+else :
+GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
 indic = ""
 # compute rpkm of CDS :
 len_cds = GFF[gene]['stop']-GFF[gene]['start']
 count_cds = 0
-### count method of pysam doesn't strand information
+rpkm_cds = 0
-if GFF[gene]['strand'] == '+' :
+count_cds = 0
-for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
+try :
-if not track.is_reverse :
+### count method of pysam doesn't strand information
-count_cds += 1
+if GFF[gene]['strand'] == '+' :
-else :
+for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+12,GFF[gene]['stop']-15) :
-for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
+if not track.is_reverse :
-if track.is_reverse :
+count_cds += 1
-count_cds += 1
+else :
-rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+for track in aln.fetch(GFF[gene]['chrom'],GFF[gene]['start']+15,GFF[gene]['stop']-12) :
+if track.is_reverse :
+count_cds += 1
+rpkm_cds = compute_rpkm(len_cds,count_cds,count_tot)
+except ValueError:
+## genere warning about gtf coordinates
+#warnings.warn("Please check coordinates in GFF : maybe stop or start codon are missing" )
+pass
 ## Only if gene is translate :
 if rpkm_cds > 0 and count_cds > 128:
 ## search footprint in UTR3
 count = 0
 try :
 for track in aln.fetch(GFF[gene]['chrom'],start_extension,stop_extension-15) :
 if track.is_reverse :
 count += 1
 ## get sequence of extension
 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
 ## if we have coverage after cds stop codon
 if count > 10 :
 res = find_stop(seq)
 if res == -1 :
 '''
 Write results with no stop but RPF in extension
 '''
 ## check if next in-frame codon is far than 90 nt extension :
 if GFF[gene]['strand'] == '+' :
-pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+300,'+')
+pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['stop']+1,GFF[gene]['stop']+extend,'+',GFF[gene]['name'])
 start_extension = pos[0]-1
 stop_extension = pos[1]
+#start_extension = GFF[gene]['stop']
+#stop_extension = GFF[gene]['stop']+extend
 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension])
 #print gene,count,pos,'\n',seq
 if (seq):
 else :
 indic = 'ok'
 #print res
 #stop_extension = start_extension + res +3
 else :
-pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-300,GFF[gene]['start']-1,'-')
+pos = check_overlapping(gff_reader,GFF[gene]['chrom'],GFF[gene]['start']-extend,GFF[gene]['start']-1,'-',GFF[gene]['name'])
 start_extension = pos[0]
 stop_extension = pos[1]+1
+#start_extension = GFF[gene]['start']-extend
+#stop_extension = GFF[gene]['start']
 seq = str(SeqDict[GFF[gene]['chrom']].seq[start_extension:stop_extension-1].reverse_complement())
 #print gene,count,pos,'\n',seq
 if (seq):
 stop_err('Error during html page creation : ' + str( e ) )
 def __main__():
-## python metagene_readthrough.py -g Saccer3.gff -f Saccer3.fa -b B1_sorted.bam -o Bstrain_readthrough.csv
-#python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /data/Mapping/read_mapped_unique_psiP_sorted.bam -o /home/rlegendre/Documents/Translecture/plop_py.csv -d /home/rlegendre/Documents/Translecture/out_trans/
-#python /home/rlegendre/galaxy/galaxy-dist/tools/rib_profiling/metagene_readthrough.py -g /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.gff -f /home/rlegendre/galaxy/galaxy-dist/SharedData/Ribo/Saccer3.fa -b /home/rlegendre/Documents/PDE2S/PDE2S_trim_no_rRNA_sorted.bam -o /home/rlegendre/Documents/PDE2S/Readthrough_pde2s.csv -d /home/rlegendre/Documents/PDE2S/out_trans
 #Parse command line options
 parser = optparse.OptionParser()
 parser.add_option("-g", "--gff", dest="gff", type= "string",
 help="GFF annotation file", metavar="FILE")
 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
 help="Bam Ribo-Seq alignments ", metavar="FILE")
 parser.add_option("-d", "--dirout", dest="dirout", type= "string",
 help="write report in this html file and in associated directory", metavar="STR,STR")
+parser.add_option("-e", "--extend", dest="extend", type= "int",
+help="Lenght of extension after stop in number of base pairs (depends on your organisme)", metavar="INT")
 parser.add_option("-q", "--quiet",
 action="store_false", dest="verbose", default=True,
 help="don't print status messages to stdout")
 raise
 try:
 os.mkdir(subfolder)
 except:
 raise
+## identify GFF or GTF format from 9th column
-GFF = store_gff(options.gff)
+with open (options.gff,"r") as gffile :
+for line in gffile :
+if '#' in line :
+## skip header
+gffile.next()
+elif 'gene_id' in line :
+## launch gtf reader :
+GFF = ribo_functions.store_gtf(options.gff)
+break
+elif 'ID=' in line :
+## launch gff reader
+GFF = ribo_functions.store_gff(options.gff)
+break
+else :
+stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+#GFF = store_gff(options.gff)
+#GFF = ribo_functions.store_gtf(options.gff)
+## check gff reading
+if not GFF['order'] :
+stop_err( 'Incorrect GFF file' + str( e ) )
 clean_file = cleaning_bam(options.bamfile)
-compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder)
+compute_analysis(clean_file, GFF, options.fasta, options.gff, subfolder, options.extend)
 if os.path.exists( clean_file ):
 os.remove( clean_file )
 write_html_page(html_file,subfolder)
 ##paste jquery script in result directory :

Mercurial > repos > rlegendre > ribo_tools

comparison metagene_readthrough.py @ 10:707807fee542