Mercurial > repos > rlegendre > ribo_tools
diff ribo_functions.py @ 10:707807fee542
(none)
author | rlegendre |
---|---|
date | Thu, 22 Jan 2015 14:34:38 +0100 |
parents | d7739f797a26 |
children | 7c944fd9907e |
line wrap: on
line diff
--- a/ribo_functions.py Tue Dec 09 09:43:07 2014 -0500 +++ b/ribo_functions.py Thu Jan 22 14:34:38 2015 +0100 @@ -8,6 +8,8 @@ ''' import sys, subprocess, re, commands, time, urllib +from copy import copy + def stop_err( msg ): sys.stderr.write( "%s\n" % msg ) @@ -154,7 +156,7 @@ GFF[gene]['exon_number'] = 0 #print Name elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0]) + gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -184,7 +186,7 @@ def store_gtf(gff): ''' - parse and store gtf file in a dictionnary (DEPRECATED) + parse and store gtf file in a dictionnary ''' try: GFF = {} @@ -195,11 +197,11 @@ line = line.split("#")[0] if line != "" : # first line is already gene line : - if line.split('\t')[1] == 'protein_coding' : + if 'protein_coding' in line : ##get name - gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() - Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip() - if line.split('\t')[2] == 'exon' : + gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() + Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() + if line.split('\t')[2] == 'CDS' : ##if its first time we get this gene if gene not in GFF.keys() : ## store gene information @@ -207,35 +209,41 @@ GFF[gene] = {} GFF[gene]['chrom'] = line.split('\t')[0] GFF[gene]['strand'] = line.split('\t')[6] + GFF[gene]['start'] = int(line.split('\t')[3]) + GFF[gene]['stop'] = int(line.split('\t')[4]) GFF[gene]['name'] = Name + GFF[gene]['note'] = "" GFF[gene]['exon_number'] = 1 GFF[gene]['exon'] = {} - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + ## some exons are non codant + exon_number = 1 GFF[gene]['exon'][exon_number] = {} GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) else : ## we add exon - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + exon_number += 1 GFF[gene]['exon_number'] = exon_number GFF[gene]['exon'][exon_number] = {} GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - elif line.split('\t')[2] == 'CDS' : - exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) + #elif line.split('\t')[2] == 'CDS' : + #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip()) GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] elif line.split('\t')[2] == 'start_codon' : if GFF[gene]['strand'] == '-' : - GFF[gene]['start'] = int(line.split('\t')[4]) + GFF[gene]['stop'] = int(line.split('\t')[4]) else : GFF[gene]['start'] = int(line.split('\t')[3]) elif line.split('\t')[2] == 'stop_codon' : if GFF[gene]['strand'] == '-' : - GFF[gene]['stop'] = int(line.split('\t')[3]) + GFF[gene]['start'] = int(line.split('\t')[3]) else : GFF[gene]['stop'] = int(line.split('\t')[4]) - return GFF + return __reverse_coordinates__(GFF) except Exception,e: stop_err( 'Error during gff storage : ' + str( e ) ) @@ -252,8 +260,26 @@ ## protein_id "YDL083C"; ##IV protein_coding stop_codon 306926 306928 . - 0 gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name " ##RPS16B"; +def __reverse_coordinates__(GFF): + + for gene in GFF['order']: + ## for reverse gene + if GFF[gene]['strand'] == "-": + ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates + if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 : + tmp = copy(GFF[gene]['exon']) + exon_number = GFF[gene]['exon_number'] + rev_index = exon_number+1 + for z in range(1,exon_number+1): + rev_index -= 1 + GFF[gene]['exon'][z] = tmp[rev_index] + ## check start + if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']: + GFF[gene]['exon'][1]['start'] = GFF[gene]['start'] + return GFF + def cleaning_bam(bam): '''