ribo_tools: ribo_functions.py comparison

comparison ribo_functions.py @ 10:707807fee542

(none)

author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	d7739f797a26
children	7c944fd9907e

comparison

equal deleted inserted replaced

-:d7739f797a26
+:707807fee542
 @copyright:  rachel.legendre@igmors.u-psud.fr
 @license: GPL v3
 '''
 import sys, subprocess, re, commands, time, urllib
+from copy import copy
 def stop_err( msg ):
     '''
     Report a fatal error and terminate the program.

     Writes msg, then a timestamped "Programme aborted" notice, to stderr and
     raises SystemExit with status 1 so that the calling process sees a
     failure (a bare sys.exit() would report success, status 0).
     '''
     sys.stderr.write( "%s\n" % msg )
     sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     ## non-zero status: this is an abort, not a normal end of run
     sys.exit(1)
 GFF[gene]['note'] = note
 GFF[gene]['exon'] = {}
 GFF[gene]['exon_number'] = 0
 #print Name
 elif line.split('\t')[2] == 'CDS' :
-gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
+gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
 if GFF.has_key(gene) :
 GFF[gene]['exon_number'] += 1
 exon_number = GFF[gene]['exon_number']
 GFF[gene]['exon'][exon_number] = {}
 GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
 #chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
 #chrI    SGD     intron  87388   87500   .       +       .       Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
 def store_gtf(gff):
     '''
     Parse a GTF file and store its protein-coding CDS records in a dictionary.

     Only lines containing the text 'protein_coding' are used. The key of each
     record is the transcript_id attribute of the line.

     Returned dictionary (after a pass through __reverse_coordinates__) :
         GFF['order']  : list of transcript ids, in order of first CDS line
         GFF[id]       : dictionary with keys
             'chrom', 'strand'  : columns 1 and 7 of the first CDS line
             'start', 'stop'    : columns 4 and 5 of the first CDS line, then
                                  overwritten by start_codon / stop_codon lines
             'name'             : gene_name attribute
             'note'             : always an empty string
             'exon_number'      : number of CDS segments stored
             'exon'             : {n : {'start', 'stop', 'frame'}}, n from 1

     Any exception raised while reading (unreadable file, malformed line,
     start_codon/stop_codon line seen before the first CDS of its transcript)
     is reported through stop_err, which ends the program.
     '''
     try:
         GFF = {}
         with open(gff, 'r') as f_gff :
             GFF['order'] = []
             for line in f_gff:
                 ## drop comment lines (and anything that follows a '#')
                 line = line.split("#")[0]
                 if line == "" or 'protein_coding' not in line :
                     continue
                 ## identifiers are read from the attribute column
                 gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
                 Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
                 ## split once, every branch below reads from the same columns
                 fields = line.split('\t')
                 feature = fields[2]
                 if feature == 'CDS' :
                     if gene not in GFF :
                         ## first CDS of this transcript : store gene information
                         GFF['order'].append(gene)
                         GFF[gene] = {}
                         GFF[gene]['chrom'] = fields[0]
                         GFF[gene]['strand'] = fields[6]
                         GFF[gene]['start'] = int(fields[3])
                         GFF[gene]['stop'] = int(fields[4])
                         GFF[gene]['name'] = Name
                         GFF[gene]['note'] = ""
                         GFF[gene]['exon_number'] = 1
                         GFF[gene]['exon'] = {}
                         exon_number = 1
                     else :
                         ## the exon_number attribute of the file is not used because
                         ## some exons are non coding : CDS segments are numbered in
                         ## order of appearance, counted per transcript so that
                         ## interleaved transcripts do not share a counter
                         exon_number = GFF[gene]['exon_number'] + 1
                         GFF[gene]['exon_number'] = exon_number
                     GFF[gene]['exon'][exon_number] = {}
                     GFF[gene]['exon'][exon_number]['start'] = int(fields[3])
                     GFF[gene]['exon'][exon_number]['stop'] = int(fields[4])
                     GFF[gene]['exon'][exon_number]['frame'] = fields[7]
                 elif feature == 'start_codon' :
                     ## on the reverse strand the start codon gives the 'stop' value
                     if GFF[gene]['strand'] == '-' :
                         GFF[gene]['stop'] = int(fields[4])
                     else :
                         GFF[gene]['start'] = int(fields[3])
                 elif feature == 'stop_codon' :
                     ## on the reverse strand the stop codon gives the 'start' value
                     if GFF[gene]['strand'] == '-' :
                         GFF[gene]['start'] = int(fields[3])
                     else :
                         GFF[gene]['stop'] = int(fields[4])
         return __reverse_coordinates__(GFF)
     except Exception as e:
         stop_err( 'Error during gff storage : ' + str( e ) )
 ##IV      protein_coding  exon    307766  307789  .       -       .       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
 ## exon_id "YDL083C.2";
 ##IV      protein_coding  CDS     306929  307333  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
 ## protein_id "YDL083C";
 ##IV      protein_coding  stop_codon      306926  306928  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
def __reverse_coordinates__(GFF):
    '''
    Renumber the exons of reverse-strand genes and align the first exon start.

    GFF is the dictionary built by the storage functions : GFF['order'] lists
    the gene keys, and each GFF[gene] holds 'strand', 'start', 'stop',
    'exon_number' and 'exon' ({n : {'start', 'stop', ...}}, n from 1).

    For a gene on the '-' strand with more than one exon, when the gene 'stop'
    equals the 'stop' of exon 1, the exon numbering is reversed (exon 1 and
    the last exon swap places, and so on).
    Then, when the gene 'start' is set (non zero) and differs from the 'start'
    of exon 1, the 'start' of exon 1 is replaced by the gene 'start'.

    The dictionary is modified in place and also returned.
    '''
    for gene in GFF['order']:
        record = GFF[gene]
        exons = record['exon']
        ## for reverse gene : if it has many exons and the stop of the gene is
        ## the stop of the first (and not last) exon, reverse the exon numbering
        if record['strand'] == "-" and record['exon_number'] > 1 \
                and record['stop'] == exons[1]['stop'] :
            count = record['exon_number']
            ## read every exon before writing any of them back
            flipped = [exons[rank] for rank in range(count, 0, -1)]
            for rank, segment in enumerate(flipped, 1):
                exons[rank] = segment
        ## check start
        ## NOTE(review): indentation was not visible in the reviewed copy; this
        ## test is applied to every gene here - confirm it was not meant to be
        ## limited to reverse-strand genes
        if record['start'] and record['start'] != exons[1]['start'] :
            exons[1]['start'] = record['start']
    return GFF
 def cleaning_bam(bam):
 '''
 Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
 '''

Mercurial > repos > rlegendre > ribo_tools

comparison ribo_functions.py @ 10:707807fee542