Mercurial > repos > rlegendre > ribo_tools
diff ribo_functions.py @ 18:a121cce43f90 draft
Uploaded
author | rlegendre |
---|---|
date | Tue, 09 Jun 2015 09:06:17 -0400 |
parents | c87c40e642af |
children |
line wrap: on
line diff
--- a/ribo_functions.py Fri May 29 09:17:29 2015 -0400 +++ b/ribo_functions.py Tue Jun 09 09:06:17 2015 -0400 @@ -141,6 +141,7 @@ ''' try: GFF = {} + mRNA = {} with open(gff, 'r') as f_gff : GFF['order'] = [] @@ -153,8 +154,9 @@ # first line is already gene line : if line.split('\t')[2] == 'gene' : gene = feature[0].replace("ID=","") + curent_gene = gene if 'Name' in line : - regex = re.compile('(Name=)([^;]*);') + regex = re.compile('(Name=)([^;]*);?') res = regex.search(line.split('\t')[8]) Name = res.group(2) Name = Name.rstrip() @@ -162,7 +164,7 @@ Name = "Unknown" ##get annotation if 'Note' in line : - regex = re.compile('(Note=)([^;]*);') + regex = re.compile('(Note=)([^;]*);?') res = regex.search(line.split('\t')[8]) note = res.group(2) note = urllib.unquote(str(note)).replace("\n","") @@ -180,12 +182,27 @@ GFF[gene]['exon'] = {} GFF[gene]['exon_number'] = 0 #print Name + elif line.split('\t')[2] == 'mRNA' : + regex = re.compile('(Parent=)([^;]*);?') + res = regex.search(line.split('\t')[8]) + gene_name = res.group(2) + regex = re.compile('(ID=)([^;]*);?') + res = regex.search(line.split('\t')[8]) + mRNA_name = res.group(2) + if gene not in mRNA.viewvalues() and gene_name == curent_gene : + mRNA[mRNA_name] = gene_name + elif line.split('\t')[2] == 'CDS' : - regex = re.compile('(Parent=)([^;]*);') + regex = re.compile('(Parent=)([^;]*);?') res = regex.search(line.split('\t')[8]) gene = res.group(2) + if 'mRNA' in gene: gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) + if mRNA.has_key(gene) and GFF.has_key(mRNA[gene]): + + gene = gene_name + if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -193,7 +210,7 @@ GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7] GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3]) GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) - + ## if there is a five prim UTR intron, we change start of gene elif line.split('\t')[2] == 'five_prime_UTR_intron' : if GFF[gene]['strand'] == "+" :