Mercurial > repos > rlegendre > ribo_tools
diff ribo_functions.py @ 13:7c944fd9907e draft
release 2
author | rlegendre |
---|---|
date | Thu, 09 Apr 2015 11:35:48 -0400 |
parents | 707807fee542 |
children | c87c40e642af |
line wrap: on
line diff
--- a/ribo_functions.py Fri Jan 23 03:31:37 2015 -0500 +++ b/ribo_functions.py Thu Apr 09 11:35:48 2015 -0400 @@ -7,7 +7,7 @@ @license: GPL v3 ''' -import sys, subprocess, re, commands, time, urllib +import sys, subprocess, re, commands, time, urllib, tempfile from copy import copy @@ -127,7 +127,9 @@ try: GFF = {} with open(gff, 'r') as f_gff : + GFF['order'] = [] + #for line in itertools.islice( f_gff, 25 ): for line in f_gff: ## switch commented lines line = line.split("#")[0] @@ -136,13 +138,21 @@ # first line is already gene line : if line.split('\t')[2] == 'gene' : gene = feature[0].replace("ID=","") - if re.search('gene',feature[2]) : - Name = feature[2].replace("gene=","") + if 'Name' in line : + regex = re.compile('(Name=)([^;]*);') + res = regex.search(line.split('\t')[8]) + Name = res.group(2) + Name = Name.rstrip() else : Name = "Unknown" ##get annotation - note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line) - note = urllib.unquote(str(note)).replace("\n","") + if 'Note' in line : + regex = re.compile('(Note=)([^;]*);') + res = regex.search(line.split('\t')[8]) + note = res.group(2) + note = urllib.unquote(str(note)).replace("\n","") + else : + note = "" ## store gene information GFF['order'].append(gene) GFF[gene] = {} @@ -156,7 +166,11 @@ GFF[gene]['exon_number'] = 0 #print Name elif line.split('\t')[2] == 'CDS' : - gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0]) + regex = re.compile('(Parent=)([^;]*);') + res = regex.search(line.split('\t')[8]) + gene = res.group(2) + if 'mRNA' in gene: + gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene) if GFF.has_key(gene) : GFF[gene]['exon_number'] += 1 exon_number = GFF[gene]['exon_number'] @@ -166,7 +180,7 @@ GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4]) ## if there is a five prim UTR intron, we change start of gene - elif line.split('\t')[2] == 'five_prime_UTR_intron' : + elif line.split('\t')[2] == 'five_prime_UTR_intron' : if GFF[gene]['strand'] == "+" : GFF[gene]['start'] = GFF[gene]['exon'][1]['start'] else : @@ -199,7 +213,7 @@ # first line is already gene line : if 'protein_coding' in line : ##get name - gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip() + gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip() Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip() if line.split('\t')[2] == 'CDS' : ##if its first time we get this gene @@ -245,7 +259,7 @@ return __reverse_coordinates__(GFF) except Exception,e: - stop_err( 'Error during gff storage : ' + str( e ) ) + stop_err( 'Error during gtf storage : ' + str( e ) ) ##IV protein_coding exon 307766 307789 . - . gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; @@ -288,15 +302,19 @@ try : header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE) #header = results.split('\n') - + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" # check mapper for define multiple tag - if re.search('bowtie', header): + if 'bowtie' in header: multi_tag = "XS:i:" - elif re.search('bwa', header): + elif 'bwa' in header: multi_tag = "XT:A:R" + elif 'TopHat' in header: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping") - + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + tmp_sam = tempfile.mktemp() cmd = "samtools view %s > %s" % (bam, tmp_sam) proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE) @@ -307,7 +325,7 @@ out.write(header) with open(tmp_sam,'r') as sam : for line in sam : - if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : + if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 : if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 : out.write(line) bamfile = tempfile.mktemp()+'.bam'