diff ribo_functions.py @ 13:7c944fd9907e draft

release 2
author rlegendre
date Thu, 09 Apr 2015 11:35:48 -0400
parents 707807fee542
children c87c40e642af
line wrap: on
line diff
--- a/ribo_functions.py	Fri Jan 23 03:31:37 2015 -0500
+++ b/ribo_functions.py	Thu Apr 09 11:35:48 2015 -0400
@@ -7,7 +7,7 @@
     @license: GPL v3
 '''
 
-import sys, subprocess, re, commands, time, urllib
+import sys, subprocess, re, commands, time, urllib,  tempfile
 from copy import copy
 
 
@@ -127,7 +127,9 @@
     try:
         GFF = {}
         with open(gff, 'r') as f_gff : 
+
             GFF['order'] = []
+            #for line in itertools.islice( f_gff, 25 ):
             for line in f_gff:
                 ## switch commented lines
                 line = line.split("#")[0]
@@ -136,13 +138,21 @@
                 # first line is already gene line :
                     if line.split('\t')[2] == 'gene' :
                         gene = feature[0].replace("ID=","")
-                        if re.search('gene',feature[2]) :
-                            Name = feature[2].replace("gene=","")
+                        if 'Name' in line :
+                            regex = re.compile('(Name=)([^;]*);')
+                            res = regex.search(line.split('\t')[8])
+                            Name = res.group(2)
+                            Name = Name.rstrip()
                         else :
                             Name = "Unknown"
                         ##get annotation
-                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
-                        note = urllib.unquote(str(note)).replace("\n","")
+                        if 'Note' in line :
+                            regex = re.compile('(Note=)([^;]*);')
+                            res = regex.search(line.split('\t')[8])
+                            note = res.group(2)                     
+                            note = urllib.unquote(str(note)).replace("\n","")
+                        else :
+                            note = ""
                         ## store gene information
                         GFF['order'].append(gene)
                         GFF[gene] = {}
@@ -156,7 +166,11 @@
                         GFF[gene]['exon_number'] = 0
                         #print Name
                     elif line.split('\t')[2] == 'CDS' :
-                        gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
+                        regex = re.compile('(Parent=)([^;]*);')
+                        res = regex.search(line.split('\t')[8])
+                        gene = res.group(2) 
+                        if 'mRNA' in gene:
+                            gene = re.sub(r"(.*)(\_mRNA)", r"\1", gene)
                         if GFF.has_key(gene) :
                             GFF[gene]['exon_number'] += 1
                             exon_number = GFF[gene]['exon_number'] 
@@ -166,7 +180,7 @@
                             GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                         
                     ## if there is a five prim UTR intron, we change start of gene
-                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
+                    elif line.split('\t')[2] == 'five_prime_UTR_intron' :
                         if GFF[gene]['strand'] == "+" :
                             GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                         else :
@@ -199,7 +213,7 @@
                 # first line is already gene line :
                     if 'protein_coding' in line :
                         ##get name
-                        gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
+                        gene = re.sub(r".+transcript_id \"([\w|-|\.]+)\";.*", r"\1", line).rstrip()
                         Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
                         if line.split('\t')[2] == 'CDS' :
                             ##if its first time we get this gene
@@ -245,7 +259,7 @@
 
         return __reverse_coordinates__(GFF)
     except Exception,e:
-        stop_err( 'Error during gff storage : ' + str( e ) )
+        stop_err( 'Error during gtf storage : ' + str( e ) )
 
 
 ##IV      protein_coding  exon    307766  307789  .       -       .       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
@@ -288,15 +302,19 @@
     try :
         header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
         #header = results.split('\n')
-        
+        ## tags by default
+        multi_tag = "XS:i:"
+        tag = "IH:i:1"
         # check mapper for define multiple tag
-        if re.search('bowtie', header):
+        if 'bowtie' in header:
             multi_tag = "XS:i:"
-        elif re.search('bwa', header):
+        elif 'bwa' in  header:
             multi_tag = "XT:A:R"
+        elif 'TopHat' in  header:
+            tag = "NH:i:1"
         else :
-            stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping")
-        
+            stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") 
+            
         tmp_sam = tempfile.mktemp()
         cmd = "samtools view %s > %s" % (bam, tmp_sam) 
         proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
@@ -307,7 +325,7 @@
             out.write(header)
             with open(tmp_sam,'r') as sam :
                 for line in sam :
-                    if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
+                    if (multi_tag not in line or tag in line) and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
                         if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
                             out.write(line)
         bamfile = tempfile.mktemp()+'.bam'