diff ribo_functions.py @ 17:c87c40e642af draft

Uploaded
author rlegendre
date Fri, 29 May 2015 09:17:29 -0400
parents 7c944fd9907e
children a121cce43f90
line wrap: on
line diff
--- a/ribo_functions.py	Wed May 13 04:34:59 2015 -0400
+++ b/ribo_functions.py	Fri May 29 09:17:29 2015 -0400
@@ -59,53 +59,65 @@
     
     
     
-def get_first_base(tmpdir,kmer):
+def get_first_base(tmpdir, kmer, frame):
     '''
         write footprint coverage file for each sam file in tmpdir
     '''
     global total_mapped_read
+    ## tags by default
+    multi_tag = "XS:i:"
+    tag = "IH:i:1"
+    
     try:
+        ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint)
+        p_site_pos = 16-frame
+        
         file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
         ##write coverage for each sam file in tmpdir
         for samfile in file_array:
             with open(tmpdir+'/'+samfile, 'r') as sam : 
                 ##get chromosome name
-                chrom = samfile.split(".")[0]
+                chrom = samfile.split(".sam")[0]
         
                 for line in sam:
                     #initialize dictionnary
-                    if re.search('@SQ', line) :
+                    if '@SQ' in line :
                         size = int(line.split('LN:')[1])
                         genomeF = [0]*size
                         genomeR = [0]*size
                     # define multiple reads keys from mapper
-                    elif re.search('@PG', line) :
-                        if re.search('bowtie', line):
+                    elif '@PG\tID' in line :
+                        if 'bowtie' in line:
                             multi_tag = "XS:i:"
-                        elif re.search('bwa', line):
+                        elif 'bwa' in  line:
                             multi_tag = "XT:A:R"
+                        elif 'TopHat' in  line:
+                            tag = "NH:i:1"
                         else :
-                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
+                            stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")   
                         
                     # get footprint
                     elif re.search('^[^@].+', line) :
                         #print line.rstrip()
                         read_pos = int(line.split('\t')[3])
                         read_sens = int(line.split('\t')[1])
-                        #len_read = len(line.split('\t')[9])
-                        if line.split('\t')[5] == kmer+'M' and multi_tag not in line:
+                        len_read = len(line.split('\t')[9])
+                        read_qual = int(line.split('\t')[4])
+                        if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 :
                         ###if line.split('\t')[5] == '28M' :
                            # print line.rstrip()
                             total_mapped_read +=1
                             #if it's a forward read
                             if read_sens == 0 :
                                 #get P-site : start read + 12 nt
-                                read_pos += 12
+                                #read_pos += 15
+                                read_pos += p_site_pos
                                 genomeF[read_pos] += 1
                             #if it's a reverse read
                             elif read_sens == 16 :
                                 #get P-site
-                                read_pos += 15
+                                #read_pos += 12
+                                read_pos += (len_read-1) - p_site_pos
                                 genomeR[read_pos] += 1
 
             #try:
@@ -115,10 +127,13 @@
                     cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
             #except Exception,e:
             #    stop_err( 'Error during coverage file writting : ' + str( e ) )            
-                    
+            del genomeF
+            del genomeR      
         sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)              
     except Exception, e:
         stop_err( 'Error during footprinting : ' + str( e ) )
+        
+        
 
 def store_gff(gff):
     '''