changeset 17:c87c40e642af draft

Uploaded
author rlegendre
date Fri, 29 May 2015 09:17:29 -0400
parents fcfdb2607cb8
children a121cce43f90
files get_codon_frequency.py metagene_readthrough.py ribo_functions.py
diffstat 3 files changed, 106 insertions(+), 98 deletions(-)
--- a/get_codon_frequency.py	Wed May 13 04:34:59 2015 -0400
+++ b/get_codon_frequency.py	Fri May 29 09:17:29 2015 -0400
@@ -23,7 +23,7 @@
 from matplotlib import font_manager
 from matplotlib import colors
 import csv
-from scipy import stats
+from scipy import stats, errstate
 from collections import OrderedDict
 import ribo_functions
 import HTSeq
@@ -60,76 +60,65 @@
                 chrom = feature.iv.chrom
                 start = feature.iv.start
                 stop = feature.iv.end 
-                if start+50 < stop-50 :
+                if start+50 < stop-50 :                
                     region = chrom + ':' + str(start+50) + '-' + str(stop-50)
-                else :
-                    break
-        
-        ## DEPRECATED
-        #for chrom in GFF.iterkeys():
-            #for gene in GFF[chrom] :
-               # codon_dict = init_codon_dict()
-                #start = GFF[chrom][gene]['start']
-                #print start
-                #stop = GFF[chrom][gene]['stop']
-                #print stop
-                #region = chrom + ':' + str(start) + '-' + str(stop)
-        #######
-                # #get all reads in this gene
-                reads = subprocess.check_output(["samtools", "view", bamfile, region])
-                head = subprocess.check_output(["samtools", "view", "-H", bamfile])
-                read_tab = reads.split('\n')        
-                for read in read_tab:
-                    # # search mapper for eliminate multiple alignements
-                    if 'bowtie' in head:
-                        multi_tag = "XS:i:"
-                    elif 'bwa' in  head:
-                        multi_tag = "XT:A:R"
-                    elif 'TopHat' in  head:
-                        tag = "NH:i:1"
-                    else :
-                        stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping")  
-
-                    if len(read) == 0:
-                        continue
-                    len_read = len(read.split('\t')[9])        
-                    # if it's read of good length
-                    if len_read == kmer and (tag in read or multi_tag not in read):
-                        feat = read.split('\t')
-                        seq = feat[9]
-                        # if it's a reverse read
-                        if feat[1] == '16' :
-                            if site == "A" :
-                            # #get A-site
-                                cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
-                            elif site == "P" :
-                            # #get P-site
-                                cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
-                            else :
-                            # #get site-E
-                                cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
-                            # # test if it's a true codon not a CNG codon for example
-                            if codon_dict.has_key(cod) :
-                                codon_dict[cod] += 1
-                        # if it's a forward read
-                        elif feat[1] == '0' :
-                            if site == "A" :
-                            # #get A-site
-                                cod = seq[a_pos:a_pos+3]
-                            elif site == "P" :
-                            # #get P-site
-                                cod = seq[a_pos-3:a_pos]
-                            else :
-                            # #get site-E
-                                cod = seq[a_pos-6:a_pos-3]
-                            if codon_dict.has_key(cod) :
-                                codon_dict[cod] += 1
-                del(read)
+                    # #get all reads in this gene
+                    reads = subprocess.check_output(["samtools", "view", bamfile, region])
+                    head = subprocess.check_output(["samtools", "view", "-H", bamfile])
+                    read_tab = reads.split('\n')        
+                    for read in read_tab:
+                        # # detect the mapper to filter out multiple alignments
+                        if 'bowtie' in head:
+                            multi_tag = "XS:i:"
+                        elif 'bwa' in  head:
+                            multi_tag = "XT:A:R"
+                        elif 'TopHat' in  head:
+                            tag = "NH:i:1"
+                        else :
+                            stop_err("No PG tag found in "+bamfile+". Please use bowtie, bwa or TopHat for mapping")
+    
+                        if len(read) == 0:
+                            continue
+                        len_read = len(read.split('\t')[9])        
+                        # if the read has the expected length
+                        if len_read == kmer and (tag in read or multi_tag not in read):
+                            feat = read.split('\t')
+                            seq = feat[9]
+                            # if it's a reverse read
+                            if feat[1] == '16' :
+                                if site == "A" :
+                                # #get A-site
+                                    cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement())
+                                elif site == "P" :
+                                # #get P-site
+                                    cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement())
+                                else :
+                                # #get E-site
+                                    cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement())
+                                # # check that it is a real codon (e.g. not a CNG codon)
+                                if codon_dict.has_key(cod) :
+                                    codon_dict[cod] += 1
+                            # if it's a forward read
+                            elif feat[1] == '0' :
+                                if site == "A" :
+                                # #get A-site
+                                    cod = seq[a_pos:a_pos+3]
+                                elif site == "P" :
+                                # #get P-site
+                                    cod = seq[a_pos-3:a_pos]
+                                else :
+                                # #get E-site
+                                    cod = seq[a_pos-6:a_pos-3]
+                                if codon_dict.has_key(cod) :
+                                    codon_dict[cod] += 1
+                    del(read)
                 # # add in global dict   
                 for cod, count in codon_dict.iteritems() :
                     codon[cod] += count
-                        
-        return codon
+        if sum(codon.values()) == 0 :
+            stop_err('There are no reads aligning to annotated genes in your GFF file')
+        else :
+            return codon
         
     except Exception, e:
         stop_err('Error during codon usage calculation: ' + str(e))
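
The refactored block above keeps the whole samtools/codon-counting loop inside the "if start+50 < stop-50" test instead of breaking out of the feature loop, and it now aborts with stop_err when no read maps to any annotated gene. The codon itself is picked relative to the offset a_pos and the requested ribosomal site; a minimal standalone sketch of that selection rule follows (the slice offsets are copied from the hunk above, the helper name pick_codon is illustrative):

from Bio.Seq import Seq

def pick_codon(seq, a_pos, site, reverse):
    # Forward reads: the A-site codon starts at a_pos, the P-site codon one
    # codon upstream and the E-site codon two codons upstream.
    if not reverse:
        if site == "A":
            return seq[a_pos:a_pos + 3]
        if site == "P":
            return seq[a_pos - 3:a_pos]
        return seq[a_pos - 6:a_pos - 3]
    # Reverse reads: take the mirrored slice and reverse-complement it so the
    # codon is always reported in coding orientation.
    if site == "A":
        sub = seq[a_pos - 5:a_pos - 2]
    elif site == "P":
        sub = seq[a_pos - 2:a_pos + 1]
    else:
        sub = seq[a_pos + 1:a_pos + 4]
    return str(Seq(sub).reverse_complement())

Only strings that are keys of codon_dict are counted afterwards, which silently drops ambiguous codons such as CNG.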
@@ -317,12 +306,12 @@
             std_cond2 = []
             max_val = []  # # max value for graph
             for i in codon_sorted:
-                # # cond1 = moyenne of replicats cond1 divided by max 
+                # # cond1 = mean of the cond1 replicates divided by max
                 cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2)
                 cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2)
-                # # standard deviation = absolute value of diffence between replicats of cond1
+                # # standard deviation = absolute value of the difference between cond1 replicates
                 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)])))
-                # # cond2 = moyenne of replicats cond1divided by max
+                # # cond2 = mean of the cond2 replicates divided by max
                 cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2)
                 cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2)
                 # # standard deviation = absolute value of difference between replicats of cond2
@@ -388,7 +377,8 @@
                     out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n')
                 else: 
                     out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
-            chi = stats.chisquare(observed, expected)
+            with errstate(all='ignore'):
+                chi = stats.chisquare(observed, expected)
             out.write('Khi2 test\n')
             out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
 
@@ -435,7 +425,7 @@
             cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items())
             cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items())
         except ZeroDivisionError:
-            stop_err("Not enough reads to compute the codon occupancy")
+            stop_err("Not enough reads to compute the codon occupancy: "+str(sum1)+" and "+str(sum2)+" reads were used for conditions 1 and 2, respectively.\n")
             
         # # compute theoretical count in COND2
         cond2_count = []
@@ -497,7 +487,8 @@
                 else:
                     out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n')
             out.write('Khi2 test\n')
-            chi = stats.chisquare(observed, expected)
+            with errstate(all='ignore'):
+                chi = stats.chisquare(observed, expected)
             out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n')
             
         # # get max value for each codon for histogram   
@@ -603,7 +594,10 @@
     fc = cond1.copy()
 
     for key, value in fc.iteritems():
-        fc[key] = cond2[key]/cond1[key]
+        if cond1[key] == 0:
+            fc[key] = 1
+        else:
+            fc[key] = cond2[key]/cond1[key]
         
     index = arange(len(fc.keys()))
     label = fc.keys()
@@ -695,12 +689,11 @@
         GFF = HTSeq.GFF_Reader(options.gff)
         # # get html file and directory :
         (html, html_dir) = options.dirout.split(',')
-        if os.path.exists(html_dir):
-            raise
-        try:
-            os.mkdir(html_dir)
-        except:
-            raise Exception(html_dir + ' mkdir')
+        if not os.path.exists(html_dir):
+            try:
+                os.mkdir(html_dir)
+            except Exception, e :
+                stop_err('Error creating directory: ' + str(e))
         # #RUN analysis
         # #If there are replicats
         if options.rep == "yes" :
@@ -724,6 +717,7 @@
                 check_index_bam (fh)
                 result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite))
             (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2)
+
             # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir)
             plot_fc (cond1, cond2, options.site, html_dir)    
         else :
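
Both chi-square calls in this file are now wrapped in errstate(all='ignore') so that zero expected counts do not emit divide-by-zero warnings on stderr. Note that errstate is NumPy's floating-point error context manager; the new "from scipy import stats, errstate" line only works because SciPy releases contemporary with this changeset still re-export NumPy's top-level names. A small sketch of the same guard written against NumPy directly, with made-up counts:

import numpy as np
from scipy import stats

observed = np.array([12.0, 0.0, 8.0, 5.0])
expected = np.array([10.0, 2.0, 8.0, 5.0])

# Suppress divide/invalid warnings that chisquare can trigger when an
# expected count is zero; the statistic and p-value are still returned.
with np.errstate(all='ignore'):
    chi, pval = stats.chisquare(observed, expected)
print('T : %s; p-value : %s' % (chi, pval))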
--- a/metagene_readthrough.py	Wed May 13 04:34:59 2015 -0400
+++ b/metagene_readthrough.py	Fri May 29 09:17:29 2015 -0400
@@ -945,12 +945,11 @@
 
     try:
         (html_file, subfolder ) = options.dirout.split(",")
-        if os.path.exists(subfolder):
-            raise
-        try:
-            os.mkdir(subfolder)
-        except:
-            raise
+        if not os.path.exists(subfolder):
+            try:
+                os.mkdir(subfolder)
+            except Exception, e :
+                stop_err('Error creating directory: ' + str(e))
         ## identify GFF or GTF format from 9th column
         with open (options.gff,"r") as gffile :
             for line in gffile :
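
metagene_readthrough.py (and get_codon_frequency.py above) now only create the HTML sub-folder when it is missing, and a failing mkdir becomes a stop_err message instead of a bare raise. A self-contained sketch of the same pattern, with sys.exit standing in for the scripts' stop_err helper and ensure_output_dir being an illustrative name:

import os
import sys

def ensure_output_dir(path):
    # Create the output sub-folder only if it does not already exist and
    # report any OS error as a readable message rather than a raw traceback.
    if not os.path.exists(path):
        try:
            os.mkdir(path)
        except OSError as e:
            sys.exit('Error creating directory %s: %s' % (path, e))

# typical call, with subfolder taken from options.dirout as above:
# ensure_output_dir(subfolder)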
--- a/ribo_functions.py	Wed May 13 04:34:59 2015 -0400
+++ b/ribo_functions.py	Fri May 29 09:17:29 2015 -0400
@@ -59,53 +59,65 @@
     
     
     
-def get_first_base(tmpdir,kmer):
+def get_first_base(tmpdir, kmer, frame):
     '''
         write footprint coverage file for each sam file in tmpdir
     '''
     global total_mapped_read
+    ## tags by default
+    multi_tag = "XS:i:"
+    tag = "IH:i:1"
+    
     try:
+        ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint)
+        p_site_pos = 16-frame
+        
         file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
         ##write coverage for each sam file in tmpdir
         for samfile in file_array:
             with open(tmpdir+'/'+samfile, 'r') as sam : 
                 ##get chromosome name
-                chrom = samfile.split(".")[0]
+                chrom = samfile.split(".sam")[0]
         
                 for line in sam:
                     #initialize coverage lists
-                    if re.search('@SQ', line) :
+                    if '@SQ' in line :
                         size = int(line.split('LN:')[1])
                         genomeF = [0]*size
                         genomeR = [0]*size
                     # set the multi-read tag according to the mapper
-                    elif re.search('@PG', line) :
-                        if re.search('bowtie', line):
+                    elif '@PG\tID' in line :
+                        if 'bowtie' in line:
                             multi_tag = "XS:i:"
-                        elif re.search('bwa', line):
+                        elif 'bwa' in  line:
                             multi_tag = "XT:A:R"
+                        elif 'TopHat' in  line:
+                            tag = "NH:i:1"
                         else :
-                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
+                            stop_err("No PG tag found in "+samfile+". Please use bowtie, bwa or TopHat for mapping")
                         
                     # get footprint
                     elif re.search('^[^@].+', line) :
                         #print line.rstrip()
                         read_pos = int(line.split('\t')[3])
                         read_sens = int(line.split('\t')[1])
-                        #len_read = len(line.split('\t')[9])
-                        if line.split('\t')[5] == kmer+'M' and multi_tag not in line:
+                        len_read = len(line.split('\t')[9])
+                        read_qual = int(line.split('\t')[4])
+                        if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 :
                         ###if line.split('\t')[5] == '28M' :
                            # print line.rstrip()
                             total_mapped_read +=1
                             #if it's a forward read
                             if read_sens == 0 :
                                 #get P-site : start read + 12 nt
-                                read_pos += 12
+                                #read_pos += 15
+                                read_pos += p_site_pos
                                 genomeF[read_pos] += 1
                             #if it's a reverse read
                             elif read_sens == 16 :
                                 #get P-site
-                                read_pos += 15
+                                #read_pos += 12
+                                read_pos += (len_read-1) - p_site_pos
                                 genomeR[read_pos] += 1
 
             #try:
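
get_first_base now takes the reading frame as an argument and derives the P-site offset from it (p_site_pos = 16 - frame, roughly +15 nt from the 5' end of the footprint) instead of the previous hard-coded +12/+15, and it mirrors that offset for reverse-strand reads, whose 5' end is the rightmost aligned base. A small sketch of that arithmetic; the helper name p_site_index is illustrative and the frame convention is assumed to match the hunk above:

def p_site_index(read_pos, read_len, frame, reverse):
    # Offset of the P-site from the 5' end of the footprint (16 - frame,
    # so frame 1 gives the usual +15 nt).
    offset = 16 - frame
    if reverse:
        # On the reverse strand the 5' end of the footprint is the rightmost
        # aligned base, so count the offset back from the other end.
        return read_pos + (read_len - 1) - offset
    return read_pos + offset

# a 28-mer forward read aligned at position 1000 with frame 1:
# p_site_index(1000, 28, 1, False) -> 1015
# the same footprint on the reverse strand:
# p_site_index(1000, 28, 1, True)  -> 1012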
@@ -115,10 +127,13 @@
                     cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
             #except Exception,e:
             #    stop_err( 'Error during coverage file writting : ' + str( e ) )            
-                    
+            del genomeF
+            del genomeR      
         sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)              
     except Exception, e:
         stop_err( 'Error during footprinting : ' + str( e ) )
+        
+        
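
The footprint filter introduced in get_first_base above replaces the old CIGAR test with three conditions: the read must have the expected k-mer length, must not be flagged as a multi-mapper (or must carry the unique-alignment tag), and must have a mapping quality above 20. A standalone sketch of that filter; keep_footprint is an illustrative name, and the tag defaults mirror the values set at the top of get_first_base (TopHat alignments would use NH:i:1 instead):

def keep_footprint(sam_line, kmer, tag="IH:i:1", multi_tag="XS:i:"):
    # Keep a SAM record only if the read has the expected footprint length,
    # is not a multi-mapper, and has a mapping quality above 20.
    fields = sam_line.split('\t')
    if len(fields) < 11:
        return False
    right_length = len(fields[9]) == kmer
    unique = tag in sam_line or multi_tag not in sam_line
    return right_length and unique and int(fields[4]) > 20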
 
 def store_gff(gff):
     '''