diff kmer_analysis.py @ 10:707807fee542

(none)
author rlegendre
date Thu, 22 Jan 2015 14:34:38 +0100
parents da126b91f9ea
children 7c944fd9907e
line wrap: on
line diff
--- a/kmer_analysis.py	Tue Dec 09 09:43:07 2014 -0500
+++ b/kmer_analysis.py	Thu Jan 22 14:34:38 2015 +0100
@@ -14,9 +14,13 @@
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from numpy import arange
+from collections import OrderedDict
 import ribo_functions
-from collections import OrderedDict
 #import cPickle
+## suppress matplotlib warnings
+import warnings
+warnings.filterwarnings('ignore')
+
 
 total_mapped_read = 0 
 
@@ -25,9 +29,51 @@
     sys.stderr.write( "%s\n" % msg )
     sys.stderr.write( "Programme aborted at %s\n" % time.asctime( time.localtime( time.time() ) ) )
     sys.exit()
-
+    
+    
+def split_bam(bamfile,tmpdir):
+    '''
+        split bam by chromosome and write sam file in tmpdir
+    '''
+    try:
+        #get header
+        results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
+        header = results.split('\n')
+       
+        #define genome size
+        genome = []
+        for line in header:
+            result = re.search('SN', line)
+            if result :
+                #print line    
+                feat = line.split('\t')
+                chrom = re.split(":", feat[1])
+                #print feat[1]
+                genome.append(chrom[1])
+       
+        #split sam by chrom
+        n = 0
+        for chrm in genome:
+            with open(tmpdir+'/'+chrm+'.sam', 'w') as f : 
+                #write header correctly for each chromosome
+                f.write(header[0]+'\n')
+                expr = re.compile(chrm+'\t')
+                el =[elem for elem in header if expr.search(elem)][0]
+                f.write(el+'\n')            
+                f.write(header[-2]+'\n')
+                #write all reads for each chromosome
+                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
+                f.write(reads)
+                # calculate number of reads 
+                n += reads.count(chrm)
 
+        sys.stdout.write("%d reads are presents in your bam file\n" % n)
+    
+    except Exception, e:
+        stop_err( 'Error during bam file splitting : ' + str( e ) )
         
+
+       
 def get_first_base(tmpdir, kmer): 
     '''
         write footprint coverage file for each sam file in tmpdir and get kmer distribution
@@ -47,18 +93,20 @@
         
                 for line in sam:
                     #initialize dictionnary
-                    if '@SQ' in line :
+                    if '@SQ\tSN:' in line :
                         size = int(line.split('LN:')[1])
                         genomeF = [0]*size
                         genomeR = [0]*size
                     # define multiple reads keys from mapper
-                    elif '@PG' in line :
+                    elif '@PG\tID' in line :
                         if 'bowtie' in line:
                             multi_tag = "XS:i:"
                         elif 'bwa' in  line:
                             multi_tag = "XT:A:R"
+                        #elif 'TopHat' in  line:
+                        #    multi_tag = "NH:i:1"
                         else :
-                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")  
+                            stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping")  
                         
                     # get footprint
                     elif re.search('^[^@].+', line) :
@@ -128,33 +176,37 @@
         whole_phasing = [0,0,0]
         for gene in GFF['order']:
             ## maybe no start position in GTF file so we must to check and replace
+            exon_number = GFF[gene]['exon_number']
             try : GFF[gene]['start'] 
             except :
                 if GFF[gene]['strand'] == '+' :
                     GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                 else :
-                    GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
+                    GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
             ## also for stop coordinates
             try : GFF[gene]['stop'] 
             except :
-                exon_number = GFF[gene]['exon_number']
                 if GFF[gene]['strand'] == '+' :
                     GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
 
                 else :
-                    GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
+                    GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
             cov = []
             ##first chromosome : we open corresponding file
-            if chrom == "" :
-                chrom = GFF[gene]['chrom']
-                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                    data = f.readlines()
-            ##if we change chrosomosome
-            elif chrom != GFF[gene]['chrom'] :
-                chrom = GFF[gene]['chrom']
-                with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-                    data = f.readlines()
-                 
+            try: 
+                if chrom == "" :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+                ##if we change chromosome
+                elif chrom != GFF[gene]['chrom'] :
+                    chrom = GFF[gene]['chrom']
+                    with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+                        data = f.readlines()
+            except IOError :
+                print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"     
+
+            
             ## if a gene without intron :
             if GFF[gene]['exon_number'] == 1:
                     
@@ -172,14 +224,14 @@
                         
                     for exon in range(1,GFF[gene]['exon_number']+1) :
                         for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                            if i <= GFF[gene]['stop'] :
-                                cov.append(int((data[i].rstrip()).split("\t")[0]))  
+                            #if i <= GFF[gene]['stop'] :
+                            cov.append(int((data[i].rstrip()).split("\t")[0]))  
                 else :
                           
                     for exon in range(1,GFF[gene]['exon_number']+1) :
                         for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-                            if i <= GFF[gene]['start'] :
-                                cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
+                            #if i <= GFF[gene]['start'] :
+                            cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
                     cov.reverse()        
                             
             len_cov = len(cov)
@@ -218,6 +270,7 @@
     pl.title('Number of reads for each k-mer')
     pl.xticks(index + bar_width, label, **axis_font)
     #pl.show()
+    fig.subplots_adjust()
     pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
     pl.clf()
     
@@ -231,10 +284,11 @@
         pl.ylabel('Percent of read', **axis_font)
         pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
         pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
-        pl.tight_layout()
+        #pl.tight_layout()
         pl.ylim(0,100)
+        fig.subplots_adjust() 
         pl.draw()
-        #pl.show()
+        pl.show()
         pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
         pl.clf()
     
@@ -288,7 +342,7 @@
 
     #Parse command line options
     parser = optparse.OptionParser()
-    parser.add_option("-g", "--gff", dest="gfffile", type= "string",
+    parser.add_option("-g", "--gff", dest="gff", type= "string",
                   help="GFF annotation file", metavar="FILE")
                   
     parser.add_option("-b", "--bam", dest="bamfile", type= "string",
@@ -325,9 +379,30 @@
             returncode = proc.wait()
         
         tmpdir = tempfile.mkdtemp()
-        GFF = ribo_functions.store_gff(options.gfffile)
+        ## identify GFF or GTF format from 9th column
+        with open (options.gff,"r") as gffile :
+            for line in gffile :
+                if '#' in line :
+                    ## skip header
+                    gffile.next()
+                elif 'gene_id' in line :
+                    ## launch gtf reader :
+                    GFF = ribo_functions.store_gtf(options.gff)
+                    break
+                elif 'ID=' in line :
+                    ## launch gff reader 
+                    GFF = ribo_functions.store_gff(options.gff)
+                    break
+                else :
+                    stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+        #GFF = store_gff(options.gff)
+        #GFF = ribo_functions.store_gtf(options.gff)
+        ## check gff reading
+        if not GFF['order'] :
+            stop_err( 'Incorrect GFF file' + str( e ) )
+        
         ## split bam
-        ribo_functions.split_bam(options.bamfile,tmpdir)
+        split_bam(options.bamfile,tmpdir)
         ###################################
         ## First analysis with 28mer :
         ###################################
@@ -351,6 +426,7 @@
                     tmp = get_first_base(tmpdir, keys)
                     ## compute phasing
                     whole_phasing = frame_analysis(tmpdir,GFF)
+
                     results[keys] = whole_phasing
                  ## get report
         make_report(options.html_file, options.dirout, kmer, results)