ribo_tools: kmer_analysis.py comparison

comparison kmer_analysis.py @ 10:707807fee542

(none)

author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	da126b91f9ea
children	7c944fd9907e

comparison

equal deleted inserted replaced

-:d7739f797a26
+:707807fee542
 #from matplotlib import pyplot as pl
 import matplotlib
 matplotlib.use('Agg')
 import matplotlib.pyplot as pl
 from numpy import arange
+from collections import OrderedDict
 import ribo_functions
-from collections import OrderedDict
 #import cPickle
+## suppress matplotlib warnings
+import warnings
+warnings.filterwarnings('ignore')
 total_mapped_read = 0
 def stop_err(msg):
     '''
     Report a fatal error on stderr and terminate the program.

     msg : text describing the error; it is written followed by a newline,
           then a second line gives the local time of the abort.

     Raises SystemExit with status 1, so the calling process can see that
     the run failed.
     '''
     sys.stderr.write("%s\n" % msg)
     sys.stderr.write("Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
     # a bare sys.exit() exits with status 0, which reports success
     sys.exit(1)
def split_bam(bamfile, tmpdir):
    '''
    Split a bam file by chromosome and write one sam file per chromosome in tmpdir.

    For every reference sequence named by an @SQ header line, the file
    <tmpdir>/<name>.sam is written. It holds header[0], the @SQ line of
    that reference, header[-2] (the last header line when the samtools
    output ends with a newline), and then every alignment returned by
    "samtools view <bamfile> <name>".

    bamfile : path of the bam file. It is queried by region, so it
              presumably has to be indexed beforehand - confirm with the
              caller.
    tmpdir  : directory that receives the sam files.

    Returns nothing. The total number of alignment lines is written on
    stdout. Any exception raised while splitting is reported through
    stop_err().
    '''
    try:
        # get header
        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
        header = results.split('\n')
        # define genome : reference names in header order, each one mapped
        # to its own @SQ line
        genome = []
        sq_line = {}
        for line in header:
            # only @SQ lines describe a reference; "SN" may appear elsewhere
            # in the header (for instance inside a @PG command line)
            if not line.startswith('@SQ'):
                continue
            for field in line.split('\t')[1:]:
                if field.startswith('SN:'):
                    name = field[len('SN:'):]
                    if name not in sq_line:
                        genome.append(name)
                        sq_line[name] = line
                    break
        # split sam by chrom
        n = 0
        for chrm in genome:
            with open(tmpdir + '/' + chrm + '.sam', 'w') as f:
                # write header correctly for each chromosome
                f.write(header[0] + '\n')
                # exact lookup by name : a pattern search for chrm + tab
                # would also match longer names ending the same way
                # ("V" in "IV", "1" in "11")
                f.write(sq_line[chrm] + '\n')
                f.write(header[-2] + '\n')
                # write all reads for each chromosome
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # calculate number of reads : one alignment per line.
                # Counting occurrences of the name would also count it
                # inside read names, tags or plain digits.
                n += reads.count('\n')
        sys.stdout.write("%d reads are presents in your bam file\n" % n)
    except Exception as e:
        stop_err('Error during bam file splitting : ' + str(e))
 def get_first_base(tmpdir, kmer):
 '''
 write footprint coverage file for each sam file in tmpdir and get kmer distribution
 '''
 global total_mapped_read
 ##get chromosome name
 chrom = samfile.split(".sam")[0]
 for line in sam:
 #initialize dictionary
-if '@SQ' in line :
+if '@SQ\tSN:' in line :
 size = int(line.split('LN:')[1])
 genomeF = [0]*size
 genomeR = [0]*size
 # define multiple reads keys from mapper
-elif '@PG' in line :
+elif '@PG\tID' in line :
 if 'bowtie' in line:
 multi_tag = "XS:i:"
 elif 'bwa' in  line:
 multi_tag = "XT:A:R"
+#elif 'TopHat' in  line:
+#    multi_tag = "NH:i:1"
 else :
-stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")
+stop_err("No PG tag find in "+samfile+". Please use bowtie or bwa for mapping")
 # get footprint
 elif re.search('^[^@].+', line) :
 len_read = len(line.split('\t')[9])
 ##full kmer dict
 chrom = "" # initializing chromosome
 nb_gene = 0 # number of analysed genes
 whole_phasing = [0,0,0]
 for gene in GFF['order']:
 ## the GTF file may give no start position, so we check and fill it in from the exons
+exon_number = GFF[gene]['exon_number']
 try : GFF[gene]['start']
 except :
 if GFF[gene]['strand'] == '+' :
 GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
 else :
-GFF[gene]['start'] = GFF[gene]['exon'][1]['stop']
+GFF[gene]['start'] = GFF[gene]['exon'][exon_number]['stop']
 ## also for stop coordinates
 try : GFF[gene]['stop']
 except :
-exon_number = GFF[gene]['exon_number']
 if GFF[gene]['strand'] == '+' :
 GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
 else :
-GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['start']
+GFF[gene]['stop'] = GFF[gene]['exon'][1]['start']
 cov = []
 ##first chromosome : we open corresponding file
-if chrom == "" :
+try:
-chrom = GFF[gene]['chrom']
+if chrom == "" :
-with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+chrom = GFF[gene]['chrom']
-data = f.readlines()
+with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
-##if we change chrosomosome
+data = f.readlines()
-elif chrom != GFF[gene]['chrom'] :
+##if we change chromosome
-chrom = GFF[gene]['chrom']
+elif chrom != GFF[gene]['chrom'] :
-with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+chrom = GFF[gene]['chrom']
-data = f.readlines()
+with open(tmpdir+"/assoCov_"+chrom+".txt") as f :
+data = f.readlines()
+except IOError :
+print tmpdir+"/assoCov_"+chrom+".txt doesn't exist"
 ## if a gene without intron :
 if GFF[gene]['exon_number'] == 1:
 ## get coverage for each gene
 if GFF[gene]['strand'] == "+":
 ## For each gene, get coverage and sum of exon size
 if GFF[gene]['strand'] == "+":
 for exon in range(1,GFF[gene]['exon_number']+1) :
 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-if i <= GFF[gene]['stop'] :
+#if i <= GFF[gene]['stop'] :
 cov.append(int((data[i].rstrip()).split("\t")[0]))
 else :
 for exon in range(1,GFF[gene]['exon_number']+1) :
 for i in range(GFF[gene]['exon'][exon]['start'],GFF[gene]['exon'][exon]['stop']+1):
-if i <= GFF[gene]['start'] :
+#if i <= GFF[gene]['start'] :
 cov.append(int(((data[i].rstrip()).split("\t")[1]).replace("-","")))
 cov.reverse()
 len_cov = len(cov)
 prop = [0,0,0]
 for nuc in range(0,len_cov-2,3) :
 pl.xlabel('kmer value', **axis_font)
 pl.ylabel('Number of reads', **axis_font)
 pl.title('Number of reads for each k-mer')
 pl.xticks(index + bar_width, label, **axis_font)
 #pl.show()
+fig.subplots_adjust()
 pl.savefig(dirout+"/kmer_proportion.png", format='png', dpi=640)
 pl.clf()
 for key, phase in results.iteritems() :
 fig = pl.figure(num=1)
 pl.bar(index,frame,color=['RoyalBlue','LightSkyBlue','LightBlue'])
 pl.xlabel('Frame in gene', **axis_font)
 pl.ylabel('Percent of read', **axis_font)
 pl.title('Proportion of reads in each frame for '+str(key)+'-mer')
 pl.xticks(index+bar_width, ('1', '2', '3'), **axis_font)
-pl.tight_layout()
+#pl.tight_layout()
 pl.ylim(0,100)
+fig.subplots_adjust()
 pl.draw()
-#pl.show()
+pl.show()
 pl.savefig(dirout+"/"+str(key)+"_phasing.png", format='png', dpi=300)
 pl.clf()
 kmer_summary = ''
 kmer_sorted = OrderedDict(sorted(kmer.iteritems(), key=lambda x: x[0]))
 def __main__():
 #Parse command line options
 parser = optparse.OptionParser()
-parser.add_option("-g", "--gff", dest="gfffile", type= "string",
+parser.add_option("-g", "--gff", dest="gff", type= "string",
 help="GFF annotation file", metavar="FILE")
 parser.add_option("-b", "--bam", dest="bamfile", type= "string",
 help="Bam Ribo-Seq alignments ", metavar="FILE")
 cmd = "samtools index %s " % (options.bamfile)
 proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
 returncode = proc.wait()
 tmpdir = tempfile.mkdtemp()
-GFF = ribo_functions.store_gff(options.gfffile)
+## identify GFF or GTF format from 9th column
+with open (options.gff,"r") as gffile :
+for line in gffile :
+if '#' in line :
+## skip header
+gffile.next()
+elif 'gene_id' in line :
+## launch gtf reader :
+GFF = ribo_functions.store_gtf(options.gff)
+break
+elif 'ID=' in line :
+## launch gff reader
+GFF = ribo_functions.store_gff(options.gff)
+break
+else :
+stop_err( 'Please check your annotation file is in correct format, GFF or GTF' )
+#GFF = store_gff(options.gff)
+#GFF = ribo_functions.store_gtf(options.gff)
+## check gff reading
+if not GFF['order'] :
+stop_err( 'Incorrect GFF file' + str( e ) )
 ## split bam
-ribo_functions.split_bam(options.bamfile,tmpdir)
+split_bam(options.bamfile,tmpdir)
 ###################################
 ## First analysis with 28mer :
 ###################################
 ## compute coverage and distribution kmer
 kmer = get_first_base(tmpdir, 28)
 if kmer[keys] > 100 :
 ## compute coverage and distribution kmer
 tmp = get_first_base(tmpdir, keys)
 ## compute phasing
 whole_phasing = frame_analysis(tmpdir,GFF)
 results[keys] = whole_phasing
 ## get report
 make_report(options.html_file, options.dirout, kmer, results)
 #=======================================================================
 # ############

Mercurial > repos > rlegendre > ribo_tools

comparison kmer_analysis.py @ 10:707807fee542