Mercurial > repos > rlegendre > ribo_tools
changeset 17:c87c40e642af draft
Uploaded
author | rlegendre |
---|---|
date | Fri, 29 May 2015 09:17:29 -0400 |
parents | fcfdb2607cb8 |
children | a121cce43f90 |
files | get_codon_frequency.py metagene_readthrough.py ribo_functions.py |
diffstat | 3 files changed, 106 insertions(+), 98 deletions(-) [+] |
line wrap: on
line diff
--- a/get_codon_frequency.py Wed May 13 04:34:59 2015 -0400 +++ b/get_codon_frequency.py Fri May 29 09:17:29 2015 -0400 @@ -23,7 +23,7 @@ from matplotlib import font_manager from matplotlib import colors import csv -from scipy import stats +from scipy import stats, errstate from collections import OrderedDict import ribo_functions import HTSeq @@ -60,76 +60,65 @@ chrom = feature.iv.chrom start = feature.iv.start stop = feature.iv.end - if start+50 < stop-50 : + if start+50 < stop-50 : region = chrom + ':' + str(start+50) + '-' + str(stop-50) - else : - break - - ## DEPRECATED - #for chrom in GFF.iterkeys(): - #for gene in GFF[chrom] : - # codon_dict = init_codon_dict() - #start = GFF[chrom][gene]['start'] - #print start - #stop = GFF[chrom][gene]['stop'] - #print stop - #region = chrom + ':' + str(start) + '-' + str(stop) - ####### - # #get all reads in this gene - reads = subprocess.check_output(["samtools", "view", bamfile, region]) - head = subprocess.check_output(["samtools", "view", "-H", bamfile]) - read_tab = reads.split('\n') - for read in read_tab: - # # search mapper for eliminate multiple alignements - if 'bowtie' in head: - multi_tag = "XS:i:" - elif 'bwa' in head: - multi_tag = "XT:A:R" - elif 'TopHat' in head: - tag = "NH:i:1" - else : - stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") - - if len(read) == 0: - continue - len_read = len(read.split('\t')[9]) - # if it's read of good length - if len_read == kmer and (tag in read or multi_tag not in read): - feat = read.split('\t') - seq = feat[9] - # if it's a reverse read - if feat[1] == '16' : - if site == "A" : - # #get A-site - cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) - elif site == "P" : - # #get P-site - cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) - else : - # #get site-E - cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) - # # test if it's a true codon not a CNG codon for example - if codon_dict.has_key(cod) : - codon_dict[cod] += 1 - # if it's a forward read - elif feat[1] == '0' : - if site == "A" : - # #get A-site - cod = seq[a_pos:a_pos+3] - elif site == "P" : - # #get P-site - cod = seq[a_pos-3:a_pos] - else : - # #get site-E - cod = seq[a_pos-6:a_pos-3] - if codon_dict.has_key(cod) : - codon_dict[cod] += 1 - del(read) + # #get all reads in this gene + reads = subprocess.check_output(["samtools", "view", bamfile, region]) + head = subprocess.check_output(["samtools", "view", "-H", bamfile]) + read_tab = reads.split('\n') + for read in read_tab: + # # search mapper for eliminate multiple alignements + if 'bowtie' in head: + multi_tag = "XS:i:" + elif 'bwa' in head: + multi_tag = "XT:A:R" + elif 'TopHat' in head: + tag = "NH:i:1" + else : + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") + + if len(read) == 0: + continue + len_read = len(read.split('\t')[9]) + # if it's read of good length + if len_read == kmer and (tag in read or multi_tag not in read): + feat = read.split('\t') + seq = feat[9] + # if it's a reverse read + if feat[1] == '16' : + if site == "A" : + # #get A-site + cod = str(Seq(seq[a_pos-5:a_pos-2]).reverse_complement()) + elif site == "P" : + # #get P-site + cod = str(Seq(seq[a_pos-2:a_pos+1]).reverse_complement()) + else : + # #get site-E + cod = str(Seq(seq[a_pos+1:a_pos+4]).reverse_complement()) + # # test if it's a true codon not a CNG codon for example + if codon_dict.has_key(cod) : + codon_dict[cod] += 1 + # if it's a forward read + elif feat[1] == '0' : + if site == "A" : + # #get A-site + cod = seq[a_pos:a_pos+3] + elif site == "P" : + # #get P-site + cod = seq[a_pos-3:a_pos] + else : + # #get site-E + cod = seq[a_pos-6:a_pos-3] + if codon_dict.has_key(cod) : + codon_dict[cod] += 1 + del(read) # # add in global dict for cod, count in codon_dict.iteritems() : codon[cod] += count - - return codon + if sum(codon.values()) == 0 : + stop_err('There are no reads aligning on annotated genes in your GFF file') + else : + return codon except Exception, e: stop_err('Error during codon usage calcul: ' + str(e)) @@ -317,12 +306,12 @@ std_cond2 = [] max_val = [] # # max value for graph for i in codon_sorted: - # # cond1 = moyenne of replicats cond1 divided by max + # # cond1 = mean of replicats cond1 divided by max cond1_val[i] = ((cond1_1[i] / sum11 + cond1_2[i] / sum12) / 2) cond1[i] = ((cond1_1[i] + cond1_2[i]) / 2) - # # standard deviation = absolute value of diffence between replicats of cond1 + # # standard deviation = absolute value of difference between replicats of cond1 std_cond1.append(std(array([(cond1_1[i] * 100 / sum11), (cond1_2[i] * 100 / sum12)]))) - # # cond2 = moyenne of replicats cond1divided by max + # # cond2 = mean of replicats cond1divided by max cond2_val[i] = ((cond2_1[i] / sum21 + cond2_2[i] / sum22) / 2) cond2[i] = ((cond2_1[i] + cond2_2[i]) / 2) # # standard deviation = absolute value of difference between replicats of cond2 @@ -388,7 +377,8 @@ out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t0.0\n') else: out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') - chi = stats.chisquare(observed, expected) + with errstate(all='ignore'): + chi = stats.chisquare(observed, expected) out.write('Khi2 test\n') out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') @@ -435,7 +425,7 @@ cond1_norm.update ((x, (y / sum1) * 100.0) for x, y in cond1_norm.items()) cond2_norm.update((x, (y / sum2) * 100.0) for x, y in cond2_norm.items()) except ZeroDivisionError: - stop_err("Not enough reads to compute the codon occupancy") + stop_err("Not enough reads to compute the codon occupancy. "+str(sum1)+" and "+str(sum2)+" reads are used for each condition, respectively.\n") # # compute theorical count in COND2 cond2_count = [] @@ -497,7 +487,8 @@ else: out.write(i + '\t' + str(cond1[i]) + '\t' + str(cond2[i]) + '\t' + str(cond1_norm[i]) + '\t' + str(cond2_norm[i]) + '\t' + str(cond2_norm[i] / cond1_norm[i]) + '\n') out.write('Khi2 test\n') - chi = stats.chisquare(observed, expected) + with errstate(all='ignore'): + chi = stats.chisquare(observed, expected) out.write('T : ' + str(chi[0]) + '; p-value : ' + str(chi[1]) + '\n') # # get max value for each codon for histogram @@ -603,7 +594,10 @@ fc = cond1.copy() for key, value in fc.iteritems(): - fc[key] = cond2[key]/cond1[key] + if cond1[key] == 0: + fc[key] = 1 + else: + fc[key] = cond2[key]/cond1[key] index = arange(len(fc.keys())) label = fc.keys() @@ -695,12 +689,11 @@ GFF = HTSeq.GFF_Reader(options.gff) # # get html file and directory : (html, html_dir) = options.dirout.split(',') - if os.path.exists(html_dir): - raise - try: - os.mkdir(html_dir) - except: - raise Exception(html_dir + ' mkdir') + if not os.path.exists(html_dir): + try: + os.mkdir(html_dir) + except Exception, e : + stop_err('Error running make directory : ' + str(e)) # #RUN analysis # #If there are replicats if options.rep == "yes" : @@ -724,6 +717,7 @@ check_index_bam (fh) result.append(get_codon_usage(fh, GFF, options.site, options.kmer,options.asite)) (cond1, cond2, chi_pval) = plot_codon_usage(result, html_dir, options.c1, options.c2, options.outfile,options.color1, options.color2) + # t_pval = compute_FC_plot(cond1,cond2,codons,html_dir) plot_fc (cond1, cond2, options.site, html_dir) else :
--- a/metagene_readthrough.py Wed May 13 04:34:59 2015 -0400 +++ b/metagene_readthrough.py Fri May 29 09:17:29 2015 -0400 @@ -945,12 +945,11 @@ try: (html_file, subfolder ) = options.dirout.split(",") - if os.path.exists(subfolder): - raise - try: - os.mkdir(subfolder) - except: - raise + if not os.path.exists(subfolder): + try: + os.mkdir(subfolder) + except Exception, e : + stop_err('Error running make directory : ' + str(e)) ## identify GFF or GTF format from 9th column with open (options.gff,"r") as gffile : for line in gffile :
--- a/ribo_functions.py Wed May 13 04:34:59 2015 -0400 +++ b/ribo_functions.py Fri May 29 09:17:29 2015 -0400 @@ -59,53 +59,65 @@ -def get_first_base(tmpdir,kmer): +def get_first_base(tmpdir, kmer, frame): ''' write footprint coverage file for each sam file in tmpdir ''' global total_mapped_read + ## tags by default + multi_tag = "XS:i:" + tag = "IH:i:1" + try: + ### compute position of P-site according to frame (P-site -> +15 starting from 5' end of footprint) + p_site_pos = 16-frame + file_array = (commands.getoutput('ls '+tmpdir)).split('\n') ##write coverage for each sam file in tmpdir for samfile in file_array: with open(tmpdir+'/'+samfile, 'r') as sam : ##get chromosome name - chrom = samfile.split(".")[0] + chrom = samfile.split(".sam")[0] for line in sam: #initialize dictionnary - if re.search('@SQ', line) : + if '@SQ' in line : size = int(line.split('LN:')[1]) genomeF = [0]*size genomeR = [0]*size # define multiple reads keys from mapper - elif re.search('@PG', line) : - if re.search('bowtie', line): + elif '@PG\tID' in line : + if 'bowtie' in line: multi_tag = "XS:i:" - elif re.search('bwa', line): + elif 'bwa' in line: multi_tag = "XT:A:R" + elif 'TopHat' in line: + tag = "NH:i:1" else : - stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping") + stop_err("No PG tag find in "+samfile+". Please use bowtie, bwa or Tophat for mapping") # get footprint elif re.search('^[^@].+', line) : #print line.rstrip() read_pos = int(line.split('\t')[3]) read_sens = int(line.split('\t')[1]) - #len_read = len(line.split('\t')[9]) - if line.split('\t')[5] == kmer+'M' and multi_tag not in line: + len_read = len(line.split('\t')[9]) + read_qual = int(line.split('\t')[4]) + if len_read == kmer and (tag in line or multi_tag not in line) and read_qual > 20 : ###if line.split('\t')[5] == '28M' : # print line.rstrip() total_mapped_read +=1 #if it's a forward read if read_sens == 0 : #get P-site : start read + 12 nt - read_pos += 12 + #read_pos += 15 + read_pos += p_site_pos genomeF[read_pos] += 1 #if it's a reverse read elif read_sens == 16 : #get P-site - read_pos += 15 + #read_pos += 12 + read_pos += (len_read-1) - p_site_pos genomeR[read_pos] += 1 #try: @@ -115,10 +127,13 @@ cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n') #except Exception,e: # stop_err( 'Error during coverage file writting : ' + str( e ) ) - + del genomeF + del genomeR sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read) except Exception, e: stop_err( 'Error during footprinting : ' + str( e ) ) + + def store_gff(gff): '''