changeset 9:d7739f797a26
Uploaded
author | rlegendre
---|---
date | Tue, 09 Dec 2014 09:43:07 -0500
parents | adc01e560eae
children | 707807fee542
files | ribo_functions.py
diffstat | 1 files changed, 296 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ribo_functions.py	Tue Dec 09 09:43:07 2014 -0500
@@ -0,0 +1,296 @@
+#!/usr/bin/env python2.7
+
+'''
+    Created on Jan. 2014
+    @author: rachel legendre
+    @copyright: rachel.legendre@igmors.u-psud.fr
+    @license: GPL v3
+'''
+
+import sys, subprocess, re, commands, time, urllib, tempfile
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.stderr.write( "Program aborted at %s\n" % time.asctime(time.localtime(time.time())))
+    sys.exit()
+
+def split_bam(bamfile, tmpdir):
+    '''
+    split the bam file by chromosome and write one sam file per chromosome in tmpdir
+    '''
+    try:
+        #get header
+        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
+        header = results.split('\n')
+
+        #get chromosome names from the @SQ lines
+        genome = []
+        for line in header:
+            result = re.search('SN', line)
+            if result :
+                feat = line.split('\t')
+                chrom = re.split(":", feat[1])
+                genome.append(chrom[1])
+
+        #split sam by chromosome
+        n = 0
+        for chrm in genome:
+            with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
+                #write a minimal header for each chromosome
+                f.write(header[0]+'\n')
+                expr = re.compile(chrm+'\t')
+                el = [elem for elem in header if expr.search(elem)][0]
+                f.write(el+'\n')
+                f.write(header[-2]+'\n')
+                #write all reads of this chromosome
+                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
+                f.write(reads)
+                #count reads
+                n += reads.count(chrm)
+
+        sys.stdout.write("%d reads are present in your bam file\n" % n)
+
+    except Exception, e:
+        stop_err( 'Error during bam file splitting : ' + str( e ) )
+
+
+def get_first_base(tmpdir, kmer):
+    '''
+    write a footprint coverage file for each sam file in tmpdir
+    '''
+    global total_mapped_read
+    try:
+        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
+        ##write coverage for each sam file in tmpdir
+        for samfile in file_array:
+            with open(tmpdir+'/'+samfile, 'r') as sam :
+                ##get chromosome name
+                chrom = samfile.split(".")[0]
+
+                for line in sam:
+                    #initialize forward and reverse coverage arrays from the @SQ line
+                    if re.search('@SQ', line) :
+                        size = int(line.split('LN:')[1])
+                        genomeF = [0]*size
+                        genomeR = [0]*size
+                    #define the multiple-alignment tag according to the mapper
+                    elif re.search('@PG', line) :
+                        if re.search('bowtie', line):
+                            multi_tag = "XS:i:"
+                        elif re.search('bwa', line):
+                            multi_tag = "XT:A:R"
+                        else :
+                            stop_err("No PG tag found in "+samfile+". Please use bowtie or bwa for mapping")
+
+                    #get footprint
+                    elif re.search('^[^@].+', line) :
+                        read_pos = int(line.split('\t')[3])
+                        read_sens = int(line.split('\t')[1])
+                        #keep uniquely mapped reads of the requested length
+                        if line.split('\t')[5] == kmer+'M' and multi_tag not in line:
+                            total_mapped_read += 1
+                            #forward read
+                            if read_sens == 0 :
+                                #get P-site : read start + 12 nt
+                                read_pos += 12
+                                genomeF[read_pos] += 1
+                            #reverse read
+                            elif read_sens == 16 :
+                                #get P-site
+                                read_pos += 15
+                                genomeR[read_pos] += 1
+
+            #write coverage file for this chromosome
+            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
+                for i in range(0, size):
+                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
+
+        sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)
+    except Exception, e:
+        stop_err( 'Error during footprinting : ' + str( e ) )
+
+def store_gff(gff):
+    '''
+    parse a gff file and store it in a dictionary
+    '''
+    try:
+        GFF = {}
+        with open(gff, 'r') as f_gff :
+            GFF['order'] = []
+            for line in f_gff:
+                ##strip comments
+                line = line.split("#")[0]
+                if line != "" :
+                    feature = (line.split('\t')[8]).split(';')
+                    #gene line
+                    if line.split('\t')[2] == 'gene' :
+                        gene = feature[0].replace("ID=","")
+                        if re.search('gene', feature[2]) :
+                            Name = feature[2].replace("gene=","")
+                        else :
+                            Name = "Unknown"
+                        ##get annotation
+                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
+                        note = urllib.unquote(str(note)).replace("\n","")
+                        ##store gene information
+                        GFF['order'].append(gene)
+                        GFF[gene] = {}
+                        GFF[gene]['chrom'] = line.split('\t')[0]
+                        GFF[gene]['start'] = int(line.split('\t')[3])
+                        GFF[gene]['stop'] = int(line.split('\t')[4])
+                        GFF[gene]['strand'] = line.split('\t')[6]
+                        GFF[gene]['name'] = Name
+                        GFF[gene]['note'] = note
+                        GFF[gene]['exon'] = {}
+                        GFF[gene]['exon_number'] = 0
+                    elif line.split('\t')[2] == 'CDS' :
+                        gene = re.sub(r".?Parent\=(.+)(_mRNA)+", r"\1", feature[0])
+                        if GFF.has_key(gene) :
+                            GFF[gene]['exon_number'] += 1
+                            exon_number = GFF[gene]['exon_number']
+                            GFF[gene]['exon'][exon_number] = {}
+                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
+                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
+                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
+
+                    ##if there is a five prime UTR intron, change the gene start
+                    elif line.split('\t')[2] == 'five_prime_UTR_intron' :
+                        if GFF[gene]['strand'] == "+" :
+                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
+                        else :
+                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
+        return GFF
+    except Exception,e:
+        stop_err( 'Error during gff storage : ' + str( e ) )
+
+
+#chrI	SGD	gene	87286	87752	.	+	.	ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
+#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
+#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
+#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
+#chrI	SGD	CDS	87286	87387	.	+	0	Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
+#chrI	SGD	CDS	87501	87752	.	+	0	Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
+#chrI	SGD	intron	87388	87500	.	+	.	Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
+
+def store_gtf(gff):
+    '''
+    parse a gtf file and store it in a dictionary (DEPRECATED)
+    '''
+    try:
+        GFF = {}
+        with open(gff, 'r') as f_gff :
+            GFF['order'] = []
+            for line in f_gff:
+                ##strip comments
+                line = line.split("#")[0]
+                if line != "" :
+                    #protein coding line
+                    if line.split('\t')[1] == 'protein_coding' :
+                        ##get name
+                        gene = re.sub(r".+ transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
+                        Name = re.sub(r".+ transcript_name \"([\w|-]+)\";.*", r"\1", line).rstrip()
+                        if line.split('\t')[2] == 'exon' :
+                            ##first time we see this gene
+                            if gene not in GFF.keys() :
+                                ##store gene information
+                                GFF['order'].append(gene)
+                                GFF[gene] = {}
+                                GFF[gene]['chrom'] = line.split('\t')[0]
+                                GFF[gene]['strand'] = line.split('\t')[6]
+                                GFF[gene]['name'] = Name
+                                GFF[gene]['exon_number'] = 1
+                                GFF[gene]['exon'] = {}
+                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1", line).rstrip())
+                                GFF[gene]['exon'][exon_number] = {}
+                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
+                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
+                            else :
+                                ##add exon
+                                exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1", line).rstrip())
+                                GFF[gene]['exon_number'] = exon_number
+                                GFF[gene]['exon'][exon_number] = {}
+                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
+                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
+                        elif line.split('\t')[2] == 'CDS' :
+                            exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1", line).rstrip())
+                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
+                        elif line.split('\t')[2] == 'start_codon' :
+                            if GFF[gene]['strand'] == '-' :
+                                GFF[gene]['start'] = int(line.split('\t')[4])
+                            else :
+                                GFF[gene]['start'] = int(line.split('\t')[3])
+                        elif line.split('\t')[2] == 'stop_codon' :
+                            if GFF[gene]['strand'] == '-' :
+                                GFF[gene]['stop'] = int(line.split('\t')[3])
+                            else :
+                                GFF[gene]['stop'] = int(line.split('\t')[4])
+
+        return GFF
+    except Exception,e:
+        stop_err( 'Error during gtf storage : ' + str( e ) )
+
+
+##IV	protein_coding	exon	307766	307789	.	-	.	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
+## exon_id "YDL083C.1";
+##IV	protein_coding	CDS	307766	307789	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
+## protein_id "YDL083C";
+##IV	protein_coding	start_codon	307787	307789	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
+##RPS16B";
+##IV	protein_coding	exon	306926	307333	.	-	.	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
+## exon_id "YDL083C.2";
+##IV	protein_coding	CDS	306929	307333	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
+## protein_id "YDL083C";
+##IV	protein_coding	stop_codon	306926	306928	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
+##RPS16B";
+
+
+def cleaning_bam(bam):
+    '''
+    Keep uniquely mapped reads with length between 26 and 31 nt and mapping quality above 12; unmapped and multi-mapped reads are removed
+    '''
+    try :
+        header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr=subprocess.PIPE)
+
+        #check mapper to define the multiple-alignment tag
+        if re.search('bowtie', header):
+            multi_tag = "XS:i:"
+        elif re.search('bwa', header):
+            multi_tag = "XT:A:R"
+        else :
+            stop_err("No PG tag found in "+bam+". Please use bowtie or bwa for mapping")
+
+        tmp_sam = tempfile.mktemp()
+        cmd = "samtools view %s > %s" % (bam, tmp_sam)
+        proc = subprocess.Popen( args=cmd, shell=True, stderr=subprocess.PIPE)
+        returncode = proc.wait()
+
+        with open(tempfile.mktemp(), 'w') as out :
+            out.write(header)
+            with open(tmp_sam, 'r') as sam :
+                for line in sam :
+                    if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
+                        if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
+                            out.write(line)
+        bamfile = tempfile.mktemp()+'.bam'
+        cmd = "samtools view -hSb %s > %s" % (out.name, bamfile)
+        proc = subprocess.Popen( args=cmd, shell=True, stderr=subprocess.PIPE)
+        returncode = proc.wait()
+
+        return bamfile
+
+    except Exception,e:
+        stop_err( 'Error during bam cleaning : ' + str( e ) )
\ No newline at end of file