Mercurial repository rlegendre/ribo_tools: ribo_functions.py @ 10:707807fee542
author   | rlegendre
date     | Thu, 22 Jan 2015 14:34:38 +0100
parents  | d7739f797a26
children | 7c944fd9907e
#!/usr/bin/env python2.7
'''
Created on Jan. 2014
@author: rachel legendre
@copyright: rachel.legendre@igmors.u-psud.fr
@license: GPL v3
'''

import sys, subprocess, re, commands, time, urllib, tempfile  # tempfile is required by cleaning_bam()
from copy import copy

# module-level counter used by get_first_base(); callers may reset it before a run
total_mapped_read = 0


def stop_err(msg):
    sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Program aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()


def split_bam(bamfile, tmpdir):
    '''
    Split a BAM file by chromosome and write one SAM file per chromosome in tmpdir.
    '''
    try:
        # get header
        results = subprocess.check_output(['samtools', 'view', '-H', bamfile])
        header = results.split('\n')

        # collect chromosome names from the @SQ header lines
        genome = []
        for line in header:
            result = re.search('SN', line)
            if result:
                feat = line.split('\t')
                chrom = re.split(":", feat[1])
                genome.append(chrom[1])

        # split reads by chromosome
        n = 0
        for chrm in genome:
            with open(tmpdir+'/'+chrm+'.sam', 'w') as f:
                # write a header restricted to this chromosome
                f.write(header[0]+'\n')
                expr = re.compile(chrm+'\t')
                el = [elem for elem in header if expr.search(elem)][0]
                f.write(el+'\n')
                # last header line (typically the @PG line)
                f.write(header[-2]+'\n')
                # write all reads of this chromosome
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # count the reads
                n += reads.count(chrm)

        sys.stdout.write("%d reads are present in your bam file\n" % n)

    except Exception, e:
        stop_err('Error during bam file splitting : ' + str(e))
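
# Usage sketch (an assumption, not part of the original tool): split an alignment by
# chromosome, then compute footprint coverage with get_first_base() defined below.
# The BAM file name is hypothetical and samtools must be available on the PATH.
#
#   import tempfile
#   tmp = tempfile.mkdtemp()
#   split_bam('footprints.bam', tmp)      # writes one <chrom>.sam file per chromosome
#   get_first_base(tmp, '28')             # writes assoCov_<chrom>.txt coverage files
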
def get_first_base(tmpdir, kmer):
    '''
    Write a footprint coverage file for each SAM file in tmpdir.
    Only uniquely mapped reads whose CIGAR string is exactly <kmer>M are used.
    '''
    global total_mapped_read
    try:
        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
        ## write coverage for each sam file in tmpdir
        for samfile in file_array:
            with open(tmpdir+'/'+samfile, 'r') as sam:
                ## get chromosome name
                chrom = samfile.split(".")[0]

                for line in sam:
                    # initialize forward/reverse coverage arrays from the @SQ header line
                    if re.search('@SQ', line):
                        size = int(line.split('LN:')[1])
                        genomeF = [0]*size
                        genomeR = [0]*size
                    # define the multiple-alignment tag according to the mapper (@PG line)
                    elif re.search('@PG', line):
                        if re.search('bowtie', line):
                            multi_tag = "XS:i:"
                        elif re.search('bwa', line):
                            multi_tag = "XT:A:R"
                        else:
                            stop_err("No @PG tag found in "+samfile+". Please use bowtie or bwa for mapping")
                    # get footprint
                    elif re.search('^[^@].+', line):
                        read_pos = int(line.split('\t')[3])
                        read_sens = int(line.split('\t')[1])
                        # keep only uniquely mapped reads of the requested length
                        if line.split('\t')[5] == kmer+'M' and multi_tag not in line:
                            total_mapped_read += 1
                            # forward read: P-site = read start + 12 nt
                            if read_sens == 0:
                                read_pos += 12
                                genomeF[read_pos] += 1
                            # reverse read: P-site = read start + 15 nt
                            elif read_sens == 16:
                                read_pos += 15
                                genomeR[read_pos] += 1

            # write coverage files (forward counts, then reverse counts as negative values)
            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov:
                for i in range(0, size):
                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')

        sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)
    except Exception, e:
        stop_err('Error during footprinting : ' + str(e))


def store_gff(gff):
    '''
    Parse a GFF3 file and store the gene annotation in a dictionary.
    '''
    try:
        GFF = {}
        with open(gff, 'r') as f_gff:
            GFF['order'] = []
            for line in f_gff:
                ## drop comments
                line = line.split("#")[0]
                if line != "":
                    feature = (line.split('\t')[8]).split(';')
                    # gene line
                    if line.split('\t')[2] == 'gene':
                        gene = feature[0].replace("ID=", "")
                        if re.search('gene', feature[2]):
                            Name = feature[2].replace("gene=", "")
                        else:
                            Name = "Unknown"
                        ## get annotation
                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
                        note = urllib.unquote(str(note)).replace("\n", "")
                        ## store gene information
                        GFF['order'].append(gene)
                        GFF[gene] = {}
                        GFF[gene]['chrom'] = line.split('\t')[0]
                        GFF[gene]['start'] = int(line.split('\t')[3])
                        GFF[gene]['stop'] = int(line.split('\t')[4])
                        GFF[gene]['strand'] = line.split('\t')[6]
                        GFF[gene]['name'] = Name
                        GFF[gene]['note'] = note
                        GFF[gene]['exon'] = {}
                        GFF[gene]['exon_number'] = 0
                    # CDS line: add an exon to the current gene
                    elif line.split('\t')[2] == 'CDS':
                        gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
                        if gene in GFF:
                            GFF[gene]['exon_number'] += 1
                            exon_number = GFF[gene]['exon_number']
                            GFF[gene]['exon'][exon_number] = {}
                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                    ## if there is a five prime UTR intron, adjust the gene boundaries
                    elif line.split('\t')[2] == 'five_prime_UTR_intron':
                        if GFF[gene]['strand'] == "+":
                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                        else:
                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
        return GFF
    except Exception, e:
        stop_err('Error during gff storage : ' + str(e))

# Example of the expected GFF3 input (SGD annotation):
# chrI	SGD	gene	87286	87752	.	+	.	ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:0006897,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membrane%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20from%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
# chrI	SGD	CDS	87286	87387	.	+	0	Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
# chrI	SGD	CDS	87501	87752	.	+	0	Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
# chrI	SGD	intron	87388	87500	.	+	.	Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified
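
# Usage sketch (an assumption): the dictionary returned by store_gff() is keyed by gene
# ID, and GFF['order'] keeps the order in which genes were read.  The file path is
# hypothetical.
#
#   GFF = store_gff('saccharomyces_cerevisiae.gff')
#   for gene in GFF['order']:
#       print gene, GFF[gene]['chrom'], GFF[gene]['strand'], \
#             GFF[gene]['start'], GFF[gene]['stop'], GFF[gene]['exon_number']
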
def store_gtf(gff):
    '''
    Parse a GTF file and store the gene annotation in a dictionary.
    '''
    try:
        GFF = {}
        with open(gff, 'r') as f_gff:
            GFF['order'] = []
            for line in f_gff:
                ## drop comments
                line = line.split("#")[0]
                if line != "":
                    if 'protein_coding' in line:
                        ## get transcript and gene names
                        gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
                        Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
                        if line.split('\t')[2] == 'CDS':
                            ## first time we see this gene
                            if gene not in GFF.keys():
                                ## store gene information
                                GFF['order'].append(gene)
                                GFF[gene] = {}
                                GFF[gene]['chrom'] = line.split('\t')[0]
                                GFF[gene]['strand'] = line.split('\t')[6]
                                GFF[gene]['start'] = int(line.split('\t')[3])
                                GFF[gene]['stop'] = int(line.split('\t')[4])
                                GFF[gene]['name'] = Name
                                GFF[gene]['note'] = ""
                                GFF[gene]['exon_number'] = 1
                                GFF[gene]['exon'] = {}
                                ## exons are counted from the CDS lines because some exons are non-coding
                                exon_number = 1
                                GFF[gene]['exon'][exon_number] = {}
                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                            else:
                                ## add a new exon to a known gene
                                exon_number += 1
                                GFF[gene]['exon_number'] = exon_number
                                GFF[gene]['exon'][exon_number] = {}
                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                        elif line.split('\t')[2] == 'start_codon':
                            if GFF[gene]['strand'] == '-':
                                GFF[gene]['stop'] = int(line.split('\t')[4])
                            else:
                                GFF[gene]['start'] = int(line.split('\t')[3])
                        elif line.split('\t')[2] == 'stop_codon':
                            if GFF[gene]['strand'] == '-':
                                GFF[gene]['start'] = int(line.split('\t')[3])
                            else:
                                GFF[gene]['stop'] = int(line.split('\t')[4])
        return __reverse_coordinates__(GFF)
    except Exception, e:
        stop_err('Error during gtf storage : ' + str(e))

# Example of the expected GTF input:
# IV	protein_coding	exon	307766	307789	.	-	.	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; exon_id "YDL083C.1";
# IV	protein_coding	CDS	307766	307789	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; protein_id "YDL083C";
# IV	protein_coding	start_codon	307787	307789	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
# IV	protein_coding	exon	306926	307333	.	-	.	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; exon_id "YDL083C.2";
# IV	protein_coding	CDS	306929	307333	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B"; protein_id "YDL083C";
# IV	protein_coding	stop_codon	306926	306928	.	-	0	gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
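
# Usage sketch (an assumption): store_gtf() returns the same dictionary layout as
# store_gff(); __reverse_coordinates__() (defined below) renumbers the exons of
# minus-strand genes so that exon 1 is the 5' exon.  The file path is hypothetical.
#
#   GFF = store_gtf('saccharomyces_cerevisiae.gtf')
#   exon1 = GFF['YDL083C']['exon'][1]
#   print exon1['start'], exon1['stop'], exon1['frame']
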
def __reverse_coordinates__(GFF):
    '''
    For minus-strand genes, renumber exons so that exon 1 is the 5' exon.
    '''
    for gene in GFF['order']:
        ## reverse-strand genes only
        if GFF[gene]['strand'] == "-":
            ## if the gene has several exons and its stop matches the first (and not the last) exon,
            ## the exons were stored 3' to 5': reverse their order
            if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1:
                tmp = copy(GFF[gene]['exon'])
                exon_number = GFF[gene]['exon_number']
                rev_index = exon_number+1
                for z in range(1, exon_number+1):
                    rev_index -= 1
                    GFF[gene]['exon'][z] = tmp[rev_index]
                ## make the first exon start at the gene start
                if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
                    GFF[gene]['exon'][1]['start'] = GFF[gene]['start']
    return GFF


def cleaning_bam(bam):
    '''
    Remove unmapped reads, non-uniquely mapped reads, reads with a mapping quality of 12
    or below, and reads shorter than 26 nt or longer than 31 nt. Returns the path of the
    filtered BAM file.
    '''
    try:
        header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr=subprocess.PIPE)
        # identify the mapper from the header to define the multiple-alignment tag
        if re.search('bowtie', header):
            multi_tag = "XS:i:"
        elif re.search('bwa', header):
            multi_tag = "XT:A:R"
        else:
            stop_err("No @PG tag found in "+bam+". Please use bowtie or bwa for mapping")

        # dump the alignment to a temporary SAM file
        tmp_sam = tempfile.mktemp()
        cmd = "samtools view %s > %s" % (bam, tmp_sam)
        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
        returncode = proc.wait()

        # write the filtered SAM file
        with open(tempfile.mktemp(), 'w') as out:
            out.write(header)
            with open(tmp_sam, 'r') as sam:
                for line in sam:
                    if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12:
                        if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25:
                            out.write(line)

        # convert the filtered SAM file back to BAM
        bamfile = tempfile.mktemp()+'.bam'
        cmd = "samtools view -hSb %s > %s" % (out.name, bamfile)
        proc = subprocess.Popen(args=cmd, shell=True, stderr=subprocess.PIPE)
        returncode = proc.wait()

        return bamfile
    except Exception, e:
        stop_err('Error during cleaning bam : ' + str(e))
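
# Usage sketch (an assumption): filter an alignment before splitting it by chromosome.
# The input file name is hypothetical; samtools must be available on the PATH, and the
# returned file lives in the system temporary directory.
#
#   clean = cleaning_bam('footprints.bam')
#   print 'filtered alignment written to', clean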