Mercurial > repos > rlegendre > ribo_tools

#!/usr/bin/env python2.7

'''
    Created on Jan. 2014
    @author: rachel legendre
    @copyright:  rachel.legendre@igmors.u-psud.fr
    @license: GPL v3
'''

import sys, subprocess, re, commands, time, urllib
from copy import copy


def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.stderr.write( "Programme aborted at %s\n" % time.asctime(time.localtime(time.time())))
    sys.exit()

def split_bam(bamfile,tmpdir):
    '''
        split bam by chromosome and write sam file in tmpdir
    '''
    try:
        #get header
        results = subprocess.check_output(['samtools', 'view', '-H',bamfile])
        header = results.split('\n')

        #define genome size
        genome = []
        for line in header:
            result = re.search('SN', line)
            if result :
                #print line
                feat = line.split('\t')
                chrom = re.split(":", feat[1])
                #print feat[1]
                genome.append(chrom[1])

        #split sam by chrom
        n = 0
        for chrm in genome:
            with open(tmpdir+'/'+chrm+'.sam', 'w') as f :
                #write header correctly for each chromosome
                f.write(header[0]+'\n')
                expr = re.compile(chrm+'\t')
                el =[elem for elem in header if expr.search(elem)][0]
                f.write(el+'\n')
                f.write(header[-2]+'\n')
                #write all reads for each chromosome
                reads = subprocess.check_output(["samtools", "view", bamfile, chrm])
                f.write(reads)
                # calculate number of reads
                n += reads.count(chrm)

        sys.stdout.write("%d reads are presents in your bam file\n" % n)

    except Exception, e:
        stop_err( 'Error during bam file splitting : ' + str( e ) )


def get_first_base(tmpdir,kmer):
    '''
        write footprint coverage file for each sam file in tmpdir
    '''
    global total_mapped_read
    try:
        file_array = (commands.getoutput('ls '+tmpdir)).split('\n')
        ##write coverage for each sam file in tmpdir
        for samfile in file_array:
            with open(tmpdir+'/'+samfile, 'r') as sam :
                ##get chromosome name
                chrom = samfile.split(".")[0]

                for line in sam:
                    #initialize dictionnary
                    if re.search('@SQ', line) :
                        size = int(line.split('LN:')[1])
                        genomeF = [0]*size
                        genomeR = [0]*size
                    # define multiple reads keys from mapper
                    elif re.search('@PG', line) :
                        if re.search('bowtie', line):
                            multi_tag = "XS:i:"
                        elif re.search('bwa', line):
                            multi_tag = "XT:A:R"
                        else :
                            stop_err("No PG tag find in"+samfile+". Please use bowtie or bwa for mapping")

                    # get footprint
                    elif re.search('^[^@].+', line) :
                        #print line.rstrip()
                        read_pos = int(line.split('\t')[3])
                        read_sens = int(line.split('\t')[1])
                        #len_read = len(line.split('\t')[9])
                        if line.split('\t')[5] == kmer+'M' and multi_tag not in line:
                        ###if line.split('\t')[5] == '28M' :
                           # print line.rstrip()
                            total_mapped_read +=1
                            #if it's a forward read
                            if read_sens == 0 :
                                #get P-site : start read + 12 nt
                                read_pos += 12
                                genomeF[read_pos] += 1
                            #if it's a reverse read
                            elif read_sens == 16 :
                                #get P-site
                                read_pos += 15
                                genomeR[read_pos] += 1

            #try:
            #write coverage in files
            with open(tmpdir+'/assoCov_'+chrom+'.txt', 'w') as cov :
                for i in range(0,size):
                    cov.write(str(genomeF[i])+'\t-'+str(genomeR[i])+'\n')
            #except Exception,e:
            #    stop_err( 'Error during coverage file writting : ' + str( e ) )

        sys.stdout.write("%d reads are used for frame analysis\n" % total_mapped_read)
    except Exception, e:
        stop_err( 'Error during footprinting : ' + str( e ) )

def store_gff(gff):
    '''
        parse and store gff file in a dictionnary
    '''
    try:
        GFF = {}
        with open(gff, 'r') as f_gff :
            GFF['order'] = []
            for line in f_gff:
                ## switch commented lines
                line = line.split("#")[0]
                if line != "" :
                    feature = (line.split('\t')[8]).split(';')
                # first line is already gene line :
                    if line.split('\t')[2] == 'gene' :
                        gene = feature[0].replace("ID=","")
                        if re.search('gene',feature[2]) :
                            Name = feature[2].replace("gene=","")
                        else :
                            Name = "Unknown"
                        ##get annotation
                        note = re.sub(r".+\;Note\=(.+)\;display\=.+", r"\1", line)
                        note = urllib.unquote(str(note)).replace("\n","")
                        ## store gene information
                        GFF['order'].append(gene)
                        GFF[gene] = {}
                        GFF[gene]['chrom'] = line.split('\t')[0]
                        GFF[gene]['start'] = int(line.split('\t')[3])
                        GFF[gene]['stop'] = int(line.split('\t')[4])
                        GFF[gene]['strand'] = line.split('\t')[6]
                        GFF[gene]['name'] = Name
                        GFF[gene]['note'] = note
                        GFF[gene]['exon'] = {}
                        GFF[gene]['exon_number'] = 0
                        #print Name
                    elif line.split('\t')[2] == 'CDS' :
                        gene = re.sub(r".?Parent\=(.+)\_mRNA?", r"\1", feature[0])
                        if GFF.has_key(gene) :
                            GFF[gene]['exon_number'] += 1
                            exon_number = GFF[gene]['exon_number']
                            GFF[gene]['exon'][exon_number] = {}
                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                            GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                            GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])

                    ## if there is a five prim UTR intron, we change start of gene
                    elif   line.split('\t')[2] == 'five_prime_UTR_intron' :
                        if GFF[gene]['strand'] == "+" :
                            GFF[gene]['start'] = GFF[gene]['exon'][1]['start']
                        else :
                            GFF[gene]['stop'] = GFF[gene]['exon'][exon_number]['stop']
        return GFF
    except Exception,e:
        stop_err( 'Error during gff storage : ' + str( e ) )


#chrI    SGD     gene    87286   87752   .       +       .       ID=YAL030W;Name=YAL030W;gene=SNC1;Alias=SNC1;Ontology_term=GO:0005484,GO:0005768,GO:0005802,GO:0005886,GO:0005935,GO:0006887,GO:0006893,GO:000689
#7,GO:0006906,GO:0030658,GO:0031201;Note=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29%3B%20involved%20in%20the%20fusion%20between%20Golgi-derived%20secretory%20vesicles%20with%20the%20plasma%20membra
#ne%3B%20proposed%20to%20be%20involved%20in%20endocytosis%3B%20member%20of%20the%20synaptobrevin%2FVAMP%20family%20of%20R-type%20v-SNARE%20proteins%3B%20SNC1%20has%20a%20paralog%2C%20SNC2%2C%20that%20arose%20fr
#om%20the%20whole%20genome%20duplication;display=Vesicle%20membrane%20receptor%20protein%20%28v-SNARE%29;dbxref=SGD:S000000028;orf_classification=Verified
#chrI    SGD     CDS     87286   87387   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
#chrI    SGD     CDS     87501   87752   .       +       0       Parent=YAL030W_mRNA;Name=YAL030W_CDS;orf_classification=Verified
#chrI    SGD     intron  87388   87500   .       +       .       Parent=YAL030W_mRNA;Name=YAL030W_intron;orf_classification=Verified

def store_gtf(gff):
    '''
        parse and store gtf file in a dictionnary
    '''
    try:
        GFF = {}
        with open(gff, 'r') as f_gff :
            GFF['order'] = []
            for line in f_gff:
                ## switch commented lines
                line = line.split("#")[0]
                if line != "" :
                # first line is already gene line :
                    if 'protein_coding' in line :
                        ##get name
                        gene = re.sub(r".+transcript_id \"([\w|-]+)\";.*", r"\1", line).rstrip()
                        Name = re.sub(r".+gene_name \"([\w|\-|\:|\.|\(|\)]+)\";.*", r"\1", line).rstrip()
                        if line.split('\t')[2] == 'CDS' :
                            ##if its first time we get this gene
                            if gene not in GFF.keys() :
                            ## store gene information
                                GFF['order'].append(gene)
                                GFF[gene] = {}
                                GFF[gene]['chrom'] = line.split('\t')[0]
                                GFF[gene]['strand'] = line.split('\t')[6]
                                GFF[gene]['start'] = int(line.split('\t')[3])
                                GFF[gene]['stop'] = int(line.split('\t')[4])
                                GFF[gene]['name'] = Name
                                GFF[gene]['note'] = ""
                                GFF[gene]['exon_number'] = 1
                                GFF[gene]['exon'] = {}
                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
                                ## some exons are non codant
                                exon_number = 1
                                GFF[gene]['exon'][exon_number] = {}
                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                            else :
                                ## we add exon
                                #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
                                exon_number += 1
                                GFF[gene]['exon_number'] = exon_number
                                GFF[gene]['exon'][exon_number] = {}
                                GFF[gene]['exon'][exon_number]['start'] = int(line.split('\t')[3])
                                GFF[gene]['exon'][exon_number]['stop'] = int(line.split('\t')[4])
                        #elif line.split('\t')[2] == 'CDS' :
                            #exon_number = int(re.sub(r".+exon_number \"(\d+)\".+", r"\1",line).rstrip())
                            GFF[gene]['exon'][exon_number]['frame'] = line.split('\t')[7]
                        elif line.split('\t')[2] == 'start_codon' :
                            if GFF[gene]['strand'] == '-' :
                                GFF[gene]['stop'] = int(line.split('\t')[4])
                            else :
                                GFF[gene]['start'] = int(line.split('\t')[3])
                        elif line.split('\t')[2] == 'stop_codon' :
                            if GFF[gene]['strand'] == '-' :
                                GFF[gene]['start'] = int(line.split('\t')[3])
                            else :
                                GFF[gene]['stop'] = int(line.split('\t')[4])

        return __reverse_coordinates__(GFF)
    except Exception,e:
        stop_err( 'Error during gff storage : ' + str( e ) )


##IV      protein_coding  exon    307766  307789  .       -       .       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
## exon_id "YDL083C.1";
##IV      protein_coding  CDS     307766  307789  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
## protein_id "YDL083C";
##IV      protein_coding  start_codon     307787  307789  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "1"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
##RPS16B";
##IV      protein_coding  exon    306926  307333  .       -       .       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
## exon_id "YDL083C.2";
##IV      protein_coding  CDS     306929  307333  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "RPS16B";
## protein_id "YDL083C";
##IV      protein_coding  stop_codon      306926  306928  .       -       0       gene_id "YDL083C"; transcript_id "YDL083C"; exon_number "2"; gene_name "RPS16B"; gene_biotype "protein_coding"; transcript_name "
##RPS16B";
def __reverse_coordinates__(GFF):

    for gene in GFF['order']:
        ## for reverse gene
        if GFF[gene]['strand'] == "-":
            ## if this gene have many exon and the stop of gene is the stop of first (and not last) exon, we reverse exon coordinates
            if GFF[gene]['stop'] == GFF[gene]['exon'][1]['stop'] and GFF[gene]['exon_number'] > 1 :
                tmp = copy(GFF[gene]['exon'])
                exon_number = GFF[gene]['exon_number']
                rev_index = exon_number+1
                for z in range(1,exon_number+1):
                    rev_index -= 1
                    GFF[gene]['exon'][z] = tmp[rev_index]

                ## check start
                if GFF[gene]['start'] != GFF[gene]['exon'][1]['start'] and GFF[gene]['start']:
                    GFF[gene]['exon'][1]['start'] = GFF[gene]['start']

    return GFF


def cleaning_bam(bam):
    '''
        Remove reads unmapped, non uniquely mapped and reads with length lower than 25 and upper than 32, and mapping quality upper than 12
    '''
    try :
        header = subprocess.check_output(['samtools', 'view', '-H', bam], stderr= subprocess.PIPE)
        #header = results.split('\n')

        # check mapper for define multiple tag
        if re.search('bowtie', header):
            multi_tag = "XS:i:"
        elif re.search('bwa', header):
            multi_tag = "XT:A:R"
        else :
            stop_err("No PG tag find in"+bam+". Please use bowtie or bwa for mapping")

        tmp_sam = tempfile.mktemp()
        cmd = "samtools view %s > %s" % (bam, tmp_sam)
        proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
        returncode = proc.wait()


        with open(tempfile.mktemp(),'w') as out :
            out.write(header)
            with open(tmp_sam,'r') as sam :
                for line in sam :
                    if multi_tag not in line and line.split('\t')[1] != '4' and int(line.split('\t')[4]) > 12 :
                        if len(line.split('\t')[9]) < 32 and len(line.split('\t')[9]) > 25 :
                            out.write(line)
        bamfile = tempfile.mktemp()+'.bam'
        cmd = "samtools view -hSb %s > %s" % (out.name,bamfile)
        proc = subprocess.Popen( args=cmd, shell=True, stderr = subprocess.PIPE)
        returncode = proc.wait()

        return bamfile

    except Exception,e:
        stop_err( 'Error during cleaning bam : ' + str( e ) )
author	rlegendre
date	Thu, 22 Jan 2015 14:34:38 +0100
parents	d7739f797a26
children	7c944fd9907e