Mercurial > repos > abims-sbr > pairwise

#!/usr/bin/env python
## AUTHOR: Eric Fontanillas
## LAST VERSION: 14/08/14 by Julie BAFFARD

### TBLASTX formatting

### MATCH = Only the first match keeped
MATCH = 0    # Only 1rst match Wanted
#MATCH = 1    # All match wanted

### SUBMATCH = several part of a same sequence match with the query
SUBMATCH = 0    # SUBMATCH NOT WANTED (ONLY 1rst HIT)
#SUBMATCH =1    # SUBMATCH WANTED


### NAME FORMATTING:
# [A] FORMAT QUERY NAME 1st STEP [IN DEF1]

# [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1]

# [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3]
# [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3]
# [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3]

### SPECIFICITY TBLASTX (/BLASTN) formatting:
## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
## 2/ change line "if keyword in nextline:" in function "split_file"
## 3/ change "Strand" by "Frame" in function "get_information_on_matches"


############################
### DEF4 : get sequences ###
############################
### [+ get informations from the DEF3.]
def get_sequences(query, list2, SUBMATCHEU, WORK_DIR):
    list_Pairwise = []

    F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w')
    F7.write(bash1[query])    # bash1[query]  ==> blast output for each query
    F7.close()
    F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r')

    text1 = F8.readlines()

    miniList = []
    for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function

        l = -1
        for n in text1:
            l = l+1
            if name in n:
                i = l
                miniList.append(i)   # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])

    miniList.reverse()

    if miniList != []:
        length = len(miniList)

        ii = 0
        Listing1 = []
        while ii < length:
            iii = miniList[ii]
            entry = text1[iii:]
            text1 = text1[:iii]
            Listing1.append(entry)   # each "entry" = list of thing beginning by ">"
            ii = ii+1                # Listing1 is a table of table!!

        Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
        Listing1.reverse()

        Listing2 = Listing1[1:]   # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
        SEK = len(Listing2)
        NB_SEK = 0

        for e1 in Listing2:   # "Listing2" contents all the entries begining with ">"
            NB_SEK = NB_SEK + 1
            list51 = []

            l = -1
            for line in e1:
                l = l+1
                if "Score =" in line:
                    index = l
                    list51.append(l)     # index of the lines with score

            list51.reverse()
            Listing3 = []

            for i5 in list51:
                e2 = e1[i5:]
                Listing3.append(e2)
                e1 = e1[:i5]


            ######################################
            ### [C] FORMAT MATCH NAME 2nd STEP ###

            BigFastaName = e1      ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines

            SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME

            SmallFastaName = SmallFastaName[1:-2]  ### remove ">" and "\n"

            """
            3 lines below : only difference with S05
            """
            S1 = string.split(SmallFastaName, "||")
            S2 = string.split(S1[0], " ")
            PutInFastaName1 = S2[0]

            ### [C] END FORMAT MATCH NAME 2nd STEP ###
            ##########################################

            SUBSEK = len(Listing3)
            NB_SUBSEK = 0
            list_inBatch = []

            ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
            if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
                Listing4 = []
                Listing4.append(Listing3[-1])   # Remove this line if submatch wanted!!!
            elif SUBMATCHEU == 1:
                Listing4 = Listing3

            for l in Listing4:   ## "listing3" contents
                NB_SUBSEK = NB_SUBSEK+1

                ll1 = string.replace(l[0], " ", "")
                ll2 = string.replace(l[1], " ", "")
                ll3 = string.replace(l[2], " ", "")
                PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information

                seq_query = ""
                pos_query = []
                seq_match = ""
                pos_match = []

                for line in l:
                    if "Query:" in line:
                        line = string.replace(line, "    ", " ")  # remove multiple spaces in line
                        line = string.replace(line, "   ", " ")
                        line = string.replace(line, "  ", " ")

                        lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end

                        pos1 = lll1[1]
                        pos1 = string.atoi(pos1)
                        pos_query.append(pos1)

                        pos2 = lll1[3][:-1]
                        pos2 = string.atoi(pos2)
                        pos_query.append(pos2)

                        seq = lll1[2]
                        seq_query = seq_query + seq

                    if "Sbjct:" in line:
                        line = string.replace(line, "    ", " ")  # remove multiple spaces in line
                        line = string.replace(line, "   ", " ")
                        line = string.replace(line, "  ", " ")

                        lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end

                        pos1 = lll2[1]
                        pos1 = string.atoi(pos1)
                        pos_match.append(pos1)

                        pos2 = lll2[3][:-1]
                        pos2 = string.atoi(pos2)
                        pos_match.append(pos2)

                        seq = lll2[2]
                        seq_match = seq_match + seq

                ## Get the query and matched sequences and the corresponding positions
                pos_query.sort() # rank small to big
                pos_query_start = pos_query[0] # get the smaller
                pos_query_end = pos_query[-1] # get the bigger
                PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)

                ######################################
                ### [D] FORMAT QUERY NAME 2nd STEP ###

                FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)

                ### [D] END FORMAT QUERY NAME 2nd STEP ###
                ##########################################

                pos_match.sort()
                pos_match_start = pos_match[0]
                pos_match_end = pos_match[-1]
                PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)

                ######################################
                ### [E] FORMAT MATCH NAME 3rd STEP ###

                FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)

                ### [E] END FORMAT MATCH NAME 3rd STEP ###
                ##########################################

                Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match]  # list with 4 members
                list_Pairwise.append(Pairwise)

                ### Get informations about matches
                list_info = get_information_on_matches(l)    ### DEF3 ###

    F8.close()
    return(list_Pairwise, list_info)
#########################################


###################
### RUN RUN RUN ###
###################
import string, os, time, re, sys
from functions import split_file, detect_Matches, get_information_on_matches

## 1 ## INPUT/OUTPUT
SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db

"""
04 and 06 for S05, 11 and 13 for S08
"""
path_in = "%s/11_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE)
file_out = open("%s/13_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")

## 2 ## RUN
## create Bash1 ##
bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###

## detect and save match ##
list_hits =[]
list_no_hits = []
j= 0
k = 0
lene = len(bash1.keys())
for query in bash1.keys():
    j  = j+1

    ## 2.1. detect matches ##
    list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1)    ### DEF2 ###

    if hit == 1:   # match(es)
        list_hits.append(query)
    if hit == 0: # no match for that sequence
        list_no_hits.append(query)

    ## 2.2. get sequences ##
    if hit ==1:
        list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)

        # divergencve
        divergence = list_info[6]
        # gap number
        gap_number = list_info[7]
        # real divergence (divergence without accounting INDELs)
        real_divergence = list_info[8]
        # length matched
        length_matched = list_info[10]

        ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES
        for pairwise in list_pairwiseMatch:
            k = k+1

            query_name = pairwise[0]
            query_seq = pairwise[1]
            match_name = pairwise[2]
            match_seq = pairwise[3]

            len_query_seq = len(query_seq)

            # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:

            file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
            file_out.write("\n")
            file_out.write("%s" %query_seq)
            file_out.write("\n")

            file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched))
            file_out.write("\n")
            file_out.write("%s" %match_seq)
            file_out.write("\n")


file_out.close()
author	abims-sbr
date	Wed, 28 Feb 2018 10:37:14 -0500
parents	6709645eff5d
children