view scripts/S05_script_extract_match_v20_blastx.py @ 8:471ed956ff13 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit b7a3030ea134b5dfad89b1a869db659d72d1145c
author abims-sbr
date Wed, 28 Feb 2018 10:37:14 -0500
parents 6709645eff5d
children
line wrap: on
line source

#!/usr/bin/env python
## AUTHOR: Eric Fontanillas
## LAST VERSION: 14/08/14 by Julie BAFFARD

### TBLASTX formatting

### MATCH = Only the first match keeped
MATCH = 0    # Only 1rst match Wanted
#MATCH = 1    # All match wanted

### SUBMATCH = several part of a same sequence match with the query
SUBMATCH = 0    # SUBMATCH NOT WANTED (only the best hit)
#SUBMATCH =1    # SUBMATCH WANTED


### NAME FORMATTING:
# [A] FORMAT QUERY NAME 1st STEP [IN DEF1]

# [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1]

# [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3]
# [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3]
# [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3]

### SPECIFICITY TBLASTX (/BLASTN) formatting:
## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
## 2/ change line "if keyword in nextline:" in function "split_file"
## 3/ change "Strand" by "Frame" in function "get_information_on_matches"


############################
### DEF4 : get sequences ###
############################
### [+ get informations from the function 2.2.]
def get_sequences(query, list2, SUBMATCHEU,WORK_DIR):
    list_Pairwise = []

    F7 = open("%s/blastRun3.tmp" %WORK_DIR, 'w')
    F7.write(bash1[query])    # bash1[query]  ==> blast output for each query
    F7.close()
    F8 = open("%s/blastRun3.tmp" %WORK_DIR, 'r')
  
    text1 = F8.readlines()
    
    miniList = []
    for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function

        l = -1
        for n in text1:
            l = l+1
            if name in n:
                i = l
                miniList.append(i)   # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])

    miniList.reverse()

    if miniList != []:
        length = len(miniList)

        ii = 0
        Listing1 = []
        while ii < length:
            iii = miniList[ii]
            entry = text1[iii:]
            text1 = text1[:iii]
            Listing1.append(entry)   # each "entry" = list of thing beginning by ">"
            ii = ii+1                # Listing1 is a table of table!!
        
        Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
        Listing1.reverse()

        Listing2 = Listing1[1:]   # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
        SEK = len(Listing2)        
        NB_SEK = 0
        
        for e1 in Listing2:   # "Listing2" contents all the entries begining with ">"            
            NB_SEK = NB_SEK + 1
            list51 = []

            l = -1
            for line in e1:
                l = l+1
                if "Score =" in line:
                    index = l
                    list51.append(l)     # index of the lines with score

            list51.reverse()            
            Listing3 = []
            
            for i5 in list51:
                e2 = e1[i5:]
                Listing3.append(e2)
                e1 = e1[:i5]

            ######################################
            ### [C] FORMAT MATCH NAME 2nd STEP ###
            
            BigFastaName = e1      ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
            
            SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME

            SmallFastaName = SmallFastaName[1:-1]  ### remove ">" and "\n"

            """
            3 lines below : only difference with S08
            """
            if SmallFastaName[-1] == " ":
                SmallFastaName = SmallFastaName[:-1]           
            PutInFastaName1 = SmallFastaName

            ### [C] END FORMAT MATCH NAME 2nd STEP ###
            ##########################################

            SUBSEK = len(Listing3)   
            NB_SUBSEK = 0
            list_inBatch = []

            ### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
            if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
                Listing4 = []
                Listing4.append(Listing3[-1])   # Remove this line if submatch wanted!!!
            elif SUBMATCHEU == 1:
                Listing4 = Listing3
           
            for l in Listing4:   ## "listing3" contents               
                NB_SUBSEK = NB_SUBSEK+1
     
                ll1 = string.replace(l[0], " ", "")
                ll2 = string.replace(l[1], " ", "")
                ll3 = string.replace(l[2], " ", "")
                PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information

                seq_query = ""
                pos_query = []
                seq_match = ""
                pos_match = []

                for line in l:
                    if "Query:" in line:
                        line = string.replace(line, "    ", " ")  # remove multiple spaces in line
                        line = string.replace(line, "   ", " ")
                        line = string.replace(line, "  ", " ")
                        
                        lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end

                        pos1 = lll1[1]
                        pos1 = string.atoi(pos1)
                        pos_query.append(pos1)

                        pos2 = lll1[3][:-1]
                        pos2 = string.atoi(pos2)
                        pos_query.append(pos2)
                        
                        seq = lll1[2]
                        seq_query = seq_query + seq                       

                    if "Sbjct:" in line:
                        line = string.replace(line, "    ", " ")  # remove multiple spaces in line
                        line = string.replace(line, "   ", " ")
                        line = string.replace(line, "  ", " ")
                        
                        lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end

                        pos1 = lll2[1]
                        pos1 = string.atoi(pos1)
                        pos_match.append(pos1)

                        pos2 = lll2[3][:-1]
                        pos2 = string.atoi(pos2)
                        pos_match.append(pos2)
                        
                        seq = lll2[2]
                        seq_match = seq_match + seq

                ## Get the query and matched sequences and the corresponding positions
                pos_query.sort() # rank small to big
                pos_query_start = pos_query[0] # get the smaller
                pos_query_end = pos_query[-1] # get the bigger
                PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)

                ######################################
                ### [D] FORMAT QUERY NAME 2nd STEP ###

                FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)

                ### [D] END FORMAT QUERY NAME 2nd STEP ###
                ##########################################

                pos_match.sort()
                pos_match_start = pos_match[0]
                pos_match_end = pos_match[-1]
                PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)

                ######################################
                ### [E] FORMAT MATCH NAME 3rd STEP ###
               
                FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)

                ### [E] END FORMAT MATCH NAME 3rd STEP ###
                ##########################################

                Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match]  # list with 4 members
                list_Pairwise.append(Pairwise)

                ### Get informations about matches
                list_info = get_information_on_matches(l)    ### DEF3 ###
                                    
    F8.close()
    return(list_Pairwise, list_info)
#########################################


###################
### RUN RUN RUN ###
###################
import string, os, time, re, sys
from functions import split_file, detect_Matches, get_information_on_matches

## 1 ## INPUT/OUTPUT
SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db

"""
04 and 06 for S05, 11 and 13 for S08
"""
path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) 
file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")

## 2 ## RUN
## create Bash1 ##
bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###

## detect and save match ##
list_hits =[]
list_no_hits = []
j= 0
k = 0
lene = len(bash1.keys())
for query in bash1.keys():
    j  = j+1
    
    ## 2.1. detect matches ##
    list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1)    ### DEF2 ###
    
    if hit == 1:   # match(es)
        list_hits.append(query)
    if hit == 0: # no match for that sequence
        list_no_hits.append(query)
    
    ## 2.2. get sequences ##
    if hit ==1:
        list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)#

        # divergencve
        divergence = list_info[6]
        # gap number
        gap_number = list_info[7]
        # real divergence (divergence without accounting INDELs)
        real_divergence = list_info[8]
        # length matched
        length_matched = list_info[10]

        ### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES
        for pairwise in list_pairwiseMatch:
            k = k+1

            query_name = pairwise[0]
            query_seq = pairwise[1]
            match_name = pairwise[2]
            match_seq = pairwise[3]

            len_query_seq = len(query_seq)

            """
            4 lines below : only in S05 
            """
            Lis1 = string.split(query_name, "||")
            short_query_name = Lis1[0]
            Lis2 = string.split(match_name, "||")
            short_match_name = Lis2[0]

            # If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:
            
            file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
            file_out.write("\n")
            file_out.write("%s" %query_seq)
            file_out.write("\n")

            file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched))
            file_out.write("\n")
            file_out.write("%s" %match_seq)
            file_out.write("\n")
    
file_out.close()