diff scripts/S05_script_extract_match_v20_blastx.py @ 4:6709645eff5d draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:53:53 -0500
parents c8af52875b0f
children
line wrap: on
line diff
--- a/scripts/S05_script_extract_match_v20_blastx.py	Wed Sep 27 10:01:55 2017 -0400
+++ b/scripts/S05_script_extract_match_v20_blastx.py	Wed Jan 17 08:53:53 2018 -0500
@@ -28,193 +28,10 @@
 ## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
 
 
-########################################
-### DEF 1. Split each "BLASTN" event ###
-########################################
-def split_file(path_in, keyword):
-
-    file_in = open(path_in, "r")
-    RUN = ''
-    BASH1={}
-    
-    while 1:
-        nextline = file_in.readline()
-
-        ##################################
-        ###  [A] FORMATTING QUERY NAME ###
-        
-        ### Get query name ###
-        if nextline[0:6]=='Query=':    
-            L1 = string.split(nextline, "||")
-            L2 = string.split(L1[0], " ")
-            query = L2[1]
-            if query[-1] == "\n":
-                query = query[:-1]
-            
-        ###  [A] END FORMATTING QUERY NAME ###
-        ######################################
-        
-        
-        ### split the file with keyword ###
-        if keyword in nextline:
-
-            # Two cases here:
-            #1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
-            #2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
-
-            if RUN == '':                          # case #1#
-                RUN = RUN + nextline     # we just added the first line of  the file
-
-            else:                                # case #2# (there was a run before)
-                BASH1[query] = RUN    # add the previous run to the bash
-                RUN = ''                           # re-initialize the "RUN"
-                RUN = RUN + nextline    # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
-
-        else:                          # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN")  (which is not treated here but previously)
-            RUN = RUN + nextline
-            
-
-        if not nextline:                                   # when no more line, we should record the last "RUN" in the bash1
-            BASH1[query] = RUN                       # add the last "RUN"
-            break
-    
-    file_in.close()
-    return(BASH1)
-#########################################################
-
-
-################################################
-### DEF2 : Parse blast output for each query ###
-################################################
-### detect matches (i.e. 'Sequences producing significant alignments:' ###
-
-def detect_Matches(query, MATCH, WORK_DIR):
-    F5 = open("%s/blastRun2.tmp" %WORK_DIR, 'w') 
-    F5.write(bash1[query])
-    F5.close()
-
-    F6 = open("%s/blastRun2.tmp" %WORK_DIR, 'r') 
-    list1 =[]
-    list2 =[]
-
-    while 1:
-        nexteu = F6.readline()
-        if not nexteu : break
-
-        if "***** No hits found ******" in nexteu :
-            hit = 0
-            break
-        
-        if 'Sequences producing significant alignments:' in nexteu:
-            hit = 1          
-            F6.readline() # jump a line
-
-            while 1:
-                nexteu2 = F6.readline()           
-                if nexteu2[0]==">": break
-
-                ######################################
-                ### [B] FORMAT MATCH NAME 1st STEP ###
-               
-                if nexteu2 != '\n':
-                    LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
-                    match = LL1[0]                     #### SOUTH databank // NORTH will have "|" separators
-                    list1.append(match)
-
-                    match2 = ">" + LL1[0]        # more complete name // still specific NORTH database names !!!!!!!
-                    list2.append(match2)         #### SOUTH databank // NORTH will have "|" separators
-
-                    if MATCH == 0:        ## Only read the 1rst line (i.e. the First Match)
-                        break                
-                    else:                 ## Read the other lines (i.e. All the Matches)
-                        continue
-
-                ### [B] END FORMAT MATCH NAME 1st STEP ###
-                ##########################################                    
-            
-            break
-    
-    F6.close()
-    return(list1, list2, hit)    # list1 = short name // list2 = more complete name
-#######################################
-
-
-#########################################
-### DEF3 : Get Information on matches ###
-#########################################
-### Function used in the next function (2.3.)
-
-def get_information_on_matches(list_of_line):   
-    for line in list_of_line:
-
-        ## Score and Expect
-        if "Score" in line:
-            line = line[:-1]  # remove "\n"
-            S_line = string.split(line, " = ") 
-            Expect = S_line[-1]                                           ## ***** Expect
-            S_line2 = string.split(S_line[1], " bits ")
-            Score = string.atof(S_line2[0])
-
-        ## Identities/gaps/percent/divergence/length_matched
-        elif "Identities" in line:
-            line = line[:-1]  # remove "\n"
-            g = 0
-
-            if "Gaps" in line:
-                pre_S_line = string.split(line, ",")
-                identity_line = pre_S_line[0]
-                gaps_line = pre_S_line[1]
-                g = 1
-            else:
-                identity_line = line
-                g = 0
-
-            ## treat identity line
-            S_line = string.split(identity_line, " ")
-
-            identities = S_line[-2]                                  ## ***** identities
-
-            S_line2 = string.split(identities, "/")
-            hits = string.atof(S_line2[0])                           ## ***** hits
-            length_matched = string.atof(S_line2[1])                 ## ***** length_matched
-            abs_nb_differences = length_matched - hits               ## ***** abs_nb_differences
-
-
-            identity_percent =   hits/length_matched * 100                   ## ***** identity_percent
-
-            divergence_percent = abs_nb_differences/length_matched*100                  ## ***** divergence_percent
-
-            ## treat gap line if any
-            if g ==1:    # means there are gaps
-                S_line3 = string.split(gaps_line, " ")
-                gaps_part  = S_line3[-2]
-                S_line4 = string.split(gaps_part, "/")
-                gaps_number = string.atoi(S_line4[0])                                   ## ***** gaps_number
-                
-                real_differences = abs_nb_differences - gaps_number                         ## ***** real_differences
-                real_divergence_percent = (real_differences/length_matched)*100             ## ***** real_divergence_percent
-            else:
-                gaps_number = 0               
-                real_differences = 0
-                real_divergence_percent = divergence_percent
-
-        ## Frame
-        elif "Frame" in line:
-            line = line[:-1] 
-            S_line = string.split(line, " = ")
-            frame = S_line[1]
-
-    list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
-
-    return(list_informations)
-########################################
-
-
 ############################
 ### DEF4 : get sequences ###
 ############################
 ### [+ get informations from the function 2.2.]
-
 def get_sequences(query, list2, SUBMATCHEU,WORK_DIR):
     list_Pairwise = []
 
@@ -235,11 +52,12 @@
                 i = l
                 miniList.append(i)   # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])
 
-    miniList.reverse()    
+    miniList.reverse()
+
     if miniList != []:
         length = len(miniList)
+
         ii = 0
-
         Listing1 = []
         while ii < length:
             iii = miniList[ii]
@@ -279,12 +97,15 @@
             
             BigFastaName = e1      ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
             
-            SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME            
+            SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
+
             SmallFastaName = SmallFastaName[1:-1]  ### remove ">" and "\n"
 
+            """
+            3 lines below : only difference with S08
+            """
             if SmallFastaName[-1] == " ":
-                SmallFastaName = SmallFastaName[:-1]
-           
+                SmallFastaName = SmallFastaName[:-1]           
             PutInFastaName1 = SmallFastaName
 
             ### [C] END FORMAT MATCH NAME 2nd STEP ###
@@ -389,20 +210,24 @@
 #########################################
 
 
-######################
-### 2. RUN RUN RUN ###
-######################
+###################
+### RUN RUN RUN ###
+###################
 import string, os, time, re, sys
+from functions import split_file, detect_Matches, get_information_on_matches
 
 ## 1 ## INPUT/OUTPUT
-SHORT_FILE = sys.argv[1] #short-name-query_short-name-db
+SHORT_FILE = sys.argv[1] ## short-name-query_short-name-db
 
+"""
+04 and 06 for S05, 11 and 13 for S08
+"""
 path_in = "%s/04_outputBlast_%s.txt" %(SHORT_FILE, SHORT_FILE) 
 file_out = open("%s/06_PairwiseMatch_%s.fasta" %(SHORT_FILE, SHORT_FILE),"w")
 
 ## 2 ## RUN
 ## create Bash1 ##
-bash1 = split_file(path_in, "TBLASTX") ## DEF1 ##
+bash1 = split_file(path_in, "TBLASTX") ### DEF1 ###
 
 ## detect and save match ##
 list_hits =[]
@@ -414,7 +239,7 @@
     j  = j+1
     
     ## 2.1. detect matches ##
-    list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE)    ### DEF2 ###
+    list_match, list_match2, hit=detect_Matches(query, MATCH, SHORT_FILE, bash1)    ### DEF2 ###
     
     if hit == 1:   # match(es)
         list_hits.append(query)
@@ -423,7 +248,7 @@
     
     ## 2.2. get sequences ##
     if hit ==1:
-        list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)       ### FUNCTION ###
+        list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, SHORT_FILE)#
 
         # divergencve
         divergence = list_info[6]
@@ -445,6 +270,9 @@
 
             len_query_seq = len(query_seq)
 
+            """
+            4 lines below : only in S05 
+            """
             Lis1 = string.split(query_name, "||")
             short_query_name = Lis1[0]
             Lis2 = string.split(match_name, "||")