changeset 1:0037e4e222b1 draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
author lecorguille
date Thu, 13 Apr 2017 09:47:11 -0400
parents b2895c835ea8
children be2128ad0030
files CHANGELOG.md README.md scripts/S01_get_locus_ortholog_part1.py scripts/S02_get_locus_ortholog_part2.py
diffstat 4 files changed, 311 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/CHANGELOG.md	Thu Apr 13 05:46:54 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,7 +0,0 @@
-Changelog
-
-Version 1.0 - 13/04/2017
-
-  - Add functional test with planemo
-  - Planemo test with conda dependency for python
-  - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Thu Apr 13 09:47:11 2017 -0400
@@ -0,0 +1,7 @@
+Changelog
+
+Version 1.0 - 13/04/2017
+
+  - Add functional test with planemo
+  - Planemo test with conda dependency for python
+  - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S01_get_locus_ortholog_part1.py	Thu Apr 13 09:47:11 2017 -0400
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+## AUTHOR: Eric Fontanillas
+## LAST VERSION: 14/08/14 by Julie BAFFARD
+
+#MINIMUM_LENGTH = 100
+
+
+############################
+##### DEF1 : Get Pairs #####
+############################
+def get_pairs(fasta_file_path):
+    F2 = open(fasta_file_path, "r")
+    list_pairwises = []
+    while 1:
+        next2 = F2.readline()
+        if not next2:
+            break
+        if next2[0] == ">":
+            fasta_name_query = next2[:-1]
+            Sn = string.split(fasta_name_query, "||")
+            fasta_name_query = Sn[0]
+            next3 = F2.readline()
+            fasta_seq_query = next3[:-1]
+            next3 = F2.readline()    ## jump one empty line (if any after the sequence)
+            fasta_name_match = next3[:-1]
+            Sn = string.split(fasta_name_match, "||")
+            fasta_name_match = Sn[0]
+
+            next3 = F2.readline()
+            fasta_seq_match = next3[:-1]
+
+            pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match]
+            list_pairwises.append(pairwise)
+
+    F2.close()
+    return(list_pairwises)
+##############################################
+
+
+#######################
+##### RUN RUN RUN #####
+#######################
+import string, os, time, re, sys, pickle, zipfile #modif julie : ajout de zipfile
+
+## 1 ## INPUT/OUTPUT
+#extraction du fichier zip contenant les sorties
+zfile = zipfile.ZipFile(sys.argv[1])
+for name in zfile.namelist() :
+	zfile.extract(name)
+
+PATH = "./"
+L1 = os.listdir(PATH)
+
+## 2 ## RUN
+##Get list locus
+list_LOCUS=[]
+for subfolder in L1:
+    if subfolder.startswith("19_") : 
+    	fileIN = subfolder
+	list_pairwises = get_pairs(fileIN)          ### DEF1 ###
+
+    	jj=0
+    	for pair in list_pairwises:
+            name1 = pair[0]
+            name2 = pair[2]         
+            m=0
+
+            ## If one of the 2 names yet present ==> complete the locus
+            for locus in list_LOCUS:
+            	if name1 in locus or name2 in locus:
+                    locus.append(name1)
+                    locus.append(name2)
+                    m=1
+
+            ## If no names was yet present in locus ==> create the locus
+            if m==0:
+                new_locus = [name1, name2]
+            	list_LOCUS.append(new_locus)
+
+            ## Compteur
+            jj = jj+1
+
+## Remove redondancy in list_LOCUS : Several locus in the list_locus, should be the same (but not in the same order), depending in which order they were created, we should remove them
+
+list_short_LOCUS = []
+for locus in list_LOCUS:
+    index = list_LOCUS.index(locus)
+
+    short_locus = []
+
+    ## Remove redondant sequence name
+    for seq in locus:
+        if seq not in short_locus:
+            short_locus.append(seq)
+            
+    ## Sort the list of sequence name (by alphabetical order)
+    short_locus.sort()
+
+    ## Add the locus to the new list (list_short_LOCUS) if not yet present (avoid redondant locus)
+    if short_locus not in list_short_LOCUS:
+        list_short_LOCUS.append(short_locus)
+    
+list_LOCUS = list_short_LOCUS
+
+## Control list locus length
+ln = len(list_LOCUS)
+print "\nNumber of locus = %d\n" %ln
+
+## Backup list locus with PICKLE
+backup_list_LOCUS = open("02_backup_list_LOCUS", "w")
+pickle.dump(list_LOCUS, backup_list_LOCUS) 
+
+backup_list_LOCUS.close()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/scripts/S02_get_locus_ortholog_part2.py	Thu Apr 13 09:47:11 2017 -0400
@@ -0,0 +1,191 @@
+#!/usr/bin/env python
+## AUTHOR: Eric Fontanillas
+## LAST VERSION: 14/08/14 by Julie BAFFARD
+
+##############################
+##### DEF1 : Dico fasta  #####
+##############################
+def dico(fasta_file_path):
+    F2 = open(fasta_file_path, "r")
+    dicoco = {}
+    while 1:
+        next2 = F2.readline()
+        if not next2:
+            break
+        if next2[0] == ">":
+            fasta_name_query = next2[:-1]
+            Sn = string.split(fasta_name_query, "||")
+            fasta_name_query = Sn[0]
+            next3 = F2.readline()
+            fasta_seq_query = next3[:-1]
+
+            dicoco[fasta_name_query]=fasta_seq_query
+
+    F2.close()
+    return(dicoco)
+######################################################
+
+
+##############################################################
+##### DEF2 : Get sequences from list of names per locus  #####
+##############################################################
+def get_seq_per_locus(list_locus, bash_seq, path_OUT,prefix_name):
+
+    i=0
+    for locus in list_locus:
+        i = i+1
+        OUT = open("%s/locus%d_%s.fasta" %(path_OUT, i, prefix_name), "w")
+        
+        for seq_name in locus:
+            sequence = bash_seq[seq_name]
+            OUT.write("%s\n" %seq_name)
+            OUT.write("%s\n" %sequence)
+
+        OUT.close()
+###################################################################################
+
+
+#######################
+##### RUN RUN RUN #####
+#######################
+import string, os, sys, pickle, zipfile, re
+
+## 1 ## LOAD "PICKLE" LIST OF ALL POTENTIAL LOCUS (orthologuous and paraloguous)
+################################################################################
+file_LOCUS = open("02_backup_list_LOCUS")
+list_LOCUS = pickle.load(file_LOCUS) 
+file_LOCUS.close()
+
+# L2 : list of input file
+L2 = []
+zfile = zipfile.ZipFile(sys.argv[1])
+for name in zfile.namelist() :
+	zfile.extract(name)
+	L2.append(name)
+
+nb=1
+os.mkdir("04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS")
+while nb<len(L2) :
+	nb+=1
+	os.mkdir("04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS/LOCUS_%i_sp" %nb)
+
+
+
+## 2 ## [1rst treatment INTRA LOCUS] SELECT ONLY ORTHOLOGUOUS (list of names of orthologuous sequences) :
+## test if locus is orthologuous (i.e. when a same species appear with only 1 sequence in the locus)
+#########################################################################################################
+LOCUS_without_DUPLI = []
+for locus in list_LOCUS:
+    ## remove redondancy (i.e. same names present several times as it is expected)
+    l = []
+    for name in locus:
+        if name not in l:
+            if name[-2:] == "_S":
+                name = name[:-2]
+            l.append(name)
+
+    ## Now test if only one name per species, if not ==> duplication
+    list_initials = []
+    D=0
+    for name in l:
+        initials = name[1:3]
+
+        if initials not in list_initials:
+            list_initials.append(initials)            
+        else:  # DUPLICATION DETECTED
+            D=1
+
+    if D==0:  # means no duplication detected
+        LOCUS_without_DUPLI.append(l)
+
+print "\nNUMBER OF REMAINING LOCUS AFTER 1RST TREATMENT [INTRA LOCUS] = %d" %len(LOCUS_without_DUPLI) 
+
+
+## 3 ## [2nd treatment INTER LOCUS]In fact there are still some duplication in "LOCUS_without_DUPLI" 
+## ==> no duplication in each loci ... OK, but several loci sharing a same sequence is still possible, 
+## I will exclude this case now
+#########################################################################################################
+list_name_seq = []
+bash_name_seq = {}
+
+# 3.1. Get bash = key = sequence name // value = list of locus
+for locus in LOCUS_without_DUPLI:
+    for sequence in locus:
+        if sequence not in list_name_seq:
+            list_name_seq.append(sequence)
+            list = [locus]
+            bash_name_seq[sequence] = list
+
+        else:
+            bash_name_seq[sequence] = bash_name_seq[sequence] + list
+
+# 3.2. From the previous bash, test sequence name present in more than one loci (i.e. length of the list of loci > 1), record these paralogous loci
+k = bash_name_seq.keys()
+list_locus_paralogues = []
+for sequence in k:
+    list = bash_name_seq[sequence]
+    if len(list) > 1:
+        for locus in list:
+            list_locus_paralogues.append(locus)    ### list of list
+
+# 3.3. Remove the paralogous loci from the "LOCUS_without_DUPLI" list
+for locus in list_locus_paralogues:
+    if locus in LOCUS_without_DUPLI:
+        LOCUS_without_DUPLI.remove(locus)
+print "NUMBER OF REMAINING LOCUS AFTER 2ND TREATMENT [INTER LOCUS] = %d\n\n" %len(LOCUS_without_DUPLI)
+
+nbr=1
+dico_LOCUS_sp = {}
+
+while nbr<len(L2) :
+    nbr+=1
+    dico_LOCUS_sp['%i_sp' %nbr] = []
+    for locus in LOCUS_without_DUPLI :
+        if len(locus) == nbr :
+	    dico_LOCUS_sp['%i_sp' %nbr].append(locus)
+
+total_number_locus = 0
+list_key = []
+
+for key in dico_LOCUS_sp.keys() :
+    list_key.append(key)
+
+list_key.sort()
+list_key.reverse()
+for number in list_key :    
+    value = dico_LOCUS_sp[number]
+    if value != [] :
+        number = number.split("_")
+	number = number[0]
+        print "Number of species in the locus : %s" %number
+        print "\tNumber of locus : %i" %len(value)
+    print ""
+
+
+## 4 ## GET SEQUENCES CORRESPONDING TO THE ORTHOLOGUOUS NAMES (per locus)
+#########################################################################
+BASH = {}
+
+for subfile in L2:
+    dicoco = dico(subfile)     ### DEF1 ###
+    BASH.update(dicoco)
+
+
+sp = 1
+while sp<len(L2) :
+    sp+=1 
+    get_seq_per_locus(dico_LOCUS_sp['%i_sp' %sp], BASH, "04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS/LOCUS_%i_sp" %sp, "sp%i" %sp)      ## DEF2 ##
+    get_seq_per_locus(dico_LOCUS_sp['%i_sp' %sp], BASH, ".", "sp%i" %sp)        ## DEF2 ##     
+
+
+## 5 ## OUTPUT CONVERSION TO ZIP FORMAT
+#######################################
+locus_orthologs_unaligned = "^locus.*$"
+
+f = zipfile.ZipFile("POGs_locus_orthologs_unaligned.zip", "w")
+
+folder = os.listdir("./")
+
+for i in folder :
+    if re.match(locus_orthologs_unaligned, i) :
+    	f.write("./%s" %i)