Mercurial > repos > abims-sbr > pogs
changeset 1:0037e4e222b1 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit ab76075e541dd7ece1090f6b55ca508ec0fde39d
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:47:11 -0400 |
| parents | b2895c835ea8 |
| children | be2128ad0030 |
| files | CHANGELOG.md README.md scripts/S01_get_locus_ortholog_part1.py scripts/S02_get_locus_ortholog_part2.py |
| diffstat | 4 files changed, 311 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Thu Apr 13 05:46:54 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -Changelog - -Version 1.0 - 13/04/2017 - - - Add functional test with planemo - - Planemo test with conda dependency for python - - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Apr 13 09:47:11 2017 -0400 @@ -0,0 +1,7 @@ +Changelog + +Version 1.0 - 13/04/2017 + + - Add functional test with planemo + - Planemo test with conda dependency for python + - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S01_get_locus_ortholog_part1.py Thu Apr 13 09:47:11 2017 -0400 @@ -0,0 +1,113 @@ +#!/usr/bin/env python +## AUTHOR: Eric Fontanillas +## LAST VERSION: 14/08/14 by Julie BAFFARD + +#MINIMUM_LENGTH = 100 + + +############################ +##### DEF1 : Get Pairs ##### +############################ +def get_pairs(fasta_file_path): + F2 = open(fasta_file_path, "r") + list_pairwises = [] + while 1: + next2 = F2.readline() + if not next2: + break + if next2[0] == ">": + fasta_name_query = next2[:-1] + Sn = string.split(fasta_name_query, "||") + fasta_name_query = Sn[0] + next3 = F2.readline() + fasta_seq_query = next3[:-1] + next3 = F2.readline() ## jump one empty line (if any after the sequence) + fasta_name_match = next3[:-1] + Sn = string.split(fasta_name_match, "||") + fasta_name_match = Sn[0] + + next3 = F2.readline() + fasta_seq_match = next3[:-1] + + pairwise = [fasta_name_query,fasta_seq_query,fasta_name_match,fasta_seq_match] + list_pairwises.append(pairwise) + + F2.close() + return(list_pairwises) +############################################## + + +####################### +##### RUN RUN RUN ##### +####################### +import string, os, time, re, sys, pickle, zipfile #modif julie : ajout de zipfile + +## 1 ## INPUT/OUTPUT +#extraction du fichier zip contenant les sorties +zfile = zipfile.ZipFile(sys.argv[1]) +for name in zfile.namelist() : + zfile.extract(name) + +PATH = "./" +L1 = os.listdir(PATH) + +## 2 ## RUN +##Get list locus +list_LOCUS=[] +for subfolder in L1: + if subfolder.startswith("19_") : + fileIN = subfolder + list_pairwises = get_pairs(fileIN) ### DEF1 ### + + jj=0 + for pair in list_pairwises: + name1 = pair[0] + name2 = pair[2] + m=0 + + ## If one of the 2 names yet present ==> complete the locus + for locus in list_LOCUS: + if name1 in locus or name2 in locus: + locus.append(name1) + locus.append(name2) + m=1 + + ## If no names was yet present in locus ==> create the locus + if m==0: + new_locus = [name1, name2] + list_LOCUS.append(new_locus) + + ## Compteur + jj = jj+1 + +## Remove redondancy in list_LOCUS : Several locus in the list_locus, should be the same (but not in the same order), depending in which order they were created, we should remove them + +list_short_LOCUS = [] +for locus in list_LOCUS: + index = list_LOCUS.index(locus) + + short_locus = [] + + ## Remove redondant sequence name + for seq in locus: + if seq not in short_locus: + short_locus.append(seq) + + ## Sort the list of sequence name (by alphabetical order) + short_locus.sort() + + ## Add the locus to the new list (list_short_LOCUS) if not yet present (avoid redondant locus) + if short_locus not in list_short_LOCUS: + list_short_LOCUS.append(short_locus) + +list_LOCUS = list_short_LOCUS + +## Control list locus length +ln = len(list_LOCUS) +print "\nNumber of locus = %d\n" %ln + +## Backup list locus with PICKLE +backup_list_LOCUS = open("02_backup_list_LOCUS", "w") +pickle.dump(list_LOCUS, backup_list_LOCUS) + +backup_list_LOCUS.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/scripts/S02_get_locus_ortholog_part2.py Thu Apr 13 09:47:11 2017 -0400 @@ -0,0 +1,191 @@ +#!/usr/bin/env python +## AUTHOR: Eric Fontanillas +## LAST VERSION: 14/08/14 by Julie BAFFARD + +############################## +##### DEF1 : Dico fasta ##### +############################## +def dico(fasta_file_path): + F2 = open(fasta_file_path, "r") + dicoco = {} + while 1: + next2 = F2.readline() + if not next2: + break + if next2[0] == ">": + fasta_name_query = next2[:-1] + Sn = string.split(fasta_name_query, "||") + fasta_name_query = Sn[0] + next3 = F2.readline() + fasta_seq_query = next3[:-1] + + dicoco[fasta_name_query]=fasta_seq_query + + F2.close() + return(dicoco) +###################################################### + + +############################################################## +##### DEF2 : Get sequences from list of names per locus ##### +############################################################## +def get_seq_per_locus(list_locus, bash_seq, path_OUT,prefix_name): + + i=0 + for locus in list_locus: + i = i+1 + OUT = open("%s/locus%d_%s.fasta" %(path_OUT, i, prefix_name), "w") + + for seq_name in locus: + sequence = bash_seq[seq_name] + OUT.write("%s\n" %seq_name) + OUT.write("%s\n" %sequence) + + OUT.close() +################################################################################### + + +####################### +##### RUN RUN RUN ##### +####################### +import string, os, sys, pickle, zipfile, re + +## 1 ## LOAD "PICKLE" LIST OF ALL POTENTIAL LOCUS (orthologuous and paraloguous) +################################################################################ +file_LOCUS = open("02_backup_list_LOCUS") +list_LOCUS = pickle.load(file_LOCUS) +file_LOCUS.close() + +# L2 : list of input file +L2 = [] +zfile = zipfile.ZipFile(sys.argv[1]) +for name in zfile.namelist() : + zfile.extract(name) + L2.append(name) + +nb=1 +os.mkdir("04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS") +while nb<len(L2) : + nb+=1 + os.mkdir("04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS/LOCUS_%i_sp" %nb) + + + +## 2 ## [1rst treatment INTRA LOCUS] SELECT ONLY ORTHOLOGUOUS (list of names of orthologuous sequences) : +## test if locus is orthologuous (i.e. when a same species appear with only 1 sequence in the locus) +######################################################################################################### +LOCUS_without_DUPLI = [] +for locus in list_LOCUS: + ## remove redondancy (i.e. same names present several times as it is expected) + l = [] + for name in locus: + if name not in l: + if name[-2:] == "_S": + name = name[:-2] + l.append(name) + + ## Now test if only one name per species, if not ==> duplication + list_initials = [] + D=0 + for name in l: + initials = name[1:3] + + if initials not in list_initials: + list_initials.append(initials) + else: # DUPLICATION DETECTED + D=1 + + if D==0: # means no duplication detected + LOCUS_without_DUPLI.append(l) + +print "\nNUMBER OF REMAINING LOCUS AFTER 1RST TREATMENT [INTRA LOCUS] = %d" %len(LOCUS_without_DUPLI) + + +## 3 ## [2nd treatment INTER LOCUS]In fact there are still some duplication in "LOCUS_without_DUPLI" +## ==> no duplication in each loci ... OK, but several loci sharing a same sequence is still possible, +## I will exclude this case now +######################################################################################################### +list_name_seq = [] +bash_name_seq = {} + +# 3.1. Get bash = key = sequence name // value = list of locus +for locus in LOCUS_without_DUPLI: + for sequence in locus: + if sequence not in list_name_seq: + list_name_seq.append(sequence) + list = [locus] + bash_name_seq[sequence] = list + + else: + bash_name_seq[sequence] = bash_name_seq[sequence] + list + +# 3.2. From the previous bash, test sequence name present in more than one loci (i.e. length of the list of loci > 1), record these paralogous loci +k = bash_name_seq.keys() +list_locus_paralogues = [] +for sequence in k: + list = bash_name_seq[sequence] + if len(list) > 1: + for locus in list: + list_locus_paralogues.append(locus) ### list of list + +# 3.3. Remove the paralogous loci from the "LOCUS_without_DUPLI" list +for locus in list_locus_paralogues: + if locus in LOCUS_without_DUPLI: + LOCUS_without_DUPLI.remove(locus) +print "NUMBER OF REMAINING LOCUS AFTER 2ND TREATMENT [INTER LOCUS] = %d\n\n" %len(LOCUS_without_DUPLI) + +nbr=1 +dico_LOCUS_sp = {} + +while nbr<len(L2) : + nbr+=1 + dico_LOCUS_sp['%i_sp' %nbr] = [] + for locus in LOCUS_without_DUPLI : + if len(locus) == nbr : + dico_LOCUS_sp['%i_sp' %nbr].append(locus) + +total_number_locus = 0 +list_key = [] + +for key in dico_LOCUS_sp.keys() : + list_key.append(key) + +list_key.sort() +list_key.reverse() +for number in list_key : + value = dico_LOCUS_sp[number] + if value != [] : + number = number.split("_") + number = number[0] + print "Number of species in the locus : %s" %number + print "\tNumber of locus : %i" %len(value) + print "" + + +## 4 ## GET SEQUENCES CORRESPONDING TO THE ORTHOLOGUOUS NAMES (per locus) +######################################################################### +BASH = {} + +for subfile in L2: + dicoco = dico(subfile) ### DEF1 ### + BASH.update(dicoco) + + +sp = 1 +while sp<len(L2) : + sp+=1 + get_seq_per_locus(dico_LOCUS_sp['%i_sp' %sp], BASH, "04_LOCUS_ORTHOLOGS_UNALIGNED_perCLASS/LOCUS_%i_sp" %sp, "sp%i" %sp) ## DEF2 ## + get_seq_per_locus(dico_LOCUS_sp['%i_sp' %sp], BASH, ".", "sp%i" %sp) ## DEF2 ## + + +## 5 ## OUTPUT CONVERSION TO ZIP FORMAT +####################################### +locus_orthologs_unaligned = "^locus.*$" + +f = zipfile.ZipFile("POGs_locus_orthologs_unaligned.zip", "w") + +folder = os.listdir("./") + +for i in folder : + if re.match(locus_orthologs_unaligned, i) : + f.write("./%s" %i)
