Mercurial > repos > abims-sbr > filter_assemblies
changeset 1:2bd29709310f draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit d42de9d17f284473f001574107d8c59ed36be0d4
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:36:16 -0400 |
| parents | 13a9ae9ef940 |
| children | 1daa43b4729c |
| files | CHANGELOG.md README.md scripts/S01_script_to_choose.py scripts/S02a_remove_redondancy_from_velvet_oases.py scripts/S02b_format_fasta_name_trinity.py scripts/S03_choose_one_variants_per_locus_trinity.py scripts/S04_find_orf.py scripts/S05_filter.py |
| diffstat | 8 files changed, 560 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Thu Apr 13 05:45:50 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -Changelog - -Version 1.0 - 13/04/2017 - - - Add funtional test with planemo - - Planemo test with conda dependencies for cap3, fastaformatter and python - - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Apr 13 09:36:16 2017 -0400 @@ -0,0 +1,7 @@ +Changelog + +Version 1.0 - 13/04/2017 + + - Add functional test with planemo + - Planemo test with conda dependencies for cap3, fastaformatter and python + - Scripts renamed + symlinks to the directory 'scripts'
#!/usr/bin/env python
# coding: utf8
"""Driver for the filter_assemblies pipeline.

For every fasta file contained in an input zip archive, run a chain of
formatting / de-redundancy / ORF-finding / cap3 / length-filtering steps,
then bundle the filtered files into 'sequences_filtered.zip'.

Command line:
    S01_script_to_choose.py <assembler> <input_zip> <length_threshold> <cap3_p> <cap3_o>

<assembler> is "velvet" (velvet/oases output); any other value is treated
as trinity output.
"""
import os
import sys
import zipfile

# Directory holding the companion S02..S05 scripts (same dir as this one).
SCRIPT_PATH = os.path.dirname(sys.argv[0])


def _write_archive(filenames):
    """Zip all filtered fasta files into the pipeline's final output."""
    # The archive handle is now closed explicitly (it was previously leaked).
    archive = zipfile.ZipFile("sequences_filtered.zip", "w")
    try:
        for filename in filenames:
            archive.write("./%s" % filename)
    finally:
        archive.close()


def _cap3_and_filter(orf_fasta, singlets_fasta, final_fasta,
                     length_threshold, cap3_p, cap3_o):
    """Shared pipeline tail: cap3 assembly, singlet reformat, length filter."""
    # Assemble with cap3 (TODO inherited from the original: the contigs
    # should be merged with the singlets instead of being discarded).
    os.system("cap3 '%s' -p '%s' -o '%s'" % (orf_fasta, cap3_p, cap3_o))
    # Put every singlet sequence back on a single line.
    os.system("zcat -f < '%s.cap.singlets' | fasta_formatter -w 0 -o '%s'"
              % (orf_fasta, singlets_fasta))
    # Drop sequences shorter than the threshold and rename headers.
    os.system("python '%s/S05_filter.py' '%s' '%s' '%s'"
              % (SCRIPT_PATH, singlets_fasta, length_threshold, final_fasta))


def process_velvet(zip_in, length_threshold, cap3_p, cap3_o):
    """Velvet/oases branch: one filtered fasta per member of the input zip."""
    filtered_files = []
    zfile = zipfile.ZipFile(zip_in)
    for name in zfile.namelist():
        zfile.extract(name)
        suffix = name[:2]  # two-letter species code read from the file name
        oneline = "01_%s" % name
        unredundant = "02_%s" % name
        renamed = "%s%s" % (suffix, name)
        orf = "03_%s" % name
        singlets = suffix
        # NOTE: the final file deliberately reuses the 'renamed' filename,
        # exactly as the original pipeline did; 'renamed' is fully consumed
        # by the ORF step before this file is written.
        final = "%s%s" % (suffix, name)
        # Put every fasta sequence on a single line.
        os.system("zcat -f < '%s' | fasta_formatter -w 0 -o '%s'" % (name, oneline))
        # Keep one transcript per locus (best oases confidence).
        os.system("python '%s/S02a_remove_redondancy_from_velvet_oases.py' '%s' '%s'"
                  % (SCRIPT_PATH, oneline, unredundant))
        # Rewrite oases headers: 'Locus_' -> species suffix, drop keyword parts.
        os.system("sed -e 's/Locus_/%s/g' -e 's/_Confidence_/_/g' "
                  "-e 's/_Transcript_/_/g' -e 's/_Length_/_/g' %s > %s"
                  % (suffix, unredundant, renamed))
        # Keep only the longest ORF of each sequence.
        os.system("python '%s/S04_find_orf.py' '%s' '%s'" % (SCRIPT_PATH, renamed, orf))
        _cap3_and_filter(orf, singlets, final, length_threshold, cap3_p, cap3_o)
        filtered_files.append(final)
    _write_archive(filtered_files)


def process_trinity(zip_in, length_threshold, cap3_p, cap3_o):
    """Trinity branch: same tail as velvet, different header-formatting steps."""
    filtered_files = []
    zfile = zipfile.ZipFile(zip_in)
    for name in zfile.namelist():
        zfile.extract(name)
        suffix = name[:2]  # two-letter species code read from the file name
        oneline = "01%s" % name
        nospace = "02%s" % name
        renamed = "03%s" % name
        unredundant = "04%s" % name
        orf = "05%s" % name
        singlets = suffix
        final = "%s%s" % (suffix, name)
        # Put every fasta sequence on a single line.
        os.system("zcat -f < '%s' | fasta_formatter -w 0 -o '%s'" % (name, oneline))
        # Replace white space by "_" in the headers.
        os.system("sed -e 's/ /_/g' %s > %s " % (oneline, nospace))
        # Format the sequence names with the species suffix.
        os.system("python '%s/S02b_format_fasta_name_trinity.py' '%s' '%s' '%s'"
                  % (SCRIPT_PATH, nospace, renamed, suffix))
        # Keep a single variant per locus (longest effective sequence).
        os.system("python '%s/S03_choose_one_variants_per_locus_trinity.py' '%s' '%s'"
                  % (SCRIPT_PATH, renamed, unredundant))
        # Keep only the longest ORF of each sequence.
        os.system("python '%s/S04_find_orf.py' '%s' '%s'" % (SCRIPT_PATH, unredundant, orf))
        _cap3_and_filter(orf, singlets, final, length_threshold, cap3_p, cap3_o)
        filtered_files.append(final)
    _write_archive(filtered_files)


def main():
    """Dispatch on the assembler named in argv[1]."""
    if sys.argv[1] == "velvet":
        process_velvet(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    else:
        process_trinity(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Remove redundant transcripts (transcripts from the same locus) from an
Oases/velvet fasta, keeping one transcript per locus.

Selection criteria (recursive):
  1. [CRITERIA 1] keep the sequence with the BEST oases confidence value
     taken from the fasta header (3rd '_' field from the end);
  2. [CRITERIA 2] on equal confidence, keep the longest sequence once any
     'N' has been removed (effective_length = length - number of N).

Caveat (inherited): transcripts grouped under the same locus may in fact be
non-overlapping, i.e. not truly redundant.

AUTHOR: Eric Fontanillas — LAST VERSION: 09.06.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import sys


def dico_filtering_redundancy(path_in):
    """Return {fasta_name: sequence} with one transcript kept per locus.

    A "locus" is the concatenation of the first two '_' fields of the
    fasta header (e.g. 'Locus_1_...' -> 'Locus1').
    """
    bash = {}              # locus -> list of [name, seq, confidence, effective_length]
    bash_unredundant = {}  # final selection: fasta_name -> sequence
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")

    ## 1 ## Extract each transcript; group transcripts sharing a short name.
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            # Join all sequence lines but the trailing empty split fragment
            # (same slicing as the original code).
            fasta_seq = "".join(lines[1:-1])
            fields = fasta_name.split("_")
            short_fasta_name = fields[0] + fields[1]
            confidence = fields[-3]  # oases confidence, used by [CRITERIA 1]
            # [CRITERIA 2]: length once any 'N' has been removed.
            effective_length = len(fasta_seq) - fasta_seq.count("N")
            bash.setdefault(short_fasta_name, []).append(
                [fasta_name, fasta_seq, confidence, effective_length])

    for key in bash:
        entries = bash[key]
        ## 2 ## Single transcript for this locus: record it directly.
        if len(entries) == 1:
            bash_unredundant[entries[0][0]] = entries[0][1]
        ## 3 ## Several transcripts: apply [CRITERIA 1] then [CRITERIA 2].
        else:
            best_by_confidence = {}  # confidence (float) -> best entry
            for entry in entries:
                confidence = float(entry[2])
                if confidence not in best_by_confidence:
                    best_by_confidence[confidence] = entry
                elif entry[3] > best_by_confidence[confidence][3]:
                    # Same confidence: keep the longest effective sequence.
                    best_by_confidence[confidence] = entry
            best = best_by_confidence[max(best_by_confidence)]
            bash_unredundant[best[0]] = best[1]

    return bash_unredundant


def main():
    """CLI entry point: S02a ... <input_fasta> <output_fasta>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    dico = dico_filtering_redundancy(path_in)

    # Sort the output records by locus number (2nd '_' field, e.g. ApXX).
    by_number = {}
    for name in dico:
        by_number[int(name.split("_")[1])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Format fasta names in a TRINITY output file.

Each header like 'comp12_c0_seq1_...' is rewritten to
'<SUFFIX>12_c0_seq1' where <SUFFIX> is a two-letter species code.

AUTHOR: Eric Fontanillas — LAST VERSION: 06.12.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import re
import sys


def dico_format_fasta_name(path_in, SUFFIX):
    """Return {formatted_name: first_sequence_line} for every record.

    Only the first sequence line of each record is kept, as in the
    original implementation (inputs are single-line fasta at this stage).
    """
    bash = {}
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            fasta_seq = lines[1]
            fields = fasta_name.split("_")
            # 'comp12' -> component number '12'; prefix it with the suffix.
            match = re.search(r'(\D+)(\d+)', fields[0])
            short_fasta_name = SUFFIX + match.group(2) + "_" + fields[1] + "_" + fields[2]
            bash[short_fasta_name] = fasta_seq
    return bash


def main():
    """CLI entry point: S02b ... <input_fasta> <output_fasta> <suffix>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    suffix = sys.argv[3]

    dico = dico_format_fasta_name(path_in, suffix)
    print(len(dico.keys()))

    # Sort the fasta names by their component number XX (from '<suffix>XX').
    by_number = {}
    for name in dico:
        # Assumes a 2-character suffix, as produced by S01 — TODO confirm.
        by_number[int(name.split("_")[0][2:])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Remove redundant transcripts (transcripts from the same locus) from a
TRINITY fasta, keeping one variant per locus.

Single selection criterion:
  [CRITERIA 1] keep the longest sequence once any 'N' has been removed
  (effective_length = length - number of N).

AUTHOR: Eric Fontanillas — LAST VERSION: 06.12.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import sys


def dico_filtering_redundancy(path_in):
    """Return {fasta_name: sequence} with one variant kept per locus.

    A "locus" is the concatenation of the first two '_' fields of the
    fasta header (e.g. 'Ap1_c0_seq1' -> 'Ap1c0').
    """
    bash = {}              # locus -> list of [name, seq, effective_length]
    bash_unredundant = {}  # final selection: fasta_name -> sequence
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")

    ## 1 ## Extract each transcript; group variants sharing a short name.
    k = 0
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            fasta_seq = lines[1]  # single-line fasta at this stage
            fields = fasta_name.split("_")
            short_fasta_name = fields[0] + fields[1]
            # [CRITERIA 1]: length once any 'N' has been removed.
            effective_length = len(fasta_seq) - fasta_seq.count("N")
            bash.setdefault(short_fasta_name, []).append(
                [fasta_name, fasta_seq, effective_length])
            k = k + 1
            if k % 1000 == 0:
                print(k)  # progress indicator, as in the original

    for key in bash:
        entries = bash[key]
        ## 2 ## Single variant for this locus: record it directly.
        if len(entries) == 1:
            bash_unredundant[entries[0][0]] = entries[0][1]
        ## 3 ## Several variants: keep the longest effective sequence.
        else:
            by_length = {}
            for entry in entries:
                # On equal effective lengths the LAST variant seen wins,
                # exactly as the original dict-overwrite behaved.
                by_length[entry[2]] = entry
            best = by_length[max(by_length)]
            bash_unredundant[best[0]] = best[1]

    return bash_unredundant


def main():
    """CLI entry point: S03 ... <input_fasta> <output_fasta>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    dico = dico_filtering_redundancy(path_in)

    # Sort the fasta names by their number XX (from '<suffix>XX_...').
    by_number = {}
    for name in dico:
        # Assumes a 2-character suffix, as produced by S02b — TODO confirm.
        by_number[int(name.split("_")[0][2:])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Keep the longest ORF found among the 6 possible reading frames.

Usage: S04_find_orf.py <input_fasta> <output_fasta>
Input must be 2-line fasta records (header line, then one sequence line).
"""
import sys


def find_orf(entry):
    """Return [start, end] (inclusive indices) of the longest open stretch.

    Scans the 3 forward reading frames of `entry`, splitting each frame at
    stop codons, and returns the boundaries of the longest stop-free
    stretch over all frames.  Logic is preserved verbatim from the
    original implementation, including its frame-offset adjustment when
    the best stretch starts at position 0.
    """
    orf = {}
    orf_length = {}
    stop = ['TAA', 'TAG', 'TGA']
    for i in range(0, 3):
        pos = i
        orf[i] = [0]
        while pos < len(entry):
            if entry[pos:pos+3] in stop:
                # Record the end of the current stretch and the start of the next.
                orf[i].append(pos-1)
                orf[i].append(pos+3)
            pos += 3
        orf[i].append(len(entry)-1)
        # Lengths of the stretches delimited above; keep the longest one.
        orf_length[i] = []
        for u in range(1, len(orf[i])):
            orf_length[i].append(orf[i][u]-orf[i][u-1]+1)
        orf[i] = [orf[i][orf_length[i].index(max(orf_length[i]))],
                  orf[i][orf_length[i].index(max(orf_length[i]))+1]]
    # Pick the frame holding the overall longest stretch.
    orf_max = {0: max(orf_length[0]), 1: max(orf_length[1]), 2: max(orf_length[2])}
    orf = orf[max(orf_max.keys(), key=(lambda k: orf_max[k]))]
    if orf[0] == 0:
        # Shift a frame-relative start of 0 by the winning frame's offset.
        orf[0] = orf[0] + max(orf_max.keys(), key=(lambda k: orf_max[k]))
    return orf


def reverse_seq(entry):
    """Return the reverse complement of `entry` (alphabet: A, T, G, C, N)."""
    nt = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    seqlist = []
    for i in range(len(entry)-1, -1, -1):
        seqlist.append(nt[entry[i]])
    return ''.join(seqlist)


def main():
    """CLI entry point: read 2-line records, write the best ORF of each."""
    path_in = sys.argv[1]
    threshold = 0  # minimal length of the ORF
    inc = 1
    with open(path_in, "r") as f_in, open(sys.argv[2], "w") as file_out:
        while True:
            line = f_in.readline()
            if not line:
                break
            line = f_in.readline()  # sequence line of the 2-line record
            name = ">" + path_in[:2] + str(inc) + "_1/1_1.000_"
            high_plus = find_orf(line[:-1])
            reverse = reverse_seq(line[:-1])
            high_minus = find_orf(reverse)
            if high_plus[1]-high_plus[0] > threshold or high_minus[1]-high_minus[0] > threshold:
                inc += 1
                if high_plus[1]-high_plus[0] > high_minus[1]-high_minus[0]:
                    file_out.write("%s" % name)
                    file_out.write(str(high_plus[1]-high_plus[0]+1)+"\n")
                    file_out.write("%s" % line[high_plus[0]:high_plus[1]+1])
                    file_out.write("\n")
                else:
                    file_out.write("%s" % name)
                    file_out.write(str(high_minus[1]-high_minus[0]+1)+"\n")
                    file_out.write("%s" % reverse[high_minus[0]:high_minus[1]+1])
                    file_out.write("\n")


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Filter post-cap3 sequences by length and rename headers so they are
compatible with the phylogeny workflow.

Usage: S05_filter.py <input_fasta> <length_threshold_nucleotides> <output_fasta>
Input must be 2-line fasta records (header line, then one sequence line).
"""
import sys


def filter_fasta(path_in, threshold, path_out, prefix):
    """Copy records whose sequence has at least `threshold` nucleotides.

    Kept records are renamed '>PREFIX<n>_1/1_1.000_<length>'; <n> counts
    only the kept sequences.  The length test (len(line)-1 > threshold-1)
    assumes each sequence line ends with a newline, as the original did.
    """
    inc = 1
    with open(path_in, "r") as f_in, open(path_out, "w") as file_out:
        while True:
            line = f_in.readline()
            if not line:
                break
            line = f_in.readline()  # sequence line of the 2-line record
            name = ">" + prefix + str(inc) + "_1/1_1.000_"
            if len(line) - 1 > threshold - 1:
                inc += 1
                file_out.write("%s" % name)
                file_out.write(str(len(line)-1) + "\n")
                file_out.write("%s" % line)


def main():
    """CLI entry point; the header prefix is the input path's first 2 chars."""
    path_in = sys.argv[1]
    threshold = int(sys.argv[2])  # minimum number of nucleotides per sequence
    path_out = sys.argv[3]
    filter_fasta(path_in, threshold, path_out, path_in[:2])


if __name__ == "__main__":
    main()
