Mercurial > repos > abims-sbr > filter_assemblies
changeset 8:10f70fea57b3 draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit b7a3030ea134b5dfad89b1a869db659d72d1145c
| author | abims-sbr |
|---|---|
| date | Wed, 28 Feb 2018 10:36:51 -0500 |
| parents | 948864b6ab4b |
| children | c0f695436d51 |
| files | scripts/S04_find_orf.py scripts/S05_filter.py |
| diffstat | 2 files changed, 59 insertions(+), 80 deletions(-) [+] |
line wrap: on
line diff
--- a/scripts/S04_find_orf.py Wed Feb 28 06:03:31 2018 -0500 +++ b/scripts/S04_find_orf.py Wed Feb 28 10:36:51 2018 -0500 @@ -3,70 +3,62 @@ #python find_ORF.py file output def find_orf(entry): - orf={} - orf_length={} - stop=['TAA','TAG','TGA'] - for i in range(0,3): - pos=i - orf[i]=[0] - while pos<len(entry): - if entry[pos:pos+3] in stop: - orf[i].append(pos-1) - orf[i].append(pos+3) - pos+=3 - orf[i].append(len(entry)-1) - orf_length[i]=[] - for u in range(1,len(orf[i])): - orf_length[i].append(orf[i][u]-orf[i][u-1]+1) - orf[i]=[orf[i][orf_length[i].index(max(orf_length[i]))],orf[i][orf_length[i].index(max(orf_length[i]))+1]] - orf_max={0:max(orf_length[0]),1:max(orf_length[1]),2:max(orf_length[2])} - orf=orf[max(list(orf_max.keys()), key=(lambda k: orf_max[k]))] - if orf[0]==0: - orf[0]=orf[0]+max(list(orf_max.keys()), key=(lambda k: orf_max[k])) - return orf + orf={} + orf_length={} + stop=['TAA','TAG','TGA'] + for i in range(0,3): + pos=i + orf[i]=[0] + while pos<len(entry): + if entry[pos:pos+3] in stop: + orf[i].append(pos-1) + orf[i].append(pos+3) + pos+=3 + orf[i].append(len(entry)-1) + orf_length[i]=[] + for u in range(1,len(orf[i])): + orf_length[i].append(orf[i][u]-orf[i][u-1]+1) + orf[i]=[orf[i][orf_length[i].index(max(orf_length[i]))],orf[i][orf_length[i].index(max(orf_length[i]))+1]] + orf_max={0:max(orf_length[0]),1:max(orf_length[1]),2:max(orf_length[2])} + orf=orf[max(list(orf_max.keys()), key=(lambda k: orf_max[k]))] + if orf[0]==0: + orf[0]=orf[0]+max(list(orf_max.keys()), key=(lambda k: orf_max[k])) + return orf def reverse_seq(entry): - nt={'A':'T','T':'A','G':'C','C':'G', 'N':'N'} - seqlist=[] - for i in range(len(entry)-1,-1,-1): - seqlist.append(nt[entry[i]]) - seq=''.join(seqlist) - return seq - - + nt={'A':'T','T':'A','G':'C','C':'G', 'N':'N'} + seqlist=[] + for i in range(len(entry)-1,-1,-1): + seqlist.append(nt[entry[i]]) + seq=''.join(seqlist) + return seq # RUN -import string, os, sys, re +import string, os, sys, re, itertools path_IN = sys.argv[1] file_OUT = open(sys.argv[2], "w") -f_in = open(path_IN, "r") inc=1 threshold=0 #minimal length of the ORF -while 1: - line=f_in.readline() - if not line: - break - line=f_in.readline() - name=">"+path_IN[:2]+str(inc)+"_1/1_1.000_" - high_plus=find_orf(line[:-1]) - reverse=reverse_seq(line[:-1]) - high_minus=find_orf(reverse) - if high_plus[1]-high_plus[0]>threshold or high_minus[1]-high_minus[0]>threshold: - inc+=1 - if high_plus[1]-high_plus[0]>high_minus[1]-high_minus[0]: - file_OUT.write("%s" %name) - file_OUT.write(str(high_plus[1]-high_plus[0]+1)+"\n") - file_OUT.write("%s" %line[high_plus[0]:high_plus[1]+1]) - file_OUT.write("\n") - else: - file_OUT.write("%s" %name) - file_OUT.write(str(high_minus[1]-high_minus[0]+1)+"\n") - file_OUT.write("%s" %reverse[high_minus[0]:high_minus[1]+1]) - file_OUT.write("\n") -f_in.close() -file_OUT.close() - +with open (path_IN, "r") as f_in: + for ignored, line in itertools.izip_longest(*[f_in]*2): + name=">"+path_IN[:2]+str(inc)+"_1/1_1.000_" + high_plus=find_orf(line[:-1]) + reverse=reverse_seq(line[:-1]) + high_minus=find_orf(reverse) + if high_plus[1]-high_plus[0]>threshold or high_minus[1]-high_minus[0]>threshold: + inc+=1 + if high_plus[1]-high_plus[0]>high_minus[1]-high_minus[0]: + file_OUT.write("%s" %name) + file_OUT.write(str(high_plus[1]-high_plus[0]+1)+"\n") + file_OUT.write("%s" %line[high_plus[0]:high_plus[1]+1]) + file_OUT.write("\n") + else: + file_OUT.write("%s" %name) + file_OUT.write(str(high_minus[1]-high_minus[0]+1)+"\n") + file_OUT.write("%s" %reverse[high_minus[0]:high_minus[1]+1]) + file_OUT.write("\n") +file_OUT.close()
--- a/scripts/S05_filter.py Wed Feb 28 06:03:31 2018 -0500 +++ b/scripts/S05_filter.py Wed Feb 28 10:36:51 2018 -0500 @@ -2,33 +2,20 @@ #filters the sequences depending on their length after cap3, makes the sequences names compatible with the phylogeny workflow #python filter.py file length_threshold_nucleotides output -import string, os, sys, re +import string, os, sys, re, itertools path_IN = sys.argv[1] -threshold=int(sys.argv[2]) #minimum number of nucleotides for one sequence +threshold = int(sys.argv[2]) #minimum number of nucleotides for one sequence file_OUT = open(sys.argv[3], "w") -f_in = open(path_IN, "r") -inc=1 -while 1: - line=f_in.readline() - if not line: - break - line=f_in.readline() - name=">"+path_IN[:2]+str(inc)+"_1/1_1.000_" - if len(line)-1>threshold-1: - inc+=1 - file_OUT.write("%s" %name) - file_OUT.write(str(len(line)-1)+"\n") - file_OUT.write("%s" %line) -f_in.close() +inc = 1 +with open(path_IN, "r") as f_in: + for ignored, sequence in itertools.izip_longest(*[f_in]*2): + name=">"+path_IN[:2]+str(inc)+"_1/1_1.000_" + if len(sequence)-1>threshold-1: + inc+=1 + file_OUT.write("%s" %name) + file_OUT.write(str(len(sequence)-1)+"\n") + file_OUT.write("%s" %sequence) file_OUT.close() - - -#filtre eventuel sur les petits transcrits - - - - - - +#filtre eventuel sur les petits transcrits \ No newline at end of file
