Mercurial > repos > abims-sbr > filter_assemblies
changeset 1:2bd29709310f draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit d42de9d17f284473f001574107d8c59ed36be0d4
| author | lecorguille |
|---|---|
| date | Thu, 13 Apr 2017 09:36:16 -0400 |
| parents | 13a9ae9ef940 |
| children | 1daa43b4729c |
| files | CHANGELOG.md README.md scripts/S01_script_to_choose.py scripts/S02a_remove_redondancy_from_velvet_oases.py scripts/S02b_format_fasta_name_trinity.py scripts/S03_choose_one_variants_per_locus_trinity.py scripts/S04_find_orf.py scripts/S05_filter.py |
| diffstat | 8 files changed, 560 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/CHANGELOG.md Thu Apr 13 05:45:50 2017 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -Changelog - -Version 1.0 - 13/04/2017 - - - Add funtional test with planemo - - Planemo test with conda dependencies for cap3, fastaformatter and python - - Scripts renamed + symlinks to the directory 'scripts'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Thu Apr 13 09:36:16 2017 -0400 @@ -0,0 +1,7 @@ +Changelog + +Version 1.0 - 13/04/2017 + + - Add functional test with planemo + - Planemo test with conda dependencies for cap3, fastaformatter and python + - Scripts renamed + symlinks to the directory 'scripts'
#!/usr/bin/env python
# coding: utf8
"""Driver for the filter_assemblies pipeline.

For every fasta file contained in an input zip archive, run a chain of
formatting / de-redundancy / ORF-finding / cap3 / length-filtering steps,
then bundle the filtered files into 'sequences_filtered.zip'.

Command line:
    S01_script_to_choose.py <assembler> <input_zip> <length_threshold> <cap3_p> <cap3_o>

<assembler> is "velvet" (velvet/oases output); any other value is treated
as trinity output.
"""
import os
import sys
import zipfile

# Directory holding the companion S02..S05 scripts (same dir as this one).
SCRIPT_PATH = os.path.dirname(sys.argv[0])


def _write_archive(filenames):
    """Zip all filtered fasta files into the pipeline's final output."""
    # The archive handle is now closed explicitly (it was previously leaked).
    archive = zipfile.ZipFile("sequences_filtered.zip", "w")
    try:
        for filename in filenames:
            archive.write("./%s" % filename)
    finally:
        archive.close()


def _cap3_and_filter(orf_fasta, singlets_fasta, final_fasta,
                     length_threshold, cap3_p, cap3_o):
    """Shared pipeline tail: cap3 assembly, singlet reformat, length filter."""
    # Assemble with cap3 (TODO inherited from the original: the contigs
    # should be merged with the singlets instead of being discarded).
    os.system("cap3 '%s' -p '%s' -o '%s'" % (orf_fasta, cap3_p, cap3_o))
    # Put every singlet sequence back on a single line.
    os.system("zcat -f < '%s.cap.singlets' | fasta_formatter -w 0 -o '%s'"
              % (orf_fasta, singlets_fasta))
    # Drop sequences shorter than the threshold and rename headers.
    os.system("python '%s/S05_filter.py' '%s' '%s' '%s'"
              % (SCRIPT_PATH, singlets_fasta, length_threshold, final_fasta))


def process_velvet(zip_in, length_threshold, cap3_p, cap3_o):
    """Velvet/oases branch: one filtered fasta per member of the input zip."""
    filtered_files = []
    zfile = zipfile.ZipFile(zip_in)
    for name in zfile.namelist():
        zfile.extract(name)
        suffix = name[:2]  # two-letter species code read from the file name
        oneline = "01_%s" % name
        unredundant = "02_%s" % name
        renamed = "%s%s" % (suffix, name)
        orf = "03_%s" % name
        singlets = suffix
        # NOTE: the final file deliberately reuses the 'renamed' filename,
        # exactly as the original pipeline did; 'renamed' is fully consumed
        # by the ORF step before this file is written.
        final = "%s%s" % (suffix, name)
        # Put every fasta sequence on a single line.
        os.system("zcat -f < '%s' | fasta_formatter -w 0 -o '%s'" % (name, oneline))
        # Keep one transcript per locus (best oases confidence).
        os.system("python '%s/S02a_remove_redondancy_from_velvet_oases.py' '%s' '%s'"
                  % (SCRIPT_PATH, oneline, unredundant))
        # Rewrite oases headers: 'Locus_' -> species suffix, drop keyword parts.
        os.system("sed -e 's/Locus_/%s/g' -e 's/_Confidence_/_/g' "
                  "-e 's/_Transcript_/_/g' -e 's/_Length_/_/g' %s > %s"
                  % (suffix, unredundant, renamed))
        # Keep only the longest ORF of each sequence.
        os.system("python '%s/S04_find_orf.py' '%s' '%s'" % (SCRIPT_PATH, renamed, orf))
        _cap3_and_filter(orf, singlets, final, length_threshold, cap3_p, cap3_o)
        filtered_files.append(final)
    _write_archive(filtered_files)


def process_trinity(zip_in, length_threshold, cap3_p, cap3_o):
    """Trinity branch: same tail as velvet, different header-formatting steps."""
    filtered_files = []
    zfile = zipfile.ZipFile(zip_in)
    for name in zfile.namelist():
        zfile.extract(name)
        suffix = name[:2]  # two-letter species code read from the file name
        oneline = "01%s" % name
        nospace = "02%s" % name
        renamed = "03%s" % name
        unredundant = "04%s" % name
        orf = "05%s" % name
        singlets = suffix
        final = "%s%s" % (suffix, name)
        # Put every fasta sequence on a single line.
        os.system("zcat -f < '%s' | fasta_formatter -w 0 -o '%s'" % (name, oneline))
        # Replace white space by "_" in the headers.
        os.system("sed -e 's/ /_/g' %s > %s " % (oneline, nospace))
        # Format the sequence names with the species suffix.
        os.system("python '%s/S02b_format_fasta_name_trinity.py' '%s' '%s' '%s'"
                  % (SCRIPT_PATH, nospace, renamed, suffix))
        # Keep a single variant per locus (longest effective sequence).
        os.system("python '%s/S03_choose_one_variants_per_locus_trinity.py' '%s' '%s'"
                  % (SCRIPT_PATH, renamed, unredundant))
        # Keep only the longest ORF of each sequence.
        os.system("python '%s/S04_find_orf.py' '%s' '%s'" % (SCRIPT_PATH, unredundant, orf))
        _cap3_and_filter(orf, singlets, final, length_threshold, cap3_p, cap3_o)
        filtered_files.append(final)
    _write_archive(filtered_files)


def main():
    """Dispatch on the assembler named in argv[1]."""
    if sys.argv[1] == "velvet":
        process_velvet(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])
    else:
        process_trinity(sys.argv[2], sys.argv[3], sys.argv[4], sys.argv[5])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Remove redundant transcripts (transcripts from the same locus) from an
Oases/velvet fasta, keeping one transcript per locus.

Selection criteria (recursive):
  1. [CRITERIA 1] keep the sequence with the BEST oases confidence value
     taken from the fasta header (3rd '_' field from the end);
  2. [CRITERIA 2] on equal confidence, keep the longest sequence once any
     'N' has been removed (effective_length = length - number of N).

Caveat (inherited): transcripts grouped under the same locus may in fact be
non-overlapping, i.e. not truly redundant.

AUTHOR: Eric Fontanillas — LAST VERSION: 09.06.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import sys


def dico_filtering_redundancy(path_in):
    """Return {fasta_name: sequence} with one transcript kept per locus.

    A "locus" is the concatenation of the first two '_' fields of the
    fasta header (e.g. 'Locus_1_...' -> 'Locus1').
    """
    bash = {}              # locus -> list of [name, seq, confidence, effective_length]
    bash_unredundant = {}  # final selection: fasta_name -> sequence
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")

    ## 1 ## Extract each transcript; group transcripts sharing a short name.
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            # Join all sequence lines but the trailing empty split fragment
            # (same slicing as the original code).
            fasta_seq = "".join(lines[1:-1])
            fields = fasta_name.split("_")
            short_fasta_name = fields[0] + fields[1]
            confidence = fields[-3]  # oases confidence, used by [CRITERIA 1]
            # [CRITERIA 2]: length once any 'N' has been removed.
            effective_length = len(fasta_seq) - fasta_seq.count("N")
            bash.setdefault(short_fasta_name, []).append(
                [fasta_name, fasta_seq, confidence, effective_length])

    for key in bash:
        entries = bash[key]
        ## 2 ## Single transcript for this locus: record it directly.
        if len(entries) == 1:
            bash_unredundant[entries[0][0]] = entries[0][1]
        ## 3 ## Several transcripts: apply [CRITERIA 1] then [CRITERIA 2].
        else:
            best_by_confidence = {}  # confidence (float) -> best entry
            for entry in entries:
                confidence = float(entry[2])
                if confidence not in best_by_confidence:
                    best_by_confidence[confidence] = entry
                elif entry[3] > best_by_confidence[confidence][3]:
                    # Same confidence: keep the longest effective sequence.
                    best_by_confidence[confidence] = entry
            best = best_by_confidence[max(best_by_confidence)]
            bash_unredundant[best[0]] = best[1]

    return bash_unredundant


def main():
    """CLI entry point: S02a ... <input_fasta> <output_fasta>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    dico = dico_filtering_redundancy(path_in)

    # Sort the output records by locus number (2nd '_' field, e.g. ApXX).
    by_number = {}
    for name in dico:
        by_number[int(name.split("_")[1])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Format fasta names in a TRINITY output file.

Each header like 'comp12_c0_seq1_...' is rewritten to
'<SUFFIX>12_c0_seq1' where <SUFFIX> is a two-letter species code.

AUTHOR: Eric Fontanillas — LAST VERSION: 06.12.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import re
import sys


def dico_format_fasta_name(path_in, SUFFIX):
    """Return {formatted_name: first_sequence_line} for every record.

    Only the first sequence line of each record is kept, as in the
    original implementation (inputs are single-line fasta at this stage).
    """
    bash = {}
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            fasta_seq = lines[1]
            fields = fasta_name.split("_")
            # 'comp12' -> component number '12'; prefix it with the suffix.
            match = re.search(r'(\D+)(\d+)', fields[0])
            short_fasta_name = SUFFIX + match.group(2) + "_" + fields[1] + "_" + fields[2]
            bash[short_fasta_name] = fasta_seq
    return bash


def main():
    """CLI entry point: S02b ... <input_fasta> <output_fasta> <suffix>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    suffix = sys.argv[3]

    dico = dico_format_fasta_name(path_in, suffix)
    print(len(dico.keys()))

    # Sort the fasta names by their component number XX (from '<suffix>XX').
    by_number = {}
    for name in dico:
        # Assumes a 2-character suffix, as produced by S01 — TODO confirm.
        by_number[int(name.split("_")[0][2:])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Remove redundant transcripts (transcripts from the same locus) from a
TRINITY fasta, keeping one variant per locus.

Single selection criterion:
  [CRITERIA 1] keep the longest sequence once any 'N' has been removed
  (effective_length = length - number of N).

AUTHOR: Eric Fontanillas — LAST VERSION: 06.12.2011
(modernized: Python 3 compatible, importable, no legacy `string` functions)
"""
import sys


def dico_filtering_redundancy(path_in):
    """Return {fasta_name: sequence} with one variant kept per locus.

    A "locus" is the concatenation of the first two '_' fields of the
    fasta header (e.g. 'Ap1_c0_seq1' -> 'Ap1c0').
    """
    bash = {}              # locus -> list of [name, seq, effective_length]
    bash_unredundant = {}  # final selection: fasta_name -> sequence
    with open(path_in, "r") as f_in:
        records = f_in.read().split(">")

    ## 1 ## Extract each transcript; group variants sharing a short name.
    k = 0
    for element in records:
        if element != "":
            lines = element.split("\n")
            fasta_name = lines[0]
            fasta_seq = lines[1]  # single-line fasta at this stage
            fields = fasta_name.split("_")
            short_fasta_name = fields[0] + fields[1]
            # [CRITERIA 1]: length once any 'N' has been removed.
            effective_length = len(fasta_seq) - fasta_seq.count("N")
            bash.setdefault(short_fasta_name, []).append(
                [fasta_name, fasta_seq, effective_length])
            k = k + 1
            if k % 1000 == 0:
                print(k)  # progress indicator, as in the original

    for key in bash:
        entries = bash[key]
        ## 2 ## Single variant for this locus: record it directly.
        if len(entries) == 1:
            bash_unredundant[entries[0][0]] = entries[0][1]
        ## 3 ## Several variants: keep the longest effective sequence.
        else:
            by_length = {}
            for entry in entries:
                # On equal effective lengths the LAST variant seen wins,
                # exactly as the original dict-overwrite behaved.
                by_length[entry[2]] = entry
            best = by_length[max(by_length)]
            bash_unredundant[best[0]] = best[1]

    return bash_unredundant


def main():
    """CLI entry point: S03 ... <input_fasta> <output_fasta>."""
    path_in = sys.argv[1]
    path_out = sys.argv[2]
    dico = dico_filtering_redundancy(path_in)

    # Sort the fasta names by their number XX (from '<suffix>XX_...').
    by_number = {}
    for name in dico:
        # Assumes a 2-character suffix, as produced by S02b — TODO confirm.
        by_number[int(name.split("_")[0][2:])] = name

    with open(path_out, "w") as file_out:
        for nb in sorted(by_number):
            fasta_name = by_number[nb]
            file_out.write(">%s\n" % fasta_name)
            file_out.write("%s\n" % dico[fasta_name])


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Keep the longest ORF found among the 6 possible reading frames.

Usage: S04_find_orf.py <input_fasta> <output_fasta>
Input must be 2-line fasta records (header line, then one sequence line).
"""
import sys


def find_orf(entry):
    """Return [start, end] (inclusive indices) of the longest open stretch.

    Scans the 3 forward reading frames of `entry`, splitting each frame at
    stop codons, and returns the boundaries of the longest stop-free
    stretch over all frames.  Logic is preserved verbatim from the
    original implementation, including its frame-offset adjustment when
    the best stretch starts at position 0.
    """
    orf = {}
    orf_length = {}
    stop = ['TAA', 'TAG', 'TGA']
    for i in range(0, 3):
        pos = i
        orf[i] = [0]
        while pos < len(entry):
            if entry[pos:pos+3] in stop:
                # Record the end of the current stretch and the start of the next.
                orf[i].append(pos-1)
                orf[i].append(pos+3)
            pos += 3
        orf[i].append(len(entry)-1)
        # Lengths of the stretches delimited above; keep the longest one.
        orf_length[i] = []
        for u in range(1, len(orf[i])):
            orf_length[i].append(orf[i][u]-orf[i][u-1]+1)
        orf[i] = [orf[i][orf_length[i].index(max(orf_length[i]))],
                  orf[i][orf_length[i].index(max(orf_length[i]))+1]]
    # Pick the frame holding the overall longest stretch.
    orf_max = {0: max(orf_length[0]), 1: max(orf_length[1]), 2: max(orf_length[2])}
    orf = orf[max(orf_max.keys(), key=(lambda k: orf_max[k]))]
    if orf[0] == 0:
        # Shift a frame-relative start of 0 by the winning frame's offset.
        orf[0] = orf[0] + max(orf_max.keys(), key=(lambda k: orf_max[k]))
    return orf


def reverse_seq(entry):
    """Return the reverse complement of `entry` (alphabet: A, T, G, C, N)."""
    nt = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    seqlist = []
    for i in range(len(entry)-1, -1, -1):
        seqlist.append(nt[entry[i]])
    return ''.join(seqlist)


def main():
    """CLI entry point: read 2-line records, write the best ORF of each."""
    path_in = sys.argv[1]
    threshold = 0  # minimal length of the ORF
    inc = 1
    with open(path_in, "r") as f_in, open(sys.argv[2], "w") as file_out:
        while True:
            line = f_in.readline()
            if not line:
                break
            line = f_in.readline()  # sequence line of the 2-line record
            name = ">" + path_in[:2] + str(inc) + "_1/1_1.000_"
            high_plus = find_orf(line[:-1])
            reverse = reverse_seq(line[:-1])
            high_minus = find_orf(reverse)
            if high_plus[1]-high_plus[0] > threshold or high_minus[1]-high_minus[0] > threshold:
                inc += 1
                if high_plus[1]-high_plus[0] > high_minus[1]-high_minus[0]:
                    file_out.write("%s" % name)
                    file_out.write(str(high_plus[1]-high_plus[0]+1)+"\n")
                    file_out.write("%s" % line[high_plus[0]:high_plus[1]+1])
                    file_out.write("\n")
                else:
                    file_out.write("%s" % name)
                    file_out.write(str(high_minus[1]-high_minus[0]+1)+"\n")
                    file_out.write("%s" % reverse[high_minus[0]:high_minus[1]+1])
                    file_out.write("\n")


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Filter post-cap3 sequences by length and rename headers so they are
compatible with the phylogeny workflow.

Usage: S05_filter.py <input_fasta> <length_threshold_nucleotides> <output_fasta>
Input must be 2-line fasta records (header line, then one sequence line).
"""
import sys


def filter_fasta(path_in, threshold, path_out, prefix):
    """Copy records whose sequence has at least `threshold` nucleotides.

    Kept records are renamed '>PREFIX<n>_1/1_1.000_<length>'; <n> counts
    only the kept sequences.  The length test (len(line)-1 > threshold-1)
    assumes each sequence line ends with a newline, as the original did.
    """
    inc = 1
    with open(path_in, "r") as f_in, open(path_out, "w") as file_out:
        while True:
            line = f_in.readline()
            if not line:
                break
            line = f_in.readline()  # sequence line of the 2-line record
            name = ">" + prefix + str(inc) + "_1/1_1.000_"
            if len(line) - 1 > threshold - 1:
                inc += 1
                file_out.write("%s" % name)
                file_out.write(str(len(line)-1) + "\n")
                file_out.write("%s" % line)


def main():
    """CLI entry point; the header prefix is the input path's first 2 chars."""
    path_in = sys.argv[1]
    threshold = int(sys.argv[2])  # minimum number of nucleotides per sequence
    path_out = sys.argv[3]
    filter_fasta(path_in, threshold, path_out, path_in[:2])


if __name__ == "__main__":
    main()
