Mercurial > repos > abims-sbr > orthogroups_tool

--- a/orthogroups_tool.xml	Wed Jan 17 11:32:14 2018 -0500
+++ b/orthogroups_tool.xml	Fri Jan 19 09:51:12 2018 -0500
@@ -1,4 +1,4 @@
-<tool name="Orthogroups_Tool" id="orthogroups_tool" version="1.0">
+<tool name="Orthogroups_Tool" id="orthogroups_tool" version="1.0.1">

     <description>
         This tool takes Orthogroups found by OrthoFinder and proceeds to retrieve nucleic sequences back, then write each orthogroups in its own fasta file.
@@ -93,6 +93,8 @@

 <![CDATA[

+The script of this tool has been written by Victor Mataigne.
+
 -------------------------------------------

 **Description**
--- a/scripts/filter_orthofinder.py	Wed Jan 17 11:32:14 2018 -0500
+++ b/scripts/filter_orthofinder.py	Fri Jan 19 09:51:12 2018 -0500
@@ -4,7 +4,7 @@
 ## This script takes an output file of OrthoFinder (Orthogroups.txt), which contains a set of orthogroups,
 ## and rewrite it to split each orthogroup into a single fasta file.

-import os, string, glob, argparse, csv
+import os, string, glob, argparse, csv, itertools
 import numpy as np
 import pandas as pd

@@ -15,18 +15,13 @@
 def hashSequences(path):
     hashTable = {}
     # WARNING : sequences are expected to be on one line. If not, biopython can do it
-    for file in path:
-        originFile = open(file, "r")
+    for file in path:
         gene = ""
         sequence = ""
-        with originFile:
-            while (1): # Not the best way to do
-                gene = originFile.readline()
-                if not gene:
-                    break
-                gene = gene[:-1]
-                sequence = originFile.readline()
-                sequence = sequence[:-1]
+        with open(file, "r") as origin:
+            for line1,line2 in itertools.izip_longest(*[origin]*2):
+                gene=line1.strip("\r\n ")
+                sequence=line2.strip(" \r\n ")
                 hashTable[gene] = sequence
     return hashTable

@@ -107,12 +102,12 @@
     i,j = 1,1
     for group in list_orthogroups:
         group = string.split(group, " ") # list of lists
-        group.sort()
-        if verbose or paralogs:
-            if len(group) >= mini:
-                list_orthogroups_withpara.append(group)
-                writeOutputFile(group, hashTable, j, True)
-                j += 1
+        group.sort()
+        if verbose and len(group) >= mini:
+            list_orthogroups_withpara.append(group)
+        if paralogs and len(group) >= mini:
+            writeOutputFile(group, hashTable, j, True)
+        j += 1
         new_group = []
         rang=-1
         # Keep only one paralogs per species (1st encounter)
@@ -157,7 +152,7 @@

     # Build hashtable
     print "  Building hashTable IDs/sequences ...\n"
-    path = glob.glob('*.fasta')
+    path = glob.glob('*.fasta')
     hashTable = hashSequences(path)

     # Open txt file with orthogroups
@@ -179,7 +174,8 @@
         os.system("mv {} filtered_orthogroups".format(file))

     print "  \nFiltered orthogroups are written in the directory 'filtered_orthogroups'"
-    print "  \nFull orthogroups files are written in the directory 'orthogroups_withParalogs'\n"
+    if args.paralogs:
+        print "  \nFull orthogroups files are written in the directory 'orthogroups_withParalogs'\n"

 if __name__ == "__main__":
     main()