comparison scripts/pogs.py @ 6:b19ed7395dcc draft

planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
author abims-sbr
date Wed, 17 Jan 2018 08:54:30 -0500
parents
children 1a728cb1da31
comparison
equal deleted inserted replaced
5:dad7053ba20e 6:b19ed7395dcc
1 #!/usr/bin/env python
2 # coding: utf8
3 # September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS)
4 # Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p]
5
6 """
7 What it does:
8 - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity).
9 - A minimal number of species per group has to be set.
10
11 BETA VERSION
12 """
13
14 import os, argparse
15 import numpy as np
16 import pandas as pd
17
18 """ Definition of a locus : header + sequence + a tag """
19 class Locus:
20
21 def __init__(self, header, sequence):
22 self.header = header
23 self.sequence = sequence
24 self.tagged = False
25
26 def __str__(self):
27 return "{}{}".format(self.header, self.sequence)
28
29 def __eq__(self, other):
30 return self.getHeader() == other.getHeader() # Test if two loci are the same
31
32 def __hash__(self):
33 return hash((self.header, self.sequence)) # Make the object iterable and hashable
34
35 def getHeader(self):
36 return self.header
37
38 def getSequence(self):
39 return self.sequence
40
41 def getTag(self):
42 return self.tagged
43
44 def prettyPrint(self):
45 # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }"
46 print "[ Header : {header} ]".format(header=self.header[0:-1])
47
48 def prettyPrint2(self):
49 print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1])
50
51 """ Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples
52 Returns a list of sets (2 items per set) """
53 def getListPairwiseAll(listPairwiseFiles):
54
55 # Sub-Function
56
57 """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list
58 Returns a list of sets (2 items per set) """
59 def getPairwiseCouple(pairwiseFile):
60 list_pairwises_2sp = []
61 with open(pairwiseFile, "r") as file:
62 while (1): # Ugly !
63 name, sequence, name2, sequence2 = file.readline(), file.readline(), file.readline(), file.readline()
64 if not name: break # Use assert ?
65 # One locus every two lines (one pairwise couple = 4 lines) : header + sequence
66 locus1 = Locus(name, sequence)
67 locus2 = Locus(name2, sequence2)
68 group = set([])
69 group.add(locus1)
70 group.add(locus2)
71 list_pairwises_2sp.append(group)
72 return (list_pairwises_2sp)
73
74 # Function
75 list_pairwises_allsp = []
76 for file in listPairwiseFiles:
77 listPairwises = getPairwiseCouple(file)
78 for pairwise in listPairwises:
79 list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list
80 return list_pairwises_allsp
81
82 """ Proceeds to create orthogroups by putting together pairwise couples sharing a locus.
83 Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid
84 redondancy. Writes each orthogroup in a fasta file
85 Returns an integer (a list length) """
86 def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs):
87
88 # Sub-funtions
89
90 """ Check if a locus/group has already been treated in makeOrthogroups()
91 Returns a boolean """
92 def checkIfTagged(pair):
93 tag = True
94 for element in pair:
95 if not element.getTag() and tag: # use a list comprehension maybe ?
96 tag = False
97 return tag
98
99 """ True means a locus/group has already been treated in makeOrthogroups()
100 A stronger code would be to implement a method inside the class Locus """
101 def tagGroup(pair):
102 for element in pair:
103 element.tagged = True
104
105 """ Write an orthogroup in a file """
106 def writeOutputFile(orthogroup, number, naming):
107 name = ""
108 if naming:
109 name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup))
110 else :
111 name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup))
112 result = open(name, "w")
113 with result:
114 for locus in orthogroup:
115 if locus.getHeader()[-1] == "\n":
116 result.write("%s" % locus.getHeader()) # write geneID
117 else :
118 result.write("%s\n" % locus.Header()) # write geneID
119 if locus.getSequence()[-1] == "\n":
120 result.write("%s" % locus.getSequence()) # write sequence
121 else :
122 result.write("%s\n" % locus.getSequence()) # write sequence
123 if naming:
124 os.system("mv {} outputs_withParalogs/".format(name))
125 else :
126 os.system("mv {} outputs/".format(name))
127
128 """ Parse an orthogroup list to keep only one paralog sequence per species & per group
129 (Keeps the 1st paralogous encoutered)
130 Returns a list """
131 def filterParalogs(list_orthogroups, minspec):
132 list_orthogroups_format = []
133 j = 1
134
135 for nofilter_group in list_orthogroups:
136 new_group = []
137 species = {}
138 for loci in nofilter_group:
139 species[loci.getHeader()[1:3]] = False
140 for loci in nofilter_group:
141 if not species[loci.getHeader()[1:3]]:
142 new_group.append(loci)
143 species[loci.getHeader()[1:3]] = True
144
145 if len(new_group) >= minspec: # Drop too small orthogroups
146 list_orthogroups_format.append(new_group)
147 writeOutputFile(new_group, j, False)
148 j += 1
149
150 return list_orthogroups_format
151
152 """ Builds a 2D array for a summary
153 Returns a numpy 2D array """
154 def countings(listOrthogroups, nb_rbh):
155
156 def compute_nbspec(nb_rbh):
157
158 def factorielle(x):
159 n = 1
160 s = 0
161 while n <= x:
162 s += n
163 n += 1
164 return s
165
166 x = 2
167 nb_specs = 0
168 while x*x - factorielle(x) < nb_rbh:
169 x += 1
170 return x
171 #listOrthogroups.sort().reverse()
172 #nblines = len(listOrthogroups[0])
173 nblines = 0
174 for group in listOrthogroups:
175 if len(group) > nblines:
176 nblines = len(group)
177 matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines)
178
179 for group in listOrthogroups:
180 listSpecs = []
181 for loci in group:
182 if loci.getHeader()[1:3] not in listSpecs:
183 listSpecs.append(loci.getHeader()[1:3])
184 matrix[len(group)-1][len(listSpecs)-1] += 1
185
186 return matrix
187
188 """ numpy 2D array in a nice dataframe
189 Returns a pandas 2D dataframe """
190 def asFrame(matrix) :
191 index = [0]*len(matrix)
192 colnames = [0]*len(matrix[0])
193 index = [str(i+1)+" seqs" for i in range(len(matrix))]
194 colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))]
195 df = pd.DataFrame(matrix, index=index, columns=colnames)
196 return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0
197 #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]]
198
199 # Function -------------------------------------------------------------------------------------------------
200 list_orthogroups = []
201
202 for ortho_pair1 in list_pairwises_allsp[0:-1]:
203 if not checkIfTagged(ortho_pair1):
204 orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop
205
206 # check for common locus between two groups
207 for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]:
208 if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2):
209 orthogroup.update(orthogroup | ortho_pair2)
210 tagGroup(ortho_pair2)
211
212 # Check if subgroup is already computed
213 if len(list_orthogroups) > 0:
214 presence = False
215 for group in list_orthogroups:
216 if len(group.intersection(orthogroup)) != 0:
217 group.update(group | orthogroup)
218 presence = True
219 if not presence:
220 list_orthogroups.append(orthogroup)
221 else:
222 list_orthogroups.append(orthogroup)
223
224 # Options --------------------------------------------------------------------------------------------------
225
226 """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of
227 the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity
228 over code efficiency """
229
230 # Print summary table with all paralogs
231 if verbose :
232 frame = countings(list_orthogroups, nb_rbh)
233 df = asFrame(frame)
234 print "\n Summary before paralogous filtering : \n"
235 print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0
236
237 # Write outputFile with all the paralogous
238 if paralogs:
239 print "Writing orthogroups with paralogs files ...\n"
240 j = 1
241 for group in list_orthogroups:
242 if len(group) >= minspec:
243 writeOutputFile(group, j, True)
244 j += 1
245
246 # Paralogs filtering and summary ----------------------------------------------------------------------------
247
248 print "Filtering paralogous sequences and writing final orthogroups files ..."
249 print " (Dropping Orthogroups with less than {} species)".format(minspec)
250
251 # writeOutputFile() is called in filterParalogs()
252 list_orthogroups_format = filterParalogs(list_orthogroups, minspec)
253
254 frame = countings(list_orthogroups_format, nb_rbh)
255 df = asFrame(frame)
256 print "\n Summary after paralogous filtering : \n"
257 print df.loc[df.ne(0).any(1),df.ne(0).any()]
258
259 #return only the length of the list (at this point the program doesn't need more)
260 return len(list_orthogroups_format)
261
262 def main():
263 parser = argparse.ArgumentParser()
264 parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species")
265 parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int)
266 parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned")
267 parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering")
268 args = parser.parse_args()
269
270 print "*** pogs.py ***"
271 print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species."
272 print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs."
273
274 os.system("mkdir outputs")
275 if args.paralogs: os.system("mkdir outputs_withParalogs")
276 infiles = args.files
277 listPairwiseFiles = str.split(infiles, ",")
278 print "\nParsing input files ..."
279 list_Locus = getListPairwiseAll(listPairwiseFiles)
280 print "Creating Orthogroups ..."
281 nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs)
282 print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles))
283
284 if __name__ == "__main__":
285 main()