Mercurial > repos > abims-sbr > pogs
comparison scripts/pogs.py @ 6:b19ed7395dcc draft
planemo upload for repository https://github.com/abims-sbr/adaptsearch commit cf1b9c905931ca2ca25faa4844d45c908756472f
| author | abims-sbr |
|---|---|
| date | Wed, 17 Jan 2018 08:54:30 -0500 |
| parents | |
| children | 1a728cb1da31 |
comparison
equal
deleted
inserted
replaced
| 5:dad7053ba20e | 6:b19ed7395dcc |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # coding: utf8 | |
| 3 # September 2017 - Author : Victor Mataigne (Station Biologique de Roscoff - ABiMS) | |
| 4 # Command line : ./pogsPOO.py <list_of_input_files_separated_by_commas> <minimal number of species per group> [-v) [-p] | |
| 5 | |
| 6 """ | |
| 7 What it does: | |
| 8 - pogs.py parses output files from the "pairwise" tool of the AdaptSearch suite and proceeds to gather genes in orthogroups (using transitivity). | |
| 9 - A minimal number of species per group has to be set. | |
| 10 | |
| 11 BETA VERSION | |
| 12 """ | |
| 13 | |
| 14 import os, argparse | |
| 15 import numpy as np | |
| 16 import pandas as pd | |
| 17 | |
| 18 """ Definition of a locus : header + sequence + a tag """ | |
| 19 class Locus: | |
| 20 | |
| 21 def __init__(self, header, sequence): | |
| 22 self.header = header | |
| 23 self.sequence = sequence | |
| 24 self.tagged = False | |
| 25 | |
| 26 def __str__(self): | |
| 27 return "{}{}".format(self.header, self.sequence) | |
| 28 | |
| 29 def __eq__(self, other): | |
| 30 return self.getHeader() == other.getHeader() # Test if two loci are the same | |
| 31 | |
| 32 def __hash__(self): | |
| 33 return hash((self.header, self.sequence)) # Make the object iterable and hashable | |
| 34 | |
| 35 def getHeader(self): | |
| 36 return self.header | |
| 37 | |
| 38 def getSequence(self): | |
| 39 return self.sequence | |
| 40 | |
| 41 def getTag(self): | |
| 42 return self.tagged | |
| 43 | |
| 44 def prettyPrint(self): | |
| 45 # Used for debugging : print "{ Header : ", self.header[0:-1], "Tag : ", self.tagged, " }" | |
| 46 print "[ Header : {header} ]".format(header=self.header[0:-1]) | |
| 47 | |
| 48 def prettyPrint2(self): | |
| 49 print "[ Header : {header} Sequence : {sequence} ]".format(header=self.header[0:-1], sequence=self.sequence[0:-1]) | |
| 50 | |
| 51 """ Applies the getPairwiseCouple() function to a list of files and return a big list with ALL pairwises couples | |
| 52 Returns a list of sets (2 items per set) """ | |
| 53 def getListPairwiseAll(listPairwiseFiles): | |
| 54 | |
| 55 # Sub-Function | |
| 56 | |
| 57 """ Reads an output file from the 'Pairwise' tool (AdaptSearch suite) and returns its content into a list | |
| 58 Returns a list of sets (2 items per set) """ | |
| 59 def getPairwiseCouple(pairwiseFile): | |
| 60 list_pairwises_2sp = [] | |
| 61 with open(pairwiseFile, "r") as file: | |
| 62 while (1): # Ugly ! | |
| 63 name, sequence, name2, sequence2 = file.readline(), file.readline(), file.readline(), file.readline() | |
| 64 if not name: break # Use assert ? | |
| 65 # One locus every two lines (one pairwise couple = 4 lines) : header + sequence | |
| 66 locus1 = Locus(name, sequence) | |
| 67 locus2 = Locus(name2, sequence2) | |
| 68 group = set([]) | |
| 69 group.add(locus1) | |
| 70 group.add(locus2) | |
| 71 list_pairwises_2sp.append(group) | |
| 72 return (list_pairwises_2sp) | |
| 73 | |
| 74 # Function | |
| 75 list_pairwises_allsp = [] | |
| 76 for file in listPairwiseFiles: | |
| 77 listPairwises = getPairwiseCouple(file) | |
| 78 for pairwise in listPairwises: | |
| 79 list_pairwises_allsp.append(pairwise) # all pairwises in the same 1D list | |
| 80 return list_pairwises_allsp | |
| 81 | |
| 82 """ Proceeds to create orthogroups by putting together pairwise couples sharing a locus. | |
| 83 Iterates over the orthogroups list and tag to 'True' the pairwise couple already gathered in a group to avoid | |
| 84 redondancy. Writes each orthogroup in a fasta file | |
| 85 Returns an integer (a list length) """ | |
| 86 def makeOrthogroups(list_pairwises_allsp, minspec, nb_rbh, verbose, paralogs): | |
| 87 | |
| 88 # Sub-funtions | |
| 89 | |
| 90 """ Check if a locus/group has already been treated in makeOrthogroups() | |
| 91 Returns a boolean """ | |
| 92 def checkIfTagged(pair): | |
| 93 tag = True | |
| 94 for element in pair: | |
| 95 if not element.getTag() and tag: # use a list comprehension maybe ? | |
| 96 tag = False | |
| 97 return tag | |
| 98 | |
| 99 """ True means a locus/group has already been treated in makeOrthogroups() | |
| 100 A stronger code would be to implement a method inside the class Locus """ | |
| 101 def tagGroup(pair): | |
| 102 for element in pair: | |
| 103 element.tagged = True | |
| 104 | |
| 105 """ Write an orthogroup in a file """ | |
| 106 def writeOutputFile(orthogroup, number, naming): | |
| 107 name = "" | |
| 108 if naming: | |
| 109 name = "orthogroup_{}_with_{}_sequences_withParalogs.fasta".format(number, len(orthogroup)) | |
| 110 else : | |
| 111 name = "orthogroup_{}_with_{}_sequences.fasta".format(number, len(orthogroup)) | |
| 112 result = open(name, "w") | |
| 113 with result: | |
| 114 for locus in orthogroup: | |
| 115 if locus.getHeader()[-1] == "\n": | |
| 116 result.write("%s" % locus.getHeader()) # write geneID | |
| 117 else : | |
| 118 result.write("%s\n" % locus.Header()) # write geneID | |
| 119 if locus.getSequence()[-1] == "\n": | |
| 120 result.write("%s" % locus.getSequence()) # write sequence | |
| 121 else : | |
| 122 result.write("%s\n" % locus.getSequence()) # write sequence | |
| 123 if naming: | |
| 124 os.system("mv {} outputs_withParalogs/".format(name)) | |
| 125 else : | |
| 126 os.system("mv {} outputs/".format(name)) | |
| 127 | |
| 128 """ Parse an orthogroup list to keep only one paralog sequence per species & per group | |
| 129 (Keeps the 1st paralogous encoutered) | |
| 130 Returns a list """ | |
| 131 def filterParalogs(list_orthogroups, minspec): | |
| 132 list_orthogroups_format = [] | |
| 133 j = 1 | |
| 134 | |
| 135 for nofilter_group in list_orthogroups: | |
| 136 new_group = [] | |
| 137 species = {} | |
| 138 for loci in nofilter_group: | |
| 139 species[loci.getHeader()[1:3]] = False | |
| 140 for loci in nofilter_group: | |
| 141 if not species[loci.getHeader()[1:3]]: | |
| 142 new_group.append(loci) | |
| 143 species[loci.getHeader()[1:3]] = True | |
| 144 | |
| 145 if len(new_group) >= minspec: # Drop too small orthogroups | |
| 146 list_orthogroups_format.append(new_group) | |
| 147 writeOutputFile(new_group, j, False) | |
| 148 j += 1 | |
| 149 | |
| 150 return list_orthogroups_format | |
| 151 | |
| 152 """ Builds a 2D array for a summary | |
| 153 Returns a numpy 2D array """ | |
| 154 def countings(listOrthogroups, nb_rbh): | |
| 155 | |
| 156 def compute_nbspec(nb_rbh): | |
| 157 | |
| 158 def factorielle(x): | |
| 159 n = 1 | |
| 160 s = 0 | |
| 161 while n <= x: | |
| 162 s += n | |
| 163 n += 1 | |
| 164 return s | |
| 165 | |
| 166 x = 2 | |
| 167 nb_specs = 0 | |
| 168 while x*x - factorielle(x) < nb_rbh: | |
| 169 x += 1 | |
| 170 return x | |
| 171 #listOrthogroups.sort().reverse() | |
| 172 #nblines = len(listOrthogroups[0]) | |
| 173 nblines = 0 | |
| 174 for group in listOrthogroups: | |
| 175 if len(group) > nblines: | |
| 176 nblines = len(group) | |
| 177 matrix = np.array([[0]*compute_nbspec(nb_rbh)]*nblines) | |
| 178 | |
| 179 for group in listOrthogroups: | |
| 180 listSpecs = [] | |
| 181 for loci in group: | |
| 182 if loci.getHeader()[1:3] not in listSpecs: | |
| 183 listSpecs.append(loci.getHeader()[1:3]) | |
| 184 matrix[len(group)-1][len(listSpecs)-1] += 1 | |
| 185 | |
| 186 return matrix | |
| 187 | |
| 188 """ numpy 2D array in a nice dataframe | |
| 189 Returns a pandas 2D dataframe """ | |
| 190 def asFrame(matrix) : | |
| 191 index = [0]*len(matrix) | |
| 192 colnames = [0]*len(matrix[0]) | |
| 193 index = [str(i+1)+" seqs" for i in range(len(matrix))] | |
| 194 colnames = [str(i+1)+" sps" for i in range(len(matrix[0]))] | |
| 195 df = pd.DataFrame(matrix, index=index, columns=colnames) | |
| 196 return df # Mettre une selection pour ne renvoyer que les lignes et les colonnes qui somment > 0 | |
| 197 #return df.loc['4 seqs':'9 seqs'].loc[:,colnames[3:]] | |
| 198 | |
| 199 # Function ------------------------------------------------------------------------------------------------- | |
| 200 list_orthogroups = [] | |
| 201 | |
| 202 for ortho_pair1 in list_pairwises_allsp[0:-1]: | |
| 203 if not checkIfTagged(ortho_pair1): | |
| 204 orthogroup = ortho_pair1 # the orthogroup grows as we go throught the second loop | |
| 205 | |
| 206 # check for common locus between two groups | |
| 207 for ortho_pair2 in list_pairwises_allsp[list_pairwises_allsp.index(ortho_pair1) + 1:]: | |
| 208 if len(orthogroup.intersection(ortho_pair2)) != 0 and not checkIfTagged(ortho_pair2): | |
| 209 orthogroup.update(orthogroup | ortho_pair2) | |
| 210 tagGroup(ortho_pair2) | |
| 211 | |
| 212 # Check if subgroup is already computed | |
| 213 if len(list_orthogroups) > 0: | |
| 214 presence = False | |
| 215 for group in list_orthogroups: | |
| 216 if len(group.intersection(orthogroup)) != 0: | |
| 217 group.update(group | orthogroup) | |
| 218 presence = True | |
| 219 if not presence: | |
| 220 list_orthogroups.append(orthogroup) | |
| 221 else: | |
| 222 list_orthogroups.append(orthogroup) | |
| 223 | |
| 224 # Options -------------------------------------------------------------------------------------------------- | |
| 225 | |
| 226 """ nb : I could try to implement a more complex code which does in the same previous loop all the following lines, to avoid multiples parsing of | |
| 227 the orthogroups list, but the code would become hardly readable. Since the whole program is already quite fast, I chosed code simplicity | |
| 228 over code efficiency """ | |
| 229 | |
| 230 # Print summary table with all paralogs | |
| 231 if verbose : | |
| 232 frame = countings(list_orthogroups, nb_rbh) | |
| 233 df = asFrame(frame) | |
| 234 print "\n Summary before paralogous filtering : \n" | |
| 235 print df.loc[df.ne(0).any(1),df.ne(0).any()], "\n" # Don't display columns and lines filled with 0 | |
| 236 | |
| 237 # Write outputFile with all the paralogous | |
| 238 if paralogs: | |
| 239 print "Writing orthogroups with paralogs files ...\n" | |
| 240 j = 1 | |
| 241 for group in list_orthogroups: | |
| 242 if len(group) >= minspec: | |
| 243 writeOutputFile(group, j, True) | |
| 244 j += 1 | |
| 245 | |
| 246 # Paralogs filtering and summary ---------------------------------------------------------------------------- | |
| 247 | |
| 248 print "Filtering paralogous sequences and writing final orthogroups files ..." | |
| 249 print " (Dropping Orthogroups with less than {} species)".format(minspec) | |
| 250 | |
| 251 # writeOutputFile() is called in filterParalogs() | |
| 252 list_orthogroups_format = filterParalogs(list_orthogroups, minspec) | |
| 253 | |
| 254 frame = countings(list_orthogroups_format, nb_rbh) | |
| 255 df = asFrame(frame) | |
| 256 print "\n Summary after paralogous filtering : \n" | |
| 257 print df.loc[df.ne(0).any(1),df.ne(0).any()] | |
| 258 | |
| 259 #return only the length of the list (at this point the program doesn't need more) | |
| 260 return len(list_orthogroups_format) | |
| 261 | |
| 262 def main(): | |
| 263 parser = argparse.ArgumentParser() | |
| 264 parser.add_argument("files", help="Input files separated by commas. Each file contains all the reciprocical best hits between a pair of species") | |
| 265 parser.add_argument("minspec", help="Only keep Orthogroups with at least this number of species", type=int) | |
| 266 parser.add_argument("-v", "--verbose", action="store_true", help="A supplemental summary table of orthogroups before paralogs filtering will be returned") | |
| 267 parser.add_argument("-p", "--paralogs", action="store_true", help="Proceeds to write orthogroups also before paralogous filtering") | |
| 268 args = parser.parse_args() | |
| 269 | |
| 270 print "*** pogs.py ***" | |
| 271 print "\nBuilding of orthogroups based on pairs of genes obtained by pairwise comparisons between pairs of species." | |
| 272 print "Genes are gathered in orthogroups based on the principle of transitivity between genes pairs." | |
| 273 | |
| 274 os.system("mkdir outputs") | |
| 275 if args.paralogs: os.system("mkdir outputs_withParalogs") | |
| 276 infiles = args.files | |
| 277 listPairwiseFiles = str.split(infiles, ",") | |
| 278 print "\nParsing input files ..." | |
| 279 list_Locus = getListPairwiseAll(listPairwiseFiles) | |
| 280 print "Creating Orthogroups ..." | |
| 281 nb_orthogroups = makeOrthogroups(list_Locus, args.minspec, len(listPairwiseFiles), args.verbose, args.paralogs) | |
| 282 print "\n{} orthogroups have been infered from {} pairwise comparisons by RBH\n".format(nb_orthogroups, len(listPairwiseFiles)) | |
| 283 | |
| 284 if __name__ == "__main__": | |
| 285 main() |
