Mercurial > repos > rico > test_600
comparison calclenchange.py @ 0:73648da53556 default tip
Uploaded
| author | rico |
|---|---|
| date | Mon, 09 Apr 2012 11:55:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:73648da53556 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 # | |
| 4 # calclenchange.py | |
| 5 # | |
| 6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> | |
| 7 # | |
| 8 # This program is free software; you can redistribute it and/or modify | |
| 9 # it under the terms of the GNU General Public License as published by | |
| 10 # the Free Software Foundation; either version 2 of the License, or | |
| 11 # (at your option) any later version. | |
| 12 # | |
| 13 # This program is distributed in the hope that it will be useful, | |
| 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 # GNU General Public License for more details. | |
| 17 # | |
| 18 # You should have received a copy of the GNU General Public License | |
| 19 # along with this program; if not, write to the Free Software | |
| 20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 21 # MA 02110-1301, USA. | |
| 22 | |
| 23 import argparse,mechanize,os,sys | |
| 24 from decimal import Decimal,getcontext | |
| 25 from xml.etree.ElementTree import ElementTree,tostring | |
| 26 import networkx as nx | |
| 27 from copy import copy | |
| 28 | |
| 29 #method to rank the the pthways by mut. freq. | |
| 30 def rankdN(ltfreqs): | |
| 31 ordvals=sorted(ltfreqs)#sort and reverse freqs. | |
| 32 #~ | |
| 33 outrnk=[] | |
| 34 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value | |
| 35 if tmpOri=='C': | |
| 36 if tmpMut!='C': | |
| 37 tmpChng0='C-%s'%tmpMut | |
| 38 else: | |
| 39 tmpChng0=Decimal('0') | |
| 40 crank=1 | |
| 41 outrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) | |
| 42 totalnvals=len(ordvals) | |
| 43 cnt=0 | |
| 44 while totalnvals>cnt: | |
| 45 cnt+=1 | |
| 46 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() | |
| 47 if tmpOri=='C': | |
| 48 if tmpMut!='C': | |
| 49 tmpChng='C-%s'%tmpMut | |
| 50 else: | |
| 51 tmpChng=Decimal('0') | |
| 52 if tmpChng!=tmpChng0: | |
| 53 crank=len(outrnk)+1 | |
| 54 tmpChng0=tmpChng | |
| 55 outrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw]) | |
| 56 return outrnk | |
| 57 | |
| 58 #method to rank the the pthways by mut. freq. | |
| 59 def rankdAvr(ltfreqs): | |
| 60 ordvals=sorted(ltfreqs)#sort and reverse freqs. | |
| 61 #~ | |
| 62 outrnk={} | |
| 63 tmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value | |
| 64 if tmpOri=='I': | |
| 65 if tmpMut!='I': | |
| 66 tmpChng0='I-%s'%tmpMut | |
| 67 else: | |
| 68 tmpChng0=Decimal('0') | |
| 69 crank=1 | |
| 70 outrnk[tmpPthw]='\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)]) | |
| 71 totalnvals=len(ordvals) | |
| 72 cnt=0 | |
| 73 while totalnvals>cnt: | |
| 74 cnt+=1 | |
| 75 tmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop() | |
| 76 if tmpOri=='I': | |
| 77 if tmpMut!='I': | |
| 78 tmpChng='I-%s'%tmpMut | |
| 79 else: | |
| 80 tmpChng=Decimal('0') | |
| 81 if tmpChng!=tmpChng0: | |
| 82 crank=len(outrnk)+1 | |
| 83 tmpChng0=tmpChng | |
| 84 outrnk[tmpPthw]='\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)]) | |
| 85 return outrnk | |
| 86 | |
| 87 #this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0 | |
| 88 def returnstartanendnodes(edges): | |
| 89 listID0st=set()#starts | |
| 90 listOD0en=set()#end | |
| 91 for beginNod,endNod in edges:# O(n) | |
| 92 listID0st.add(beginNod) | |
| 93 listOD0en.add(endNod) | |
| 94 startNdsID0=listID0st.difference(listOD0en) | |
| 95 endNdsOD0=listOD0en.difference(listID0st) | |
| 96 return startNdsID0,endNdsOD0 | |
| 97 | |
| 98 #~ Method to return nodes and edges | |
| 99 def returnNodesNEdgesfKXML(fpthwKGXML): | |
| 100 #~ | |
| 101 tree = ElementTree() | |
| 102 ptree=tree.parse(fpthwKGXML) | |
| 103 #~ | |
| 104 title=ptree.get('title') | |
| 105 prots=ptree.findall('entry') | |
| 106 reactns=ptree.findall('reaction') | |
| 107 #~ | |
| 108 edges,ndstmp=set(),set() | |
| 109 nreactns=len(reactns) | |
| 110 cr=0#count reacts | |
| 111 while nreactns>cr: | |
| 112 cr+=1 | |
| 113 reactn=reactns.pop() | |
| 114 mainid=reactn.get('id') | |
| 115 ndstmp.add(mainid)#add node | |
| 116 reacttyp=reactn.get('type') | |
| 117 sbstrts=reactn.findall('substrate') | |
| 118 while len(sbstrts)>0: | |
| 119 csbstrt=sbstrts.pop() | |
| 120 csbtsid=csbstrt.get('id') | |
| 121 ndstmp.add(csbtsid)#add node | |
| 122 if reacttyp=='irreversible': | |
| 123 edges.add((csbtsid,mainid))#add edges | |
| 124 elif reacttyp=='reversible': | |
| 125 edges.add((mainid,csbtsid))#add edges | |
| 126 edges.add((csbtsid,mainid))#add edges | |
| 127 #~ | |
| 128 prdcts=reactn.findall('product') | |
| 129 while len(prdcts)>0: | |
| 130 prdct=prdcts.pop() | |
| 131 prodctid=prdct.get('id') | |
| 132 ndstmp.add(prodctid)#add node | |
| 133 if reacttyp=='irreversible': | |
| 134 edges.add((mainid,prodctid))#add edges | |
| 135 elif reacttyp=='reversible': | |
| 136 edges.add((mainid,prodctid))#add edges | |
| 137 edges.add((prodctid,mainid))#add edges | |
| 138 #~ Nodes | |
| 139 nprots=len(prots) | |
| 140 cp=0#count prots | |
| 141 dnodes={} | |
| 142 while nprots>cp: | |
| 143 cp+=1 | |
| 144 prot=prots.pop() | |
| 145 tmpProtnm=prot.get('id') | |
| 146 if tmpProtnm in ndstmp: | |
| 147 dnodes[prot.get('id')]=set(prot.get('name').split())#each genename for each Id | |
| 148 return dnodes,edges,title | |
| 149 | |
| 150 #~ make calculation on pathways | |
| 151 def rtrnAvrgLen(edges,strNds,endNds): | |
| 152 wG=nx.DiGraph()#reference graph | |
| 153 wG.add_edges_from(edges) | |
| 154 dPairsSrcSnks=nx.all_pairs_shortest_path_length(wG)#dictionary between sources and sink and length | |
| 155 nstartNdsID0=len(strNds) | |
| 156 cstrtNds=0 | |
| 157 nPaths=0 | |
| 158 lPathLen=[] | |
| 159 while nstartNdsID0>cstrtNds: | |
| 160 cStartNd=strNds.pop()#current start node | |
| 161 dEndNdsLen=dPairsSrcSnks.pop(cStartNd) | |
| 162 for cendNd in dEndNdsLen: | |
| 163 if cendNd in endNds: | |
| 164 lPathLen.append(dEndNdsLen[cendNd]) | |
| 165 nPaths+=1 | |
| 166 cstrtNds+=1 | |
| 167 AvrgPthLen=0 | |
| 168 if nPaths!=0: | |
| 169 AvrgPthLen=Decimal(sum(lPathLen))/Decimal(str(nPaths)) | |
| 170 return nPaths,AvrgPthLen | |
| 171 | |
| 172 def main(): | |
| 173 parser = argparse.ArgumentParser(description='Rank pathways based on the change in length and number of paths connecting sources and sinks.') | |
| 174 parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database') | |
| 175 parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file') | |
| 176 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the diference between column 2 and column 3, Column 2 is the pathway average length (between sources and sinks) including the genes in the input list, Column 3 is the pathway average length EXCLUDING the genes in the input list, Column 4 is the rank based on column 1. Column 5 is the diference between column 6 and column 7, Column 6 is the number of paths between sources and sinks, including the genes in the input list, Column 7 is the number of paths between sources and sinks EXCLUDING the genes in the input list, Column 8 is the rank based on column 5. Column 9 I the pathway name' ) | |
| 177 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') | |
| 178 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') | |
| 179 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') | |
| 180 #~ | |
| 181 #~Open arguments | |
| 182 class C(object): | |
| 183 pass | |
| 184 fulargs=C() | |
| 185 parser.parse_args(sys.argv[1:],namespace=fulargs) | |
| 186 #test input vars | |
| 187 inputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn | |
| 188 posKEGGclmn-=1#correct pos | |
| 189 Kgeneposcolmn-=1 | |
| 190 #~ Get the extra variables | |
| 191 crDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0] | |
| 192 sppPrefx,dinput=crDB[1],crDB[2] | |
| 193 #~ set decimal positions | |
| 194 getcontext().prec = 3 | |
| 195 #make a dictionary of valid genes | |
| 196 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()]) | |
| 197 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) | |
| 198 while True:#to crrect names with more than one gene | |
| 199 try: | |
| 200 mgenes=sdGenes.pop() | |
| 201 pthwsAssotd=dKEGGcPthws.pop(mgenes) | |
| 202 mgenes=mgenes.split('.') | |
| 203 for eachg in mgenes: | |
| 204 dKEGGcPthws[eachg]=pthwsAssotd | |
| 205 except: | |
| 206 break | |
| 207 #~ | |
| 208 lPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']] | |
| 209 nPthws=len(lPthwsF) | |
| 210 cPthw=0 | |
| 211 lPthwPthN=[]#the output list for number of paths | |
| 212 lPthwPthAvr=[]#the output list for the length of paths | |
| 213 #~ | |
| 214 while cPthw<nPthws: | |
| 215 cPthw+=1 | |
| 216 KEGGpathw=lPthwsF.pop() | |
| 217 comdKEGGpathw=KEGGpathw.split('.')[0] | |
| 218 tmpddGenrcgenPresent=set() | |
| 219 sKEGGc=dKEGGcPthws.keys() | |
| 220 lsKEGGc=len(sKEGGc) | |
| 221 ctPthw=0 | |
| 222 while ctPthw < lsKEGGc:#to save memory | |
| 223 eachK=sKEGGc.pop() | |
| 224 alPthws=dKEGGcPthws[eachK] | |
| 225 if comdKEGGpathw in alPthws: | |
| 226 tmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK])) | |
| 227 ctPthw+=1 | |
| 228 #~ Make graph calculations | |
| 229 dnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw))) | |
| 230 startNdsID0,endNdsOD0=returnstartanendnodes(edges) | |
| 231 startNdsOri=copy(startNdsID0) | |
| 232 #~ | |
| 233 nPaths='C'#stands for circuit | |
| 234 AvrgPthLen='I'#stand for infinite | |
| 235 if len(startNdsID0)>0 and len(endNdsOD0)>0: | |
| 236 nPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0) | |
| 237 #~ work with the genes in the list | |
| 238 genestodel=set() | |
| 239 lnodes=len(dnodes) | |
| 240 sNds=set(dnodes) | |
| 241 ctPthw=0 | |
| 242 while ctPthw<lnodes: | |
| 243 ctPthw+=1 | |
| 244 cNod=sNds.pop() | |
| 245 sgenes=dnodes.pop(cNod) | |
| 246 if len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes): | |
| 247 genestodel.add(cNod) | |
| 248 #~ del nodes from graph edges | |
| 249 wnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen) | |
| 250 if len(genestodel)>0: | |
| 251 wedges=set([x for x in edges if len(set(x).intersection(genestodel))==0]) | |
| 252 wstartNds,wendNds=returnstartanendnodes(wedges) | |
| 253 if nPaths!='C': | |
| 254 wstartNds=[x for x in wstartNds if x in startNdsOri] | |
| 255 wendNds=[x for x in wendNds if x in endNdsOD0] | |
| 256 if len(wstartNds)>0 and len(wendNds)>0: | |
| 257 wnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds) | |
| 258 #~ Calculate the differences | |
| 259 orNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen | |
| 260 if nPaths=='C': | |
| 261 orNP=Decimal('1000') | |
| 262 oriLen=Decimal('1000') | |
| 263 if wnPaths=='C': | |
| 264 mutNP=Decimal('1000') | |
| 265 mutLen=Decimal('1000') | |
| 266 lPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen | |
| 267 lPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen | |
| 268 doutrnkPthN=rankdN(lPthwPthN) | |
| 269 doutrnkPthAvr=rankdAvr(lPthwPthAvr) | |
| 270 #~ | |
| 271 sall=['\t'.join([doutrnkPthAvr[x[4]],'\t'.join(x)]) for x in doutrnkPthN] | |
| 272 salef=open(output,'w') | |
| 273 salef.write('\n'.join(sall)) | |
| 274 salef.close() | |
| 275 return 0 | |
| 276 | |
| 277 | |
| 278 if __name__ == '__main__': | |
| 279 main() | |
| 280 |
