Mercurial > repos > rico > no_tests_test
comparison calctfreq.py @ 0:580da578c5e6 default tip
Uploaded
| author | rico |
|---|---|
| date | Thu, 05 Apr 2012 15:56:36 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:580da578c5e6 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 # | |
| 4 # calcfreq.py | |
| 5 # | |
| 6 # Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu> | |
| 7 # | |
| 8 # This program is free software; you can redistribute it and/or modify | |
| 9 # it under the terms of the GNU General Public License as published by | |
| 10 # the Free Software Foundation; either version 2 of the License, or | |
| 11 # (at your option) any later version. | |
| 12 # | |
| 13 # This program is distributed in the hope that it will be useful, | |
| 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 # GNU General Public License for more details. | |
| 17 # | |
| 18 # You should have received a copy of the GNU General Public License | |
| 19 # along with this program; if not, write to the Free Software | |
| 20 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, | |
| 21 # MA 02110-1301, USA. | |
| 22 | |
| 23 import argparse,os,sys | |
| 24 from decimal import Decimal,getcontext | |
| 25 from LocationFile import LocationFile | |
| 26 | |
| 27 #method to rank the the pthways by mut. freq. | |
| 28 def rankd(ltfreqs): | |
| 29 ordvals=sorted(ltfreqs)#sort and reverse freqs. | |
| 30 #~ | |
| 31 outrnk=[] | |
| 32 tmpFreq0,tmpCount,tmpPthw=ordvals.pop()#the highest possible value | |
| 33 crank=1 | |
| 34 outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),tmpPthw])) | |
| 35 totalnvals=len(ordvals) | |
| 36 cnt=0 | |
| 37 while totalnvals>cnt: | |
| 38 cnt+=1 | |
| 39 tmpFreq,tmpCount,tmpPthw=ordvals.pop() | |
| 40 if tmpFreq!=tmpFreq0: | |
| 41 crank=len(outrnk)+1 | |
| 42 tmpFreq0=tmpFreq | |
| 43 outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),tmpPthw])) | |
| 44 return outrnk | |
| 45 | |
| 46 | |
| 47 def main(): | |
| 48 parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.') | |
| 49 parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format') | |
| 50 parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2') | |
| 51 parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name') | |
| 52 parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code') | |
| 53 parser.add_argument('--loc_file',metavar='location file',type=str,help='location file') | |
| 54 parser.add_argument('--species',metavar='species',type=str,help='species') | |
| 55 #~Open arguments | |
| 56 class C(object): | |
| 57 pass | |
| 58 fulargs=C() | |
| 59 parser.parse_args(sys.argv[1:],namespace=fulargs) | |
| 60 #test input vars | |
| 61 inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn | |
| 62 locf,species=fulargs.loc_file,fulargs.species | |
| 63 #make a dictionary of valid genes | |
| 64 posKEGGclmn-=1 | |
| 65 Kgeneposcolmn-=1 | |
| 66 dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.strip()]) | |
| 67 sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1]) | |
| 68 while True:#to correct names with more than one gene | |
| 69 try: | |
| 70 mgenes=sdGenes.pop() | |
| 71 pthwsAssotd=dKEGGcPthws.pop(mgenes) | |
| 72 mgenes=mgenes.split('.') | |
| 73 for eachg in mgenes: | |
| 74 dKEGGcPthws[eachg]=pthwsAssotd | |
| 75 except: | |
| 76 break | |
| 77 #~ Count genes | |
| 78 getcontext().prec=2#set 2 decimal places | |
| 79 | |
| 80 location_file = LocationFile(locf) | |
| 81 prefix, kxml_dir_path, dict_file = location_file.get_values(species) | |
| 82 dPthContsTotls = {} | |
| 83 try: | |
| 84 with open(dict_file) as fh: | |
| 85 for line in fh: | |
| 86 line = line.rstrip('\r\n') | |
| 87 value, key = line.split('\t') | |
| 88 dPthContsTotls[key] = int(value) | |
| 89 except IOError, err: | |
| 90 print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror) | |
| 91 sys.exit(1) | |
| 92 | |
| 93 dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes | |
| 94 sdGenes=set([x for x in dKEGGcPthws.keys()])#list of all genes | |
| 95 cntGens=0 | |
| 96 ltGens=len(sdGenes) | |
| 97 while cntGens<ltGens: | |
| 98 cGen=sdGenes.pop() | |
| 99 sKEGGcPthws=dKEGGcPthws.pop(cGen) | |
| 100 for eachP in sKEGGcPthws: | |
| 101 if eachP!='N': | |
| 102 dPthContsTmp[eachP]+=1 | |
| 103 cntGens+=1 | |
| 104 #~ Calculate Freqs. | |
| 105 ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls] | |
| 106 tabllfreqs='\n'.join(rankd(ltfreqs)) | |
| 107 salef=open(outputf,'w') | |
| 108 salef.write(tabllfreqs) | |
| 109 salef.close() | |
| 110 return 0 | |
| 111 | |
| 112 | |
| 113 if __name__ == '__main__': | |
| 114 main() |
