comparison g_frequency.py @ 3:0df72a8ab095 draft default tip

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author stevecassidy
date Mon, 20 Nov 2017 22:52:11 -0500
parents a47980ef2b96
children
comparison
equal deleted inserted replaced
2:a47980ef2b96 3:0df72a8ab095
10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") 10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") 11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
12 return parser.parse_args() 12 return parser.parse_args()
13 13
14 14
15 def frequency(in_file, out_file): 15 def frequency(textfiles, out_file):
16 """Input: a text file 16 """Input: a text file
17 Output: a table of word frequency with three columns for Word, Count and Percent frequency 17 Output: a table of word frequency with three columns for Word, Count and Percent frequency
18 """ 18 """
19 with open(in_file, 'r') as fd:
20 text = fd.read()
21 19
22 words = nltk.word_tokenize(text) 20 words = []
21 for textfile in textfiles:
22 with open(textfile, 'r') as fd:
23 text = fd.read()
24
25 words.extend(nltk.word_tokenize(text))
26
23 fdist = FreqDist(words) 27 fdist = FreqDist(words)
24 total = float(fdist.N()) 28 total = float(fdist.N())
25 29
26 with open(out_file, 'w') as output: 30 with open(out_file, 'w') as output:
27 output.write("Word\tCount\tPercent\n") 31 output.write("Word\tCount\tPercent\n")
28 for pair in fdist.items(): 32 for pair in sorted(fdist.items()):
29 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) 33 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))
30 34
31 35
32 if __name__ == '__main__': 36 if __name__ == '__main__':
33 args = arguments() 37 args = arguments()
34 frequency(args.input, args.output) 38 textfiles = args.input.split(',')
39 frequency(textfiles, args.output)