Mercurial > repos > stevecassidy > nltktools
comparison g_frequency.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author | stevecassidy |
---|---|
date | Mon, 20 Nov 2017 22:52:11 -0500 |
parents | a47980ef2b96 |
children |
comparison
equal
deleted
inserted
replaced
2:a47980ef2b96 | 3:0df72a8ab095 |
---|---|
10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") | 11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") |
12 return parser.parse_args() | 12 return parser.parse_args() |
13 | 13 |
14 | 14 |
15 def frequency(in_file, out_file): | 15 def frequency(textfiles, out_file): |
16 """Input: a text file | 16 """Input: a text file |
17 Output: a table of word frequency with three columns for Word, Count and Percent frequency | 17 Output: a table of word frequency with three columns for Word, Count and Percent frequency |
18 """ | 18 """ |
19 with open(in_file, 'r') as fd: | |
20 text = fd.read() | |
21 | 19 |
22 words = nltk.word_tokenize(text) | 20 words = [] |
| 21 for textfile in textfiles: |
| 22 with open(textfile, 'r') as fd: |
| 23 text = fd.read() |
| 24 |
| 25 words.extend(nltk.word_tokenize(text)) |
| 26 |
23 fdist = FreqDist(words) | 27 fdist = FreqDist(words) |
24 total = float(fdist.N()) | 28 total = float(fdist.N()) |
25 | 29 |
26 with open(out_file, 'w') as output: | 30 with open(out_file, 'w') as output: |
27 output.write("Word\tCount\tPercent\n") | 31 output.write("Word\tCount\tPercent\n") |
28 for pair in fdist.items(): | 32 for pair in sorted(fdist.items()): |
29 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) | 33 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) |
30 | 34 |
31 | 35 |
32 if __name__ == '__main__': | 36 if __name__ == '__main__': |
33 args = arguments() | 37 args = arguments() |
34 frequency(args.input, args.output) | 38 textfiles = args.input.split(',') |
| 39 frequency(textfiles, args.output) |