Mercurial > repos > stevecassidy > nltktools
comparison g_frequency.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
| author | stevecassidy |
|---|---|
| date | Mon, 20 Nov 2017 22:52:11 -0500 |
| parents | a47980ef2b96 |
| children | |
comparison
equal
deleted
inserted
replaced
| 2:a47980ef2b96 | 3:0df72a8ab095 |
|---|---|
| 10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
| 11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") | 11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") |
| 12 return parser.parse_args() | 12 return parser.parse_args() |
| 13 | 13 |
| 14 | 14 |
| 15 def frequency(in_file, out_file): | 15 def frequency(textfiles, out_file): |
| 16 """Input: a text file | 16 """Input: a text file |
| 17 Output: a table of word frequency with three columns for Word, Count and Percent frequency | 17 Output: a table of word frequency with three columns for Word, Count and Percent frequency |
| 18 """ | 18 """ |
| 19 with open(in_file, 'r') as fd: | |
| 20 text = fd.read() | |
| 21 | 19 |
| 22 words = nltk.word_tokenize(text) | 20 words = [] |
| | 21 for textfile in textfiles: |
| | 22 with open(textfile, 'r') as fd: |
| | 23 text = fd.read() |
| | 24 |
| | 25 words.extend(nltk.word_tokenize(text)) |
| | 26 |
| 23 fdist = FreqDist(words) | 27 fdist = FreqDist(words) |
| 24 total = float(fdist.N()) | 28 total = float(fdist.N()) |
| 25 | 29 |
| 26 with open(out_file, 'w') as output: | 30 with open(out_file, 'w') as output: |
| 27 output.write("Word\tCount\tPercent\n") | 31 output.write("Word\tCount\tPercent\n") |
| 28 for pair in fdist.items(): | 32 for pair in sorted(fdist.items()): |
| 29 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) | 33 output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total)) |
| 30 | 34 |
| 31 | 35 |
| 32 if __name__ == '__main__': | 36 if __name__ == '__main__': |
| 33 args = arguments() | 37 args = arguments() |
| 34 frequency(args.input, args.output) | 38 textfiles = args.input.split(',') |
| | 39 frequency(textfiles, args.output) |
