diff g_frequency.py @ 0:e991d4e60c17 draft

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author stevecassidy
date Wed, 12 Oct 2016 22:17:53 -0400
parents
children fb617586f4b2
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,30 @@
+import nltk
+from nltk import FreqDist
+import argparse
+
+def arguments():
+    """Parse and return the required --input and --output command-line options."""
+    cli = argparse.ArgumentParser(description="generate a word frequency table from a text")
+    cli.add_argument('--input', required=True, help="input text file")
+    cli.add_argument('--output', required=True, help="output file path")
+    return cli.parse_args()
+
+
+def frequency(in_file, out_file):
+    """Write a word frequency table for a text file.
+
+    The text is split into tokens with nltk.word_tokenize and every distinct
+    token is counted with FreqDist. The result is written as tab-separated
+    rows under a "Word", "Count", "Percent" header line, where Percent is the
+    token's share of all tokens, formatted with two decimals.
+
+    Args:
+        in_file: path of the text file to read. It is decoded as UTF-8;
+            bytes that cannot be decoded are skipped rather than raising.
+        out_file: path of the table to create (overwritten if it exists),
+            written as UTF-8.
+    """
+    # io.open behaves the same on Python 2 and 3 and always yields text, so
+    # no interpreter-specific unicode() conversion is needed.
+    import io
+
+    # Decode explicitly as UTF-8: relying on the default ASCII codec with
+    # errors='ignore' silently stripped every non-ASCII character from words.
+    with io.open(in_file, 'r', encoding='utf-8', errors='ignore') as source:
+        text = source.read()
+
+    counts = FreqDist(nltk.word_tokenize(text))
+    # Float total keeps the percentage a true division on Python 2 as well.
+    total = float(counts.N())
+
+    # The with-block closes the output even if a write fails part-way.
+    with io.open(out_file, 'w', encoding='utf-8') as output:
+        # Unicode literals: a text-mode io stream rejects byte strings on
+        # Python 2, and tokens may now contain non-ASCII characters.
+        output.write(u"Word\tCount\tPercent\n")
+        for word, count in counts.items():
+            output.write(u"{0}\t{1}\t{2:.2f}\n".format(word, count, 100 * count / total))
+
+
+if __name__ == '__main__':  # executed as a script: read the CLI options, then build the table
+    options = arguments()
+    frequency(in_file=options.input, out_file=options.output)