Mercurial > repos > stevecassidy > nltktools

diff g_frequency.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author: stevecassidy
date: Wed, 01 Nov 2017 01:19:55 -0400
parents: fb617586f4b2
children: 0df72a8ab095
--- a/g_frequency.py	Mon Dec 05 05:22:05 2016 -0500
+++ b/g_frequency.py	Wed Nov 01 01:19:55 2017 -0400
@@ -2,12 +2,14 @@
 from nltk import FreqDist
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def arguments():
-  parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
-  parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
-  parser.add_argument('--output', required=True,  action="store", type=str, help="output file path")
-  args = parser.parse_args()
-  return args
+    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    return parser.parse_args()
 
 
 def frequency(in_file, out_file):
@@ -18,13 +20,13 @@
         text = fd.read()
 
     words = nltk.word_tokenize(text)
-    frequency = FreqDist(words)
-    total = float(frequency.N())
-    
+    fdist = FreqDist(words)
+    total = float(fdist.N())
+
     with open(out_file, 'w') as output:
         output.write("Word\tCount\tPercent\n")
-        for pair in frequency.items():
-            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
+        for pair in fdist.items():
+            output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))
 
 
 if __name__ == '__main__':
author	stevecassidy
date	Wed, 01 Nov 2017 01:19:55 -0400
parents	fb617586f4b2
children	0df72a8ab095