Mercurial > repos > stevecassidy > nltktools

diff g_frequency.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author: stevecassidy
date: Mon, 20 Nov 2017 22:52:11 -0500
parents: a47980ef2b96
--- a/g_frequency.py	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_frequency.py	Mon Nov 20 22:52:11 2017 -0500
@@ -12,23 +12,28 @@
     return parser.parse_args()
 
 
-def frequency(in_file, out_file):
-    """Input: a text file
+def frequency(textfiles, out_file):
+    """Input: a list of text files, whose words are counted together
     Output: a table of word frequency with three columns for Word, Count and Percent frequency
     """
-    with open(in_file, 'r') as fd:
-        text = fd.read()
 
-    words = nltk.word_tokenize(text)
+    words = []
+    for textfile in textfiles:
+        with open(textfile, 'r') as fd:
+            text = fd.read()
+
+        words.extend(nltk.word_tokenize(text))
+
     fdist = FreqDist(words)
     total = float(fdist.N())
 
     with open(out_file, 'w') as output:
         output.write("Word\tCount\tPercent\n")
-        for pair in fdist.items():
+        for pair in sorted(fdist.items()):
             output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100 * pair[1] / total))
 
 
 if __name__ == '__main__':
     args = arguments()
-    frequency(args.input, args.output)
+    textfiles = args.input.split(',')
+    frequency(textfiles, args.output)