diff g_stemmer.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author stevecassidy
date Mon, 05 Dec 2016 05:22:05 -0500
parents e991d4e60c17
children a47980ef2b96
line wrap: on
line diff
--- a/g_stemmer.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_stemmer.py	Mon Dec 05 05:22:05 2016 -0500
@@ -14,17 +14,18 @@
     return args
 
 def stem_file(in_file, out_file, stemmer_type):
-    unsegmented = unicode(open(in_file, 'r').read(), errors='ignore')
-    output = open(out_file, 'w')
-    sentences = nltk.sent_tokenize(unsegmented)
-    stemmer = get_stemmer(stemmer_type)
-    for sentence in sentences:
-        words = nltk.word_tokenize(sentence)
-        for word in words:
-            stemmed_word = stemmer.stem(word)
-            output.write(stemmed_word)
-            output.write('\n')
-    output.close()
+    with open(in_file, 'r') as fd:
+        unsegmented = fd.read()
+
+    with open(out_file, 'w') as output:
+        sentences = nltk.sent_tokenize(unsegmented)
+        stemmer = get_stemmer(stemmer_type)
+        for sentence in sentences:
+            words = nltk.word_tokenize(sentence)
+            for word in words:
+                stemmed_word = stemmer.stem(word)
+                output.write(stemmed_word)
+                output.write('\n')
 
 def get_stemmer(stemmer_type):
     if stemmer_type == 'lancaster':