diff g_tokenize.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author stevecassidy
date Mon, 05 Dec 2016 05:22:05 -0500
parents e991d4e60c17
children a47980ef2b96
--- a/g_tokenize.py	Wed Oct 12 22:17:53 2016 -0400
+++ b/g_tokenize.py	Mon Dec 05 05:22:05 2016 -0500
@@ -18,23 +18,24 @@
 
 
 def tokenize(in_file, out_file, lower=False, nopunct=False):
-    text = open(in_file, 'r').read()
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
     if lower:
         text = text.lower()
     if nopunct:
         text = strip_punct(text)
     result = []
-    text = unicode(text, errors='ignore')
+    #text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
         result.append(tokens)
-    output = open(out_file, 'w')
-    # write one token per line
-    for sentence in result:
-        for token in sentence:
-            output.write(token + "\n")
-    output.close()
+    with open(out_file, 'w') as output:
+        # write one token per line
+        for sentence in result:
+            for token in sentence:
+                output.write(token + "\n")
 
 
 if __name__ == '__main__':
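
For context, below is a minimal, self-contained sketch of tokenize() as it reads after this commit. The change replaces bare open()/close() calls with context managers (so files are closed even on error) and comments out the Python 2 unicode() call, which suggests a move toward Python 3. strip_punct is defined elsewhere in g_tokenize.py and is not shown in this diff, so the version here is a hypothetical stand-in; NLTK's sentence tokenizer also needs the 'punkt' model (nltk.download('punkt')).

    import string
    import nltk

    def strip_punct(text):
        # Assumed behaviour only: drop ASCII punctuation characters.
        # The real strip_punct lives elsewhere in g_tokenize.py.
        return text.translate(str.maketrans('', '', string.punctuation))

    def tokenize(in_file, out_file, lower=False, nopunct=False):
        # Read the whole input; the context manager closes the file promptly.
        with open(in_file, 'r') as fd:
            text = fd.read()

        if lower:
            text = text.lower()
        if nopunct:
            text = strip_punct(text)

        # Split into sentences, then into word tokens.
        result = []
        for sentence in nltk.sent_tokenize(text):
            result.append(nltk.word_tokenize(sentence))

        # Write one token per line.
        with open(out_file, 'w') as output:
            for sentence in result:
                for token in sentence:
                    output.write(token + "\n")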