Mercurial > repos > stevecassidy > nltktools

diff g_pos.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author: stevecassidy
date: Mon, 20 Nov 2017 22:52:11 -0500
parents: a47980ef2b96
--- a/g_pos.py	Wed Nov 01 01:19:55 2017 -0400
+++ b/g_pos.py	Mon Nov 20 22:52:11 2017 -0500
@@ -1,7 +1,10 @@
+from __future__ import print_function, unicode_literals
 import nltk
 import argparse
+import io
 
 nltk.download('averaged_perceptron_tagger', quiet=True)
+nltk.download('punkt', quiet=True)
 
 
 def arguments():
@@ -15,18 +18,20 @@
     """Input: a text file with one token per line
     Output: a version of the text with Part of Speech tags written as word/TAG
     """
-    with open(in_file, 'r') as fd:
+    with open(in_file, 'rb') as fd:
         text = fd.read()
+        text = text.decode('utf-8')
 
     sentences = nltk.sent_tokenize(text)
 
-    with open(out_file, 'w') as output:
+    with io.open(out_file, 'w') as output:
         for sentence in sentences:
             tokens = nltk.word_tokenize(sentence)
             postags = nltk.pos_tag(tokens)
             for postag in postags:
                 # print postag
-                output.write("%s/%s " % postag)
+                p = "%s/%s " % postag
+                output.write(p)
         output.write('\n')