Mercurial > repos > stevecassidy > nltktools
diff g_pos.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author | stevecassidy |
---|---|
date | Mon, 20 Nov 2017 22:52:11 -0500 |
parents | a47980ef2b96 |
children |
line wrap: on
line diff
```diff
--- a/g_pos.py Wed Nov 01 01:19:55 2017 -0400
+++ b/g_pos.py Mon Nov 20 22:52:11 2017 -0500
@@ -1,7 +1,10 @@
+from __future__ import print_function, unicode_literals
 import nltk
 import argparse
+import io

 nltk.download('averaged_perceptron_tagger', quiet=True)
+nltk.download('punkt', quiet=True)


 def arguments():
@@ -15,18 +18,20 @@
     """Input: a text file with one token per line
     Output: a version of the text with Part of Speech tags written as word/TAG
     """
-    with open(in_file, 'r') as fd:
+    with open(in_file, 'rb') as fd:
         text = fd.read()
+        text = text.decode('utf-8')

     sentences = nltk.sent_tokenize(text)

-    with open(out_file, 'w') as output:
+    with io.open(out_file, 'w') as output:
         for sentence in sentences:
             tokens = nltk.word_tokenize(sentence)
             postags = nltk.pos_tag(tokens)
             for postag in postags:
                 # print postag
-                output.write("%s/%s " % postag)
+                p = "%s/%s " % postag
+                output.write(p)
             output.write('\n')


```