Mercurial > repos > stevecassidy > nltktools
diff g_stemmer.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
line wrap: on
line diff
--- a/g_stemmer.py Wed Oct 12 22:17:53 2016 -0400
+++ b/g_stemmer.py Mon Dec 05 05:22:05 2016 -0500
@@ -14,17 +14,18 @@
     return args
 
 def stem_file(in_file, out_file, stemmer_type):
-    unsegmented = unicode(open(in_file, 'r').read(), errors='ignore')
-    output = open(out_file, 'w')
-    sentences = nltk.sent_tokenize(unsegmented)
-    stemmer = get_stemmer(stemmer_type)
-    for sentence in sentences:
-        words = nltk.word_tokenize(sentence)
-        for word in words:
-            stemmed_word = stemmer.stem(word)
-            output.write(stemmed_word)
-            output.write('\n')
-    output.close()
+    with open(in_file, 'r') as fd:
+        unsegmented = fd.read()
+
+    with open(out_file, 'w') as output:
+        sentences = nltk.sent_tokenize(unsegmented)
+        stemmer = get_stemmer(stemmer_type)
+        for sentence in sentences:
+            words = nltk.word_tokenize(sentence)
+            for word in words:
+                stemmed_word = stemmer.stem(word)
+                output.write(stemmed_word)
+                output.write('\n')
 
 def get_stemmer(stemmer_type):
     if stemmer_type == 'lancaster':