Mercurial > repos > stevecassidy > nltktools
comparison g_stemmer.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
comparison
equal
deleted
inserted
replaced
0:e991d4e60c17 | 1:fb617586f4b2 |
---|---|
12 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path") | 12 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path") |
13 args = parser.parse_args() | 13 args = parser.parse_args() |
14 return args | 14 return args |
15 | 15 |
16 def stem_file(in_file, out_file, stemmer_type): | 16 def stem_file(in_file, out_file, stemmer_type): |
17 unsegmented = unicode(open(in_file, 'r').read(), errors='ignore') | 17 with open(in_file, 'r') as fd: |
18 output = open(out_file, 'w') | 18 unsegmented = fd.read() |
19 sentences = nltk.sent_tokenize(unsegmented) | 19 |
20 stemmer = get_stemmer(stemmer_type) | 20 with open(out_file, 'w') as output: |
21 for sentence in sentences: | 21 sentences = nltk.sent_tokenize(unsegmented) |
22 words = nltk.word_tokenize(sentence) | 22 stemmer = get_stemmer(stemmer_type) |
23 for word in words: | 23 for sentence in sentences: |
24 stemmed_word = stemmer.stem(word) | 24 words = nltk.word_tokenize(sentence) |
25 output.write(stemmed_word) | 25 for word in words: |
26 output.write('\n') | 26 stemmed_word = stemmer.stem(word) |
27 output.close() | 27 output.write(stemmed_word) |
 | 28 output.write('\n') |
28 | 29 |
29 def get_stemmer(stemmer_type): | 30 def get_stemmer(stemmer_type): |
30 if stemmer_type == 'lancaster': | 31 if stemmer_type == 'lancaster': |
31 stemmer = LancasterStemmer() | 32 stemmer = LancasterStemmer() |
32 elif stemmer_type == 'porter': | 33 elif stemmer_type == 'porter': |