Mercurial > repos > stevecassidy > nltktools
comparison g_tokenize.py @ 1:fb617586f4b2 draft
planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author | stevecassidy |
---|---|
date | Mon, 05 Dec 2016 05:22:05 -0500 |
parents | e991d4e60c17 |
children | a47980ef2b96 |
comparison
equal
deleted
inserted
replaced
0:e991d4e60c17 | 1:fb617586f4b2 |
---|---|
16 table = string.maketrans("","") | 16 table = string.maketrans("","") |
17 return text.translate(table, string.punctuation) | 17 return text.translate(table, string.punctuation) |
18 | 18 |
19 | 19 |
def tokenize(in_file, out_file, lower=False, nopunct=False):
    """Tokenize a text file into word tokens using NLTK.

    Reads all of ``in_file``, optionally lowercases it and strips
    punctuation, splits it into sentences and then word tokens, and
    writes one token per line to ``out_file``.

    :param in_file: path to the input text file
    :param out_file: path to the output file (one token per line)
    :param lower: if True, lowercase the text before tokenizing
    :param nopunct: if True, remove punctuation via strip_punct()
        (NOTE(review): strip_punct still uses Python-2-only
        string.maketrans/translate — confirm it was ported too)
    """
    # Decode as UTF-8 and skip undecodable bytes.  This restores the
    # intent of the removed Python 2 line
    # ``text = unicode(text, errors='ignore')`` instead of leaving it
    # as dead commented-out code.
    with open(in_file, 'r', encoding='utf-8', errors='ignore') as fd:
        text = fd.read()

    if lower:
        text = text.lower()
    if nopunct:
        text = strip_punct(text)

    # Write tokens as they are produced; no need to accumulate the
    # whole token list in memory first as the original did.
    with open(out_file, 'w', encoding='utf-8') as output:
        # write one token per line
        for sentence in nltk.sent_tokenize(text):
            for token in nltk.word_tokenize(sentence):
                output.write(token + "\n")
38 | 39 |
39 | 40 |
if __name__ == '__main__':
    # Command-line entry point: parse the options, then run the tokenizer
    # with the requested normalisation flags.
    cli = arguments()
    tokenize(cli.input, cli.output, lower=cli.lower, nopunct=cli.nopunct)