comparison g_tokenize.py @ 1:fb617586f4b2 draft

planemo upload commit a81826fe44f09a3710a35c183aa88b745aeec064-dirty
author:   stevecassidy
date:     Mon, 05 Dec 2016 05:22:05 -0500
parents:  e991d4e60c17
children: a47980ef2b96
diff -r e991d4e60c17 -r fb617586f4b2 g_tokenize.py
--- a/g_tokenize.py
+++ b/g_tokenize.py
@@ -16,27 +16,28 @@
     table = string.maketrans("","")
     return text.translate(table, string.punctuation)
 
 
 def tokenize(in_file, out_file, lower=False, nopunct=False):
-    text = open(in_file, 'r').read()
+    with open(in_file, 'r') as fd:
+        text = fd.read()
+
     if lower:
         text = text.lower()
     if nopunct:
         text = strip_punct(text)
     result = []
-    text = unicode(text, errors='ignore')
+    #text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
         result.append(tokens)
-    output = open(out_file, 'w')
-    # write one token per line
-    for sentence in result:
-        for token in sentence:
-            output.write(token + "\n")
-    output.close()
+    with open(out_file, 'w') as output:
+        # write one token per line
+        for sentence in result:
+            for token in sentence:
+                output.write(token + "\n")
 
 
 if __name__ == '__main__':
     args = arguments()
     tokenize(args.input, args.output, lower=args.lower, nopunct=args.nopunct)