Comparison: g_tokenize.py @ 2:a47980ef2b96 (draft)
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy
---|---
date | Wed, 01 Nov 2017 01:19:55 -0400
parents | fb617586f4b2
children |
--- g_tokenize.py (1:fb617586f4b2)
+++ g_tokenize.py (2:a47980ef2b96)
 import nltk
 import string
 import argparse
 
+nltk.download('punkt', quiet=True)
+
+
 def arguments():
     parser = argparse.ArgumentParser(description="tokenize a text")
     parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
     parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
     parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
     parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
-    args = parser.parse_args()
-    return args
+    return parser.parse_args()
 
 
 def strip_punct(text):
-    table = string.maketrans("","")
+    table = text.maketrans("", "")
     return text.translate(table, string.punctuation)
 
 
 def tokenize(in_file, out_file, lower=False, nopunct=False):
     with open(in_file, 'r') as fd:
         text = fd.read()
 
     if lower:
         text = text.lower()
-    if nopunct:
-        text = strip_punct(text)
     result = []
-    #text = unicode(text, errors='ignore')
+    # text = unicode(text, errors='ignore')
     sentences = nltk.sent_tokenize(text)
     for sentence in sentences:
         tokens = nltk.word_tokenize(sentence)
+        if nopunct:
+            tokens = filter(lambda w: w not in string.punctuation, tokens)
         result.append(tokens)
+
     with open(out_file, 'w') as output:
         # write one token per line
         for sentence in result:
             for token in sentence:
                 output.write(token + "\n")
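One thing this revision leaves behind: `strip_punct` is no longer called anywhere (punctuation removal now happens token by token inside the sentence loop), and its body still relies on the Python 2 two-argument form of `str.translate`, which raises `TypeError` on Python 3. If the helper were kept, a minimal Python 3 sketch of the same behaviour might look like this (an illustration, not part of the changeset):

```python
import string


def strip_punct(text):
    # In Python 3, str.maketrans takes a third argument listing characters
    # to delete, and str.translate takes a single translation table.
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)
```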
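The other visible changes are the module-level `nltk.download('punkt', quiet=True)`, which quietly fetches the punkt sentence-tokenizer data on import if it is missing, and the per-token punctuation filter; the lazy `filter` object is iterated only once when the tokens are written out, so it is sufficient here. A hedged usage sketch, assuming the module is importable as `g_tokenize` and using placeholder file names:

```python
# Hypothetical example: the file names are placeholders, not from the changeset.
from g_tokenize import tokenize

# Lowercase the text, drop punctuation tokens, and write one token per line.
tokenize("input.txt", "tokens.txt", lower=True, nopunct=True)
```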