Mercurial > repos > stevecassidy > nltktools
comparison g_pos.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
author | stevecassidy |
---|---|
date | Mon, 20 Nov 2017 22:52:11 -0500 |
parents | a47980ef2b96 |
children |
comparison
equal
deleted
inserted
replaced
2:a47980ef2b96 | 3:0df72a8ab095 |
---|---|
| | 1 from __future__ import print_function, unicode_literals |
1 import nltk | 2 import nltk |
2 import argparse | 3 import argparse |
| | 4 import io |
3 | 5 |
4 nltk.download('averaged_perceptron_tagger', quiet=True) | 6 nltk.download('averaged_perceptron_tagger', quiet=True) |
| | 7 nltk.download('punkt', quiet=True) |
5 | 8 |
6 | 9 |
7 def arguments(): | 10 def arguments(): |
8 parser = argparse.ArgumentParser(description="tokenize a text") | 11 parser = argparse.ArgumentParser(description="tokenize a text") |
9 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 12 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
13 | 16 |
14 def postag(in_file, out_file): | 17 def postag(in_file, out_file): |
15 """Input: a text file with one token per line | 18 """Input: a text file with one token per line |
16 Output: a version of the text with Part of Speech tags written as word/TAG | 19 Output: a version of the text with Part of Speech tags written as word/TAG |
17 """ | 20 """ |
18 with open(in_file, 'r') as fd: | 21 with open(in_file, 'rb') as fd: |
19 text = fd.read() | 22 text = fd.read() |
| | 23 text = text.decode('utf-8') |
20 | 24 |
21 sentences = nltk.sent_tokenize(text) | 25 sentences = nltk.sent_tokenize(text) |
22 | 26 |
23 with open(out_file, 'w') as output: | 27 with io.open(out_file, 'w') as output: |
24 for sentence in sentences: | 28 for sentence in sentences: |
25 tokens = nltk.word_tokenize(sentence) | 29 tokens = nltk.word_tokenize(sentence) |
26 postags = nltk.pos_tag(tokens) | 30 postags = nltk.pos_tag(tokens) |
27 for postag in postags: | 31 for postag in postags: |
28 # print postag | 32 # print postag |
29 output.write("%s/%s " % postag) | 33 p = "%s/%s " % postag |
| | 34 output.write(p) |
30 output.write('\n') | 35 output.write('\n') |
31 | 36 |
32 | 37 |
33 if __name__ == '__main__': | 38 if __name__ == '__main__': |
34 args = arguments() | 39 args = arguments() |