Mercurial > repos > stevecassidy > nltktools
comparison g_pos.py @ 3:0df72a8ab095 draft default tip
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit f2432aaedd36ae7662873623d8861d0982dffdd2
| author | stevecassidy |
|---|---|
| date | Mon, 20 Nov 2017 22:52:11 -0500 |
| parents | a47980ef2b96 |
| children | |
comparison
equal
deleted
inserted
replaced
| 2:a47980ef2b96 | 3:0df72a8ab095 |
|---|---|
| | 1 from __future__ import print_function, unicode_literals |
| 1 import nltk | 2 import nltk |
| 2 import argparse | 3 import argparse |
| | 4 import io |
| 3 | 5 |
| 4 nltk.download('averaged_perceptron_tagger', quiet=True) | 6 nltk.download('averaged_perceptron_tagger', quiet=True) |
| | 7 nltk.download('punkt', quiet=True) |
| 5 | 8 |
| 6 | 9 |
| 7 def arguments(): | 10 def arguments(): |
| 8 parser = argparse.ArgumentParser(description="tokenize a text") | 11 parser = argparse.ArgumentParser(description="tokenize a text") |
| 9 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 12 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
| 13 | 16 |
| 14 def postag(in_file, out_file): | 17 def postag(in_file, out_file): |
| 15 """Input: a text file with one token per line | 18 """Input: a text file with one token per line |
| 16 Output: a version of the text with Part of Speech tags written as word/TAG | 19 Output: a version of the text with Part of Speech tags written as word/TAG |
| 17 """ | 20 """ |
| 18 with open(in_file, 'r') as fd: | 21 with open(in_file, 'rb') as fd: |
| 19 text = fd.read() | 22 text = fd.read() |
| | 23 text = text.decode('utf-8') |
| 20 | 24 |
| 21 sentences = nltk.sent_tokenize(text) | 25 sentences = nltk.sent_tokenize(text) |
| 22 | 26 |
| 23 with open(out_file, 'w') as output: | 27 with io.open(out_file, 'w') as output: |
| 24 for sentence in sentences: | 28 for sentence in sentences: |
| 25 tokens = nltk.word_tokenize(sentence) | 29 tokens = nltk.word_tokenize(sentence) |
| 26 postags = nltk.pos_tag(tokens) | 30 postags = nltk.pos_tag(tokens) |
| 27 for postag in postags: | 31 for postag in postags: |
| 28 # print postag | 32 # print postag |
| 29 output.write("%s/%s " % postag) | 33 p = "%s/%s " % postag |
| | 34 output.write(p) |
| 30 output.write('\n') | 35 output.write('\n') |
| 31 | 36 |
| 32 | 37 |
| 33 if __name__ == '__main__': | 38 if __name__ == '__main__': |
| 34 args = arguments() | 39 args = arguments() |
