Mercurial > repos > stevecassidy > nltktools
comparison g_stemmer.py @ 2:a47980ef2b96 draft
planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author | stevecassidy |
---|---|
date | Wed, 01 Nov 2017 01:19:55 -0400 |
parents | fb617586f4b2 |
children |
comparison
Legend: equal · deleted · inserted · replaced
1:fb617586f4b2 | 2:a47980ef2b96 |
---|---|
1 import sys | |
2 import os | |
3 import nltk | 1 import nltk |
4 from nltk.stem import * | 2 from nltk.stem import PorterStemmer, LancasterStemmer, snowball |
5 import argparse | 3 import argparse |
6 | 4 |
7 | 5 |
8 def arguments(): | 6 def arguments(): |
9 parser = argparse.ArgumentParser(description="Segments the text input into separate sentences") | 7 parser = argparse.ArgumentParser(description="Segments the text input into separate sentences") |
10 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") | 8 parser.add_argument('--input', required=True, action="store", type=str, help="input text file") |
11 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") | 9 parser.add_argument('--output', required=True, action="store", type=str, help="output file path") |
12 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path") | 10 parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path") |
13 args = parser.parse_args() | 11 args = parser.parse_args() |
14 return args | 12 return args |
13 | |
15 | 14 |
16 def stem_file(in_file, out_file, stemmer_type): | 15 def stem_file(in_file, out_file, stemmer_type): |
17 with open(in_file, 'r') as fd: | 16 with open(in_file, 'r') as fd: |
18 unsegmented = fd.read() | 17 unsegmented = fd.read() |
19 | 18 |
25 for word in words: | 24 for word in words: |
26 stemmed_word = stemmer.stem(word) | 25 stemmed_word = stemmer.stem(word) |
27 output.write(stemmed_word) | 26 output.write(stemmed_word) |
28 output.write('\n') | 27 output.write('\n') |
29 | 28 |
29 | |
30 def get_stemmer(stemmer_type): | 30 def get_stemmer(stemmer_type): |
31 if stemmer_type == 'lancaster': | 31 if stemmer_type == 'lancaster': |
32 stemmer = LancasterStemmer() | 32 stemmer = LancasterStemmer() |
33 elif stemmer_type == 'porter': | 33 elif stemmer_type == 'porter': |
34 stemmer = PorterStemmer() | 34 stemmer = PorterStemmer() |
35 else: | 35 else: |
36 stemmer = snowball.EnglishStemmer() | 36 stemmer = snowball.EnglishStemmer() |
37 return stemmer | 37 return stemmer |
38 | 38 |
39 | |
39 if __name__ == '__main__': | 40 if __name__ == '__main__': |
40 args = arguments() | 41 args = arguments() |
41 stem_file(args.input, args.output, args.stemmer) | 42 stem_file(args.input, args.output, args.stemmer) |