comparison g_stemmer.py @ 2:a47980ef2b96 draft

planemo upload for repository https://github.com/Alveo/alveo-galaxy-tools commit b5b26e9118f2ad8af109d606746b39a5588f0511-dirty
author stevecassidy
date Wed, 01 Nov 2017 01:19:55 -0400
parents fb617586f4b2
children
comparison
equal deleted inserted replaced
1:fb617586f4b2 2:a47980ef2b96
1 import sys
2 import os
3 import nltk 1 import nltk
4 from nltk.stem import * 2 from nltk.stem import PorterStemmer, LancasterStemmer, snowball
5 import argparse 3 import argparse
6 4
7 5
def arguments(argv=None):
    """Parse the command-line options of the stemmer script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which
            matches the behaviour of the original zero-argument call.

    Returns:
        argparse.Namespace: carries the ``input``, ``output`` and
        ``stemmer`` attributes. ``stemmer`` is ``None`` when the option is
        not given.
    """
    # The description and the --stemmer help used to be copy-pasted from
    # another tool ("Segments the text input into separate sentences" /
    # "output file path"); they now describe what this script really does.
    parser = argparse.ArgumentParser(
        description="Stems the words of the text input")
    parser.add_argument('--input', required=True, help="input text file")
    parser.add_argument('--output', required=True, help="output file path")
    parser.add_argument(
        '--stemmer', required=False,
        help="stemmer to use: 'lancaster' or 'porter'; any other value "
             "(or omitting the option) selects the Snowball English stemmer")
    return parser.parse_args(argv)
13
15 14
16 def stem_file(in_file, out_file, stemmer_type): 15 def stem_file(in_file, out_file, stemmer_type):
17 with open(in_file, 'r') as fd: 16 with open(in_file, 'r') as fd:
18 unsegmented = fd.read() 17 unsegmented = fd.read()
19 18
25 for word in words: 24 for word in words:
26 stemmed_word = stemmer.stem(word) 25 stemmed_word = stemmer.stem(word)
27 output.write(stemmed_word) 26 output.write(stemmed_word)
28 output.write('\n') 27 output.write('\n')
29 28
29
def get_stemmer(stemmer_type):
    """Return the NLTK stemmer instance selected by *stemmer_type*.

    ``'lancaster'`` yields a ``LancasterStemmer`` and ``'porter'`` a
    ``PorterStemmer``; every other value (including ``None``) falls back
    to ``snowball.EnglishStemmer``.
    """
    if stemmer_type == 'lancaster':
        return LancasterStemmer()
    if stemmer_type == 'porter':
        return PorterStemmer()
    # Anything unrecognised uses the Snowball English stemmer.
    return snowball.EnglishStemmer()
38 38
39
if __name__ == '__main__':
    # Script entry point: read the command line, then stem the input file
    # into the output file with the requested stemmer.
    args = arguments()
    stem_file(in_file=args.input,
              out_file=args.output,
              stemmer_type=args.stemmer)