# HG changeset patch
# User stevecassidy
# Date 1476325073 14400
# Node ID e991d4e60c17697cd5ce38e404024408844f4c1c
planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
diff -r 000000000000 -r e991d4e60c17 colloc.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/colloc.dat Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+will be
+that will
+. The
+is to
+of the
+and the
diff -r 000000000000 -r e991d4e60c17 g_chart_parser.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+import sys
+import nltk
+import argparse
+from nltk.corpus import PlaintextCorpusReader
+
def arguments():
    """Parse the command line: --input, --grammar and --output paths."""
    arg_parser = argparse.ArgumentParser(description="run NER on a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--grammar', required=True, action="store", type=str, help="grammar file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def chart_parse(in_file, grammar_file, out_file):
    """Chart-parse every sentence of in_file with the CFG in grammar_file.

    Writes one pretty-printed parse tree per sentence to out_file.  On any
    parsing error a diagnostic is written to stderr and the process exits.
    """
    # errors='ignore' drops undecodable bytes instead of raising, matching
    # the original unicode(..., errors='ignore') behaviour.
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    with open(grammar_file, 'r', errors='ignore') as fh:
        grammar_string = fh.read()
    try:
        # nltk.parse_cfg was removed in NLTK 3; CFG.fromstring is the replacement.
        grammar = nltk.CFG.fromstring(grammar_string)
        parser = nltk.ChartParser(grammar)
        sentences = nltk.sent_tokenize(text)
        with open(out_file, 'w') as output:
            for sentence in sentences:
                words = nltk.word_tokenize(sentence)
                # ChartParser.parse() yields trees in NLTK 3; keep the
                # original one-tree-per-sentence output by taking the first.
                tree = next(iter(parser.parse(words)))
                output.write(tree.pprint())
                output.write('\n')
    except Exception as e:
        message = "Error with parsing. Check the input files are correct and the grammar contains every word in the input sequence. \n----\n" + str(e)
        sys.stderr.write(message)
        sys.exit()
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the parser.
    cli = arguments()
    chart_parse(cli.input, cli.grammar, cli.output)
+
+
diff -r 000000000000 -r e991d4e60c17 g_chart_parser.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,28 @@
+
+ Parse the sentence using Chart Parser and a supplied grammar
+
+ nltk
+
+
+
+ g_chart_parser.py --input $input1 --grammar $grammar --output $tab_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_collocation.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.collocations import *
+import argparse
+
def Parser():
    """Build and evaluate the command line for the collocation tool."""
    p = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
    p.add_argument('--input', required=True, action="store", type=str, help="input text file")
    p.add_argument('--output', required=True, action="store", type=str, help="output file path")
    p.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus")
    p.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results")
    p.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
    p.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
    return p.parse_args()
+
def collocation(inp, outp, freq_filter, results, coll_type, pos):
    """Find the top collocations in the text file inp and write them to outp.

    freq_filter -- minimum occurrence count (string, converted to int)
    results     -- maximum number of collocations to report (string)
    coll_type   -- 'bigram' for bigrams, anything else gives trigrams
    pos         -- 'true' if the input is word/TAG pairs from the POS tool
    """
    pos = (pos == 'true')
    with open(inp, 'r', errors='ignore') as fh:
        i = fh.read()
    all_words = []
    if pos:
        # Input lines look like "word/TAG "; strip the tag off each token.
        text = i.split(' ')[:-1]
        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
        all_words = [x.strip(' ').strip('\n') for x in all_words]
    else:
        for sent in nltk.sent_tokenize(i):
            all_words += nltk.word_tokenize(sent)
    if coll_type == 'bigram':
        measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(all_words)
    else:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(all_words)
    finder.apply_freq_filter(int(freq_filter))
    colls = finder.nbest(measures.pmi, int(results))
    with open(outp, 'w') as o:
        for coll in colls:
            # BUG FIX: the original "%s\t%s" % coll raised TypeError for
            # trigrams (a 3-tuple against 2 format specifiers); join works
            # for any n-gram length.
            o.write('\t'.join(coll))
            o.write('\n')
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the collocation finder.
    cli = Parser()
    collocation(cli.input, cli.output, cli.freq_filter, cli.results, cli.coll_type, cli.pos)
diff -r 000000000000 -r e991d4e60c17 g_collocation.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+
+ Generates a list of the most frequent collocations from an input sequence
+
+ nltk
+
+
+
+ g_collocation.py --input $input1 --output $tab_file --freq_filter $freq_filter --results $results --coll_type $collocation_type --pos $pos
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_frequency.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,30 @@
+import nltk
+from nltk import FreqDist
+import argparse
+
def arguments():
    """Parse the command line: --input and --output paths."""
    arg_parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def frequency(in_file, out_file):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count and Percent frequency
    """
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    words = nltk.word_tokenize(text)
    freq = FreqDist(words)
    total = float(freq.N())
    with open(out_file, 'w') as output:
        output.write("Word\tCount\tPercent\n")
        # most_common() guarantees descending-count order; plain .items()
        # order is undefined in NLTK 3 (FreqDist is a Counter subclass).
        for word, count in freq.most_common():
            output.write("{0}\t{1}\t{2:.2f}\n".format(word, count, 100 * count / total))
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and build the frequency table.
    cli = arguments()
    frequency(cli.input, cli.output)
diff -r 000000000000 -r e991d4e60c17 g_frequency.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+
+ Takes a text input and generates a frequency list
+
+
+ nltk
+
+
+
+ g_frequency.py --input $input1 --output $frequency_table
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Generate a frequency list from a text ordered by word frequency.
+
+
diff -r 000000000000 -r e991d4e60c17 g_pos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+import nltk
+import argparse
+import json
+
def arguments():
    """Parse the command line: --input and --output paths."""
    arg_parser = argparse.ArgumentParser(description="tokenize a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
+
def postag(in_file, out_file):
    """Input: a text file with one token per line
    Output: a version of the text with Part of Speech tags written as word/TAG
    """
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    sentences = nltk.sent_tokenize(text)
    with open(out_file, 'w') as output:
        for sentence in sentences:
            tokens = nltk.word_tokenize(sentence)
            # pos_tag returns (word, tag) pairs; one sentence per output line
            for tagged in nltk.pos_tag(tokens):
                output.write("%s/%s " % tagged)
            output.write('\n')
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the tagger.
    cli = arguments()
    postag(cli.input, cli.output)
+
+
\ No newline at end of file
diff -r 000000000000 -r e991d4e60c17 g_pos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,35 @@
+
+ Part of Speech tagging
+
+
+ nltk
+
+
+
+ g_pos.py --input $input1 --output $postags
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Apply a Part of Speech (POS) tagger to a list of sentences.
+
+
diff -r 000000000000 -r e991d4e60c17 g_read_sents.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.corpus import PlaintextCorpusReader
+import argparse
+
def Parser():
    """Build and evaluate the command line for the sentence splitter."""
    p = argparse.ArgumentParser(description="Segments the text input into separate sentences")
    p.add_argument('--input', required=True, action="store", type=str, help="input text file")
    p.add_argument('--output', required=True, action="store", type=str, help="output file path")
    return p.parse_args()
+
def print_out(outp, text, sentences):
    """Write the span of *text* covered by each tokenised sentence to the
    file *outp*, one sentence per line.

    Each sentence span runs from the first occurrence of its first token
    (searching from the end of the previous sentence) to the end of the
    matching occurrence of its last token.
    """
    out = open(outp, 'w')
    position = 0
    for tokens in sentences:
        last = tokens[-1]
        # how many tokens of this sentence contain the final token as a
        # substring -- that occurrence count locates the sentence's end
        occurrences = count_occurences(tokens, last)
        position = text.find(tokens[0], position)
        stop = find_nth(text, last, occurrences, position) + len(last)
        out.write(text[position:stop] + '\n')
        position = stop
    out.close()
+
def find_nth(string, sub, n, offset):
    """Return the index of the n-th occurrence of *sub* in *string* at or
    after *offset*, or -1 if there are fewer than n occurrences."""
    idx = string.find(sub, offset)
    remaining = n
    while remaining > 1 and idx >= 0:
        idx = string.find(sub, idx + len(sub))
        remaining -= 1
    return idx
+
def count_occurences(lst, string):
    """Count how many items of *lst* contain *string* as a substring."""
    return sum(1 for item in lst if string in item)
+
def read_sents(inp, outp):
    """Segment the text file *inp* into sentences (via NLTK's
    PlaintextCorpusReader) and write them one per line to *outp*."""
    # use a context manager so the input handle is closed promptly
    with open(inp, 'r') as fh:
        raw = fh.read()
    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
    print_out(outp, raw, corpus.sents())
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the sentence splitter.
    cli = Parser()
    read_sents(cli.input, cli.output)
diff -r 000000000000 -r e991d4e60c17 g_read_sents.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,31 @@
+
+ Segments the text input into separate sentences
+
+
+ nltk
+
+
+
+ g_read_sents.py --input $input1 --output $tab_file
+
+\
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_stemmer.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,40 @@
+import sys
+import os
+import nltk
+from nltk.stem import *
+import argparse
+
+
def arguments():
    """Parse the command line: --input, --output and optional --stemmer."""
    arg_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    arg_parser.add_argument('--stemmer', required=False, action="store", type=str, help="output file path")
    return arg_parser.parse_args()
+
def stem_file(in_file, out_file, stemmer_type):
    """Tokenise *in_file* and write the stem of every word to *out_file*,
    one stem per line, using the algorithm named by *stemmer_type*
    ('lancaster', 'porter', or English Snowball by default)."""
    with open(in_file, 'r', errors='ignore') as fh:
        unsegmented = fh.read()
    sentences = nltk.sent_tokenize(unsegmented)
    stemmer = get_stemmer(stemmer_type)
    with open(out_file, 'w') as output:
        for sentence in sentences:
            for word in nltk.word_tokenize(sentence):
                output.write(stemmer.stem(word))
                output.write('\n')
+
def get_stemmer(stemmer_type):
    """Return the NLTK stemmer instance for *stemmer_type*: 'lancaster',
    'porter', or anything else for the English Snowball stemmer."""
    if stemmer_type == 'lancaster':
        return LancasterStemmer()
    if stemmer_type == 'porter':
        return PorterStemmer()
    return snowball.EnglishStemmer()
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the stemmer.
    cli = arguments()
    stem_file(cli.input, cli.output, cli.stemmer)
diff -r 000000000000 -r e991d4e60c17 g_stemmer.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,36 @@
+
+ Takes a list of tokens and generates a list of word stems using one of the stemming algorithms
+
+
+ nltk
+
+
+
+ g_stemmer.py --input $input1 --output $tab_file --stemmer $stemming_algorithm
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 g_tokenize.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.py Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,42 @@
+import nltk
+import string
+import argparse
+
def arguments():
    """Parse the command line: --input/--output paths plus the optional
    --lower and --nopunct flags."""
    arg_parser = argparse.ArgumentParser(description="tokenize a text")
    arg_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
    arg_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
    arg_parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
    arg_parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
    return arg_parser.parse_args()
+
+
def strip_punct(text):
    """Return *text* with all ASCII punctuation characters removed."""
    # BUG FIX: the old two-argument string.maketrans("","") plus
    # str.translate(table, deletechars) form is Python-2-only and raises
    # TypeError on Python 3.  str.maketrans's third argument maps each
    # punctuation character to None (deletion).
    return text.translate(str.maketrans('', '', string.punctuation))
+
+
def tokenize(in_file, out_file, lower=False, nopunct=False):
    """Tokenise *in_file* and write one token per line to *out_file*.

    lower   -- lowercase the whole text before tokenising
    nopunct -- strip punctuation characters before tokenising
    """
    # errors='ignore' drops undecodable bytes, matching the original
    # unicode(..., errors='ignore') behaviour
    with open(in_file, 'r', errors='ignore') as fh:
        text = fh.read()
    if lower:
        text = text.lower()
    if nopunct:
        text = strip_punct(text)
    with open(out_file, 'w') as output:
        for sentence in nltk.sent_tokenize(text):
            for token in nltk.word_tokenize(sentence):
                output.write(token + "\n")
+
+
if __name__ == '__main__':
    # Entry point: read CLI options and run the tokenizer.
    cli = arguments()
    tokenize(cli.input, cli.output, lower=cli.lower, nopunct=cli.nopunct)
diff -r 000000000000 -r e991d4e60c17 g_tokenize.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,51 @@
+
+ Split a text into words
+
+
+ nltk
+
+
+
+ g_tokenize.py --input $input1 --output $tokens $lower $nopunct
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Tokenize a text into separate words, optionally remove punctuation and convert to lower case.
+
+
+
diff -r 000000000000 -r e991d4e60c17 test-data/dependency_resolvers_config.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dependency_resolvers_config.xml Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_frequency.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_frequency.dat Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,294 @@
+Word Count Percent
+the 44 6.32
+of 26 3.74
+and 25 3.59
+. 24 3.45
+to 23 3.30
+a 15 2.16
+, 12 1.72
+for 12 1.72
+will 12 1.72
+is 11 1.58
+DADA 9 1.29
+some 8 1.15
+( 7 1.01
+be 7 1.01
+on 7 1.01
+that 7 1.01
+this 7 1.01
+Australian 7 1.01
+) 7 1.01
+The 7 1.01
+text 6 0.86
+project 6 0.86
+we 6 0.86
+infrastructure 6 0.86
+from 6 0.86
+have 6 0.86
+in 6 0.86
+video 5 0.72
+language 5 0.72
+data 5 0.72
+it 5 0.72
+collection 5 0.72
+annotation 5 0.72
+Corpus 4 0.57
+with 4 0.57
+build 4 0.57
+audio 4 0.57
+hope 3 0.43
+collections 3 0.43
+resources 3 0.43
+funding 3 0.43
+available 3 0.43
+English 3 0.43
+meta-data 3 0.43
+Macquarie 3 0.43
+done 3 0.43
+two 3 0.43
+corpus 3 0.43
+part 3 0.43
+work 3 0.43
+up 3 0.43
+at 3 0.43
+- 3 0.43
+code 2 0.29
+people 2 0.29
+We 2 0.29
+but 2 0.29
+has 2 0.29
+them 2 0.29
+example 2 0.29
+words 2 0.29
+using 2 0.29
+now 2 0.29
+collect 2 0.29
+each 2 0.29
+corpora 2 0.29
+year 2 0.29
+server 2 0.29
+new 2 0.29
+public 2 0.29
+by 2 0.29
+search 2 0.29
+store 2 0.29
+involves 2 0.29
+within 2 0.29
+texts 2 0.29
+support 2 0.29
+Language 2 0.29
+sentences 2 0.29
+freely 2 0.29
+National 2 0.29
+funded 2 0.29
+site 2 0.29
+an 2 0.29
+as 2 0.29
+able 2 0.29
+make 2 0.29
+subjects 2 0.29
+speech 2 0.29
+development 2 0.29
+recording 2 0.29
+I 2 0.29
+significant 2 0.29
+task 2 0.29
+provide 2 0.29
+ARC 2 0.29
+demo 1 0.14
+automatically 1 0.14
+What 1 0.14
+Service 1 0.14
+being 1 0.14
+both 1 0.14
+soon 1 0.14
+existing 1 0.14
+large 1 0.14
+via 1 0.14
+looks 1 0.14
+Haugh 1 0.14
+still 1 0.14
+find 1 0.14
+alignment 1 0.14
+web 1 0.14
+Recently 1 0.14
+writing 1 0.14
+linguistics 1 0.14
+only 1 0.14
+going 1 0.14
+systems 1 0.14
+under 1 0.14
+Using 1 0.14
+2011 1 0.14
+take 1 0.14
+move 1 0.14
+around 1 0.14
+get 1 0.14
+read 1 0.14
+providing 1 0.14
+Michael 1 0.14
+number 1 0.14
+Project 1 0.14
+next 1 0.14
+While 1 0.14
+Oz 1 0.14
+communities 1 0.14
+comes 1 0.14
+projects 1 0.14
+articles 1 0.14
+like 1 0.14
+visible 1 0.14
+manual 1 0.14
+solution 1 0.14
+'ve 1 0.14
+capability 1 0.14
+these 1 0.14
+continue 1 0.14
+steps 1 0.14
+common 1 0.14
+small 1 0.14
+Speech 1 0.14
+fixed 1 0.14
+Griffith 1 0.14
+searching 1 0.14
+core 1 0.14
+doing 1 0.14
+Since 1 0.14
+idea 1 0.14
+All 1 0.14
+titles 1 0.14
+are 1 0.14
+picked 1 0.14
+Some 1 0.14
+network 1 0.14
+renamed 1 0.14
+managing 1 0.14
+sites 1 0.14
+publish 1 0.14
+research 1 0.14
+Later 1 0.14
+AusNC 1 0.14
+written 1 0.14
+between 1 0.14
+technology 1 0.14
+reading 1 0.14
+can 1 0.14
+recently 1 0.14
+repository 1 0.14
+partners 1 0.14
+This 1 0.14
+University 1 0.14
+hosted 1 0.14
+free 1 0.14
+box 1 0.14
+exposing 1 0.14
+technical 1 0.14
+study 1 0.14
+allows 1 0.14
+forced 1 0.14
+Sign 1 0.14
+published 1 0.14
+map 1 0.14
+MQ 1 0.14
+month 1 0.14
+interviews 1 0.14
+software 1 0.14
+already 1 0.14
+useful 1 0.14
+secure 1 0.14
+'black 1 0.14
+primary 1 0.14
+whatever 1 0.14
+Update 1 0.14
+1000 1 0.14
+parties 1 0.14
+loaded 1 0.14
+centralised 1 0.14
+Auslan 1 0.14
+1900 1 0.14
+size 1 0.14
+little 1 0.14
+Australia 1 0.14
+initial 1 0.14
+been 1 0.14
+Early 1 0.14
+their 1 0.14
+station 1 0.14
+down 1 0.14
+basic 1 0.14
+collected 1 0.14
+: 1 0.14
+Data 1 0.14
+ANDS 1 0.14
+more 1 0.14
+describe 1 0.14
+HCSNet 1 0.14
+denoting 1 0.14
+interviewed 1 0.14
+Trevor 1 0.14
+bitbucket 1 0.14
+testing 1 0.14
+Johnston 1 0.14
+effort 1 0.14
+pilot 1 0.14
+upgrades 1 0.14
+main 1 0.14
+look 1 0.14
+developing 1 0.14
+reliable 1 0.14
+pace 1 0.14
+while 1 0.14
+technoogy 1 0.14
+install 1 0.14
+Our 1 0.14
+transcripts 1 0.14
+country 1 0.14
+descriptions 1 0.14
+due 1 0.14
+documentation 1 0.14
+allowed 1 0.14
+sample 1 0.14
+enable 1 0.14
+create 1 0.14
+demonstration 1 0.14
+Map 1 0.14
+speakers 1 0.14
+inside 1 0.14
+end 1 0.14
+sessions 1 0.14
+things 1 0.14
+permission 1 0.14
+feature 1 0.14
+who 1 0.14
+started 1 0.14
+which 1 0.14
+digital 1 0.14
+many 1 0.14
+outside 1 0.14
+used 1 0.14
+'s 1 0.14
+separate 1 0.14
+collaboration 1 0.14
+after 1 0.14
+driver 1 0.14
+needs 1 0.14
+moment 1 0.14
+important 1 0.14
+designed 1 0.14
+tidying 1 0.14
+services 1 0.14
+elicit 1 0.14
+AusTalk 1 0.14
+expand 1 0.14
+stereo 1 0.14
+natural 1 0.14
+' 1 0.14
+third 1 0.14
+later 1 0.14
+game 1 0.14
+An 1 0.14
+As 1 0.14
+so 1 0.14
+Big 1 0.14
+allow 1 0.14
+sets 1 0.14
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_lower.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+.
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+the
+australian
+national
+corpus
+(
+ausnc
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+.
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+.
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+(
+ands
+)
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+.
+the
+big
+australian
+speech
+corpus
+,
+more
+recently
+renamed
+austalk
+,
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+australian
+english
+.
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+,
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+(
+it
+'s
+only
+visible
+inside
+mq
+)
+-
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+.
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_lower_nopunct.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower_nopunct.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,641 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+the
+australian
+national
+corpus
+ausnc
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+descriptions
+of
+each
+text
+and
+the
+annotation
+denoting
+things
+within
+the
+texts
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+look
+for
+this
+in
+texts
+written
+after
+1900
+and
+the
+annotation
+find
+this
+in
+the
+titles
+of
+articles
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+ands
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+the
+big
+australian
+speech
+corpus
+more
+recently
+renamed
+austalk
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+being
+interviewed
+and
+doing
+the
+map
+task
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+later
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+freely
+available
+research
+collection
+for
+australian
+english
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+it
+'s
+only
+visible
+inside
+mq
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
diff -r 000000000000 -r e991d4e60c17 test-data/sample_text_tok.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_tok.txt Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.