changeset 0:e991d4e60c17 draft

planemo upload commit 0203cb3a0b40d9348674b2b098af805e2986abca-dirty
author stevecassidy
date Wed, 12 Oct 2016 22:17:53 -0400
parents
children fb617586f4b2
files colloc.dat g_chart_parser.py g_chart_parser.xml g_collocation.py g_collocation.xml g_frequency.py g_frequency.xml g_pos.py g_pos.xml g_read_sents.py g_read_sents.xml g_stemmer.py g_stemmer.xml g_tokenize.py g_tokenize.xml test-data/dependency_resolvers_config.xml test-data/sample_text.txt test-data/sample_text_frequency.dat test-data/sample_text_lower.txt test-data/sample_text_lower_nopunct.txt test-data/sample_text_tok.txt
diffstat 21 files changed, 3568 insertions(+), 0 deletions(-) [+]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/colloc.dat	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+will	be
+that	will
+.	The
+is	to
+of	the
+and	the
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+import sys
+import nltk
+import argparse
+from nltk.corpus import PlaintextCorpusReader
+
+def arguments():
+    parser = argparse.ArgumentParser(description="run NER on a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--grammar', required=True,  action="store", type=str, help="grammar file")
+    parser.add_argument('--output', required=True,  action="store", type=str, help="output file path")
+    args = parser.parse_args()
+    return args
+
+
+def chart_parse(in_file, grammar_file, out_file):
+    text = unicode(open(in_file, 'r').read(), errors='ignore')
+    output = open(out_file, 'w')
+    grammar_string = unicode(open(grammar_file, 'r').read(), errors='ignore')
+    try:
+        grammar = nltk.CFG.fromstring(grammar_string)
+        parser = nltk.ChartParser(grammar)
+        sentences = nltk.sent_tokenize(text)
+        for sentence in sentences:
+            words = nltk.word_tokenize(sentence)
+            # in NLTK 3, parse() returns an iterator over parse trees
+            for tree in parser.parse(words):
+                output.write(str(tree))
+                output.write('\n')
+    except Exception, e:
+        message = "Error with parsing. Check that the input files are correct and that the grammar covers every word in the input sequence.\n----\n" + str(e)
+        sys.stderr.write(message)
+        sys.exit(1)
+    output.close()
+
+if __name__ == '__main__':
+    args = arguments()
+    chart_parse(args.input, args.grammar, args.output)
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_chart_parser.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,28 @@
+<tool id="ChartParser" name="Chart Parser" version="1.0">
+    <description>Parse sentences using a chart parser and a supplied grammar</description>
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        g_chart_parser.py --input $input1 --grammar $grammar --output $tab_file
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt" label="Select a suitable input file from your history"/>
+        <param name="grammar" type="data" format="txt" label="Grammar file to use in parsing"/>
+        <param name="job_name" type="text" size="25" label="Supply a name for the outputs to remind you what they contain" value="Chart Parser"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="tab_file" label="${job_name}"/>
+    </outputs>
+    <options refresh="True"/>
+    <help>
+        <![CDATA[
+
+Input is a text and a suitable context-free grammar. The chart parser is run over each sentence using the grammar. Output is a bracketed parse tree for each sentence.
+
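+For example, a small grammar in the format accepted by NLTK (illustrative)::
+
+    S -> NP VP
+    NP -> Det N
+    VP -> V NP
+    Det -> 'the' | 'a'
+    N -> 'dog' | 'cat'
+    V -> 'saw'
+
+With this grammar the sentence "the dog saw a cat" yields the bracketed tree
+(S (NP (Det the) (N dog)) (VP (V saw) (NP (Det a) (N cat)))).
+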
+]]>
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.collocations import *
+import argparse
+
+def Parser():
+    the_parser = argparse.ArgumentParser(description="Parse the sentence using Chart Parser and a supplied grammar")
+    the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    the_parser.add_argument('--freq_filter', required=True, action="store", type=str, help="The minimum number of required occurrences in the corpus")
+    the_parser.add_argument('--results', required=True, action="store", type=str, help="The maximum number of collocations to show in the results")
+    the_parser.add_argument('--coll_type', required=True, action="store", type=str, help="Type of collocations to find")
+    the_parser.add_argument('--pos', required=True, action="store", type=str, help="Data input is a set of POS tags")
+
+    args = the_parser.parse_args()
+    return args
+
+def collocation(inp, outp, freq_filter, results, coll_type, pos):
+    pos = (pos == 'true')
+    i = unicode(open(inp, 'r').read(), errors='ignore')
+    o = open(outp, 'w')
+    all_words = []
+    if pos:
+        # input is the output of the POS tagging tool: word/TAG tokens separated by spaces
+        text = i.split(' ')[:-1]
+        all_words = [x[0:x.index('/')] if x != '\n' else x for x in text]
+        all_words = [x.strip(' ').strip('\n') for x in all_words]
+    else:
+        sents = nltk.sent_tokenize(i)
+        for sent in sents:
+            all_words += nltk.word_tokenize(sent)
+    if coll_type == 'bigram':
+        measures = nltk.collocations.BigramAssocMeasures()
+        finder = BigramCollocationFinder.from_words(all_words)
+    else:
+        measures = nltk.collocations.TrigramAssocMeasures()
+        finder = TrigramCollocationFinder.from_words(all_words)
+    finder.apply_freq_filter(int(freq_filter))
+    colls = finder.nbest(measures.pmi, int(results))
+    for coll in colls:
+        # join with tabs so both bigrams and trigrams are written correctly
+        o.write("\t".join(coll))
+        o.write('\n')
+    o.close()
+
+if __name__ == '__main__':
+    args = Parser()
+
+    collocation(args.input, args.output, args.freq_filter, args.results, args.coll_type, args.pos)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_collocation.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,38 @@
+<tool id="Collocation" name="Collocation" version="1.0">
+    <description>Generates a list of the highest-scoring collocations from an input text</description>
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        g_collocation.py --input $input1 --output $tab_file --freq_filter $freq_filter --results $results --coll_type $collocation_type --pos $pos
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt" label="Select a suitable input file from your history"/>
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Collocation"/>
+        <param name="freq_filter" type="integer" label="The minimum number of required occurrences in the corpus"
+               value="0"/>
+        <param name="results" type="integer" label="The maximum number of collocations to show in the results"
+               value="10"/>
+        <param name="collocation_type" type="select" label="" display="radio">
+            <option value="bigram">Bigram Collocations</option>
+            <option value="trigram">Trigram Collocations</option>
+        </param>
+        <param name="pos" type="boolean" label="Data input is a set of POS tags"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="tab_file" label="${job_name}"/>
+
+    </outputs>
+    <options refresh="True"/>
+    <help>
+        <![CDATA[
+
+Input is a text, optionally with POS tags (as produced by the Part of Speech Tagging tool). Output is the top N collocations ranked by pointwise mutual information (PMI), after discarding collocations that occur fewer times than the minimum frequency.
+
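+Each output line is a tab-separated word pair (or triple for trigrams), for example (cf. colloc.dat)::
+
+    will	be
+    of	the
+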
+]]>
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,30 @@
+import nltk
+from nltk import FreqDist
+import argparse
+
+def arguments():
+    parser = argparse.ArgumentParser(description="generate a word frequency table from a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True,  action="store", type=str, help="output file path")
+    args = parser.parse_args()
+    return args
+
+
+def frequency(in_file, out_file):
+    """Input: a text file
+    Output: a table of word frequency with three columns for Word, Count and Percent frequency
+    """
+    text = unicode(open(in_file, 'r').read(), errors='ignore')
+    words = nltk.word_tokenize(text)
+    frequency = FreqDist(words)
+    total = float(frequency.N())
+    output = open(out_file, 'w')
+    output.write("Word\tCount\tPercent\n")
+    # most_common() yields (word, count) pairs in descending frequency order
+    for pair in frequency.most_common():
+        output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
+    output.close()
+
+
+if __name__ == '__main__':
+    args = arguments()
+    frequency(args.input, args.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_frequency.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+<tool id="Frequency" name="Frequency List" version="1.0">
+    <description>Takes a text input and generates a frequency list</description>
+
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        g_frequency.py --input $input1 --output $frequency_table
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt"
+               label="Select a suitable input file from your history"/>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain"
+               value="Frequency List"/>
+    </inputs>
+    <outputs>
+        <data format="tabular" name="frequency_table" label="${job_name}"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name='input1' value='sample_text.txt'/>
+            <param name='job_name' value='testfrequency'/>
+            <output name='frequency_table' file='sample_text_frequency.dat'/>
+        </test>
+    </tests>
+    <help>
+        Generate a frequency list from a text ordered by word frequency.
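+
+        Example output (tab separated), matching the layout of sample_text_frequency.dat::
+
+            Word	Count	Percent
+            the	44	6.32
+            of	26	3.74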
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,34 @@
+import nltk
+import argparse
+
+def arguments():
+    parser = argparse.ArgumentParser(description="tokenize a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True,  action="store", type=str, help="output file path")
+    args = parser.parse_args()
+    return args
+  
+
+def postag(in_file, out_file):
+    """Input: a text file with one token per line
+    Output: a version of the text with Part of Speech tags written as word/TAG
+    """
+    text = unicode(open(in_file, 'r').read(), errors='ignore')
+    sentences = nltk.sent_tokenize(text)
+    output = open(out_file, 'w')
+    for sentence in sentences:
+        tokens = nltk.word_tokenize(sentence)
+        postags = nltk.pos_tag(tokens)
+        for postag in postags:
+            output.write("%s/%s " % postag)
+        # one sentence per line
+        output.write('\n')
+    output.close()
+
+
+if __name__ == '__main__':
+    args = arguments()
+    postag(args.input, args.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_pos.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,35 @@
+<tool id="POSTag" name="Part of Speech Tagging" version="1.0">
+    <description>Part of Speech tagging</description>
+
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+    
+    <command interpreter="python">
+        g_pos.py --input $input1 --output $postags
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt"
+               label="Select a suitable input file from your history"/>
+
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the output to remind you what they contain"
+               value="POS Tags"/>
+    </inputs>
+    <outputs>
+        <data format="json" name="postags" label="${job_name}"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <param name='input1' value='sample_text.txt'/>
+            <param name='job_name' value='testpos1'/>
+            <output name='postags' file='sample_text_pos.json'/>
+        </test>
+    </tests>
+
+    <help>
+        Apply a Part of Speech (POS) tagger to a text.
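+
+        Output is one sentence per line, each token written as word/TAG; for example (tags illustrative)::
+
+            The/DT project/NN is/VBZ developing/VBG software/NN ./.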
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,48 @@
+import sys
+import os
+import nltk
+from nltk.corpus import PlaintextCorpusReader
+import argparse
+
+def Parser():
+    the_parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
+    the_parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    the_parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+
+    args = the_parser.parse_args()
+    return args
+
+def print_out(outp, text, sentences):
+    """Write each sentence on its own line, preserving the original text spans."""
+    o = open(outp, 'w')
+    curr = 0
+    for sent in sentences:
+        # locate this sentence in the raw text: find its first token, then the
+        # nth occurrence of its final token, where n is the number of sentence
+        # tokens that contain the final token
+        times = count_occurrences(sent, sent[-1])
+        curr = text.find(sent[0], curr)
+        end = find_nth(text, sent[-1], times, curr) + len(sent[-1])
+        o.write(text[curr:end] + '\n')
+        curr = end
+    o.close()
+
+def find_nth(string, sub, n, offset):
+    """Return the index of the nth occurrence of sub in string, searching from offset."""
+    start = string.find(sub, offset)
+    while start >= 0 and n > 1:
+        start = string.find(sub, start + len(sub))
+        n -= 1
+    return start
+
+def count_occurrences(lst, string):
+    """Count how many items of lst contain the given string."""
+    count = 0
+    for item in lst:
+        if string in item:
+            count += 1
+    return count
+
+def read_sents(inp, outp):
+    i = open(inp, 'r').read()
+    corpus = PlaintextCorpusReader(os.path.dirname(inp), os.path.basename(inp))
+    sents = corpus.sents()
+    print_out(outp, i, sents)
+
+if __name__ == '__main__':
+    args = Parser()
+    read_sents(args.input, args.output)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_read_sents.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,31 @@
+<tool id="ReadSents" name="Sentence Segmenter" version="1.0">
+    <description>Segments the text input into separate sentences</description>
+
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+    
+    <command interpreter="python">
+        g_read_sents.py --input $input1 --output $tab_file
+    </command>
+\
+    <inputs>
+        <param name="input1" type="data" format="txt" label="Select a suitable input file from your history"/>
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Sentence Segmenter"/>
+
+    </inputs>
+    <outputs>
+        <data format="txt" name="tab_file" label="${job_name}"/>
+
+    </outputs>
+    <options refresh="True"/>
+    <help>
+        <![CDATA[
+
+Segment text input into sentences using the Punkt sentence segmenter. Output is one sentence per line.
+
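+For example, the input "The cat sat. The dog barked." is written out as::
+
+    The cat sat.
+    The dog barked.
+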
+]]>
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,40 @@
+import sys
+import os
+import nltk
+from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer
+import argparse
+
+
+def arguments():
+    parser = argparse.ArgumentParser(description="Segments the text input into separate sentences")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True, action="store", type=str, help="output file path")
+    parser.add_argument('--stemmer', required=False, action="store", type=str, help="stemming algorithm to use (lancaster, porter or snowball)")
+    args = parser.parse_args()
+    return args
+
+def stem_file(in_file, out_file, stemmer_type):
+    unsegmented = unicode(open(in_file, 'r').read(), errors='ignore')
+    output = open(out_file, 'w')
+    sentences = nltk.sent_tokenize(unsegmented)
+    stemmer = get_stemmer(stemmer_type)
+    for sentence in sentences:
+        words = nltk.word_tokenize(sentence)
+        for word in words:
+            stemmed_word = stemmer.stem(word)
+            output.write(stemmed_word)
+            output.write('\n')
+    output.close()
+
+def get_stemmer(stemmer_type):
+    if stemmer_type == 'lancaster':
+        stemmer = LancasterStemmer()
+    elif stemmer_type == 'porter':
+        stemmer = PorterStemmer()
+    else:
+        stemmer = SnowballStemmer('english')
+    return stemmer
+
+if __name__ == '__main__':
+    args = arguments()
+    stem_file(args.input, args.output, args.stemmer)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_stemmer.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,36 @@
+<tool id="Stemmer" name="Stemmer" version="1.0">
+    <description>Takes a text input and generates a list of word stems using one of the NLTK stemming algorithms</description>
+
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+    
+    <command interpreter="python">
+        g_stemmer.py --input $input1 --output $tab_file --stemmer $stemming_algorithm
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt" label="Select a suitable input file from your history"/>
+        <param name="stemming_algorithm" type="select" label="Select the stemming algorithm to be used">
+            <option value="lancaster">Lancaster Stemmer</option>
+            <option value="porter">Porter Stemmer</option>
+            <option value="snowball">Snowball English Stemmer</option>
+        </param>
+        <param name="job_name" type="text" size="25"
+               label="Supply a name for the outputs to remind you what they contain" value="Stemmer"/>
+
+    </inputs>
+    <outputs>
+        <data format="tabular" name="tab_file" label="${job_name}"/>
+
+    </outputs>
+    <options refresh="True"/>
+    <help>
+        <![CDATA[
+
+Input is a text. One of the NLTK stemming algorithms is used to generate a list of word stems, one per line.
+
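+For example, with the Porter stemmer the tokens "running", "collections" and "happily" stem to (illustrative)::
+
+    run
+    collect
+    happili
+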
+]]>
+
+    </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.py	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,42 @@
+import nltk
+import string
+import argparse
+
+def arguments():
+    parser = argparse.ArgumentParser(description="tokenize a text")
+    parser.add_argument('--input', required=True, action="store", type=str, help="input text file")
+    parser.add_argument('--output', required=True,  action="store", type=str, help="output file path")
+    parser.add_argument('--lower', required=False, action="store_true", help="lowercase all words")
+    parser.add_argument('--nopunct', required=False, action="store_true", help="remove all punctuation characters")
+    args = parser.parse_args()
+    return args
+
+
+def strip_punct(text):
+    table = string.maketrans("", "")
+    return text.translate(table, string.punctuation)
+
+
+def tokenize(in_file, out_file, lower=False, nopunct=False):
+    text = open(in_file, 'r').read()
+    if lower:
+        text = text.lower()
+    if nopunct:
+        text = strip_punct(text)
+    result = []
+    text = unicode(text, errors='ignore')
+    sentences = nltk.sent_tokenize(text)
+    for sentence in sentences:
+        tokens = nltk.word_tokenize(sentence)
+        result.append(tokens)
+    output = open(out_file, 'w')
+    # write one token per line
+    for sentence in result:
+        for token in sentence:
+            output.write(token + "\n")
+    output.close()
+
+
+if __name__ == '__main__':
+    args = arguments()
+    tokenize(args.input, args.output, lower=args.lower, nopunct=args.nopunct)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/g_tokenize.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,51 @@
+<tool id="Tokenize" name="Tokenize Text" version="1.0">
+    <description>Split a text into words</description>
+
+    <requirements>
+        <requirement type="package" version="3.2.1">nltk</requirement>
+    </requirements>
+
+    <command interpreter="python">
+        g_tokenize.py --input $input1 --output $tokens $lower $nopunct
+    </command>
+
+    <inputs>
+        <param name="input1" type="data" format="txt"
+               label="Select a suitable input file from your history"/>
+
+        <param name='lower' type="boolean" truevalue="--lower" falsevalue=""
+               label="Lowercase all tokens?"/>
+
+        <param name='nopunct' type="boolean" truevalue="--nopunct" falsevalue=""
+               label="Remove punctuation?"/>
+    </inputs>
+    <outputs>
+        <data format="txt" name="tokens" label="${input1.name} Tokens"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name='input1' value='sample_text.txt'/>
+            <param name='lower' value='--lower'/>
+            <param name='nopunct' value='--nopunct'/>
+            <output name='tokens' file='sample_text_lower_nopunct.txt'/>
+        </test>
+        <test>
+            <param name='input1' value='sample_text.txt'/>
+            <param name='lower' value='--lower'/>
+            <param name='nopunct' value=''/>
+            <output name='tokens' file='sample_text_lower.txt'/>
+        </test>
+        <test>
+            <param name='input1' value='sample_text.txt'/>
+            <param name='lower' value=''/>
+            <param name='nopunct' value=''/>
+            <output name='tokens' file='sample_text_tok.txt'/>
+        </test>
+
+    </tests>
+
+    <help>
+        Tokenize a text into separate words, optionally remove punctuation and convert to lower case.
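+
+        For example, "The DADA project." becomes one token per line::
+
+            The
+            DADA
+            project
+            .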
+    </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/dependency_resolvers_config.xml	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,6 @@
+<dependency_resolvers>
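+  <!-- resolvers are consulted in order until one satisfies a tool's requirement -->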
+  <tool_shed_packages />
+  <galaxy_packages />
+  <galaxy_packages versionless="true" />
+  <unlinked_tool_shed_packages />
+</dependency_resolvers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text.txt	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_frequency.dat	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,294 @@
+Word	Count	Percent
+the	44	6.32
+of	26	3.74
+and	25	3.59
+.	24	3.45
+to	23	3.30
+a	15	2.16
+,	12	1.72
+for	12	1.72
+will	12	1.72
+is	11	1.58
+DADA	9	1.29
+some	8	1.15
+(	7	1.01
+be	7	1.01
+on	7	1.01
+that	7	1.01
+this	7	1.01
+Australian	7	1.01
+)	7	1.01
+The	7	1.01
+text	6	0.86
+project	6	0.86
+we	6	0.86
+infrastructure	6	0.86
+from	6	0.86
+have	6	0.86
+in	6	0.86
+video	5	0.72
+language	5	0.72
+data	5	0.72
+it	5	0.72
+collection	5	0.72
+annotation	5	0.72
+Corpus	4	0.57
+with	4	0.57
+build	4	0.57
+audio	4	0.57
+hope	3	0.43
+collections	3	0.43
+resources	3	0.43
+funding	3	0.43
+available	3	0.43
+English	3	0.43
+meta-data	3	0.43
+Macquarie	3	0.43
+done	3	0.43
+two	3	0.43
+corpus	3	0.43
+part	3	0.43
+work	3	0.43
+up	3	0.43
+at	3	0.43
+-	3	0.43
+code	2	0.29
+people	2	0.29
+We	2	0.29
+but	2	0.29
+has	2	0.29
+them	2	0.29
+example	2	0.29
+words	2	0.29
+using	2	0.29
+now	2	0.29
+collect	2	0.29
+each	2	0.29
+corpora	2	0.29
+year	2	0.29
+server	2	0.29
+new	2	0.29
+public	2	0.29
+by	2	0.29
+search	2	0.29
+store	2	0.29
+involves	2	0.29
+within	2	0.29
+texts	2	0.29
+support	2	0.29
+Language	2	0.29
+sentences	2	0.29
+freely	2	0.29
+National	2	0.29
+funded	2	0.29
+site	2	0.29
+an	2	0.29
+as	2	0.29
+able	2	0.29
+make	2	0.29
+subjects	2	0.29
+speech	2	0.29
+development	2	0.29
+recording	2	0.29
+I	2	0.29
+significant	2	0.29
+task	2	0.29
+provide	2	0.29
+ARC	2	0.29
+demo	1	0.14
+automatically	1	0.14
+What	1	0.14
+Service	1	0.14
+being	1	0.14
+both	1	0.14
+soon	1	0.14
+existing	1	0.14
+large	1	0.14
+via	1	0.14
+looks	1	0.14
+Haugh	1	0.14
+still	1	0.14
+find	1	0.14
+alignment	1	0.14
+web	1	0.14
+Recently	1	0.14
+writing	1	0.14
+linguistics	1	0.14
+only	1	0.14
+going	1	0.14
+systems	1	0.14
+under	1	0.14
+Using	1	0.14
+2011	1	0.14
+take	1	0.14
+move	1	0.14
+around	1	0.14
+get	1	0.14
+read	1	0.14
+providing	1	0.14
+Michael	1	0.14
+number	1	0.14
+Project	1	0.14
+next	1	0.14
+While	1	0.14
+Oz	1	0.14
+communities	1	0.14
+comes	1	0.14
+projects	1	0.14
+articles	1	0.14
+like	1	0.14
+visible	1	0.14
+manual	1	0.14
+solution	1	0.14
+'ve	1	0.14
+capability	1	0.14
+these	1	0.14
+continue	1	0.14
+steps	1	0.14
+common	1	0.14
+small	1	0.14
+Speech	1	0.14
+fixed	1	0.14
+Griffith	1	0.14
+searching	1	0.14
+core	1	0.14
+doing	1	0.14
+Since	1	0.14
+idea	1	0.14
+All	1	0.14
+titles	1	0.14
+are	1	0.14
+picked	1	0.14
+Some	1	0.14
+network	1	0.14
+renamed	1	0.14
+managing	1	0.14
+sites	1	0.14
+publish	1	0.14
+research	1	0.14
+Later	1	0.14
+AusNC	1	0.14
+written	1	0.14
+between	1	0.14
+technology	1	0.14
+reading	1	0.14
+can	1	0.14
+recently	1	0.14
+repository	1	0.14
+partners	1	0.14
+This	1	0.14
+University	1	0.14
+hosted	1	0.14
+free	1	0.14
+box	1	0.14
+exposing	1	0.14
+technical	1	0.14
+study	1	0.14
+allows	1	0.14
+forced	1	0.14
+Sign	1	0.14
+published	1	0.14
+map	1	0.14
+MQ	1	0.14
+month	1	0.14
+interviews	1	0.14
+software	1	0.14
+already	1	0.14
+useful	1	0.14
+secure	1	0.14
+'black	1	0.14
+primary	1	0.14
+whatever	1	0.14
+Update	1	0.14
+1000	1	0.14
+parties	1	0.14
+loaded	1	0.14
+centralised	1	0.14
+Auslan	1	0.14
+1900	1	0.14
+size	1	0.14
+little	1	0.14
+Australia	1	0.14
+initial	1	0.14
+been	1	0.14
+Early	1	0.14
+their	1	0.14
+station	1	0.14
+down	1	0.14
+basic	1	0.14
+collected	1	0.14
+:	1	0.14
+Data	1	0.14
+ANDS	1	0.14
+more	1	0.14
+describe	1	0.14
+HCSNet	1	0.14
+denoting	1	0.14
+interviewed	1	0.14
+Trevor	1	0.14
+bitbucket	1	0.14
+testing	1	0.14
+Johnston	1	0.14
+effort	1	0.14
+pilot	1	0.14
+upgrades	1	0.14
+main	1	0.14
+look	1	0.14
+developing	1	0.14
+reliable	1	0.14
+pace	1	0.14
+while	1	0.14
+technoogy	1	0.14
+install	1	0.14
+Our	1	0.14
+transcripts	1	0.14
+country	1	0.14
+descriptions	1	0.14
+due	1	0.14
+documentation	1	0.14
+allowed	1	0.14
+sample	1	0.14
+enable	1	0.14
+create	1	0.14
+demonstration	1	0.14
+Map	1	0.14
+speakers	1	0.14
+inside	1	0.14
+end	1	0.14
+sessions	1	0.14
+things	1	0.14
+permission	1	0.14
+feature	1	0.14
+who	1	0.14
+started	1	0.14
+which	1	0.14
+digital	1	0.14
+many	1	0.14
+outside	1	0.14
+used	1	0.14
+'s	1	0.14
+separate	1	0.14
+collaboration	1	0.14
+after	1	0.14
+driver	1	0.14
+needs	1	0.14
+moment	1	0.14
+important	1	0.14
+designed	1	0.14
+tidying	1	0.14
+services	1	0.14
+elicit	1	0.14
+AusTalk	1	0.14
+expand	1	0.14
+stereo	1	0.14
+natural	1	0.14
+'	1	0.14
+third	1	0.14
+later	1	0.14
+game	1	0.14
+An	1	0.14
+As	1	0.14
+so	1	0.14
+Big	1	0.14
+allow	1	0.14
+sets	1	0.14
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower.txt	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+.
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+the
+australian
+national
+corpus
+(
+ausnc
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+.
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+.
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+(
+ands
+)
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+.
+the
+big
+australian
+speech
+corpus
+,
+more
+recently
+renamed
+austalk
+,
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+australian
+english
+.
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+,
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+(
+it
+'s
+only
+visible
+inside
+mq
+)
+-
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+.
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_lower_nopunct.txt	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,641 @@
+dada
+project
+update
+the
+dada
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+the
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+arc
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+macquarie
+university
+for
+some
+work
+on
+the
+auslan
+corpus
+of
+australian
+sign
+language
+collected
+by
+trevor
+johnston
+recently
+we
+have
+two
+projects
+which
+dada
+will
+be
+part
+of
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+the
+australian
+national
+corpus
+ausnc
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+australia
+the
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+using
+some
+funding
+from
+hcsnet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+the
+australian
+corpus
+of
+english
+and
+the
+corpus
+of
+oz
+early
+english
+we
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+what
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+descriptions
+of
+each
+text
+and
+the
+annotation
+denoting
+things
+within
+the
+texts
+while
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+look
+for
+this
+in
+texts
+written
+after
+1900
+and
+the
+annotation
+find
+this
+in
+the
+titles
+of
+articles
+this
+project
+is
+funded
+by
+the
+australian
+national
+data
+service
+ands
+and
+is
+a
+collaboration
+with
+michael
+haugh
+at
+griffith
+the
+big
+australian
+speech
+corpus
+more
+recently
+renamed
+austalk
+is
+an
+arc
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+the
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+being
+interviewed
+and
+doing
+the
+map
+task
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+dada
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+an
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+later
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+all
+of
+this
+will
+be
+published
+via
+the
+dada
+server
+infrastructure
+to
+create
+a
+large
+freely
+available
+research
+collection
+for
+australian
+english
+since
+the
+development
+of
+dada
+now
+involves
+people
+outside
+macquarie
+i
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+as
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+the
+public
+dada
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+macquarie
+it
+'s
+only
+visible
+inside
+mq
+i
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+dada
+we
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+audio
+and
+video
+corpora
+hosted
+on
+dada
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sample_text_tok.txt	Wed Oct 12 22:17:53 2016 -0400
@@ -0,0 +1,696 @@
+DADA
+Project
+Update
+The
+DADA
+project
+is
+developing
+software
+for
+managing
+language
+resources
+and
+exposing
+them
+on
+the
+web
+.
+Language
+resources
+are
+digital
+collections
+of
+language
+as
+audio
+,
+video
+and
+text
+used
+to
+study
+language
+and
+build
+technoogy
+systems
+.
+The
+project
+has
+been
+going
+for
+a
+while
+with
+some
+initial
+funding
+from
+the
+ARC
+to
+build
+the
+basic
+infrastructure
+and
+later
+from
+Macquarie
+University
+for
+some
+work
+on
+the
+Auslan
+corpus
+of
+Australian
+Sign
+Language
+collected
+by
+Trevor
+Johnston
+.
+Recently
+we
+have
+two
+projects
+which
+DADA
+will
+be
+part
+of
+,
+and
+so
+the
+pace
+of
+development
+has
+picked
+up
+a
+little
+.
+The
+Australian
+National
+Corpus
+(
+AusNC
+)
+is
+an
+effort
+to
+build
+a
+centralised
+collection
+of
+resources
+of
+language
+in
+Australia
+.
+The
+core
+idea
+is
+to
+take
+whatever
+existing
+collections
+we
+can
+get
+permission
+to
+publish
+and
+make
+them
+available
+under
+a
+common
+technical
+infrastructure
+.
+Using
+some
+funding
+from
+HCSNet
+we
+build
+a
+small
+demonstration
+site
+that
+allowed
+free
+text
+search
+on
+two
+collections
+:
+the
+Australian
+Corpus
+of
+English
+and
+the
+Corpus
+of
+Oz
+Early
+English
+.
+We
+now
+have
+some
+funding
+to
+continue
+this
+work
+and
+expand
+both
+the
+size
+of
+the
+collection
+and
+the
+capability
+of
+the
+infrastructure
+that
+will
+support
+it
+.
+What
+we
+'ve
+already
+done
+is
+to
+separate
+the
+text
+in
+these
+corpora
+from
+their
+meta-data
+(
+descriptions
+of
+each
+text
+)
+and
+the
+annotation
+(
+denoting
+things
+within
+the
+texts
+)
+.
+While
+the
+pilot
+allows
+searching
+on
+the
+text
+the
+next
+steps
+will
+allow
+search
+using
+the
+meta-data
+(
+look
+for
+this
+in
+texts
+written
+after
+1900
+)
+and
+the
+annotation
+(
+find
+this
+in
+the
+titles
+of
+articles
+)
+.
+This
+project
+is
+funded
+by
+the
+Australian
+National
+Data
+Service
+(
+ANDS
+)
+and
+is
+a
+collaboration
+with
+Michael
+Haugh
+at
+Griffith
+.
+The
+Big
+Australian
+Speech
+Corpus
+,
+more
+recently
+renamed
+AusTalk
+,
+is
+an
+ARC
+funded
+project
+to
+collect
+speech
+and
+video
+from
+1000
+Australian
+speakers
+for
+a
+new
+freely
+available
+corpus
+.
+The
+project
+involves
+many
+partners
+around
+the
+country
+each
+of
+who
+will
+have
+a
+'black
+box
+'
+recording
+station
+to
+collect
+audio
+and
+stereo
+video
+of
+subjects
+reading
+words
+and
+sentences
+,
+being
+interviewed
+and
+doing
+the
+Map
+task
+-
+a
+game
+designed
+to
+elicit
+natural
+speech
+between
+two
+people
+.
+Our
+part
+of
+the
+project
+is
+to
+provide
+the
+server
+infrastructure
+that
+will
+store
+the
+audio
+,
+video
+and
+annotation
+data
+that
+will
+make
+up
+the
+corpus
+.
+DADA
+will
+be
+part
+of
+this
+solution
+but
+the
+main
+driver
+is
+to
+be
+able
+to
+provide
+a
+secure
+and
+reliable
+store
+for
+the
+primary
+data
+as
+it
+comes
+in
+from
+the
+collection
+sites
+.
+An
+important
+feature
+of
+the
+collection
+is
+the
+meta-data
+that
+will
+describe
+the
+subjects
+in
+the
+recording
+.
+Some
+annotation
+of
+the
+data
+will
+be
+done
+automatically
+,
+for
+example
+some
+forced
+alignment
+of
+the
+read
+words
+and
+sentences
+.
+Later
+,
+we
+will
+move
+on
+to
+support
+manual
+annotation
+of
+some
+of
+the
+data
+-
+for
+example
+transcripts
+of
+the
+interviews
+and
+map
+task
+sessions
+.
+All
+of
+this
+will
+be
+published
+via
+the
+DADA
+server
+infrastructure
+to
+create
+a
+large
+,
+freely
+available
+research
+collection
+for
+Australian
+English
+.
+Since
+the
+development
+of
+DADA
+now
+involves
+people
+outside
+Macquarie
+,
+I
+have
+started
+using
+a
+public
+bitbucket
+repository
+for
+the
+code
+.
+As
+of
+this
+writing
+the
+code
+still
+needs
+some
+tidying
+and
+documentation
+to
+enable
+third
+parties
+to
+be
+able
+to
+install
+and
+work
+on
+it
+,
+but
+we
+hope
+to
+have
+that
+done
+within
+a
+month
+.
+The
+public
+DADA
+demo
+site
+is
+down
+at
+the
+moment
+due
+to
+network
+upgrades
+at
+Macquarie
+(
+it
+'s
+only
+visible
+inside
+MQ
+)
+-
+I
+hope
+to
+have
+that
+fixed
+soon
+with
+some
+new
+sample
+data
+sets
+loaded
+up
+for
+testing
+.
+2011
+looks
+like
+it
+will
+be
+a
+significant
+year
+for
+DADA
+.
+We
+hope
+to
+end
+this
+year
+with
+a
+number
+of
+significant
+text
+,
+audio
+and
+video
+corpora
+hosted
+on
+DADA
+infrastructure
+and
+providing
+useful
+services
+to
+the
+linguistics
+and
+language
+technology
+communities
+.