Replace MyThes thesaurus implementation with a custom one

Fixes a pile of unicode-related issues, such as dictionaries in a path which does not fit into the system's local charset, and significantly cuts down on the amount of code. Originally committed to SVN as r6250.
2012-01-08 01:36:50 +00:00 · 2012-01-08 01:36:50 +00:00 · 3c62a38c7a
commit 3c62a38c7a
parent 518f93f18f
18 changed files with 462 additions and 845 deletions
--- a/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj
+++ b/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj
@ -855,14 +855,6 @@
 				RelativePath="..\..\src\md5.h"
 				>
 			</File>
-			<File
-				RelativePath="..\..\src\mythes.cxx"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\mythes.hxx"
-				>
-			</File>
 			<File
 				RelativePath="..\..\src\pen.cpp"
 				>
@ -1423,14 +1415,6 @@
 				RelativePath="..\..\src\thesaurus.h"
 				>
 			</File>
-			<File
-				RelativePath="..\..\src\thesaurus_myspell.cpp"
-				>
-			</File>
-			<File
-				RelativePath="..\..\src\thesaurus_myspell.h"
-				>
-			</File>
 		</Filter>
 		<Filter
 			Name="Subtitle Formats"
--- a/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
+++ b/aegisub/build/libaegisub_vs2008/libaegisub_vs2008.vcproj
@ -311,6 +311,10 @@
 				RelativePath="..\..\libaegisub\common\path.cpp"
 				>
 			</File>
+			<File
+				RelativePath="..\..\libaegisub\common\thesaurus.cpp"
+				>
+			</File>
 			<File
 				RelativePath="..\..\libaegisub\common\util.cpp"
 				>
@ -477,6 +481,10 @@
 				RelativePath="..\..\libaegisub\include\libaegisub\signal.h"
 				>
 			</File>
+			<File
+				RelativePath="..\..\libaegisub\include\libaegisub\thesaurus.h"
+				>
+			</File>
 			<File
 				RelativePath="..\..\libaegisub\include\libaegisub\types.h"
 				>
--- a/aegisub/build/tests_vs2008/tests_vs2008.vcproj
+++ b/aegisub/build/tests_vs2008/tests_vs2008.vcproj
@ -334,6 +334,10 @@
 			RelativePath="..\..\tests\libaegisub_signals.cpp"
 			>
 		</File>
+		<File
+			RelativePath="..\..\tests\libaegisub_thesaurus.cpp"
+			>
+		</File>
 		<File
 			RelativePath="..\..\tests\libaegisub_util.cpp"
 			>
--- a/aegisub/libaegisub/Makefile
+++ b/aegisub/libaegisub/Makefile
@ -35,6 +35,7 @@ SRC = \
 	common/keyframe.cpp \
 	common/util.cpp \
 	common/log.cpp \
+	common/thesaurus.cpp \
 	common/validator.cpp \
 	common/vfr.cpp \
 	unix/util.cpp \
--- a/aegisub/libaegisub/common/thesaurus.cpp
+++ b/aegisub/libaegisub/common/thesaurus.cpp
@ -0,0 +1,97 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file thesaurus.cpp
+/// @brief MyThes-compatible thesaurus implementation
+/// @ingroup libaegisub thesaurus
+
+#include "libaegisub/thesaurus.h"
+
+#include "libaegisub/charset_conv.h"
+#include "libaegisub/io.h"
+#include "libaegisub/line_iterator.h"
+
+/// Split a string on a separator character, appending every non-empty field to a container
+/// @param str String to split
+/// @param sep Separator character
+/// @param[out] out Container the fields are appended to; it is not cleared first
+template<class String, class Char, class Container>
+static void split(String const& str, Char sep, Container *out) {
+	typedef typename String::size_type size_type;
+
+	out->reserve(2);
+
+	size_type start = 0;
+	for (size_type end = str.find(sep, start); end != String::npos; end = str.find(sep, start)) {
+		// A leading separator or two adjacent separators produce no empty field
+		if (end != start)
+			out->push_back(str.substr(start, end - start));
+		start = end + 1;
+	}
+
+	// Whatever follows the final separator, if anything
+	if (start < str.size())
+		out->push_back(str.substr(start));
+}
+
+namespace agi {
+
+/// Open the data file and build the word -> offset table from the index file
+/// @param dat_path Path to data file
+/// @param idx_path Path to index file
+Thesaurus::Thesaurus(std::string const& dat_path, std::string const& idx_path)
+: dat(io::Open(dat_path))
+{
+	scoped_ptr<std::ifstream> index_file(io::Open(idx_path));
+
+	// The index starts with two header lines: the name of the charset the
+	// files use, then an entry count which is not needed here
+	std::string charset_name, entry_count;
+	getline(*index_file, charset_name);
+	getline(*index_file, entry_count);
+
+	// Every remaining line pairs a word with its position in the data file
+	line_iterator<std::string> cur(*index_file, charset_name), last;
+	for (; cur != last; ++cur) {
+		std::vector<std::string> fields;
+		split(*cur, '|', &fields);
+		if (fields.size() != 2) continue;
+		offsets[fields[0]] = atoi(fields[1].c_str());
+	}
+
+	conv.reset(new charset::IconvWrapper(charset_name.c_str(), "utf-8"));
+}
+
+// Defined out of line: the header only forward-declares the types held by
+// the scoped_ptr members
+Thesaurus::~Thesaurus() {
+}
+
+/// Look up the synonyms for a word
+/// @param word Word to look up; it must match an index entry exactly
+/// @param[out] out Receives one entry per well-formed meaning line. Left
+///                 empty if the word is not in the index or the data file
+///                 cannot be read at the recorded offset.
+void Thesaurus::Lookup(std::string const& word, std::vector<Entry> *out) {
+	out->clear();
+
+	std::map<std::string, int>::const_iterator it = offsets.find(word);
+	if (!dat.get() || it == offsets.end()) return;
+
+	// A previous lookup may have run into the end of the file. Without
+	// resetting the state flags the stream would never be good() again and
+	// every later lookup would silently return nothing.
+	dat->clear();
+	dat->seekg(it->second, std::ios::beg);
+	if (!dat->good()) return;
+
+	// First line is the word and meaning count
+	std::string temp;
+	if (!getline(*dat, temp)) return;
+	std::vector<std::string> header;
+	split(conv->Convert(temp), '|', &header);
+	if (header.size() != 2) return;
+
+	// The count comes straight from the file, so it is only used as a loop
+	// bound and never to size the output up front
+	int meanings = atoi(header[1].c_str());
+	if (meanings <= 0) return;
+
+	for (int i = 0; i < meanings; ++i) {
+		// Stop at a truncated file rather than emitting empty meanings
+		if (!getline(*dat, temp)) break;
+
+		std::vector<std::string> line;
+		split(conv->Convert(temp), '|', &line);
+
+		// A meaning needs at least a part of speech and a word; skip
+		// anything shorter instead of indexing past the end of the vector
+		if (line.size() < 2) continue;
+
+		out->push_back(Entry());
+		Entry &entry = out->back();
+
+		// The "definition" is just the part of speech plus the word it's
+		// giving synonyms for (which may not be the passed word)
+		entry.first = line[0] + ' ' + line[1];
+		entry.second.assign(line.begin() + 2, line.end());
+	}
+}
+
+}
--- a/aegisub/libaegisub/include/libaegisub/thesaurus.h
+++ b/aegisub/libaegisub/include/libaegisub/thesaurus.h
@ -0,0 +1,58 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+/// @file thesaurus.h
+/// @brief MyThes-compatible thesaurus implementation
+/// @ingroup libaegisub thesaurus
+
+#include <libaegisub/scoped_ptr.h>
+
+#ifndef LAGI_PRE
+#include <iosfwd>
+#include <map>
+#include <string>
+#include <vector>
+#endif
+
+namespace agi {
+
+namespace charset { class IconvWrapper; }
+
+/// @class Thesaurus
+/// @brief Reads synonym lists from a MyThes-compatible index/data file pair
+class Thesaurus {
+	/// Map of word -> byte position in the data file
+	std::map<std::string, int> offsets;
+	/// Read handle to the data file
+	scoped_ptr<std::ifstream> dat;
+	/// Converter from the data file's charset to UTF-8
+	scoped_ptr<charset::IconvWrapper> conv;
+
+public:
+	/// A pair of a word and synonyms for that word
+	typedef std::pair<std::string, std::vector<std::string> > Entry;
+
+	/// Constructor
+	/// @param dat_path Path to data file
+	/// @param idx_path Path to index file
+	Thesaurus(std::string const& dat_path, std::string const& idx_path);
+	/// Destructor; only declared here, the definition lives in the implementation file
+	~Thesaurus();
+
+	/// Look up synonyms for a word
+	/// @param word Word to look up
+	/// @param[out] out Vector to fill with word/synonym lists; any existing contents are discarded
+	void Lookup(std::string const& word, std::vector<Entry> *out);
+};
+
+}
--- a/aegisub/src/Makefile
+++ b/aegisub/src/Makefile
@ -192,7 +192,6 @@ SRC += \
 	menu.cpp \
 	md5.c \
 	mkv_wrap.cpp \
-	mythes.cxx \
 	pen.cpp \
 	persist_location.cpp \
 	plugin_manager.cpp \
@ -221,7 +220,6 @@ SRC += \
 	text_file_reader.cpp \
 	text_file_writer.cpp \
 	thesaurus.cpp \
-	thesaurus_myspell.cpp \
 	timeedit_ctrl.cpp \
 	threaded_frame_source.cpp \
 	toggle_bitmap.cpp \
--- a/aegisub/src/dialog_about.cpp
+++ b/aegisub/src/dialog_about.cpp
@ -97,7 +97,6 @@ AboutScreen::AboutScreen(wxWindow *parent)
 #ifdef WITH_FREETYPE2
 	libString += "    Freetype - Copyright (c) David Turner, Robert Wilhelm, Werner Lemberg;\n";
 #endif
-	libString += "    MyThes - Copyright (c) Kevin B. Hendricks, Stratford, Ontario, Canada.\n";
 #ifdef WITH_FFTW3
 	libString += "    FFTW - Copyright (c) Matteo Frigo, Massachusetts Institute of Technology;\n";
 #endif
--- a/aegisub/src/mythes.cxx
+++ b/aegisub/src/mythes.cxx
@ -1,398 +0,0 @@
-/*
- * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada
- * And Contributors.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * 3. All modifications to the source code must be clearly marked as
- *    such.  Binary redistributions based on modified source code
- *    must be clearly marked as modified versions in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL 
- * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
- 
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <errno.h>
-#include "mythes.hxx"
-
-// some basic utility routines
-
-// string duplication routine
-char * mythes_mystrdup(const char * p)
-{
-   
-  int sl = strlen(p) + 1;
-  char * d = (char *)malloc(sl);
-  if (d) {
-    memcpy(d,p,sl);
-    return d;
-  }
-  return NULL;
-}
-
-// remove cross-platform text line end characters
-void mythes_mychomp(char * s)
-{
-  int k = strlen(s);
-  if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
-  if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
-}
-
-
-// return index of char in string
-int mystr_indexOfChar(const char * d, int c)
-{
-  const char * p = strchr(d,c);
-  if (p) return (int)(p-d);
-  return -1;
-}
-
-
-MyThes::MyThes(const char* idxpath, const char * datpath)
-{
-    nw = 0;
-    encoding = NULL;
-    list = NULL;
-    offst = NULL;
-
-    if (thInitialize(idxpath, datpath) != 1) {
-        fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
-        fflush(stderr);
-        if (encoding) free((void*)encoding);
-        if (list)  free((void*)list);
-        if (offst) free((void*)offst);
-        // did not initialize properly - throw exception?
-    }
-}
-
-
-MyThes::~MyThes()
-{
-    if (thCleanup() != 1) {
-        /* did not cleanup properly - throw exception? */
-    }
-    if (encoding) free((void*)encoding);
-    encoding = NULL;
-    list = NULL;
-    offst = NULL;
-}
-
-
-int MyThes::thInitialize(const char* idxpath, const char* datpath)
-{
-
-    // open the index file
-    FILE * pifile = fopen(idxpath,"r");
-    if (!pifile) {
-        pifile = NULL;
-        return 0;
-    } 
-
-    // parse in encoding and index size */    
-    char * wrd;
-    wrd = (char *)calloc(1, MAX_WD_LEN);
-    int len = readLine(pifile,wrd,MAX_WD_LEN);
-    encoding = mythes_mystrdup(wrd);
-    len = readLine(pifile,wrd,MAX_WD_LEN);
-    int idxsz = atoi(wrd); 
-    
-
-    // now allocate list, offst for the given size
-    list = (char**)   calloc(idxsz,sizeof(char*));
-    offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
-
-    if ( (!(list)) || (!(offst)) ) {
-       fprintf(stderr,"Error - bad memory allocation\n");
-       fflush(stderr);
-       return 0;
-    }
-
-    // now parse the remaining lines of the index
-    len = readLine(pifile,wrd,MAX_WD_LEN);
-    while (len > 0)
-    { 
-        int np = mystr_indexOfChar(wrd,'|');
-        if (nw < idxsz) {
-           if (np >= 0) {          
-              *(wrd+np) = '\0';
-              list[nw] = (char *)calloc(1,(np+1));
-              memcpy((list[nw]),wrd,np);
-              offst[nw] = atoi(wrd+np+1);
-              nw++;
-	   }
-        }
-        len = readLine(pifile,wrd,MAX_WD_LEN);
-    }
-
-    free((void *)wrd);
-    fclose(pifile);
-    pifile=NULL;
-
-    /* next open the data file */
-    pdfile = fopen(datpath,"r");
-    if (!pdfile) {
-        pdfile = NULL;
-        return 0;
-    } 
-        
-    return 1;        
-}
-
-
-int MyThes::thCleanup()
-{
-    /* first close the data file */
-    if (pdfile) {
-        fclose(pdfile);
-        pdfile=NULL;
-    }
-
-    /* now free up all the allocated strings on the list */
-    for (int i=0; i < nw; i++) 
-    {
-        if (list[i]) {
-            free(list[i]);
-            list[i] = 0;
-        }
-    }
-
-    if (list)  free((void*)list);
-    if (offst) free((void*)offst);
-
-    nw = 0;
-    return 1;
-}
-
-
-
-// lookup text in index and count of meanings and a list of meaning entries
-// with each entry having a synonym count and pointer to an 
-// array of char * (i.e the synonyms)
-// 
-// note: calling routine should call CleanUpAfterLookup with the original
-// meaning point and count to properly deallocate memory
-
-int MyThes::Lookup(const char * pText, int len, mentry** pme)
-{ 
-
-    *pme = NULL;
-
-    // handle the case of missing file or file related errors
-    if (! pdfile) return 0;
-
-    long offset = 0;
-
-    /* copy search word and make sure null terminated */
-    char * wrd = (char *) calloc(1,(len+1));
-    memcpy(wrd,pText,len);
-  
-    /* find it in the list */
-    int idx = binsearch(wrd,list,nw);
-    free(wrd);  
-    if (idx < 0) return 0;
-
-    // now seek to the offset
-    offset = (long) offst[idx];
-    int rc = fseek(pdfile,offset,SEEK_SET);
-    if (rc) {
-       return 0;
-    }
-
-    // grab the count of the number of meanings
-    // and allocate a list of meaning entries
-    char * buf = NULL;
-    buf  = (char *) malloc( MAX_LN_LEN );
-    if (!buf) return 0;
-    readLine(pdfile, buf, (MAX_LN_LEN-1));
-    int np = mystr_indexOfChar(buf,'|');
-    if (np < 0) {
-         free(buf);
-         return 0;
-    }          
-    int nmeanings = atoi(buf+np+1);
-    *pme = (mentry*) malloc( nmeanings * sizeof(mentry) );
-    if (!(*pme)) {
-        free(buf);
-        return 0;
-    }
-
-    // now read in each meaning and parse it to get defn, count and synonym lists
-    mentry* pm = *(pme);
-    char dfn[MAX_WD_LEN];
-
-    for (int j = 0; j < nmeanings; j++) {
-        readLine(pdfile, buf, (MAX_LN_LEN-1));
-
-        pm->count = 0;
-        pm->psyns = NULL;
-        pm->defn = NULL;
-
-        // store away the part of speech for later use
-        char * p = buf;
-        char * pos = NULL;
-        np = mystr_indexOfChar(p,'|');
-        if (np >= 0) {
-           *(buf+np) = '\0';
-	   pos = mythes_mystrdup(p);
-           p = p + np + 1;
-	} else {
-          pos = mythes_mystrdup("");
-        }
-        
-        // count the number of fields in the remaining line
-        int nf = 1;
-        char * d = p;
-        np = mystr_indexOfChar(d,'|');        
-        while ( np >= 0 ) {
-	  nf++;
-          d = d + np + 1;
-          np = mystr_indexOfChar(d,'|');          
-	}
-	pm->count = nf;
-        pm->psyns = (char **) malloc(nf*sizeof(char*)); 
-        
-        // fill in the synonym list
-        d = p;
-        for (int j = 0; j < nf; j++) {
-            np = mystr_indexOfChar(d,'|');
-            if (np > 0) {
-	      *(d+np) = '\0';
-              pm->psyns[j] = mythes_mystrdup(d);
-              d = d + np + 1;
-            } else {
-              pm->psyns[j] = mythes_mystrdup(d);
-	    }            
-        }
-
-        // add pos to first synonym to create the definition
-        int k = strlen(pos);
-        int m = strlen(pm->psyns[0]);
-        if ((k+m) < (MAX_WD_LEN - 1)) {
-             strncpy(dfn,pos,k);
-             *(dfn+k) = ' ';
-             strncpy((dfn+k+1),(pm->psyns[0]),m+1);
-             pm->defn = mythes_mystrdup(dfn);
-	} else {
-	     pm->defn = mythes_mystrdup(pm->psyns[0]);
-	}
-        free(pos);
-        pm++;
-
-    }
-    free(buf);
-   
-    return nmeanings;
-} 
-
-
-
-void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
-{ 
-
-    if (nmeanings == 0) return;
-    if ((*pme) == NULL) return;
-
-    mentry * pm = *pme;
-       
-    for (int i = 0; i < nmeanings; i++) {
-       int count = pm->count;
-       for (int j = 0; j < count; j++) {
-	  if (pm->psyns[j]) free(pm->psyns[j]);
-          pm->psyns[j] = NULL;
-       }
-       if (pm->psyns) free(pm->psyns);
-       pm->psyns = NULL;
-       if (pm->defn) free(pm->defn);
-       pm->defn = NULL;
-       pm->count = 0;
-       pm++;
-    }
-    pm = *pme;
-    free(pm);
-    *pme = NULL;
-    return;
-}
-
-
-// read a line of text from a text file stripping
-// off the line terminator and replacing it with
-// a null string terminator.
-// returns:  -1 on error or the number of characters in
-//             in the returning string
-
-// A maximum of nc characters will be returned
-
-int MyThes::readLine(FILE * pf, char * buf, int nc)
-{
-    
-  if (fgets(buf,nc,pf)) {
-    mythes_mychomp(buf);
-    return strlen(buf);
-  }
-  return -1;
-}
-
-
- 
-//  performs a binary search on null terminated character
-//  strings
-//
-//  returns: -1 on not found
-//           index of wrd in the list[]
-
-int MyThes::binsearch(char * sw, char* list[], int nlst) 
-{
-    int lp, up, mp, j, indx;
-    lp = 0;
-    up = nlst-1;
-    indx = -1;
-    if (nlst == 0) return -1;
-    if (strcmp(sw,list[lp]) < 0) return -1;
-    if (strcmp(sw,list[up]) > 0) return -1;
-    while (indx < 0 ) {
-        mp = (int)((lp+up) >> 1);
-        j = strcmp(sw,list[mp]);
-        if ( j > 0) {
-            lp = mp + 1;
-        } else if (j < 0 ) {
-            up = mp - 1;
-        } else {
-            indx = mp;
-        }
-        if (lp > up) return -1;      
-    }
-    return indx;
-}
-
-char * MyThes::get_th_encoding()
-{
-  if (encoding) return encoding;
-  return NULL;
-}
-
-
--- a/aegisub/src/mythes.hxx
+++ b/aegisub/src/mythes.hxx
@ -1,103 +0,0 @@
-/*
- * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada
- * And Contributors.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * 3. All modifications to the source code must be clearly marked as
- *    such.  Binary redistributions based on modified source code
- *    must be clearly marked as modified versions in the documentation
- *    and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS 
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL 
- * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- */
-
- 
- #ifndef _MYTHES_HXX_
-#define _MYTHES_HXX_
-
-// some maximum sizes for buffers
-#define MAX_WD_LEN 200
-#define MAX_LN_LEN 16384
-
-
-// a meaning with definition, count of synonyms and synonym list
-struct mentry {
-  char*  defn;
-  int  count;
-  char** psyns;
-};
-
-
-class MyThes
-{
-
-       int  nw;                  /* number of entries in thesaurus */
-       char**  list;               /* stores word list */
-       unsigned int* offst;              /* stores offset list */
-       char *  encoding;           /* stores text encoding; */
- 
-        FILE  *pdfile;
-
-	// disallow copy-constructor and assignment-operator for now
-	MyThes();
-	MyThes(const MyThes &);
-	MyThes & operator = (const MyThes &);
-
-public:
-	MyThes(const char* idxpath, const char* datpath);
-	~MyThes();
-
-        // lookup text in index and return number of meanings
-	// each meaning entry has a defintion, synonym count and pointer 
-        // when complete return the *original* meaning entry and count via 
-        // CleanUpAfterLookup to properly handle memory deallocation
-
-        int Lookup(const char * pText, int len, mentry** pme); 
-
-        void CleanUpAfterLookup(mentry** pme, int nmean);
-
-        char* get_th_encoding(); 
-
-private:
-        // Open index and dat files and load list array
-        int thInitialize (const char* indxpath, const char* datpath);
-        
-        // internal close and cleanup dat and idx files
-        int thCleanup ();
-
-        // read a text line (\n terminated) stripping off line terminator
-        int readLine(FILE * pf, char * buf, int nc);
-
-        // binary search on null terminated character strings
-        int binsearch(char * wrd, char* list[], int nlst);
-
-};
-
-#endif
-
-
-
-
-
--- a/aegisub/src/subs_edit_ctrl.cpp
+++ b/aegisub/src/subs_edit_ctrl.cpp
@ -79,7 +79,7 @@ enum {
 SubsTextEditCtrl::SubsTextEditCtrl(wxWindow* parent, wxSize wsize, long style, SubtitlesGrid *grid)
 : ScintillaTextCtrl(parent, -1, "", wxDefaultPosition, wsize, style)
 , spellchecker(SpellCheckerFactory::GetSpellChecker())
-, thesaurus(Thesaurus::GetThesaurus())
+, thesaurus(new Thesaurus)
 , grid(grid)
 {
 	// Set properties
@ -795,14 +795,15 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
 	// Thesaurus
 	if (thesaurus.get() && currentWord.Length()) {
 		// Get results
-		ThesaurusEntryArray result;
-		thesaurus->Lookup(currentWord,result);
+		std::vector<Thesaurus::Entry> result;
+		thesaurus->Lookup(currentWord,&result);

 		// Compile list
-		thesSugs.Clear();
-		for (unsigned int i=0;i<result.size();i++) {
-			for (unsigned int j=0;j<result[i].words.Count();j++) {
-				thesSugs.Add(result[i].words[j]);
+		thesSugs.clear();
+		thesSugs.reserve(result.size() * 5);
+		for (size_t i = 0; i < result.size(); ++i) {
+			for (size_t j = 0; j < result[i].second.size(); ++j) {
+				thesSugs.push_back(result[i].second[j]);
 			}
 		}

@ -815,10 +816,10 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {

 			// Build menu
 			int curThesEntry = 0;
-			for (unsigned int i=0;i<result.size();i++) {
+			for (size_t i=0;i<result.size();i++) {
 				// Single word, insert directly
-				if (result[i].words.Count() == 1) {
-					thesMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].name);
+				if (result[i].second.size() == 1) {
+					thesMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].first));
 					curThesEntry++;
 				}

@ -826,13 +827,13 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
 				else {
 					// Insert entries
 					wxMenu *subMenu = new wxMenu();
-					for (unsigned int j=0;j<result[i].words.Count();j++) {
-						subMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].words[j]);
+					for (size_t j=0;j<result[i].second.size();j++) {
+						subMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].second[j]));
 						curThesEntry++;
 					}

 					// Insert submenu
-					thesMenu->Append(-1, result[i].name, subMenu);
+					thesMenu->Append(-1, lagi_wxString(result[i].first), subMenu);
 				}
 			}

@ -911,7 +912,7 @@ void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) {
 	wxString suggestion;
 	int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS;
 	if (sugIdx >= 0) {
-		suggestion = thesSugs[sugIdx];
+		suggestion = lagi_wxString(thesSugs[sugIdx]);
 	}
 	else {
 		suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS];
@ -953,10 +954,9 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) {

 	// Set language
 	int index = event.GetId() - EDIT_MENU_THES_LANGS - 1;
-	if (index >= 0) {
-		thesaurus->SetLanguage(langs[index]);
-		OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(langs[index]));
-	}
+	wxString lang;
+	if (index >= 0) lang = langs[index];
+	OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(lang));

 	UpdateStyle();
 }
--- a/aegisub/src/subs_edit_ctrl.h
+++ b/aegisub/src/subs_edit_ctrl.h
@ -67,7 +67,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl {
 	wxArrayString sugs;

 	/// DOCME
-	wxArrayString thesSugs;
+	std::vector<std::string> thesSugs;

 	/// DOCME
 	int currentWordPos;
--- a/aegisub/src/thesaurus.cpp
+++ b/aegisub/src/thesaurus.cpp
@ -1,4 +1,4 @@
-// Copyright (c) 2006, Rodrigo Braz Monteiro
+// Copyright (c) 2011, Thomas Goyne <plorkyeran@aegisub.org>
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@ -30,29 +30,111 @@
 // $Id$

 /// @file thesaurus.cpp
-/// @brief Base-class for thesaurus implementations
+/// @brief Thesaurus implementation
 /// @ingroup thesaurus
 ///

-
-///////////
-// Headers
 #include "config.h"

-#include "thesaurus_myspell.h"
+#include "thesaurus.h"

+#ifndef AGI_PRE
+#include <wx/dir.h>
+#include <wx/filename.h>
+#endif

-/// @brief Get spell checker 
-///
-Thesaurus *Thesaurus::GetThesaurus() {
-	// Initialize
-	Thesaurus *thes = NULL;
+#include <libaegisub/log.h>
+#include <libaegisub/thesaurus.h>

-	// Get myspell
-	thes = new MySpellThesaurus();
+#include "compat.h"
+#include "main.h"
+#include "standard_paths.h"

-	// Return
-	return thes;
+/// Register OnLanguageChanged and OnPathChanged as handlers for the
+/// thesaurus language and dictionary path options, then load the thesaurus
+/// for the currently configured language
+Thesaurus::Thesaurus()
+: lang_listener(OPT_SUB("Tool/Thesaurus/Language", &Thesaurus::OnLanguageChanged, this))
+, dict_path_listener(OPT_SUB("Path/Dictionary", &Thesaurus::OnPathChanged, this))
+{
+	OnLanguageChanged();
 }

+Thesaurus::~Thesaurus() {
+	// Explicit empty destructor needed for scoped_ptr with incomplete types:
+	// the header only forward-declares agi::Thesaurus, so the destructor has
+	// to be defined here, where the full definition has been included
+}

+/// Look up synonyms for a word using the currently loaded thesaurus
+/// @param word Word to look up; it is lower-cased before the lookup
+/// @param[out] result Output list; emptied if no thesaurus is loaded
+void Thesaurus::Lookup(wxString const& word, std::vector<Entry> *result) {
+	// agi::Thesaurus::Lookup always empties the output first; do the same
+	// here so that stale entries never survive when no thesaurus is loaded
+	result->clear();
+	if (!impl.get()) return;
+	impl->Lookup(STD_STR(word.Lower()), result);
+}
+
+/// Get the codes of all languages which have both an index and a data file
+///
+/// Both the application's dictionary directory and the user-configured one
+/// are searched. A non-empty result is cached until the dictionary path
+/// option changes.
+wxArrayString Thesaurus::GetLanguageList() const {
+	if (!languages.empty()) return languages;
+
+	wxArrayString idx, dat;
+
+	// Get list of dictionaries
+	wxString path = StandardPaths::DecodePath("?data/dictionaries/");
+	if (wxFileName::DirExists(path)) {
+		wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES);
+		wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES);
+	}
+	path = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/");
+	if (wxFileName::DirExists(path)) {
+		wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES);
+		wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES);
+	}
+	if (idx.empty() || dat.empty()) return languages;
+
+	// GetAllFiles yields full paths, so only the four-character extension is
+	// dropped here; the th_ prefix is removed below once the file name has
+	// been separated from its directory. This happens before sorting so that
+	// both lists are ordered by exactly the strings the merge compares.
+	for (size_t i = 0; i < idx.size(); ++i) idx[i] = idx[i].Mid(0, idx[i].size() - 4);
+	for (size_t i = 0; i < dat.size(); ++i) dat[i] = dat[i].Mid(0, dat[i].size() - 4);
+
+	idx.Sort();
+	dat.Sort();
+
+	// Verify that each idx has a dat
+	for (size_t i = 0, j = 0; i < idx.size() && j < dat.size(); ) {
+		int cmp = idx[i].Cmp(dat[j]);
+		if (cmp < 0) ++i;
+		else if (cmp > 0) ++j;
+		else {
+			// Don't insert a language twice if it's in both the user dir and
+			// the app's dir. The lists are ordered by full path, so the two
+			// copies are not necessarily adjacent and the whole list has to
+			// be searched.
+			wxString name = wxFileName(dat[j]).GetName().Mid(3);
+			if (languages.Index(name) == wxNOT_FOUND)
+				languages.push_back(name);
+			++i;
+			++j;
+		}
+	}
+	return languages;
+}
+
+/// Drop the current thesaurus and load the one for the configured language,
+/// looking in the user dictionary directory first and the application's
+/// dictionary directory second. No thesaurus is loaded if the language
+/// option is empty or the index/data pair is in neither place.
+void Thesaurus::OnLanguageChanged() {
+	impl.reset();
+
+	std::string lang_code = OPT_GET("Tool/Thesaurus/Language")->GetString();
+	if (lang_code.empty()) return;
+
+	// Candidate files in the user dictionary directory
+	wxString dir = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/");
+	wxString idx_file = wxString::Format("%s/th_%s.idx", dir, lang_code);
+	wxString dat_file = wxString::Format("%s/th_%s.dat", dir, lang_code);
+
+	if (!(wxFileExists(idx_file) && wxFileExists(dat_file))) {
+		// Not both there, so fall back to the application directory
+		dir = StandardPaths::DecodePath("?data/dictionaries/");
+		idx_file = wxString::Format("%s/th_%s.idx", dir, lang_code);
+		dat_file = wxString::Format("%s/th_%s.dat", dir, lang_code);
+
+		if (!(wxFileExists(idx_file) && wxFileExists(dat_file))) return;
+	}
+
+	LOG_I("thesaurus/file") << "Using thesaurus: " << dat_file.c_str();
+
+	impl.reset(new agi::Thesaurus(STD_STR(dat_file), STD_STR(idx_file)));
+}
+
+/// Handler for changes to the dictionary path option
+void Thesaurus::OnPathChanged() {
+	// Discard the cached language list so that the next GetLanguageList()
+	// call scans the dictionary directories again
+	languages.clear();
+}
--- a/aegisub/src/thesaurus.h
+++ b/aegisub/src/thesaurus.h
@ -1,4 +1,4 @@
-// Copyright (c) 2006, Rodrigo Braz Monteiro
+// Copyright (c) 2011, Thomas Goyne <plorkyeran@aegisub.org>
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
@ -34,9 +34,6 @@
 /// @ingroup thesaurus
 ///

-
-///////////
-// Headers
 #ifndef AGI_PRE
 #include <vector>

@ -44,50 +41,40 @@
 #include <wx/string.h>
 #endif

+#include <libaegisub/scoped_ptr.h>
+#include <libaegisub/signal.h>

-/// DOCME
-/// @class ThesaurusEntry
-/// @brief DOCME
-///
-/// DOCME
-class ThesaurusEntry {
-public:
+namespace agi { class Thesaurus; }

-	/// DOCME
-	wxString name;
-
-	/// DOCME
-	wxArrayString words;
-};
-
-
-
-/// DOCME
-typedef std::vector<ThesaurusEntry> ThesaurusEntryArray;
-
-
-
-/// DOCME
 /// @class Thesaurus
-/// @brief DOCME
-///
-/// DOCME
+/// @brief A wrapper around agi::Thesaurus adding wx and Aegisub-specific stuff
 class Thesaurus {
+	/// The actual thesaurus implementation
+	agi::scoped_ptr<agi::Thesaurus> impl;
+	/// A cached list of languages available
+	mutable wxArrayString languages;
+
+	/// Thesaurus language change slot
+	agi::signal::Connection lang_listener;
+	/// Thesaurus language change handler
+	void OnLanguageChanged();
+
+	/// Thesaurus path change slot
+	agi::signal::Connection dict_path_listener;
+	/// Thesaurus path change handler
+	void OnPathChanged();
 public:
-	static Thesaurus *GetThesaurus();
+	/// A pair of a word and synonyms for that word
+	typedef std::pair<std::string, std::vector<std::string> > Entry;

+	Thesaurus();
+	~Thesaurus();

-	/// @brief DOCME
-	///
-	Thesaurus() {}
+	/// Get a list of synonyms for a word, grouped by possible meanings of the word
+	/// @param word Word to get synonyms for
+	/// @param[out] result Output list
+	void Lookup(wxString const& word, std::vector<Entry> *result);

-	/// @brief DOCME
-	///
-	virtual ~Thesaurus() {}
-
-	virtual void Lookup(wxString word,ThesaurusEntryArray &result)=0;
-	virtual wxArrayString GetLanguageList()=0;
-	virtual void SetLanguage(wxString language)=0;
+	/// Get a list of language codes which thesauri are available for
+	wxArrayString GetLanguageList() const;
 };
-
-
--- a/aegisub/src/thesaurus_myspell.cpp
+++ b/aegisub/src/thesaurus_myspell.cpp
@ -1,176 +0,0 @@
-// Copyright (c) 2006, Rodrigo Braz Monteiro
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//   * Redistributions of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//   * Redistributions in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//   * Neither the name of the Aegisub Group nor the names of its contributors
-//     may be used to endorse or promote products derived from this software
-//     without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Aegisub Project http://www.aegisub.org/
-//
-// $Id$
-
-/// @file thesaurus_myspell.cpp
-/// @brief MySpell-based thesaurus implementation
-/// @ingroup thesaurus
-///
-
-
-///////////
-// Headers
-#include "config.h"
-
-#ifndef AGI_PRE
-#include <wx/dir.h>
-#include <wx/filename.h>
-#include <wx/log.h>
-#endif
-
-#include <libaegisub/log.h>
-
-#include "compat.h"
-#include "mythes.hxx"
-#include "main.h"
-#include "standard_paths.h"
-#include "thesaurus_myspell.h"
-#include "utils.h"
-
-
-/// @brief Constructor 
-///
-MySpellThesaurus::MySpellThesaurus() {
-	conv = NULL;
-	mythes = NULL;
-	SetLanguage(lagi_wxString(OPT_GET("Tool/Thesaurus/Language")->GetString()));
-}
-
-
-
-/// @brief Destructor 
-///
-MySpellThesaurus::~MySpellThesaurus() {
-	delete mythes;
-	mythes = NULL;
-	delete conv;
-	conv = NULL;
-}
-
-
-
-/// @brief Get suggestions 
-/// @param word   
-/// @param result 
-/// @return 
-///
-void MySpellThesaurus::Lookup(wxString word,ThesaurusEntryArray &result) {
-	// Loaded?
-	if (!mythes) return;
-
-	// Grab raw from MyThes
-	mentry *me;
-	wxCharBuffer buf = word.Lower().mb_str(*conv);
-	if (!buf) return;
-	int n = mythes->Lookup(buf,strlen(buf),&me);
-
-	// Each entry
-	for (int i=0;i<n;i++) {
-		ThesaurusEntry entry;
-		entry.name = wxString(me[i].defn,*conv);
-		for (int j=0;j<me[i].count;j++) entry.words.Add(wxString(me[i].psyns[j],*conv));
-		result.push_back(entry);
-	}
-
-	// Clean up
-	mythes->CleanUpAfterLookup(&me,n);
-}
-
-
-
-/// @brief Get language list 
-/// @return 
-///
-wxArrayString MySpellThesaurus::GetLanguageList() {
-	// Get dir name
-	wxString path = StandardPaths::DecodePathMaybeRelative(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()), _T("?data")) + _T("/");
-	wxArrayString list;
-	wxFileName folder(path);
-	if (!folder.DirExists()) return list;
-
-	// Get file lists
-	wxArrayString idx;
-	wxDir::GetAllFiles(path,&idx,_T("*.idx"),wxDIR_FILES);
-	wxArrayString dat;
-	wxDir::GetAllFiles(path,&dat,_T("*.dat"),wxDIR_FILES);
-
-	// For each idxtionary match, see if it can find the corresponding .dat
-	for (unsigned int i=0;i<idx.Count();i++) {
-		wxString curdat = idx[i].Left(std::max(0,signed(idx[i].Length())-4)) + _T(".dat");
-		for (unsigned int j=0;j<dat.Count();j++) {
-			// Found match
-			if (curdat == dat[j]) {
-				wxFileName fname(curdat);
-				wxString name = fname.GetName();
-				if (name.Left(3) == _T("th_")) name = name.Mid(3);
-				list.Add(name);
-				break;
-			}
-		}
-	}
-
-	// Return list
-	return list;
-}
-
-
-
-/// @brief Set language 
-/// @param language 
-///
-void MySpellThesaurus::SetLanguage(wxString language) {
-	// Unload
-	delete mythes;
-	mythes = NULL;
-	delete conv;
-	conv = NULL;
-
-	// Unloading
-	if (language.IsEmpty()) return;
-
-	// Get dir name
-	wxString path = StandardPaths::DecodePathMaybeRelative(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()), _T("?data")) + _T("/");
-
-	// Get affix and dictionary paths
-	wxString idxpath = path + _T("th_") + language + _T(".idx");
-	wxString datpath = path + _T("th_") + language + _T(".dat");
-
-	// Check if language is available
-	if (!wxFileExists(idxpath) || !wxFileExists(datpath)) return;
-
-	LOG_I("thesaurus/file") << "Using thesaurus: " << datpath.c_str();
-
-	// Load
-	mythes = new MyThes(idxpath.mb_str(wxConvLocal),datpath.mb_str(wxConvLocal));
-	conv = NULL;
-	if (mythes) conv = new wxCSConv(wxString(mythes->get_th_encoding(),wxConvUTF8));
-}
-
-
--- a/aegisub/src/thesaurus_myspell.h
+++ b/aegisub/src/thesaurus_myspell.h
@ -1,73 +0,0 @@
-// Copyright (c) 2006, Rodrigo Braz Monteiro
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-//   * Redistributions of source code must retain the above copyright notice,
-//     this list of conditions and the following disclaimer.
-//   * Redistributions in binary form must reproduce the above copyright notice,
-//     this list of conditions and the following disclaimer in the documentation
-//     and/or other materials provided with the distribution.
-//   * Neither the name of the Aegisub Group nor the names of its contributors
-//     may be used to endorse or promote products derived from this software
-//     without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
-// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-// POSSIBILITY OF SUCH DAMAGE.
-//
-// Aegisub Project http://www.aegisub.org/
-//
-// $Id$
-
-/// @file thesaurus_myspell.h
-/// @see thesaurus_myspell.cpp
-/// @ingroup thesaurus
-///
-
-
-
-
-///////////
-// Headers
-#include "thesaurus.h"
-
-
-//////////////
-// Prototypes
-class MyThes;
-
-
-/// DOCME
-/// @class MySpellThesaurus
-/// @brief DOCME
-///
-/// DOCME
-class MySpellThesaurus: public Thesaurus {
-private:
-
-	/// DOCME
-	MyThes *mythes;
-
-	/// DOCME
-	wxCSConv *conv;
-
-public:
-	MySpellThesaurus();
-	~MySpellThesaurus();
-
-	void Lookup(wxString word,ThesaurusEntryArray &result);
-	wxArrayString GetLanguageList();
-	void SetLanguage(wxString language);
-};
-
-
--- a/aegisub/tests/Makefile
+++ b/aegisub/tests/Makefile
@ -27,6 +27,7 @@ SRC = \
 		libaegisub_option.cpp \
 		libaegisub_mru.cpp \
 		libaegisub_signals.cpp \
+		libaegisub_thesaurus.cpp \
 		libaegisub_util.cpp \
 		libaegisub_vfr.cpp
 		
--- a/aegisub/tests/libaegisub_thesaurus.cpp
+++ b/aegisub/tests/libaegisub_thesaurus.cpp
@ -0,0 +1,148 @@
+// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
+//
+// Permission to use, copy, modify, and distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+//
+// $Id$
+
+#include <libaegisub/thesaurus.h>
+
+#include "main.h"
+#include "util.h"
+
+#include <fstream>
+
+class lagi_thes : public libagi { // Fixture whose SetUp() writes a small thesaurus index/data file pair
+protected:
+	std::string idx_path; // Path of the generated index file
+	std::string dat_path; // Path of the generated data file
+
+	void SetUp() { // Rewrites data/thes.idx and data/thes.dat with well-formed and malformed entries
+		using std::endl;
+
+		idx_path = "data/thes.idx";
+		dat_path = "data/thes.dat";
+
+		std::ofstream idx(idx_path.c_str());
+		std::ofstream dat(dat_path.c_str());
+
+		idx << "UTF-8" << endl; // First line of both files; presumably the encoding name -- confirm against the parser
+		dat << "UTF-8" << endl;
+		idx << 7 << endl; // entry count (NOTE(review): eight index lines are written below -- confirm the parser ignores this value)
+
+		idx << "Word 1|" << dat.tellp() << endl; // Index line: word, '|', current write offset in the data file
+		dat << "Word 1|1" << endl; // Data header: word, '|', number of meaning lines written next
+		dat << "(noun)|Word 1|Word 1A|Word 1B|Word 1C" << endl; // Meaning line: '|'-separated fields
+
+		idx << "Word 2|" << dat.tellp() << endl;
+		dat << "Word 2|2" << endl; // Two meaning lines follow
+		dat << "(adj)|Word 2|Word 2 adj" << endl;
+		dat << "(noun)|Word 2|Word 2 noun" << endl;
+
+		dat << "Unindexed Word|1" << endl; // Written to the data file only; no index line refers to it
+		dat << "(adv)|Unindexed Word|Indexed Word" << endl;
+
+		idx << "Word 3|" << dat.tellp() << endl;
+		dat << "Word 3|1" << endl;
+		dat << "(verb)|Not Word 3|Four" << endl; // Second field deliberately differs from the indexed word
+
+		idx << "Too few fields" << endl; // Malformed: no '|' separator or offset
+		idx << "Too many fields|100|100" << endl; // Malformed: an extra field after the offset
+		idx << "Not a number|foo" << endl; // Malformed: non-numeric offset
+		idx << "Out of range|" << dat.tellp() << endl; // Offset just past everything written to the data file
+		idx << "Further out of range|" << 1 + dat.tellp() << endl; // One position beyond that
+	}
+};
+
+TEST_F(lagi_thes, parse) { // Constructing a thesaurus from the fixture's files must not throw
+	ASSERT_NO_THROW(agi::Thesaurus(dat_path, idx_path));
+}
+
+TEST_F(lagi_thes, word_1) { // A word with one meaning line and three synonyms
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Word 1", &entries));
+	ASSERT_EQ(1, entries.size()); // One entry per meaning line
+	ASSERT_EQ(3, entries[0].second.size()); // The second field ("Word 1") is not expected among the synonyms
+	EXPECT_STREQ("(noun) Word 1", entries[0].first.c_str()); // Heading: first two fields joined by a space
+	EXPECT_STREQ("Word 1A", entries[0].second[0].c_str());
+	EXPECT_STREQ("Word 1B", entries[0].second[1].c_str());
+	EXPECT_STREQ("Word 1C", entries[0].second[2].c_str());
+}
+
+TEST_F(lagi_thes, word_2) { // A word with two meaning lines yields two entries in file order
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Word 2", &entries));
+	ASSERT_EQ(2, entries.size());
+	ASSERT_EQ(1, entries[0].second.size());
+	ASSERT_EQ(1, entries[1].second.size());
+	EXPECT_STREQ("(adj) Word 2", entries[0].first.c_str()); // The (adj) line was written first
+	EXPECT_STREQ("(noun) Word 2", entries[1].first.c_str()); // The (noun) line was written second
+	EXPECT_STREQ("Word 2 adj", entries[0].second[0].c_str());
+	EXPECT_STREQ("Word 2 noun", entries[1].second[0].c_str());
+}
+
+TEST_F(lagi_thes, word_3) { // The heading text comes from the meaning line, not from the looked-up word
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Word 3", &entries));
+	ASSERT_EQ(1, entries.size());
+	ASSERT_EQ(1, entries[0].second.size());
+	EXPECT_STREQ("(verb) Not Word 3", entries[0].first.c_str()); // Fixture wrote "Not Word 3" as the second field
+	EXPECT_STREQ("Four", entries[0].second[0].c_str());
+}
+
+TEST_F(lagi_thes, bad_word) { // A word the fixture writes to neither file: no throw, no entries
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Nonexistent word", &entries));
+	EXPECT_EQ(0, entries.size());
+}
+
+TEST_F(lagi_thes, lookup_clears) { // Reusing one output vector must not accumulate earlier results
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Word 1", &entries));
+	ASSERT_NO_THROW(thes.Lookup("Word 2", &entries));
+	ASSERT_NO_THROW(thes.Lookup("Word 3", &entries));
+	EXPECT_EQ(1, entries.size()); // Only the single "Word 3" entry may remain
+}
+
+TEST_F(lagi_thes, malformed_index_lines) { // Each malformed index line from the fixture: no throw, no entries
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Too few fields", &entries)); // Index line without '|'
+	EXPECT_EQ(0, entries.size());
+	ASSERT_NO_THROW(thes.Lookup("Too many fields", &entries)); // Index line with an extra field
+	EXPECT_EQ(0, entries.size());
+	ASSERT_NO_THROW(thes.Lookup("Not a number", &entries)); // Index line whose offset is "foo"
+	EXPECT_EQ(0, entries.size());
+	ASSERT_NO_THROW(thes.Lookup("Out of range", &entries)); // Offset just past the data written
+	EXPECT_EQ(0, entries.size());
+	ASSERT_NO_THROW(thes.Lookup("Further out of range", &entries)); // Offset one position beyond that
+	EXPECT_EQ(0, entries.size());
+}
+
+TEST_F(lagi_thes, unindexed_word) { // A word present in the data file but absent from the index is not found
+	agi::Thesaurus thes(dat_path, idx_path);
+
+	std::vector<agi::Thesaurus::Entry> entries;
+	ASSERT_NO_THROW(thes.Lookup("Unindexed Word", &entries));
+	EXPECT_EQ(0, entries.size());
+}