From 3c62a38c7a6e1ea8988962d9abb3091b077af4b0 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Sun, 8 Jan 2012 01:36:50 +0000 Subject: [PATCH] Replace MyThes thesaurus implementation with a custom one Fixes a pile of unicode-related issues, such as dictionaries in a path which does not fit into the system's local charset, and significantly cuts down on the amount of code. Originally committed to SVN as r6250. --- .../aegisub_vs2008/aegisub_vs2008.vcproj | 16 - .../libaegisub_vs2008.vcproj | 8 + .../build/tests_vs2008/tests_vs2008.vcproj | 4 + aegisub/libaegisub/Makefile | 1 + aegisub/libaegisub/common/thesaurus.cpp | 97 +++++ .../libaegisub/include/libaegisub/thesaurus.h | 58 +++ aegisub/src/Makefile | 2 - aegisub/src/dialog_about.cpp | 1 - aegisub/src/mythes.cxx | 398 ------------------ aegisub/src/mythes.hxx | 103 ----- aegisub/src/subs_edit_ctrl.cpp | 36 +- aegisub/src/subs_edit_ctrl.h | 2 +- aegisub/src/thesaurus.cpp | 112 ++++- aegisub/src/thesaurus.h | 71 ++-- aegisub/src/thesaurus_myspell.cpp | 176 -------- aegisub/src/thesaurus_myspell.h | 73 ---- aegisub/tests/Makefile | 1 + aegisub/tests/libaegisub_thesaurus.cpp | 148 +++++++ 18 files changed, 462 insertions(+), 845 deletions(-) create mode 100644 aegisub/libaegisub/common/thesaurus.cpp create mode 100644 aegisub/libaegisub/include/libaegisub/thesaurus.h delete mode 100644 aegisub/src/mythes.cxx delete mode 100644 aegisub/src/mythes.hxx delete mode 100644 aegisub/src/thesaurus_myspell.cpp delete mode 100644 aegisub/src/thesaurus_myspell.h create mode 100644 aegisub/tests/libaegisub_thesaurus.cpp diff --git a/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj b/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj index 8a75cdb61..43f7a8540 100644 --- a/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj +++ b/aegisub/build/aegisub_vs2008/aegisub_vs2008.vcproj @@ -855,14 +855,6 @@ RelativePath="..\..\src\md5.h" > - - - - @@ -1423,14 +1415,6 @@ RelativePath="..\..\src\thesaurus.h" > - - - - + + @@ -477,6 
+481,10 @@ RelativePath="..\..\libaegisub\include\libaegisub\signal.h" > + + diff --git a/aegisub/build/tests_vs2008/tests_vs2008.vcproj b/aegisub/build/tests_vs2008/tests_vs2008.vcproj index 0bb46ee0a..65de85689 100644 --- a/aegisub/build/tests_vs2008/tests_vs2008.vcproj +++ b/aegisub/build/tests_vs2008/tests_vs2008.vcproj @@ -334,6 +334,10 @@ RelativePath="..\..\tests\libaegisub_signals.cpp" > + + diff --git a/aegisub/libaegisub/Makefile b/aegisub/libaegisub/Makefile index e5cbcbc3b..d3934a093 100644 --- a/aegisub/libaegisub/Makefile +++ b/aegisub/libaegisub/Makefile @@ -35,6 +35,7 @@ SRC = \ common/keyframe.cpp \ common/util.cpp \ common/log.cpp \ + common/thesaurus.cpp \ common/validator.cpp \ common/vfr.cpp \ unix/util.cpp \ diff --git a/aegisub/libaegisub/common/thesaurus.cpp b/aegisub/libaegisub/common/thesaurus.cpp new file mode 100644 index 000000000..aec4ee2f4 --- /dev/null +++ b/aegisub/libaegisub/common/thesaurus.cpp @@ -0,0 +1,97 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// $Id$ + +/// @file thesaurus.cpp +/// @brief MyThes-compatible thesaurus implementation +/// @ingroup libaegisub thesaurus + +#include "libaegisub/thesaurus.h" + +#include "libaegisub/charset_conv.h" +#include "libaegisub/io.h" +#include "libaegisub/line_iterator.h" + +template +static void split(String const& str, Char sep, Container *out) { + typename String::size_type pos, prev = 0; + out->reserve(2); + while ((pos = str.find(sep, prev)) != String::npos) { + if (pos > prev) + out->push_back(str.substr(prev, pos - prev)); + prev = pos + 1; + } + if (prev < str.size()) + out->push_back(str.substr(prev)); +} + +namespace agi { + +Thesaurus::Thesaurus(std::string const& dat_path, std::string const& idx_path) +: dat(io::Open(dat_path)) +{ + scoped_ptr idx(io::Open(idx_path)); + + std::string encoding_name; + getline(*idx, encoding_name); + std::string unused_entry_count; + getline(*idx, unused_entry_count); + + // Read the list of words and file offsets for those words + for (line_iterator iter(*idx, encoding_name), end; iter != end; ++iter) { + std::vector chunks; + split(*iter, '|', &chunks); + if (chunks.size() == 2) { + offsets[chunks[0]] = atoi(chunks[1].c_str()); + } + } + + conv.reset(new charset::IconvWrapper(encoding_name.c_str(), "utf-8")); +} + +Thesaurus::~Thesaurus() { } + +void Thesaurus::Lookup(std::string const& word, std::vector *out) { + out->clear(); + + std::map::const_iterator it = offsets.find(word); + if (!dat.get() || it == offsets.end()) return; + + dat->seekg(it->second, std::ios::beg); + if (!dat->good()) return; + + // First line is the word and meaning count + std::string temp; + getline(*dat, temp); + std::vector header; + split(conv->Convert(temp), '|', &header); + if (header.size() != 2) return; + int meanings = atoi(header[1].c_str()); + + out->resize(meanings); + for (int i = 0; i < meanings; ++i) { + std::vector line; + getline(*dat, temp); + split(conv->Convert(temp), '|', &line); + + // The "definition" is just the part of 
speech plus the word it's + // giving synonyms for (which may not be the passed word) + (*out)[i].first = line[0] + ' ' + line[1]; + (*out)[i].second.reserve(line.size() - 2); + copy(line.begin() + 2, line.end(), back_inserter((*out)[i].second)); + } +} + +} diff --git a/aegisub/libaegisub/include/libaegisub/thesaurus.h b/aegisub/libaegisub/include/libaegisub/thesaurus.h new file mode 100644 index 000000000..8a7758f56 --- /dev/null +++ b/aegisub/libaegisub/include/libaegisub/thesaurus.h @@ -0,0 +1,58 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. +// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+// +// $Id$ + +/// @file thesaurus.h +/// @brief MyThes-compatible thesaurus implementation +/// @ingroup libaegisub thesaurus + +#include + +#ifndef LAGI_PRE +#include +#include +#include +#include +#endif + +namespace agi { + +namespace charset { class IconvWrapper; } + +class Thesaurus { + /// Map of word -> byte position in the data file + std::map offsets; + /// Read handle to the data file + scoped_ptr dat; + /// Converter from the data file's charset to UTF-8 + scoped_ptr conv; + +public: + /// A pair of a word and synonyms for that word + typedef std::pair > Entry; + + /// Constructor + /// @param dat_path Path to data file + /// @param idx_path Path to index file + Thesaurus(std::string const& dat_path, std::string const& idx_path); + ~Thesaurus(); + + /// Look up synonyms for a word + /// @param word Word to look up + /// @param[out] out Vector to fill with word/synonym lists + void Lookup(std::string const& word, std::vector *out); +}; + +} diff --git a/aegisub/src/Makefile b/aegisub/src/Makefile index 3015b45ce..657618fcb 100644 --- a/aegisub/src/Makefile +++ b/aegisub/src/Makefile @@ -192,7 +192,6 @@ SRC += \ menu.cpp \ md5.c \ mkv_wrap.cpp \ - mythes.cxx \ pen.cpp \ persist_location.cpp \ plugin_manager.cpp \ @@ -221,7 +220,6 @@ SRC += \ text_file_reader.cpp \ text_file_writer.cpp \ thesaurus.cpp \ - thesaurus_myspell.cpp \ timeedit_ctrl.cpp \ threaded_frame_source.cpp \ toggle_bitmap.cpp \ diff --git a/aegisub/src/dialog_about.cpp b/aegisub/src/dialog_about.cpp index a907f44bc..65c839840 100644 --- a/aegisub/src/dialog_about.cpp +++ b/aegisub/src/dialog_about.cpp @@ -97,7 +97,6 @@ AboutScreen::AboutScreen(wxWindow *parent) #ifdef WITH_FREETYPE2 libString += " Freetype - Copyright (c) David Turner, Robert Wilhelm, Werner Lemberg;\n"; #endif - libString += " MyThes - Copyright (c) Kevin B. 
Hendricks, Stratford, Ontario, Canada.\n"; #ifdef WITH_FFTW3 libString += " FFTW - Copyright (c) Matteo Frigo, Massachusetts Institute of Technology;\n"; #endif diff --git a/aegisub/src/mythes.cxx b/aegisub/src/mythes.cxx deleted file mode 100644 index 39c56dab9..000000000 --- a/aegisub/src/mythes.cxx +++ /dev/null @@ -1,398 +0,0 @@ -/* - * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada - * And Contributors. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. All modifications to the source code must be clearly marked as - * such. Binary redistributions based on modified source code - * must be clearly marked as modified versions in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * KEVIN B. 
HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - - -#include -#include -#include -#include -#include "mythes.hxx" - -// some basic utility routines - -// string duplication routine -char * mythes_mystrdup(const char * p) -{ - - int sl = strlen(p) + 1; - char * d = (char *)malloc(sl); - if (d) { - memcpy(d,p,sl); - return d; - } - return NULL; -} - -// remove cross-platform text line end characters -void mythes_mychomp(char * s) -{ - int k = strlen(s); - if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; - if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; -} - - -// return index of char in string -int mystr_indexOfChar(const char * d, int c) -{ - const char * p = strchr(d,c); - if (p) return (int)(p-d); - return -1; -} - - -MyThes::MyThes(const char* idxpath, const char * datpath) -{ - nw = 0; - encoding = NULL; - list = NULL; - offst = NULL; - - if (thInitialize(idxpath, datpath) != 1) { - fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath); - fflush(stderr); - if (encoding) free((void*)encoding); - if (list) free((void*)list); - if (offst) free((void*)offst); - // did not initialize properly - throw exception? - } -} - - -MyThes::~MyThes() -{ - if (thCleanup() != 1) { - /* did not cleanup properly - throw exception? 
*/ - } - if (encoding) free((void*)encoding); - encoding = NULL; - list = NULL; - offst = NULL; -} - - -int MyThes::thInitialize(const char* idxpath, const char* datpath) -{ - - // open the index file - FILE * pifile = fopen(idxpath,"r"); - if (!pifile) { - pifile = NULL; - return 0; - } - - // parse in encoding and index size */ - char * wrd; - wrd = (char *)calloc(1, MAX_WD_LEN); - int len = readLine(pifile,wrd,MAX_WD_LEN); - encoding = mythes_mystrdup(wrd); - len = readLine(pifile,wrd,MAX_WD_LEN); - int idxsz = atoi(wrd); - - - // now allocate list, offst for the given size - list = (char**) calloc(idxsz,sizeof(char*)); - offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int)); - - if ( (!(list)) || (!(offst)) ) { - fprintf(stderr,"Error - bad memory allocation\n"); - fflush(stderr); - return 0; - } - - // now parse the remaining lines of the index - len = readLine(pifile,wrd,MAX_WD_LEN); - while (len > 0) - { - int np = mystr_indexOfChar(wrd,'|'); - if (nw < idxsz) { - if (np >= 0) { - *(wrd+np) = '\0'; - list[nw] = (char *)calloc(1,(np+1)); - memcpy((list[nw]),wrd,np); - offst[nw] = atoi(wrd+np+1); - nw++; - } - } - len = readLine(pifile,wrd,MAX_WD_LEN); - } - - free((void *)wrd); - fclose(pifile); - pifile=NULL; - - /* next open the data file */ - pdfile = fopen(datpath,"r"); - if (!pdfile) { - pdfile = NULL; - return 0; - } - - return 1; -} - - -int MyThes::thCleanup() -{ - /* first close the data file */ - if (pdfile) { - fclose(pdfile); - pdfile=NULL; - } - - /* now free up all the allocated strings on the list */ - for (int i=0; i < nw; i++) - { - if (list[i]) { - free(list[i]); - list[i] = 0; - } - } - - if (list) free((void*)list); - if (offst) free((void*)offst); - - nw = 0; - return 1; -} - - - -// lookup text in index and count of meanings and a list of meaning entries -// with each entry having a synonym count and pointer to an -// array of char * (i.e the synonyms) -// -// note: calling routine should call CleanUpAfterLookup with the original 
-// meaning point and count to properly deallocate memory - -int MyThes::Lookup(const char * pText, int len, mentry** pme) -{ - - *pme = NULL; - - // handle the case of missing file or file related errors - if (! pdfile) return 0; - - long offset = 0; - - /* copy search word and make sure null terminated */ - char * wrd = (char *) calloc(1,(len+1)); - memcpy(wrd,pText,len); - - /* find it in the list */ - int idx = binsearch(wrd,list,nw); - free(wrd); - if (idx < 0) return 0; - - // now seek to the offset - offset = (long) offst[idx]; - int rc = fseek(pdfile,offset,SEEK_SET); - if (rc) { - return 0; - } - - // grab the count of the number of meanings - // and allocate a list of meaning entries - char * buf = NULL; - buf = (char *) malloc( MAX_LN_LEN ); - if (!buf) return 0; - readLine(pdfile, buf, (MAX_LN_LEN-1)); - int np = mystr_indexOfChar(buf,'|'); - if (np < 0) { - free(buf); - return 0; - } - int nmeanings = atoi(buf+np+1); - *pme = (mentry*) malloc( nmeanings * sizeof(mentry) ); - if (!(*pme)) { - free(buf); - return 0; - } - - // now read in each meaning and parse it to get defn, count and synonym lists - mentry* pm = *(pme); - char dfn[MAX_WD_LEN]; - - for (int j = 0; j < nmeanings; j++) { - readLine(pdfile, buf, (MAX_LN_LEN-1)); - - pm->count = 0; - pm->psyns = NULL; - pm->defn = NULL; - - // store away the part of speech for later use - char * p = buf; - char * pos = NULL; - np = mystr_indexOfChar(p,'|'); - if (np >= 0) { - *(buf+np) = '\0'; - pos = mythes_mystrdup(p); - p = p + np + 1; - } else { - pos = mythes_mystrdup(""); - } - - // count the number of fields in the remaining line - int nf = 1; - char * d = p; - np = mystr_indexOfChar(d,'|'); - while ( np >= 0 ) { - nf++; - d = d + np + 1; - np = mystr_indexOfChar(d,'|'); - } - pm->count = nf; - pm->psyns = (char **) malloc(nf*sizeof(char*)); - - // fill in the synonym list - d = p; - for (int j = 0; j < nf; j++) { - np = mystr_indexOfChar(d,'|'); - if (np > 0) { - *(d+np) = '\0'; - pm->psyns[j] = 
mythes_mystrdup(d); - d = d + np + 1; - } else { - pm->psyns[j] = mythes_mystrdup(d); - } - } - - // add pos to first synonym to create the definition - int k = strlen(pos); - int m = strlen(pm->psyns[0]); - if ((k+m) < (MAX_WD_LEN - 1)) { - strncpy(dfn,pos,k); - *(dfn+k) = ' '; - strncpy((dfn+k+1),(pm->psyns[0]),m+1); - pm->defn = mythes_mystrdup(dfn); - } else { - pm->defn = mythes_mystrdup(pm->psyns[0]); - } - free(pos); - pm++; - - } - free(buf); - - return nmeanings; -} - - - -void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings) -{ - - if (nmeanings == 0) return; - if ((*pme) == NULL) return; - - mentry * pm = *pme; - - for (int i = 0; i < nmeanings; i++) { - int count = pm->count; - for (int j = 0; j < count; j++) { - if (pm->psyns[j]) free(pm->psyns[j]); - pm->psyns[j] = NULL; - } - if (pm->psyns) free(pm->psyns); - pm->psyns = NULL; - if (pm->defn) free(pm->defn); - pm->defn = NULL; - pm->count = 0; - pm++; - } - pm = *pme; - free(pm); - *pme = NULL; - return; -} - - -// read a line of text from a text file stripping -// off the line terminator and replacing it with -// a null string terminator. 
-// returns: -1 on error or the number of characters in -// in the returning string - -// A maximum of nc characters will be returned - -int MyThes::readLine(FILE * pf, char * buf, int nc) -{ - - if (fgets(buf,nc,pf)) { - mythes_mychomp(buf); - return strlen(buf); - } - return -1; -} - - - -// performs a binary search on null terminated character -// strings -// -// returns: -1 on not found -// index of wrd in the list[] - -int MyThes::binsearch(char * sw, char* list[], int nlst) -{ - int lp, up, mp, j, indx; - lp = 0; - up = nlst-1; - indx = -1; - if (nlst == 0) return -1; - if (strcmp(sw,list[lp]) < 0) return -1; - if (strcmp(sw,list[up]) > 0) return -1; - while (indx < 0 ) { - mp = (int)((lp+up) >> 1); - j = strcmp(sw,list[mp]); - if ( j > 0) { - lp = mp + 1; - } else if (j < 0 ) { - up = mp - 1; - } else { - indx = mp; - } - if (lp > up) return -1; - } - return indx; -} - -char * MyThes::get_th_encoding() -{ - if (encoding) return encoding; - return NULL; -} - - diff --git a/aegisub/src/mythes.hxx b/aegisub/src/mythes.hxx deleted file mode 100644 index 3718545ac..000000000 --- a/aegisub/src/mythes.hxx +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada - * And Contributors. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * 3. All modifications to the source code must be clearly marked as - * such. 
Binary redistributions based on modified source code - * must be clearly marked as modified versions in the documentation - * and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL - * KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - */ - - - #ifndef _MYTHES_HXX_ -#define _MYTHES_HXX_ - -// some maximum sizes for buffers -#define MAX_WD_LEN 200 -#define MAX_LN_LEN 16384 - - -// a meaning with definition, count of synonyms and synonym list -struct mentry { - char* defn; - int count; - char** psyns; -}; - - -class MyThes -{ - - int nw; /* number of entries in thesaurus */ - char** list; /* stores word list */ - unsigned int* offst; /* stores offset list */ - char * encoding; /* stores text encoding; */ - - FILE *pdfile; - - // disallow copy-constructor and assignment-operator for now - MyThes(); - MyThes(const MyThes &); - MyThes & operator = (const MyThes &); - -public: - MyThes(const char* idxpath, const char* datpath); - ~MyThes(); - - // lookup text in index and return number of meanings - // each meaning entry has a defintion, synonym count and pointer - // when complete return the *original* meaning entry and count via - // CleanUpAfterLookup to properly handle memory deallocation - - int Lookup(const char * pText, 
int len, mentry** pme); - - void CleanUpAfterLookup(mentry** pme, int nmean); - - char* get_th_encoding(); - -private: - // Open index and dat files and load list array - int thInitialize (const char* indxpath, const char* datpath); - - // internal close and cleanup dat and idx files - int thCleanup (); - - // read a text line (\n terminated) stripping off line terminator - int readLine(FILE * pf, char * buf, int nc); - - // binary search on null terminated character strings - int binsearch(char * wrd, char* list[], int nlst); - -}; - -#endif - - - - - diff --git a/aegisub/src/subs_edit_ctrl.cpp b/aegisub/src/subs_edit_ctrl.cpp index 8ee2f600b..019242686 100644 --- a/aegisub/src/subs_edit_ctrl.cpp +++ b/aegisub/src/subs_edit_ctrl.cpp @@ -79,7 +79,7 @@ enum { SubsTextEditCtrl::SubsTextEditCtrl(wxWindow* parent, wxSize wsize, long style, SubtitlesGrid *grid) : ScintillaTextCtrl(parent, -1, "", wxDefaultPosition, wsize, style) , spellchecker(SpellCheckerFactory::GetSpellChecker()) -, thesaurus(Thesaurus::GetThesaurus()) +, thesaurus(new Thesaurus) , grid(grid) { // Set properties @@ -795,14 +795,15 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) { // Thesaurus if (thesaurus.get() && currentWord.Length()) { // Get results - ThesaurusEntryArray result; - thesaurus->Lookup(currentWord,result); + std::vector result; + thesaurus->Lookup(currentWord,&result); // Compile list - thesSugs.Clear(); - for (unsigned int i=0;iAppend(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].name); + if (result[i].second.size() == 1) { + thesMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].first)); curThesEntry++; } @@ -826,13 +827,13 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) { else { // Insert entries wxMenu *subMenu = new wxMenu(); - for (unsigned int j=0;jAppend(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].words[j]); + for (size_t j=0;jAppend(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].second[j])); 
curThesEntry++; } // Insert submenu - thesMenu->Append(-1, result[i].name, subMenu); + thesMenu->Append(-1, lagi_wxString(result[i].first), subMenu); } } @@ -911,7 +912,7 @@ void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) { wxString suggestion; int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS; if (sugIdx >= 0) { - suggestion = thesSugs[sugIdx]; + suggestion = lagi_wxString(thesSugs[sugIdx]); } else { suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS]; @@ -953,10 +954,9 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) { // Set language int index = event.GetId() - EDIT_MENU_THES_LANGS - 1; - if (index >= 0) { - thesaurus->SetLanguage(langs[index]); - OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(langs[index])); - } + wxString lang; + if (index >= 0) lang = langs[index]; + OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(lang)); UpdateStyle(); } diff --git a/aegisub/src/subs_edit_ctrl.h b/aegisub/src/subs_edit_ctrl.h index 379294818..90734f188 100644 --- a/aegisub/src/subs_edit_ctrl.h +++ b/aegisub/src/subs_edit_ctrl.h @@ -67,7 +67,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl { wxArrayString sugs; /// DOCME - wxArrayString thesSugs; + std::vector thesSugs; /// DOCME int currentWordPos; diff --git a/aegisub/src/thesaurus.cpp b/aegisub/src/thesaurus.cpp index 256fc5ed0..43acbe5f2 100644 --- a/aegisub/src/thesaurus.cpp +++ b/aegisub/src/thesaurus.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro +// Copyright (c) 2011, Thomas Goyne // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -30,29 +30,111 @@ // $Id$ /// @file thesaurus.cpp -/// @brief Base-class for thesaurus implementations +/// @brief Thesaurus implementation /// @ingroup thesaurus /// - -/////////// -// Headers #include "config.h" -#include "thesaurus_myspell.h" +#include "thesaurus.h" +#ifndef AGI_PRE +#include +#include +#endif -/// @brief Get spell checker -/// -Thesaurus *Thesaurus::GetThesaurus() { - // Initialize - Thesaurus *thes = NULL; +#include +#include - // Get myspell - thes = new MySpellThesaurus(); +#include "compat.h" +#include "main.h" +#include "standard_paths.h" - // Return - return thes; +Thesaurus::Thesaurus() +: lang_listener(OPT_SUB("Tool/Thesaurus/Language", &Thesaurus::OnLanguageChanged, this)) +, dict_path_listener(OPT_SUB("Path/Dictionary", &Thesaurus::OnPathChanged, this)) +{ + OnLanguageChanged(); } +Thesaurus::~Thesaurus() { + // Explicit empty destructor needed for scoped_ptr with incomplete types +} +void Thesaurus::Lookup(wxString const& word, std::vector *result) { + if (!impl.get()) return; + impl->Lookup(STD_STR(word.Lower()), result); +} + +wxArrayString Thesaurus::GetLanguageList() const { + if (!languages.empty()) return languages; + + wxArrayString idx, dat; + + // Get list of dictionaries + wxString path = StandardPaths::DecodePath("?data/dictionaries/"); + if (wxFileName::DirExists(path)) { + wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES); + wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES); + } + path = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/"); + if (wxFileName::DirExists(path)) { + wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES); + wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES); + } + if (idx.empty() || dat.empty()) return languages; + + idx.Sort(); + dat.Sort(); + + // Drop extensions and the th_ prefix + for (size_t i = 0; i < idx.size(); ++i) idx[i] = idx[i].Mid(3, idx[i].size() - 
7); + for (size_t i = 0; i < dat.size(); ++i) dat[i] = dat[i].Mid(3, dat[i].size() - 7); + + // Verify that each idx has a dat + for (size_t i = 0, j = 0; i < idx.size() && j < dat.size(); ) { + int cmp = idx[i].Cmp(dat[j]); + if (cmp < 0) ++i; + else if (cmp > 0) ++j; + else { + // Don't insert a language twice if it's in both the user dir and + // the app's dir + wxString name = wxFileName(dat[j]).GetName().Mid(3); + if (languages.empty() || name != languages.back()) + languages.push_back(name); + ++i; + ++j; + } + } + return languages; +} + +void Thesaurus::OnLanguageChanged() { + impl.reset(); + + std::string language = OPT_GET("Tool/Thesaurus/Language")->GetString(); + if (language.empty()) return; + + wxString path = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/"); + + // Get index and data paths + wxString idxpath = wxString::Format("%s/th_%s.idx", path, language); + wxString datpath = wxString::Format("%s/th_%s.dat", path, language); + + // If they aren't in the user dictionary path, check the application directory + if (!wxFileExists(idxpath) || !wxFileExists(datpath)) { + path = StandardPaths::DecodePath("?data/dictionaries/"); + idxpath = wxString::Format("%s/th_%s.idx", path, language); + datpath = wxString::Format("%s/th_%s.dat", path, language); + + if (!wxFileExists(idxpath) || !wxFileExists(datpath)) return; + } + + LOG_I("thesaurus/file") << "Using thesaurus: " << datpath.c_str(); + + impl.reset(new agi::Thesaurus(STD_STR(datpath), STD_STR(idxpath))); +} + +void Thesaurus::OnPathChanged() { + languages.clear(); +} diff --git a/aegisub/src/thesaurus.h b/aegisub/src/thesaurus.h index 873d15a9b..26c18b49a 100644 --- a/aegisub/src/thesaurus.h +++ b/aegisub/src/thesaurus.h @@ -1,4 +1,4 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro +// Copyright (c) 2011, Thomas Goyne // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -34,9 +34,6 @@ /// @ingroup thesaurus /// - -/////////// -// Headers #ifndef AGI_PRE #include @@ -44,50 +41,40 @@ #include #endif +#include +#include -/// DOCME -/// @class ThesaurusEntry -/// @brief DOCME -/// -/// DOCME -class ThesaurusEntry { -public: +namespace agi { class Thesaurus; } - /// DOCME - wxString name; - - /// DOCME - wxArrayString words; -}; - - - -/// DOCME -typedef std::vector ThesaurusEntryArray; - - - -/// DOCME /// @class Thesaurus -/// @brief DOCME -/// -/// DOCME +/// @brief A wrapper around agi::Thesaurus adding wx and Aegisub-specific stuff class Thesaurus { + /// The actual thesaurus implementation + agi::scoped_ptr impl; + /// A cached list of languages available + mutable wxArrayString languages; + + /// Thesaurus language change slot + agi::signal::Connection lang_listener; + /// Thesaurus language change handler + void OnLanguageChanged(); + + /// Thesaurus path change slot + agi::signal::Connection dict_path_listener; + /// Thesaurus path change handler + void OnPathChanged(); public: - static Thesaurus *GetThesaurus(); + /// A pair of a word and synonyms for that word + typedef std::pair > Entry; + Thesaurus(); + ~Thesaurus(); - /// @brief DOCME - /// - Thesaurus() {} + /// Get a list of synonyms for a word, grouped by possible meanings of the word + /// @param word Word to get synonyms for + /// @param[out] result Output list + void Lookup(wxString const& word, std::vector *result); - /// @brief DOCME - /// - virtual ~Thesaurus() {} - - virtual void Lookup(wxString word,ThesaurusEntryArray &result)=0; - virtual wxArrayString GetLanguageList()=0; - virtual void SetLanguage(wxString language)=0; + /// Get a list of language codes which thesauri are available for + wxArrayString GetLanguageList() const; }; - - diff --git a/aegisub/src/thesaurus_myspell.cpp b/aegisub/src/thesaurus_myspell.cpp deleted file mode 100644 index 154071587..000000000 ---
a/aegisub/src/thesaurus_myspell.cpp +++ /dev/null @@ -1,176 +0,0 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of the Aegisub Group nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// -// Aegisub Project http://www.aegisub.org/ -// -// $Id$ - -/// @file thesaurus_myspell.cpp -/// @brief MySpell-based thesaurus implementation -/// @ingroup thesaurus -/// - - -/////////// -// Headers -#include "config.h" - -#ifndef AGI_PRE -#include -#include -#include -#endif - -#include - -#include "compat.h" -#include "mythes.hxx" -#include "main.h" -#include "standard_paths.h" -#include "thesaurus_myspell.h" -#include "utils.h" - - -/// @brief Constructor -/// -MySpellThesaurus::MySpellThesaurus() { - conv = NULL; - mythes = NULL; - SetLanguage(lagi_wxString(OPT_GET("Tool/Thesaurus/Language")->GetString())); -} - - - -/// @brief Destructor -/// -MySpellThesaurus::~MySpellThesaurus() { - delete mythes; - mythes = NULL; - delete conv; - conv = NULL; -} - - - -/// @brief Get suggestions -/// @param word -/// @param result -/// @return -/// -void MySpellThesaurus::Lookup(wxString word,ThesaurusEntryArray &result) { - // Loaded? - if (!mythes) return; - - // Grab raw from MyThes - mentry *me; - wxCharBuffer buf = word.Lower().mb_str(*conv); - if (!buf) return; - int n = mythes->Lookup(buf,strlen(buf),&me); - - // Each entry - for (int i=0;iCleanUpAfterLookup(&me,n); -} - - - -/// @brief Get language list -/// @return -/// -wxArrayString MySpellThesaurus::GetLanguageList() { - // Get dir name - wxString path = StandardPaths::DecodePathMaybeRelative(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()), _T("?data")) + _T("/"); - wxArrayString list; - wxFileName folder(path); - if (!folder.DirExists()) return list; - - // Get file lists - wxArrayString idx; - wxDir::GetAllFiles(path,&idx,_T("*.idx"),wxDIR_FILES); - wxArrayString dat; - wxDir::GetAllFiles(path,&dat,_T("*.dat"),wxDIR_FILES); - - // For each idxtionary match, see if it can find the corresponding .dat - for (unsigned int i=0;iGetString()), _T("?data")) + _T("/"); - - // Get affix and dictionary paths - wxString idxpath = path + _T("th_") + language + _T(".idx"); - wxString datpath = path + _T("th_") 
+ language + _T(".dat"); - - // Check if language is available - if (!wxFileExists(idxpath) || !wxFileExists(datpath)) return; - - LOG_I("thesaurus/file") << "Using thesaurus: " << datpath.c_str(); - - // Load - mythes = new MyThes(idxpath.mb_str(wxConvLocal),datpath.mb_str(wxConvLocal)); - conv = NULL; - if (mythes) conv = new wxCSConv(wxString(mythes->get_th_encoding(),wxConvUTF8)); -} - - diff --git a/aegisub/src/thesaurus_myspell.h b/aegisub/src/thesaurus_myspell.h deleted file mode 100644 index 46603c5fc..000000000 --- a/aegisub/src/thesaurus_myspell.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2006, Rodrigo Braz Monteiro -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// * Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// * Neither the name of the Aegisub Group nor the names of its contributors -// may be used to endorse or promote products derived from this software -// without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Aegisub Project http://www.aegisub.org/ -// -// $Id$ - -/// @file thesaurus_myspell.h -/// @see thesaurus_myspell.cpp -/// @ingroup thesaurus -/// - - - - -/////////// -// Headers -#include "thesaurus.h" - - -////////////// -// Prototypes -class MyThes; - - -/// DOCME -/// @class MySpellThesaurus -/// @brief DOCME -/// -/// DOCME -class MySpellThesaurus: public Thesaurus { -private: - - /// DOCME - MyThes *mythes; - - /// DOCME - wxCSConv *conv; - -public: - MySpellThesaurus(); - ~MySpellThesaurus(); - - void Lookup(wxString word,ThesaurusEntryArray &result); - wxArrayString GetLanguageList(); - void SetLanguage(wxString language); -}; - - diff --git a/aegisub/tests/Makefile b/aegisub/tests/Makefile index fb962af55..510fe1cb8 100644 --- a/aegisub/tests/Makefile +++ b/aegisub/tests/Makefile @@ -27,6 +27,7 @@ SRC = \ libaegisub_option.cpp \ libaegisub_mru.cpp \ libaegisub_signals.cpp \ + libaegisub_thesaurus.cpp \ libaegisub_util.cpp \ libaegisub_vfr.cpp diff --git a/aegisub/tests/libaegisub_thesaurus.cpp b/aegisub/tests/libaegisub_thesaurus.cpp new file mode 100644 index 000000000..6239613f9 --- /dev/null +++ b/aegisub/tests/libaegisub_thesaurus.cpp @@ -0,0 +1,148 @@ +// Copyright (c) 2012, Thomas Goyne +// +// Permission to use, copy, modify, and distribute this software for any +// purpose with or without fee is hereby granted, provided that the above +// copyright notice and this permission notice appear in all copies. 
+// +// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// +// $Id$ + +#include + +#include "main.h" +#include "util.h" + +#include + +class lagi_thes : public libagi { +protected: + std::string idx_path; + std::string dat_path; + + void SetUp() { + using std::endl; + + idx_path = "data/thes.idx"; + dat_path = "data/thes.dat"; + + std::ofstream idx(idx_path.c_str()); + std::ofstream dat(dat_path.c_str()); + + idx << "UTF-8" << endl; + dat << "UTF-8" << endl; + idx << 7 << endl; // entry count + + idx << "Word 1|" << dat.tellp() << endl; + dat << "Word 1|1" << endl; + dat << "(noun)|Word 1|Word 1A|Word 1B|Word 1C" << endl; + + idx << "Word 2|" << dat.tellp() << endl; + dat << "Word 2|2" << endl; + dat << "(adj)|Word 2|Word 2 adj" << endl; + dat << "(noun)|Word 2|Word 2 noun" << endl; + + dat << "Unindexed Word|1" << endl; + dat << "(adv)|Unindexed Word|Indexed Word" << endl; + + idx << "Word 3|" << dat.tellp() << endl; + dat << "Word 3|1" << endl; + dat << "(verb)|Not Word 3|Four" << endl; + + idx << "Too few fields" << endl; + idx << "Too many fields|100|100" << endl; + idx << "Not a number|foo" << endl; + idx << "Out of range|" << dat.tellp() << endl; + idx << "Further out of range|" << 1 + dat.tellp() << endl; + } +}; + +TEST_F(lagi_thes, parse) { + ASSERT_NO_THROW(agi::Thesaurus(dat_path, idx_path)); +} + +TEST_F(lagi_thes, word_1) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Word 1", &entries)); + ASSERT_EQ(1, entries.size()); + ASSERT_EQ(3, 
entries[0].second.size()); + EXPECT_STREQ("(noun) Word 1", entries[0].first.c_str()); + EXPECT_STREQ("Word 1A", entries[0].second[0].c_str()); + EXPECT_STREQ("Word 1B", entries[0].second[1].c_str()); + EXPECT_STREQ("Word 1C", entries[0].second[2].c_str()); +} + +TEST_F(lagi_thes, word_2) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Word 2", &entries)); + ASSERT_EQ(2, entries.size()); + ASSERT_EQ(1, entries[0].second.size()); + ASSERT_EQ(1, entries[1].second.size()); + EXPECT_STREQ("(adj) Word 2", entries[0].first.c_str()); + EXPECT_STREQ("(noun) Word 2", entries[1].first.c_str()); + EXPECT_STREQ("Word 2 adj", entries[0].second[0].c_str()); + EXPECT_STREQ("Word 2 noun", entries[1].second[0].c_str()); +} + +TEST_F(lagi_thes, word_3) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Word 3", &entries)); + ASSERT_EQ(1, entries.size()); + ASSERT_EQ(1, entries[0].second.size()); + EXPECT_STREQ("(verb) Not Word 3", entries[0].first.c_str()); + EXPECT_STREQ("Four", entries[0].second[0].c_str()); +} + +TEST_F(lagi_thes, bad_word) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Nonexistent word", &entries)); + EXPECT_EQ(0, entries.size()); +} + +TEST_F(lagi_thes, lookup_clears) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Word 1", &entries)); + ASSERT_NO_THROW(thes.Lookup("Word 2", &entries)); + ASSERT_NO_THROW(thes.Lookup("Word 3", &entries)); + EXPECT_EQ(1, entries.size()); +} + +TEST_F(lagi_thes, malformed_index_lines) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Too few fields", &entries)); + EXPECT_EQ(0, entries.size()); + ASSERT_NO_THROW(thes.Lookup("Too many fields", &entries)); + EXPECT_EQ(0, entries.size()); + ASSERT_NO_THROW(thes.Lookup("Not a number", &entries)); + EXPECT_EQ(0, entries.size()); + 
ASSERT_NO_THROW(thes.Lookup("Out of range", &entries)); + EXPECT_EQ(0, entries.size()); + ASSERT_NO_THROW(thes.Lookup("Further out of range", &entries)); + EXPECT_EQ(0, entries.size()); +} + +TEST_F(lagi_thes, unindexed_word) { + agi::Thesaurus thes(dat_path, idx_path); + + std::vector entries; + ASSERT_NO_THROW(thes.Lookup("Unindexed Word", &entries)); + EXPECT_EQ(0, entries.size()); +}