Replace MyThes thesaurus implementation with a custom one

Fixes a pile of unicode-related issues, such as dictionaries in a path
which does not fit into the system's local charset, and significantly
cuts down on the amount of code.

Originally committed to SVN as r6250.
This commit is contained in:
Thomas Goyne 2012-01-08 01:36:50 +00:00
parent 518f93f18f
commit 3c62a38c7a
18 changed files with 462 additions and 845 deletions

View file

@ -855,14 +855,6 @@
RelativePath="..\..\src\md5.h"
>
</File>
<File
RelativePath="..\..\src\mythes.cxx"
>
</File>
<File
RelativePath="..\..\src\mythes.hxx"
>
</File>
<File
RelativePath="..\..\src\pen.cpp"
>
@ -1423,14 +1415,6 @@
RelativePath="..\..\src\thesaurus.h"
>
</File>
<File
RelativePath="..\..\src\thesaurus_myspell.cpp"
>
</File>
<File
RelativePath="..\..\src\thesaurus_myspell.h"
>
</File>
</Filter>
<Filter
Name="Subtitle Formats"

View file

@ -311,6 +311,10 @@
RelativePath="..\..\libaegisub\common\path.cpp"
>
</File>
<File
RelativePath="..\..\libaegisub\common\thesaurus.cpp"
>
</File>
<File
RelativePath="..\..\libaegisub\common\util.cpp"
>
@ -477,6 +481,10 @@
RelativePath="..\..\libaegisub\include\libaegisub\signal.h"
>
</File>
<File
RelativePath="..\..\libaegisub\include\libaegisub\thesaurus.h"
>
</File>
<File
RelativePath="..\..\libaegisub\include\libaegisub\types.h"
>

View file

@ -334,6 +334,10 @@
RelativePath="..\..\tests\libaegisub_signals.cpp"
>
</File>
<File
RelativePath="..\..\tests\libaegisub_thesaurus.cpp"
>
</File>
<File
RelativePath="..\..\tests\libaegisub_util.cpp"
>

View file

@ -35,6 +35,7 @@ SRC = \
common/keyframe.cpp \
common/util.cpp \
common/log.cpp \
common/thesaurus.cpp \
common/validator.cpp \
common/vfr.cpp \
unix/util.cpp \

View file

@ -0,0 +1,97 @@
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// $Id$
/// @file thesaurus.cpp
/// @brief MyThes-compatible thesaurus implementation
/// @ingroup libaegisub thesaurus
#include "libaegisub/thesaurus.h"
#include "libaegisub/charset_conv.h"
#include "libaegisub/io.h"
#include "libaegisub/line_iterator.h"
/// Split a string on a separator character
/// @param str String to split
/// @param sep Separator to split on
/// @param[out] out Container the non-empty pieces are appended to
///
/// Empty fields (leading, trailing or between adjacent separators) are not
/// added to the output.
template<class String, class Char, class Container>
static void split(String const& str, Char sep, Container *out) {
	typedef typename String::size_type size_type;

	out->reserve(2);

	size_type token_start = 0;
	for (;;) {
		size_type const sep_pos = str.find(sep, token_start);
		if (sep_pos == String::npos) break;

		// find() never returns a position before token_start, so inequality
		// means there is at least one character in this field
		if (sep_pos != token_start)
			out->push_back(str.substr(token_start, sep_pos - token_start));
		token_start = sep_pos + 1;
	}

	// Whatever follows the final separator
	if (token_start < str.size())
		out->push_back(str.substr(token_start));
}
namespace agi {
/// Open a thesaurus data file and load its index
/// @param dat_path Path to the data file, which is kept open for lookups
/// @param idx_path Path to the index file, which is only read here
Thesaurus::Thesaurus(std::string const& dat_path, std::string const& idx_path)
: dat(io::Open(dat_path))
{
	scoped_ptr<std::ifstream> idx(io::Open(idx_path));

	// The first line of the index names the charset; it is used both for
	// reading the rest of the index and for converting the data file
	std::string encoding_name;
	getline(*idx, encoding_name);

	// getline only strips '\n', so an index saved with CRLF line endings
	// would otherwise leave a '\r' in the name handed to the converter
	if (!encoding_name.empty() && encoding_name[encoding_name.size() - 1] == '\r')
		encoding_name.erase(encoding_name.size() - 1);

	// The second line is the number of entries, which isn't needed as the
	// map simply grows as entries are read
	std::string unused_entry_count;
	getline(*idx, unused_entry_count);

	// Read the list of words and file offsets for those words
	for (line_iterator<std::string> iter(*idx, encoding_name), end; iter != end; ++iter) {
		std::vector<std::string> chunks;
		split(*iter, '|', &chunks);
		// Lines which aren't exactly "word|offset" are ignored
		if (chunks.size() == 2) {
			offsets[chunks[0]] = atoi(chunks[1].c_str());
		}
	}

	conv.reset(new charset::IconvWrapper(encoding_name.c_str(), "utf-8"));
}
// Intentionally empty; presumably defined here rather than inline because the
// header only forward-declares charset::IconvWrapper, which the conv member's
// scoped_ptr has to destroy.
Thesaurus::~Thesaurus() { }
void Thesaurus::Lookup(std::string const& word, std::vector<Entry> *out) {
out->clear();
std::map<std::string, int>::const_iterator it = offsets.find(word);
if (!dat.get() || it == offsets.end()) return;
dat->seekg(it->second, std::ios::beg);
if (!dat->good()) return;
// First line is the word and meaning count
std::string temp;
getline(*dat, temp);
std::vector<std::string> header;
split(conv->Convert(temp), '|', &header);
if (header.size() != 2) return;
int meanings = atoi(header[1].c_str());
out->resize(meanings);
for (int i = 0; i < meanings; ++i) {
std::vector<std::string> line;
getline(*dat, temp);
split(conv->Convert(temp), '|', &line);
// The "definition" is just the part of speech plus the word it's
// giving synonyms for (which may not be the passed word)
(*out)[i].first = line[0] + ' ' + line[1];
(*out)[i].second.reserve(line.size() - 2);
copy(line.begin() + 2, line.end(), back_inserter((*out)[i].second));
}
}
}

View file

@ -0,0 +1,58 @@
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// $Id$
/// @file thesaurus.h
/// @brief MyThes-compatible thesaurus implementation
/// @ingroup libaegisub thesaurus
#include <libaegisub/scoped_ptr.h>
#ifndef LAGI_PRE
#include <iosfwd>
#include <map>
#include <string>
#include <vector>
#endif
namespace agi {
namespace charset { class IconvWrapper; }
/// @class Thesaurus
/// @brief Reader for a thesaurus stored as an index file plus a data file
///
/// The index is loaded into memory by the constructor; the data file is kept
/// open and read on each lookup.
class Thesaurus {
/// Map of word -> byte position in the data file
std::map<std::string, int> offsets;
/// Read handle to the data file
scoped_ptr<std::ifstream> dat;
/// Converter from the data file's charset to UTF-8
scoped_ptr<charset::IconvWrapper> conv;
public:
/// A pair of a description of one meaning (the part of speech followed by
/// the word the synonyms are for) and the synonyms for that meaning
typedef std::pair<std::string, std::vector<std::string> > Entry;
/// Constructor
/// @param dat_path Path to data file
/// @param idx_path Path to index file
Thesaurus(std::string const& dat_path, std::string const& idx_path);
/// Destructor; declared here and defined in thesaurus.cpp
~Thesaurus();
/// Look up synonyms for a word
/// @param word Word to look up
/// @param[out] out Vector to fill with one entry per meaning; any existing
///                 contents are discarded first
void Lookup(std::string const& word, std::vector<Entry> *out);
};
}

View file

@ -192,7 +192,6 @@ SRC += \
menu.cpp \
md5.c \
mkv_wrap.cpp \
mythes.cxx \
pen.cpp \
persist_location.cpp \
plugin_manager.cpp \
@ -221,7 +220,6 @@ SRC += \
text_file_reader.cpp \
text_file_writer.cpp \
thesaurus.cpp \
thesaurus_myspell.cpp \
timeedit_ctrl.cpp \
threaded_frame_source.cpp \
toggle_bitmap.cpp \

View file

@ -97,7 +97,6 @@ AboutScreen::AboutScreen(wxWindow *parent)
#ifdef WITH_FREETYPE2
libString += " Freetype - Copyright (c) David Turner, Robert Wilhelm, Werner Lemberg;\n";
#endif
libString += " MyThes - Copyright (c) Kevin B. Hendricks, Stratford, Ontario, Canada.\n";
#ifdef WITH_FFTW3
libString += " FFTW - Copyright (c) Matteo Frigo, Massachusetts Institute of Technology;\n";
#endif

View file

@ -1,398 +0,0 @@
/*
* Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include "mythes.hxx"
// some basic utility routines
// duplicate a null terminated string into malloc'ed storage
// returns: the copy (to be released with free) or NULL if
// the allocation failed
char * mythes_mystrdup(const char * p)
{
	int bytes = strlen(p) + 1;
	char * copy = (char *)malloc(bytes);
	if (!copy) return NULL;
	// bytes includes the terminator, so the copy is null terminated
	memcpy(copy,p,bytes);
	return copy;
}
// remove cross-platform text line end characters
// strips one trailing "\n", "\r" or "\r\n" from s in place.
// a '\r' in front of the last character is only removed when
// the last character itself was a line end, so the interior
// of an unterminated line is never truncated
void mythes_mychomp(char * s)
{
	size_t k = strlen(s);
	if (k == 0) return;

	char last = s[k-1];
	if (last != '\r' && last != '\n') return;
	s[k-1] = '\0';

	if ((k > 1) && (s[k-2] == '\r')) s[k-2] = '\0';
}
// return the index of the first occurrence of c in d,
// or -1 if c does not occur
int mystr_indexOfChar(const char * d, int c)
{
	const char * hit = strchr(d,c);
	return hit ? (int)(hit - d) : -1;
}
// open the thesaurus stored in the given index and data files
// on failure an error is printed to stderr and the object is
// left empty, so that Lookup finds nothing and the destructor
// has nothing left to release
MyThes::MyThes(const char* idxpath, const char * datpath)
{
	nw = 0;
	encoding = NULL;
	list = NULL;
	offst = NULL;
	// thCleanup tests pdfile, so it must not be left uninitialized
	// when thInitialize fails before opening the data file
	pdfile = NULL;

	if (thInitialize(idxpath, datpath) != 1) {
		fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
		fflush(stderr);

		// release whatever was loaded before the failure and reset
		// every member, so the destructor does not free it again
		if (list) {
			for (int i = 0; i < nw; i++) free(list[i]);
			free((void*)list);
			list = NULL;
		}
		nw = 0;
		if (offst) free((void*)offst);
		offst = NULL;
		if (encoding) free((void*)encoding);
		encoding = NULL;
		// did not initialize properly - throw exception?
	}
}
// release everything owned by the thesaurus
MyThes::~MyThes()
{
// thCleanup closes the data file and frees list and offst
if (thCleanup() != 1) {
/* did not cleanup properly - throw exception? */
}
// the encoding name is the one allocation thCleanup does not free
if (encoding) free((void*)encoding);
encoding = NULL;
// list and offst were already freed by thCleanup; just drop the pointers
list = NULL;
offst = NULL;
}
// load the index file into list/offst and open the data file
// returns: 1 on success, 0 on failure
// on failure anything already stored in the members is left
// for the caller to release
int MyThes::thInitialize(const char* idxpath, const char* datpath)
{
	// open the index file
	FILE * pifile = fopen(idxpath,"r");
	if (!pifile) return 0;

	// line buffer shared by all reads of the index
	char * wrd = (char *)calloc(1, MAX_WD_LEN);
	if (!wrd) {
		fclose(pifile);
		return 0;
	}

	// parse in encoding and index size
	readLine(pifile,wrd,MAX_WD_LEN);
	encoding = mythes_mystrdup(wrd);
	readLine(pifile,wrd,MAX_WD_LEN);
	int idxsz = atoi(wrd);

	// now allocate list, offst for the given size
	list = (char**) calloc(idxsz,sizeof(char*));
	offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
	if ( (!(list)) || (!(offst)) ) {
		fprintf(stderr,"Error - bad memory allocation\n");
		fflush(stderr);
		free((void *)wrd);
		fclose(pifile);
		return 0;
	}

	// now parse the remaining lines of the index, stopping at the
	// first empty line or read failure
	int len = readLine(pifile,wrd,MAX_WD_LEN);
	while (len > 0)
	{
		int np = mystr_indexOfChar(wrd,'|');
		// entries beyond the declared size and lines with no '|' are ignored
		if ((nw < idxsz) && (np >= 0)) {
			*(wrd+np) = '\0';
			list[nw] = (char *)calloc(1,(np+1));
			if (!list[nw]) {
				fprintf(stderr,"Error - bad memory allocation\n");
				fflush(stderr);
				free((void *)wrd);
				fclose(pifile);
				return 0;
			}
			memcpy((list[nw]),wrd,np);
			offst[nw] = atoi(wrd+np+1);
			nw++;
		}
		len = readLine(pifile,wrd,MAX_WD_LEN);
	}
	free((void *)wrd);
	fclose(pifile);

	/* next open the data file */
	pdfile = fopen(datpath,"r");
	if (!pdfile) return 0;

	return 1;
}
// close the data file and release the index
// every member is reset after being released, so calling this
// more than once is harmless
// returns: 1 (always)
int MyThes::thCleanup()
{
	/* first close the data file */
	if (pdfile) {
		fclose(pdfile);
		pdfile=NULL;
	}

	/* now free up all the allocated strings on the list */
	if (list) {
		for (int i=0; i < nw; i++) free(list[i]);
		free((void*)list);
		list = NULL;
	}

	if (offst) {
		free((void*)offst);
		offst = NULL;
	}

	nw = 0;
	return 1;
}
// lookup text in index and return the count of meanings and a list of
// meaning entries, with each entry having a definition, a synonym count
// and a pointer to an array of char * (i.e the synonyms)
//
// pText need not be null terminated; exactly len bytes of it are used.
// returns: the number of meanings stored in *pme, or 0 (with *pme NULL)
// if the data file is not open, the word is not in the index, or the
// entry cannot be read
//
// note: calling routine should call CleanUpAfterLookup with the original
// meaning pointer and count to properly deallocate memory
int MyThes::Lookup(const char * pText, int len, mentry** pme)
{
*pme = NULL;
// handle the case of missing file or file related errors
if (! pdfile) return 0;
long offset = 0;
/* copy search word and make sure null terminated */
// NOTE(review): the calloc result is not checked before the memcpy
char * wrd = (char *) calloc(1,(len+1));
memcpy(wrd,pText,len);
/* find it in the list */
int idx = binsearch(wrd,list,nw);
free(wrd);
if (idx < 0) return 0;
// now seek to the offset
offset = (long) offst[idx];
int rc = fseek(pdfile,offset,SEEK_SET);
if (rc) {
return 0;
}
// grab the count of the number of meanings
// and allocate a list of meaning entries
char * buf = NULL;
buf = (char *) malloc( MAX_LN_LEN );
if (!buf) return 0;
// NOTE(review): the readLine result is ignored, so on a read failure buf
// is parsed without having been written
readLine(pdfile, buf, (MAX_LN_LEN-1));
// the header line is "word|count"
int np = mystr_indexOfChar(buf,'|');
if (np < 0) {
free(buf);
return 0;
}
// NOTE(review): the count comes straight from the file and is not range
// checked before being used as an allocation size
int nmeanings = atoi(buf+np+1);
*pme = (mentry*) malloc( nmeanings * sizeof(mentry) );
if (!(*pme)) {
free(buf);
return 0;
}
// now read in each meaning and parse it to get defn, count and synonym lists
mentry* pm = *(pme);
// scratch buffer for building "pos synonym"
char dfn[MAX_WD_LEN];
for (int j = 0; j < nmeanings; j++) {
// NOTE(review): result ignored here too; a short file makes the previous
// line's (already modified) contents get parsed again
readLine(pdfile, buf, (MAX_LN_LEN-1));
pm->count = 0;
pm->psyns = NULL;
pm->defn = NULL;
// store away the part of speech for later use
char * p = buf;
char * pos = NULL;
np = mystr_indexOfChar(p,'|');
if (np >= 0) {
// terminate the first field in place and continue after it
*(buf+np) = '\0';
pos = mythes_mystrdup(p);
p = p + np + 1;
} else {
pos = mythes_mystrdup("");
}
// count the number of fields in the remaining line
int nf = 1;
char * d = p;
np = mystr_indexOfChar(d,'|');
while ( np >= 0 ) {
nf++;
d = d + np + 1;
np = mystr_indexOfChar(d,'|');
}
pm->count = nf;
// NOTE(review): this malloc and the mythes_mystrdup results below are
// used without a NULL check
pm->psyns = (char **) malloc(nf*sizeof(char*));
// fill in the synonym list
d = p;
// this j is a new variable which hides the meaning index of the outer loop
for (int j = 0; j < nf; j++) {
np = mystr_indexOfChar(d,'|');
// NOTE(review): with np > 0 an empty field (np == 0) takes the else
// branch, which copies the whole rest of the line and does not advance d
if (np > 0) {
*(d+np) = '\0';
pm->psyns[j] = mythes_mystrdup(d);
d = d + np + 1;
} else {
pm->psyns[j] = mythes_mystrdup(d);
}
}
// add pos to first synonym to create the definition
int k = strlen(pos);
int m = strlen(pm->psyns[0]);
if ((k+m) < (MAX_WD_LEN - 1)) {
// dfn only becomes null terminated with the second strncpy, which
// copies m+1 bytes and so includes the synonym's terminator
strncpy(dfn,pos,k);
*(dfn+k) = ' ';
strncpy((dfn+k+1),(pm->psyns[0]),m+1);
pm->defn = mythes_mystrdup(dfn);
} else {
// too long for dfn: use the first synonym on its own
pm->defn = mythes_mystrdup(pm->psyns[0]);
}
free(pos);
pm++;
}
free(buf);
return nmeanings;
}
// release the meaning list produced by Lookup
// pme must be the original pointer filled in by Lookup and
// nmeanings the count it returned; *pme is NULL afterwards
void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
{
	if (nmeanings == 0 || (*pme) == NULL) return;

	mentry * entries = *pme;
	for (int i = 0; i < nmeanings; i++) {
		mentry & cur = entries[i];

		// each synonym, then the array holding them
		int total = cur.count;
		for (int j = 0; j < total; j++) {
			free(cur.psyns[j]);
			cur.psyns[j] = NULL;
		}
		free(cur.psyns);
		cur.psyns = NULL;

		free(cur.defn);
		cur.defn = NULL;
		cur.count = 0;
	}

	// finally the array of entries itself
	free(entries);
	*pme = NULL;
}
// read one line of text from pf into buf, removing the line
// terminator so that buf holds a plain null terminated string
// at most nc-1 characters are stored, as with fgets
// returns: the length of the string now in buf, or -1 if
// nothing could be read
int MyThes::readLine(FILE * pf, char * buf, int nc)
{
	if (!fgets(buf,nc,pf)) return -1;

	mythes_mychomp(buf);
	return strlen(buf);
}
// binary search for sw in list[], an array of nlst null terminated
// strings ordered as strcmp orders them
//
// returns: -1 on not found
// index of sw in the list[]
int MyThes::binsearch(char * sw, char* list[], int nlst)
{
	if (nlst == 0) return -1;

	int lo = 0;
	int hi = nlst-1;

	// reject anything outside the range covered by the list
	if (strcmp(sw,list[lo]) < 0) return -1;
	if (strcmp(sw,list[hi]) > 0) return -1;

	while (lo <= hi) {
		int mid = (int)((lo+hi) >> 1);
		int cmp = strcmp(sw,list[mid]);
		if (cmp == 0) return mid;
		if (cmp > 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return -1;
}
// returns: the encoding name read from the index file, or NULL
// if none is stored; the string remains owned by this object
char * MyThes::get_th_encoding()
{
	return encoding;
}

View file

@ -1,103 +0,0 @@
/*
* Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#ifndef _MYTHES_HXX_
#define _MYTHES_HXX_
// some maximum sizes for buffers
#define MAX_WD_LEN 200
#define MAX_LN_LEN 16384
// a meaning with definition, count of synonyms and synonym list
// filled in by MyThes::Lookup and released by MyThes::CleanUpAfterLookup
struct mentry {
// part of speech followed by the first synonym, or just the first synonym
// when the combination would be too long
char* defn;
// number of entries in psyns
int count;
// array of count synonym strings
char** psyns;
};
// reader for a thesaurus made up of an index file and a data file
// NOTE(review): FILE is used below but this header includes nothing itself,
// so the including file has to provide <stdio.h> first
class MyThes
{
int nw; /* number of entries in thesaurus */
char** list; /* stores word list */
unsigned int* offst; /* stores offset list */
char * encoding; /* stores text encoding; */
FILE *pdfile; /* open handle to the data file */
// disallow default construction, copy-constructor and
// assignment-operator for now (declared private, never defined)
MyThes();
MyThes(const MyThes &);
MyThes & operator = (const MyThes &);
public:
// load the index at idxpath and open the data file at datpath
MyThes(const char* idxpath, const char* datpath);
~MyThes();
// lookup text in index and return number of meanings
// each meaning entry has a definition, synonym count and pointer
// when complete return the *original* meaning entry and count via
// CleanUpAfterLookup to properly handle memory deallocation
int Lookup(const char * pText, int len, mentry** pme);
// free a meaning list returned by Lookup and set *pme to NULL
void CleanUpAfterLookup(mentry** pme, int nmean);
// name of the text encoding as read from the index file
char* get_th_encoding();
private:
// Open index and dat files and load list array
int thInitialize (const char* indxpath, const char* datpath);
// internal close and cleanup dat and idx files
int thCleanup ();
// read a text line (\n terminated) stripping off line terminator
int readLine(FILE * pf, char * buf, int nc);
// binary search on null terminated character strings
int binsearch(char * wrd, char* list[], int nlst);
};
#endif

View file

@ -79,7 +79,7 @@ enum {
SubsTextEditCtrl::SubsTextEditCtrl(wxWindow* parent, wxSize wsize, long style, SubtitlesGrid *grid)
: ScintillaTextCtrl(parent, -1, "", wxDefaultPosition, wsize, style)
, spellchecker(SpellCheckerFactory::GetSpellChecker())
, thesaurus(Thesaurus::GetThesaurus())
, thesaurus(new Thesaurus)
, grid(grid)
{
// Set properties
@ -795,14 +795,15 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
// Thesaurus
if (thesaurus.get() && currentWord.Length()) {
// Get results
ThesaurusEntryArray result;
thesaurus->Lookup(currentWord,result);
std::vector<Thesaurus::Entry> result;
thesaurus->Lookup(currentWord,&result);
// Compile list
thesSugs.Clear();
for (unsigned int i=0;i<result.size();i++) {
for (unsigned int j=0;j<result[i].words.Count();j++) {
thesSugs.Add(result[i].words[j]);
thesSugs.clear();
thesSugs.reserve(result.size() * 5);
for (size_t i = 0; i < result.size(); ++i) {
for (size_t j = 0; j < result[i].second.size(); ++j) {
thesSugs.push_back(result[i].second[j]);
}
}
@ -815,10 +816,10 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
// Build menu
int curThesEntry = 0;
for (unsigned int i=0;i<result.size();i++) {
for (size_t i=0;i<result.size();i++) {
// Single word, insert directly
if (result[i].words.Count() == 1) {
thesMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].name);
if (result[i].second.size() == 1) {
thesMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].first));
curThesEntry++;
}
@ -826,13 +827,13 @@ void SubsTextEditCtrl::OnContextMenu(wxContextMenuEvent &event) {
else {
// Insert entries
wxMenu *subMenu = new wxMenu();
for (unsigned int j=0;j<result[i].words.Count();j++) {
subMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,result[i].words[j]);
for (size_t j=0;j<result[i].second.size();j++) {
subMenu->Append(EDIT_MENU_THESAURUS_SUGS+curThesEntry,lagi_wxString(result[i].second[j]));
curThesEntry++;
}
// Insert submenu
thesMenu->Append(-1, result[i].name, subMenu);
thesMenu->Append(-1, lagi_wxString(result[i].first), subMenu);
}
}
@ -911,7 +912,7 @@ void SubsTextEditCtrl::OnUseSuggestion(wxCommandEvent &event) {
wxString suggestion;
int sugIdx = event.GetId() - EDIT_MENU_THESAURUS_SUGS;
if (sugIdx >= 0) {
suggestion = thesSugs[sugIdx];
suggestion = lagi_wxString(thesSugs[sugIdx]);
}
else {
suggestion = sugs[event.GetId() - EDIT_MENU_SUGGESTIONS];
@ -953,10 +954,9 @@ void SubsTextEditCtrl::OnSetThesLanguage(wxCommandEvent &event) {
// Set language
int index = event.GetId() - EDIT_MENU_THES_LANGS - 1;
if (index >= 0) {
thesaurus->SetLanguage(langs[index]);
OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(langs[index]));
}
wxString lang;
if (index >= 0) lang = langs[index];
OPT_SET("Tool/Thesaurus/Language")->SetString(STD_STR(lang));
UpdateStyle();
}

View file

@ -67,7 +67,7 @@ class SubsTextEditCtrl : public ScintillaTextCtrl {
wxArrayString sugs;
/// DOCME
wxArrayString thesSugs;
std::vector<std::string> thesSugs;
/// DOCME
int currentWordPos;

View file

@ -1,4 +1,4 @@
// Copyright (c) 2006, Rodrigo Braz Monteiro
// Copyright (c) 2011, Thomas Goyne <plorkyeran@aegisub.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@ -30,29 +30,111 @@
// $Id$
/// @file thesaurus.cpp
/// @brief Base-class for thesaurus implementations
/// @brief Thesaurus implementation
/// @ingroup thesaurus
///
///////////
// Headers
#include "config.h"
#include "thesaurus_myspell.h"
#include "thesaurus.h"
#ifndef AGI_PRE
#include <wx/dir.h>
#include <wx/filename.h>
#endif
/// @brief Get spell checker
///
Thesaurus *Thesaurus::GetThesaurus() {
// Initialize
Thesaurus *thes = NULL;
#include <libaegisub/log.h>
#include <libaegisub/thesaurus.h>
// Get myspell
thes = new MySpellThesaurus();
#include "compat.h"
#include "main.h"
#include "standard_paths.h"
// Return
return thes;
/// @brief Constructor
///
/// Registers OnLanguageChanged and OnPathChanged against the
/// "Tool/Thesaurus/Language" and "Path/Dictionary" options (presumably so
/// they run whenever those options change -- confirm against OPT_SUB), then
/// loads the thesaurus for the currently configured language.
Thesaurus::Thesaurus()
: lang_listener(OPT_SUB("Tool/Thesaurus/Language", &Thesaurus::OnLanguageChanged, this))
, dict_path_listener(OPT_SUB("Path/Dictionary", &Thesaurus::OnPathChanged, this))
{
// Nothing has changed yet, so do the initial load by hand
OnLanguageChanged();
}
/// @brief Destructor
Thesaurus::~Thesaurus() {
// Explicit empty destructor needed for scoped_ptr with incomplete types:
// agi::Thesaurus is only forward-declared in the header, so impl has to be
// destroyed from this file, where the full definition is included
}
/// Get a list of synonyms for a word, grouped by possible meanings of the word
/// @param word Word to get synonyms for; it is lowercased before the lookup
/// @param[out] result Output list; left empty if no thesaurus is loaded
void Thesaurus::Lookup(wxString const& word, std::vector<Entry> *result) {
	if (!impl.get()) {
		// Keep the output well-defined even when there's nothing to ask, so
		// that a reused vector never appears to hold results for this word
		result->clear();
		return;
	}

	impl->Lookup(STD_STR(word.Lower()), result);
}
/// Get a list of language codes which thesauri are available for
/// @return Language codes for which both an idx and a dat file were found in
///         the same directory, without duplicates
///
/// The result is cached until OnPathChanged clears it.
wxArrayString Thesaurus::GetLanguageList() const {
	if (!languages.empty()) return languages;

	wxArrayString idx, dat;

	// Get list of dictionaries
	wxString path = StandardPaths::DecodePath("?data/dictionaries/");
	if (wxFileName::DirExists(path)) {
		wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES);
		wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES);
	}
	path = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/");
	if (wxFileName::DirExists(path)) {
		wxDir::GetAllFiles(path, &idx, "th_*.idx", wxDIR_FILES);
		wxDir::GetAllFiles(path, &dat, "th_*.dat", wxDIR_FILES);
	}
	if (idx.empty() || dat.empty()) return languages;

	// Drop the four-character extensions so that the two files of a pair
	// compare equal. The directory is deliberately kept, as both files have
	// to be in the same place to be loadable.
	for (size_t i = 0; i < idx.size(); ++i) idx[i] = idx[i].Left(idx[i].size() - 4);
	for (size_t i = 0; i < dat.size(); ++i) dat[i] = dat[i].Left(dat[i].size() - 4);

	// Sort only after stripping: the walk below compares the stripped names,
	// and the extension's '.' can change the relative order of two names
	idx.Sort();
	dat.Sort();

	// Verify that each idx has a dat
	for (size_t i = 0, j = 0; i < idx.size() && j < dat.size(); ) {
		int cmp = idx[i].Cmp(dat[j]);
		if (cmp < 0) ++i;
		else if (cmp > 0) ++j;
		else {
			// Don't insert a language twice if it's in both the user dir and
			// the app's dir. The entries are ordered by directory first, so
			// duplicates are not necessarily next to each other.
			wxString name = wxFileName(dat[j]).GetName().Mid(3);
			if (languages.Index(name) == wxNOT_FOUND)
				languages.push_back(name);
			++i;
			++j;
		}
	}
	return languages;
}
void Thesaurus::OnLanguageChanged() {
impl.reset();
std::string language = OPT_GET("Tool/Thesaurus/Language")->GetString();
if (language.empty()) return;
wxString path = StandardPaths::DecodePath(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()) + "/");
// Get index and data paths
wxString idxpath = wxString::Format("%s/th_%s.idx", path, language);
wxString datpath = wxString::Format("%s/th_%s.dat", path, language);
// If they aren't in the user dictionary path, check the application directory
if (!wxFileExists(idxpath) || !wxFileExists(datpath)) {
path = StandardPaths::DecodePath("?data/dictionaries/");
idxpath = wxString::Format("%s/th_%s.idx", path, language);
datpath = wxString::Format("%s/th_%s.dat", path, language);
if (!wxFileExists(idxpath) || !wxFileExists(datpath)) return;
}
LOG_I("thesaurus/file") << "Using thesaurus: " << datpath.c_str();
impl.reset(new agi::Thesaurus(STD_STR(datpath), STD_STR(idxpath)));
}
/// Thesaurus path change handler
///
/// Discards the cached language list so that the next call to
/// GetLanguageList rescans the dictionary directories.
// NOTE(review): the loaded thesaurus is left as it is, even though
// OnLanguageChanged also reads "Path/Dictionary" -- confirm that is intended
void Thesaurus::OnPathChanged() {
languages.clear();
}

View file

@ -1,4 +1,4 @@
// Copyright (c) 2006, Rodrigo Braz Monteiro
// Copyright (c) 2011, Thomas Goyne <plorkyeran@aegisub.org>
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
@ -34,9 +34,6 @@
/// @ingroup thesaurus
///
///////////
// Headers
#ifndef AGI_PRE
#include <vector>
@ -44,50 +41,40 @@
#include <wx/string.h>
#endif
#include <libaegisub/scoped_ptr.h>
#include <libaegisub/signal.h>
/// DOCME
/// @class ThesaurusEntry
/// @brief DOCME
///
/// DOCME
class ThesaurusEntry {
public:
namespace agi { class Thesaurus; }
/// DOCME
wxString name;
/// DOCME
wxArrayString words;
};
/// DOCME
typedef std::vector<ThesaurusEntry> ThesaurusEntryArray;
/// DOCME
/// @class Thesaurus
/// @brief DOCME
///
/// DOCME
/// @brief A wrapper around agi::Thesaurus adding wx and Aegisub-specific stuff
class Thesaurus {
/// The actual thesaurus implementation
agi::scoped_ptr<agi::Thesaurus> impl;
/// A cached list of languages available
mutable wxArrayString languages;
/// Thesaurus language change slot
agi::signal::Connection lang_listener;
/// Thesaurus language change handler
void OnLanguageChanged();
/// Thesaurus path change slot
agi::signal::Connection dict_path_listener;
/// Thesaurus path change handler
void OnPathChanged();
public:
static Thesaurus *GetThesaurus();
/// A pair of a word and synonyms for that word
typedef std::pair<std::string, std::vector<std::string> > Entry;
Thesaurus();
~Thesaurus();
/// @brief DOCME
///
Thesaurus() {}
/// Get a list of synonyms for a word, grouped by possible meanings of the word
/// @param word Word to get synonyms for
/// @param[out] result Output list
void Lookup(wxString const& word, std::vector<Entry> *result);
/// @brief DOCME
///
virtual ~Thesaurus() {}
virtual void Lookup(wxString word,ThesaurusEntryArray &result)=0;
virtual wxArrayString GetLanguageList()=0;
virtual void SetLanguage(wxString language)=0;
/// Get a list of language codes which thesauri are available for
wxArrayString GetLanguageList() const;
};

View file

@ -1,176 +0,0 @@
// Copyright (c) 2006, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the Aegisub Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Aegisub Project http://www.aegisub.org/
//
// $Id$
/// @file thesaurus_myspell.cpp
/// @brief MySpell-based thesaurus implementation
/// @ingroup thesaurus
///
///////////
// Headers
#include "config.h"
#ifndef AGI_PRE
#include <wx/dir.h>
#include <wx/filename.h>
#include <wx/log.h>
#endif
#include <libaegisub/log.h>
#include "compat.h"
#include "mythes.hxx"
#include "main.h"
#include "standard_paths.h"
#include "thesaurus_myspell.h"
#include "utils.h"
/// @brief Constructor
///
/// Starts with nothing loaded, then loads the thesaurus for the language
/// set in the "Tool/Thesaurus/Language" option.
MySpellThesaurus::MySpellThesaurus()
{
	// SetLanguage deletes both pointers, so they must be null going in
	mythes = NULL;
	conv = NULL;

	wxString language = lagi_wxString(OPT_GET("Tool/Thesaurus/Language")->GetString());
	SetLanguage(language);
}
/// @brief Destructor
///
/// Deletes the loaded thesaurus and its charset converter, if any.
MySpellThesaurus::~MySpellThesaurus() {
delete mythes;
mythes = NULL;
delete conv;
conv = NULL;
}
/// @brief Get suggestions
/// @param word Word to look up; it is lowercased before the lookup
/// @param result Array which one entry per meaning is appended to
///
/// Does nothing if no thesaurus is loaded or the word cannot be converted to
/// the thesaurus's charset.
void MySpellThesaurus::Lookup(wxString word,ThesaurusEntryArray &result) {
	// Loaded?
	if (!mythes) return;

	// MyThes works on text in the thesaurus's own charset
	mentry *meanings;
	wxCharBuffer encoded = word.Lower().mb_str(*conv);
	if (!encoded) return;
	int count = mythes->Lookup(encoded,strlen(encoded),&meanings);

	// Convert each meaning and its synonyms back to wxStrings
	for (int i = 0; i < count; ++i) {
		mentry const& meaning = meanings[i];

		ThesaurusEntry entry;
		entry.name = wxString(meaning.defn,*conv);
		for (int j = 0; j < meaning.count; ++j)
			entry.words.Add(wxString(meaning.psyns[j],*conv));
		result.push_back(entry);
	}

	// The raw entries are owned by MyThes and have to be handed back
	mythes->CleanUpAfterLookup(&meanings,count);
}
/// @brief Get language list
/// @return Names of the .idx files in the dictionary directory which have a
///         matching .dat file, with any "th_" prefix removed
///
wxArrayString MySpellThesaurus::GetLanguageList() {
	wxArrayString list;

	// Get dir name
	wxString path = StandardPaths::DecodePathMaybeRelative(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()), _T("?data")) + _T("/");
	wxFileName folder(path);
	if (!folder.DirExists()) return list;

	// Get file lists
	wxArrayString idx;
	wxDir::GetAllFiles(path,&idx,_T("*.idx"),wxDIR_FILES);
	wxArrayString dat;
	wxDir::GetAllFiles(path,&dat,_T("*.dat"),wxDIR_FILES);

	// For each index file, see if the corresponding .dat exists
	for (unsigned int i = 0; i < idx.Count(); ++i) {
		wxString curdat = idx[i].Left(std::max(0,signed(idx[i].Length())-4)) + _T(".dat");
		if (dat.Index(curdat) == wxNOT_FOUND) continue;

		// Found match
		wxString name = wxFileName(curdat).GetName();
		if (name.Left(3) == _T("th_")) name = name.Mid(3);
		list.Add(name);
	}

	// Return list
	return list;
}
/// @brief Set language
/// @param language Language code to load, or an empty string to unload
///
/// The current thesaurus is always unloaded first; if the files for the new
/// language are missing, nothing is left loaded.
void MySpellThesaurus::SetLanguage(wxString language) {
	// Unload
	delete mythes;
	mythes = NULL;
	delete conv;
	conv = NULL;

	// Unloading
	if (language.IsEmpty()) return;

	// Get dir name
	wxString path = StandardPaths::DecodePathMaybeRelative(lagi_wxString(OPT_GET("Path/Dictionary")->GetString()), _T("?data")) + _T("/");

	// Get index and data paths, which only differ in their extension
	wxString base = path + _T("th_") + language;
	wxString idxpath = base + _T(".idx");
	wxString datpath = base + _T(".dat");

	// Check if language is available
	if (!wxFileExists(idxpath)) return;
	if (!wxFileExists(datpath)) return;

	LOG_I("thesaurus/file") << "Using thesaurus: " << datpath.c_str();

	// Load
	mythes = new MyThes(idxpath.mb_str(wxConvLocal),datpath.mb_str(wxConvLocal));
	conv = NULL;
	// The data's charset is named by the index file
	if (mythes) conv = new wxCSConv(wxString(mythes->get_th_encoding(),wxConvUTF8));
}

View file

@ -1,73 +0,0 @@
// Copyright (c) 2006, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the Aegisub Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Aegisub Project http://www.aegisub.org/
//
// $Id$
/// @file thesaurus_myspell.h
/// @see thesaurus_myspell.cpp
/// @ingroup thesaurus
///
///////////
// Headers
#include "thesaurus.h"
//////////////
// Prototypes
class MyThes;
/// @class MySpellThesaurus
/// @brief Thesaurus implementation which delegates lookups to MyThes
///
/// Thesaurus files are expected as th_<language>.idx / th_<language>.dat
/// pairs in the configured dictionary directory.
class MySpellThesaurus : public Thesaurus {
	/// MyThes instance for the currently loaded language; SetLanguage()
	/// resets this to NULL when the thesaurus is unloaded
	MyThes *mythes;

	/// Converter between wxString and the encoding reported by the loaded
	/// thesaurus; replaced together with mythes
	wxCSConv *conv;

public:
	MySpellThesaurus();
	~MySpellThesaurus();

	/// Append the meanings and synonyms found for word to result
	void Lookup(wxString word, ThesaurusEntryArray &result);

	/// Get the languages for which thesaurus files are present
	wxArrayString GetLanguageList();

	/// Switch to the given language; an empty string unloads the thesaurus
	void SetLanguage(wxString language);
};

View file

@ -27,6 +27,7 @@ SRC = \
libaegisub_option.cpp \
libaegisub_mru.cpp \
libaegisub_signals.cpp \
libaegisub_thesaurus.cpp \
libaegisub_util.cpp \
libaegisub_vfr.cpp

View file

@ -0,0 +1,148 @@
// Copyright (c) 2012, Thomas Goyne <plorkyeran@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
//
// $Id$
#include <libaegisub/thesaurus.h>
#include "main.h"
#include "util.h"
#include <fstream>
/// Fixture which writes a small thesaurus index/data file pair, including
/// several deliberately broken index lines, for the tests to load
class lagi_thes : public libagi {
protected:
	std::string idx_path;
	std::string dat_path;

	/// Write data/thes.idx and data/thes.dat
	///
	/// Index lines have the form "word|offset", where the offset is the
	/// position in the data file at which the word's block starts; it is
	/// taken from tellp() right before the block is written.
	void SetUp() {
		idx_path = "data/thes.idx";
		dat_path = "data/thes.dat";

		std::ofstream index_file(idx_path.c_str());
		std::ofstream data_file(dat_path.c_str());

		// Both files begin with the name of their encoding
		index_file << "UTF-8" << std::endl;
		data_file << "UTF-8" << std::endl;

		index_file << 7 << std::endl; // entry count

		// One meaning with three synonyms
		index_file << "Word 1|" << data_file.tellp() << std::endl;
		data_file << "Word 1|1" << std::endl;
		data_file << "(noun)|Word 1|Word 1A|Word 1B|Word 1C" << std::endl;

		// Two meanings with one synonym each
		index_file << "Word 2|" << data_file.tellp() << std::endl;
		data_file << "Word 2|2" << std::endl;
		data_file << "(adj)|Word 2|Word 2 adj" << std::endl;
		data_file << "(noun)|Word 2|Word 2 noun" << std::endl;

		// Present in the data file only; no index line refers to it
		data_file << "Unindexed Word|1" << std::endl;
		data_file << "(adv)|Unindexed Word|Indexed Word" << std::endl;

		// Second field of the meaning line differs from the headword
		index_file << "Word 3|" << data_file.tellp() << std::endl;
		data_file << "Word 3|1" << std::endl;
		data_file << "(verb)|Not Word 3|Four" << std::endl;

		// Broken index lines; the data file is complete at this point, so
		// the last two offsets are at and one past its end
		index_file << "Too few fields" << std::endl;
		index_file << "Too many fields|100|100" << std::endl;
		index_file << "Not a number|foo" << std::endl;
		index_file << "Out of range|" << data_file.tellp() << std::endl;
		index_file << "Further out of range|" << 1 + data_file.tellp() << std::endl;
	}
};
/// Loading the fixture files, broken index lines included, must not throw
TEST_F(lagi_thes, parse) {
	ASSERT_NO_THROW({ agi::Thesaurus thesaurus(dat_path, idx_path); });
}
/// A word with a single meaning: the first two fields of the meaning line
/// form the entry name and the remaining fields are the synonyms
TEST_F(lagi_thes, word_1) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Word 1", &results));
	ASSERT_EQ(1u, results.size());

	const agi::Thesaurus::Entry &meaning = results.front();
	ASSERT_EQ(3u, meaning.second.size());
	EXPECT_STREQ("(noun) Word 1", meaning.first.c_str());

	const char *const synonyms[] = { "Word 1A", "Word 1B", "Word 1C" };
	for (size_t i = 0; i < 3; ++i)
		EXPECT_STREQ(synonyms[i], meaning.second[i].c_str());
}
/// A word with two meanings yields two entries, in data file order
TEST_F(lagi_thes, word_2) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Word 2", &results));
	ASSERT_EQ(2u, results.size());

	const agi::Thesaurus::Entry &adjective = results[0];
	const agi::Thesaurus::Entry &noun = results[1];

	ASSERT_EQ(1u, adjective.second.size());
	ASSERT_EQ(1u, noun.second.size());

	EXPECT_STREQ("(adj) Word 2", adjective.first.c_str());
	EXPECT_STREQ("(noun) Word 2", noun.first.c_str());

	EXPECT_STREQ("Word 2 adj", adjective.second[0].c_str());
	EXPECT_STREQ("Word 2 noun", noun.second[0].c_str());
}
/// The second field of a meaning line ends up in the entry name even when
/// it differs from the word which was looked up
TEST_F(lagi_thes, word_3) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Word 3", &results));
	ASSERT_EQ(1u, results.size());

	const agi::Thesaurus::Entry &meaning = results.front();
	ASSERT_EQ(1u, meaning.second.size());
	EXPECT_STREQ("(verb) Not Word 3", meaning.first.c_str());
	EXPECT_STREQ("Four", meaning.second[0].c_str());
}
/// Looking up a word which appears in neither file gives no entries and
/// does not throw
TEST_F(lagi_thes, bad_word) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Nonexistent word", &results));
	EXPECT_EQ(0u, results.size());
}
/// Lookup() replaces the contents of the output vector rather than
/// appending to it: only the single "Word 3" entry may remain afterwards
TEST_F(lagi_thes, lookup_clears) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Word 1", &results));
	ASSERT_NO_THROW(thesaurus.Lookup("Word 2", &results));
	ASSERT_NO_THROW(thesaurus.Lookup("Word 3", &results));

	EXPECT_EQ(1u, results.size());
}
/// Words whose index line is malformed or points outside the data file are
/// treated as not found rather than causing an exception
TEST_F(lagi_thes, malformed_index_lines) {
	static const char *const bad_words[] = {
		"Too few fields",
		"Too many fields",
		"Not a number",
		"Out of range",
		"Further out of range"
	};

	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	for (size_t i = 0; i < sizeof(bad_words) / sizeof(bad_words[0]); ++i) {
		ASSERT_NO_THROW(thesaurus.Lookup(bad_words[i], &results));
		// The word is streamed so a failure names the offending index line
		EXPECT_EQ(0u, results.size()) << bad_words[i];
	}
}
/// A word which exists in the data file but has no index line is not found
TEST_F(lagi_thes, unindexed_word) {
	std::vector<agi::Thesaurus::Entry> results;
	agi::Thesaurus thesaurus(dat_path, idx_path);

	ASSERT_NO_THROW(thesaurus.Lookup("Unindexed Word", &results));
	EXPECT_EQ(0u, results.size());
}