Aegisub/devel/traydict/dictionary.cpp
Amar Takhar aa506bfe40 SVN Transition Step 6/7
1. svn mv OverLua SSATool athenasub avisynth_prs kanamemo \
     motiontracker prs traydict unit_test vsfilter devel/

* See r2749 for full description.

Originally committed to SVN as r2755.
2009-03-08 08:31:54 +00:00

589 lines
14 KiB
C++

// Copyright (c) 2006, Rodrigo Braz Monteiro
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
// * Neither the name of the TrayDict Group nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// -----------------------------------------------------------------------------
//
// TRAYDICT
//
// Website: http://aegisub.cellosoft.com
// Contact: mailto:zeratul@cellosoft.com
//
///////////
// Headers
#include <wx/wxprec.h>
#include <wx/tokenzr.h>
#include <wx/filename.h>
#include <wx/zstream.h>
#include <wx/wfstream.h>
#include <stdio.h>
#include "dictionary.h"
#include "../aegisub/text_file_reader.h"
#include "main.h"
///////////////
// Constructor
Dictionary::Dictionary(wxString _name,wxCheckBox *_check) {
	// Keep the dictionary name and the checkbox associated with it
	this->name = _name;
	this->check = _check;

	// The dictionary file is "<folder><name>.dic"; hand its path to Load()
	const wxString dictPath = TrayDict::folderName + _name + _T(".dic");
	Load(dictPath);

	// The checkbox is only enabled when that file is present on disk
	wxFileName dictFileName(dictPath);
	const bool present = dictFileName.FileExists();
	check->Enable(present);
}
//////////////
// Destructor
// The destructor body is empty: nothing is released here, and the
// checkbox pointer handed to the constructor is not deleted.
Dictionary::~Dictionary() {
}
//////////
// Static
// Definition of the static kana table member of Dictionary.
// Convert() calls its KanaToRomaji() to derive the romaji field of each entry.
KanaTable Dictionary::kanatable;
////////
// Convert
void Dictionary::Convert(wxString source,wxString dest) {
// Variables
const wchar_t *str = NULL;
short len;
DictEntry curEntry;
try {
// Open source file
TextFileReader file(source,_T("EUC-JP"));
// Open destination file
wxFileOutputStream out(dest);
wxBufferedOutputStream buf(out);
wxZlibOutputStream fp(buf,9,wxZLIB_GZIP);
// Skip first line
if (file.HasMoreLines()) file.ReadLineFromFile();
// Read lines
while (file.HasMoreLines()) {
// Get string
wxString string = file.ReadLineFromFile();
// Process string to account for lack of kana
if (!string.Contains(_T("["))) {
if (!string.Contains(_T("]"))) {
int pos = string.Find(_T(' '));
if (pos == -1) continue;
wxString temp = string;
string = temp.Left(pos) + _T(" []") + temp.Right(temp.Length() - pos);
}
else continue;
}
// Tokenize
wxStringTokenizer token(string,_T("[]"),wxTOKEN_RET_EMPTY);
// Kanji
if (token.HasMoreTokens()) {
curEntry.kanji = token.GetNextToken().Trim(false).Trim(true);
}
else continue;
// Kana & romaji
if (token.HasMoreTokens()) {
curEntry.kana = token.GetNextToken().Trim(false).Trim(true);
if (curEntry.kana.IsEmpty()) curEntry.kana = curEntry.kanji;
curEntry.romaji = kanatable.KanaToRomaji(curEntry.kana,0);
}
else continue;
// English
if (token.HasMoreTokens()) {
curEntry.english = token.GetNextToken().Trim(false).Trim(true);
curEntry.english = curEntry.english.Mid(1,curEntry.english.Length()-2);
}
else continue;
// Write kanji
str = curEntry.kanji.c_str();
len = wcslen(str);
fp.Write(&len,2);
fp.Write(str,len*2);
// Write kana
str = curEntry.kana.c_str();
len = wcslen(str);
fp.Write(&len,2);
fp.Write(str,len*2);
// Write romaji
str = curEntry.romaji.c_str();
len = wcslen(str);
fp.Write(&len,2);
fp.Write(str,len*2);
// Write english
str = curEntry.english.c_str();
len = wcslen(str);
fp.Write(&len,2);
fp.Write(str,len*2);
}
}
catch (...) {
wxMessageBox(_T("Could not find dictionary file: ") + source,_T("File not found!"),wxICON_ERROR);
}
}
////////
// Load
void Dictionary::Load(wxString filename) {
	// Only the path is recorded here; the file itself is opened by Search()
	this->dictFile = filename;
}
//////////
// Search
void Dictionary::Search(ResultSet &results,wxString query) {
// Prepare results
int resCount = 0;
results.ownData = true;
results.dicName = name;
results.query = query;
query.Trim(true);
query.Trim(false);
// Determine query type
bool isJapanese = false;
for (size_t i=0;i<query.Length();i++) {
if (query[i] > 255) {
isJapanese = true;
break;
}
}
// Stopwatch
wxStopWatch stopwatch;
// Open file
wxInputStream *file;
wxFileInputStream filestream(dictFile);
wxBufferedInputStream buf(filestream);
wxZlibInputStream zstream(buf);
wxBufferedInputStream buf2(zstream);
file = &zstream;
// Buffer
wchar_t buffer[16384];
short len;
DictEntry *cur = NULL;
// Search for matches
while (!file->Eof()) {
// Prepare
bool addThis = false;
int rel = 0;
// Create data
if (!cur) cur = new DictEntry;
// Read kanji
file->Read(&len,2);
if (len < 0) return;
file->Read(buffer,2*len);
buffer[len] = 0;
cur->kanji = buffer;
// Read kana
file->Read(&len,2);
if (len < 0) return;
file->Read(buffer,2*len);
buffer[len] = 0;
cur->kana = buffer;
// Read romaji
file->Read(&len,2);
if (len < 0) return;
file->Read(buffer,2*len);
buffer[len] = 0;
cur->romaji = buffer;
// Read english
file->Read(&len,2);
if (len < 0) return;
file->Read(buffer,2*len);
buffer[len] = 0;
cur->english = buffer;
// Japanese query
if (isJapanese) {
// Matches kanji?
if (cur->kanji.Contains(query)) {
addThis = true;
rel = GetRelevance(query,cur->kanji,cur->english.Contains(_T("(P)")));
}
// Matches kana?
else if (cur->kana.Contains(query)) {
addThis = true;
rel = GetRelevance(query,cur->kana,cur->english.Contains(_T("(P)")));
}
}
// English/romaji query
else {
// Lowercase query
wxString lowQuery = query.Lower();
// Matches english?
if (cur->english.Lower().Contains(lowQuery)) {
addThis = true;
rel = GetRelevance(lowQuery,cur->english,cur->english.Contains(_T("(P)")),true);
}
// Matches wapuro romaji?
if (cur->romaji.Contains(lowQuery)) {
addThis = true;
rel = GetRelevance(lowQuery,cur->romaji,cur->english.Contains(_T("(P)")));
}
}
// Add entry
if (addThis) {
SearchResult res;
res.relevance = rel;
res.entry = cur;
results.results.push_back(res);
cur = NULL;
}
}
// Delete cur
if (cur) delete cur;
// Time
stopwatch.Pause();
results.time = stopwatch.Time();
// Close file
}
/////////////////
// Get relevancy
// Scores how well "substr" matches "_str"; the higher, the better.
//
// substr  - text being searched for
// _str    - dictionary field it was found in (lowercased before comparing)
// isPop   - adds a 5000 bonus to every candidate that matches
// english - when true, _str is split on '/' and each piece is scored both
//           with and without its parenthesised parts
//
// Returns the best score among all candidates, or 0 if none contains substr.
int Dictionary::GetRelevance(wxString substr,wxString _str,bool isPop,bool english) {
	// Build the list of candidate strings
	wxArrayString candidates;
	if (english) {
		wxStringTokenizer glosses(_str.Lower(),_T("/"));
		while (glosses.HasMoreTokens()) {
			wxString gloss = glosses.GetNextToken();

			// Copy the gloss, leaving out everything between parentheses
			wxString stripped;
			bool inParens = false;
			bool sawParens = false;
			for (size_t pos=0;pos<gloss.Length();pos++) {
				if (gloss[pos] == _T('(')) {
					inParens = true;
					sawParens = true;
				}
				else if (gloss[pos] == _T(')')) {
					// A closing parenthesis is only kept when it is unmatched
					if (!inParens) stripped += gloss[pos];
					inParens = false;
				}
				else if (!inParens) stripped += gloss[pos];
			}

			// The unmodified gloss goes first, when it had parentheses at all
			if (sawParens) candidates.Add(gloss);

			// Then the stripped and trimmed version
			stripped.Trim(true);
			stripped.Trim(false);
			candidates.Add(stripped);
		}
	}
	else candidates.Add(_str.Lower());

	// Score every candidate and remember the highest result
	int best = 0;
	for (size_t idx=0;idx<candidates.Count();idx++) {
		wxString candidate = candidates[idx];
		if (candidate.Contains(substr)) {
			int score = isPop ? 5000 : 0;

			if (substr == candidate) {
				// Exact match, can't get better
				score += 10000;
			}
			else {
				// Whole-word match (so that e.g. "car shed" ranks above "card" for "car")
				if (english) {
					const wxString paddedCandidate = _T(" ") + candidate + _T(" ");
					const wxString paddedNeedle = _T(" ") + substr + _T(" ");
					if (paddedCandidate.Contains(paddedNeedle)) score += 5000;
				}

				// The larger the share of the candidate covered by the match, the better
				const size_t fullLen = candidate.Length();
				const size_t needleLen = substr.Length();
				score += 1000 - (fullLen - needleLen)*1000/fullLen;

				// Position of the match inside the candidate
				int start = candidate.Find(substr);
				if (start == -1) throw 0;
				int fromStart = (fullLen - start)*500/fullLen + 1;
				int fromEnd = (start + needleLen)*500/fullLen;
				score += (fromStart > fromEnd) ? fromStart : fromEnd;
			}

			if (score > best) best = score;
		}
	}
	return best;
}
//////////////////////////
// Comparison for sorting
bool operator < (const SearchResult &a,const SearchResult &b) {
	// Deliberately reversed: the result with the higher relevance compares as "less"
	return b.relevance < a.relevance;
}
/////////////////
// Compact entry
void DictEntry::Compact() {
kanji.Trim(true).Trim(false).Shrink();
kana.Trim(true).Trim(false).Shrink();
english.Trim(true).Trim(false).Shrink();
}
///////////////
// Constructor
// A new result set does not own its entries until something sets ownData
ResultSet::ResultSet()
: ownData(false)
{
}
//////////////
// Destructor
ResultSet::~ResultSet() {
	// Entries are only deleted when this set owns them
	if (!ownData) return;

	std::list<SearchResult>::iterator it = results.begin();
	while (it != results.end()) {
		delete it->entry;
		++it;
	}
}
///////////////////
// Print resultset
/*
void ResultSet::Print(wxTextCtrl *target,int bitmask) {
// Get options
bool drawKanji = (bitmask & 1) != 0;
bool drawKana = (bitmask & 2) != 0;
bool drawRomaji = (bitmask & 4) != 0;
bool drawEnglish = (bitmask & 8) != 0;
// Fonts
wxFont font;
font.SetFaceName(_T("MS Mincho"));
font.SetPointSize(9);
wxFont font2;
font2.SetFaceName(_T("Tahoma"));
// Text attributes
wchar_t space = 0x3000;
wxString spaceStr = space;
wxTextAttr fontAttr;
fontAttr.SetFont(font);
wxTextAttr kanjiCol;
kanjiCol.SetFont(font);
kanjiCol.SetTextColour(wxColour(192,0,0));
wxTextAttr kanaCol;
kanaCol.SetTextColour(wxColour(0,0,192));
wxTextAttr romajiCol;
romajiCol.SetTextColour(wxColour(0,128,0));
wxTextAttr engCol;
engCol.SetFont(font2);
engCol.SetTextColour(wxColour(0,0,0));
wxTextAttr commonCol;
commonCol.SetTextColour(wxColour(0,128,128));
commonCol.SetFont(font2);
wxTextAttr sepCol;
sepCol.SetTextColour(wxColour(128,90,0));
wxTextAttr boldCol;
font2.SetWeight(wxBOLD);
boldCol.SetFont(font2);
wxTextAttr notBoldCol;
font2.SetWeight(wxNORMAL);
notBoldCol.SetFont(font);
// Find column widths
int kanjiWidth = 0;
int kanaWidth = 0;
int romajiWidth = 0;
std::list<SearchResult>::iterator cur;
DictEntry *entry;
int curLen;
int resPrinted = 0;
for (cur=results.begin();cur!=results.end();cur++) {
entry = cur->entry;
curLen = entry->kanji.Length();
if (curLen > kanjiWidth) kanjiWidth = curLen;
curLen = entry->kana.Length();
if (curLen > kanaWidth) kanaWidth = curLen;
wxString temp = Dictionary::kanatable.KanaToRomaji(entry->kana);
curLen = temp.Length() - temp.Freq(0x304);
if (curLen > romajiWidth) romajiWidth = curLen;
// Limit to 1000
resPrinted++;
if (resPrinted >= 1000) break;
}
// List number of results
target->SetDefaultStyle(boldCol);
int maxDisp = results.size();
if (maxDisp > 1000) maxDisp = 1000;
target->AppendText(wxString::Format(_T("Searched %s for \"%s\". Displaying %i matches. Search took %i ms.\n"),dicName.Upper().c_str(),query.c_str(),maxDisp,time));
target->SetDefaultStyle(notBoldCol);
target->SetDefaultStyle(fontAttr);
// Append to results
wxString curText;
resPrinted = 0;
for (cur=results.begin();cur!=results.end();cur++) {
entry = cur->entry;
// Write kanji
if (drawKanji) {
target->SetDefaultStyle(kanjiCol);
curText = entry->kanji;
curLen = kanjiWidth - curText.Length() + 1;
for (int i=0;i<curLen;i++) curText = curText + spaceStr;
target->AppendText(curText);
}
// Write kana
if (drawKana) {
target->SetDefaultStyle(kanaCol);
//curText = _T("[") + entry->kana + _T("]");
curText = entry->kana;
curLen = kanaWidth - curText.Length() + 1;
for (int i=0;i<curLen;i++) curText = curText + spaceStr;
target->AppendText(curText);
}
// Write romaji
if (drawRomaji) {
target->SetDefaultStyle(romajiCol);
curText = Dictionary::kanatable.KanaToRomaji(entry->kana);
curLen = romajiWidth - curText.Length() + curText.Freq(0x304) + 1;
for (int i=0;i<curLen;i++) curText = curText + _T(" ");
target->AppendText(curText);
}
// Write english
if (drawEnglish) {
// Search for grammatical class
wxString mainText;
int pos = entry->english.Find(_T(')'));
if (entry->english[0] == _T('(') && pos != -1) {
mainText = entry->english.Mid(pos+1);
mainText.Trim(false);
// Draw grammatical class
target->SetDefaultStyle(sepCol);
target->AppendText(entry->english.Left(pos+1) + _T(" "));
}
else mainText = entry->english;
target->SetDefaultStyle(engCol);
// Draw rest
target->SetDefaultStyle(engCol);
wxStringTokenizer tkn(mainText,_T("/"));
bool hadPrev = false;
while (tkn.HasMoreTokens()) {
// Popular entry
wxString token = tkn.GetNextToken();
if (token == _T("(P)")) {
target->SetDefaultStyle(commonCol);
target->AppendText(_T(" [Common]"));
target->SetDefaultStyle(engCol);
}
// Normal entry
else {
// Separator
if (hadPrev) {
target->SetDefaultStyle(sepCol);
target->AppendText(_T(" / "));
target->SetDefaultStyle(engCol);
}
// Append text
target->AppendText(token);
hadPrev = true;
}
}
}
// Line break
target->SetDefaultStyle(fontAttr);
target->AppendText(_T("\n"));
// Limit to 1000
resPrinted++;
if (resPrinted >= 1000) {
target->SetDefaultStyle(boldCol);
target->AppendText(wxString::Format(_T("Too many (%i) matches, stopping.\n"),results.size()));
break;
}
}
// Print two carriage returns
target->AppendText(_T("\n\n"));
}
*/