From 42e0dd6ce42064cd50603a822e59bbb404a9463f Mon Sep 17 00:00:00 2001 From: Amar Takhar Date: Fri, 9 Oct 2009 14:30:27 +0000 Subject: [PATCH] Update universalchardet using a patch I made around 2009-02, the one we're currently using is from ~1998. I'll check again later to see if there are any updates to it before closing the ticket. Updates #866. Originally committed to SVN as r3653. --- aegisub/src/charset_detect.cpp | 17 +- aegisub/src/charset_detect.h | 3 + aegisub/universalchardet/CharDistribution.cpp | 5 +- aegisub/universalchardet/CharDistribution.h | 24 +-- aegisub/universalchardet/JpCntx.cpp | 8 +- aegisub/universalchardet/JpCntx.h | 12 +- .../universalchardet/LangBulgarianModel.cpp | 10 +- .../universalchardet/LangCyrillicModel.cpp | 26 +-- aegisub/universalchardet/LangGreekModel.cpp | 10 +- aegisub/universalchardet/LangHebrewModel.cpp | 6 +- .../universalchardet/LangHungarianModel.cpp | 10 +- aegisub/universalchardet/LangThaiModel.cpp | 6 +- aegisub/universalchardet/nsBig5Prober.cpp | 7 +- aegisub/universalchardet/nsBig5Prober.h | 15 +- aegisub/universalchardet/nsCharSetProber.cpp | 4 +- aegisub/universalchardet/nsCharSetProber.h | 6 +- .../universalchardet/nsCodingStateMachine.h | 38 ++--- aegisub/universalchardet/nsEUCJPProber.cpp | 9 +- aegisub/universalchardet/nsEUCJPProber.h | 15 +- aegisub/universalchardet/nsEUCKRProber.cpp | 7 +- aegisub/universalchardet/nsEUCKRProber.h | 16 +- aegisub/universalchardet/nsEUCTWProber.cpp | 7 +- aegisub/universalchardet/nsEUCTWProber.h | 15 +- .../universalchardet/nsEscCharsetProber.cpp | 46 +++--- aegisub/universalchardet/nsEscCharsetProber.h | 10 +- aegisub/universalchardet/nsEscSM.cpp | 24 +-- aegisub/universalchardet/nsGB2312Prober.cpp | 7 +- aegisub/universalchardet/nsGB2312Prober.h | 15 +- aegisub/universalchardet/nsHebrewProber.h | 2 +- aegisub/universalchardet/nsLatin1Prober.cpp | 4 +- aegisub/universalchardet/nsLatin1Prober.h | 10 +- .../universalchardet/nsMBCSGroupProber.cpp | 48 +++--- aegisub/universalchardet/nsMBCSGroupProber.h | 10 +- aegisub/universalchardet/nsMBCSSM.cpp | 155 +++--------------- aegisub/universalchardet/nsPkgInt.h | 2 +- aegisub/universalchardet/nsSBCSGroupProber.h | 8 +- aegisub/universalchardet/nsSBCharSetProber.h | 44 ++--- aegisub/universalchardet/nsSJISProber.cpp | 9 +- aegisub/universalchardet/nsSJISProber.h | 15 +- aegisub/universalchardet/nsUTF8Prober.cpp | 5 - aegisub/universalchardet/nsUTF8Prober.h | 10 +- .../universalchardet/nsUniversalDetector.cpp | 64 +++++--- .../universalchardet/nsUniversalDetector.h | 18 +- 43 files changed, 324 insertions(+), 458 deletions(-) diff --git a/aegisub/src/charset_detect.cpp b/aegisub/src/charset_detect.cpp index 5684e71c2..0e1296c98 100644 --- a/aegisub/src/charset_detect.cpp +++ b/aegisub/src/charset_detect.cpp @@ -106,17 +106,14 @@ wxString CharSetDetect::GetEncoding(wxString filename) { bool gotLocal = false; for (int i=0;iGetProbeCount(); - for (int j=0;jGetConfidence(j); + float conf = mCharSetProbers[i]->GetConfidence(); - // Only bother with those whose confidence is at least 1% - wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j),wxConvUTF8); - if (conf > 0.01f || curName == local) { - results.push_back(CharDetResult()); - results.back().name = curName; - results.back().confidence = mCharSetProbers[i]->GetConfidence(j); - } + // Only bother with those whose confidence is at least 1% + wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8); + if (conf > 0.01f || curName == local) { + results.push_back(CharDetResult()); + results.back().name = curName; + results.back().confidence = conf; } } } diff --git a/aegisub/src/charset_detect.h b/aegisub/src/charset_detect.h index 318ee6bc1..61cf43bef 100644 --- a/aegisub/src/charset_detect.h +++ b/aegisub/src/charset_detect.h @@ -38,7 +38,9 @@ /////////// // Headers +#include "../universalchardet/nscore.h" #include "../universalchardet/nsUniversalDetector.h" +#include "../universalchardet/nsMBCSGroupProber.h" /// DOCME @@ -54,6 +56,7 @@ private: void Report(const char* aCharset); public: + CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { }; wxString GetEncoding(wxString filename); /// @brief DOCME diff --git a/aegisub/universalchardet/CharDistribution.cpp b/aegisub/universalchardet/CharDistribution.cpp index ce72d3b93..4817c0e2f 100644 --- a/aegisub/universalchardet/CharDistribution.cpp +++ b/aegisub/universalchardet/CharDistribution.cpp @@ -49,12 +49,13 @@ #define MINIMUM_DATA_THRESHOLD 4 //return confidence base on received data -float CharDistributionAnalysis::GetConfidence() +float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage) { //if we didn't receive any character in our consideration range, or the // number of frequent characters is below the minimum threshold, return // negative answer - if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD) + if (mTotalChars <= 0 || + !aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD) return SURE_NO; if (mTotalChars != mFreqChars) { diff --git a/aegisub/universalchardet/CharDistribution.h b/aegisub/universalchardet/CharDistribution.h index 3a4c3c454..36ad57ca6 100644 --- a/aegisub/universalchardet/CharDistribution.h +++ b/aegisub/universalchardet/CharDistribution.h @@ -69,10 +69,10 @@ public: mFreqChars++; } } - }; + } //return confidence base on existing data - float GetConfidence(); + float GetConfidence(PRBool aIsPreferredLanguage); //Reset analyser, clear any state void Reset(void) @@ -80,21 +80,21 @@ public: mDone = PR_FALSE; mTotalChars = 0; mFreqChars = 0; - }; + } //This function is for future extension. Caller can use this function to control //analyser's behavior - void SetOpion(){}; + void SetOpion(){} //It is not necessary to receive all data to draw conclusion. For charset detection, // certain amount of data is enough - PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}; + PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;} protected: //we do not handle character base on its original encoding string, but //convert this encoding string to a number, here called order. //This allow multiple encoding of a language to share one frequency table - virtual PRInt32 GetOrder(const char* str) {return -1;}; + virtual PRInt32 GetOrder(const char* str) {return -1;} //If this flag is set to PR_TRUE, detection is done and conclusion has been made PRBool mDone; @@ -132,7 +132,7 @@ protected: return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; else return -1; - }; + } }; @@ -150,7 +150,7 @@ protected: return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; else return -1; - }; + } }; class GB2312DistributionAnalysis : public CharDistributionAnalysis @@ -167,7 +167,7 @@ protected: return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; else return -1; - }; + } }; @@ -188,7 +188,7 @@ protected: return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; else return -1; - }; + } }; class SJISDistributionAnalysis : public CharDistributionAnalysis @@ -213,7 +213,7 @@ protected: if ((unsigned char)str[1] > (unsigned char)0x7f) order--; return order; - }; + } }; class EUCJPDistributionAnalysis : public CharDistributionAnalysis @@ -230,7 +230,7 @@ protected: return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; else return -1; - }; + } }; #endif //CharDistribution_h__ diff --git a/aegisub/universalchardet/JpCntx.cpp b/aegisub/universalchardet/JpCntx.cpp index 812c3bf96..ed7be2d8f 100644 --- a/aegisub/universalchardet/JpCntx.cpp +++ b/aegisub/universalchardet/JpCntx.cpp @@ -39,7 +39,7 @@ #include "JpCntx.h" //This is hiragana 2-char sequence table, the number in each cell represents its frequency category -char jp2CharContext[83][83] = +const char jp2CharContext[83][83] = { { 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,}, { 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,}, @@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void) } #define DONT_KNOW (float)-1 -float JapaneseContextAnalysis::GetConfidence() +float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage) { //This is just one way to calculate confidence. It works well for me. - if (mTotalRel > MINIMUM_DATA_THRESHOLD) + if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD) return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; else return (float)DONT_KNOW; @@ -227,5 +227,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen) return (unsigned char)*(str+1) - (unsigned char)0xa1; return -1; } - - diff --git a/aegisub/universalchardet/JpCntx.h b/aegisub/universalchardet/JpCntx.h index 1ca9cd5eb..734b8f058 100644 --- a/aegisub/universalchardet/JpCntx.h +++ b/aegisub/universalchardet/JpCntx.h @@ -73,12 +73,12 @@ public: mRelSample[jp2CharContext[mLastCharOrder][order]]++; } mLastCharOrder = order; - }; + } - float GetConfidence(); + float GetConfidence(PRBool aIsPreferredLanguage); void Reset(void); - void SetOpion(){}; - PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}; + void SetOpion(){} + PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} protected: virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0; @@ -116,7 +116,7 @@ protected: (unsigned char)*(str+1) <= (unsigned char)0xf1) return (unsigned char)*(str+1) - (unsigned char)0x9f; return -1; - }; + } }; class EUCJPContextAnalysis : public JapaneseContextAnalysis @@ -131,7 +131,7 @@ protected: (unsigned char)*(str+1) <= (unsigned char)0xf3) return (unsigned char)*(str+1) - (unsigned char)0xa1; return -1; - }; + } }; #endif /* __JPCNTX_H__ */ diff --git a/aegisub/universalchardet/LangBulgarianModel.cpp b/aegisub/universalchardet/LangBulgarianModel.cpp index cbbc2a316..79c575641 100644 --- a/aegisub/universalchardet/LangBulgarianModel.cpp +++ b/aegisub/universalchardet/LangBulgarianModel.cpp @@ -48,7 +48,7 @@ //this talbe is modified base on win1251BulgarianCharToOrderMap, so //only number <64 is sure valid -unsigned char Latin5_BulgarianCharToOrderMap[] = +static const unsigned char Latin5_BulgarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -68,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] = 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0 }; -unsigned char win1251BulgarianCharToOrderMap[] = +static const unsigned char win1251BulgarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -94,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] = //first 1024 sequences:3.0618% //rest sequences: 0.2992% //negative sequences: 0.0020% -char BulgarianLangModel[] = +static const char BulgarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, @@ -226,7 +226,7 @@ char BulgarianLangModel[] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, }; -SequenceModel Latin5BulgarianModel = +const SequenceModel Latin5BulgarianModel = { Latin5_BulgarianCharToOrderMap, BulgarianLangModel, @@ -235,7 +235,7 @@ SequenceModel Latin5BulgarianModel = "ISO-8859-5" }; -SequenceModel Win1251BulgarianModel = +const SequenceModel Win1251BulgarianModel = { win1251BulgarianCharToOrderMap, BulgarianLangModel, diff --git a/aegisub/universalchardet/LangCyrillicModel.cpp b/aegisub/universalchardet/LangCyrillicModel.cpp index a0261e4f7..180c40e5f 100644 --- a/aegisub/universalchardet/LangCyrillicModel.cpp +++ b/aegisub/universalchardet/LangCyrillicModel.cpp @@ -41,7 +41,7 @@ //KOI8-R language model //Character Mapping Table: -unsigned char KOI8R_CharToOrderMap[] = +static const unsigned char KOI8R_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -61,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] = 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0 }; -unsigned char win1251_CharToOrderMap[] = +static const unsigned char win1251_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -81,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] = 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, }; -unsigned char latin5_CharToOrderMap[] = +static const unsigned char latin5_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -101,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] = 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, }; -unsigned char macCyrillic_CharToOrderMap[] = +static const unsigned char macCyrillic_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -121,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] = 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, }; -unsigned char IBM855_CharToOrderMap[] = +static const unsigned char IBM855_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -141,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] = 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, }; -unsigned char IBM866_CharToOrderMap[] = +static const unsigned char IBM866_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -167,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] = //first 1024 sequences: 2.3389% //rest sequences: 0.1237% //negative sequences: 0.0009% -char RussianLangModel[] = +static const char RussianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, @@ -300,7 +300,7 @@ char RussianLangModel[] = }; -SequenceModel Koi8rModel = +const SequenceModel Koi8rModel = { KOI8R_CharToOrderMap, RussianLangModel, @@ -309,7 +309,7 @@ SequenceModel Koi8rModel = "KOI8-R" }; -SequenceModel Win1251Model = +const SequenceModel Win1251Model = { win1251_CharToOrderMap, RussianLangModel, @@ -318,7 +318,7 @@ SequenceModel Win1251Model = "windows-1251" }; -SequenceModel Latin5Model = +const SequenceModel Latin5Model = { latin5_CharToOrderMap, RussianLangModel, @@ -327,7 +327,7 @@ SequenceModel Latin5Model = "ISO-8859-5" }; -SequenceModel MacCyrillicModel = +const SequenceModel MacCyrillicModel = { macCyrillic_CharToOrderMap, RussianLangModel, @@ -336,7 +336,7 @@ SequenceModel MacCyrillicModel = "x-mac-cyrillic" }; -SequenceModel Ibm866Model = +const SequenceModel Ibm866Model = { IBM866_CharToOrderMap, RussianLangModel, @@ -345,7 +345,7 @@ SequenceModel Ibm866Model = "IBM866" }; -SequenceModel Ibm855Model = +const SequenceModel Ibm855Model = { IBM855_CharToOrderMap, RussianLangModel, diff --git a/aegisub/universalchardet/LangGreekModel.cpp b/aegisub/universalchardet/LangGreekModel.cpp index c4f3f3e1c..13ce0ae9e 100644 --- a/aegisub/universalchardet/LangGreekModel.cpp +++ b/aegisub/universalchardet/LangGreekModel.cpp @@ -45,7 +45,7 @@ *****************************************************************/ //Character Mapping Table: -unsigned char Latin7_CharToOrderMap[] = +static const unsigned char Latin7_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -67,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] = -unsigned char win1253_CharToOrderMap[] = +static const unsigned char win1253_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -93,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] = //first 1024 sequences:1.7001% //rest sequences: 0.0359% //negative sequences: 0.0148% -char GreekLangModel[] = +static const char GreekLangModel[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, @@ -225,7 +225,7 @@ char GreekLangModel[] = 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; -SequenceModel Latin7Model = +const SequenceModel Latin7Model = { Latin7_CharToOrderMap, GreekLangModel, @@ -234,7 +234,7 @@ SequenceModel Latin7Model = "ISO-8859-7" }; -SequenceModel Win1253Model = +const SequenceModel Win1253Model = { win1253_CharToOrderMap, GreekLangModel, diff --git a/aegisub/universalchardet/LangHebrewModel.cpp b/aegisub/universalchardet/LangHebrewModel.cpp index 21e4d68a1..f86e68098 100644 --- a/aegisub/universalchardet/LangHebrewModel.cpp +++ b/aegisub/universalchardet/LangHebrewModel.cpp @@ -50,7 +50,7 @@ //Windows-1255 language model //Character Mapping Table: -unsigned char win1255_CharToOrderMap[] = +static const unsigned char win1255_CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -76,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] = //first 1024 sequences: 1.5981% //rest sequences: 0.087% //negative sequences: 0.0015% -char HebrewLangModel[] = +static const char HebrewLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, @@ -208,7 +208,7 @@ char HebrewLangModel[] = 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, }; -SequenceModel Win1255Model = +const SequenceModel Win1255Model = { win1255_CharToOrderMap, HebrewLangModel, diff --git a/aegisub/universalchardet/LangHungarianModel.cpp b/aegisub/universalchardet/LangHungarianModel.cpp index 075e80494..876826cdd 100644 --- a/aegisub/universalchardet/LangHungarianModel.cpp +++ b/aegisub/universalchardet/LangHungarianModel.cpp @@ -45,7 +45,7 @@ *****************************************************************/ //Character Mapping Table: -unsigned char Latin2_HungarianCharToOrderMap[] = +static const unsigned char Latin2_HungarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -65,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] = 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, }; -unsigned char win1250HungarianCharToOrderMap[] = +static const unsigned char win1250HungarianCharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -91,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] = //first 1024 sequences:5.2623% //rest sequences: 0.8894% //negative sequences: 0.0009% -char HungarianLangModel[] = +static const char HungarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, @@ -223,7 +223,7 @@ char HungarianLangModel[] = 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, }; -SequenceModel Latin2HungarianModel = +const SequenceModel Latin2HungarianModel = { Latin2_HungarianCharToOrderMap, HungarianLangModel, @@ -232,7 +232,7 @@ SequenceModel Latin2HungarianModel = "ISO-8859-2" }; -SequenceModel Win1250HungarianModel = +const SequenceModel Win1250HungarianModel = { win1250HungarianCharToOrderMap, HungarianLangModel, diff --git a/aegisub/universalchardet/LangThaiModel.cpp b/aegisub/universalchardet/LangThaiModel.cpp index 0b8c252ed..aadc2dda9 100644 --- a/aegisub/universalchardet/LangThaiModel.cpp +++ b/aegisub/universalchardet/LangThaiModel.cpp @@ -49,7 +49,7 @@ //The following result for thai was collected from a limited sample (1M). //Character Mapping Table: -unsigned char TIS620CharToOrderMap[] = +static const unsigned char TIS620CharToOrderMap[] = { 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 @@ -78,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] = //first 1024 sequences:7.3177% //rest sequences: 1.0230% //negative sequences: 0.0436% -char ThaiLangModel[] = +static const char ThaiLangModel[] = { 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, @@ -211,7 +211,7 @@ char ThaiLangModel[] = }; -SequenceModel TIS620ThaiModel = +const SequenceModel TIS620ThaiModel = { TIS620CharToOrderMap, ThaiLangModel, diff --git a/aegisub/universalchardet/nsBig5Prober.cpp b/aegisub/universalchardet/nsBig5Prober.cpp index 55d63c6f5..aee5cd483 100644 --- a/aegisub/universalchardet/nsBig5Prober.cpp +++ b/aegisub/universalchardet/nsBig5Prober.cpp @@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -86,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsBig5Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsBig5Prober.h b/aegisub/universalchardet/nsBig5Prober.h index a80d1a0a8..5ae357643 100644 --- a/aegisub/universalchardet/nsBig5Prober.h +++ b/aegisub/universalchardet/nsBig5Prober.h @@ -44,15 +44,17 @@ class nsBig5Prober: public nsCharSetProber { public: - nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel); - Reset();}; - virtual ~nsBig5Prober(void){delete mCodingSM;}; + nsBig5Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&Big5SMModel); + Reset();} + virtual ~nsBig5Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Big5";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "Big5";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: void GetDistribution(PRUint32 aCharLen, const char* aStr); @@ -63,6 +65,7 @@ protected: //Big5ContextAnalysis mContextAnalyser; Big5DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsCharSetProber.cpp b/aegisub/universalchardet/nsCharSetProber.cpp index d19a2417d..0429dd1a0 100644 --- a/aegisub/universalchardet/nsCharSetProber.cpp +++ b/aegisub/universalchardet/nsCharSetProber.cpp @@ -74,7 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a if (meetMSB && curPtr > prevPtr) while (prevPtr < curPtr) *newptr++ = *prevPtr++; - newLen = PRUint32(newptr - *newBuf); + newLen = newptr - *newBuf; return PR_TRUE; } @@ -119,7 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen while (prevPtr < curPtr) *newptr++ = *prevPtr++; - newLen = PRUint32(newptr - *newBuf); + newLen = newptr - *newBuf; return PR_TRUE; } diff --git a/aegisub/universalchardet/nsCharSetProber.h b/aegisub/universalchardet/nsCharSetProber.h index becf3f9fc..c078ccf08 100644 --- a/aegisub/universalchardet/nsCharSetProber.h +++ b/aegisub/universalchardet/nsCharSetProber.h @@ -52,7 +52,7 @@ typedef enum { class nsCharSetProber { public: - virtual ~nsCharSetProber() {}; + virtual ~nsCharSetProber() {} virtual const char* GetCharSetName() = 0; virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; virtual nsProbingState GetState(void) = 0; @@ -60,10 +60,6 @@ public: virtual float GetConfidence(void) = 0; virtual void SetOpion() = 0; - virtual const char* GetCharSetName(int i) { return GetCharSetName(); } - virtual float GetConfidence(int i) { return GetConfidence(); } - virtual int GetProbeCount(void) { return 1; } - #ifdef DEBUG_chardet virtual void DumpStatus() {}; #endif diff --git a/aegisub/universalchardet/nsCodingStateMachine.h b/aegisub/universalchardet/nsCodingStateMachine.h index b05723ead..819f9ab07 100644 --- a/aegisub/universalchardet/nsCodingStateMachine.h +++ b/aegisub/universalchardet/nsCodingStateMachine.h @@ -59,10 +59,7 @@ typedef struct class nsCodingStateMachine { public: - nsCodingStateMachine(SMModel* sm){ - mCurrentState = eStart; - mModel = sm; - }; + nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; } nsSMState NextState(char c){ //for each byte we get its class , if it is first byte, we also get byte length PRUint32 byteCls = GETCLASS(c); @@ -76,33 +73,32 @@ public: mModel->stateTable); mCurrentBytePos++; return mCurrentState; - }; - PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}; - void Reset(void) {mCurrentState = eStart;}; - const char * GetCodingStateMachine() {return mModel->name;}; + } + PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} + void Reset(void) {mCurrentState = eStart;} + const char * GetCodingStateMachine() {return mModel->name;} protected: nsSMState mCurrentState; PRUint32 mCurrentCharLen; PRUint32 mCurrentBytePos; - SMModel *mModel; + const SMModel *mModel; }; -extern SMModel UTF8SMModel; -extern SMModel Big5SMModel; -extern SMModel EUCJPSMModel; -extern SMModel EUCKRSMModel; -extern SMModel EUCTWSMModel; -extern SMModel GB18030SMModel; -extern SMModel SJISSMModel; -extern SMModel UCS2BESMModel; +extern const SMModel UTF8SMModel; +extern const SMModel Big5SMModel; +extern const SMModel EUCJPSMModel; +extern const SMModel EUCKRSMModel; +extern const SMModel EUCTWSMModel; +extern const SMModel GB18030SMModel; +extern const SMModel SJISSMModel; -extern SMModel HZSMModel; -extern SMModel ISO2022CNSMModel; -extern SMModel ISO2022JPSMModel; -extern SMModel ISO2022KRSMModel; +extern const SMModel HZSMModel; +extern const SMModel ISO2022CNSMModel; +extern const SMModel ISO2022JPSMModel; +extern const SMModel ISO2022KRSMModel; #endif /* nsCodingStateMachine_h__ */ diff --git a/aegisub/universalchardet/nsEUCJPProber.cpp b/aegisub/universalchardet/nsEUCJPProber.cpp index fb0d29654..35387dfdd 100644 --- a/aegisub/universalchardet/nsEUCJPProber.cpp +++ b/aegisub/universalchardet/nsEUCJPProber.cpp @@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -96,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCJPProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(); - float distribCf = mDistributionAnalyser.GetConfidence(); + float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (contxtCf > distribCf ? contxtCf : distribCf); } diff --git a/aegisub/universalchardet/nsEUCJPProber.h b/aegisub/universalchardet/nsEUCJPProber.h index 3ae6cfd9f..a7a2f5147 100644 --- a/aegisub/universalchardet/nsEUCJPProber.h +++ b/aegisub/universalchardet/nsEUCJPProber.h @@ -50,15 +50,17 @@ class nsEUCJPProber: public nsCharSetProber { public: - nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); - Reset();}; - virtual ~nsEUCJPProber(void){delete mCodingSM;}; + nsEUCJPProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); + Reset();} + virtual ~nsEUCJPProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "EUC-JP";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "EUC-JP";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: nsCodingStateMachine* mCodingSM; @@ -68,6 +70,7 @@ protected: EUCJPDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsEUCKRProber.cpp b/aegisub/universalchardet/nsEUCKRProber.cpp index c91a97e66..396b09527 100644 --- a/aegisub/universalchardet/nsEUCKRProber.cpp +++ b/aegisub/universalchardet/nsEUCKRProber.cpp @@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -89,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCKRProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsEUCKRProber.h b/aegisub/universalchardet/nsEUCKRProber.h index 1f3dec511..8e0998460 100644 --- a/aegisub/universalchardet/nsEUCKRProber.h +++ b/aegisub/universalchardet/nsEUCKRProber.h @@ -44,15 +44,18 @@ class nsEUCKRProber: public nsCharSetProber { public: - nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); - Reset();}; - virtual ~nsEUCKRProber(void){delete mCodingSM;}; + nsEUCKRProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); + Reset(); + } + virtual ~nsEUCKRProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "EUC-KR";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "EUC-KR";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: void GetDistribution(PRUint32 aCharLen, const char* aStr); @@ -63,6 +66,7 @@ protected: //EUCKRContextAnalysis mContextAnalyser; EUCKRDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsEUCTWProber.cpp b/aegisub/universalchardet/nsEUCTWProber.cpp index 8552941c3..710e413eb 100644 --- a/aegisub/universalchardet/nsEUCTWProber.cpp +++ b/aegisub/universalchardet/nsEUCTWProber.cpp @@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -89,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCTWProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsEUCTWProber.h b/aegisub/universalchardet/nsEUCTWProber.h index 89100a568..911d50b03 100644 --- a/aegisub/universalchardet/nsEUCTWProber.h +++ b/aegisub/universalchardet/nsEUCTWProber.h @@ -44,15 +44,17 @@ class nsEUCTWProber: public nsCharSetProber { public: - nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); - Reset();}; - virtual ~nsEUCTWProber(void){delete mCodingSM;}; + nsEUCTWProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); + Reset();} + virtual ~nsEUCTWProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "x-euc-tw";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "x-euc-tw";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: void GetDistribution(PRUint32 aCharLen, const char* aStr); @@ -63,6 +65,7 @@ protected: //EUCTWContextAnalysis mContextAnalyser; EUCTWDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsEscCharsetProber.cpp b/aegisub/universalchardet/nsEscCharsetProber.cpp index a816bab76..464c75346 100644 --- a/aegisub/universalchardet/nsEscCharsetProber.cpp +++ b/aegisub/universalchardet/nsEscCharsetProber.cpp @@ -37,13 +37,21 @@ #include "nsEscCharsetProber.h" +#include "nsUniversalDetector.h" -nsEscCharSetProber::nsEscCharSetProber(void) +nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter) { - mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); - mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); - mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); - mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); + for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) + mCodingSM[i] = nsnull; + if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) + { + mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); + mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); + } + if (aLanguageFilter & NS_FILTER_JAPANESE) + mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); + if (aLanguageFilter & NS_FILTER_KOREAN) + mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); mActiveSM = NUM_OF_ESC_CHARSETS; mState = eDetecting; mDetectedCharset = nsnull; @@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void) { mState = eDetecting; for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) - mCodingSM[i]->Reset(); + if (mCodingSM[i]) + mCodingSM[i]->Reset(); mActiveSM = NUM_OF_ESC_CHARSETS; mDetectedCharset = nsnull; } @@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen) { for (j = mActiveSM-1; j>= 0; j--) { - //byte is feed to all active state machine - codingState = mCodingSM[j]->NextState(aBuf[i]); - if (codingState == eError) + if (mCodingSM[j]) { - //got negative answer for this state machine, make it inactive - mActiveSM--; - if (mActiveSM == 0) + codingState = mCodingSM[j]->NextState(aBuf[i]); + if (codingState == eItsMe) { - mState = eNotMe; + mState = eFoundIt; + mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); return mState; } - else if (j != (PRInt32)mActiveSM) - { - nsCodingStateMachine* t; - t = mCodingSM[mActiveSM]; - mCodingSM[mActiveSM] = mCodingSM[j]; - mCodingSM[j] = t; - } - } - else if (codingState == eItsMe) - { - mState = eFoundIt; - mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); - return mState; } } } diff --git a/aegisub/universalchardet/nsEscCharsetProber.h b/aegisub/universalchardet/nsEscCharsetProber.h index 5df54279b..4b648e012 100644 --- a/aegisub/universalchardet/nsEscCharsetProber.h +++ b/aegisub/universalchardet/nsEscCharsetProber.h @@ -45,14 +45,14 @@ class nsEscCharSetProber: public nsCharSetProber { public: - nsEscCharSetProber(void); + nsEscCharSetProber(PRUint32 aLanguageFilter); virtual ~nsEscCharSetProber(void); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return mDetectedCharset;}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return mDetectedCharset;} + nsProbingState GetState(void) {return mState;} void Reset(void); - float GetConfidence(void){return (float)0.99;}; - void SetOpion() {}; + float GetConfidence(void){return (float)0.99;} + void SetOpion() {} protected: void GetDistribution(PRUint32 aCharLen, const char* aStr); diff --git a/aegisub/universalchardet/nsEscSM.cpp b/aegisub/universalchardet/nsEscSM.cpp index ef8810adb..eed1b7cf8 100644 --- a/aegisub/universalchardet/nsEscSM.cpp +++ b/aegisub/universalchardet/nsEscSM.cpp @@ -36,7 +36,7 @@ * ***** END LICENSE BLOCK ***** */ #include "nsCodingStateMachine.h" -static PRUint32 HZ_cls[ 256 / 8 ] = { +static const PRUint32 HZ_cls[ 256 / 8 ] = { PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -72,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff }; -static PRUint32 HZ_st [ 6] = { +static const PRUint32 HZ_st [ 6] = { PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17 @@ -83,7 +83,7 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0}; -SMModel HZSMModel = { +const SMModel HZSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st }, @@ -92,7 +92,7 @@ SMModel HZSMModel = { }; -static PRUint32 ISO2022CN_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -128,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022CN_st [ 8] = { +static const PRUint32 ISO2022CN_st [ 8] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 @@ -141,7 +141,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; -SMModel ISO2022CNSMModel = { +const SMModel ISO2022CNSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls }, 9, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st }, @@ -149,7 +149,7 @@ SMModel ISO2022CNSMModel = { "ISO-2022-CN", }; -static PRUint32 ISO2022JP_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -185,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022JP_st [ 9] = { +static const PRUint32 ISO2022JP_st [ 9] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 @@ -199,7 +199,7 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47 static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0}; -SMModel ISO2022JPSMModel = { +const SMModel ISO2022JPSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls }, 10, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st }, @@ -207,7 +207,7 @@ SMModel ISO2022JPSMModel = { "ISO-2022-JP", }; -static PRUint32 ISO2022KR_cls [ 256 / 8 ] = { +static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = { PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 @@ -243,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff }; -static PRUint32 ISO2022KR_st [ 5] = { +static const PRUint32 ISO2022KR_st [ 5] = { PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17 @@ -253,7 +253,7 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27 static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0}; -SMModel ISO2022KRSMModel = { +const SMModel ISO2022KRSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st }, diff --git a/aegisub/universalchardet/nsGB2312Prober.cpp b/aegisub/universalchardet/nsGB2312Prober.cpp index 576dcd6e9..95374c312 100644 --- a/aegisub/universalchardet/nsGB2312Prober.cpp +++ b/aegisub/universalchardet/nsGB2312Prober.cpp @@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -94,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsGB18030Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsGB2312Prober.h b/aegisub/universalchardet/nsGB2312Prober.h index c11dd3158..4bdac3bbe 100644 --- a/aegisub/universalchardet/nsGB2312Prober.h +++ b/aegisub/universalchardet/nsGB2312Prober.h @@ -46,15 +46,17 @@ class nsGB18030Prober: public nsCharSetProber { public: - nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel); - Reset();}; - virtual ~nsGB18030Prober(void){delete mCodingSM;}; + nsGB18030Prober(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&GB18030SMModel); + Reset();} + virtual ~nsGB18030Prober(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "gb18030";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "gb18030";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: void GetDistribution(PRUint32 aCharLen, const char* aStr); @@ -65,6 +67,7 @@ protected: //GB2312ContextAnalysis mContextAnalyser; GB2312DistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsHebrewProber.h b/aegisub/universalchardet/nsHebrewProber.h index 9bb3a0032..eedfed45b 100644 --- a/aegisub/universalchardet/nsHebrewProber.h +++ b/aegisub/universalchardet/nsHebrewProber.h @@ -55,7 +55,7 @@ public: virtual nsProbingState GetState(void); virtual float GetConfidence(void) { return (float)0.0; } - virtual void SetOpion() {}; + virtual void SetOpion() {} void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) { mLogicalProb = logicalPrb; mVisualProb = visualPrb; } diff --git a/aegisub/universalchardet/nsLatin1Prober.cpp b/aegisub/universalchardet/nsLatin1Prober.cpp index 05d682398..7694ef765 100644 --- a/aegisub/universalchardet/nsLatin1Prober.cpp +++ b/aegisub/universalchardet/nsLatin1Prober.cpp @@ -50,7 +50,7 @@ #define ASO 7 // accent small other #define CLASS_NUM 8 // total classes -static unsigned char Latin1_CharToClass[] = +static const unsigned char Latin1_CharToClass[] = { OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F @@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] = 2 : normal 3 : very likely */ -static unsigned char Latin1ClassModel[] = +static const unsigned char Latin1ClassModel[] = { /* UDF OTH ASC ASS ACV ACO ASV ASO */ /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, diff --git a/aegisub/universalchardet/nsLatin1Prober.h b/aegisub/universalchardet/nsLatin1Prober.h index 103d271b0..5145e9655 100644 --- a/aegisub/universalchardet/nsLatin1Prober.h +++ b/aegisub/universalchardet/nsLatin1Prober.h @@ -45,14 +45,14 @@ class nsLatin1Prober: public nsCharSetProber { public: - nsLatin1Prober(void){Reset();}; - virtual ~nsLatin1Prober(void){}; + nsLatin1Prober(void){Reset();} + virtual ~nsLatin1Prober(void){} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "windows-1252";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "windows-1252";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} #ifdef DEBUG_chardet virtual void DumpStatus(); diff --git a/aegisub/universalchardet/nsMBCSGroupProber.cpp b/aegisub/universalchardet/nsMBCSGroupProber.cpp index d723a1e7c..f1611651f 100644 --- a/aegisub/universalchardet/nsMBCSGroupProber.cpp +++ b/aegisub/universalchardet/nsMBCSGroupProber.cpp @@ -39,6 +39,7 @@ #include #include "nsMBCSGroupProber.h" +#include "nsUniversalDetector.h" #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) const char *ProberName[] = @@ -54,15 +55,26 @@ const char *ProberName[] = #endif -nsMBCSGroupProber::nsMBCSGroupProber() +nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) { + for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) + mProbers[i] = nsnull; + mProbers[0] = new nsUTF8Prober(); - mProbers[1] = new nsSJISProber(); - mProbers[2] = new nsEUCJPProber(); - mProbers[3] = new nsGB18030Prober(); - mProbers[4] = new nsEUCKRProber(); - mProbers[5] = new nsBig5Prober(); - mProbers[6] = new nsEUCTWProber(); + if (aLanguageFilter & NS_FILTER_JAPANESE) + { + mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE); + mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE); + } + if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED) + mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED); + if (aLanguageFilter & NS_FILTER_KOREAN) + mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN); + if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL) + { + mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); + mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL); + } Reset(); } @@ -134,16 +146,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) mState = eFoundIt; return mState; } - else if (st == eNotMe) - { - mIsActive[i] = PR_FALSE; - mActiveNum--; - if (mActiveNum <= 0) - { - mState = eNotMe; - return mState; - } - } } } } @@ -154,23 +156,13 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen) { if (!mIsActive[i]) continue; - st = mProbers[i]->HandleData(aBuf + start, aLen + 1 - start); + st = mProbers[i]->HandleData(aBuf + start, aLen - start); if (st == eFoundIt) { mBestGuess = i; mState = eFoundIt; return mState; } - else if (st == eNotMe) - { - mIsActive[i] = PR_FALSE; - mActiveNum--; - if (mActiveNum <= 0) - { - mState = eNotMe; - return mState; - } - } } } mKeepNext = keepNext; diff --git a/aegisub/universalchardet/nsMBCSGroupProber.h b/aegisub/universalchardet/nsMBCSGroupProber.h index c674da84c..c4e996497 100644 --- a/aegisub/universalchardet/nsMBCSGroupProber.h +++ b/aegisub/universalchardet/nsMBCSGroupProber.h @@ -51,18 +51,14 @@ class nsMBCSGroupProber: public nsCharSetProber { public: - nsMBCSGroupProber(); + nsMBCSGroupProber(PRUint32 aLanguageFilter); virtual ~nsMBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); - nsProbingState GetState(void) {return mState;}; + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; - - const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); } - float GetConfidence(int i) { return mProbers[i]->GetConfidence(); } - int GetProbeCount(void) { return NUM_OF_PROBERS; } + void SetOpion() {} #ifdef DEBUG_chardet void DumpStatus(); diff --git a/aegisub/universalchardet/nsMBCSSM.cpp b/aegisub/universalchardet/nsMBCSSM.cpp index 357ad9994..584e93182 100644 --- a/aegisub/universalchardet/nsMBCSSM.cpp +++ b/aegisub/universalchardet/nsMBCSSM.cpp @@ -44,7 +44,7 @@ Modification from frank tang's original work: // BIG5 -static PRUint32 BIG5_cls [ 256 / 8 ] = { +static const PRUint32 BIG5_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -81,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff }; -static PRUint32 BIG5_st [ 3] = { +static const PRUint32 BIG5_st [ 3] = { PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 @@ -89,7 +89,7 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0}; -SMModel Big5SMModel = { +SMModel const Big5SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls }, 5, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, @@ -97,7 +97,7 @@ SMModel Big5SMModel = { "Big5", }; -static PRUint32 EUCJP_cls [ 256 / 8 ] = { +static const PRUint32 EUCJP_cls [ 256 / 8 ] = { //PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f @@ -134,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff }; -static PRUint32 EUCJP_st [ 5] = { +static const PRUint32 EUCJP_st [ 5] = { PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 @@ -144,7 +144,7 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27 static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; -SMModel EUCJPSMModel = { +const SMModel EUCJPSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st }, @@ -152,7 +152,7 @@ SMModel EUCJPSMModel = { "EUC-JP", }; -static PRUint32 EUCKR_cls [ 256 / 8 ] = { +static const PRUint32 EUCKR_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -189,14 +189,14 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff }; -static PRUint32 EUCKR_st [ 2] = { +static const PRUint32 EUCKR_st [ 2] = { PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f }; static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0}; -SMModel EUCKRSMModel = { +const SMModel EUCKRSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls }, 4, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st }, @@ -204,7 +204,7 @@ SMModel EUCKRSMModel = { "EUC-KR", }; -static PRUint32 EUCTW_cls [ 256 / 8 ] = { +static const PRUint32 EUCTW_cls [ 256 / 8 ] = { //PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f @@ -241,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff }; -static PRUint32 EUCTW_st [ 6] = { +static const PRUint32 EUCTW_st [ 6] = { PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 @@ -252,7 +252,7 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; -SMModel EUCTWSMModel = { +const SMModel EUCTWSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls }, 7, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, @@ -316,7 +316,7 @@ SMModel GB2312SMModel = { // the following state machine data was created by perl script in // intl/chardet/tools. It should be the same as in PSM detector. -static PRUint32 GB18030_cls [ 256 / 8 ] = { +static const PRUint32 GB18030_cls [ 256 / 8 ] = { PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 @@ -352,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff }; -static PRUint32 GB18030_st [ 6] = { +static const PRUint32 GB18030_st [ 6] = { PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 @@ -368,7 +368,7 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f // 2 here. static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; -SMModel GB18030SMModel = { +const SMModel GB18030SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls }, 7, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st }, @@ -378,7 +378,7 @@ SMModel GB18030SMModel = { // sjis -static PRUint32 SJIS_cls [ 256 / 8 ] = { +static const PRUint32 SJIS_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -417,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff }; -static PRUint32 SJIS_st [ 3] = { +static const PRUint32 SJIS_st [ 3] = { PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 @@ -425,7 +425,7 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; -SMModel SJISSMModel = { +const SMModel SJISSMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls }, 6, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, @@ -434,120 +434,7 @@ SMModel SJISSMModel = { }; -static PRUint32 UCS2BE_cls [ 256 / 8 ] = { -PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07 -PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f -PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 -PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f -PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 -PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f -PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 -PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f -PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47 -PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f -PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 -PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f -PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 -PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f -PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 -PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7 -PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af -PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7 -PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf -PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7 -PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf -PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7 -PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df -PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 -PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef -PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 -PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff -}; - - -static PRUint32 UCS2BE_st [ 7] = { -PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17 -PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f -PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27 -PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f -PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37 -}; - -static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2}; - -SMModel UCS2BESMModel = { - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls }, - 6, - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st }, - UCS2BECharLenTable, - "UTF-16BE", -}; - -static PRUint32 UCS2LE_cls [ 256 / 8 ] = { -PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07 -PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f -PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 -PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f -PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27 -PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f -PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37 -PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f -PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47 -PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f -PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57 -PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f -PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67 -PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f -PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77 -PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f -PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87 -PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f -PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97 -PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f -PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7 -PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af -PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7 -PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf -PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7 -PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf -PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7 -PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df -PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7 -PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef -PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7 -PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff -}; - - -static PRUint32 UCS2LE_st [ 7] = { -PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07 -PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f -PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17 -PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f -PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27 -PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f -PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37 -}; - -static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2}; - -SMModel UCS2LESMModel = { - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls }, - 6, - {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st }, - UCS2LECharLenTable, - "UTF-16LE", -}; - - -static PRUint32 UTF8_cls [ 256 / 8 ] = { +static const PRUint32 UTF8_cls [ 256 / 8 ] = { //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f @@ -584,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff }; -static PRUint32 UTF8_st [ 26] = { +static const PRUint32 UTF8_st [ 26] = { PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 @@ -616,7 +503,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6 }; -SMModel UTF8SMModel = { +const SMModel UTF8SMModel = { {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, 16, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, diff --git a/aegisub/universalchardet/nsPkgInt.h b/aegisub/universalchardet/nsPkgInt.h index 7617d6c91..3caa91220 100644 --- a/aegisub/universalchardet/nsPkgInt.h +++ b/aegisub/universalchardet/nsPkgInt.h @@ -68,7 +68,7 @@ typedef struct nsPkgInt { nsSftMsk sftmsk; nsBitSft bitsft; nsUnitMsk unitmsk; - PRUint32 *data; + const PRUint32* const data; } nsPkgInt; diff --git a/aegisub/universalchardet/nsSBCSGroupProber.h b/aegisub/universalchardet/nsSBCSGroupProber.h index 8d452b7fb..faa57ed14 100644 --- a/aegisub/universalchardet/nsSBCSGroupProber.h +++ b/aegisub/universalchardet/nsSBCSGroupProber.h @@ -49,14 +49,10 @@ public: virtual ~nsSBCSGroupProber(); nsProbingState HandleData(const char* aBuf, PRUint32 aLen); const char* GetCharSetName(); - nsProbingState GetState(void) {return mState;}; + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; - - const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); } - float GetConfidence(int i) { return mProbers[i]->GetConfidence(); } - int GetProbeCount(void) { return NUM_OF_SBCS_PROBERS; } + void SetOpion() {} #ifdef DEBUG_chardet void DumpStatus(); diff --git a/aegisub/universalchardet/nsSBCharSetProber.h b/aegisub/universalchardet/nsSBCharSetProber.h index 82117e251..c927f4b1b 100644 --- a/aegisub/universalchardet/nsSBCharSetProber.h +++ b/aegisub/universalchardet/nsSBCharSetProber.h @@ -51,27 +51,27 @@ typedef struct { - unsigned char *charToOrderMap; // [256] table use to find a char's order - char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency + const unsigned char* const charToOrderMap; // [256] table use to find a char's order + const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) - const char* charsetName; + const char* const charsetName; } SequenceModel; class nsSingleByteCharSetProber : public nsCharSetProber{ public: - nsSingleByteCharSetProber(SequenceModel *model) + nsSingleByteCharSetProber(const SequenceModel *model) :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } - nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) + nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } virtual const char* GetCharSetName(); virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - virtual nsProbingState GetState(void) {return mState;}; + virtual nsProbingState GetState(void) {return mState;} virtual void Reset(void); virtual float GetConfidence(void); - virtual void SetOpion() {}; + virtual void SetOpion() {} // This feature is not implemented yet. any current language model // contain this parameter as PR_FALSE. No one is looking at this @@ -79,7 +79,7 @@ public: // Moreover, the nsSBCSGroupProber which calls the HandleData of this // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid // of the English letters. - PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented) + PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented) #ifdef DEBUG_chardet virtual void DumpStatus(); @@ -87,7 +87,7 @@ public: protected: nsProbingState mState; - const SequenceModel *mModel; + const SequenceModel* const mModel; const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup //char order of last character @@ -106,19 +106,19 @@ protected: }; -extern SequenceModel Koi8rModel; -extern SequenceModel Win1251Model; -extern SequenceModel Latin5Model; -extern SequenceModel MacCyrillicModel; -extern SequenceModel Ibm866Model; -extern SequenceModel Ibm855Model; -extern SequenceModel Latin7Model; -extern SequenceModel Win1253Model; -extern SequenceModel Latin5BulgarianModel; -extern SequenceModel Win1251BulgarianModel; -extern SequenceModel Latin2HungarianModel; -extern SequenceModel Win1250HungarianModel; -extern SequenceModel Win1255Model; +extern const SequenceModel Koi8rModel; +extern const SequenceModel Win1251Model; +extern const SequenceModel Latin5Model; +extern const SequenceModel MacCyrillicModel; +extern const SequenceModel Ibm866Model; +extern const SequenceModel Ibm855Model; +extern const SequenceModel Latin7Model; +extern const SequenceModel Win1253Model; +extern const SequenceModel Latin5BulgarianModel; +extern const SequenceModel Win1251BulgarianModel; +extern const SequenceModel Latin2HungarianModel; +extern const SequenceModel Win1250HungarianModel; +extern const SequenceModel Win1255Model; #endif /* nsSingleByteCharSetProber_h__ */ diff --git a/aegisub/universalchardet/nsSJISProber.cpp b/aegisub/universalchardet/nsSJISProber.cpp index 9bab50681..5b7e7fddc 100644 --- a/aegisub/universalchardet/nsSJISProber.cpp +++ b/aegisub/universalchardet/nsSJISProber.cpp @@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; @@ -95,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) float nsSJISProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(); - float distribCf = mDistributionAnalyser.GetConfidence(); + float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); return (contxtCf > distribCf ? contxtCf : distribCf); } diff --git a/aegisub/universalchardet/nsSJISProber.h b/aegisub/universalchardet/nsSJISProber.h index 3a612d788..1efb6e3d6 100644 --- a/aegisub/universalchardet/nsSJISProber.h +++ b/aegisub/universalchardet/nsSJISProber.h @@ -51,15 +51,17 @@ class nsSJISProber: public nsCharSetProber { public: - nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel); - Reset();}; - virtual ~nsSJISProber(void){delete mCodingSM;}; + nsSJISProber(PRBool aIsPreferredLanguage) + :mIsPreferredLanguage(aIsPreferredLanguage) + {mCodingSM = new nsCodingStateMachine(&SJISSMModel); + Reset();} + virtual ~nsSJISProber(void){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "Shift_JIS";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "Shift_JIS";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: nsCodingStateMachine* mCodingSM; @@ -69,6 +71,7 @@ protected: SJISDistributionAnalysis mDistributionAnalyser; char mLastChar[2]; + PRBool mIsPreferredLanguage; }; diff --git a/aegisub/universalchardet/nsUTF8Prober.cpp b/aegisub/universalchardet/nsUTF8Prober.cpp index 6d590b45e..ab8d9f7bb 100644 --- a/aegisub/universalchardet/nsUTF8Prober.cpp +++ b/aegisub/universalchardet/nsUTF8Prober.cpp @@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen) for (PRUint32 i = 0; i < aLen; i++) { codingState = mCodingSM->NextState(aBuf[i]); - if (codingState == eError) - { - mState = eNotMe; - break; - } if (codingState == eItsMe) { mState = eFoundIt; diff --git a/aegisub/universalchardet/nsUTF8Prober.h b/aegisub/universalchardet/nsUTF8Prober.h index 9ead2236f..21c91c42a 100644 --- a/aegisub/universalchardet/nsUTF8Prober.h +++ b/aegisub/universalchardet/nsUTF8Prober.h @@ -45,14 +45,14 @@ class nsUTF8Prober: public nsCharSetProber { public: nsUTF8Prober(){mNumOfMBChar = 0; mCodingSM = new nsCodingStateMachine(&UTF8SMModel); - Reset(); }; - virtual ~nsUTF8Prober(){delete mCodingSM;}; + Reset(); } + virtual ~nsUTF8Prober(){delete mCodingSM;} nsProbingState HandleData(const char* aBuf, PRUint32 aLen); - const char* GetCharSetName() {return "UTF-8";}; - nsProbingState GetState(void) {return mState;}; + const char* GetCharSetName() {return "UTF-8";} + nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(void); - void SetOpion() {}; + void SetOpion() {} protected: nsCodingStateMachine* mCodingSM; diff --git a/aegisub/universalchardet/nsUniversalDetector.cpp b/aegisub/universalchardet/nsUniversalDetector.cpp index 9fe64fcc6..1f495228b 100644 --- a/aegisub/universalchardet/nsUniversalDetector.cpp +++ b/aegisub/universalchardet/nsUniversalDetector.cpp @@ -44,9 +44,8 @@ #include "nsSBCSGroupProber.h" #include "nsEscCharsetProber.h" #include "nsLatin1Prober.h" -#include "nsError.h" -nsUniversalDetector::nsUniversalDetector() +nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter) { mDone = PR_FALSE; mBestGuess = -1; //illegal value as signal @@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector() mGotData = PR_FALSE; mInputState = ePureAscii; mLastChar = '\0'; + mLanguageFilter = aLanguageFilter; PRUint32 i; for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) @@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) mDetectedCharset = "X-ISO-10646-UCS-4-3412"; else if ('\xFF' == aBuf[1]) // FE FF UTF-16, big endian BOM - mDetectedCharset = "UTF-16BE"; + mDetectedCharset = "UTF-16"; break; case '\x00': if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) // 00 00 FE FF UTF-32, big-endian BOM - mDetectedCharset = "UTF-32BE"; + mDetectedCharset = "UTF-32"; else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) // 00 00 FF FE UCS-4, unusual octet order BOM (2143) mDetectedCharset = "X-ISO-10646-UCS-4-2143"; @@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) case '\xFF': if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) // FF FE 00 00 UTF-32, little-endian BOM - mDetectedCharset = "UTF-32LE"; + mDetectedCharset = "UTF-32"; else if ('\xFE' == aBuf[1]) // FF FE UTF-16, little endian BOM - mDetectedCharset = "UTF-16LE"; + mDetectedCharset = "UTF-16"; break; } // switch @@ -172,16 +172,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) //start multibyte and singlebyte charset prober if (nsnull == mCharSetProbers[0]) - mCharSetProbers[0] = new nsMBCSGroupProber; - if (nsnull == mCharSetProbers[1]) - mCharSetProbers[1] = new nsSBCSGroupProber; - if (nsnull == mCharSetProbers[2]) - mCharSetProbers[2] = new nsLatin1Prober; - - if ((nsnull == mCharSetProbers[0]) || - (nsnull == mCharSetProbers[1]) || - (nsnull == mCharSetProbers[2])) + { + mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter); + if (nsnull == mCharSetProbers[0]) return NS_ERROR_OUT_OF_MEMORY; + } + if (nsnull == mCharSetProbers[1] && + (mLanguageFilter & NS_FILTER_NON_CJK)) + { + mCharSetProbers[1] = new nsSBCSGroupProber; + if (nsnull == mCharSetProbers[1]) + return NS_ERROR_OUT_OF_MEMORY; + } + if (nsnull == mCharSetProbers[2]) + { + mCharSetProbers[2] = new nsLatin1Prober; + if (nsnull == mCharSetProbers[2]) + return NS_ERROR_OUT_OF_MEMORY; + } } } else @@ -202,7 +210,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { case eEscAscii: if (nsnull == mEscCharSetProber) { - mEscCharSetProber = new nsEscCharSetProber; + mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter); if (nsnull == mEscCharSetProber) return NS_ERROR_OUT_OF_MEMORY; } @@ -216,12 +224,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) case eHighbyte: for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { - st = mCharSetProbers[i]->HandleData(aBuf, aLen); - if (st == eFoundIt) + if (mCharSetProbers[i]) { - mDone = PR_TRUE; - mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); - return NS_OK; + st = mCharSetProbers[i]->HandleData(aBuf, aLen); + if (st == eFoundIt) + { + mDone = PR_TRUE; + mDetectedCharset = mCharSetProbers[i]->GetCharSetName(); + return NS_OK; + } } } break; @@ -260,11 +271,14 @@ void nsUniversalDetector::DataEnd() for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { - proberConfidence = mCharSetProbers[i]->GetConfidence(); - if (proberConfidence > maxProberConfidence) + if (mCharSetProbers[i]) { - maxProberConfidence = proberConfidence; - maxProber = i; + proberConfidence = mCharSetProbers[i]->GetConfidence(); + if (proberConfidence > maxProberConfidence) + { + maxProberConfidence = proberConfidence; + maxProber = i; + } } } //do not report anything because we are not confident of it, that's in fact a negative answer diff --git a/aegisub/universalchardet/nsUniversalDetector.h b/aegisub/universalchardet/nsUniversalDetector.h index 8689576e6..525f7220f 100644 --- a/aegisub/universalchardet/nsUniversalDetector.h +++ b/aegisub/universalchardet/nsUniversalDetector.h @@ -38,8 +38,6 @@ #ifndef nsUniversalDetector_h__ #define nsUniversalDetector_h__ -#include "nscore.h" - class nsCharSetProber; #define NUM_OF_CHARSET_PROBERS 3 @@ -50,9 +48,22 @@ typedef enum { eHighbyte = 2 } nsInputState; +#define NS_FILTER_CHINESE_SIMPLIFIED 0x01 +#define NS_FILTER_CHINESE_TRADITIONAL 0x02 +#define NS_FILTER_JAPANESE 0x04 +#define NS_FILTER_KOREAN 0x08 +#define NS_FILTER_NON_CJK 0x10 +#define NS_FILTER_ALL 0x1F +#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \ + NS_FILTER_CHINESE_TRADITIONAL) +#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \ + NS_FILTER_CHINESE_TRADITIONAL | \ + NS_FILTER_JAPANESE | \ + NS_FILTER_KOREAN) + class nsUniversalDetector { public: - nsUniversalDetector(); + nsUniversalDetector(PRUint32 aLanguageFilter); virtual ~nsUniversalDetector(); virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); virtual void DataEnd(void); @@ -68,6 +79,7 @@ protected: char mLastChar; const char * mDetectedCharset; PRInt32 mBestGuess; + PRUint32 mLanguageFilter; nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; nsCharSetProber *mEscCharSetProber;