From 8dab221f8bb44c634ffda4b46d8a69e66a16ae0a Mon Sep 17 00:00:00 2001 From: Amar Takhar Date: Thu, 27 May 2010 02:20:34 +0000 Subject: [PATCH] Bring universalchardet up to 41661:ea9bbf0ff87f (2010-03-13). Originally committed to SVN as r4366. --- aegisub/universalchardet/CharDistribution.cpp | 7 ++----- aegisub/universalchardet/CharDistribution.h | 12 +++++++++--- aegisub/universalchardet/JpCntx.cpp | 7 ++++--- aegisub/universalchardet/JpCntx.h | 9 ++++++--- aegisub/universalchardet/LangBulgarianModel.cpp | 2 +- aegisub/universalchardet/LangCyrillicModel.cpp | 2 +- aegisub/universalchardet/LangGreekModel.cpp | 2 +- aegisub/universalchardet/LangHebrewModel.cpp | 2 +- aegisub/universalchardet/LangHungarianModel.cpp | 2 +- aegisub/universalchardet/LangThaiModel.cpp | 2 +- aegisub/universalchardet/nsBig5Prober.cpp | 4 ++-- aegisub/universalchardet/nsEUCJPProber.cpp | 8 ++++---- aegisub/universalchardet/nsEUCKRProber.cpp | 4 ++-- aegisub/universalchardet/nsEUCTWProber.cpp | 4 ++-- aegisub/universalchardet/nsGB2312Prober.cpp | 4 ++-- aegisub/universalchardet/nsSBCharSetProber.h | 2 +- aegisub/universalchardet/nsSJISProber.cpp | 8 ++++---- 17 files changed, 44 insertions(+), 37 deletions(-) diff --git a/aegisub/universalchardet/CharDistribution.cpp b/aegisub/universalchardet/CharDistribution.cpp index 4817c0e2f..488d9bc3c 100644 --- a/aegisub/universalchardet/CharDistribution.cpp +++ b/aegisub/universalchardet/CharDistribution.cpp @@ -46,16 +46,13 @@ #define SURE_YES 0.99f #define SURE_NO 0.01f -#define MINIMUM_DATA_THRESHOLD 4 - //return confidence base on received data -float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage) +float CharDistributionAnalysis::GetConfidence(void) { //if we didn't receive any character in our consideration range, or the // number of frequent characters is below the minimum threshold, return // negative answer - if (mTotalChars <= 0 || - !aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD) + if (mTotalChars <= 0 || mFreqChars <= mDataThreshold) return SURE_NO; if (mTotalChars != mFreqChars) { diff --git a/aegisub/universalchardet/CharDistribution.h b/aegisub/universalchardet/CharDistribution.h index 99f338ca6..453c2de5f 100644 --- a/aegisub/universalchardet/CharDistribution.h +++ b/aegisub/universalchardet/CharDistribution.h @@ -42,10 +42,12 @@ #define ENOUGH_DATA_THRESHOLD 1024 +#define MINIMUM_DATA_THRESHOLD 4 + class CharDistributionAnalysis { public: - CharDistributionAnalysis() {Reset();} + CharDistributionAnalysis() {Reset(PR_FALSE);} //feed a block of data and do distribution analysis void HandleData(const char* aBuf, PRUint32 aLen) {} @@ -71,14 +73,15 @@ public: } //return confidence base on existing data - float GetConfidence(PRBool aIsPreferredLanguage); + float GetConfidence(void); //Reset analyser, clear any state - void Reset(void) + void Reset(PRBool aIsPreferredLanguage) { mDone = PR_FALSE; mTotalChars = 0; mFreqChars = 0; + mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; } //This function is for future extension. Caller can use this function to control @@ -104,6 +107,9 @@ protected: //Total character encounted. PRUint32 mTotalChars; + //Number of hi-byte characters needed to trigger detection + PRUint32 mDataThreshold; + //Mapping table to get frequency order from char order (get from GetOrder()) const PRInt16 *mCharToFreqOrder; diff --git a/aegisub/universalchardet/JpCntx.cpp b/aegisub/universalchardet/JpCntx.cpp index 9a8cc1d54..7da041396 100644 --- a/aegisub/universalchardet/JpCntx.cpp +++ b/aegisub/universalchardet/JpCntx.cpp @@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen) return; } -void JapaneseContextAnalysis::Reset(void) +void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage) { mTotalRel = 0; for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++) @@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void) mNeedToSkipCharNum = 0; mLastCharOrder = -1; mDone = PR_FALSE; + mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD; } #define DONT_KNOW (float)-1 -float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage) +float JapaneseContextAnalysis::GetConfidence(void) { //This is just one way to calculate confidence. It works well for me. - if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD) + if (mTotalRel > mDataThreshold) return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; else return (float)DONT_KNOW; diff --git a/aegisub/universalchardet/JpCntx.h b/aegisub/universalchardet/JpCntx.h index aaa1576d9..fe8fcb8c2 100644 --- a/aegisub/universalchardet/JpCntx.h +++ b/aegisub/universalchardet/JpCntx.h @@ -51,7 +51,7 @@ extern const PRUint8 jp2CharContext[83][83]; class JapaneseContextAnalysis { public: - JapaneseContextAnalysis() {Reset();} + JapaneseContextAnalysis() {Reset(PR_FALSE);} void HandleData(const char* aBuf, PRUint32 aLen); @@ -74,8 +74,8 @@ public: mLastCharOrder = order; } - float GetConfidence(PRBool aIsPreferredLanguage); - void Reset(void); + float GetConfidence(void); + void Reset(PRBool aIsPreferredLanguage); void SetOpion(){} PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} @@ -88,6 +88,9 @@ protected: //total sequence received PRUint32 mTotalRel; + + //Number of sequences needed to trigger detection + PRUint32 mDataThreshold; //The order of previous char PRInt32 mLastCharOrder; diff --git a/aegisub/universalchardet/LangBulgarianModel.cpp b/aegisub/universalchardet/LangBulgarianModel.cpp index 79c575641..0f73282b8 100644 --- a/aegisub/universalchardet/LangBulgarianModel.cpp +++ b/aegisub/universalchardet/LangBulgarianModel.cpp @@ -94,7 +94,7 @@ static const unsigned char win1251BulgarianCharToOrderMap[] = //first 1024 sequences:3.0618% //rest sequences: 0.2992% //negative sequences: 0.0020% -static const char BulgarianLangModel[] = +static const PRUint8 BulgarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, diff --git a/aegisub/universalchardet/LangCyrillicModel.cpp b/aegisub/universalchardet/LangCyrillicModel.cpp index 180c40e5f..d8e73e8a9 100644 --- a/aegisub/universalchardet/LangCyrillicModel.cpp +++ b/aegisub/universalchardet/LangCyrillicModel.cpp @@ -167,7 +167,7 @@ static const unsigned char IBM866_CharToOrderMap[] = //first 1024 sequences: 2.3389% //rest sequences: 0.1237% //negative sequences: 0.0009% -static const char RussianLangModel[] = +static const PRUint8 RussianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, diff --git a/aegisub/universalchardet/LangGreekModel.cpp b/aegisub/universalchardet/LangGreekModel.cpp index 13ce0ae9e..30c65dc7f 100644 --- a/aegisub/universalchardet/LangGreekModel.cpp +++ b/aegisub/universalchardet/LangGreekModel.cpp @@ -93,7 +93,7 @@ static const unsigned char win1253_CharToOrderMap[] = //first 1024 sequences:1.7001% //rest sequences: 0.0359% //negative sequences: 0.0148% -static const char GreekLangModel[] = +static const PRUint8 GreekLangModel[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, diff --git a/aegisub/universalchardet/LangHebrewModel.cpp b/aegisub/universalchardet/LangHebrewModel.cpp index f86e68098..a4e10addb 100644 --- a/aegisub/universalchardet/LangHebrewModel.cpp +++ b/aegisub/universalchardet/LangHebrewModel.cpp @@ -76,7 +76,7 @@ static const unsigned char win1255_CharToOrderMap[] = //first 1024 sequences: 1.5981% //rest sequences: 0.087% //negative sequences: 0.0015% -static const char HebrewLangModel[] = +static const PRUint8 HebrewLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, diff --git a/aegisub/universalchardet/LangHungarianModel.cpp b/aegisub/universalchardet/LangHungarianModel.cpp index 876826cdd..3af2f5882 100644 --- a/aegisub/universalchardet/LangHungarianModel.cpp +++ b/aegisub/universalchardet/LangHungarianModel.cpp @@ -91,7 +91,7 @@ static const unsigned char win1250HungarianCharToOrderMap[] = //first 1024 sequences:5.2623% //rest sequences: 0.8894% //negative sequences: 0.0009% -static const char HungarianLangModel[] = +static const PRUint8 HungarianLangModel[] = { 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, diff --git a/aegisub/universalchardet/LangThaiModel.cpp b/aegisub/universalchardet/LangThaiModel.cpp index aadc2dda9..8145ffa1b 100644 --- a/aegisub/universalchardet/LangThaiModel.cpp +++ b/aegisub/universalchardet/LangThaiModel.cpp @@ -78,7 +78,7 @@ static const unsigned char TIS620CharToOrderMap[] = //first 1024 sequences:7.3177% //rest sequences: 1.0230% //negative sequences: 0.0436% -static const char ThaiLangModel[] = +static const PRUint8 ThaiLangModel[] = { 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, diff --git a/aegisub/universalchardet/nsBig5Prober.cpp b/aegisub/universalchardet/nsBig5Prober.cpp index aee5cd483..7a85abb5d 100644 --- a/aegisub/universalchardet/nsBig5Prober.cpp +++ b/aegisub/universalchardet/nsBig5Prober.cpp @@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) @@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsBig5Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsEUCJPProber.cpp b/aegisub/universalchardet/nsEUCJPProber.cpp index 35387dfdd..54861b3d2 100644 --- a/aegisub/universalchardet/nsEUCJPProber.cpp +++ b/aegisub/universalchardet/nsEUCJPProber.cpp @@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mContextAnalyser.Reset(); - mDistributionAnalyser.Reset(); + mContextAnalyser.Reset(mIsPreferredLanguage); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) @@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCJPProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float contxtCf = mContextAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(); return (contxtCf > distribCf ? contxtCf : distribCf); } diff --git a/aegisub/universalchardet/nsEUCKRProber.cpp b/aegisub/universalchardet/nsEUCKRProber.cpp index 396b09527..3632f1f90 100644 --- a/aegisub/universalchardet/nsEUCKRProber.cpp +++ b/aegisub/universalchardet/nsEUCKRProber.cpp @@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCKRProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsEUCTWProber.cpp b/aegisub/universalchardet/nsEUCTWProber.cpp index 710e413eb..a06e074b3 100644 --- a/aegisub/universalchardet/nsEUCTWProber.cpp +++ b/aegisub/universalchardet/nsEUCTWProber.cpp @@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen) float nsEUCTWProber::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsGB2312Prober.cpp b/aegisub/universalchardet/nsGB2312Prober.cpp index 95374c312..b6d469cef 100644 --- a/aegisub/universalchardet/nsGB2312Prober.cpp +++ b/aegisub/universalchardet/nsGB2312Prober.cpp @@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mDistributionAnalyser.Reset(); + mDistributionAnalyser.Reset(mIsPreferredLanguage); //mContextAnalyser.Reset(); } @@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen) float nsGB18030Prober::GetConfidence(void) { - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float distribCf = mDistributionAnalyser.GetConfidence(); return (float)distribCf; } diff --git a/aegisub/universalchardet/nsSBCharSetProber.h b/aegisub/universalchardet/nsSBCharSetProber.h index b629feb34..d7180dcdf 100644 --- a/aegisub/universalchardet/nsSBCharSetProber.h +++ b/aegisub/universalchardet/nsSBCharSetProber.h @@ -52,7 +52,7 @@ typedef struct { const unsigned char* const charToOrderMap; // [256] table use to find a char's order - const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency + const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency float mTypicalPositiveRatio; // = freqSeqs / totalSeqs PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) const char* const charsetName; diff --git a/aegisub/universalchardet/nsSJISProber.cpp b/aegisub/universalchardet/nsSJISProber.cpp index 5b7e7fddc..c7842f6a4 100644 --- a/aegisub/universalchardet/nsSJISProber.cpp +++ b/aegisub/universalchardet/nsSJISProber.cpp @@ -46,8 +46,8 @@ void nsSJISProber::Reset(void) { mCodingSM->Reset(); mState = eDetecting; - mContextAnalyser.Reset(); - mDistributionAnalyser.Reset(); + mContextAnalyser.Reset(mIsPreferredLanguage); + mDistributionAnalyser.Reset(mIsPreferredLanguage); } nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) @@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) float nsSJISProber::GetConfidence(void) { - float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); - float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); + float contxtCf = mContextAnalyser.GetConfidence(); + float distribCf = mDistributionAnalyser.GetConfidence(); return (contxtCf > distribCf ? contxtCf : distribCf); }