Bring universalchardet up to 41661:ea9bbf0ff87f (2010-03-13).

Originally committed to SVN as r4366.
This commit is contained in:
Amar Takhar 2010-05-27 02:20:34 +00:00
parent 6df5d97568
commit 8dab221f8b
17 changed files with 44 additions and 37 deletions

View file

@ -46,16 +46,13 @@
#define SURE_YES 0.99f #define SURE_YES 0.99f
#define SURE_NO 0.01f #define SURE_NO 0.01f
#define MINIMUM_DATA_THRESHOLD 4
//return confidence base on received data //return confidence base on received data
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage) float CharDistributionAnalysis::GetConfidence(void)
{ {
//if we didn't receive any character in our consideration range, or the //if we didn't receive any character in our consideration range, or the
// number of frequent characters is below the minimum threshold, return // number of frequent characters is below the minimum threshold, return
// negative answer // negative answer
if (mTotalChars <= 0 || if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO; return SURE_NO;
if (mTotalChars != mFreqChars) { if (mTotalChars != mFreqChars) {

View file

@ -42,10 +42,12 @@
#define ENOUGH_DATA_THRESHOLD 1024 #define ENOUGH_DATA_THRESHOLD 1024
#define MINIMUM_DATA_THRESHOLD 4
class CharDistributionAnalysis class CharDistributionAnalysis
{ {
public: public:
CharDistributionAnalysis() {Reset();} CharDistributionAnalysis() {Reset(PR_FALSE);}
//feed a block of data and do distribution analysis //feed a block of data and do distribution analysis
void HandleData(const char* aBuf, PRUint32 aLen) {} void HandleData(const char* aBuf, PRUint32 aLen) {}
@ -71,14 +73,15 @@ public:
} }
//return confidence base on existing data //return confidence base on existing data
float GetConfidence(PRBool aIsPreferredLanguage); float GetConfidence(void);
//Reset analyser, clear any state //Reset analyser, clear any state
void Reset(void) void Reset(PRBool aIsPreferredLanguage)
{ {
mDone = PR_FALSE; mDone = PR_FALSE;
mTotalChars = 0; mTotalChars = 0;
mFreqChars = 0; mFreqChars = 0;
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
} }
//This function is for future extension. Caller can use this function to control //This function is for future extension. Caller can use this function to control
@ -104,6 +107,9 @@ protected:
//Total character encounted. //Total character encounted.
PRUint32 mTotalChars; PRUint32 mTotalChars;
//Number of hi-byte characters needed to trigger detection
PRUint32 mDataThreshold;
//Mapping table to get frequency order from char order (get from GetOrder()) //Mapping table to get frequency order from char order (get from GetOrder())
const PRInt16 *mCharToFreqOrder; const PRInt16 *mCharToFreqOrder;

View file

@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
return; return;
} }
void JapaneseContextAnalysis::Reset(void) void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
{ {
mTotalRel = 0; mTotalRel = 0;
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++) for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
mNeedToSkipCharNum = 0; mNeedToSkipCharNum = 0;
mLastCharOrder = -1; mLastCharOrder = -1;
mDone = PR_FALSE; mDone = PR_FALSE;
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
} }
#define DONT_KNOW (float)-1 #define DONT_KNOW (float)-1
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage) float JapaneseContextAnalysis::GetConfidence(void)
{ {
//This is just one way to calculate confidence. It works well for me. //This is just one way to calculate confidence. It works well for me.
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD) if (mTotalRel > mDataThreshold)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else else
return (float)DONT_KNOW; return (float)DONT_KNOW;

View file

@ -51,7 +51,7 @@ extern const PRUint8 jp2CharContext[83][83];
class JapaneseContextAnalysis class JapaneseContextAnalysis
{ {
public: public:
JapaneseContextAnalysis() {Reset();} JapaneseContextAnalysis() {Reset(PR_FALSE);}
void HandleData(const char* aBuf, PRUint32 aLen); void HandleData(const char* aBuf, PRUint32 aLen);
@ -74,8 +74,8 @@ public:
mLastCharOrder = order; mLastCharOrder = order;
} }
float GetConfidence(PRBool aIsPreferredLanguage); float GetConfidence(void);
void Reset(void); void Reset(PRBool aIsPreferredLanguage);
void SetOpion(){} void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;} PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
@ -88,6 +88,9 @@ protected:
//total sequence received //total sequence received
PRUint32 mTotalRel; PRUint32 mTotalRel;
//Number of sequences needed to trigger detection
PRUint32 mDataThreshold;
//The order of previous char //The order of previous char
PRInt32 mLastCharOrder; PRInt32 mLastCharOrder;

View file

@ -94,7 +94,7 @@ static const unsigned char win1251BulgarianCharToOrderMap[] =
//first 1024 sequences:3.0618% //first 1024 sequences:3.0618%
//rest sequences: 0.2992% //rest sequences: 0.2992%
//negative sequences: 0.0020% //negative sequences: 0.0020%
static const char BulgarianLangModel[] = static const PRUint8 BulgarianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,

View file

@ -167,7 +167,7 @@ static const unsigned char IBM866_CharToOrderMap[] =
//first 1024 sequences: 2.3389% //first 1024 sequences: 2.3389%
//rest sequences: 0.1237% //rest sequences: 0.1237%
//negative sequences: 0.0009% //negative sequences: 0.0009%
static const char RussianLangModel[] = static const PRUint8 RussianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,

View file

@ -93,7 +93,7 @@ static const unsigned char win1253_CharToOrderMap[] =
//first 1024 sequences:1.7001% //first 1024 sequences:1.7001%
//rest sequences: 0.0359% //rest sequences: 0.0359%
//negative sequences: 0.0148% //negative sequences: 0.0148%
static const char GreekLangModel[] = static const PRUint8 GreekLangModel[] =
{ {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

View file

@ -76,7 +76,7 @@ static const unsigned char win1255_CharToOrderMap[] =
//first 1024 sequences: 1.5981% //first 1024 sequences: 1.5981%
//rest sequences: 0.087% //rest sequences: 0.087%
//negative sequences: 0.0015% //negative sequences: 0.0015%
static const char HebrewLangModel[] = static const PRUint8 HebrewLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,

View file

@ -91,7 +91,7 @@ static const unsigned char win1250HungarianCharToOrderMap[] =
//first 1024 sequences:5.2623% //first 1024 sequences:5.2623%
//rest sequences: 0.8894% //rest sequences: 0.8894%
//negative sequences: 0.0009% //negative sequences: 0.0009%
static const char HungarianLangModel[] = static const PRUint8 HungarianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,

View file

@ -78,7 +78,7 @@ static const unsigned char TIS620CharToOrderMap[] =
//first 1024 sequences:7.3177% //first 1024 sequences:7.3177%
//rest sequences: 1.0230% //rest sequences: 1.0230%
//negative sequences: 0.0436% //negative sequences: 0.0436%
static const char ThaiLangModel[] = static const PRUint8 ThaiLangModel[] =
{ {
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,

View file

@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
} }
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen) nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsBig5Prober::GetConfidence(void) float nsBig5Prober::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (float)distribCf; return (float)distribCf;
} }

View file

@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mContextAnalyser.Reset(); mContextAnalyser.Reset(mIsPreferredLanguage);
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
} }
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen) nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCJPProber::GetConfidence(void) float nsEUCJPProber::GetConfidence(void)
{ {
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (contxtCf > distribCf ? contxtCf : distribCf); return (contxtCf > distribCf ? contxtCf : distribCf);
} }

View file

@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset(); //mContextAnalyser.Reset();
} }
@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCKRProber::GetConfidence(void) float nsEUCKRProber::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (float)distribCf; return (float)distribCf;
} }

View file

@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset(); //mContextAnalyser.Reset();
} }
@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCTWProber::GetConfidence(void) float nsEUCTWProber::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (float)distribCf; return (float)distribCf;
} }

View file

@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
//mContextAnalyser.Reset(); //mContextAnalyser.Reset();
} }
@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsGB18030Prober::GetConfidence(void) float nsGB18030Prober::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (float)distribCf; return (float)distribCf;
} }

View file

@ -52,7 +52,7 @@
typedef struct typedef struct
{ {
const unsigned char* const charToOrderMap; // [256] table use to find a char's order const unsigned char* const charToOrderMap; // [256] table use to find a char's order
const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* const charsetName; const char* const charsetName;

View file

@ -46,8 +46,8 @@ void nsSJISProber::Reset(void)
{ {
mCodingSM->Reset(); mCodingSM->Reset();
mState = eDetecting; mState = eDetecting;
mContextAnalyser.Reset(); mContextAnalyser.Reset(mIsPreferredLanguage);
mDistributionAnalyser.Reset(); mDistributionAnalyser.Reset(mIsPreferredLanguage);
} }
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen) nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsSJISProber::GetConfidence(void) float nsSJISProber::GetConfidence(void)
{ {
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage); float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage); float distribCf = mDistributionAnalyser.GetConfidence();
return (contxtCf > distribCf ? contxtCf : distribCf); return (contxtCf > distribCf ? contxtCf : distribCf);
} }