Bring universalchardet up to 41661:ea9bbf0ff87f (2010-03-13).
Originally committed to SVN as r4366.
This commit is contained in:
parent
6df5d97568
commit
8dab221f8b
17 changed files with 44 additions and 37 deletions
|
@ -46,16 +46,13 @@
|
||||||
#define SURE_YES 0.99f
|
#define SURE_YES 0.99f
|
||||||
#define SURE_NO 0.01f
|
#define SURE_NO 0.01f
|
||||||
|
|
||||||
#define MINIMUM_DATA_THRESHOLD 4
|
|
||||||
|
|
||||||
//return confidence base on received data
|
//return confidence base on received data
|
||||||
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
float CharDistributionAnalysis::GetConfidence(void)
|
||||||
{
|
{
|
||||||
//if we didn't receive any character in our consideration range, or the
|
//if we didn't receive any character in our consideration range, or the
|
||||||
// number of frequent characters is below the minimum threshold, return
|
// number of frequent characters is below the minimum threshold, return
|
||||||
// negative answer
|
// negative answer
|
||||||
if (mTotalChars <= 0 ||
|
if (mTotalChars <= 0 || mFreqChars <= mDataThreshold)
|
||||||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
|
||||||
return SURE_NO;
|
return SURE_NO;
|
||||||
|
|
||||||
if (mTotalChars != mFreqChars) {
|
if (mTotalChars != mFreqChars) {
|
||||||
|
|
|
@ -42,10 +42,12 @@
|
||||||
|
|
||||||
#define ENOUGH_DATA_THRESHOLD 1024
|
#define ENOUGH_DATA_THRESHOLD 1024
|
||||||
|
|
||||||
|
#define MINIMUM_DATA_THRESHOLD 4
|
||||||
|
|
||||||
class CharDistributionAnalysis
|
class CharDistributionAnalysis
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
CharDistributionAnalysis() {Reset();}
|
CharDistributionAnalysis() {Reset(PR_FALSE);}
|
||||||
|
|
||||||
//feed a block of data and do distribution analysis
|
//feed a block of data and do distribution analysis
|
||||||
void HandleData(const char* aBuf, PRUint32 aLen) {}
|
void HandleData(const char* aBuf, PRUint32 aLen) {}
|
||||||
|
@ -71,14 +73,15 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
//return confidence base on existing data
|
//return confidence base on existing data
|
||||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
float GetConfidence(void);
|
||||||
|
|
||||||
//Reset analyser, clear any state
|
//Reset analyser, clear any state
|
||||||
void Reset(void)
|
void Reset(PRBool aIsPreferredLanguage)
|
||||||
{
|
{
|
||||||
mDone = PR_FALSE;
|
mDone = PR_FALSE;
|
||||||
mTotalChars = 0;
|
mTotalChars = 0;
|
||||||
mFreqChars = 0;
|
mFreqChars = 0;
|
||||||
|
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||||
}
|
}
|
||||||
|
|
||||||
//This function is for future extension. Caller can use this function to control
|
//This function is for future extension. Caller can use this function to control
|
||||||
|
@ -104,6 +107,9 @@ protected:
|
||||||
//Total character encounted.
|
//Total character encounted.
|
||||||
PRUint32 mTotalChars;
|
PRUint32 mTotalChars;
|
||||||
|
|
||||||
|
//Number of hi-byte characters needed to trigger detection
|
||||||
|
PRUint32 mDataThreshold;
|
||||||
|
|
||||||
//Mapping table to get frequency order from char order (get from GetOrder())
|
//Mapping table to get frequency order from char order (get from GetOrder())
|
||||||
const PRInt16 *mCharToFreqOrder;
|
const PRInt16 *mCharToFreqOrder;
|
||||||
|
|
||||||
|
|
|
@ -170,7 +170,7 @@ void JapaneseContextAnalysis::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
void JapaneseContextAnalysis::Reset(void)
|
void JapaneseContextAnalysis::Reset(PRBool aIsPreferredLanguage)
|
||||||
{
|
{
|
||||||
mTotalRel = 0;
|
mTotalRel = 0;
|
||||||
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
|
for (PRUint32 i = 0; i < NUM_OF_CATEGORY; i++)
|
||||||
|
@ -178,13 +178,14 @@ void JapaneseContextAnalysis::Reset(void)
|
||||||
mNeedToSkipCharNum = 0;
|
mNeedToSkipCharNum = 0;
|
||||||
mLastCharOrder = -1;
|
mLastCharOrder = -1;
|
||||||
mDone = PR_FALSE;
|
mDone = PR_FALSE;
|
||||||
|
mDataThreshold = aIsPreferredLanguage ? 0 : MINIMUM_DATA_THRESHOLD;
|
||||||
}
|
}
|
||||||
#define DONT_KNOW (float)-1
|
#define DONT_KNOW (float)-1
|
||||||
|
|
||||||
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
float JapaneseContextAnalysis::GetConfidence(void)
|
||||||
{
|
{
|
||||||
//This is just one way to calculate confidence. It works well for me.
|
//This is just one way to calculate confidence. It works well for me.
|
||||||
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
|
if (mTotalRel > mDataThreshold)
|
||||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||||
else
|
else
|
||||||
return (float)DONT_KNOW;
|
return (float)DONT_KNOW;
|
||||||
|
|
|
@ -51,7 +51,7 @@ extern const PRUint8 jp2CharContext[83][83];
|
||||||
class JapaneseContextAnalysis
|
class JapaneseContextAnalysis
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
JapaneseContextAnalysis() {Reset();}
|
JapaneseContextAnalysis() {Reset(PR_FALSE);}
|
||||||
|
|
||||||
void HandleData(const char* aBuf, PRUint32 aLen);
|
void HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
|
|
||||||
|
@ -74,8 +74,8 @@ public:
|
||||||
mLastCharOrder = order;
|
mLastCharOrder = order;
|
||||||
}
|
}
|
||||||
|
|
||||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
float GetConfidence(void);
|
||||||
void Reset(void);
|
void Reset(PRBool aIsPreferredLanguage);
|
||||||
void SetOpion(){}
|
void SetOpion(){}
|
||||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||||
|
|
||||||
|
@ -88,6 +88,9 @@ protected:
|
||||||
|
|
||||||
//total sequence received
|
//total sequence received
|
||||||
PRUint32 mTotalRel;
|
PRUint32 mTotalRel;
|
||||||
|
|
||||||
|
//Number of sequences needed to trigger detection
|
||||||
|
PRUint32 mDataThreshold;
|
||||||
|
|
||||||
//The order of previous char
|
//The order of previous char
|
||||||
PRInt32 mLastCharOrder;
|
PRInt32 mLastCharOrder;
|
||||||
|
|
|
@ -94,7 +94,7 @@ static const unsigned char win1251BulgarianCharToOrderMap[] =
|
||||||
//first 1024 sequences:3.0618%
|
//first 1024 sequences:3.0618%
|
||||||
//rest sequences: 0.2992%
|
//rest sequences: 0.2992%
|
||||||
//negative sequences: 0.0020%
|
//negative sequences: 0.0020%
|
||||||
static const char BulgarianLangModel[] =
|
static const PRUint8 BulgarianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||||
|
|
|
@ -167,7 +167,7 @@ static const unsigned char IBM866_CharToOrderMap[] =
|
||||||
//first 1024 sequences: 2.3389%
|
//first 1024 sequences: 2.3389%
|
||||||
//rest sequences: 0.1237%
|
//rest sequences: 0.1237%
|
||||||
//negative sequences: 0.0009%
|
//negative sequences: 0.0009%
|
||||||
static const char RussianLangModel[] =
|
static const PRUint8 RussianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||||
|
|
|
@ -93,7 +93,7 @@ static const unsigned char win1253_CharToOrderMap[] =
|
||||||
//first 1024 sequences:1.7001%
|
//first 1024 sequences:1.7001%
|
||||||
//rest sequences: 0.0359%
|
//rest sequences: 0.0359%
|
||||||
//negative sequences: 0.0148%
|
//negative sequences: 0.0148%
|
||||||
static const char GreekLangModel[] =
|
static const PRUint8 GreekLangModel[] =
|
||||||
{
|
{
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
|
|
@ -76,7 +76,7 @@ static const unsigned char win1255_CharToOrderMap[] =
|
||||||
//first 1024 sequences: 1.5981%
|
//first 1024 sequences: 1.5981%
|
||||||
//rest sequences: 0.087%
|
//rest sequences: 0.087%
|
||||||
//negative sequences: 0.0015%
|
//negative sequences: 0.0015%
|
||||||
static const char HebrewLangModel[] =
|
static const PRUint8 HebrewLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||||
|
|
|
@ -91,7 +91,7 @@ static const unsigned char win1250HungarianCharToOrderMap[] =
|
||||||
//first 1024 sequences:5.2623%
|
//first 1024 sequences:5.2623%
|
||||||
//rest sequences: 0.8894%
|
//rest sequences: 0.8894%
|
||||||
//negative sequences: 0.0009%
|
//negative sequences: 0.0009%
|
||||||
static const char HungarianLangModel[] =
|
static const PRUint8 HungarianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||||
|
|
|
@ -78,7 +78,7 @@ static const unsigned char TIS620CharToOrderMap[] =
|
||||||
//first 1024 sequences:7.3177%
|
//first 1024 sequences:7.3177%
|
||||||
//rest sequences: 1.0230%
|
//rest sequences: 1.0230%
|
||||||
//negative sequences: 0.0436%
|
//negative sequences: 0.0436%
|
||||||
static const char ThaiLangModel[] =
|
static const PRUint8 ThaiLangModel[] =
|
||||||
{
|
{
|
||||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||||
|
|
|
@ -41,7 +41,7 @@ void nsBig5Prober::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
}
|
}
|
||||||
|
|
||||||
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
@ -81,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsBig5Prober::GetConfidence(void)
|
float nsBig5Prober::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,8 +46,8 @@ void nsEUCJPProber::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mContextAnalyser.Reset();
|
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
}
|
}
|
||||||
|
|
||||||
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
@ -91,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCJPProber::GetConfidence(void)
|
float nsEUCJPProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
float contxtCf = mContextAnalyser.GetConfidence();
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ void nsEUCKRProber::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
//mContextAnalyser.Reset();
|
//mContextAnalyser.Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCKRProber::GetConfidence(void)
|
float nsEUCKRProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ void nsEUCTWProber::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
//mContextAnalyser.Reset();
|
//mContextAnalyser.Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -84,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCTWProber::GetConfidence(void)
|
float nsEUCTWProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ void nsGB18030Prober::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
//mContextAnalyser.Reset();
|
//mContextAnalyser.Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsGB18030Prober::GetConfidence(void)
|
float nsGB18030Prober::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
||||||
const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
const PRUint8* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||||
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
||||||
const char* const charsetName;
|
const char* const charsetName;
|
||||||
|
|
|
@ -46,8 +46,8 @@ void nsSJISProber::Reset(void)
|
||||||
{
|
{
|
||||||
mCodingSM->Reset();
|
mCodingSM->Reset();
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
mContextAnalyser.Reset();
|
mContextAnalyser.Reset(mIsPreferredLanguage);
|
||||||
mDistributionAnalyser.Reset();
|
mDistributionAnalyser.Reset(mIsPreferredLanguage);
|
||||||
}
|
}
|
||||||
|
|
||||||
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
@ -90,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsSJISProber::GetConfidence(void)
|
float nsSJISProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
float contxtCf = mContextAnalyser.GetConfidence();
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||||
|
|
||||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue