Update universalchardet using a patch I made around 2009-02; the version we're currently using is from ~1998. I'll check again later to see if there are any updates to it before closing the ticket. Updates #866.
Originally committed to SVN as r3653.
This commit is contained in:
parent
05c9ffde7a
commit
42e0dd6ce4
43 changed files with 324 additions and 458 deletions
|
@ -106,17 +106,14 @@ wxString CharSetDetect::GetEncoding(wxString filename) {
|
||||||
bool gotLocal = false;
|
bool gotLocal = false;
|
||||||
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
||||||
if (mCharSetProbers[i]) {
|
if (mCharSetProbers[i]) {
|
||||||
int probes = mCharSetProbers[i]->GetProbeCount();
|
float conf = mCharSetProbers[i]->GetConfidence();
|
||||||
for (int j=0;j<probes;j++) {
|
|
||||||
float conf = mCharSetProbers[i]->GetConfidence(j);
|
|
||||||
|
|
||||||
// Only bother with those whose confidence is at least 1%
|
// Only bother with those whose confidence is at least 1%
|
||||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j),wxConvUTF8);
|
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
|
||||||
if (conf > 0.01f || curName == local) {
|
if (conf > 0.01f || curName == local) {
|
||||||
results.push_back(CharDetResult());
|
results.push_back(CharDetResult());
|
||||||
results.back().name = curName;
|
results.back().name = curName;
|
||||||
results.back().confidence = mCharSetProbers[i]->GetConfidence(j);
|
results.back().confidence = conf;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -38,7 +38,9 @@
|
||||||
|
|
||||||
///////////
|
///////////
|
||||||
// Headers
|
// Headers
|
||||||
|
#include "../universalchardet/nscore.h"
|
||||||
#include "../universalchardet/nsUniversalDetector.h"
|
#include "../universalchardet/nsUniversalDetector.h"
|
||||||
|
#include "../universalchardet/nsMBCSGroupProber.h"
|
||||||
|
|
||||||
|
|
||||||
/// DOCME
|
/// DOCME
|
||||||
|
@ -54,6 +56,7 @@ private:
|
||||||
void Report(const char* aCharset);
|
void Report(const char* aCharset);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
|
||||||
wxString GetEncoding(wxString filename);
|
wxString GetEncoding(wxString filename);
|
||||||
|
|
||||||
/// @brief DOCME
|
/// @brief DOCME
|
||||||
|
|
|
@ -49,12 +49,13 @@
|
||||||
#define MINIMUM_DATA_THRESHOLD 4
|
#define MINIMUM_DATA_THRESHOLD 4
|
||||||
|
|
||||||
//return confidence base on received data
|
//return confidence base on received data
|
||||||
float CharDistributionAnalysis::GetConfidence()
|
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||||
{
|
{
|
||||||
//if we didn't receive any character in our consideration range, or the
|
//if we didn't receive any character in our consideration range, or the
|
||||||
// number of frequent characters is below the minimum threshold, return
|
// number of frequent characters is below the minimum threshold, return
|
||||||
// negative answer
|
// negative answer
|
||||||
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
if (mTotalChars <= 0 ||
|
||||||
|
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||||
return SURE_NO;
|
return SURE_NO;
|
||||||
|
|
||||||
if (mTotalChars != mFreqChars) {
|
if (mTotalChars != mFreqChars) {
|
||||||
|
|
|
@ -69,10 +69,10 @@ public:
|
||||||
mFreqChars++;
|
mFreqChars++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
}
|
||||||
|
|
||||||
//return confidence base on existing data
|
//return confidence base on existing data
|
||||||
float GetConfidence();
|
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||||
|
|
||||||
//Reset analyser, clear any state
|
//Reset analyser, clear any state
|
||||||
void Reset(void)
|
void Reset(void)
|
||||||
|
@ -80,21 +80,21 @@ public:
|
||||||
mDone = PR_FALSE;
|
mDone = PR_FALSE;
|
||||||
mTotalChars = 0;
|
mTotalChars = 0;
|
||||||
mFreqChars = 0;
|
mFreqChars = 0;
|
||||||
};
|
}
|
||||||
|
|
||||||
//This function is for future extension. Caller can use this function to control
|
//This function is for future extension. Caller can use this function to control
|
||||||
//analyser's behavior
|
//analyser's behavior
|
||||||
void SetOpion(){};
|
void SetOpion(){}
|
||||||
|
|
||||||
//It is not necessary to receive all data to draw conclusion. For charset detection,
|
//It is not necessary to receive all data to draw conclusion. For charset detection,
|
||||||
// certain amount of data is enough
|
// certain amount of data is enough
|
||||||
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
|
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
//we do not handle character base on its original encoding string, but
|
//we do not handle character base on its original encoding string, but
|
||||||
//convert this encoding string to a number, here called order.
|
//convert this encoding string to a number, here called order.
|
||||||
//This allow multiple encoding of a language to share one frequency table
|
//This allow multiple encoding of a language to share one frequency table
|
||||||
virtual PRInt32 GetOrder(const char* str) {return -1;};
|
virtual PRInt32 GetOrder(const char* str) {return -1;}
|
||||||
|
|
||||||
//If this flag is set to PR_TRUE, detection is done and conclusion has been made
|
//If this flag is set to PR_TRUE, detection is done and conclusion has been made
|
||||||
PRBool mDone;
|
PRBool mDone;
|
||||||
|
@ -132,7 +132,7 @@ protected:
|
||||||
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
|
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -150,7 +150,7 @@ protected:
|
||||||
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class GB2312DistributionAnalysis : public CharDistributionAnalysis
|
class GB2312DistributionAnalysis : public CharDistributionAnalysis
|
||||||
|
@ -167,7 +167,7 @@ protected:
|
||||||
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -188,7 +188,7 @@ protected:
|
||||||
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
|
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class SJISDistributionAnalysis : public CharDistributionAnalysis
|
class SJISDistributionAnalysis : public CharDistributionAnalysis
|
||||||
|
@ -213,7 +213,7 @@ protected:
|
||||||
if ((unsigned char)str[1] > (unsigned char)0x7f)
|
if ((unsigned char)str[1] > (unsigned char)0x7f)
|
||||||
order--;
|
order--;
|
||||||
return order;
|
return order;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class EUCJPDistributionAnalysis : public CharDistributionAnalysis
|
class EUCJPDistributionAnalysis : public CharDistributionAnalysis
|
||||||
|
@ -230,7 +230,7 @@ protected:
|
||||||
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
|
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||||
else
|
else
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif //CharDistribution_h__
|
#endif //CharDistribution_h__
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#include "JpCntx.h"
|
#include "JpCntx.h"
|
||||||
|
|
||||||
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||||
char jp2CharContext[83][83] =
|
const char jp2CharContext[83][83] =
|
||||||
{
|
{
|
||||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||||
|
@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
|
||||||
}
|
}
|
||||||
#define DONT_KNOW (float)-1
|
#define DONT_KNOW (float)-1
|
||||||
|
|
||||||
float JapaneseContextAnalysis::GetConfidence()
|
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||||
{
|
{
|
||||||
//This is just one way to calculate confidence. It works well for me.
|
//This is just one way to calculate confidence. It works well for me.
|
||||||
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
|
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||||
else
|
else
|
||||||
return (float)DONT_KNOW;
|
return (float)DONT_KNOW;
|
||||||
|
@ -227,5 +227,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
|
||||||
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -73,12 +73,12 @@ public:
|
||||||
mRelSample[jp2CharContext[mLastCharOrder][order]]++;
|
mRelSample[jp2CharContext[mLastCharOrder][order]]++;
|
||||||
}
|
}
|
||||||
mLastCharOrder = order;
|
mLastCharOrder = order;
|
||||||
};
|
}
|
||||||
|
|
||||||
float GetConfidence();
|
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
void SetOpion(){};
|
void SetOpion(){}
|
||||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;};
|
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
|
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
|
||||||
|
@ -116,7 +116,7 @@ protected:
|
||||||
(unsigned char)*(str+1) <= (unsigned char)0xf1)
|
(unsigned char)*(str+1) <= (unsigned char)0xf1)
|
||||||
return (unsigned char)*(str+1) - (unsigned char)0x9f;
|
return (unsigned char)*(str+1) - (unsigned char)0x9f;
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
class EUCJPContextAnalysis : public JapaneseContextAnalysis
|
class EUCJPContextAnalysis : public JapaneseContextAnalysis
|
||||||
|
@ -131,7 +131,7 @@ protected:
|
||||||
(unsigned char)*(str+1) <= (unsigned char)0xf3)
|
(unsigned char)*(str+1) <= (unsigned char)0xf3)
|
||||||
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
||||||
return -1;
|
return -1;
|
||||||
};
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif /* __JPCNTX_H__ */
|
#endif /* __JPCNTX_H__ */
|
||||||
|
|
|
@ -48,7 +48,7 @@
|
||||||
//this talbe is modified base on win1251BulgarianCharToOrderMap, so
|
//this talbe is modified base on win1251BulgarianCharToOrderMap, so
|
||||||
//only number <64 is sure valid
|
//only number <64 is sure valid
|
||||||
|
|
||||||
unsigned char Latin5_BulgarianCharToOrderMap[] =
|
static const unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -68,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char win1251BulgarianCharToOrderMap[] =
|
static const unsigned char win1251BulgarianCharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -94,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
|
||||||
//first 1024 sequences:3.0618%
|
//first 1024 sequences:3.0618%
|
||||||
//rest sequences: 0.2992%
|
//rest sequences: 0.2992%
|
||||||
//negative sequences: 0.0020%
|
//negative sequences: 0.0020%
|
||||||
char BulgarianLangModel[] =
|
static const char BulgarianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||||
|
@ -226,7 +226,7 @@ char BulgarianLangModel[] =
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Latin5BulgarianModel =
|
const SequenceModel Latin5BulgarianModel =
|
||||||
{
|
{
|
||||||
Latin5_BulgarianCharToOrderMap,
|
Latin5_BulgarianCharToOrderMap,
|
||||||
BulgarianLangModel,
|
BulgarianLangModel,
|
||||||
|
@ -235,7 +235,7 @@ SequenceModel Latin5BulgarianModel =
|
||||||
"ISO-8859-5"
|
"ISO-8859-5"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Win1251BulgarianModel =
|
const SequenceModel Win1251BulgarianModel =
|
||||||
{
|
{
|
||||||
win1251BulgarianCharToOrderMap,
|
win1251BulgarianCharToOrderMap,
|
||||||
BulgarianLangModel,
|
BulgarianLangModel,
|
||||||
|
|
|
@ -41,7 +41,7 @@
|
||||||
|
|
||||||
//KOI8-R language model
|
//KOI8-R language model
|
||||||
//Character Mapping Table:
|
//Character Mapping Table:
|
||||||
unsigned char KOI8R_CharToOrderMap[] =
|
static const unsigned char KOI8R_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -61,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
|
||||||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char win1251_CharToOrderMap[] =
|
static const unsigned char win1251_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -81,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
|
||||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char latin5_CharToOrderMap[] =
|
static const unsigned char latin5_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -101,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
|
||||||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char macCyrillic_CharToOrderMap[] =
|
static const unsigned char macCyrillic_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -121,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
|
||||||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char IBM855_CharToOrderMap[] =
|
static const unsigned char IBM855_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -141,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
|
||||||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char IBM866_CharToOrderMap[] =
|
static const unsigned char IBM866_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -167,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
|
||||||
//first 1024 sequences: 2.3389%
|
//first 1024 sequences: 2.3389%
|
||||||
//rest sequences: 0.1237%
|
//rest sequences: 0.1237%
|
||||||
//negative sequences: 0.0009%
|
//negative sequences: 0.0009%
|
||||||
char RussianLangModel[] =
|
static const char RussianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||||
|
@ -300,7 +300,7 @@ char RussianLangModel[] =
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
SequenceModel Koi8rModel =
|
const SequenceModel Koi8rModel =
|
||||||
{
|
{
|
||||||
KOI8R_CharToOrderMap,
|
KOI8R_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
@ -309,7 +309,7 @@ SequenceModel Koi8rModel =
|
||||||
"KOI8-R"
|
"KOI8-R"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Win1251Model =
|
const SequenceModel Win1251Model =
|
||||||
{
|
{
|
||||||
win1251_CharToOrderMap,
|
win1251_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
@ -318,7 +318,7 @@ SequenceModel Win1251Model =
|
||||||
"windows-1251"
|
"windows-1251"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Latin5Model =
|
const SequenceModel Latin5Model =
|
||||||
{
|
{
|
||||||
latin5_CharToOrderMap,
|
latin5_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
@ -327,7 +327,7 @@ SequenceModel Latin5Model =
|
||||||
"ISO-8859-5"
|
"ISO-8859-5"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel MacCyrillicModel =
|
const SequenceModel MacCyrillicModel =
|
||||||
{
|
{
|
||||||
macCyrillic_CharToOrderMap,
|
macCyrillic_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
@ -336,7 +336,7 @@ SequenceModel MacCyrillicModel =
|
||||||
"x-mac-cyrillic"
|
"x-mac-cyrillic"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Ibm866Model =
|
const SequenceModel Ibm866Model =
|
||||||
{
|
{
|
||||||
IBM866_CharToOrderMap,
|
IBM866_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
@ -345,7 +345,7 @@ SequenceModel Ibm866Model =
|
||||||
"IBM866"
|
"IBM866"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Ibm855Model =
|
const SequenceModel Ibm855Model =
|
||||||
{
|
{
|
||||||
IBM855_CharToOrderMap,
|
IBM855_CharToOrderMap,
|
||||||
RussianLangModel,
|
RussianLangModel,
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
*****************************************************************/
|
*****************************************************************/
|
||||||
|
|
||||||
//Character Mapping Table:
|
//Character Mapping Table:
|
||||||
unsigned char Latin7_CharToOrderMap[] =
|
static const unsigned char Latin7_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -67,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
unsigned char win1253_CharToOrderMap[] =
|
static const unsigned char win1253_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -93,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
|
||||||
//first 1024 sequences:1.7001%
|
//first 1024 sequences:1.7001%
|
||||||
//rest sequences: 0.0359%
|
//rest sequences: 0.0359%
|
||||||
//negative sequences: 0.0148%
|
//negative sequences: 0.0148%
|
||||||
char GreekLangModel[] =
|
static const char GreekLangModel[] =
|
||||||
{
|
{
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
|
@ -225,7 +225,7 @@ char GreekLangModel[] =
|
||||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Latin7Model =
|
const SequenceModel Latin7Model =
|
||||||
{
|
{
|
||||||
Latin7_CharToOrderMap,
|
Latin7_CharToOrderMap,
|
||||||
GreekLangModel,
|
GreekLangModel,
|
||||||
|
@ -234,7 +234,7 @@ SequenceModel Latin7Model =
|
||||||
"ISO-8859-7"
|
"ISO-8859-7"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Win1253Model =
|
const SequenceModel Win1253Model =
|
||||||
{
|
{
|
||||||
win1253_CharToOrderMap,
|
win1253_CharToOrderMap,
|
||||||
GreekLangModel,
|
GreekLangModel,
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
|
|
||||||
//Windows-1255 language model
|
//Windows-1255 language model
|
||||||
//Character Mapping Table:
|
//Character Mapping Table:
|
||||||
unsigned char win1255_CharToOrderMap[] =
|
static const unsigned char win1255_CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -76,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
|
||||||
//first 1024 sequences: 1.5981%
|
//first 1024 sequences: 1.5981%
|
||||||
//rest sequences: 0.087%
|
//rest sequences: 0.087%
|
||||||
//negative sequences: 0.0015%
|
//negative sequences: 0.0015%
|
||||||
char HebrewLangModel[] =
|
static const char HebrewLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||||
|
@ -208,7 +208,7 @@ char HebrewLangModel[] =
|
||||||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Win1255Model =
|
const SequenceModel Win1255Model =
|
||||||
{
|
{
|
||||||
win1255_CharToOrderMap,
|
win1255_CharToOrderMap,
|
||||||
HebrewLangModel,
|
HebrewLangModel,
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
*****************************************************************/
|
*****************************************************************/
|
||||||
|
|
||||||
//Character Mapping Table:
|
//Character Mapping Table:
|
||||||
unsigned char Latin2_HungarianCharToOrderMap[] =
|
static const unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -65,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||||
};
|
};
|
||||||
|
|
||||||
unsigned char win1250HungarianCharToOrderMap[] =
|
static const unsigned char win1250HungarianCharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -91,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
|
||||||
//first 1024 sequences:5.2623%
|
//first 1024 sequences:5.2623%
|
||||||
//rest sequences: 0.8894%
|
//rest sequences: 0.8894%
|
||||||
//negative sequences: 0.0009%
|
//negative sequences: 0.0009%
|
||||||
char HungarianLangModel[] =
|
static const char HungarianLangModel[] =
|
||||||
{
|
{
|
||||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||||
|
@ -223,7 +223,7 @@ char HungarianLangModel[] =
|
||||||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Latin2HungarianModel =
|
const SequenceModel Latin2HungarianModel =
|
||||||
{
|
{
|
||||||
Latin2_HungarianCharToOrderMap,
|
Latin2_HungarianCharToOrderMap,
|
||||||
HungarianLangModel,
|
HungarianLangModel,
|
||||||
|
@ -232,7 +232,7 @@ SequenceModel Latin2HungarianModel =
|
||||||
"ISO-8859-2"
|
"ISO-8859-2"
|
||||||
};
|
};
|
||||||
|
|
||||||
SequenceModel Win1250HungarianModel =
|
const SequenceModel Win1250HungarianModel =
|
||||||
{
|
{
|
||||||
win1250HungarianCharToOrderMap,
|
win1250HungarianCharToOrderMap,
|
||||||
HungarianLangModel,
|
HungarianLangModel,
|
||||||
|
|
|
@ -49,7 +49,7 @@
|
||||||
//The following result for thai was collected from a limited sample (1M).
|
//The following result for thai was collected from a limited sample (1M).
|
||||||
|
|
||||||
//Character Mapping Table:
|
//Character Mapping Table:
|
||||||
unsigned char TIS620CharToOrderMap[] =
|
static const unsigned char TIS620CharToOrderMap[] =
|
||||||
{
|
{
|
||||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||||
|
@ -78,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
|
||||||
//first 1024 sequences:7.3177%
|
//first 1024 sequences:7.3177%
|
||||||
//rest sequences: 1.0230%
|
//rest sequences: 1.0230%
|
||||||
//negative sequences: 0.0436%
|
//negative sequences: 0.0436%
|
||||||
char ThaiLangModel[] =
|
static const char ThaiLangModel[] =
|
||||||
{
|
{
|
||||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||||
|
@ -211,7 +211,7 @@ char ThaiLangModel[] =
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
SequenceModel TIS620ThaiModel =
|
const SequenceModel TIS620ThaiModel =
|
||||||
{
|
{
|
||||||
TIS620CharToOrderMap,
|
TIS620CharToOrderMap,
|
||||||
ThaiLangModel,
|
ThaiLangModel,
|
||||||
|
|
|
@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -86,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsBig5Prober::GetConfidence(void)
|
float nsBig5Prober::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,15 +44,17 @@
|
||||||
|
|
||||||
class nsBig5Prober: public nsCharSetProber {
|
class nsBig5Prober: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
nsBig5Prober(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsBig5Prober(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||||
|
Reset();}
|
||||||
|
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "Big5";};
|
const char* GetCharSetName() {return "Big5";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||||
|
@ -63,6 +65,7 @@ protected:
|
||||||
//Big5ContextAnalysis mContextAnalyser;
|
//Big5ContextAnalysis mContextAnalyser;
|
||||||
Big5DistributionAnalysis mDistributionAnalyser;
|
Big5DistributionAnalysis mDistributionAnalyser;
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -74,7 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a
|
||||||
if (meetMSB && curPtr > prevPtr)
|
if (meetMSB && curPtr > prevPtr)
|
||||||
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
|
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
|
||||||
|
|
||||||
newLen = PRUint32(newptr - *newBuf);
|
newLen = newptr - *newBuf;
|
||||||
|
|
||||||
return PR_TRUE;
|
return PR_TRUE;
|
||||||
}
|
}
|
||||||
|
@ -119,7 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen
|
||||||
while (prevPtr < curPtr)
|
while (prevPtr < curPtr)
|
||||||
*newptr++ = *prevPtr++;
|
*newptr++ = *prevPtr++;
|
||||||
|
|
||||||
newLen = PRUint32(newptr - *newBuf);
|
newLen = newptr - *newBuf;
|
||||||
|
|
||||||
return PR_TRUE;
|
return PR_TRUE;
|
||||||
}
|
}
|
||||||
|
|
|
@ -52,7 +52,7 @@ typedef enum {
|
||||||
|
|
||||||
class nsCharSetProber {
|
class nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
virtual ~nsCharSetProber() {};
|
virtual ~nsCharSetProber() {}
|
||||||
virtual const char* GetCharSetName() = 0;
|
virtual const char* GetCharSetName() = 0;
|
||||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
||||||
virtual nsProbingState GetState(void) = 0;
|
virtual nsProbingState GetState(void) = 0;
|
||||||
|
@ -60,10 +60,6 @@ public:
|
||||||
virtual float GetConfidence(void) = 0;
|
virtual float GetConfidence(void) = 0;
|
||||||
virtual void SetOpion() = 0;
|
virtual void SetOpion() = 0;
|
||||||
|
|
||||||
virtual const char* GetCharSetName(int i) { return GetCharSetName(); }
|
|
||||||
virtual float GetConfidence(int i) { return GetConfidence(); }
|
|
||||||
virtual int GetProbeCount(void) { return 1; }
|
|
||||||
|
|
||||||
#ifdef DEBUG_chardet
|
#ifdef DEBUG_chardet
|
||||||
virtual void DumpStatus() {};
|
virtual void DumpStatus() {};
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -59,10 +59,7 @@ typedef struct
|
||||||
|
|
||||||
class nsCodingStateMachine {
|
class nsCodingStateMachine {
|
||||||
public:
|
public:
|
||||||
nsCodingStateMachine(SMModel* sm){
|
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
|
||||||
mCurrentState = eStart;
|
|
||||||
mModel = sm;
|
|
||||||
};
|
|
||||||
nsSMState NextState(char c){
|
nsSMState NextState(char c){
|
||||||
//for each byte we get its class , if it is first byte, we also get byte length
|
//for each byte we get its class , if it is first byte, we also get byte length
|
||||||
PRUint32 byteCls = GETCLASS(c);
|
PRUint32 byteCls = GETCLASS(c);
|
||||||
|
@ -76,33 +73,32 @@ public:
|
||||||
mModel->stateTable);
|
mModel->stateTable);
|
||||||
mCurrentBytePos++;
|
mCurrentBytePos++;
|
||||||
return mCurrentState;
|
return mCurrentState;
|
||||||
};
|
}
|
||||||
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;};
|
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
|
||||||
void Reset(void) {mCurrentState = eStart;};
|
void Reset(void) {mCurrentState = eStart;}
|
||||||
const char * GetCodingStateMachine() {return mModel->name;};
|
const char * GetCodingStateMachine() {return mModel->name;}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
nsSMState mCurrentState;
|
nsSMState mCurrentState;
|
||||||
PRUint32 mCurrentCharLen;
|
PRUint32 mCurrentCharLen;
|
||||||
PRUint32 mCurrentBytePos;
|
PRUint32 mCurrentBytePos;
|
||||||
|
|
||||||
SMModel *mModel;
|
const SMModel *mModel;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern SMModel UTF8SMModel;
|
extern const SMModel UTF8SMModel;
|
||||||
extern SMModel Big5SMModel;
|
extern const SMModel Big5SMModel;
|
||||||
extern SMModel EUCJPSMModel;
|
extern const SMModel EUCJPSMModel;
|
||||||
extern SMModel EUCKRSMModel;
|
extern const SMModel EUCKRSMModel;
|
||||||
extern SMModel EUCTWSMModel;
|
extern const SMModel EUCTWSMModel;
|
||||||
extern SMModel GB18030SMModel;
|
extern const SMModel GB18030SMModel;
|
||||||
extern SMModel SJISSMModel;
|
extern const SMModel SJISSMModel;
|
||||||
extern SMModel UCS2BESMModel;
|
|
||||||
|
|
||||||
|
|
||||||
extern SMModel HZSMModel;
|
extern const SMModel HZSMModel;
|
||||||
extern SMModel ISO2022CNSMModel;
|
extern const SMModel ISO2022CNSMModel;
|
||||||
extern SMModel ISO2022JPSMModel;
|
extern const SMModel ISO2022JPSMModel;
|
||||||
extern SMModel ISO2022KRSMModel;
|
extern const SMModel ISO2022KRSMModel;
|
||||||
|
|
||||||
#endif /* nsCodingStateMachine_h__ */
|
#endif /* nsCodingStateMachine_h__ */
|
||||||
|
|
||||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -96,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCJPProber::GetConfidence(void)
|
float nsEUCJPProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float contxtCf = mContextAnalyser.GetConfidence();
|
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,15 +50,17 @@
|
||||||
|
|
||||||
class nsEUCJPProber: public nsCharSetProber {
|
class nsEUCJPProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
nsEUCJPProber(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsEUCJPProber(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||||
|
Reset();}
|
||||||
|
virtual ~nsEUCJPProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "EUC-JP";};
|
const char* GetCharSetName() {return "EUC-JP";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
nsCodingStateMachine* mCodingSM;
|
nsCodingStateMachine* mCodingSM;
|
||||||
|
@ -68,6 +70,7 @@ protected:
|
||||||
EUCJPDistributionAnalysis mDistributionAnalyser;
|
EUCJPDistributionAnalysis mDistributionAnalyser;
|
||||||
|
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -89,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCKRProber::GetConfidence(void)
|
float nsEUCKRProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,15 +44,18 @@
|
||||||
|
|
||||||
class nsEUCKRProber: public nsCharSetProber {
|
class nsEUCKRProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
nsEUCKRProber(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsEUCKRProber(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||||
|
Reset();
|
||||||
|
}
|
||||||
|
virtual ~nsEUCKRProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "EUC-KR";};
|
const char* GetCharSetName() {return "EUC-KR";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||||
|
@ -63,6 +66,7 @@ protected:
|
||||||
//EUCKRContextAnalysis mContextAnalyser;
|
//EUCKRContextAnalysis mContextAnalyser;
|
||||||
EUCKRDistributionAnalysis mDistributionAnalyser;
|
EUCKRDistributionAnalysis mDistributionAnalyser;
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -89,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsEUCTWProber::GetConfidence(void)
|
float nsEUCTWProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,15 +44,17 @@
|
||||||
|
|
||||||
class nsEUCTWProber: public nsCharSetProber {
|
class nsEUCTWProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
nsEUCTWProber(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsEUCTWProber(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||||
|
Reset();}
|
||||||
|
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "x-euc-tw";};
|
const char* GetCharSetName() {return "x-euc-tw";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||||
|
@ -63,6 +65,7 @@ protected:
|
||||||
//EUCTWContextAnalysis mContextAnalyser;
|
//EUCTWContextAnalysis mContextAnalyser;
|
||||||
EUCTWDistributionAnalysis mDistributionAnalyser;
|
EUCTWDistributionAnalysis mDistributionAnalyser;
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -37,12 +37,20 @@
|
||||||
|
|
||||||
|
|
||||||
#include "nsEscCharsetProber.h"
|
#include "nsEscCharsetProber.h"
|
||||||
|
#include "nsUniversalDetector.h"
|
||||||
|
|
||||||
nsEscCharSetProber::nsEscCharSetProber(void)
|
nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
|
||||||
|
{
|
||||||
|
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||||
|
mCodingSM[i] = nsnull;
|
||||||
|
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||||
{
|
{
|
||||||
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
||||||
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
||||||
|
}
|
||||||
|
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||||
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
||||||
|
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||||
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
||||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
|
@ -59,6 +67,7 @@ void nsEscCharSetProber::Reset(void)
|
||||||
{
|
{
|
||||||
mState = eDetecting;
|
mState = eDetecting;
|
||||||
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||||
|
if (mCodingSM[i])
|
||||||
mCodingSM[i]->Reset();
|
mCodingSM[i]->Reset();
|
||||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||||
mDetectedCharset = nsnull;
|
mDetectedCharset = nsnull;
|
||||||
|
@ -74,26 +83,10 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
{
|
{
|
||||||
for (j = mActiveSM-1; j>= 0; j--)
|
for (j = mActiveSM-1; j>= 0; j--)
|
||||||
{
|
{
|
||||||
//byte is feed to all active state machine
|
if (mCodingSM[j])
|
||||||
|
{
|
||||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
if (codingState == eItsMe)
|
||||||
{
|
|
||||||
//got negative answer for this state machine, make it inactive
|
|
||||||
mActiveSM--;
|
|
||||||
if (mActiveSM == 0)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
return mState;
|
|
||||||
}
|
|
||||||
else if (j != (PRInt32)mActiveSM)
|
|
||||||
{
|
|
||||||
nsCodingStateMachine* t;
|
|
||||||
t = mCodingSM[mActiveSM];
|
|
||||||
mCodingSM[mActiveSM] = mCodingSM[j];
|
|
||||||
mCodingSM[j] = t;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
else if (codingState == eItsMe)
|
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||||
|
@ -101,6 +94,7 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return mState;
|
return mState;
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,14 +45,14 @@
|
||||||
|
|
||||||
class nsEscCharSetProber: public nsCharSetProber {
|
class nsEscCharSetProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsEscCharSetProber(void);
|
nsEscCharSetProber(PRUint32 aLanguageFilter);
|
||||||
virtual ~nsEscCharSetProber(void);
|
virtual ~nsEscCharSetProber(void);
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return mDetectedCharset;};
|
const char* GetCharSetName() {return mDetectedCharset;}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void){return (float)0.99;};
|
float GetConfidence(void){return (float)0.99;}
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||||
|
|
|
@ -36,7 +36,7 @@
|
||||||
* ***** END LICENSE BLOCK ***** */
|
* ***** END LICENSE BLOCK ***** */
|
||||||
#include "nsCodingStateMachine.h"
|
#include "nsCodingStateMachine.h"
|
||||||
|
|
||||||
static PRUint32 HZ_cls[ 256 / 8 ] = {
|
static const PRUint32 HZ_cls[ 256 / 8 ] = {
|
||||||
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
|
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||||
|
@ -72,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 HZ_st [ 6] = {
|
static const PRUint32 HZ_st [ 6] = {
|
||||||
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
|
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
|
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
|
||||||
|
@ -83,7 +83,7 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||||
|
|
||||||
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||||
|
|
||||||
SMModel HZSMModel = {
|
const SMModel HZSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
|
||||||
6,
|
6,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
|
||||||
|
@ -92,7 +92,7 @@ SMModel HZSMModel = {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
||||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||||
|
@ -128,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 ISO2022CN_st [ 8] = {
|
static const PRUint32 ISO2022CN_st [ 8] = {
|
||||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||||
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
|
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
|
||||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||||
|
@ -141,7 +141,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
|
||||||
|
|
||||||
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||||
|
|
||||||
SMModel ISO2022CNSMModel = {
|
const SMModel ISO2022CNSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
|
||||||
9,
|
9,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
|
||||||
|
@ -149,7 +149,7 @@ SMModel ISO2022CNSMModel = {
|
||||||
"ISO-2022-CN",
|
"ISO-2022-CN",
|
||||||
};
|
};
|
||||||
|
|
||||||
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
||||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||||
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
|
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||||
|
@ -185,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 ISO2022JP_st [ 9] = {
|
static const PRUint32 ISO2022JP_st [ 9] = {
|
||||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||||
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
|
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||||
|
@ -199,7 +199,7 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
|
||||||
|
|
||||||
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||||
|
|
||||||
SMModel ISO2022JPSMModel = {
|
const SMModel ISO2022JPSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
|
||||||
10,
|
10,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
|
||||||
|
@ -207,7 +207,7 @@ SMModel ISO2022JPSMModel = {
|
||||||
"ISO-2022-JP",
|
"ISO-2022-JP",
|
||||||
};
|
};
|
||||||
|
|
||||||
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
||||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||||
|
@ -243,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 ISO2022KR_st [ 5] = {
|
static const PRUint32 ISO2022KR_st [ 5] = {
|
||||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
|
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
|
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
|
||||||
|
@ -253,7 +253,7 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
|
||||||
|
|
||||||
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||||
|
|
||||||
SMModel ISO2022KRSMModel = {
|
const SMModel ISO2022KRSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
|
||||||
6,
|
6,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
|
||||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -94,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsGB18030Prober::GetConfidence(void)
|
float nsGB18030Prober::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (float)distribCf;
|
return (float)distribCf;
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,15 +46,17 @@
|
||||||
|
|
||||||
class nsGB18030Prober: public nsCharSetProber {
|
class nsGB18030Prober: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
nsGB18030Prober(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsGB18030Prober(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||||
|
Reset();}
|
||||||
|
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "gb18030";};
|
const char* GetCharSetName() {return "gb18030";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||||
|
@ -65,6 +67,7 @@ protected:
|
||||||
//GB2312ContextAnalysis mContextAnalyser;
|
//GB2312ContextAnalysis mContextAnalyser;
|
||||||
GB2312DistributionAnalysis mDistributionAnalyser;
|
GB2312DistributionAnalysis mDistributionAnalyser;
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ public:
|
||||||
virtual nsProbingState GetState(void);
|
virtual nsProbingState GetState(void);
|
||||||
|
|
||||||
virtual float GetConfidence(void) { return (float)0.0; }
|
virtual float GetConfidence(void) { return (float)0.0; }
|
||||||
virtual void SetOpion() {};
|
virtual void SetOpion() {}
|
||||||
|
|
||||||
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
|
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
|
||||||
{ mLogicalProb = logicalPrb; mVisualProb = visualPrb; }
|
{ mLogicalProb = logicalPrb; mVisualProb = visualPrb; }
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
#define ASO 7 // accent small other
|
#define ASO 7 // accent small other
|
||||||
#define CLASS_NUM 8 // total classes
|
#define CLASS_NUM 8 // total classes
|
||||||
|
|
||||||
static unsigned char Latin1_CharToClass[] =
|
static const unsigned char Latin1_CharToClass[] =
|
||||||
{
|
{
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
||||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
||||||
|
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
|
||||||
2 : normal
|
2 : normal
|
||||||
3 : very likely
|
3 : very likely
|
||||||
*/
|
*/
|
||||||
static unsigned char Latin1ClassModel[] =
|
static const unsigned char Latin1ClassModel[] =
|
||||||
{
|
{
|
||||||
/* UDF OTH ASC ASS ACV ACO ASV ASO */
|
/* UDF OTH ASC ASS ACV ACO ASV ASO */
|
||||||
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
|
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
|
||||||
|
|
|
@ -45,14 +45,14 @@
|
||||||
|
|
||||||
class nsLatin1Prober: public nsCharSetProber {
|
class nsLatin1Prober: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsLatin1Prober(void){Reset();};
|
nsLatin1Prober(void){Reset();}
|
||||||
virtual ~nsLatin1Prober(void){};
|
virtual ~nsLatin1Prober(void){}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "windows-1252";};
|
const char* GetCharSetName() {return "windows-1252";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
#ifdef DEBUG_chardet
|
#ifdef DEBUG_chardet
|
||||||
virtual void DumpStatus();
|
virtual void DumpStatus();
|
||||||
|
|
|
@ -39,6 +39,7 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "nsMBCSGroupProber.h"
|
#include "nsMBCSGroupProber.h"
|
||||||
|
#include "nsUniversalDetector.h"
|
||||||
|
|
||||||
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
||||||
const char *ProberName[] =
|
const char *ProberName[] =
|
||||||
|
@ -54,15 +55,26 @@ const char *ProberName[] =
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
nsMBCSGroupProber::nsMBCSGroupProber()
|
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||||
{
|
{
|
||||||
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||||
|
mProbers[i] = nsnull;
|
||||||
|
|
||||||
mProbers[0] = new nsUTF8Prober();
|
mProbers[0] = new nsUTF8Prober();
|
||||||
mProbers[1] = new nsSJISProber();
|
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||||
mProbers[2] = new nsEUCJPProber();
|
{
|
||||||
mProbers[3] = new nsGB18030Prober();
|
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||||
mProbers[4] = new nsEUCKRProber();
|
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||||
mProbers[5] = new nsBig5Prober();
|
}
|
||||||
mProbers[6] = new nsEUCTWProber();
|
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||||
|
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
|
||||||
|
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||||
|
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
|
||||||
|
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
|
||||||
|
{
|
||||||
|
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||||
|
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||||
|
}
|
||||||
Reset();
|
Reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -134,16 +146,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
return mState;
|
return mState;
|
||||||
}
|
}
|
||||||
else if (st == eNotMe)
|
|
||||||
{
|
|
||||||
mIsActive[i] = PR_FALSE;
|
|
||||||
mActiveNum--;
|
|
||||||
if (mActiveNum <= 0)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
return mState;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -154,23 +156,13 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
{
|
{
|
||||||
if (!mIsActive[i])
|
if (!mIsActive[i])
|
||||||
continue;
|
continue;
|
||||||
st = mProbers[i]->HandleData(aBuf + start, aLen + 1 - start);
|
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
|
||||||
if (st == eFoundIt)
|
if (st == eFoundIt)
|
||||||
{
|
{
|
||||||
mBestGuess = i;
|
mBestGuess = i;
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
return mState;
|
return mState;
|
||||||
}
|
}
|
||||||
else if (st == eNotMe)
|
|
||||||
{
|
|
||||||
mIsActive[i] = PR_FALSE;
|
|
||||||
mActiveNum--;
|
|
||||||
if (mActiveNum <= 0)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
return mState;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
mKeepNext = keepNext;
|
mKeepNext = keepNext;
|
||||||
|
|
|
@ -51,18 +51,14 @@
|
||||||
|
|
||||||
class nsMBCSGroupProber: public nsCharSetProber {
|
class nsMBCSGroupProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsMBCSGroupProber();
|
nsMBCSGroupProber(PRUint32 aLanguageFilter);
|
||||||
virtual ~nsMBCSGroupProber();
|
virtual ~nsMBCSGroupProber();
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName();
|
const char* GetCharSetName();
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
|
|
||||||
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
|
|
||||||
int GetProbeCount(void) { return NUM_OF_PROBERS; }
|
|
||||||
|
|
||||||
#ifdef DEBUG_chardet
|
#ifdef DEBUG_chardet
|
||||||
void DumpStatus();
|
void DumpStatus();
|
||||||
|
|
|
@ -44,7 +44,7 @@ Modification from frank tang's original work:
|
||||||
|
|
||||||
// BIG5
|
// BIG5
|
||||||
|
|
||||||
static PRUint32 BIG5_cls [ 256 / 8 ] = {
|
static const PRUint32 BIG5_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
|
||||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||||
|
@ -81,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 BIG5_st [ 3] = {
|
static const PRUint32 BIG5_st [ 3] = {
|
||||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
|
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
|
||||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
||||||
|
@ -89,7 +89,7 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
||||||
|
|
||||||
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
|
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
|
||||||
|
|
||||||
SMModel Big5SMModel = {
|
SMModel const Big5SMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
|
||||||
5,
|
5,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
||||||
|
@ -97,7 +97,7 @@ SMModel Big5SMModel = {
|
||||||
"Big5",
|
"Big5",
|
||||||
};
|
};
|
||||||
|
|
||||||
static PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
|
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
|
||||||
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
|
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
|
||||||
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
|
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
|
||||||
|
@ -134,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 EUCJP_st [ 5] = {
|
static const PRUint32 EUCJP_st [ 5] = {
|
||||||
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
|
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
|
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
|
||||||
|
@ -144,7 +144,7 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
|
||||||
|
|
||||||
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
|
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
|
||||||
|
|
||||||
SMModel EUCJPSMModel = {
|
const SMModel EUCJPSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
|
||||||
6,
|
6,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
|
||||||
|
@ -152,7 +152,7 @@ SMModel EUCJPSMModel = {
|
||||||
"EUC-JP",
|
"EUC-JP",
|
||||||
};
|
};
|
||||||
|
|
||||||
static PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||||
|
@ -189,14 +189,14 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 EUCKR_st [ 2] = {
|
static const PRUint32 EUCKR_st [ 2] = {
|
||||||
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
|
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
|
||||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
|
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
|
||||||
};
|
};
|
||||||
|
|
||||||
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
|
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
|
||||||
|
|
||||||
SMModel EUCKRSMModel = {
|
const SMModel EUCKRSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
|
||||||
4,
|
4,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
|
||||||
|
@ -204,7 +204,7 @@ SMModel EUCKRSMModel = {
|
||||||
"EUC-KR",
|
"EUC-KR",
|
||||||
};
|
};
|
||||||
|
|
||||||
static PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
|
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
|
||||||
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
|
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
|
||||||
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
|
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
|
||||||
|
@ -241,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 EUCTW_st [ 6] = {
|
static const PRUint32 EUCTW_st [ 6] = {
|
||||||
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
|
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
|
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
|
||||||
|
@ -252,7 +252,7 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||||
|
|
||||||
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
|
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
|
||||||
|
|
||||||
SMModel EUCTWSMModel = {
|
const SMModel EUCTWSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
|
||||||
7,
|
7,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
||||||
|
@ -316,7 +316,7 @@ SMModel GB2312SMModel = {
|
||||||
|
|
||||||
// the following state machine data was created by perl script in
|
// the following state machine data was created by perl script in
|
||||||
// intl/chardet/tools. It should be the same as in PSM detector.
|
// intl/chardet/tools. It should be the same as in PSM detector.
|
||||||
static PRUint32 GB18030_cls [ 256 / 8 ] = {
|
static const PRUint32 GB18030_cls [ 256 / 8 ] = {
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
|
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
|
||||||
|
@ -352,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 GB18030_st [ 6] = {
|
static const PRUint32 GB18030_st [ 6] = {
|
||||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
|
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
|
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
|
||||||
|
@ -368,7 +368,7 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
||||||
// 2 here.
|
// 2 here.
|
||||||
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
|
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
|
||||||
|
|
||||||
SMModel GB18030SMModel = {
|
const SMModel GB18030SMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
|
||||||
7,
|
7,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
|
||||||
|
@ -378,7 +378,7 @@ SMModel GB18030SMModel = {
|
||||||
|
|
||||||
// sjis
|
// sjis
|
||||||
|
|
||||||
static PRUint32 SJIS_cls [ 256 / 8 ] = {
|
static const PRUint32 SJIS_cls [ 256 / 8 ] = {
|
||||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||||
|
@ -417,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 SJIS_st [ 3] = {
|
static const PRUint32 SJIS_st [ 3] = {
|
||||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
||||||
|
@ -425,7 +425,7 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
||||||
|
|
||||||
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
|
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
|
||||||
|
|
||||||
SMModel SJISSMModel = {
|
const SMModel SJISSMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
|
||||||
6,
|
6,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
||||||
|
@ -434,120 +434,7 @@ SMModel SJISSMModel = {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
|
static const PRUint32 UTF8_cls [ 256 / 8 ] = {
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
|
||||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
|
||||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
|
||||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 UCS2BE_st [ 7] = {
|
|
||||||
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
|
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
|
||||||
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
|
|
||||||
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
|
|
||||||
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
|
|
||||||
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
|
|
||||||
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
|
|
||||||
};
|
|
||||||
|
|
||||||
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
|
|
||||||
|
|
||||||
SMModel UCS2BESMModel = {
|
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
|
|
||||||
6,
|
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
|
|
||||||
UCS2BECharLenTable,
|
|
||||||
"UTF-16BE",
|
|
||||||
};
|
|
||||||
|
|
||||||
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
|
||||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
|
||||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
|
||||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
|
||||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
|
||||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 UCS2LE_st [ 7] = {
|
|
||||||
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
|
|
||||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
|
||||||
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
|
|
||||||
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
|
|
||||||
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
|
|
||||||
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
|
|
||||||
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
|
|
||||||
};
|
|
||||||
|
|
||||||
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
|
|
||||||
|
|
||||||
SMModel UCS2LESMModel = {
|
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
|
|
||||||
6,
|
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
|
|
||||||
UCS2LECharLenTable,
|
|
||||||
"UTF-16LE",
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 UTF8_cls [ 256 / 8 ] = {
|
|
||||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
|
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
|
||||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||||
|
@ -584,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
static PRUint32 UTF8_st [ 26] = {
|
static const PRUint32 UTF8_st [ 26] = {
|
||||||
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
|
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
|
||||||
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
|
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
|
||||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
|
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
|
||||||
|
@ -616,7 +503,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
|
||||||
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
|
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
|
||||||
3, 3, 4, 4, 5, 5, 6, 6 };
|
3, 3, 4, 4, 5, 5, 6, 6 };
|
||||||
|
|
||||||
SMModel UTF8SMModel = {
|
const SMModel UTF8SMModel = {
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
|
||||||
16,
|
16,
|
||||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
|
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
|
||||||
|
|
|
@ -68,7 +68,7 @@ typedef struct nsPkgInt {
|
||||||
nsSftMsk sftmsk;
|
nsSftMsk sftmsk;
|
||||||
nsBitSft bitsft;
|
nsBitSft bitsft;
|
||||||
nsUnitMsk unitmsk;
|
nsUnitMsk unitmsk;
|
||||||
PRUint32 *data;
|
const PRUint32* const data;
|
||||||
} nsPkgInt;
|
} nsPkgInt;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -49,14 +49,10 @@ public:
|
||||||
virtual ~nsSBCSGroupProber();
|
virtual ~nsSBCSGroupProber();
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName();
|
const char* GetCharSetName();
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
|
|
||||||
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
|
|
||||||
int GetProbeCount(void) { return NUM_OF_SBCS_PROBERS; }
|
|
||||||
|
|
||||||
#ifdef DEBUG_chardet
|
#ifdef DEBUG_chardet
|
||||||
void DumpStatus();
|
void DumpStatus();
|
||||||
|
|
|
@ -51,27 +51,27 @@
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
unsigned char *charToOrderMap; // [256] table use to find a char's order
|
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
||||||
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||||
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
||||||
const char* charsetName;
|
const char* const charsetName;
|
||||||
} SequenceModel;
|
} SequenceModel;
|
||||||
|
|
||||||
|
|
||||||
class nsSingleByteCharSetProber : public nsCharSetProber{
|
class nsSingleByteCharSetProber : public nsCharSetProber{
|
||||||
public:
|
public:
|
||||||
nsSingleByteCharSetProber(SequenceModel *model)
|
nsSingleByteCharSetProber(const SequenceModel *model)
|
||||||
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
||||||
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||||
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
||||||
|
|
||||||
virtual const char* GetCharSetName();
|
virtual const char* GetCharSetName();
|
||||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
virtual nsProbingState GetState(void) {return mState;};
|
virtual nsProbingState GetState(void) {return mState;}
|
||||||
virtual void Reset(void);
|
virtual void Reset(void);
|
||||||
virtual float GetConfidence(void);
|
virtual float GetConfidence(void);
|
||||||
virtual void SetOpion() {};
|
virtual void SetOpion() {}
|
||||||
|
|
||||||
// This feature is not implemented yet. any current language model
|
// This feature is not implemented yet. any current language model
|
||||||
// contain this parameter as PR_FALSE. No one is looking at this
|
// contain this parameter as PR_FALSE. No one is looking at this
|
||||||
|
@ -79,7 +79,7 @@ public:
|
||||||
// Moreover, the nsSBCSGroupProber which calls the HandleData of this
|
// Moreover, the nsSBCSGroupProber which calls the HandleData of this
|
||||||
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
|
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
|
||||||
// of the English letters.
|
// of the English letters.
|
||||||
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented)
|
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
|
||||||
|
|
||||||
#ifdef DEBUG_chardet
|
#ifdef DEBUG_chardet
|
||||||
virtual void DumpStatus();
|
virtual void DumpStatus();
|
||||||
|
@ -87,7 +87,7 @@ public:
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
nsProbingState mState;
|
nsProbingState mState;
|
||||||
const SequenceModel *mModel;
|
const SequenceModel* const mModel;
|
||||||
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
|
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
|
||||||
|
|
||||||
//char order of last character
|
//char order of last character
|
||||||
|
@ -106,19 +106,19 @@ protected:
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
extern SequenceModel Koi8rModel;
|
extern const SequenceModel Koi8rModel;
|
||||||
extern SequenceModel Win1251Model;
|
extern const SequenceModel Win1251Model;
|
||||||
extern SequenceModel Latin5Model;
|
extern const SequenceModel Latin5Model;
|
||||||
extern SequenceModel MacCyrillicModel;
|
extern const SequenceModel MacCyrillicModel;
|
||||||
extern SequenceModel Ibm866Model;
|
extern const SequenceModel Ibm866Model;
|
||||||
extern SequenceModel Ibm855Model;
|
extern const SequenceModel Ibm855Model;
|
||||||
extern SequenceModel Latin7Model;
|
extern const SequenceModel Latin7Model;
|
||||||
extern SequenceModel Win1253Model;
|
extern const SequenceModel Win1253Model;
|
||||||
extern SequenceModel Latin5BulgarianModel;
|
extern const SequenceModel Latin5BulgarianModel;
|
||||||
extern SequenceModel Win1251BulgarianModel;
|
extern const SequenceModel Win1251BulgarianModel;
|
||||||
extern SequenceModel Latin2HungarianModel;
|
extern const SequenceModel Latin2HungarianModel;
|
||||||
extern SequenceModel Win1250HungarianModel;
|
extern const SequenceModel Win1250HungarianModel;
|
||||||
extern SequenceModel Win1255Model;
|
extern const SequenceModel Win1255Model;
|
||||||
|
|
||||||
#endif /* nsSingleByteCharSetProber_h__ */
|
#endif /* nsSingleByteCharSetProber_h__ */
|
||||||
|
|
||||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
@ -95,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
float nsSJISProber::GetConfidence(void)
|
float nsSJISProber::GetConfidence(void)
|
||||||
{
|
{
|
||||||
float contxtCf = mContextAnalyser.GetConfidence();
|
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||||
|
|
||||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||||
}
|
}
|
||||||
|
|
|
@ -51,15 +51,17 @@
|
||||||
|
|
||||||
class nsSJISProber: public nsCharSetProber {
|
class nsSJISProber: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
nsSJISProber(PRBool aIsPreferredLanguage)
|
||||||
Reset();};
|
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||||
virtual ~nsSJISProber(void){delete mCodingSM;};
|
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||||
|
Reset();}
|
||||||
|
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "Shift_JIS";};
|
const char* GetCharSetName() {return "Shift_JIS";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
nsCodingStateMachine* mCodingSM;
|
nsCodingStateMachine* mCodingSM;
|
||||||
|
@ -69,6 +71,7 @@ protected:
|
||||||
SJISDistributionAnalysis mDistributionAnalyser;
|
SJISDistributionAnalysis mDistributionAnalyser;
|
||||||
|
|
||||||
char mLastChar[2];
|
char mLastChar[2];
|
||||||
|
PRBool mIsPreferredLanguage;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
for (PRUint32 i = 0; i < aLen; i++)
|
for (PRUint32 i = 0; i < aLen; i++)
|
||||||
{
|
{
|
||||||
codingState = mCodingSM->NextState(aBuf[i]);
|
codingState = mCodingSM->NextState(aBuf[i]);
|
||||||
if (codingState == eError)
|
|
||||||
{
|
|
||||||
mState = eNotMe;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (codingState == eItsMe)
|
if (codingState == eItsMe)
|
||||||
{
|
{
|
||||||
mState = eFoundIt;
|
mState = eFoundIt;
|
||||||
|
|
|
@ -45,14 +45,14 @@ class nsUTF8Prober: public nsCharSetProber {
|
||||||
public:
|
public:
|
||||||
nsUTF8Prober(){mNumOfMBChar = 0;
|
nsUTF8Prober(){mNumOfMBChar = 0;
|
||||||
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
|
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
|
||||||
Reset(); };
|
Reset(); }
|
||||||
virtual ~nsUTF8Prober(){delete mCodingSM;};
|
virtual ~nsUTF8Prober(){delete mCodingSM;}
|
||||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
const char* GetCharSetName() {return "UTF-8";};
|
const char* GetCharSetName() {return "UTF-8";}
|
||||||
nsProbingState GetState(void) {return mState;};
|
nsProbingState GetState(void) {return mState;}
|
||||||
void Reset(void);
|
void Reset(void);
|
||||||
float GetConfidence(void);
|
float GetConfidence(void);
|
||||||
void SetOpion() {};
|
void SetOpion() {}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
nsCodingStateMachine* mCodingSM;
|
nsCodingStateMachine* mCodingSM;
|
||||||
|
|
|
@ -44,9 +44,8 @@
|
||||||
#include "nsSBCSGroupProber.h"
|
#include "nsSBCSGroupProber.h"
|
||||||
#include "nsEscCharsetProber.h"
|
#include "nsEscCharsetProber.h"
|
||||||
#include "nsLatin1Prober.h"
|
#include "nsLatin1Prober.h"
|
||||||
#include "nsError.h"
|
|
||||||
|
|
||||||
nsUniversalDetector::nsUniversalDetector()
|
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||||
{
|
{
|
||||||
mDone = PR_FALSE;
|
mDone = PR_FALSE;
|
||||||
mBestGuess = -1; //illegal value as signal
|
mBestGuess = -1; //illegal value as signal
|
||||||
|
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
|
||||||
mGotData = PR_FALSE;
|
mGotData = PR_FALSE;
|
||||||
mInputState = ePureAscii;
|
mInputState = ePureAscii;
|
||||||
mLastChar = '\0';
|
mLastChar = '\0';
|
||||||
|
mLanguageFilter = aLanguageFilter;
|
||||||
|
|
||||||
PRUint32 i;
|
PRUint32 i;
|
||||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||||
|
@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
|
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
|
||||||
else if ('\xFF' == aBuf[1])
|
else if ('\xFF' == aBuf[1])
|
||||||
// FE FF UTF-16, big endian BOM
|
// FE FF UTF-16, big endian BOM
|
||||||
mDetectedCharset = "UTF-16BE";
|
mDetectedCharset = "UTF-16";
|
||||||
break;
|
break;
|
||||||
case '\x00':
|
case '\x00':
|
||||||
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
||||||
// 00 00 FE FF UTF-32, big-endian BOM
|
// 00 00 FE FF UTF-32, big-endian BOM
|
||||||
mDetectedCharset = "UTF-32BE";
|
mDetectedCharset = "UTF-32";
|
||||||
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
||||||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||||
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
|
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
|
||||||
|
@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
case '\xFF':
|
case '\xFF':
|
||||||
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
||||||
// FF FE 00 00 UTF-32, little-endian BOM
|
// FF FE 00 00 UTF-32, little-endian BOM
|
||||||
mDetectedCharset = "UTF-32LE";
|
mDetectedCharset = "UTF-32";
|
||||||
else if ('\xFE' == aBuf[1])
|
else if ('\xFE' == aBuf[1])
|
||||||
// FF FE UTF-16, little endian BOM
|
// FF FE UTF-16, little endian BOM
|
||||||
mDetectedCharset = "UTF-16LE";
|
mDetectedCharset = "UTF-16";
|
||||||
break;
|
break;
|
||||||
} // switch
|
} // switch
|
||||||
|
|
||||||
|
@ -172,17 +172,25 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
|
|
||||||
//start multibyte and singlebyte charset prober
|
//start multibyte and singlebyte charset prober
|
||||||
if (nsnull == mCharSetProbers[0])
|
if (nsnull == mCharSetProbers[0])
|
||||||
mCharSetProbers[0] = new nsMBCSGroupProber;
|
{
|
||||||
if (nsnull == mCharSetProbers[1])
|
mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
|
||||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
if (nsnull == mCharSetProbers[0])
|
||||||
if (nsnull == mCharSetProbers[2])
|
|
||||||
mCharSetProbers[2] = new nsLatin1Prober;
|
|
||||||
|
|
||||||
if ((nsnull == mCharSetProbers[0]) ||
|
|
||||||
(nsnull == mCharSetProbers[1]) ||
|
|
||||||
(nsnull == mCharSetProbers[2]))
|
|
||||||
return NS_ERROR_OUT_OF_MEMORY;
|
return NS_ERROR_OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
|
if (nsnull == mCharSetProbers[1] &&
|
||||||
|
(mLanguageFilter & NS_FILTER_NON_CJK))
|
||||||
|
{
|
||||||
|
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||||
|
if (nsnull == mCharSetProbers[1])
|
||||||
|
return NS_ERROR_OUT_OF_MEMORY;
|
||||||
|
}
|
||||||
|
if (nsnull == mCharSetProbers[2])
|
||||||
|
{
|
||||||
|
mCharSetProbers[2] = new nsLatin1Prober;
|
||||||
|
if (nsnull == mCharSetProbers[2])
|
||||||
|
return NS_ERROR_OUT_OF_MEMORY;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -202,7 +210,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
{
|
{
|
||||||
case eEscAscii:
|
case eEscAscii:
|
||||||
if (nsnull == mEscCharSetProber) {
|
if (nsnull == mEscCharSetProber) {
|
||||||
mEscCharSetProber = new nsEscCharSetProber;
|
mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
|
||||||
if (nsnull == mEscCharSetProber)
|
if (nsnull == mEscCharSetProber)
|
||||||
return NS_ERROR_OUT_OF_MEMORY;
|
return NS_ERROR_OUT_OF_MEMORY;
|
||||||
}
|
}
|
||||||
|
@ -215,6 +223,8 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
break;
|
break;
|
||||||
case eHighbyte:
|
case eHighbyte:
|
||||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||||
|
{
|
||||||
|
if (mCharSetProbers[i])
|
||||||
{
|
{
|
||||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||||
if (st == eFoundIt)
|
if (st == eFoundIt)
|
||||||
|
@ -224,6 +234,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
||||||
return NS_OK;
|
return NS_OK;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
default: //pure ascii
|
default: //pure ascii
|
||||||
|
@ -259,6 +270,8 @@ void nsUniversalDetector::DataEnd()
|
||||||
PRInt32 maxProber = 0;
|
PRInt32 maxProber = 0;
|
||||||
|
|
||||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||||
|
{
|
||||||
|
if (mCharSetProbers[i])
|
||||||
{
|
{
|
||||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||||
if (proberConfidence > maxProberConfidence)
|
if (proberConfidence > maxProberConfidence)
|
||||||
|
@ -267,6 +280,7 @@ void nsUniversalDetector::DataEnd()
|
||||||
maxProber = i;
|
maxProber = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
//do not report anything because we are not confident of it, that's in fact a negative answer
|
//do not report anything because we are not confident of it, that's in fact a negative answer
|
||||||
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
if (maxProberConfidence > MINIMUM_THRESHOLD)
|
||||||
Report(mCharSetProbers[maxProber]->GetCharSetName());
|
Report(mCharSetProbers[maxProber]->GetCharSetName());
|
||||||
|
|
|
@ -38,8 +38,6 @@
|
||||||
#ifndef nsUniversalDetector_h__
|
#ifndef nsUniversalDetector_h__
|
||||||
#define nsUniversalDetector_h__
|
#define nsUniversalDetector_h__
|
||||||
|
|
||||||
#include "nscore.h"
|
|
||||||
|
|
||||||
class nsCharSetProber;
|
class nsCharSetProber;
|
||||||
|
|
||||||
#define NUM_OF_CHARSET_PROBERS 3
|
#define NUM_OF_CHARSET_PROBERS 3
|
||||||
|
@ -50,9 +48,22 @@ typedef enum {
|
||||||
eHighbyte = 2
|
eHighbyte = 2
|
||||||
} nsInputState;
|
} nsInputState;
|
||||||
|
|
||||||
|
#define NS_FILTER_CHINESE_SIMPLIFIED 0x01
|
||||||
|
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
|
||||||
|
#define NS_FILTER_JAPANESE 0x04
|
||||||
|
#define NS_FILTER_KOREAN 0x08
|
||||||
|
#define NS_FILTER_NON_CJK 0x10
|
||||||
|
#define NS_FILTER_ALL 0x1F
|
||||||
|
#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||||
|
NS_FILTER_CHINESE_TRADITIONAL)
|
||||||
|
#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||||
|
NS_FILTER_CHINESE_TRADITIONAL | \
|
||||||
|
NS_FILTER_JAPANESE | \
|
||||||
|
NS_FILTER_KOREAN)
|
||||||
|
|
||||||
class nsUniversalDetector {
|
class nsUniversalDetector {
|
||||||
public:
|
public:
|
||||||
nsUniversalDetector();
|
nsUniversalDetector(PRUint32 aLanguageFilter);
|
||||||
virtual ~nsUniversalDetector();
|
virtual ~nsUniversalDetector();
|
||||||
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
|
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
|
||||||
virtual void DataEnd(void);
|
virtual void DataEnd(void);
|
||||||
|
@ -68,6 +79,7 @@ protected:
|
||||||
char mLastChar;
|
char mLastChar;
|
||||||
const char * mDetectedCharset;
|
const char * mDetectedCharset;
|
||||||
PRInt32 mBestGuess;
|
PRInt32 mBestGuess;
|
||||||
|
PRUint32 mLanguageFilter;
|
||||||
|
|
||||||
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
||||||
nsCharSetProber *mEscCharSetProber;
|
nsCharSetProber *mEscCharSetProber;
|
||||||
|
|
Loading…
Reference in a new issue