Update universalchardet using a patch I made around 2009-02; the version we're currently using is from ~1998. I'll check again later to see if there are any updates to it before closing the ticket. Updates #866.

Originally committed to SVN as r3653.
This commit is contained in:
Amar Takhar 2009-10-09 14:30:27 +00:00
parent 05c9ffde7a
commit 42e0dd6ce4
43 changed files with 324 additions and 458 deletions

View file

@ -106,17 +106,14 @@ wxString CharSetDetect::GetEncoding(wxString filename) {
bool gotLocal = false; bool gotLocal = false;
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) { for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
if (mCharSetProbers[i]) { if (mCharSetProbers[i]) {
int probes = mCharSetProbers[i]->GetProbeCount(); float conf = mCharSetProbers[i]->GetConfidence();
for (int j=0;j<probes;j++) {
float conf = mCharSetProbers[i]->GetConfidence(j);
// Only bother with those whose confidence is at least 1% // Only bother with those whose confidence is at least 1%
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j),wxConvUTF8); wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
if (conf > 0.01f || curName == local) { if (conf > 0.01f || curName == local) {
results.push_back(CharDetResult()); results.push_back(CharDetResult());
results.back().name = curName; results.back().name = curName;
results.back().confidence = mCharSetProbers[i]->GetConfidence(j); results.back().confidence = conf;
}
} }
} }
} }

View file

@ -38,7 +38,9 @@
/////////// ///////////
// Headers // Headers
#include "../universalchardet/nscore.h"
#include "../universalchardet/nsUniversalDetector.h" #include "../universalchardet/nsUniversalDetector.h"
#include "../universalchardet/nsMBCSGroupProber.h"
/// DOCME /// DOCME
@ -54,6 +56,7 @@ private:
void Report(const char* aCharset); void Report(const char* aCharset);
public: public:
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
wxString GetEncoding(wxString filename); wxString GetEncoding(wxString filename);
/// @brief DOCME /// @brief DOCME

View file

@ -49,12 +49,13 @@
#define MINIMUM_DATA_THRESHOLD 4 #define MINIMUM_DATA_THRESHOLD 4
//return confidence base on received data //return confidence base on received data
float CharDistributionAnalysis::GetConfidence() float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{ {
//if we didn't receive any character in our consideration range, or the //if we didn't receive any character in our consideration range, or the
// number of frequent characters is below the minimum threshold, return // number of frequent characters is below the minimum threshold, return
// negative answer // negative answer
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD) if (mTotalChars <= 0 ||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO; return SURE_NO;
if (mTotalChars != mFreqChars) { if (mTotalChars != mFreqChars) {

View file

@ -69,10 +69,10 @@ public:
mFreqChars++; mFreqChars++;
} }
} }
}; }
//return confidence base on existing data //return confidence base on existing data
float GetConfidence(); float GetConfidence(PRBool aIsPreferredLanguage);
//Reset analyser, clear any state //Reset analyser, clear any state
void Reset(void) void Reset(void)
@ -80,21 +80,21 @@ public:
mDone = PR_FALSE; mDone = PR_FALSE;
mTotalChars = 0; mTotalChars = 0;
mFreqChars = 0; mFreqChars = 0;
}; }
//This function is for future extension. Caller can use this function to control //This function is for future extension. Caller can use this function to control
//analyser's behavior //analyser's behavior
void SetOpion(){}; void SetOpion(){}
//It is not necessary to receive all data to draw conclusion. For charset detection, //It is not necessary to receive all data to draw conclusion. For charset detection,
// certain amount of data is enough // certain amount of data is enough
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}; PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
protected: protected:
//we do not handle character base on its original encoding string, but //we do not handle character base on its original encoding string, but
//convert this encoding string to a number, here called order. //convert this encoding string to a number, here called order.
//This allow multiple encoding of a language to share one frequency table //This allow multiple encoding of a language to share one frequency table
virtual PRInt32 GetOrder(const char* str) {return -1;}; virtual PRInt32 GetOrder(const char* str) {return -1;}
//If this flag is set to PR_TRUE, detection is done and conclusion has been made //If this flag is set to PR_TRUE, detection is done and conclusion has been made
PRBool mDone; PRBool mDone;
@ -132,7 +132,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1; return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
else else
return -1; return -1;
}; }
}; };
@ -150,7 +150,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
else else
return -1; return -1;
}; }
}; };
class GB2312DistributionAnalysis : public CharDistributionAnalysis class GB2312DistributionAnalysis : public CharDistributionAnalysis
@ -167,7 +167,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1; return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
else else
return -1; return -1;
}; }
}; };
@ -188,7 +188,7 @@ protected:
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40; return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
else else
return -1; return -1;
}; }
}; };
class SJISDistributionAnalysis : public CharDistributionAnalysis class SJISDistributionAnalysis : public CharDistributionAnalysis
@ -213,7 +213,7 @@ protected:
if ((unsigned char)str[1] > (unsigned char)0x7f) if ((unsigned char)str[1] > (unsigned char)0x7f)
order--; order--;
return order; return order;
}; }
}; };
class EUCJPDistributionAnalysis : public CharDistributionAnalysis class EUCJPDistributionAnalysis : public CharDistributionAnalysis
@ -230,7 +230,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1; return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
else else
return -1; return -1;
}; }
}; };
#endif //CharDistribution_h__ #endif //CharDistribution_h__

View file

@ -39,7 +39,7 @@
#include "JpCntx.h" #include "JpCntx.h"
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category //This is hiragana 2-char sequence table, the number in each cell represents its frequency category
char jp2CharContext[83][83] = const char jp2CharContext[83][83] =
{ {
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,}, { 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,}, { 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
} }
#define DONT_KNOW (float)-1 #define DONT_KNOW (float)-1
float JapaneseContextAnalysis::GetConfidence() float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{ {
//This is just one way to calculate confidence. It works well for me. //This is just one way to calculate confidence. It works well for me.
if (mTotalRel > MINIMUM_DATA_THRESHOLD) if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel; return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else else
return (float)DONT_KNOW; return (float)DONT_KNOW;
@ -227,5 +227,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
return (unsigned char)*(str+1) - (unsigned char)0xa1; return (unsigned char)*(str+1) - (unsigned char)0xa1;
return -1; return -1;
} }

View file

@ -73,12 +73,12 @@ public:
mRelSample[jp2CharContext[mLastCharOrder][order]]++; mRelSample[jp2CharContext[mLastCharOrder][order]]++;
} }
mLastCharOrder = order; mLastCharOrder = order;
}; }
float GetConfidence(); float GetConfidence(PRBool aIsPreferredLanguage);
void Reset(void); void Reset(void);
void SetOpion(){}; void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}; PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
protected: protected:
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0; virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
@ -116,7 +116,7 @@ protected:
(unsigned char)*(str+1) <= (unsigned char)0xf1) (unsigned char)*(str+1) <= (unsigned char)0xf1)
return (unsigned char)*(str+1) - (unsigned char)0x9f; return (unsigned char)*(str+1) - (unsigned char)0x9f;
return -1; return -1;
}; }
}; };
class EUCJPContextAnalysis : public JapaneseContextAnalysis class EUCJPContextAnalysis : public JapaneseContextAnalysis
@ -131,7 +131,7 @@ protected:
(unsigned char)*(str+1) <= (unsigned char)0xf3) (unsigned char)*(str+1) <= (unsigned char)0xf3)
return (unsigned char)*(str+1) - (unsigned char)0xa1; return (unsigned char)*(str+1) - (unsigned char)0xa1;
return -1; return -1;
}; }
}; };
#endif /* __JPCNTX_H__ */ #endif /* __JPCNTX_H__ */

View file

@ -48,7 +48,7 @@
//this talbe is modified base on win1251BulgarianCharToOrderMap, so //this talbe is modified base on win1251BulgarianCharToOrderMap, so
//only number <64 is sure valid //only number <64 is sure valid
unsigned char Latin5_BulgarianCharToOrderMap[] = static const unsigned char Latin5_BulgarianCharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -68,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0 62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
}; };
unsigned char win1251BulgarianCharToOrderMap[] = static const unsigned char win1251BulgarianCharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -94,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
//first 1024 sequences:3.0618% //first 1024 sequences:3.0618%
//rest sequences: 0.2992% //rest sequences: 0.2992%
//negative sequences: 0.0020% //negative sequences: 0.0020%
char BulgarianLangModel[] = static const char BulgarianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
@ -226,7 +226,7 @@ char BulgarianLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
}; };
SequenceModel Latin5BulgarianModel = const SequenceModel Latin5BulgarianModel =
{ {
Latin5_BulgarianCharToOrderMap, Latin5_BulgarianCharToOrderMap,
BulgarianLangModel, BulgarianLangModel,
@ -235,7 +235,7 @@ SequenceModel Latin5BulgarianModel =
"ISO-8859-5" "ISO-8859-5"
}; };
SequenceModel Win1251BulgarianModel = const SequenceModel Win1251BulgarianModel =
{ {
win1251BulgarianCharToOrderMap, win1251BulgarianCharToOrderMap,
BulgarianLangModel, BulgarianLangModel,

View file

@ -41,7 +41,7 @@
//KOI8-R language model //KOI8-R language model
//Character Mapping Table: //Character Mapping Table:
unsigned char KOI8R_CharToOrderMap[] = static const unsigned char KOI8R_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -61,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0 35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
}; };
unsigned char win1251_CharToOrderMap[] = static const unsigned char win1251_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -81,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
}; };
unsigned char latin5_CharToOrderMap[] = static const unsigned char latin5_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -101,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255, 239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
}; };
unsigned char macCyrillic_CharToOrderMap[] = static const unsigned char macCyrillic_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -121,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255, 9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
}; };
unsigned char IBM855_CharToOrderMap[] = static const unsigned char IBM855_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -141,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255, 250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
}; };
unsigned char IBM866_CharToOrderMap[] = static const unsigned char IBM866_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -167,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
//first 1024 sequences: 2.3389% //first 1024 sequences: 2.3389%
//rest sequences: 0.1237% //rest sequences: 0.1237%
//negative sequences: 0.0009% //negative sequences: 0.0009%
char RussianLangModel[] = static const char RussianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
@ -300,7 +300,7 @@ char RussianLangModel[] =
}; };
SequenceModel Koi8rModel = const SequenceModel Koi8rModel =
{ {
KOI8R_CharToOrderMap, KOI8R_CharToOrderMap,
RussianLangModel, RussianLangModel,
@ -309,7 +309,7 @@ SequenceModel Koi8rModel =
"KOI8-R" "KOI8-R"
}; };
SequenceModel Win1251Model = const SequenceModel Win1251Model =
{ {
win1251_CharToOrderMap, win1251_CharToOrderMap,
RussianLangModel, RussianLangModel,
@ -318,7 +318,7 @@ SequenceModel Win1251Model =
"windows-1251" "windows-1251"
}; };
SequenceModel Latin5Model = const SequenceModel Latin5Model =
{ {
latin5_CharToOrderMap, latin5_CharToOrderMap,
RussianLangModel, RussianLangModel,
@ -327,7 +327,7 @@ SequenceModel Latin5Model =
"ISO-8859-5" "ISO-8859-5"
}; };
SequenceModel MacCyrillicModel = const SequenceModel MacCyrillicModel =
{ {
macCyrillic_CharToOrderMap, macCyrillic_CharToOrderMap,
RussianLangModel, RussianLangModel,
@ -336,7 +336,7 @@ SequenceModel MacCyrillicModel =
"x-mac-cyrillic" "x-mac-cyrillic"
}; };
SequenceModel Ibm866Model = const SequenceModel Ibm866Model =
{ {
IBM866_CharToOrderMap, IBM866_CharToOrderMap,
RussianLangModel, RussianLangModel,
@ -345,7 +345,7 @@ SequenceModel Ibm866Model =
"IBM866" "IBM866"
}; };
SequenceModel Ibm855Model = const SequenceModel Ibm855Model =
{ {
IBM855_CharToOrderMap, IBM855_CharToOrderMap,
RussianLangModel, RussianLangModel,

View file

@ -45,7 +45,7 @@
*****************************************************************/ *****************************************************************/
//Character Mapping Table: //Character Mapping Table:
unsigned char Latin7_CharToOrderMap[] = static const unsigned char Latin7_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -67,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
unsigned char win1253_CharToOrderMap[] = static const unsigned char win1253_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -93,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
//first 1024 sequences:1.7001% //first 1024 sequences:1.7001%
//rest sequences: 0.0359% //rest sequences: 0.0359%
//negative sequences: 0.0148% //negative sequences: 0.0148%
char GreekLangModel[] = static const char GreekLangModel[] =
{ {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -225,7 +225,7 @@ char GreekLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
}; };
SequenceModel Latin7Model = const SequenceModel Latin7Model =
{ {
Latin7_CharToOrderMap, Latin7_CharToOrderMap,
GreekLangModel, GreekLangModel,
@ -234,7 +234,7 @@ SequenceModel Latin7Model =
"ISO-8859-7" "ISO-8859-7"
}; };
SequenceModel Win1253Model = const SequenceModel Win1253Model =
{ {
win1253_CharToOrderMap, win1253_CharToOrderMap,
GreekLangModel, GreekLangModel,

View file

@ -50,7 +50,7 @@
//Windows-1255 language model //Windows-1255 language model
//Character Mapping Table: //Character Mapping Table:
unsigned char win1255_CharToOrderMap[] = static const unsigned char win1255_CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -76,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
//first 1024 sequences: 1.5981% //first 1024 sequences: 1.5981%
//rest sequences: 0.087% //rest sequences: 0.087%
//negative sequences: 0.0015% //negative sequences: 0.0015%
char HebrewLangModel[] = static const char HebrewLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0, 0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1, 3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
@ -208,7 +208,7 @@ char HebrewLangModel[] =
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0, 0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
}; };
SequenceModel Win1255Model = const SequenceModel Win1255Model =
{ {
win1255_CharToOrderMap, win1255_CharToOrderMap,
HebrewLangModel, HebrewLangModel,

View file

@ -45,7 +45,7 @@
*****************************************************************/ *****************************************************************/
//Character Mapping Table: //Character Mapping Table:
unsigned char Latin2_HungarianCharToOrderMap[] = static const unsigned char Latin2_HungarianCharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -65,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253, 245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
}; };
unsigned char win1250HungarianCharToOrderMap[] = static const unsigned char win1250HungarianCharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -91,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
//first 1024 sequences:5.2623% //first 1024 sequences:5.2623%
//rest sequences: 0.8894% //rest sequences: 0.8894%
//negative sequences: 0.0009% //negative sequences: 0.0009%
char HungarianLangModel[] = static const char HungarianLangModel[] =
{ {
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2, 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
@ -223,7 +223,7 @@ char HungarianLangModel[] =
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0, 0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
}; };
SequenceModel Latin2HungarianModel = const SequenceModel Latin2HungarianModel =
{ {
Latin2_HungarianCharToOrderMap, Latin2_HungarianCharToOrderMap,
HungarianLangModel, HungarianLangModel,
@ -232,7 +232,7 @@ SequenceModel Latin2HungarianModel =
"ISO-8859-2" "ISO-8859-2"
}; };
SequenceModel Win1250HungarianModel = const SequenceModel Win1250HungarianModel =
{ {
win1250HungarianCharToOrderMap, win1250HungarianCharToOrderMap,
HungarianLangModel, HungarianLangModel,

View file

@ -49,7 +49,7 @@
//The following result for thai was collected from a limited sample (1M). //The following result for thai was collected from a limited sample (1M).
//Character Mapping Table: //Character Mapping Table:
unsigned char TIS620CharToOrderMap[] = static const unsigned char TIS620CharToOrderMap[] =
{ {
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00 255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10 255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -78,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
//first 1024 sequences:7.3177% //first 1024 sequences:7.3177%
//rest sequences: 1.0230% //rest sequences: 1.0230%
//negative sequences: 0.0436% //negative sequences: 0.0436%
char ThaiLangModel[] = static const char ThaiLangModel[] =
{ {
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3, 0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2, 0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
@ -211,7 +211,7 @@ char ThaiLangModel[] =
}; };
SequenceModel TIS620ThaiModel = const SequenceModel TIS620ThaiModel =
{ {
TIS620CharToOrderMap, TIS620CharToOrderMap,
ThaiLangModel, ThaiLangModel,

View file

@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -86,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsBig5Prober::GetConfidence(void) float nsBig5Prober::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf; return (float)distribCf;
} }

View file

@ -44,15 +44,17 @@
class nsBig5Prober: public nsCharSetProber { class nsBig5Prober: public nsCharSetProber {
public: public:
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel); nsBig5Prober(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsBig5Prober(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";}; const char* GetCharSetName() {return "Big5";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr); void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +65,7 @@ protected:
//Big5ContextAnalysis mContextAnalyser; //Big5ContextAnalysis mContextAnalyser;
Big5DistributionAnalysis mDistributionAnalyser; Big5DistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -74,7 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a
if (meetMSB && curPtr > prevPtr) if (meetMSB && curPtr > prevPtr)
while (prevPtr < curPtr) *newptr++ = *prevPtr++; while (prevPtr < curPtr) *newptr++ = *prevPtr++;
newLen = PRUint32(newptr - *newBuf); newLen = newptr - *newBuf;
return PR_TRUE; return PR_TRUE;
} }
@ -119,7 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen
while (prevPtr < curPtr) while (prevPtr < curPtr)
*newptr++ = *prevPtr++; *newptr++ = *prevPtr++;
newLen = PRUint32(newptr - *newBuf); newLen = newptr - *newBuf;
return PR_TRUE; return PR_TRUE;
} }

View file

@ -52,7 +52,7 @@ typedef enum {
class nsCharSetProber { class nsCharSetProber {
public: public:
virtual ~nsCharSetProber() {}; virtual ~nsCharSetProber() {}
virtual const char* GetCharSetName() = 0; virtual const char* GetCharSetName() = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0; virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
virtual nsProbingState GetState(void) = 0; virtual nsProbingState GetState(void) = 0;
@ -60,10 +60,6 @@ public:
virtual float GetConfidence(void) = 0; virtual float GetConfidence(void) = 0;
virtual void SetOpion() = 0; virtual void SetOpion() = 0;
virtual const char* GetCharSetName(int i) { return GetCharSetName(); }
virtual float GetConfidence(int i) { return GetConfidence(); }
virtual int GetProbeCount(void) { return 1; }
#ifdef DEBUG_chardet #ifdef DEBUG_chardet
virtual void DumpStatus() {}; virtual void DumpStatus() {};
#endif #endif

View file

@ -59,10 +59,7 @@ typedef struct
class nsCodingStateMachine { class nsCodingStateMachine {
public: public:
nsCodingStateMachine(SMModel* sm){ nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
mCurrentState = eStart;
mModel = sm;
};
nsSMState NextState(char c){ nsSMState NextState(char c){
//for each byte we get its class , if it is first byte, we also get byte length //for each byte we get its class , if it is first byte, we also get byte length
PRUint32 byteCls = GETCLASS(c); PRUint32 byteCls = GETCLASS(c);
@ -76,33 +73,32 @@ public:
mModel->stateTable); mModel->stateTable);
mCurrentBytePos++; mCurrentBytePos++;
return mCurrentState; return mCurrentState;
}; }
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}; PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
void Reset(void) {mCurrentState = eStart;}; void Reset(void) {mCurrentState = eStart;}
const char * GetCodingStateMachine() {return mModel->name;}; const char * GetCodingStateMachine() {return mModel->name;}
protected: protected:
nsSMState mCurrentState; nsSMState mCurrentState;
PRUint32 mCurrentCharLen; PRUint32 mCurrentCharLen;
PRUint32 mCurrentBytePos; PRUint32 mCurrentBytePos;
SMModel *mModel; const SMModel *mModel;
}; };
extern SMModel UTF8SMModel; extern const SMModel UTF8SMModel;
extern SMModel Big5SMModel; extern const SMModel Big5SMModel;
extern SMModel EUCJPSMModel; extern const SMModel EUCJPSMModel;
extern SMModel EUCKRSMModel; extern const SMModel EUCKRSMModel;
extern SMModel EUCTWSMModel; extern const SMModel EUCTWSMModel;
extern SMModel GB18030SMModel; extern const SMModel GB18030SMModel;
extern SMModel SJISSMModel; extern const SMModel SJISSMModel;
extern SMModel UCS2BESMModel;
extern SMModel HZSMModel; extern const SMModel HZSMModel;
extern SMModel ISO2022CNSMModel; extern const SMModel ISO2022CNSMModel;
extern SMModel ISO2022JPSMModel; extern const SMModel ISO2022JPSMModel;
extern SMModel ISO2022KRSMModel; extern const SMModel ISO2022KRSMModel;
#endif /* nsCodingStateMachine_h__ */ #endif /* nsCodingStateMachine_h__ */

View file

@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -96,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCJPProber::GetConfidence(void) float nsEUCJPProber::GetConfidence(void)
{ {
float contxtCf = mContextAnalyser.GetConfidence(); float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf); return (contxtCf > distribCf ? contxtCf : distribCf);
} }

View file

@ -50,15 +50,17 @@
class nsEUCJPProber: public nsCharSetProber { class nsEUCJPProber: public nsCharSetProber {
public: public:
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel); nsEUCJPProber(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsEUCJPProber(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";}; const char* GetCharSetName() {return "EUC-JP";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
nsCodingStateMachine* mCodingSM; nsCodingStateMachine* mCodingSM;
@ -68,6 +70,7 @@ protected:
EUCJPDistributionAnalysis mDistributionAnalyser; EUCJPDistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -89,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCKRProber::GetConfidence(void) float nsEUCKRProber::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf; return (float)distribCf;
} }

View file

@ -44,15 +44,18 @@
class nsEUCKRProber: public nsCharSetProber { class nsEUCKRProber: public nsCharSetProber {
public: public:
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel); nsEUCKRProber(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsEUCKRProber(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();
}
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-KR";}; const char* GetCharSetName() {return "EUC-KR";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr); void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +66,7 @@ protected:
//EUCKRContextAnalysis mContextAnalyser; //EUCKRContextAnalysis mContextAnalyser;
EUCKRDistributionAnalysis mDistributionAnalyser; EUCKRDistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -89,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCTWProber::GetConfidence(void) float nsEUCTWProber::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf; return (float)distribCf;
} }

View file

@ -44,15 +44,17 @@
class nsEUCTWProber: public nsCharSetProber { class nsEUCTWProber: public nsCharSetProber {
public: public:
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel); nsEUCTWProber(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsEUCTWProber(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";}; const char* GetCharSetName() {return "x-euc-tw";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr); void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +65,7 @@ protected:
//EUCTWContextAnalysis mContextAnalyser; //EUCTWContextAnalysis mContextAnalyser;
EUCTWDistributionAnalysis mDistributionAnalyser; EUCTWDistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -37,12 +37,20 @@
#include "nsEscCharsetProber.h" #include "nsEscCharsetProber.h"
#include "nsUniversalDetector.h"
nsEscCharSetProber::nsEscCharSetProber(void) nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
mCodingSM[i] = nsnull;
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
{ {
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel); mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel); mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
}
if (aLanguageFilter & NS_FILTER_JAPANESE)
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel); mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
if (aLanguageFilter & NS_FILTER_KOREAN)
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel); mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
mActiveSM = NUM_OF_ESC_CHARSETS; mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting; mState = eDetecting;
@ -59,6 +67,7 @@ void nsEscCharSetProber::Reset(void)
{ {
mState = eDetecting; mState = eDetecting;
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++) for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
if (mCodingSM[i])
mCodingSM[i]->Reset(); mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS; mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull; mDetectedCharset = nsnull;
@ -74,26 +83,10 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{ {
for (j = mActiveSM-1; j>= 0; j--) for (j = mActiveSM-1; j>= 0; j--)
{ {
//byte is feed to all active state machine if (mCodingSM[j])
{
codingState = mCodingSM[j]->NextState(aBuf[i]); codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eError) if (codingState == eItsMe)
{
//got negative answer for this state machine, make it inactive
mActiveSM--;
if (mActiveSM == 0)
{
mState = eNotMe;
return mState;
}
else if (j != (PRInt32)mActiveSM)
{
nsCodingStateMachine* t;
t = mCodingSM[mActiveSM];
mCodingSM[mActiveSM] = mCodingSM[j];
mCodingSM[j] = t;
}
}
else if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
@ -101,6 +94,7 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
} }
} }
} }
}
return mState; return mState;
} }

View file

@ -45,14 +45,14 @@
class nsEscCharSetProber: public nsCharSetProber { class nsEscCharSetProber: public nsCharSetProber {
public: public:
nsEscCharSetProber(void); nsEscCharSetProber(PRUint32 aLanguageFilter);
virtual ~nsEscCharSetProber(void); virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return mDetectedCharset;}; const char* GetCharSetName() {return mDetectedCharset;}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void){return (float)0.99;}; float GetConfidence(void){return (float)0.99;}
void SetOpion() {}; void SetOpion() {}
protected: protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr); void GetDistribution(PRUint32 aCharLen, const char* aStr);

View file

@ -36,7 +36,7 @@
* ***** END LICENSE BLOCK ***** */ * ***** END LICENSE BLOCK ***** */
#include "nsCodingStateMachine.h" #include "nsCodingStateMachine.h"
static PRUint32 HZ_cls[ 256 / 8 ] = { static const PRUint32 HZ_cls[ 256 / 8 ] = {
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -72,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
}; };
static PRUint32 HZ_st [ 6] = { static const PRUint32 HZ_st [ 6] = {
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17 PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
@ -83,7 +83,7 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0}; static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel HZSMModel = { const SMModel HZSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
6, 6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
@ -92,7 +92,7 @@ SMModel HZSMModel = {
}; };
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = { static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -128,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
}; };
static PRUint32 ISO2022CN_st [ 8] = { static const PRUint32 ISO2022CN_st [ 8] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -141,7 +141,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022CNSMModel = { const SMModel ISO2022CNSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
9, 9,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
@ -149,7 +149,7 @@ SMModel ISO2022CNSMModel = {
"ISO-2022-CN", "ISO-2022-CN",
}; };
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = { static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -185,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
}; };
static PRUint32 ISO2022JP_st [ 9] = { static const PRUint32 ISO2022JP_st [ 9] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07 PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17 PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -199,7 +199,7 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0}; static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022JPSMModel = { const SMModel ISO2022JPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
10, 10,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
@ -207,7 +207,7 @@ SMModel ISO2022JPSMModel = {
"ISO-2022-JP", "ISO-2022-JP",
}; };
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = { static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07 PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17 PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -243,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
}; };
static PRUint32 ISO2022KR_st [ 5] = { static const PRUint32 ISO2022KR_st [ 5] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07 PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17 PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
@ -253,7 +253,7 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0}; static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel ISO2022KRSMModel = { const SMModel ISO2022KRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
6, 6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },

View file

@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -94,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsGB18030Prober::GetConfidence(void) float nsGB18030Prober::GetConfidence(void)
{ {
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf; return (float)distribCf;
} }

View file

@ -46,15 +46,17 @@
class nsGB18030Prober: public nsCharSetProber { class nsGB18030Prober: public nsCharSetProber {
public: public:
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel); nsGB18030Prober(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsGB18030Prober(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";}; const char* GetCharSetName() {return "gb18030";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr); void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -65,6 +67,7 @@ protected:
//GB2312ContextAnalysis mContextAnalyser; //GB2312ContextAnalysis mContextAnalyser;
GB2312DistributionAnalysis mDistributionAnalyser; GB2312DistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -55,7 +55,7 @@ public:
virtual nsProbingState GetState(void); virtual nsProbingState GetState(void);
virtual float GetConfidence(void) { return (float)0.0; } virtual float GetConfidence(void) { return (float)0.0; }
virtual void SetOpion() {}; virtual void SetOpion() {}
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb) void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
{ mLogicalProb = logicalPrb; mVisualProb = visualPrb; } { mLogicalProb = logicalPrb; mVisualProb = visualPrb; }

View file

@ -50,7 +50,7 @@
#define ASO 7 // accent small other #define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes #define CLASS_NUM 8 // total classes
static unsigned char Latin1_CharToClass[] = static const unsigned char Latin1_CharToClass[] =
{ {
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07 OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
2 : normal 2 : normal
3 : very likely 3 : very likely
*/ */
static unsigned char Latin1ClassModel[] = static const unsigned char Latin1ClassModel[] =
{ {
/* UDF OTH ASC ASS ACV ACO ASV ASO */ /* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0, /*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,

View file

@ -45,14 +45,14 @@
class nsLatin1Prober: public nsCharSetProber { class nsLatin1Prober: public nsCharSetProber {
public: public:
nsLatin1Prober(void){Reset();}; nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){}; virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "windows-1252";}; const char* GetCharSetName() {return "windows-1252";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
#ifdef DEBUG_chardet #ifdef DEBUG_chardet
virtual void DumpStatus(); virtual void DumpStatus();

View file

@ -39,6 +39,7 @@
#include <stdio.h> #include <stdio.h>
#include "nsMBCSGroupProber.h" #include "nsMBCSGroupProber.h"
#include "nsUniversalDetector.h"
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) #if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] = const char *ProberName[] =
@ -54,15 +55,26 @@ const char *ProberName[] =
#endif #endif
nsMBCSGroupProber::nsMBCSGroupProber() nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{ {
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
mProbers[i] = nsnull;
mProbers[0] = new nsUTF8Prober(); mProbers[0] = new nsUTF8Prober();
mProbers[1] = new nsSJISProber(); if (aLanguageFilter & NS_FILTER_JAPANESE)
mProbers[2] = new nsEUCJPProber(); {
mProbers[3] = new nsGB18030Prober(); mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[4] = new nsEUCKRProber(); mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[5] = new nsBig5Prober(); }
mProbers[6] = new nsEUCTWProber(); if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
Reset(); Reset();
} }
@ -134,16 +146,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
mState = eFoundIt; mState = eFoundIt;
return mState; return mState;
} }
else if (st == eNotMe)
{
mIsActive[i] = PR_FALSE;
mActiveNum--;
if (mActiveNum <= 0)
{
mState = eNotMe;
return mState;
}
}
} }
} }
} }
@ -154,23 +156,13 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{ {
if (!mIsActive[i]) if (!mIsActive[i])
continue; continue;
st = mProbers[i]->HandleData(aBuf + start, aLen + 1 - start); st = mProbers[i]->HandleData(aBuf + start, aLen - start);
if (st == eFoundIt) if (st == eFoundIt)
{ {
mBestGuess = i; mBestGuess = i;
mState = eFoundIt; mState = eFoundIt;
return mState; return mState;
} }
else if (st == eNotMe)
{
mIsActive[i] = PR_FALSE;
mActiveNum--;
if (mActiveNum <= 0)
{
mState = eNotMe;
return mState;
}
}
} }
} }
mKeepNext = keepNext; mKeepNext = keepNext;

View file

@ -51,18 +51,14 @@
class nsMBCSGroupProber: public nsCharSetProber { class nsMBCSGroupProber: public nsCharSetProber {
public: public:
nsMBCSGroupProber(); nsMBCSGroupProber(PRUint32 aLanguageFilter);
virtual ~nsMBCSGroupProber(); virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName(); const char* GetCharSetName();
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
int GetProbeCount(void) { return NUM_OF_PROBERS; }
#ifdef DEBUG_chardet #ifdef DEBUG_chardet
void DumpStatus(); void DumpStatus();

View file

@ -44,7 +44,7 @@ Modification from frank tang's original work:
// BIG5 // BIG5
static PRUint32 BIG5_cls [ 256 / 8 ] = { static const PRUint32 BIG5_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -81,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
}; };
static PRUint32 BIG5_st [ 3] = { static const PRUint32 BIG5_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17 PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
@ -89,7 +89,7 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0}; static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
SMModel Big5SMModel = { SMModel const Big5SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
5, 5,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
@ -97,7 +97,7 @@ SMModel Big5SMModel = {
"Big5", "Big5",
}; };
static PRUint32 EUCJP_cls [ 256 / 8 ] = { static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07 //PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07 PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
@ -134,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
}; };
static PRUint32 EUCJP_st [ 5] = { static const PRUint32 EUCJP_st [ 5] = {
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07 PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17 PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
@ -144,7 +144,7 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0}; static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
SMModel EUCJPSMModel = { const SMModel EUCJPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
6, 6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
@ -152,7 +152,7 @@ SMModel EUCJPSMModel = {
"EUC-JP", "EUC-JP",
}; };
static PRUint32 EUCKR_cls [ 256 / 8 ] = { static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -189,14 +189,14 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
}; };
static PRUint32 EUCKR_st [ 2] = { static const PRUint32 EUCKR_st [ 2] = {
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
}; };
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0}; static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
SMModel EUCKRSMModel = { const SMModel EUCKRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
4, 4,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
@ -204,7 +204,7 @@ SMModel EUCKRSMModel = {
"EUC-KR", "EUC-KR",
}; };
static PRUint32 EUCTW_cls [ 256 / 8 ] = { static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07 //PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07 PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
@ -241,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
}; };
static PRUint32 EUCTW_st [ 6] = { static const PRUint32 EUCTW_st [ 6] = {
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07 PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
@ -252,7 +252,7 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3}; static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
SMModel EUCTWSMModel = { const SMModel EUCTWSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
7, 7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
@ -316,7 +316,7 @@ SMModel GB2312SMModel = {
// the following state machine data was created by perl script in // the following state machine data was created by perl script in
// intl/chardet/tools. It should be the same as in PSM detector. // intl/chardet/tools. It should be the same as in PSM detector.
static PRUint32 GB18030_cls [ 256 / 8 ] = { static const PRUint32 GB18030_cls [ 256 / 8 ] = {
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17 PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
@ -352,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
}; };
static PRUint32 GB18030_st [ 6] = { static const PRUint32 GB18030_st [ 6] = {
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07 PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17 PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
@ -368,7 +368,7 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
// 2 here. // 2 here.
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2}; static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
SMModel GB18030SMModel = { const SMModel GB18030SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
7, 7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
@ -378,7 +378,7 @@ SMModel GB18030SMModel = {
// sjis // sjis
static PRUint32 SJIS_cls [ 256 / 8 ] = { static const PRUint32 SJIS_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -417,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
}; };
static PRUint32 SJIS_st [ 3] = { static const PRUint32 SJIS_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07 PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17 PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
@ -425,7 +425,7 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0}; static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
SMModel SJISSMModel = { const SMModel SJISSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
6, 6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
@ -434,120 +434,7 @@ SMModel SJISSMModel = {
}; };
static PRUint32 UCS2BE_cls [ 256 / 8 ] = { static const PRUint32 UTF8_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
// State-transition table for the UTF-16BE (UCS-2 big-endian) coding state
// machine, stored packed: each PCK4BITS(...) word holds eight entries, so the
// seven words below provide 56 entries (hex indices 00-37 per the trailing
// comments). Entries are either one of the named states eStart / eError /
// eItsMe or a numeric intermediate state (3-8 here).
// NOTE(review): presumably indexed as state * classCount + byteClass, with
// classCount being the 6 given in UCS2BESMModel -- confirm against
// nsCodingStateMachine::NextState, which is not visible in this file.
static PRUint32 UCS2BE_st [ 7] = {
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
};
// One entry per byte class (six classes, 0-5) for the UTF-16BE model.
// NOTE(review): by its name this is the character length in bytes associated
// with each class; class 3 is given 0 here, unlike the little-endian table --
// confirm how nsCodingStateMachine consumes it.
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
// State-machine model for UTF-16BE. It bundles, in order: a packed-table
// descriptor wrapping the byte-class table UCS2BE_cls, the literal 6 (matches
// the six entries of UCS2BECharLenTable), a packed-table descriptor wrapping
// the transition table UCS2BE_st, the per-class length table, and the charset
// name string "UTF-16BE".
// NOTE(review): field names/meaning of SMModel are declared elsewhere; the
// "6 = number of byte classes" reading should be confirmed there.
SMModel UCS2BESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
UCS2BECharLenTable,
"UTF-16BE",
};
// Byte-class table for the UTF-16LE (UCS-2 little-endian) coding state
// machine. It covers all 256 byte values, eight per PCK4BITS(...) word
// (256 / 8 = 32 words); the trailing comment on each row gives the byte range
// that row describes.
// Reading the rows in order, every byte is class 0 except:
//   0x0a -> 1, 0x0d -> 2, 0x1b -> 3, 0x29..0x2d -> 3, 0xfe -> 4, 0xff -> 5.
// NOTE(review): this assumes PCK4BITS places its arguments in ascending byte
// order, as the row comments suggest -- the macro itself is defined elsewhere.
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
// State-transition table for the UTF-16LE (UCS-2 little-endian) coding state
// machine, stored packed: each PCK4BITS(...) word holds eight entries, so the
// seven words below provide 56 entries (hex indices 00-37 per the trailing
// comments). Entries are either one of the named states eStart / eError /
// eItsMe or a numeric intermediate state (3-8 here).
// NOTE(review): presumably indexed as state * classCount + byteClass, with
// classCount being the 6 given in UCS2LESMModel -- confirm against
// nsCodingStateMachine::NextState, which is not visible in this file.
static PRUint32 UCS2LE_st [ 7] = {
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
};
// One entry per byte class (six classes, 0-5) for the UTF-16LE model; every
// class is given the value 2.
// NOTE(review): by its name this is the character length in bytes associated
// with each class -- confirm how nsCodingStateMachine consumes it.
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
// State-machine model for UTF-16LE. It bundles, in order: a packed-table
// descriptor wrapping the byte-class table UCS2LE_cls, the literal 6 (matches
// the six entries of UCS2LECharLenTable), a packed-table descriptor wrapping
// the transition table UCS2LE_st, the per-class length table, and the charset
// name string "UTF-16LE".
// NOTE(review): field names/meaning of SMModel are declared elsewhere; the
// "6 = number of byte classes" reading should be confirmed there.
SMModel UCS2LESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
UCS2LECharLenTable,
"UTF-16LE",
};
static PRUint32 UTF8_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07 //PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -584,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
}; };
static PRUint32 UTF8_st [ 26] = { static const PRUint32 UTF8_st [ 26] = {
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07 PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17 PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
@ -616,7 +503,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3, static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
3, 3, 4, 4, 5, 5, 6, 6 }; 3, 3, 4, 4, 5, 5, 6, 6 };
SMModel UTF8SMModel = { const SMModel UTF8SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
16, 16,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st }, {eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },

View file

@ -68,7 +68,7 @@ typedef struct nsPkgInt {
nsSftMsk sftmsk; nsSftMsk sftmsk;
nsBitSft bitsft; nsBitSft bitsft;
nsUnitMsk unitmsk; nsUnitMsk unitmsk;
PRUint32 *data; const PRUint32* const data;
} nsPkgInt; } nsPkgInt;

View file

@ -49,14 +49,10 @@ public:
virtual ~nsSBCSGroupProber(); virtual ~nsSBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName(); const char* GetCharSetName();
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
// Returns the charset name reported by the i-th sub-prober in mProbers.
// No range or null check is performed on mProbers[i] here; callers are
// expected to keep i below GetProbeCount().
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
// Returns the confidence reported by the i-th sub-prober in mProbers.
// No range or null check is performed on mProbers[i] here; callers are
// expected to keep i below GetProbeCount().
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
// Number of sub-prober slots in this group (the compile-time constant
// NUM_OF_SBCS_PROBERS); the upper bound for the index taken by the
// GetCharSetName(int) / GetConfidence(int) overloads.
int GetProbeCount(void) { return NUM_OF_SBCS_PROBERS; }
#ifdef DEBUG_chardet #ifdef DEBUG_chardet
void DumpStatus(); void DumpStatus();

View file

@ -51,27 +51,27 @@
typedef struct typedef struct
{ {
unsigned char *charToOrderMap; // [256] table use to find a char's order const unsigned char* const charToOrderMap; // [256] table use to find a char's order
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented) PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* charsetName; const char* const charsetName;
} SequenceModel; } SequenceModel;
class nsSingleByteCharSetProber : public nsCharSetProber{ class nsSingleByteCharSetProber : public nsCharSetProber{
public: public:
nsSingleByteCharSetProber(SequenceModel *model) nsSingleByteCharSetProber(const SequenceModel *model)
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); } :mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber) nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); } :mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
virtual const char* GetCharSetName(); virtual const char* GetCharSetName();
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen); virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState GetState(void) {return mState;}; virtual nsProbingState GetState(void) {return mState;}
virtual void Reset(void); virtual void Reset(void);
virtual float GetConfidence(void); virtual float GetConfidence(void);
virtual void SetOpion() {}; virtual void SetOpion() {}
// This feature is not implemented yet. any current language model // This feature is not implemented yet. any current language model
// contain this parameter as PR_FALSE. No one is looking at this // contain this parameter as PR_FALSE. No one is looking at this
@ -79,7 +79,7 @@ public:
// Moreover, the nsSBCSGroupProber which calls the HandleData of this // Moreover, the nsSBCSGroupProber which calls the HandleData of this
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid // prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
// of the English letters. // of the English letters.
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented) PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
#ifdef DEBUG_chardet #ifdef DEBUG_chardet
virtual void DumpStatus(); virtual void DumpStatus();
@ -87,7 +87,7 @@ public:
protected: protected:
nsProbingState mState; nsProbingState mState;
const SequenceModel *mModel; const SequenceModel* const mModel;
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
//char order of last character //char order of last character
@ -106,19 +106,19 @@ protected:
}; };
extern SequenceModel Koi8rModel; extern const SequenceModel Koi8rModel;
extern SequenceModel Win1251Model; extern const SequenceModel Win1251Model;
extern SequenceModel Latin5Model; extern const SequenceModel Latin5Model;
extern SequenceModel MacCyrillicModel; extern const SequenceModel MacCyrillicModel;
extern SequenceModel Ibm866Model; extern const SequenceModel Ibm866Model;
extern SequenceModel Ibm855Model; extern const SequenceModel Ibm855Model;
extern SequenceModel Latin7Model; extern const SequenceModel Latin7Model;
extern SequenceModel Win1253Model; extern const SequenceModel Win1253Model;
extern SequenceModel Latin5BulgarianModel; extern const SequenceModel Latin5BulgarianModel;
extern SequenceModel Win1251BulgarianModel; extern const SequenceModel Win1251BulgarianModel;
extern SequenceModel Latin2HungarianModel; extern const SequenceModel Latin2HungarianModel;
extern SequenceModel Win1250HungarianModel; extern const SequenceModel Win1250HungarianModel;
extern SequenceModel Win1255Model; extern const SequenceModel Win1255Model;
#endif /* nsSingleByteCharSetProber_h__ */ #endif /* nsSingleByteCharSetProber_h__ */

View file

@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;
@ -95,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsSJISProber::GetConfidence(void) float nsSJISProber::GetConfidence(void)
{ {
float contxtCf = mContextAnalyser.GetConfidence(); float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(); float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf); return (contxtCf > distribCf ? contxtCf : distribCf);
} }

View file

@ -51,15 +51,17 @@
class nsSJISProber: public nsCharSetProber { class nsSJISProber: public nsCharSetProber {
public: public:
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel); nsSJISProber(PRBool aIsPreferredLanguage)
Reset();}; :mIsPreferredLanguage(aIsPreferredLanguage)
virtual ~nsSJISProber(void){delete mCodingSM;}; {mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";}; const char* GetCharSetName() {return "Shift_JIS";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
nsCodingStateMachine* mCodingSM; nsCodingStateMachine* mCodingSM;
@ -69,6 +71,7 @@ protected:
SJISDistributionAnalysis mDistributionAnalyser; SJISDistributionAnalysis mDistributionAnalyser;
char mLastChar[2]; char mLastChar[2];
PRBool mIsPreferredLanguage;
}; };

View file

@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++) for (PRUint32 i = 0; i < aLen; i++)
{ {
codingState = mCodingSM->NextState(aBuf[i]); codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe) if (codingState == eItsMe)
{ {
mState = eFoundIt; mState = eFoundIt;

View file

@ -45,14 +45,14 @@ class nsUTF8Prober: public nsCharSetProber {
public: public:
nsUTF8Prober(){mNumOfMBChar = 0; nsUTF8Prober(){mNumOfMBChar = 0;
mCodingSM = new nsCodingStateMachine(&UTF8SMModel); mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
Reset(); }; Reset(); }
virtual ~nsUTF8Prober(){delete mCodingSM;}; virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen); nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "UTF-8";}; const char* GetCharSetName() {return "UTF-8";}
nsProbingState GetState(void) {return mState;}; nsProbingState GetState(void) {return mState;}
void Reset(void); void Reset(void);
float GetConfidence(void); float GetConfidence(void);
void SetOpion() {}; void SetOpion() {}
protected: protected:
nsCodingStateMachine* mCodingSM; nsCodingStateMachine* mCodingSM;

View file

@ -44,9 +44,8 @@
#include "nsSBCSGroupProber.h" #include "nsSBCSGroupProber.h"
#include "nsEscCharsetProber.h" #include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h" #include "nsLatin1Prober.h"
#include "nsError.h"
nsUniversalDetector::nsUniversalDetector() nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{ {
mDone = PR_FALSE; mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal mBestGuess = -1; //illegal value as signal
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
mGotData = PR_FALSE; mGotData = PR_FALSE;
mInputState = ePureAscii; mInputState = ePureAscii;
mLastChar = '\0'; mLastChar = '\0';
mLanguageFilter = aLanguageFilter;
PRUint32 i; PRUint32 i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
mDetectedCharset = "X-ISO-10646-UCS-4-3412"; mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1]) else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM // FE FF UTF-16, big endian BOM
mDetectedCharset = "UTF-16BE"; mDetectedCharset = "UTF-16";
break; break;
case '\x00': case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3])) if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM // 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = "UTF-32BE"; mDetectedCharset = "UTF-32";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3])) else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143) // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143"; mDetectedCharset = "X-ISO-10646-UCS-4-2143";
@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
case '\xFF': case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3])) if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM // FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = "UTF-32LE"; mDetectedCharset = "UTF-32";
else if ('\xFE' == aBuf[1]) else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM // FF FE UTF-16, little endian BOM
mDetectedCharset = "UTF-16LE"; mDetectedCharset = "UTF-16";
break; break;
} // switch } // switch
@ -172,17 +172,25 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
//start multibyte and singlebyte charset prober //start multibyte and singlebyte charset prober
if (nsnull == mCharSetProbers[0]) if (nsnull == mCharSetProbers[0])
mCharSetProbers[0] = new nsMBCSGroupProber; {
if (nsnull == mCharSetProbers[1]) mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
mCharSetProbers[1] = new nsSBCSGroupProber; if (nsnull == mCharSetProbers[0])
if (nsnull == mCharSetProbers[2])
mCharSetProbers[2] = new nsLatin1Prober;
if ((nsnull == mCharSetProbers[0]) ||
(nsnull == mCharSetProbers[1]) ||
(nsnull == mCharSetProbers[2]))
return NS_ERROR_OUT_OF_MEMORY; return NS_ERROR_OUT_OF_MEMORY;
} }
if (nsnull == mCharSetProbers[1] &&
(mLanguageFilter & NS_FILTER_NON_CJK))
{
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[1])
return NS_ERROR_OUT_OF_MEMORY;
}
if (nsnull == mCharSetProbers[2])
{
mCharSetProbers[2] = new nsLatin1Prober;
if (nsnull == mCharSetProbers[2])
return NS_ERROR_OUT_OF_MEMORY;
}
}
} }
else else
{ {
@ -202,7 +210,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{ {
case eEscAscii: case eEscAscii:
if (nsnull == mEscCharSetProber) { if (nsnull == mEscCharSetProber) {
mEscCharSetProber = new nsEscCharSetProber; mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
if (nsnull == mEscCharSetProber) if (nsnull == mEscCharSetProber)
return NS_ERROR_OUT_OF_MEMORY; return NS_ERROR_OUT_OF_MEMORY;
} }
@ -215,6 +223,8 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
break; break;
case eHighbyte: case eHighbyte:
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++) for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
if (mCharSetProbers[i])
{ {
st = mCharSetProbers[i]->HandleData(aBuf, aLen); st = mCharSetProbers[i]->HandleData(aBuf, aLen);
if (st == eFoundIt) if (st == eFoundIt)
@ -224,6 +234,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
return NS_OK; return NS_OK;
} }
} }
}
break; break;
default: //pure ascii default: //pure ascii
@ -259,6 +270,8 @@ void nsUniversalDetector::DataEnd()
PRInt32 maxProber = 0; PRInt32 maxProber = 0;
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
if (mCharSetProbers[i])
{ {
proberConfidence = mCharSetProbers[i]->GetConfidence(); proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence) if (proberConfidence > maxProberConfidence)
@ -267,6 +280,7 @@ void nsUniversalDetector::DataEnd()
maxProber = i; maxProber = i;
} }
} }
}
//do not report anything because we are not confident of it, that's in fact a negative answer //do not report anything because we are not confident of it, that's in fact a negative answer
if (maxProberConfidence > MINIMUM_THRESHOLD) if (maxProberConfidence > MINIMUM_THRESHOLD)
Report(mCharSetProbers[maxProber]->GetCharSetName()); Report(mCharSetProbers[maxProber]->GetCharSetName());

View file

@ -38,8 +38,6 @@
#ifndef nsUniversalDetector_h__ #ifndef nsUniversalDetector_h__
#define nsUniversalDetector_h__ #define nsUniversalDetector_h__
#include "nscore.h"
class nsCharSetProber; class nsCharSetProber;
#define NUM_OF_CHARSET_PROBERS 3 #define NUM_OF_CHARSET_PROBERS 3
@ -50,9 +48,22 @@ typedef enum {
eHighbyte = 2 eHighbyte = 2
} nsInputState; } nsInputState;
// Language-filter bit flags. A bitwise OR of these is passed to the
// nsUniversalDetector constructor (aLanguageFilter), stored in
// mLanguageFilter, and forwarded to the multi-byte group prober and the
// escape-sequence prober when they are created.
// Each of the first five flags occupies its own bit.
#define NS_FILTER_CHINESE_SIMPLIFIED 0x01
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
#define NS_FILTER_JAPANESE 0x04
#define NS_FILTER_KOREAN 0x08
// When this bit is not set, HandleData does not create the single-byte
// group prober (nsSBCSGroupProber).
#define NS_FILTER_NON_CJK 0x10
// All five bits above set (0x01 | 0x02 | 0x04 | 0x08 | 0x10).
#define NS_FILTER_ALL 0x1F
// Convenience mask: both Chinese variants.
#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
NS_FILTER_CHINESE_TRADITIONAL)
// Convenience mask: every CJK flag, i.e. NS_FILTER_ALL without NS_FILTER_NON_CJK.
#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
NS_FILTER_CHINESE_TRADITIONAL | \
NS_FILTER_JAPANESE | \
NS_FILTER_KOREAN)
class nsUniversalDetector { class nsUniversalDetector {
public: public:
nsUniversalDetector(); nsUniversalDetector(PRUint32 aLanguageFilter);
virtual ~nsUniversalDetector(); virtual ~nsUniversalDetector();
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen); virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
virtual void DataEnd(void); virtual void DataEnd(void);
@ -68,6 +79,7 @@ protected:
char mLastChar; char mLastChar;
const char * mDetectedCharset; const char * mDetectedCharset;
PRInt32 mBestGuess; PRInt32 mBestGuess;
PRUint32 mLanguageFilter;
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS]; nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
nsCharSetProber *mEscCharSetProber; nsCharSetProber *mEscCharSetProber;