Update universalchardet using a patch I made around 2009-02; the version we're currently using is from ~1998. I'll check again later to see if there are any updates to it before closing the ticket. Updates #866.

Originally committed to SVN as r3653.
This commit is contained in:
Amar Takhar 2009-10-09 14:30:27 +00:00
parent 05c9ffde7a
commit 42e0dd6ce4
43 changed files with 324 additions and 458 deletions

View file

@ -106,17 +106,14 @@ wxString CharSetDetect::GetEncoding(wxString filename) {
bool gotLocal = false;
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
if (mCharSetProbers[i]) {
int probes = mCharSetProbers[i]->GetProbeCount();
for (int j=0;j<probes;j++) {
float conf = mCharSetProbers[i]->GetConfidence(j);
float conf = mCharSetProbers[i]->GetConfidence();
// Only bother with those whose confidence is at least 1%
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j),wxConvUTF8);
if (conf > 0.01f || curName == local) {
results.push_back(CharDetResult());
results.back().name = curName;
results.back().confidence = mCharSetProbers[i]->GetConfidence(j);
}
// Only bother with those whose confidence is at least 1%
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
if (conf > 0.01f || curName == local) {
results.push_back(CharDetResult());
results.back().name = curName;
results.back().confidence = conf;
}
}
}

View file

@ -38,7 +38,9 @@
///////////
// Headers
#include "../universalchardet/nscore.h"
#include "../universalchardet/nsUniversalDetector.h"
#include "../universalchardet/nsMBCSGroupProber.h"
/// DOCME
@ -54,6 +56,7 @@ private:
void Report(const char* aCharset);
public:
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
wxString GetEncoding(wxString filename);
/// @brief DOCME

View file

@ -49,12 +49,13 @@
#define MINIMUM_DATA_THRESHOLD 4
//return confidence base on received data
float CharDistributionAnalysis::GetConfidence()
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//if we didn't receive any character in our consideration range, or the
// number of frequent characters is below the minimum threshold, return
// negative answer
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
if (mTotalChars <= 0 ||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
return SURE_NO;
if (mTotalChars != mFreqChars) {

View file

@ -69,10 +69,10 @@ public:
mFreqChars++;
}
}
};
}
//return confidence base on existing data
float GetConfidence();
float GetConfidence(PRBool aIsPreferredLanguage);
//Reset analyser, clear any state
void Reset(void)
@ -80,21 +80,21 @@ public:
mDone = PR_FALSE;
mTotalChars = 0;
mFreqChars = 0;
};
}
//This function is for future extension. Caller can use this function to control
//analyser's behavior
void SetOpion(){};
void SetOpion(){}
//It is not necessary to receive all data to draw conclusion. For charset detection,
// certain amount of data is enough
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
protected:
//we do not handle character base on its original encoding string, but
//convert this encoding string to a number, here called order.
//This allow multiple encoding of a language to share one frequency table
virtual PRInt32 GetOrder(const char* str) {return -1;};
virtual PRInt32 GetOrder(const char* str) {return -1;}
//If this flag is set to PR_TRUE, detection is done and conclusion has been made
PRBool mDone;
@ -132,7 +132,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
else
return -1;
};
}
};
@ -150,7 +150,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
else
return -1;
};
}
};
class GB2312DistributionAnalysis : public CharDistributionAnalysis
@ -167,7 +167,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
else
return -1;
};
}
};
@ -188,7 +188,7 @@ protected:
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
else
return -1;
};
}
};
class SJISDistributionAnalysis : public CharDistributionAnalysis
@ -213,7 +213,7 @@ protected:
if ((unsigned char)str[1] > (unsigned char)0x7f)
order--;
return order;
};
}
};
class EUCJPDistributionAnalysis : public CharDistributionAnalysis
@ -230,7 +230,7 @@ protected:
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
else
return -1;
};
}
};
#endif //CharDistribution_h__

View file

@ -39,7 +39,7 @@
#include "JpCntx.h"
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
char jp2CharContext[83][83] =
const char jp2CharContext[83][83] =
{
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
}
#define DONT_KNOW (float)-1
float JapaneseContextAnalysis::GetConfidence()
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
{
//This is just one way to calculate confidence. It works well for me.
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
else
return (float)DONT_KNOW;
@ -227,5 +227,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
return (unsigned char)*(str+1) - (unsigned char)0xa1;
return -1;
}

View file

@ -73,12 +73,12 @@ public:
mRelSample[jp2CharContext[mLastCharOrder][order]]++;
}
mLastCharOrder = order;
};
}
float GetConfidence();
float GetConfidence(PRBool aIsPreferredLanguage);
void Reset(void);
void SetOpion(){};
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;};
void SetOpion(){}
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
protected:
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
@ -116,7 +116,7 @@ protected:
(unsigned char)*(str+1) <= (unsigned char)0xf1)
return (unsigned char)*(str+1) - (unsigned char)0x9f;
return -1;
};
}
};
class EUCJPContextAnalysis : public JapaneseContextAnalysis
@ -131,7 +131,7 @@ protected:
(unsigned char)*(str+1) <= (unsigned char)0xf3)
return (unsigned char)*(str+1) - (unsigned char)0xa1;
return -1;
};
}
};
#endif /* __JPCNTX_H__ */

View file

@ -48,7 +48,7 @@
//this table is modified based on win1251BulgarianCharToOrderMap, so
//only numbers <64 are sure to be valid
unsigned char Latin5_BulgarianCharToOrderMap[] =
static const unsigned char Latin5_BulgarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -68,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
};
unsigned char win1251BulgarianCharToOrderMap[] =
static const unsigned char win1251BulgarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -94,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
//first 1024 sequences:3.0618%
//rest sequences: 0.2992%
//negative sequences: 0.0020%
char BulgarianLangModel[] =
static const char BulgarianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
@ -226,7 +226,7 @@ char BulgarianLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
};
SequenceModel Latin5BulgarianModel =
const SequenceModel Latin5BulgarianModel =
{
Latin5_BulgarianCharToOrderMap,
BulgarianLangModel,
@ -235,7 +235,7 @@ SequenceModel Latin5BulgarianModel =
"ISO-8859-5"
};
SequenceModel Win1251BulgarianModel =
const SequenceModel Win1251BulgarianModel =
{
win1251BulgarianCharToOrderMap,
BulgarianLangModel,

View file

@ -41,7 +41,7 @@
//KOI8-R language model
//Character Mapping Table:
unsigned char KOI8R_CharToOrderMap[] =
static const unsigned char KOI8R_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -61,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
};
unsigned char win1251_CharToOrderMap[] =
static const unsigned char win1251_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -81,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
};
unsigned char latin5_CharToOrderMap[] =
static const unsigned char latin5_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -101,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
};
unsigned char macCyrillic_CharToOrderMap[] =
static const unsigned char macCyrillic_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -121,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
};
unsigned char IBM855_CharToOrderMap[] =
static const unsigned char IBM855_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -141,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
};
unsigned char IBM866_CharToOrderMap[] =
static const unsigned char IBM866_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -167,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
//first 1024 sequences: 2.3389%
//rest sequences: 0.1237%
//negative sequences: 0.0009%
char RussianLangModel[] =
static const char RussianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
@ -300,7 +300,7 @@ char RussianLangModel[] =
};
SequenceModel Koi8rModel =
const SequenceModel Koi8rModel =
{
KOI8R_CharToOrderMap,
RussianLangModel,
@ -309,7 +309,7 @@ SequenceModel Koi8rModel =
"KOI8-R"
};
SequenceModel Win1251Model =
const SequenceModel Win1251Model =
{
win1251_CharToOrderMap,
RussianLangModel,
@ -318,7 +318,7 @@ SequenceModel Win1251Model =
"windows-1251"
};
SequenceModel Latin5Model =
const SequenceModel Latin5Model =
{
latin5_CharToOrderMap,
RussianLangModel,
@ -327,7 +327,7 @@ SequenceModel Latin5Model =
"ISO-8859-5"
};
SequenceModel MacCyrillicModel =
const SequenceModel MacCyrillicModel =
{
macCyrillic_CharToOrderMap,
RussianLangModel,
@ -336,7 +336,7 @@ SequenceModel MacCyrillicModel =
"x-mac-cyrillic"
};
SequenceModel Ibm866Model =
const SequenceModel Ibm866Model =
{
IBM866_CharToOrderMap,
RussianLangModel,
@ -345,7 +345,7 @@ SequenceModel Ibm866Model =
"IBM866"
};
SequenceModel Ibm855Model =
const SequenceModel Ibm855Model =
{
IBM855_CharToOrderMap,
RussianLangModel,

View file

@ -45,7 +45,7 @@
*****************************************************************/
//Character Mapping Table:
unsigned char Latin7_CharToOrderMap[] =
static const unsigned char Latin7_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -67,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
unsigned char win1253_CharToOrderMap[] =
static const unsigned char win1253_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -93,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
//first 1024 sequences:1.7001%
//rest sequences: 0.0359%
//negative sequences: 0.0148%
char GreekLangModel[] =
static const char GreekLangModel[] =
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@ -225,7 +225,7 @@ char GreekLangModel[] =
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
SequenceModel Latin7Model =
const SequenceModel Latin7Model =
{
Latin7_CharToOrderMap,
GreekLangModel,
@ -234,7 +234,7 @@ SequenceModel Latin7Model =
"ISO-8859-7"
};
SequenceModel Win1253Model =
const SequenceModel Win1253Model =
{
win1253_CharToOrderMap,
GreekLangModel,

View file

@ -50,7 +50,7 @@
//Windows-1255 language model
//Character Mapping Table:
unsigned char win1255_CharToOrderMap[] =
static const unsigned char win1255_CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -76,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
//first 1024 sequences: 1.5981%
//rest sequences: 0.087%
//negative sequences: 0.0015%
char HebrewLangModel[] =
static const char HebrewLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
@ -208,7 +208,7 @@ char HebrewLangModel[] =
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
};
SequenceModel Win1255Model =
const SequenceModel Win1255Model =
{
win1255_CharToOrderMap,
HebrewLangModel,

View file

@ -45,7 +45,7 @@
*****************************************************************/
//Character Mapping Table:
unsigned char Latin2_HungarianCharToOrderMap[] =
static const unsigned char Latin2_HungarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -65,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
};
unsigned char win1250HungarianCharToOrderMap[] =
static const unsigned char win1250HungarianCharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -91,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
//first 1024 sequences:5.2623%
//rest sequences: 0.8894%
//negative sequences: 0.0009%
char HungarianLangModel[] =
static const char HungarianLangModel[] =
{
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
@ -223,7 +223,7 @@ char HungarianLangModel[] =
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
};
SequenceModel Latin2HungarianModel =
const SequenceModel Latin2HungarianModel =
{
Latin2_HungarianCharToOrderMap,
HungarianLangModel,
@ -232,7 +232,7 @@ SequenceModel Latin2HungarianModel =
"ISO-8859-2"
};
SequenceModel Win1250HungarianModel =
const SequenceModel Win1250HungarianModel =
{
win1250HungarianCharToOrderMap,
HungarianLangModel,

View file

@ -49,7 +49,7 @@
//The following result for thai was collected from a limited sample (1M).
//Character Mapping Table:
unsigned char TIS620CharToOrderMap[] =
static const unsigned char TIS620CharToOrderMap[] =
{
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
@ -78,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
//first 1024 sequences:7.3177%
//rest sequences: 1.0230%
//negative sequences: 0.0436%
char ThaiLangModel[] =
static const char ThaiLangModel[] =
{
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
@ -211,7 +211,7 @@ char ThaiLangModel[] =
};
SequenceModel TIS620ThaiModel =
const SequenceModel TIS620ThaiModel =
{
TIS620CharToOrderMap,
ThaiLangModel,

View file

@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -86,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsBig5Prober::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

View file

@ -44,15 +44,17 @@
class nsBig5Prober: public nsCharSetProber {
public:
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();};
virtual ~nsBig5Prober(void){delete mCodingSM;};
nsBig5Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
Reset();}
virtual ~nsBig5Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Big5";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "Big5";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +65,7 @@ protected:
//Big5ContextAnalysis mContextAnalyser;
Big5DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -74,7 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a
if (meetMSB && curPtr > prevPtr)
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
newLen = PRUint32(newptr - *newBuf);
newLen = newptr - *newBuf;
return PR_TRUE;
}
@ -119,7 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen
while (prevPtr < curPtr)
*newptr++ = *prevPtr++;
newLen = PRUint32(newptr - *newBuf);
newLen = newptr - *newBuf;
return PR_TRUE;
}

View file

@ -52,7 +52,7 @@ typedef enum {
class nsCharSetProber {
public:
virtual ~nsCharSetProber() {};
virtual ~nsCharSetProber() {}
virtual const char* GetCharSetName() = 0;
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
virtual nsProbingState GetState(void) = 0;
@ -60,10 +60,6 @@ public:
virtual float GetConfidence(void) = 0;
virtual void SetOpion() = 0;
virtual const char* GetCharSetName(int i) { return GetCharSetName(); }
virtual float GetConfidence(int i) { return GetConfidence(); }
virtual int GetProbeCount(void) { return 1; }
#ifdef DEBUG_chardet
virtual void DumpStatus() {};
#endif

View file

@ -59,10 +59,7 @@ typedef struct
class nsCodingStateMachine {
public:
nsCodingStateMachine(SMModel* sm){
mCurrentState = eStart;
mModel = sm;
};
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
nsSMState NextState(char c){
//for each byte we get its class , if it is first byte, we also get byte length
PRUint32 byteCls = GETCLASS(c);
@ -76,33 +73,32 @@ public:
mModel->stateTable);
mCurrentBytePos++;
return mCurrentState;
};
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;};
void Reset(void) {mCurrentState = eStart;};
const char * GetCodingStateMachine() {return mModel->name;};
}
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
void Reset(void) {mCurrentState = eStart;}
const char * GetCodingStateMachine() {return mModel->name;}
protected:
nsSMState mCurrentState;
PRUint32 mCurrentCharLen;
PRUint32 mCurrentBytePos;
SMModel *mModel;
const SMModel *mModel;
};
extern SMModel UTF8SMModel;
extern SMModel Big5SMModel;
extern SMModel EUCJPSMModel;
extern SMModel EUCKRSMModel;
extern SMModel EUCTWSMModel;
extern SMModel GB18030SMModel;
extern SMModel SJISSMModel;
extern SMModel UCS2BESMModel;
extern const SMModel UTF8SMModel;
extern const SMModel Big5SMModel;
extern const SMModel EUCJPSMModel;
extern const SMModel EUCKRSMModel;
extern const SMModel EUCTWSMModel;
extern const SMModel GB18030SMModel;
extern const SMModel SJISSMModel;
extern SMModel HZSMModel;
extern SMModel ISO2022CNSMModel;
extern SMModel ISO2022JPSMModel;
extern SMModel ISO2022KRSMModel;
extern const SMModel HZSMModel;
extern const SMModel ISO2022CNSMModel;
extern const SMModel ISO2022JPSMModel;
extern const SMModel ISO2022KRSMModel;
#endif /* nsCodingStateMachine_h__ */

View file

@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -96,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCJPProber::GetConfidence(void)
{
float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence();
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}

View file

@ -50,15 +50,17 @@
class nsEUCJPProber: public nsCharSetProber {
public:
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();};
virtual ~nsEUCJPProber(void){delete mCodingSM;};
nsEUCJPProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
Reset();}
virtual ~nsEUCJPProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-JP";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "EUC-JP";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
nsCodingStateMachine* mCodingSM;
@ -68,6 +70,7 @@ protected:
EUCJPDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -89,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCKRProber::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

View file

@ -44,15 +44,18 @@
class nsEUCKRProber: public nsCharSetProber {
public:
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();};
virtual ~nsEUCKRProber(void){delete mCodingSM;};
nsEUCKRProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
Reset();
}
virtual ~nsEUCKRProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "EUC-KR";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "EUC-KR";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +66,7 @@ protected:
//EUCKRContextAnalysis mContextAnalyser;
EUCKRDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -89,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsEUCTWProber::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

View file

@ -44,15 +44,17 @@
class nsEUCTWProber: public nsCharSetProber {
public:
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();};
virtual ~nsEUCTWProber(void){delete mCodingSM;};
nsEUCTWProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
Reset();}
virtual ~nsEUCTWProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "x-euc-tw";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "x-euc-tw";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -63,6 +65,7 @@ protected:
//EUCTWContextAnalysis mContextAnalyser;
EUCTWDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -37,13 +37,21 @@
#include "nsEscCharsetProber.h"
#include "nsUniversalDetector.h"
nsEscCharSetProber::nsEscCharSetProber(void)
nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
{
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
mCodingSM[i] = nsnull;
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
{
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
}
if (aLanguageFilter & NS_FILTER_JAPANESE)
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
if (aLanguageFilter & NS_FILTER_KOREAN)
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting;
mDetectedCharset = nsnull;
@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void)
{
mState = eDetecting;
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
mCodingSM[i]->Reset();
if (mCodingSM[i])
mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull;
}
@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
{
for (j = mActiveSM-1; j>= 0; j--)
{
//byte is feed to all active state machine
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eError)
if (mCodingSM[j])
{
//got negative answer for this state machine, make it inactive
mActiveSM--;
if (mActiveSM == 0)
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eItsMe)
{
mState = eNotMe;
mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
return mState;
}
else if (j != (PRInt32)mActiveSM)
{
nsCodingStateMachine* t;
t = mCodingSM[mActiveSM];
mCodingSM[mActiveSM] = mCodingSM[j];
mCodingSM[j] = t;
}
}
else if (codingState == eItsMe)
{
mState = eFoundIt;
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
return mState;
}
}
}

View file

@ -45,14 +45,14 @@
class nsEscCharSetProber: public nsCharSetProber {
public:
nsEscCharSetProber(void);
nsEscCharSetProber(PRUint32 aLanguageFilter);
virtual ~nsEscCharSetProber(void);
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return mDetectedCharset;};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return mDetectedCharset;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void){return (float)0.99;};
void SetOpion() {};
float GetConfidence(void){return (float)0.99;}
void SetOpion() {}
protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);

View file

@ -36,7 +36,7 @@
* ***** END LICENSE BLOCK ***** */
#include "nsCodingStateMachine.h"
static PRUint32 HZ_cls[ 256 / 8 ] = {
static const PRUint32 HZ_cls[ 256 / 8 ] = {
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -72,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
};
static PRUint32 HZ_st [ 6] = {
static const PRUint32 HZ_st [ 6] = {
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
@ -83,7 +83,7 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel HZSMModel = {
const SMModel HZSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
@ -92,7 +92,7 @@ SMModel HZSMModel = {
};
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -128,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022CN_st [ 8] = {
static const PRUint32 ISO2022CN_st [ 8] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -141,7 +141,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022CNSMModel = {
const SMModel ISO2022CNSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
9,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
@ -149,7 +149,7 @@ SMModel ISO2022CNSMModel = {
"ISO-2022-CN",
};
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -185,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022JP_st [ 9] = {
static const PRUint32 ISO2022JP_st [ 9] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
@ -199,7 +199,7 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
SMModel ISO2022JPSMModel = {
const SMModel ISO2022JPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
10,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
@ -207,7 +207,7 @@ SMModel ISO2022JPSMModel = {
"ISO-2022-JP",
};
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
@ -243,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
};
static PRUint32 ISO2022KR_st [ 5] = {
static const PRUint32 ISO2022KR_st [ 5] = {
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
@ -253,7 +253,7 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
SMModel ISO2022KRSMModel = {
const SMModel ISO2022KRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },

View file

@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -94,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
float nsGB18030Prober::GetConfidence(void)
{
float distribCf = mDistributionAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (float)distribCf;
}

View file

@ -46,15 +46,17 @@
class nsGB18030Prober: public nsCharSetProber {
public:
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();};
virtual ~nsGB18030Prober(void){delete mCodingSM;};
nsGB18030Prober(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
Reset();}
virtual ~nsGB18030Prober(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "gb18030";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "gb18030";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
void GetDistribution(PRUint32 aCharLen, const char* aStr);
@ -65,6 +67,7 @@ protected:
//GB2312ContextAnalysis mContextAnalyser;
GB2312DistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -55,7 +55,7 @@ public:
virtual nsProbingState GetState(void);
virtual float GetConfidence(void) { return (float)0.0; }
virtual void SetOpion() {};
virtual void SetOpion() {}
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
{ mLogicalProb = logicalPrb; mVisualProb = visualPrb; }

View file

@ -50,7 +50,7 @@
#define ASO 7 // accent small other
#define CLASS_NUM 8 // total classes
static unsigned char Latin1_CharToClass[] =
static const unsigned char Latin1_CharToClass[] =
{
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
2 : normal
3 : very likely
*/
static unsigned char Latin1ClassModel[] =
static const unsigned char Latin1ClassModel[] =
{
/* UDF OTH ASC ASS ACV ACO ASV ASO */
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,

View file

@ -45,14 +45,14 @@
class nsLatin1Prober: public nsCharSetProber {
public:
nsLatin1Prober(void){Reset();};
virtual ~nsLatin1Prober(void){};
nsLatin1Prober(void){Reset();}
virtual ~nsLatin1Prober(void){}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "windows-1252";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "windows-1252";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
#ifdef DEBUG_chardet
virtual void DumpStatus();

View file

@ -39,6 +39,7 @@
#include <stdio.h>
#include "nsMBCSGroupProber.h"
#include "nsUniversalDetector.h"
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
const char *ProberName[] =
@ -54,15 +55,26 @@ const char *ProberName[] =
#endif
nsMBCSGroupProber::nsMBCSGroupProber()
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
{
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
mProbers[i] = nsnull;
mProbers[0] = new nsUTF8Prober();
mProbers[1] = new nsSJISProber();
mProbers[2] = new nsEUCJPProber();
mProbers[3] = new nsGB18030Prober();
mProbers[4] = new nsEUCKRProber();
mProbers[5] = new nsBig5Prober();
mProbers[6] = new nsEUCTWProber();
if (aLanguageFilter & NS_FILTER_JAPANESE)
{
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
}
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
if (aLanguageFilter & NS_FILTER_KOREAN)
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
{
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
}
Reset();
}
@ -134,16 +146,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
mState = eFoundIt;
return mState;
}
else if (st == eNotMe)
{
mIsActive[i] = PR_FALSE;
mActiveNum--;
if (mActiveNum <= 0)
{
mState = eNotMe;
return mState;
}
}
}
}
}
@ -154,23 +156,13 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
if (!mIsActive[i])
continue;
st = mProbers[i]->HandleData(aBuf + start, aLen + 1 - start);
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
if (st == eFoundIt)
{
mBestGuess = i;
mState = eFoundIt;
return mState;
}
else if (st == eNotMe)
{
mIsActive[i] = PR_FALSE;
mActiveNum--;
if (mActiveNum <= 0)
{
mState = eNotMe;
return mState;
}
}
}
}
mKeepNext = keepNext;

View file

@ -51,18 +51,14 @@
class nsMBCSGroupProber: public nsCharSetProber {
public:
nsMBCSGroupProber();
nsMBCSGroupProber(PRUint32 aLanguageFilter);
virtual ~nsMBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
nsProbingState GetState(void) {return mState;};
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
int GetProbeCount(void) { return NUM_OF_PROBERS; }
void SetOpion() {}
#ifdef DEBUG_chardet
void DumpStatus();

View file

@ -44,7 +44,7 @@ Modification from frank tang's original work:
// BIG5
static PRUint32 BIG5_cls [ 256 / 8 ] = {
static const PRUint32 BIG5_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -81,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
};
static PRUint32 BIG5_st [ 3] = {
static const PRUint32 BIG5_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
@ -89,7 +89,7 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
SMModel Big5SMModel = {
SMModel const Big5SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
5,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
@ -97,7 +97,7 @@ SMModel Big5SMModel = {
"Big5",
};
static PRUint32 EUCJP_cls [ 256 / 8 ] = {
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
@ -134,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
};
static PRUint32 EUCJP_st [ 5] = {
static const PRUint32 EUCJP_st [ 5] = {
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
@ -144,7 +144,7 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
SMModel EUCJPSMModel = {
const SMModel EUCJPSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
@ -152,7 +152,7 @@ SMModel EUCJPSMModel = {
"EUC-JP",
};
static PRUint32 EUCKR_cls [ 256 / 8 ] = {
static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -189,14 +189,14 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
};
static PRUint32 EUCKR_st [ 2] = {
static const PRUint32 EUCKR_st [ 2] = {
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
};
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
SMModel EUCKRSMModel = {
const SMModel EUCKRSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
4,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
@ -204,7 +204,7 @@ SMModel EUCKRSMModel = {
"EUC-KR",
};
static PRUint32 EUCTW_cls [ 256 / 8 ] = {
static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
@ -241,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
};
static PRUint32 EUCTW_st [ 6] = {
static const PRUint32 EUCTW_st [ 6] = {
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
@ -252,7 +252,7 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
SMModel EUCTWSMModel = {
const SMModel EUCTWSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
@ -316,7 +316,7 @@ SMModel GB2312SMModel = {
// the following state machine data was created by perl script in
// intl/chardet/tools. It should be the same as in PSM detector.
static PRUint32 GB18030_cls [ 256 / 8 ] = {
static const PRUint32 GB18030_cls [ 256 / 8 ] = {
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
@ -352,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
};
static PRUint32 GB18030_st [ 6] = {
static const PRUint32 GB18030_st [ 6] = {
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
@ -368,7 +368,7 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
// 2 here.
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
SMModel GB18030SMModel = {
const SMModel GB18030SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
7,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
@ -378,7 +378,7 @@ SMModel GB18030SMModel = {
// sjis
static PRUint32 SJIS_cls [ 256 / 8 ] = {
static const PRUint32 SJIS_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -417,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
};
static PRUint32 SJIS_st [ 3] = {
static const PRUint32 SJIS_st [ 3] = {
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
@ -425,7 +425,7 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
SMModel SJISSMModel = {
const SMModel SJISSMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
@ -434,120 +434,7 @@ SMModel SJISSMModel = {
};
static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
static PRUint32 UCS2BE_st [ 7] = {
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
};
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
SMModel UCS2BESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
UCS2BECharLenTable,
"UTF-16BE",
};
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
};
static PRUint32 UCS2LE_st [ 7] = {
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
};
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
SMModel UCS2LESMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
6,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
UCS2LECharLenTable,
"UTF-16LE",
};
static PRUint32 UTF8_cls [ 256 / 8 ] = {
static const PRUint32 UTF8_cls [ 256 / 8 ] = {
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
@ -584,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
};
static PRUint32 UTF8_st [ 26] = {
static const PRUint32 UTF8_st [ 26] = {
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
@ -616,7 +503,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
3, 3, 4, 4, 5, 5, 6, 6 };
SMModel UTF8SMModel = {
const SMModel UTF8SMModel = {
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
16,
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },

View file

@ -68,7 +68,7 @@ typedef struct nsPkgInt {
nsSftMsk sftmsk;
nsBitSft bitsft;
nsUnitMsk unitmsk;
PRUint32 *data;
const PRUint32* const data;
} nsPkgInt;

View file

@ -49,14 +49,10 @@ public:
virtual ~nsSBCSGroupProber();
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName();
nsProbingState GetState(void) {return mState;};
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
int GetProbeCount(void) { return NUM_OF_SBCS_PROBERS; }
void SetOpion() {}
#ifdef DEBUG_chardet
void DumpStatus();

View file

@ -51,27 +51,27 @@
typedef struct
{
unsigned char *charToOrderMap; // [256] table use to find a char's order
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
const char* charsetName;
const char* const charsetName;
} SequenceModel;
class nsSingleByteCharSetProber : public nsCharSetProber{
public:
nsSingleByteCharSetProber(SequenceModel *model)
nsSingleByteCharSetProber(const SequenceModel *model)
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
virtual const char* GetCharSetName();
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
virtual nsProbingState GetState(void) {return mState;};
virtual nsProbingState GetState(void) {return mState;}
virtual void Reset(void);
virtual float GetConfidence(void);
virtual void SetOpion() {};
virtual void SetOpion() {}
// This feature is not implemented yet. any current language model
// contain this parameter as PR_FALSE. No one is looking at this
@ -79,7 +79,7 @@ public:
// Moreover, the nsSBCSGroupProber which calls the HandleData of this
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
// of the English letters.
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented)
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
#ifdef DEBUG_chardet
virtual void DumpStatus();
@ -87,7 +87,7 @@ public:
protected:
nsProbingState mState;
const SequenceModel *mModel;
const SequenceModel* const mModel;
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
//char order of last character
@ -106,19 +106,19 @@ protected:
};
extern SequenceModel Koi8rModel;
extern SequenceModel Win1251Model;
extern SequenceModel Latin5Model;
extern SequenceModel MacCyrillicModel;
extern SequenceModel Ibm866Model;
extern SequenceModel Ibm855Model;
extern SequenceModel Latin7Model;
extern SequenceModel Win1253Model;
extern SequenceModel Latin5BulgarianModel;
extern SequenceModel Win1251BulgarianModel;
extern SequenceModel Latin2HungarianModel;
extern SequenceModel Win1250HungarianModel;
extern SequenceModel Win1255Model;
extern const SequenceModel Koi8rModel;
extern const SequenceModel Win1251Model;
extern const SequenceModel Latin5Model;
extern const SequenceModel MacCyrillicModel;
extern const SequenceModel Ibm866Model;
extern const SequenceModel Ibm855Model;
extern const SequenceModel Latin7Model;
extern const SequenceModel Win1253Model;
extern const SequenceModel Latin5BulgarianModel;
extern const SequenceModel Win1251BulgarianModel;
extern const SequenceModel Latin2HungarianModel;
extern const SequenceModel Win1250HungarianModel;
extern const SequenceModel Win1255Model;
#endif /* nsSingleByteCharSetProber_h__ */

View file

@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;
@ -95,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
float nsSJISProber::GetConfidence(void)
{
float contxtCf = mContextAnalyser.GetConfidence();
float distribCf = mDistributionAnalyser.GetConfidence();
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
return (contxtCf > distribCf ? contxtCf : distribCf);
}

View file

@ -51,15 +51,17 @@
class nsSJISProber: public nsCharSetProber {
public:
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();};
virtual ~nsSJISProber(void){delete mCodingSM;};
nsSJISProber(PRBool aIsPreferredLanguage)
:mIsPreferredLanguage(aIsPreferredLanguage)
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
Reset();}
virtual ~nsSJISProber(void){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "Shift_JIS";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "Shift_JIS";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
nsCodingStateMachine* mCodingSM;
@ -69,6 +71,7 @@ protected:
SJISDistributionAnalysis mDistributionAnalyser;
char mLastChar[2];
PRBool mIsPreferredLanguage;
};

View file

@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
for (PRUint32 i = 0; i < aLen; i++)
{
codingState = mCodingSM->NextState(aBuf[i]);
if (codingState == eError)
{
mState = eNotMe;
break;
}
if (codingState == eItsMe)
{
mState = eFoundIt;

View file

@ -45,14 +45,14 @@ class nsUTF8Prober: public nsCharSetProber {
public:
nsUTF8Prober(){mNumOfMBChar = 0;
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
Reset(); };
virtual ~nsUTF8Prober(){delete mCodingSM;};
Reset(); }
virtual ~nsUTF8Prober(){delete mCodingSM;}
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
const char* GetCharSetName() {return "UTF-8";};
nsProbingState GetState(void) {return mState;};
const char* GetCharSetName() {return "UTF-8";}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(void);
void SetOpion() {};
void SetOpion() {}
protected:
nsCodingStateMachine* mCodingSM;

View file

@ -44,9 +44,8 @@
#include "nsSBCSGroupProber.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"
#include "nsError.h"
nsUniversalDetector::nsUniversalDetector()
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
mDone = PR_FALSE;
mBestGuess = -1; //illegal value as signal
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
mGotData = PR_FALSE;
mInputState = ePureAscii;
mLastChar = '\0';
mLanguageFilter = aLanguageFilter;
PRUint32 i;
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
else if ('\xFF' == aBuf[1])
// FE FF UTF-16, big endian BOM
mDetectedCharset = "UTF-16BE";
mDetectedCharset = "UTF-16";
break;
case '\x00':
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
// 00 00 FE FF UTF-32, big-endian BOM
mDetectedCharset = "UTF-32BE";
mDetectedCharset = "UTF-32";
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
case '\xFF':
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
// FF FE 00 00 UTF-32, little-endian BOM
mDetectedCharset = "UTF-32LE";
mDetectedCharset = "UTF-32";
else if ('\xFE' == aBuf[1])
// FF FE UTF-16, little endian BOM
mDetectedCharset = "UTF-16LE";
mDetectedCharset = "UTF-16";
break;
} // switch
@ -172,16 +172,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
//start multibyte and singlebyte charset prober
if (nsnull == mCharSetProbers[0])
mCharSetProbers[0] = new nsMBCSGroupProber;
if (nsnull == mCharSetProbers[1])
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[2])
mCharSetProbers[2] = new nsLatin1Prober;
if ((nsnull == mCharSetProbers[0]) ||
(nsnull == mCharSetProbers[1]) ||
(nsnull == mCharSetProbers[2]))
{
mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
if (nsnull == mCharSetProbers[0])
return NS_ERROR_OUT_OF_MEMORY;
}
if (nsnull == mCharSetProbers[1] &&
(mLanguageFilter & NS_FILTER_NON_CJK))
{
mCharSetProbers[1] = new nsSBCSGroupProber;
if (nsnull == mCharSetProbers[1])
return NS_ERROR_OUT_OF_MEMORY;
}
if (nsnull == mCharSetProbers[2])
{
mCharSetProbers[2] = new nsLatin1Prober;
if (nsnull == mCharSetProbers[2])
return NS_ERROR_OUT_OF_MEMORY;
}
}
}
else
@ -202,7 +210,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
case eEscAscii:
if (nsnull == mEscCharSetProber) {
mEscCharSetProber = new nsEscCharSetProber;
mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
if (nsnull == mEscCharSetProber)
return NS_ERROR_OUT_OF_MEMORY;
}
@ -216,12 +224,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
case eHighbyte:
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
if (st == eFoundIt)
if (mCharSetProbers[i])
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
return NS_OK;
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
if (st == eFoundIt)
{
mDone = PR_TRUE;
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
return NS_OK;
}
}
}
break;
@ -260,11 +271,14 @@ void nsUniversalDetector::DataEnd()
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
if (mCharSetProbers[i])
{
maxProberConfidence = proberConfidence;
maxProber = i;
proberConfidence = mCharSetProbers[i]->GetConfidence();
if (proberConfidence > maxProberConfidence)
{
maxProberConfidence = proberConfidence;
maxProber = i;
}
}
}
//do not report anything because we are not confident of it, that's in fact a negative answer

View file

@ -38,8 +38,6 @@
#ifndef nsUniversalDetector_h__
#define nsUniversalDetector_h__
#include "nscore.h"
class nsCharSetProber;
#define NUM_OF_CHARSET_PROBERS 3
@ -50,9 +48,22 @@ typedef enum {
eHighbyte = 2
} nsInputState;
#define NS_FILTER_CHINESE_SIMPLIFIED 0x01
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
#define NS_FILTER_JAPANESE 0x04
#define NS_FILTER_KOREAN 0x08
#define NS_FILTER_NON_CJK 0x10
#define NS_FILTER_ALL 0x1F
#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
NS_FILTER_CHINESE_TRADITIONAL)
#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
NS_FILTER_CHINESE_TRADITIONAL | \
NS_FILTER_JAPANESE | \
NS_FILTER_KOREAN)
class nsUniversalDetector {
public:
nsUniversalDetector();
nsUniversalDetector(PRUint32 aLanguageFilter);
virtual ~nsUniversalDetector();
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
virtual void DataEnd(void);
@ -68,6 +79,7 @@ protected:
char mLastChar;
const char * mDetectedCharset;
PRInt32 mBestGuess;
PRUint32 mLanguageFilter;
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
nsCharSetProber *mEscCharSetProber;