Update universalchardet using a patch I made around 2009-02; the one we're currently using is from ~1998. I'll check again later to see if there are any updates to it before closing the ticket. Updates #866.
Originally committed to SVN as r3653.
This commit is contained in:
parent
05c9ffde7a
commit
42e0dd6ce4
43 changed files with 324 additions and 458 deletions
|
@ -106,17 +106,14 @@ wxString CharSetDetect::GetEncoding(wxString filename) {
|
|||
bool gotLocal = false;
|
||||
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
||||
if (mCharSetProbers[i]) {
|
||||
int probes = mCharSetProbers[i]->GetProbeCount();
|
||||
for (int j=0;j<probes;j++) {
|
||||
float conf = mCharSetProbers[i]->GetConfidence(j);
|
||||
float conf = mCharSetProbers[i]->GetConfidence();
|
||||
|
||||
// Only bother with those whose confidence is at least 1%
|
||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(j),wxConvUTF8);
|
||||
if (conf > 0.01f || curName == local) {
|
||||
results.push_back(CharDetResult());
|
||||
results.back().name = curName;
|
||||
results.back().confidence = mCharSetProbers[i]->GetConfidence(j);
|
||||
}
|
||||
// Only bother with those whose confidence is at least 1%
|
||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
|
||||
if (conf > 0.01f || curName == local) {
|
||||
results.push_back(CharDetResult());
|
||||
results.back().name = curName;
|
||||
results.back().confidence = conf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,9 @@
|
|||
|
||||
///////////
|
||||
// Headers
|
||||
#include "../universalchardet/nscore.h"
|
||||
#include "../universalchardet/nsUniversalDetector.h"
|
||||
#include "../universalchardet/nsMBCSGroupProber.h"
|
||||
|
||||
|
||||
/// DOCME
|
||||
|
@ -54,6 +56,7 @@ private:
|
|||
void Report(const char* aCharset);
|
||||
|
||||
public:
|
||||
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
|
||||
wxString GetEncoding(wxString filename);
|
||||
|
||||
/// @brief DOCME
|
||||
|
|
|
@ -49,12 +49,13 @@
|
|||
#define MINIMUM_DATA_THRESHOLD 4
|
||||
|
||||
//return confidence based on received data
|
||||
float CharDistributionAnalysis::GetConfidence()
|
||||
float CharDistributionAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
//if we didn't receive any character in our consideration range, or the
|
||||
// number of frequent characters is below the minimum threshold, return
|
||||
// negative answer
|
||||
if (mTotalChars <= 0 || mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
if (mTotalChars <= 0 ||
|
||||
!aIsPreferredLanguage && mFreqChars <= MINIMUM_DATA_THRESHOLD)
|
||||
return SURE_NO;
|
||||
|
||||
if (mTotalChars != mFreqChars) {
|
||||
|
|
|
@ -69,10 +69,10 @@ public:
|
|||
mFreqChars++;
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
//return confidence based on existing data
|
||||
float GetConfidence();
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
|
||||
//Reset analyser, clear any state
|
||||
void Reset(void)
|
||||
|
@ -80,21 +80,21 @@ public:
|
|||
mDone = PR_FALSE;
|
||||
mTotalChars = 0;
|
||||
mFreqChars = 0;
|
||||
};
|
||||
}
|
||||
|
||||
//This function is for future extension. Caller can use this function to control
|
||||
//analyser's behavior
|
||||
void SetOpion(){};
|
||||
void SetOpion(){}
|
||||
|
||||
//It is not necessary to receive all data to draw conclusion. For charset detection,
|
||||
// certain amount of data is enough
|
||||
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;};
|
||||
PRBool GotEnoughData() {return mTotalChars > ENOUGH_DATA_THRESHOLD;}
|
||||
|
||||
protected:
|
||||
//we do not handle character base on its original encoding string, but
|
||||
//convert this encoding string to a number, here called order.
|
||||
//This allow multiple encoding of a language to share one frequency table
|
||||
virtual PRInt32 GetOrder(const char* str) {return -1;};
|
||||
virtual PRInt32 GetOrder(const char* str) {return -1;}
|
||||
|
||||
//If this flag is set to PR_TRUE, detection is done and conclusion has been made
|
||||
PRBool mDone;
|
||||
|
@ -132,7 +132,7 @@ protected:
|
|||
return 94*((unsigned char)str[0]-(unsigned char)0xc4) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||
else
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -150,7 +150,7 @@ protected:
|
|||
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||
else
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
class GB2312DistributionAnalysis : public CharDistributionAnalysis
|
||||
|
@ -167,7 +167,7 @@ protected:
|
|||
return 94*((unsigned char)str[0]-(unsigned char)0xb0) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||
else
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -188,7 +188,7 @@ protected:
|
|||
return 157*((unsigned char)str[0]-(unsigned char)0xa4) + (unsigned char)str[1] - (unsigned char)0x40;
|
||||
else
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
class SJISDistributionAnalysis : public CharDistributionAnalysis
|
||||
|
@ -213,7 +213,7 @@ protected:
|
|||
if ((unsigned char)str[1] > (unsigned char)0x7f)
|
||||
order--;
|
||||
return order;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
class EUCJPDistributionAnalysis : public CharDistributionAnalysis
|
||||
|
@ -230,7 +230,7 @@ protected:
|
|||
return 94*((unsigned char)str[0]-(unsigned char)0xa1) + (unsigned char)str[1] - (unsigned char)0xa1;
|
||||
else
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
#endif //CharDistribution_h__
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#include "JpCntx.h"
|
||||
|
||||
//This is hiragana 2-char sequence table, the number in each cell represents its frequency category
|
||||
char jp2CharContext[83][83] =
|
||||
const char jp2CharContext[83][83] =
|
||||
{
|
||||
{ 0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,},
|
||||
{ 2,4,0,4,0,3,0,4,0,3,4,4,4,2,4,3,3,4,3,2,3,3,4,2,3,3,3,2,4,1,4,3,3,1,5,4,3,4,3,4,3,5,3,0,3,5,4,2,0,3,1,0,3,3,0,3,3,0,1,1,0,4,3,0,3,3,0,4,0,2,0,3,5,5,5,5,4,0,4,1,0,3,4,},
|
||||
|
@ -181,10 +181,10 @@ void JapaneseContextAnalysis::Reset(void)
|
|||
}
|
||||
#define DONT_KNOW (float)-1
|
||||
|
||||
float JapaneseContextAnalysis::GetConfidence()
|
||||
float JapaneseContextAnalysis::GetConfidence(PRBool aIsPreferredLanguage)
|
||||
{
|
||||
//This is just one way to calculate confidence. It works well for me.
|
||||
if (mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
if (aIsPreferredLanguage || mTotalRel > MINIMUM_DATA_THRESHOLD)
|
||||
return ((float)(mTotalRel - mRelSample[0]))/mTotalRel;
|
||||
else
|
||||
return (float)DONT_KNOW;
|
||||
|
@ -227,5 +227,3 @@ PRInt32 EUCJPContextAnalysis::GetOrder(const char* str, PRUint32 *charLen)
|
|||
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -73,12 +73,12 @@ public:
|
|||
mRelSample[jp2CharContext[mLastCharOrder][order]]++;
|
||||
}
|
||||
mLastCharOrder = order;
|
||||
};
|
||||
}
|
||||
|
||||
float GetConfidence();
|
||||
float GetConfidence(PRBool aIsPreferredLanguage);
|
||||
void Reset(void);
|
||||
void SetOpion(){};
|
||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;};
|
||||
void SetOpion(){}
|
||||
PRBool GotEnoughData() {return mTotalRel > ENOUGH_REL_THRESHOLD;}
|
||||
|
||||
protected:
|
||||
virtual PRInt32 GetOrder(const char* str, PRUint32 *charLen) = 0;
|
||||
|
@ -116,7 +116,7 @@ protected:
|
|||
(unsigned char)*(str+1) <= (unsigned char)0xf1)
|
||||
return (unsigned char)*(str+1) - (unsigned char)0x9f;
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
class EUCJPContextAnalysis : public JapaneseContextAnalysis
|
||||
|
@ -131,7 +131,7 @@ protected:
|
|||
(unsigned char)*(str+1) <= (unsigned char)0xf3)
|
||||
return (unsigned char)*(str+1) - (unsigned char)0xa1;
|
||||
return -1;
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* __JPCNTX_H__ */
|
||||
|
|
|
@ -48,7 +48,7 @@
|
|||
//this table is modified based on win1251BulgarianCharToOrderMap, so
|
||||
//only number <64 is sure valid
|
||||
|
||||
unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||
static const unsigned char Latin5_BulgarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -68,7 +68,7 @@ unsigned char Latin5_BulgarianCharToOrderMap[] =
|
|||
62,242,243,244, 58,245, 98,246,247,248,249,250,251, 91,252,253, //f0
|
||||
};
|
||||
|
||||
unsigned char win1251BulgarianCharToOrderMap[] =
|
||||
static const unsigned char win1251BulgarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -94,7 +94,7 @@ unsigned char win1251BulgarianCharToOrderMap[] =
|
|||
//first 1024 sequences:3.0618%
|
||||
//rest sequences: 0.2992%
|
||||
//negative sequences: 0.0020%
|
||||
char BulgarianLangModel[] =
|
||||
static const char BulgarianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,3,3,2,2,3,2,2,1,2,2,
|
||||
|
@ -226,7 +226,7 @@ char BulgarianLangModel[] =
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
|
||||
};
|
||||
|
||||
SequenceModel Latin5BulgarianModel =
|
||||
const SequenceModel Latin5BulgarianModel =
|
||||
{
|
||||
Latin5_BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
|
@ -235,7 +235,7 @@ SequenceModel Latin5BulgarianModel =
|
|||
"ISO-8859-5"
|
||||
};
|
||||
|
||||
SequenceModel Win1251BulgarianModel =
|
||||
const SequenceModel Win1251BulgarianModel =
|
||||
{
|
||||
win1251BulgarianCharToOrderMap,
|
||||
BulgarianLangModel,
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
|
||||
//KOI8-R language model
|
||||
//Character Mapping Table:
|
||||
unsigned char KOI8R_CharToOrderMap[] =
|
||||
static const unsigned char KOI8R_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -61,7 +61,7 @@ unsigned char KOI8R_CharToOrderMap[] =
|
|||
35, 43, 45, 32, 40, 52, 56, 33, 61, 62, 51, 57, 47, 63, 50, 70, //f0
|
||||
};
|
||||
|
||||
unsigned char win1251_CharToOrderMap[] =
|
||||
static const unsigned char win1251_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -81,7 +81,7 @@ unsigned char win1251_CharToOrderMap[] =
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27, 16,
|
||||
};
|
||||
|
||||
unsigned char latin5_CharToOrderMap[] =
|
||||
static const unsigned char latin5_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -101,7 +101,7 @@ unsigned char latin5_CharToOrderMap[] =
|
|||
239, 68,240,241,242,243,244,245,246,247,248,249,250,251,252,255,
|
||||
};
|
||||
|
||||
unsigned char macCyrillic_CharToOrderMap[] =
|
||||
static const unsigned char macCyrillic_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -121,7 +121,7 @@ unsigned char macCyrillic_CharToOrderMap[] =
|
|||
9, 7, 6, 14, 39, 26, 28, 22, 25, 29, 54, 18, 17, 30, 27,255,
|
||||
};
|
||||
|
||||
unsigned char IBM855_CharToOrderMap[] =
|
||||
static const unsigned char IBM855_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -141,7 +141,7 @@ unsigned char IBM855_CharToOrderMap[] =
|
|||
250, 18, 62, 20, 51, 25, 57, 30, 47, 29, 63, 22, 50,251,252,255,
|
||||
};
|
||||
|
||||
unsigned char IBM866_CharToOrderMap[] =
|
||||
static const unsigned char IBM866_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -167,7 +167,7 @@ unsigned char IBM866_CharToOrderMap[] =
|
|||
//first 1024 sequences: 2.3389%
|
||||
//rest sequences: 0.1237%
|
||||
//negative sequences: 0.0009%
|
||||
char RussianLangModel[] =
|
||||
static const char RussianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,1,3,3,3,3,1,3,3,3,2,3,2,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,2,2,2,2,0,0,2,
|
||||
|
@ -300,7 +300,7 @@ char RussianLangModel[] =
|
|||
};
|
||||
|
||||
|
||||
SequenceModel Koi8rModel =
|
||||
const SequenceModel Koi8rModel =
|
||||
{
|
||||
KOI8R_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
@ -309,7 +309,7 @@ SequenceModel Koi8rModel =
|
|||
"KOI8-R"
|
||||
};
|
||||
|
||||
SequenceModel Win1251Model =
|
||||
const SequenceModel Win1251Model =
|
||||
{
|
||||
win1251_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
@ -318,7 +318,7 @@ SequenceModel Win1251Model =
|
|||
"windows-1251"
|
||||
};
|
||||
|
||||
SequenceModel Latin5Model =
|
||||
const SequenceModel Latin5Model =
|
||||
{
|
||||
latin5_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
@ -327,7 +327,7 @@ SequenceModel Latin5Model =
|
|||
"ISO-8859-5"
|
||||
};
|
||||
|
||||
SequenceModel MacCyrillicModel =
|
||||
const SequenceModel MacCyrillicModel =
|
||||
{
|
||||
macCyrillic_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
@ -336,7 +336,7 @@ SequenceModel MacCyrillicModel =
|
|||
"x-mac-cyrillic"
|
||||
};
|
||||
|
||||
SequenceModel Ibm866Model =
|
||||
const SequenceModel Ibm866Model =
|
||||
{
|
||||
IBM866_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
@ -345,7 +345,7 @@ SequenceModel Ibm866Model =
|
|||
"IBM866"
|
||||
};
|
||||
|
||||
SequenceModel Ibm855Model =
|
||||
const SequenceModel Ibm855Model =
|
||||
{
|
||||
IBM855_CharToOrderMap,
|
||||
RussianLangModel,
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
*****************************************************************/
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char Latin7_CharToOrderMap[] =
|
||||
static const unsigned char Latin7_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -67,7 +67,7 @@ unsigned char Latin7_CharToOrderMap[] =
|
|||
|
||||
|
||||
|
||||
unsigned char win1253_CharToOrderMap[] =
|
||||
static const unsigned char win1253_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -93,7 +93,7 @@ unsigned char win1253_CharToOrderMap[] =
|
|||
//first 1024 sequences:1.7001%
|
||||
//rest sequences: 0.0359%
|
||||
//negative sequences: 0.0148%
|
||||
char GreekLangModel[] =
|
||||
static const char GreekLangModel[] =
|
||||
{
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
|
@ -225,7 +225,7 @@ char GreekLangModel[] =
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Latin7Model =
|
||||
const SequenceModel Latin7Model =
|
||||
{
|
||||
Latin7_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
|
@ -234,7 +234,7 @@ SequenceModel Latin7Model =
|
|||
"ISO-8859-7"
|
||||
};
|
||||
|
||||
SequenceModel Win1253Model =
|
||||
const SequenceModel Win1253Model =
|
||||
{
|
||||
win1253_CharToOrderMap,
|
||||
GreekLangModel,
|
||||
|
|
|
@ -50,7 +50,7 @@
|
|||
|
||||
//Windows-1255 language model
|
||||
//Character Mapping Table:
|
||||
unsigned char win1255_CharToOrderMap[] =
|
||||
static const unsigned char win1255_CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -76,7 +76,7 @@ unsigned char win1255_CharToOrderMap[] =
|
|||
//first 1024 sequences: 1.5981%
|
||||
//rest sequences: 0.087%
|
||||
//negative sequences: 0.0015%
|
||||
char HebrewLangModel[] =
|
||||
static const char HebrewLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,3,3,2,3,2,1,2,0,1,0,0,
|
||||
3,0,3,1,0,0,1,3,2,0,1,1,2,0,2,2,2,1,1,1,1,2,1,1,1,2,0,0,2,2,0,1,
|
||||
|
@ -208,7 +208,7 @@ char HebrewLangModel[] =
|
|||
0,0,0,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,1,1,1,1,1,0,1,0,0,0,1,1,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Win1255Model =
|
||||
const SequenceModel Win1255Model =
|
||||
{
|
||||
win1255_CharToOrderMap,
|
||||
HebrewLangModel,
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
*****************************************************************/
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||
static const unsigned char Latin2_HungarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -65,7 +65,7 @@ unsigned char Latin2_HungarianCharToOrderMap[] =
|
|||
245,246,247, 25, 73, 42, 24,248,249,250, 31, 56, 29,251,252,253,
|
||||
};
|
||||
|
||||
unsigned char win1250HungarianCharToOrderMap[] =
|
||||
static const unsigned char win1250HungarianCharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -91,7 +91,7 @@ unsigned char win1250HungarianCharToOrderMap[] =
|
|||
//first 1024 sequences:5.2623%
|
||||
//rest sequences: 0.8894%
|
||||
//negative sequences: 0.0009%
|
||||
char HungarianLangModel[] =
|
||||
static const char HungarianLangModel[] =
|
||||
{
|
||||
0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
||||
3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,2,2,3,3,1,1,2,2,2,2,2,1,2,
|
||||
|
@ -223,7 +223,7 @@ char HungarianLangModel[] =
|
|||
0,1,1,1,1,1,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,
|
||||
};
|
||||
|
||||
SequenceModel Latin2HungarianModel =
|
||||
const SequenceModel Latin2HungarianModel =
|
||||
{
|
||||
Latin2_HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
|
@ -232,7 +232,7 @@ SequenceModel Latin2HungarianModel =
|
|||
"ISO-8859-2"
|
||||
};
|
||||
|
||||
SequenceModel Win1250HungarianModel =
|
||||
const SequenceModel Win1250HungarianModel =
|
||||
{
|
||||
win1250HungarianCharToOrderMap,
|
||||
HungarianLangModel,
|
||||
|
|
|
@ -49,7 +49,7 @@
|
|||
//The following result for thai was collected from a limited sample (1M).
|
||||
|
||||
//Character Mapping Table:
|
||||
unsigned char TIS620CharToOrderMap[] =
|
||||
static const unsigned char TIS620CharToOrderMap[] =
|
||||
{
|
||||
255,255,255,255,255,255,255,255,255,255,254,255,255,254,255,255, //00
|
||||
255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255, //10
|
||||
|
@ -78,7 +78,7 @@ unsigned char TIS620CharToOrderMap[] =
|
|||
//first 1024 sequences:7.3177%
|
||||
//rest sequences: 1.0230%
|
||||
//negative sequences: 0.0436%
|
||||
char ThaiLangModel[] =
|
||||
static const char ThaiLangModel[] =
|
||||
{
|
||||
0,1,3,3,3,3,0,0,3,3,0,3,3,0,3,3,3,3,3,3,3,3,0,0,3,3,3,0,3,3,3,3,
|
||||
0,3,3,0,0,0,1,3,0,3,3,2,3,3,0,1,2,3,3,3,3,0,2,0,2,0,0,3,2,1,2,2,
|
||||
|
@ -211,7 +211,7 @@ char ThaiLangModel[] =
|
|||
};
|
||||
|
||||
|
||||
SequenceModel TIS620ThaiModel =
|
||||
const SequenceModel TIS620ThaiModel =
|
||||
{
|
||||
TIS620CharToOrderMap,
|
||||
ThaiLangModel,
|
||||
|
|
|
@ -51,11 +51,6 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -86,7 +81,7 @@ nsProbingState nsBig5Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsBig5Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,15 +44,17 @@
|
|||
|
||||
class nsBig5Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsBig5Prober(void){mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();};
|
||||
virtual ~nsBig5Prober(void){delete mCodingSM;};
|
||||
nsBig5Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&Big5SMModel);
|
||||
Reset();}
|
||||
virtual ~nsBig5Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "Big5";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "Big5";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||
|
@ -63,6 +65,7 @@ protected:
|
|||
//Big5ContextAnalysis mContextAnalyser;
|
||||
Big5DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ PRBool nsCharSetProber::FilterWithoutEnglishLetters(const char* aBuf, PRUint32 a
|
|||
if (meetMSB && curPtr > prevPtr)
|
||||
while (prevPtr < curPtr) *newptr++ = *prevPtr++;
|
||||
|
||||
newLen = PRUint32(newptr - *newBuf);
|
||||
newLen = newptr - *newBuf;
|
||||
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
@ -119,7 +119,7 @@ PRBool nsCharSetProber::FilterWithEnglishLetters(const char* aBuf, PRUint32 aLen
|
|||
while (prevPtr < curPtr)
|
||||
*newptr++ = *prevPtr++;
|
||||
|
||||
newLen = PRUint32(newptr - *newBuf);
|
||||
newLen = newptr - *newBuf;
|
||||
|
||||
return PR_TRUE;
|
||||
}
|
||||
|
|
|
@ -52,7 +52,7 @@ typedef enum {
|
|||
|
||||
class nsCharSetProber {
|
||||
public:
|
||||
virtual ~nsCharSetProber() {};
|
||||
virtual ~nsCharSetProber() {}
|
||||
virtual const char* GetCharSetName() = 0;
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen) = 0;
|
||||
virtual nsProbingState GetState(void) = 0;
|
||||
|
@ -60,10 +60,6 @@ public:
|
|||
virtual float GetConfidence(void) = 0;
|
||||
virtual void SetOpion() = 0;
|
||||
|
||||
virtual const char* GetCharSetName(int i) { return GetCharSetName(); }
|
||||
virtual float GetConfidence(int i) { return GetConfidence(); }
|
||||
virtual int GetProbeCount(void) { return 1; }
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
virtual void DumpStatus() {};
|
||||
#endif
|
||||
|
|
|
@ -59,10 +59,7 @@ typedef struct
|
|||
|
||||
class nsCodingStateMachine {
|
||||
public:
|
||||
nsCodingStateMachine(SMModel* sm){
|
||||
mCurrentState = eStart;
|
||||
mModel = sm;
|
||||
};
|
||||
nsCodingStateMachine(const SMModel* sm) : mModel(sm) { mCurrentState = eStart; }
|
||||
nsSMState NextState(char c){
|
||||
//for each byte we get its class , if it is first byte, we also get byte length
|
||||
PRUint32 byteCls = GETCLASS(c);
|
||||
|
@ -76,33 +73,32 @@ public:
|
|||
mModel->stateTable);
|
||||
mCurrentBytePos++;
|
||||
return mCurrentState;
|
||||
};
|
||||
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;};
|
||||
void Reset(void) {mCurrentState = eStart;};
|
||||
const char * GetCodingStateMachine() {return mModel->name;};
|
||||
}
|
||||
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
|
||||
void Reset(void) {mCurrentState = eStart;}
|
||||
const char * GetCodingStateMachine() {return mModel->name;}
|
||||
|
||||
protected:
|
||||
nsSMState mCurrentState;
|
||||
PRUint32 mCurrentCharLen;
|
||||
PRUint32 mCurrentBytePos;
|
||||
|
||||
SMModel *mModel;
|
||||
const SMModel *mModel;
|
||||
};
|
||||
|
||||
extern SMModel UTF8SMModel;
|
||||
extern SMModel Big5SMModel;
|
||||
extern SMModel EUCJPSMModel;
|
||||
extern SMModel EUCKRSMModel;
|
||||
extern SMModel EUCTWSMModel;
|
||||
extern SMModel GB18030SMModel;
|
||||
extern SMModel SJISSMModel;
|
||||
extern SMModel UCS2BESMModel;
|
||||
extern const SMModel UTF8SMModel;
|
||||
extern const SMModel Big5SMModel;
|
||||
extern const SMModel EUCJPSMModel;
|
||||
extern const SMModel EUCKRSMModel;
|
||||
extern const SMModel EUCTWSMModel;
|
||||
extern const SMModel GB18030SMModel;
|
||||
extern const SMModel SJISSMModel;
|
||||
|
||||
|
||||
extern SMModel HZSMModel;
|
||||
extern SMModel ISO2022CNSMModel;
|
||||
extern SMModel ISO2022JPSMModel;
|
||||
extern SMModel ISO2022KRSMModel;
|
||||
extern const SMModel HZSMModel;
|
||||
extern const SMModel ISO2022CNSMModel;
|
||||
extern const SMModel ISO2022JPSMModel;
|
||||
extern const SMModel ISO2022KRSMModel;
|
||||
|
||||
#endif /* nsCodingStateMachine_h__ */
|
||||
|
||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -96,8 +91,8 @@ nsProbingState nsEUCJPProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCJPProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
@ -50,15 +50,17 @@
|
|||
|
||||
class nsEUCJPProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCJPProber(void){mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();};
|
||||
virtual ~nsEUCJPProber(void){delete mCodingSM;};
|
||||
nsEUCJPProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCJPSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCJPProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "EUC-JP";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "EUC-JP";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
nsCodingStateMachine* mCodingSM;
|
||||
|
@ -68,6 +70,7 @@ protected:
|
|||
EUCJPDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -52,11 +52,6 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -89,7 +84,7 @@ nsProbingState nsEUCKRProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCKRProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,15 +44,18 @@
|
|||
|
||||
class nsEUCKRProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCKRProber(void){mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();};
|
||||
virtual ~nsEUCKRProber(void){delete mCodingSM;};
|
||||
nsEUCKRProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCKRSMModel);
|
||||
Reset();
|
||||
}
|
||||
virtual ~nsEUCKRProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "EUC-KR";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "EUC-KR";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||
|
@ -63,6 +66,7 @@ protected:
|
|||
//EUCKRContextAnalysis mContextAnalyser;
|
||||
EUCKRDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -52,11 +52,6 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -89,7 +84,7 @@ nsProbingState nsEUCTWProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsEUCTWProber::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -44,15 +44,17 @@
|
|||
|
||||
class nsEUCTWProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEUCTWProber(void){mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();};
|
||||
virtual ~nsEUCTWProber(void){delete mCodingSM;};
|
||||
nsEUCTWProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&EUCTWSMModel);
|
||||
Reset();}
|
||||
virtual ~nsEUCTWProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "x-euc-tw";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "x-euc-tw";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||
|
@ -63,6 +65,7 @@ protected:
|
|||
//EUCTWContextAnalysis mContextAnalyser;
|
||||
EUCTWDistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -37,13 +37,21 @@
|
|||
|
||||
|
||||
#include "nsEscCharsetProber.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
nsEscCharSetProber::nsEscCharSetProber(void)
|
||||
nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
||||
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
||||
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
||||
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
||||
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||
mCodingSM[i] = nsnull;
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||
{
|
||||
mCodingSM[0] = new nsCodingStateMachine(&HZSMModel);
|
||||
mCodingSM[1] = new nsCodingStateMachine(&ISO2022CNSMModel);
|
||||
}
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
mCodingSM[2] = new nsCodingStateMachine(&ISO2022JPSMModel);
|
||||
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||
mCodingSM[3] = new nsCodingStateMachine(&ISO2022KRSMModel);
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mState = eDetecting;
|
||||
mDetectedCharset = nsnull;
|
||||
|
@ -59,7 +67,8 @@ void nsEscCharSetProber::Reset(void)
|
|||
{
|
||||
mState = eDetecting;
|
||||
for (PRUint32 i = 0; i < NUM_OF_ESC_CHARSETS; i++)
|
||||
mCodingSM[i]->Reset();
|
||||
if (mCodingSM[i])
|
||||
mCodingSM[i]->Reset();
|
||||
mActiveSM = NUM_OF_ESC_CHARSETS;
|
||||
mDetectedCharset = nsnull;
|
||||
}
|
||||
|
@ -74,30 +83,15 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
{
|
||||
for (j = mActiveSM-1; j>= 0; j--)
|
||||
{
|
||||
//byte is feed to all active state machine
|
||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
if (mCodingSM[j])
|
||||
{
|
||||
//got negative answer for this state machine, make it inactive
|
||||
mActiveSM--;
|
||||
if (mActiveSM == 0)
|
||||
codingState = mCodingSM[j]->NextState(aBuf[i]);
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eNotMe;
|
||||
mState = eFoundIt;
|
||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||
return mState;
|
||||
}
|
||||
else if (j != (PRInt32)mActiveSM)
|
||||
{
|
||||
nsCodingStateMachine* t;
|
||||
t = mCodingSM[mActiveSM];
|
||||
mCodingSM[mActiveSM] = mCodingSM[j];
|
||||
mCodingSM[j] = t;
|
||||
}
|
||||
}
|
||||
else if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,14 +45,14 @@
|
|||
|
||||
class nsEscCharSetProber: public nsCharSetProber {
|
||||
public:
|
||||
nsEscCharSetProber(void);
|
||||
nsEscCharSetProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsEscCharSetProber(void);
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return mDetectedCharset;};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return mDetectedCharset;}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void){return (float)0.99;};
|
||||
void SetOpion() {};
|
||||
float GetConfidence(void){return (float)0.99;}
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||
|
|
|
@ -36,7 +36,7 @@
|
|||
* ***** END LICENSE BLOCK ***** */
|
||||
#include "nsCodingStateMachine.h"
|
||||
|
||||
static PRUint32 HZ_cls[ 256 / 8 ] = {
|
||||
static const PRUint32 HZ_cls[ 256 / 8 ] = {
|
||||
PCK4BITS(1,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
|
@ -72,7 +72,7 @@ PCK4BITS(1,1,1,1,1,1,1,1) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 HZ_st [ 6] = {
|
||||
static const PRUint32 HZ_st [ 6] = {
|
||||
PCK4BITS(eStart,eError, 3,eStart,eStart,eStart,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart, 4,eError),//10-17
|
||||
|
@ -83,7 +83,7 @@ PCK4BITS( 4,eItsMe,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
|||
|
||||
static const PRUint32 HZCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel HZSMModel = {
|
||||
const SMModel HZSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, HZ_st },
|
||||
|
@ -92,7 +92,7 @@ SMModel HZSMModel = {
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022CN_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
|
@ -128,7 +128,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022CN_st [ 8] = {
|
||||
static const PRUint32 ISO2022CN_st [ 8] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||
PCK4BITS(eStart,eError,eError,eError,eError,eError,eError,eError),//08-0f
|
||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||
|
@ -141,7 +141,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eItsMe,eError,eStart) //38-3f
|
|||
|
||||
static const PRUint32 ISO2022CNCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022CNSMModel = {
|
||||
const SMModel ISO2022CNSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_cls },
|
||||
9,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022CN_st },
|
||||
|
@ -149,7 +149,7 @@ SMModel ISO2022CNSMModel = {
|
|||
"ISO-2022-CN",
|
||||
};
|
||||
|
||||
static PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022JP_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,2,2), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
|
@ -185,7 +185,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022JP_st [ 9] = {
|
||||
static const PRUint32 ISO2022JP_st [ 9] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eStart,eStart),//00-07
|
||||
PCK4BITS(eStart,eStart,eError,eError,eError,eError,eError,eError),//08-0f
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//10-17
|
||||
|
@ -199,7 +199,7 @@ PCK4BITS(eError,eError,eError,eError,eItsMe,eError,eStart,eStart) //40-47
|
|||
|
||||
static const PRUint32 ISO2022JPCharLenTable[] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022JPSMModel = {
|
||||
const SMModel ISO2022JPSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_cls },
|
||||
10,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022JP_st },
|
||||
|
@ -207,7 +207,7 @@ SMModel ISO2022JPSMModel = {
|
|||
"ISO-2022-JP",
|
||||
};
|
||||
|
||||
static PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 ISO2022KR_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(2,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
|
@ -243,7 +243,7 @@ PCK4BITS(2,2,2,2,2,2,2,2) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 ISO2022KR_st [ 5] = {
|
||||
static const PRUint32 ISO2022KR_st [ 5] = {
|
||||
PCK4BITS(eStart, 3,eError,eStart,eStart,eStart,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eError, 4,eError,eError),//10-17
|
||||
|
@ -253,7 +253,7 @@ PCK4BITS(eError,eError,eError,eItsMe,eStart,eStart,eStart,eStart) //20-27
|
|||
|
||||
static const PRUint32 ISO2022KRCharLenTable[] = {0, 0, 0, 0, 0, 0};
|
||||
|
||||
SMModel ISO2022KRSMModel = {
|
||||
const SMModel ISO2022KRSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, ISO2022KR_st },
|
||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -94,7 +89,7 @@ nsProbingState nsGB18030Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsGB18030Prober::GetConfidence(void)
|
||||
{
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (float)distribCf;
|
||||
}
|
||||
|
|
|
@ -46,15 +46,17 @@
|
|||
|
||||
class nsGB18030Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsGB18030Prober(void){mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();};
|
||||
virtual ~nsGB18030Prober(void){delete mCodingSM;};
|
||||
nsGB18030Prober(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&GB18030SMModel);
|
||||
Reset();}
|
||||
virtual ~nsGB18030Prober(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "gb18030";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "gb18030";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
void GetDistribution(PRUint32 aCharLen, const char* aStr);
|
||||
|
@ -65,6 +67,7 @@ protected:
|
|||
//GB2312ContextAnalysis mContextAnalyser;
|
||||
GB2312DistributionAnalysis mDistributionAnalyser;
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@ public:
|
|||
virtual nsProbingState GetState(void);
|
||||
|
||||
virtual float GetConfidence(void) { return (float)0.0; }
|
||||
virtual void SetOpion() {};
|
||||
virtual void SetOpion() {}
|
||||
|
||||
void SetModelProbers(nsCharSetProber *logicalPrb, nsCharSetProber *visualPrb)
|
||||
{ mLogicalProb = logicalPrb; mVisualProb = visualPrb; }
|
||||
|
|
|
@ -50,7 +50,7 @@
|
|||
#define ASO 7 // accent small other
|
||||
#define CLASS_NUM 8 // total classes
|
||||
|
||||
static unsigned char Latin1_CharToClass[] =
|
||||
static const unsigned char Latin1_CharToClass[] =
|
||||
{
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 00 - 07
|
||||
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, // 08 - 0F
|
||||
|
@ -92,7 +92,7 @@ static unsigned char Latin1_CharToClass[] =
|
|||
2 : normal
|
||||
3 : very likely
|
||||
*/
|
||||
static unsigned char Latin1ClassModel[] =
|
||||
static const unsigned char Latin1ClassModel[] =
|
||||
{
|
||||
/* UDF OTH ASC ASS ACV ACO ASV ASO */
|
||||
/*UDF*/ 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
|
|
|
@ -45,14 +45,14 @@
|
|||
|
||||
class nsLatin1Prober: public nsCharSetProber {
|
||||
public:
|
||||
nsLatin1Prober(void){Reset();};
|
||||
virtual ~nsLatin1Prober(void){};
|
||||
nsLatin1Prober(void){Reset();}
|
||||
virtual ~nsLatin1Prober(void){}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "windows-1252";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "windows-1252";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
virtual void DumpStatus();
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
#include <stdio.h>
|
||||
|
||||
#include "nsMBCSGroupProber.h"
|
||||
#include "nsUniversalDetector.h"
|
||||
|
||||
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers)
|
||||
const char *ProberName[] =
|
||||
|
@ -54,15 +55,26 @@ const char *ProberName[] =
|
|||
|
||||
#endif
|
||||
|
||||
nsMBCSGroupProber::nsMBCSGroupProber()
|
||||
nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter)
|
||||
{
|
||||
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
|
||||
mProbers[i] = nsnull;
|
||||
|
||||
mProbers[0] = new nsUTF8Prober();
|
||||
mProbers[1] = new nsSJISProber();
|
||||
mProbers[2] = new nsEUCJPProber();
|
||||
mProbers[3] = new nsGB18030Prober();
|
||||
mProbers[4] = new nsEUCKRProber();
|
||||
mProbers[5] = new nsBig5Prober();
|
||||
mProbers[6] = new nsEUCTWProber();
|
||||
if (aLanguageFilter & NS_FILTER_JAPANESE)
|
||||
{
|
||||
mProbers[1] = new nsSJISProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
mProbers[2] = new nsEUCJPProber(aLanguageFilter == NS_FILTER_JAPANESE);
|
||||
}
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_SIMPLIFIED)
|
||||
mProbers[3] = new nsGB18030Prober(aLanguageFilter == NS_FILTER_CHINESE_SIMPLIFIED);
|
||||
if (aLanguageFilter & NS_FILTER_KOREAN)
|
||||
mProbers[4] = new nsEUCKRProber(aLanguageFilter == NS_FILTER_KOREAN);
|
||||
if (aLanguageFilter & NS_FILTER_CHINESE_TRADITIONAL)
|
||||
{
|
||||
mProbers[5] = new nsBig5Prober(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
mProbers[6] = new nsEUCTWProber(aLanguageFilter == NS_FILTER_CHINESE_TRADITIONAL);
|
||||
}
|
||||
Reset();
|
||||
}
|
||||
|
||||
|
@ -134,16 +146,6 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
else if (st == eNotMe)
|
||||
{
|
||||
mIsActive[i] = PR_FALSE;
|
||||
mActiveNum--;
|
||||
if (mActiveNum <= 0)
|
||||
{
|
||||
mState = eNotMe;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -154,23 +156,13 @@ nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
{
|
||||
if (!mIsActive[i])
|
||||
continue;
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen + 1 - start);
|
||||
st = mProbers[i]->HandleData(aBuf + start, aLen - start);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mBestGuess = i;
|
||||
mState = eFoundIt;
|
||||
return mState;
|
||||
}
|
||||
else if (st == eNotMe)
|
||||
{
|
||||
mIsActive[i] = PR_FALSE;
|
||||
mActiveNum--;
|
||||
if (mActiveNum <= 0)
|
||||
{
|
||||
mState = eNotMe;
|
||||
return mState;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
mKeepNext = keepNext;
|
||||
|
|
|
@ -51,18 +51,14 @@
|
|||
|
||||
class nsMBCSGroupProber: public nsCharSetProber {
|
||||
public:
|
||||
nsMBCSGroupProber();
|
||||
nsMBCSGroupProber(PRUint32 aLanguageFilter);
|
||||
virtual ~nsMBCSGroupProber();
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName();
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
|
||||
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
|
||||
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
|
||||
int GetProbeCount(void) { return NUM_OF_PROBERS; }
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
|
|
|
@ -44,7 +44,7 @@ Modification from frank tang's original work:
|
|||
|
||||
// BIG5
|
||||
|
||||
static PRUint32 BIG5_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 BIG5_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as legal value
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
|
@ -81,7 +81,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 BIG5_st [ 3] = {
|
||||
static const PRUint32 BIG5_st [ 3] = {
|
||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError),//08-0f
|
||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
||||
|
@ -89,7 +89,7 @@ PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart,eStart,eStart) //10-17
|
|||
|
||||
static const PRUint32 Big5CharLenTable[] = {0, 1, 1, 2, 0};
|
||||
|
||||
SMModel Big5SMModel = {
|
||||
SMModel const Big5SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_cls },
|
||||
5,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, BIG5_st },
|
||||
|
@ -97,7 +97,7 @@ SMModel Big5SMModel = {
|
|||
"Big5",
|
||||
};
|
||||
|
||||
static PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCJP_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(5,4,4,4,4,4,4,4), // 00 - 07
|
||||
PCK4BITS(4,4,4,4,4,4,4,4), // 00 - 07
|
||||
PCK4BITS(4,4,4,4,4,4,5,5), // 08 - 0f
|
||||
|
@ -134,7 +134,7 @@ PCK4BITS(0,0,0,0,0,0,0,5) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 EUCJP_st [ 5] = {
|
||||
static const PRUint32 EUCJP_st [ 5] = {
|
||||
PCK4BITS( 3, 4, 3, 5,eStart,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eStart,eError,eStart,eError,eError,eError),//10-17
|
||||
|
@ -144,7 +144,7 @@ PCK4BITS( 3,eError,eError,eError,eStart,eStart,eStart,eStart) //20-27
|
|||
|
||||
static const PRUint32 EUCJPCharLenTable[] = {2, 2, 2, 3, 1, 0};
|
||||
|
||||
SMModel EUCJPSMModel = {
|
||||
const SMModel EUCJPSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCJP_st },
|
||||
|
@ -152,7 +152,7 @@ SMModel EUCJPSMModel = {
|
|||
"EUC-JP",
|
||||
};
|
||||
|
||||
static PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCKR_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
|
@ -189,14 +189,14 @@ PCK4BITS(2,2,2,2,2,2,2,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 EUCKR_st [ 2] = {
|
||||
static const PRUint32 EUCKR_st [ 2] = {
|
||||
PCK4BITS(eError,eStart, 3,eError,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart,eStart) //08-0f
|
||||
};
|
||||
|
||||
static const PRUint32 EUCKRCharLenTable[] = {0, 1, 2, 0};
|
||||
|
||||
SMModel EUCKRSMModel = {
|
||||
const SMModel EUCKRSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_cls },
|
||||
4,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCKR_st },
|
||||
|
@ -204,7 +204,7 @@ SMModel EUCKRSMModel = {
|
|||
"EUC-KR",
|
||||
};
|
||||
|
||||
static PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 EUCTW_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,2,2,2,2,2,2,2), // 00 - 07
|
||||
PCK4BITS(2,2,2,2,2,2,2,2), // 00 - 07
|
||||
PCK4BITS(2,2,2,2,2,2,0,0), // 08 - 0f
|
||||
|
@ -241,7 +241,7 @@ PCK4BITS(3,3,3,3,3,3,3,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 EUCTW_st [ 6] = {
|
||||
static const PRUint32 EUCTW_st [ 6] = {
|
||||
PCK4BITS(eError,eError,eStart, 3, 3, 3, 4,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eStart,eError),//10-17
|
||||
|
@ -252,7 +252,7 @@ PCK4BITS(eStart,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
|||
|
||||
static const PRUint32 EUCTWCharLenTable[] = {0, 0, 1, 2, 2, 2, 3};
|
||||
|
||||
SMModel EUCTWSMModel = {
|
||||
const SMModel EUCTWSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_cls },
|
||||
7,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, EUCTW_st },
|
||||
|
@ -316,7 +316,7 @@ SMModel GB2312SMModel = {
|
|||
|
||||
// the following state machine data was created by perl script in
|
||||
// intl/chardet/tools. It should be the same as in PSM detector.
|
||||
static PRUint32 GB18030_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 GB18030_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 10 - 17
|
||||
|
@ -352,7 +352,7 @@ PCK4BITS(6,6,6,6,6,6,6,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 GB18030_st [ 6] = {
|
||||
static const PRUint32 GB18030_st [ 6] = {
|
||||
PCK4BITS(eError,eStart,eStart,eStart,eStart,eStart, 3,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eItsMe,eItsMe,eItsMe,eError,eError,eStart),//10-17
|
||||
|
@ -368,7 +368,7 @@ PCK4BITS(eError,eError,eStart,eStart,eStart,eStart,eStart,eStart) //28-2f
|
|||
// 2 here.
|
||||
static const PRUint32 GB18030CharLenTable[] = {0, 1, 1, 1, 1, 1, 2};
|
||||
|
||||
SMModel GB18030SMModel = {
|
||||
const SMModel GB18030SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_cls },
|
||||
7,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, GB18030_st },
|
||||
|
@ -378,7 +378,7 @@ SMModel GB18030SMModel = {
|
|||
|
||||
// sjis
|
||||
|
||||
static PRUint32 SJIS_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 SJIS_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
|
@ -417,7 +417,7 @@ PCK4BITS(4,4,4,4,4,0,0,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 SJIS_st [ 3] = {
|
||||
static const PRUint32 SJIS_st [ 3] = {
|
||||
PCK4BITS(eError,eStart,eStart, 3,eError,eError,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
||||
|
@ -425,7 +425,7 @@ PCK4BITS(eItsMe,eItsMe,eError,eError,eStart,eStart,eStart,eStart) //10-17
|
|||
|
||||
static const PRUint32 SJISCharLenTable[] = {0, 1, 1, 2, 0, 0};
|
||||
|
||||
SMModel SJISSMModel = {
|
||||
const SMModel SJISSMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, SJIS_st },
|
||||
|
@ -434,120 +434,7 @@ SMModel SJISSMModel = {
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2BE_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2BE_st [ 7] = {
|
||||
PCK4BITS( 5, 7, 7,eError, 4, 3,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe, 6, 6, 6, 6,eError,eError),//10-17
|
||||
PCK4BITS( 6, 6, 6, 6, 6,eItsMe, 6, 6),//18-1f
|
||||
PCK4BITS( 6, 6, 6, 6, 5, 7, 7,eError),//20-27
|
||||
PCK4BITS( 5, 8, 6, 6,eError, 6, 6, 6),//28-2f
|
||||
PCK4BITS( 6, 6, 6, 6,eError,eError,eStart,eStart) //30-37
|
||||
};
|
||||
|
||||
static const PRUint32 UCS2BECharLenTable[] = {2, 2, 2, 0, 2, 2};
|
||||
|
||||
SMModel UCS2BESMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2BE_st },
|
||||
UCS2BECharLenTable,
|
||||
"UTF-16BE",
|
||||
};
|
||||
|
||||
static PRUint32 UCS2LE_cls [ 256 / 8 ] = {
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 00 - 07
|
||||
PCK4BITS(0,0,1,0,0,2,0,0), // 08 - 0f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 10 - 17
|
||||
PCK4BITS(0,0,0,3,0,0,0,0), // 18 - 1f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 20 - 27
|
||||
PCK4BITS(0,3,3,3,3,3,0,0), // 28 - 2f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 30 - 37
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 38 - 3f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 40 - 47
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 48 - 4f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 50 - 57
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 58 - 5f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 60 - 67
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 68 - 6f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 70 - 77
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 78 - 7f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 80 - 87
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 88 - 8f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 90 - 97
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // 98 - 9f
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a0 - a7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // a8 - af
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b0 - b7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // b8 - bf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c0 - c7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // c8 - cf
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d0 - d7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // d8 - df
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e0 - e7
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // e8 - ef
|
||||
PCK4BITS(0,0,0,0,0,0,0,0), // f0 - f7
|
||||
PCK4BITS(0,0,0,0,0,0,4,5) // f8 - ff
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UCS2LE_st [ 7] = {
|
||||
PCK4BITS( 6, 6, 7, 6, 4, 3,eError,eError),//00-07
|
||||
PCK4BITS(eError,eError,eError,eError,eItsMe,eItsMe,eItsMe,eItsMe),//08-0f
|
||||
PCK4BITS(eItsMe,eItsMe, 5, 5, 5,eError,eItsMe,eError),//10-17
|
||||
PCK4BITS( 5, 5, 5,eError, 5,eError, 6, 6),//18-1f
|
||||
PCK4BITS( 7, 6, 8, 8, 5, 5, 5,eError),//20-27
|
||||
PCK4BITS( 5, 5, 5,eError,eError,eError, 5, 5),//28-2f
|
||||
PCK4BITS( 5, 5, 5,eError, 5,eError,eStart,eStart) //30-37
|
||||
};
|
||||
|
||||
static const PRUint32 UCS2LECharLenTable[] = {2, 2, 2, 2, 2, 2};
|
||||
|
||||
SMModel UCS2LESMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_cls },
|
||||
6,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UCS2LE_st },
|
||||
UCS2LECharLenTable,
|
||||
"UTF-16LE",
|
||||
};
|
||||
|
||||
|
||||
static PRUint32 UTF8_cls [ 256 / 8 ] = {
|
||||
static const PRUint32 UTF8_cls [ 256 / 8 ] = {
|
||||
//PCK4BITS(0,1,1,1,1,1,1,1), // 00 - 07
|
||||
PCK4BITS(1,1,1,1,1,1,1,1), // 00 - 07 //allow 0x00 as a legal value
|
||||
PCK4BITS(1,1,1,1,1,1,0,0), // 08 - 0f
|
||||
|
@ -584,7 +471,7 @@ PCK4BITS(12,13,13,13,14,15,0,0) // f8 - ff
|
|||
};
|
||||
|
||||
|
||||
static PRUint32 UTF8_st [ 26] = {
|
||||
static const PRUint32 UTF8_st [ 26] = {
|
||||
PCK4BITS(eError,eStart,eError,eError,eError,eError, 12, 10),//00-07
|
||||
PCK4BITS( 9, 11, 8, 7, 6, 5, 4, 3),//08-0f
|
||||
PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError),//10-17
|
||||
|
@ -616,7 +503,7 @@ PCK4BITS(eError,eError,eError,eError,eError,eError,eError,eError) //c8-cf
|
|||
static const PRUint32 UTF8CharLenTable[] = {0, 1, 0, 0, 0, 0, 2, 3,
|
||||
3, 3, 4, 4, 5, 5, 6, 6 };
|
||||
|
||||
SMModel UTF8SMModel = {
|
||||
const SMModel UTF8SMModel = {
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_cls },
|
||||
16,
|
||||
{eIdxSft4bits, eSftMsk4bits, eBitSft4bits, eUnitMsk4bits, UTF8_st },
|
||||
|
|
|
@ -68,7 +68,7 @@ typedef struct nsPkgInt {
|
|||
nsSftMsk sftmsk;
|
||||
nsBitSft bitsft;
|
||||
nsUnitMsk unitmsk;
|
||||
PRUint32 *data;
|
||||
const PRUint32* const data;
|
||||
} nsPkgInt;
|
||||
|
||||
|
||||
|
|
|
@ -49,14 +49,10 @@ public:
|
|||
virtual ~nsSBCSGroupProber();
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName();
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
|
||||
const char* GetCharSetName(int i) { return mProbers[i]->GetCharSetName(); }
|
||||
float GetConfidence(int i) { return mProbers[i]->GetConfidence(); }
|
||||
int GetProbeCount(void) { return NUM_OF_SBCS_PROBERS; }
|
||||
void SetOpion() {}
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
void DumpStatus();
|
||||
|
|
|
@ -51,27 +51,27 @@
|
|||
|
||||
typedef struct
|
||||
{
|
||||
unsigned char *charToOrderMap; // [256] table use to find a char's order
|
||||
char *precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||
const unsigned char* const charToOrderMap; // [256] table use to find a char's order
|
||||
const char* const precedenceMatrix; // [SAMPLE_SIZE][SAMPLE_SIZE]; table to find a 2-char sequence's frequency
|
||||
float mTypicalPositiveRatio; // = freqSeqs / totalSeqs
|
||||
PRBool keepEnglishLetter; // says if this script contains English characters (not implemented)
|
||||
const char* charsetName;
|
||||
const char* const charsetName;
|
||||
} SequenceModel;
|
||||
|
||||
|
||||
class nsSingleByteCharSetProber : public nsCharSetProber{
|
||||
public:
|
||||
nsSingleByteCharSetProber(SequenceModel *model)
|
||||
nsSingleByteCharSetProber(const SequenceModel *model)
|
||||
:mModel(model), mReversed(PR_FALSE), mNameProber(0) { Reset(); }
|
||||
nsSingleByteCharSetProber(SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
nsSingleByteCharSetProber(const SequenceModel *model, PRBool reversed, nsCharSetProber* nameProber)
|
||||
:mModel(model), mReversed(reversed), mNameProber(nameProber) { Reset(); }
|
||||
|
||||
virtual const char* GetCharSetName();
|
||||
virtual nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
virtual nsProbingState GetState(void) {return mState;};
|
||||
virtual nsProbingState GetState(void) {return mState;}
|
||||
virtual void Reset(void);
|
||||
virtual float GetConfidence(void);
|
||||
virtual void SetOpion() {};
|
||||
virtual void SetOpion() {}
|
||||
|
||||
// This feature is not implemented yet. any current language model
|
||||
// contain this parameter as PR_FALSE. No one is looking at this
|
||||
|
@ -79,7 +79,7 @@ public:
|
|||
// Moreover, the nsSBCSGroupProber which calls the HandleData of this
|
||||
// prober has a hard-coded call to FilterWithoutEnglishLetters which gets rid
|
||||
// of the English letters.
|
||||
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;}; // (not implemented)
|
||||
PRBool KeepEnglishLetters() {return mModel->keepEnglishLetter;} // (not implemented)
|
||||
|
||||
#ifdef DEBUG_chardet
|
||||
virtual void DumpStatus();
|
||||
|
@ -87,7 +87,7 @@ public:
|
|||
|
||||
protected:
|
||||
nsProbingState mState;
|
||||
const SequenceModel *mModel;
|
||||
const SequenceModel* const mModel;
|
||||
const PRBool mReversed; // PR_TRUE if we need to reverse every pair in the model lookup
|
||||
|
||||
//char order of last character
|
||||
|
@ -106,19 +106,19 @@ protected:
|
|||
};
|
||||
|
||||
|
||||
extern SequenceModel Koi8rModel;
|
||||
extern SequenceModel Win1251Model;
|
||||
extern SequenceModel Latin5Model;
|
||||
extern SequenceModel MacCyrillicModel;
|
||||
extern SequenceModel Ibm866Model;
|
||||
extern SequenceModel Ibm855Model;
|
||||
extern SequenceModel Latin7Model;
|
||||
extern SequenceModel Win1253Model;
|
||||
extern SequenceModel Latin5BulgarianModel;
|
||||
extern SequenceModel Win1251BulgarianModel;
|
||||
extern SequenceModel Latin2HungarianModel;
|
||||
extern SequenceModel Win1250HungarianModel;
|
||||
extern SequenceModel Win1255Model;
|
||||
extern const SequenceModel Koi8rModel;
|
||||
extern const SequenceModel Win1251Model;
|
||||
extern const SequenceModel Latin5Model;
|
||||
extern const SequenceModel MacCyrillicModel;
|
||||
extern const SequenceModel Ibm866Model;
|
||||
extern const SequenceModel Ibm855Model;
|
||||
extern const SequenceModel Latin7Model;
|
||||
extern const SequenceModel Win1253Model;
|
||||
extern const SequenceModel Latin5BulgarianModel;
|
||||
extern const SequenceModel Win1251BulgarianModel;
|
||||
extern const SequenceModel Latin2HungarianModel;
|
||||
extern const SequenceModel Win1250HungarianModel;
|
||||
extern const SequenceModel Win1255Model;
|
||||
|
||||
#endif /* nsSingleByteCharSetProber_h__ */
|
||||
|
||||
|
|
|
@ -57,11 +57,6 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
@ -95,8 +90,8 @@ nsProbingState nsSJISProber::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
float nsSJISProber::GetConfidence(void)
|
||||
{
|
||||
float contxtCf = mContextAnalyser.GetConfidence();
|
||||
float distribCf = mDistributionAnalyser.GetConfidence();
|
||||
float contxtCf = mContextAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
float distribCf = mDistributionAnalyser.GetConfidence(mIsPreferredLanguage);
|
||||
|
||||
return (contxtCf > distribCf ? contxtCf : distribCf);
|
||||
}
|
||||
|
|
|
@ -51,15 +51,17 @@
|
|||
|
||||
class nsSJISProber: public nsCharSetProber {
|
||||
public:
|
||||
nsSJISProber(void){mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();};
|
||||
virtual ~nsSJISProber(void){delete mCodingSM;};
|
||||
nsSJISProber(PRBool aIsPreferredLanguage)
|
||||
:mIsPreferredLanguage(aIsPreferredLanguage)
|
||||
{mCodingSM = new nsCodingStateMachine(&SJISSMModel);
|
||||
Reset();}
|
||||
virtual ~nsSJISProber(void){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "Shift_JIS";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "Shift_JIS";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
nsCodingStateMachine* mCodingSM;
|
||||
|
@ -69,6 +71,7 @@ protected:
|
|||
SJISDistributionAnalysis mDistributionAnalyser;
|
||||
|
||||
char mLastChar[2];
|
||||
PRBool mIsPreferredLanguage;
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -51,11 +51,6 @@ nsProbingState nsUTF8Prober::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
for (PRUint32 i = 0; i < aLen; i++)
|
||||
{
|
||||
codingState = mCodingSM->NextState(aBuf[i]);
|
||||
if (codingState == eError)
|
||||
{
|
||||
mState = eNotMe;
|
||||
break;
|
||||
}
|
||||
if (codingState == eItsMe)
|
||||
{
|
||||
mState = eFoundIt;
|
||||
|
|
|
@ -45,14 +45,14 @@ class nsUTF8Prober: public nsCharSetProber {
|
|||
public:
|
||||
nsUTF8Prober(){mNumOfMBChar = 0;
|
||||
mCodingSM = new nsCodingStateMachine(&UTF8SMModel);
|
||||
Reset(); };
|
||||
virtual ~nsUTF8Prober(){delete mCodingSM;};
|
||||
Reset(); }
|
||||
virtual ~nsUTF8Prober(){delete mCodingSM;}
|
||||
nsProbingState HandleData(const char* aBuf, PRUint32 aLen);
|
||||
const char* GetCharSetName() {return "UTF-8";};
|
||||
nsProbingState GetState(void) {return mState;};
|
||||
const char* GetCharSetName() {return "UTF-8";}
|
||||
nsProbingState GetState(void) {return mState;}
|
||||
void Reset(void);
|
||||
float GetConfidence(void);
|
||||
void SetOpion() {};
|
||||
void SetOpion() {}
|
||||
|
||||
protected:
|
||||
nsCodingStateMachine* mCodingSM;
|
||||
|
|
|
@ -44,9 +44,8 @@
|
|||
#include "nsSBCSGroupProber.h"
|
||||
#include "nsEscCharsetProber.h"
|
||||
#include "nsLatin1Prober.h"
|
||||
#include "nsError.h"
|
||||
|
||||
nsUniversalDetector::nsUniversalDetector()
|
||||
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
|
||||
{
|
||||
mDone = PR_FALSE;
|
||||
mBestGuess = -1; //illegal value as signal
|
||||
|
@ -58,6 +57,7 @@ nsUniversalDetector::nsUniversalDetector()
|
|||
mGotData = PR_FALSE;
|
||||
mInputState = ePureAscii;
|
||||
mLastChar = '\0';
|
||||
mLanguageFilter = aLanguageFilter;
|
||||
|
||||
PRUint32 i;
|
||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
|
@ -125,12 +125,12 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
mDetectedCharset = "X-ISO-10646-UCS-4-3412";
|
||||
else if ('\xFF' == aBuf[1])
|
||||
// FE FF UTF-16, big endian BOM
|
||||
mDetectedCharset = "UTF-16BE";
|
||||
mDetectedCharset = "UTF-16";
|
||||
break;
|
||||
case '\x00':
|
||||
if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
|
||||
// 00 00 FE FF UTF-32, big-endian BOM
|
||||
mDetectedCharset = "UTF-32BE";
|
||||
mDetectedCharset = "UTF-32";
|
||||
else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
|
||||
// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
|
||||
mDetectedCharset = "X-ISO-10646-UCS-4-2143";
|
||||
|
@ -138,10 +138,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
case '\xFF':
|
||||
if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
|
||||
// FF FE 00 00 UTF-32, little-endian BOM
|
||||
mDetectedCharset = "UTF-32LE";
|
||||
mDetectedCharset = "UTF-32";
|
||||
else if ('\xFE' == aBuf[1])
|
||||
// FF FE UTF-16, little endian BOM
|
||||
mDetectedCharset = "UTF-16LE";
|
||||
mDetectedCharset = "UTF-16";
|
||||
break;
|
||||
} // switch
|
||||
|
||||
|
@ -172,16 +172,24 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
|
||||
//start multibyte and singlebyte charset prober
|
||||
if (nsnull == mCharSetProbers[0])
|
||||
mCharSetProbers[0] = new nsMBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[1])
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
mCharSetProbers[2] = new nsLatin1Prober;
|
||||
|
||||
if ((nsnull == mCharSetProbers[0]) ||
|
||||
(nsnull == mCharSetProbers[1]) ||
|
||||
(nsnull == mCharSetProbers[2]))
|
||||
{
|
||||
mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
|
||||
if (nsnull == mCharSetProbers[0])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
if (nsnull == mCharSetProbers[1] &&
|
||||
(mLanguageFilter & NS_FILTER_NON_CJK))
|
||||
{
|
||||
mCharSetProbers[1] = new nsSBCSGroupProber;
|
||||
if (nsnull == mCharSetProbers[1])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
{
|
||||
mCharSetProbers[2] = new nsLatin1Prober;
|
||||
if (nsnull == mCharSetProbers[2])
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -202,7 +210,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
{
|
||||
case eEscAscii:
|
||||
if (nsnull == mEscCharSetProber) {
|
||||
mEscCharSetProber = new nsEscCharSetProber;
|
||||
mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
|
||||
if (nsnull == mEscCharSetProber)
|
||||
return NS_ERROR_OUT_OF_MEMORY;
|
||||
}
|
||||
|
@ -216,12 +224,15 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
|
|||
case eHighbyte:
|
||||
for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||
if (st == eFoundIt)
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
return NS_OK;
|
||||
st = mCharSetProbers[i]->HandleData(aBuf, aLen);
|
||||
if (st == eFoundIt)
|
||||
{
|
||||
mDone = PR_TRUE;
|
||||
mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
|
||||
return NS_OK;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
@ -260,11 +271,14 @@ void nsUniversalDetector::DataEnd()
|
|||
|
||||
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
|
||||
{
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
if (mCharSetProbers[i])
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
proberConfidence = mCharSetProbers[i]->GetConfidence();
|
||||
if (proberConfidence > maxProberConfidence)
|
||||
{
|
||||
maxProberConfidence = proberConfidence;
|
||||
maxProber = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
//do not report anything because we are not confident of it, that's in fact a negative answer
|
||||
|
|
|
@ -38,8 +38,6 @@
|
|||
#ifndef nsUniversalDetector_h__
|
||||
#define nsUniversalDetector_h__
|
||||
|
||||
#include "nscore.h"
|
||||
|
||||
class nsCharSetProber;
|
||||
|
||||
#define NUM_OF_CHARSET_PROBERS 3
|
||||
|
@ -50,9 +48,22 @@ typedef enum {
|
|||
eHighbyte = 2
|
||||
} nsInputState;
|
||||
|
||||
#define NS_FILTER_CHINESE_SIMPLIFIED 0x01
|
||||
#define NS_FILTER_CHINESE_TRADITIONAL 0x02
|
||||
#define NS_FILTER_JAPANESE 0x04
|
||||
#define NS_FILTER_KOREAN 0x08
|
||||
#define NS_FILTER_NON_CJK 0x10
|
||||
#define NS_FILTER_ALL 0x1F
|
||||
#define NS_FILTER_CHINESE (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||
NS_FILTER_CHINESE_TRADITIONAL)
|
||||
#define NS_FILTER_CJK (NS_FILTER_CHINESE_SIMPLIFIED | \
|
||||
NS_FILTER_CHINESE_TRADITIONAL | \
|
||||
NS_FILTER_JAPANESE | \
|
||||
NS_FILTER_KOREAN)
|
||||
|
||||
class nsUniversalDetector {
|
||||
public:
|
||||
nsUniversalDetector();
|
||||
nsUniversalDetector(PRUint32 aLanguageFilter);
|
||||
virtual ~nsUniversalDetector();
|
||||
virtual nsresult HandleData(const char* aBuf, PRUint32 aLen);
|
||||
virtual void DataEnd(void);
|
||||
|
@ -68,6 +79,7 @@ protected:
|
|||
char mLastChar;
|
||||
const char * mDetectedCharset;
|
||||
PRInt32 mBestGuess;
|
||||
PRUint32 mLanguageFilter;
|
||||
|
||||
nsCharSetProber *mCharSetProbers[NUM_OF_CHARSET_PROBERS];
|
||||
nsCharSetProber *mEscCharSetProber;
|
||||
|
|
Loading…
Reference in a new issue