/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *          Shy Shalom <shooshX@gmail.com>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nscore.h"

#include "nsUniversalDetector.h"

#include "nsMBCSGroupProber.h"
#include "nsSBCSGroupProber.h"
#include "nsEscCharsetProber.h"
#include "nsLatin1Prober.h"

/**
 * Construct a detector in its initial state.
 *
 * No prober is allocated here; HandleData() creates them on demand once the
 * input shows it needs them.
 *
 * @param aLanguageFilter bit mask stored in mLanguageFilter and later handed
 *                        to the probers that accept a filter.
 */
nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
{
  mDone = PR_FALSE;
  mBestGuess = -1;   //illegal value as signal
  mInTag = PR_FALSE;
  mEscCharSetProber = nsnull;

  mStart = PR_TRUE;
  mDetectedCharset = nsnull;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';
  mLanguageFilter = aLanguageFilter;

  PRUint32 i;
  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    mCharSetProbers[i] = nsnull;
}

/**
 * Release every prober this detector allocated.
 */
nsUniversalDetector::~nsUniversalDetector()
{
  // Deleting a null pointer is a no-op, so no per-slot null test is needed.
  for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    delete mCharSetProbers[i];

  delete mEscCharSetProber;
}

/**
 * Return the detector to its initial state so it can be fed a new stream.
 *
 * Probers that already exist are kept and reset rather than destroyed.
 * mLanguageFilter is left untouched.
 */
void
nsUniversalDetector::Reset()
{
  mDone = PR_FALSE;
  mBestGuess = -1;   //illegal value as signal
  mInTag = PR_FALSE;

  mStart = PR_TRUE;
  mDetectedCharset = nsnull;
  mGotData = PR_FALSE;
  mInputState = ePureAscii;
  mLastChar = '\0';

  if (mEscCharSetProber)
    mEscCharSetProber->Reset();

  PRUint32 i;
  for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    if (mCharSetProbers[i])
      mCharSetProbers[i]->Reset();
}

//---------------------------------------------------------------------
#define SHORTCUT_THRESHOLD      (float)0.95
#define MINIMUM_THRESHOLD      (float)0.20

/**
 * Look for a Unicode byte order mark at the start of a buffer.
 *
 * Each signature is tested only when the buffer holds enough bytes for it:
 * two for the UTF-16 marks, three for UTF-8 and four for the 32-bit forms.
 * A two-byte UTF-16 mark followed by two zero bytes is reported as the
 * corresponding 32-bit form, so the four-byte test runs first.
 *
 * @param aBuf start of the data; only the first aLen bytes are read.
 * @param aLen number of bytes available in aBuf.
 * @return the charset name for the recognised mark, or nsnull if none matched.
 */
static const char*
DetectBOM(const char* aBuf, PRUint32 aLen)
{
  // Every signature handled here is at least two bytes long.
  if (aLen < 2)
    return nsnull;

  switch (aBuf[0])
  {
  case '\xEF':
    if ((aLen >= 3) && ('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
      // EF BB BF  UTF-8 encoded BOM
      return "UTF-8";
    break;

  case '\xFE':
    if ('\xFF' != aBuf[1])
      break;
    if ((aLen >= 4) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
      // FE FF 00 00  UCS-4, unusual octet order BOM (3412)
      return "X-ISO-10646-UCS-4-3412";
    // FE FF  UTF-16, big endian BOM
    return "UTF-16BE";

  case '\x00':
    if ((aLen >= 4) && ('\x00' == aBuf[1]))
    {
      if (('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
        // 00 00 FE FF  UTF-32, big-endian BOM
        return "UTF-32BE";
      if (('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
        // 00 00 FF FE  UCS-4, unusual octet order BOM (2143)
        return "X-ISO-10646-UCS-4-2143";
    }
    break;

  case '\xFF':
    if ('\xFE' != aBuf[1])
      break;
    if ((aLen >= 4) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
      // FF FE 00 00  UTF-32, little-endian BOM
      return "UTF-32LE";
    // FF FE  UTF-16, little endian BOM
    return "UTF-16LE";
  }

  return nsnull;
}

/**
 * Feed one chunk of the stream to the detector.
 *
 * The first non-empty chunk is checked for a byte order mark. After that the
 * chunk is scanned byte by byte to classify the input as pure ASCII, ASCII
 * with escape sequences, or high-byte data, and is then forwarded to the
 * probers that match the current state.
 *
 * @param aBuf chunk of raw bytes; only the first aLen bytes are read.
 * @param aLen number of bytes in aBuf.
 * @return NS_OK normally, NS_ERROR_OUT_OF_MEMORY if a prober allocation
 *         yields null.
 */
nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
  if (mDone)
    return NS_OK;

  if (aLen > 0)
    mGotData = PR_TRUE;

  //If the data starts with BOM, we know it is UTF
  // Only a chunk that carries data consumes the "start of stream" flag, so an
  // empty first call cannot cause the BOM check to be skipped.
  if (mStart && aLen > 0)
  {
    mStart = PR_FALSE;
    mDetectedCharset = DetectBOM(aBuf, aLen);

    if (mDetectedCharset)
    {
      mDone = PR_TRUE;
      return NS_OK;
    }
  }

  PRUint32 i;
  for (i = 0; i < aLen; i++)
  {
    //other than 0xa0, if every other character is ascii, the page is ascii
    //(many ASCII-only pages contain NBSP, so 0xA0 alone does not count)
    if ((aBuf[i] & '\x80') && aBuf[i] != '\xA0')
    {
      //we got a non-ascii byte (high-byte)
      if (mInputState != eHighbyte)
      {
        //adjust state
        mInputState = eHighbyte;

        //kill mEscCharSetProber if it is active
        if (mEscCharSetProber) {
          delete mEscCharSetProber;
          mEscCharSetProber = nsnull;
        }

        //start multibyte and singlebyte charset prober
        if (nsnull == mCharSetProbers[0])
        {
          mCharSetProbers[0] = new nsMBCSGroupProber(mLanguageFilter);
          if (nsnull == mCharSetProbers[0])
            return NS_ERROR_OUT_OF_MEMORY;
        }
        //the single-byte group prober is only created when the language
        //filter includes NS_FILTER_NON_CJK
        if (nsnull == mCharSetProbers[1] &&
            (mLanguageFilter & NS_FILTER_NON_CJK))
        {
          mCharSetProbers[1] = new nsSBCSGroupProber;
          if (nsnull == mCharSetProbers[1])
            return NS_ERROR_OUT_OF_MEMORY;
        }
        if (nsnull == mCharSetProbers[2])
        {
          mCharSetProbers[2] = new nsLatin1Prober;
          if (nsnull == mCharSetProbers[2])
            return NS_ERROR_OUT_OF_MEMORY;
        }
      }
    }
    else
    {
      //ok, just pure ascii so far
      if ( ePureAscii == mInputState &&
        (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
      {
        //found escape character or HZ "~{"
        mInputState = eEscAscii;
      }
      //remembered across calls so "~{" split over two chunks is still seen
      mLastChar = aBuf[i];
    }
  }

  nsProbingState st;
  switch (mInputState)
  {
  case eEscAscii:
    if (nsnull == mEscCharSetProber) {
      mEscCharSetProber = new nsEscCharSetProber(mLanguageFilter);
      if (nsnull == mEscCharSetProber)
        return NS_ERROR_OUT_OF_MEMORY;
    }
    st = mEscCharSetProber->HandleData(aBuf, aLen);
    if (st == eFoundIt)
    {
      mDone = PR_TRUE;
      mDetectedCharset = mEscCharSetProber->GetCharSetName();
    }
    break;
  case eHighbyte:
    //the first prober that reports eFoundIt decides the charset
    for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
    {
      if (mCharSetProbers[i])
      {
        st = mCharSetProbers[i]->HandleData(aBuf, aLen);
        if (st == eFoundIt)
        {
          mDone = PR_TRUE;
          mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
          return NS_OK;
        }
      }
    }
    break;

  default:  //pure ascii
    ;//do nothing here
  }
  return NS_OK;
}


//---------------------------------------------------------------------
/**
 * Signal the end of the stream and report the result, if there is one.
 *
 * If a charset was already pinned down it is passed to Report(). Otherwise,
 * for high-byte input, the prober with the highest confidence is reported,
 * provided that confidence exceeds MINIMUM_THRESHOLD. Nothing is reported for
 * pure-ASCII or escape-sequence input that never produced a definite answer.
 */
void nsUniversalDetector::DataEnd()
{
  if (!mGotData)
  {
    // we haven't got any data yet, return immediately
    // caller program sometimes call DataEnd before anything has been sent to detector
    return;
  }

  if (mDetectedCharset)
  {
    mDone = PR_TRUE;
    Report(mDetectedCharset);
    return;
  }

  switch (mInputState)
  {
  case eHighbyte:
    {
      float proberConfidence;
      float maxProberConfidence = (float)0.0;
      PRInt32 maxProber = 0;

      for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
      {
        if (mCharSetProbers[i])
        {
          proberConfidence = mCharSetProbers[i]->GetConfidence();
          if (proberConfidence > maxProberConfidence)
          {
            maxProberConfidence = proberConfidence;
            maxProber = i;
          }
        }
      }
      //do not report anything because we are not confident of it, that's in fact a negative answer
      //(maxProber is only dereferenced when some existing prober raised the
      //maximum above the threshold, so the slot is known to be non-null)
      if (maxProberConfidence > MINIMUM_THRESHOLD)
        Report(mCharSetProbers[maxProber]->GetCharSetName());
    }
    break;
  case eEscAscii:
    break;
  default:
    ;
  }
  return;
}