From e7b859b9f70fa7a957516c0afae925ebe4be942b Mon Sep 17 00:00:00 2001 From: Amar Takhar Date: Sat, 29 May 2010 02:25:19 +0000 Subject: [PATCH] Switch to using libaegisub for character set detection. There are some bugs here but it seems to be more consistent than the last version. There are two remaining issues: first, when the character set is unknown we need to provide a full list to choose from. Second, if the file is detected as US-ASCII we need to give a parse error to the user if we run into problems. Right now we'll load the file fine and disable a lot of options, with no message to the user. Originally committed to SVN as r4370. --- aegisub/src/charset_detect.cpp | 130 ++++++------------------------- aegisub/src/charset_detect.h | 31 ++------ aegisub/src/text_file_reader.cpp | 43 +--------- 3 files changed, 32 insertions(+), 172 deletions(-) diff --git a/aegisub/src/charset_detect.cpp b/aegisub/src/charset_detect.cpp index 0e1296c98..34bd7d18f 100644 --- a/aegisub/src/charset_detect.cpp +++ b/aegisub/src/charset_detect.cpp @@ -1,4 +1,4 @@ -// Copyright (c) 2007, Rodrigo Braz Monteiro +// Copyright (c) 2010, Amar Takhar // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without @@ -39,7 +39,6 @@ // Headers #include "config.h" -#ifdef WITH_UNIVCHARDET #ifndef AGI_PRE #include #include @@ -49,132 +48,47 @@ #include #endif -#include "../universalchardet/nsCharSetProber.h" +#include + #include "charset_detect.h" #include "text_file_reader.h" +#include "compat.h" - -/// DOCME -struct CharDetResult { - - /// DOCME - float confidence; - - /// DOCME - wxString name; - - - /// @brief DOCME - /// @param par - /// @return - /// - bool operator < (CharDetResult &par) { return confidence > par.confidence; } -}; - - /// @brief Get encoding /// @param filename /// @return /// wxString CharSetDetect::GetEncoding(wxString filename) { - std::ifstream file; -#ifdef __WINDOWS__ - file.open(filename.wc_str(),std::ios::in | std::ios::binary); -#else - file.open(wxFNCONV(filename),std::ios::in | std::ios::binary); -#endif - if (!file.is_open()) { - throw _T("Failed opening file for reading."); + wxLogDebug("Filename: %s", filename); + bool unknown = 0; + + agi::charset::CharsetListDetected list; + agi::charset::CharsetListDetected::const_iterator i_lst; + + try { + agi::charset::DetectAll(STD_STR(filename), list); + } catch (const agi::charset::UnknownCharset&) { + unknown = 1; } - // Loop through it until it finds interesting lines - while (!file.eof() && !done()) { - char buffer[512]; - file.read(buffer, 512); - size_t bytesRead = file.gcount(); - HandleData(buffer, bytesRead); - } - - // Flag as finished - DataEnd(); - - // Grab every result obtained - wxString local = wxLocale::GetSystemEncodingName(); - std::list results; - bool gotLocal = false; - for (int i=0;iGetConfidence(); - - // Only bother with those whose confidence is at least 1% - wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8); - if (conf > 0.01f || curName == local) { - results.push_back(CharDetResult()); - results.back().name = curName; - results.back().confidence = conf; - } - } - } - - // 
If you got more than one valid result, ask the user which he wants - if (results.size() > 1) { - // Add local - if (!gotLocal) { - results.push_back(CharDetResult()); - results.back().name = local; - results.back().confidence = 0; - } - - // Sort by confidence - results.sort(); + /// @todo If the charset is unknown we need to display a complete list of character sets. + if (list.size() > 1) { // Get choice from user wxArrayString choices; - wxArrayString picked; - int i = 0; - for (std::list::iterator cur=results.begin();cur!=results.end();cur++) { - wxString name = (*cur).name; - if (picked.Index(name) == wxNOT_FOUND) { - picked.Add(name); - // Generate name - wxString choiceStr; - if ((*cur).confidence > 0.0f) choiceStr = wxString::Format(_T("%f%% - "),(*cur).confidence*100.0f); - else choiceStr = _T("Unknown - "); - choiceStr += name; - if (name == local) choiceStr += _T(" (local)"); - - // Insert - choices.Add(choiceStr); - i++; - if (i == 20) break; - } + for (i_lst = list.begin(); i_lst != list.end(); ++i_lst) { + choices.Add(lagi_wxString(i_lst->second)); } + int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices); if (choice == -1) throw _T("Canceled"); - - // Retrieve name - i = 0; - for (std::list::iterator cur=results.begin();cur!=results.end();cur++,i++) { - if (i == choice) result = (*cur).name; - } + return choices.Item(choice); } - // Return whatever it got - return result; + i_lst = list.begin(); + return i_lst->second; } - -/// @brief Report -/// @param aCharset -/// -void CharSetDetect::Report(const char* aCharset) { - // Store the result reported - result = wxString(aCharset,wxConvUTF8); -} - -#endif // WITH_UNIVCHARDET - - diff --git a/aegisub/src/charset_detect.h b/aegisub/src/charset_detect.h index 61cf43bef..bbb327bbf 100644 --- a/aegisub/src/charset_detect.h +++ b/aegisub/src/charset_detect.h @@ -1,4 +1,4 @@ -// Copyright (c) 2007, 
Rodrigo Braz Monteiro +// Copyright (c) 2010, Amar Takhar // All rights reserved. // // Redistribution and use in source and binary forms, with or without @@ -34,34 +34,17 @@ /// @ingroup utility /// - - -/////////// -// Headers -#include "../universalchardet/nscore.h" -#include "../universalchardet/nsUniversalDetector.h" -#include "../universalchardet/nsMBCSGroupProber.h" - - /// DOCME /// @class CharSetDetect -/// @brief DOCME -/// -/// DOCME -class CharSetDetect : public nsUniversalDetector { +/// @brief Detect character set of a file +class CharSetDetect { private: - - /// DOCME + /// Character set wxString result; - void Report(const char* aCharset); public: - CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { }; + /// @brief Get character set name. + /// @param filename File to check + /// @return Character set name wxString GetEncoding(wxString filename); - - /// @brief DOCME - /// - PRBool done() const { return mDone; } }; - - diff --git a/aegisub/src/text_file_reader.cpp b/aegisub/src/text_file_reader.cpp index 90c52f2f9..042bc6c3a 100644 --- a/aegisub/src/text_file_reader.cpp +++ b/aegisub/src/text_file_reader.cpp @@ -74,49 +74,12 @@ TextFileReader::~TextFileReader() { } wxString TextFileReader::GetEncoding(wxString const& filename) { - // Prepare - unsigned char b[4]; - memset(b, 0, sizeof(b)); - // Read four bytes from file - std::ifstream ifile; -#ifdef __WINDOWS__ - ifile.open(filename.wc_str()); -#else - ifile.open(wxFNCONV(filename)); -#endif - if (!ifile.is_open()) { - return L"unknown"; - } - ifile.read(reinterpret_cast(b),4); - ifile.close(); - - // Try to get the byte order mark from them - if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8"; - else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE"; - else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE"; - else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE"; - else if (b[0] == 0xFE && b[1] == 0xFF) 
return L"UTF-16BE"; - else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7"; - - // Try to guess UTF-16 - else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE"; - else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE"; - - // If any of the first four bytes are under 0x20 (the first printable character), - // except for 9-13 range, assume binary - for (int i=0;i<4;i++) { - if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary"; - } - -#ifdef WITH_UNIVCHARDET // Use universalchardet library to detect charset CharSetDetect det; - return det.GetEncoding(filename); -#else - // Fall back to local - return L"local"; -#endif + wxString str(det.GetEncoding(filename)); + wxLogDebug("Encoding: %s", str); + return str; } wchar_t TextFileReader::GetWChar() {