forked from mia/Aegisub
Switch to using libaegisub for character set detection. There are some bugs here, but it seems to be more consistent than the last version. Two issues remain. First, when the character set is unknown we need to provide a full list to choose from. Second, if the file is detected as US-ASCII we need to give the user a parse error if we run into problems. Right now we'll load the file fine and disable a lot of options, with no message to the user.
Originally committed to SVN as r4370.
This commit is contained in:
parent
9d854b69f3
commit
e7b859b9f7
3 changed files with 32 additions and 172 deletions
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
||||
// Copyright (c) 2010, Amar Takhar
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
|
@ -39,7 +39,6 @@
|
|||
// Headers
|
||||
#include "config.h"
|
||||
|
||||
#ifdef WITH_UNIVCHARDET
|
||||
#ifndef AGI_PRE
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
|
@ -49,132 +48,47 @@
|
|||
#include <wx/intl.h>
|
||||
#endif
|
||||
|
||||
#include "../universalchardet/nsCharSetProber.h"
|
||||
#include <libaegisub/charset.h>
|
||||
|
||||
#include "charset_detect.h"
|
||||
#include "text_file_reader.h"
|
||||
#include "compat.h"
|
||||
|
||||
|
||||
|
||||
|
||||
/// DOCME
|
||||
struct CharDetResult {
|
||||
|
||||
/// DOCME
|
||||
float confidence;
|
||||
|
||||
/// DOCME
|
||||
wxString name;
|
||||
|
||||
|
||||
/// @brief DOCME
|
||||
/// @param par
|
||||
/// @return
|
||||
///
|
||||
bool operator < (CharDetResult &par) { return confidence > par.confidence; }
|
||||
};
|
||||
|
||||
|
||||
/// @brief Get encoding
|
||||
/// @param filename
|
||||
/// @return
|
||||
///
|
||||
wxString CharSetDetect::GetEncoding(wxString filename) {
|
||||
std::ifstream file;
|
||||
#ifdef __WINDOWS__
|
||||
file.open(filename.wc_str(),std::ios::in | std::ios::binary);
|
||||
#else
|
||||
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
|
||||
#endif
|
||||
if (!file.is_open()) {
|
||||
throw _T("Failed opening file for reading.");
|
||||
wxLogDebug("Filename: %s", filename);
|
||||
bool unknown = 0;
|
||||
|
||||
agi::charset::CharsetListDetected list;
|
||||
agi::charset::CharsetListDetected::const_iterator i_lst;
|
||||
|
||||
try {
|
||||
agi::charset::DetectAll(STD_STR(filename), list);
|
||||
} catch (const agi::charset::UnknownCharset&) {
|
||||
unknown = 1;
|
||||
}
|
||||
|
||||
// Loop through it until it finds interesting lines
|
||||
while (!file.eof() && !done()) {
|
||||
char buffer[512];
|
||||
file.read(buffer, 512);
|
||||
size_t bytesRead = file.gcount();
|
||||
HandleData(buffer, bytesRead);
|
||||
}
|
||||
|
||||
// Flag as finished
|
||||
DataEnd();
|
||||
|
||||
// Grab every result obtained
|
||||
wxString local = wxLocale::GetSystemEncodingName();
|
||||
std::list<CharDetResult> results;
|
||||
bool gotLocal = false;
|
||||
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
||||
if (mCharSetProbers[i]) {
|
||||
float conf = mCharSetProbers[i]->GetConfidence();
|
||||
|
||||
// Only bother with those whose confidence is at least 1%
|
||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
|
||||
if (conf > 0.01f || curName == local) {
|
||||
results.push_back(CharDetResult());
|
||||
results.back().name = curName;
|
||||
results.back().confidence = conf;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If you got more than one valid result, ask the user which he wants
|
||||
if (results.size() > 1) {
|
||||
// Add local
|
||||
if (!gotLocal) {
|
||||
results.push_back(CharDetResult());
|
||||
results.back().name = local;
|
||||
results.back().confidence = 0;
|
||||
}
|
||||
|
||||
// Sort by confidence
|
||||
results.sort();
|
||||
/// @todo If the charset is unknown we need to display a complete list of character sets.
|
||||
if (list.size() > 1) {
|
||||
|
||||
// Get choice from user
|
||||
wxArrayString choices;
|
||||
wxArrayString picked;
|
||||
int i = 0;
|
||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++) {
|
||||
wxString name = (*cur).name;
|
||||
if (picked.Index(name) == wxNOT_FOUND) {
|
||||
picked.Add(name);
|
||||
|
||||
// Generate name
|
||||
wxString choiceStr;
|
||||
if ((*cur).confidence > 0.0f) choiceStr = wxString::Format(_T("%f%% - "),(*cur).confidence*100.0f);
|
||||
else choiceStr = _T("Unknown - ");
|
||||
choiceStr += name;
|
||||
if (name == local) choiceStr += _T(" (local)");
|
||||
|
||||
// Insert
|
||||
choices.Add(choiceStr);
|
||||
i++;
|
||||
if (i == 20) break;
|
||||
}
|
||||
for (i_lst = list.begin(); i_lst != list.end(); ++i_lst) {
|
||||
choices.Add(lagi_wxString(i_lst->second));
|
||||
}
|
||||
|
||||
int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
|
||||
if (choice == -1) throw _T("Canceled");
|
||||
|
||||
// Retrieve name
|
||||
i = 0;
|
||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++,i++) {
|
||||
if (i == choice) result = (*cur).name;
|
||||
}
|
||||
return choices.Item(choice);
|
||||
}
|
||||
|
||||
// Return whatever it got
|
||||
return result;
|
||||
i_lst = list.begin();
|
||||
return i_lst->second;
|
||||
}
|
||||
|
||||
|
||||
/// @brief Report
|
||||
/// @param aCharset
|
||||
///
|
||||
void CharSetDetect::Report(const char* aCharset) {
|
||||
// Store the result reported
|
||||
result = wxString(aCharset,wxConvUTF8);
|
||||
}
|
||||
|
||||
#endif // WITH_UNIVCHARDET
|
||||
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
||||
// Copyright (c) 2010, Amar Takhar
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
|
@ -34,34 +34,17 @@
|
|||
/// @ingroup utility
|
||||
///
|
||||
|
||||
|
||||
|
||||
///////////
|
||||
// Headers
|
||||
#include "../universalchardet/nscore.h"
|
||||
#include "../universalchardet/nsUniversalDetector.h"
|
||||
#include "../universalchardet/nsMBCSGroupProber.h"
|
||||
|
||||
|
||||
/// DOCME
|
||||
/// @class CharSetDetect
|
||||
/// @brief DOCME
|
||||
///
|
||||
/// DOCME
|
||||
class CharSetDetect : public nsUniversalDetector {
|
||||
/// @brief Detect character set of a file
|
||||
class CharSetDetect {
|
||||
private:
|
||||
|
||||
/// DOCME
|
||||
/// Character set
|
||||
wxString result;
|
||||
void Report(const char* aCharset);
|
||||
|
||||
public:
|
||||
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
|
||||
/// @brief Get character set name.
|
||||
/// @param filename File to check
|
||||
/// @return Character set name
|
||||
wxString GetEncoding(wxString filename);
|
||||
|
||||
/// @brief DOCME
|
||||
///
|
||||
PRBool done() const { return mDone; }
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -74,49 +74,12 @@ TextFileReader::~TextFileReader() {
|
|||
}
|
||||
|
||||
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
||||
// Prepare
|
||||
unsigned char b[4];
|
||||
memset(b, 0, sizeof(b));
|
||||
|
||||
// Read four bytes from file
|
||||
std::ifstream ifile;
|
||||
#ifdef __WINDOWS__
|
||||
ifile.open(filename.wc_str());
|
||||
#else
|
||||
ifile.open(wxFNCONV(filename));
|
||||
#endif
|
||||
if (!ifile.is_open()) {
|
||||
return L"unknown";
|
||||
}
|
||||
ifile.read(reinterpret_cast<char *>(b),4);
|
||||
ifile.close();
|
||||
|
||||
// Try to get the byte order mark from them
|
||||
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
|
||||
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
|
||||
else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
|
||||
else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
|
||||
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
|
||||
|
||||
// Try to guess UTF-16
|
||||
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
|
||||
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
|
||||
|
||||
// If any of the first four bytes are under 0x20 (the first printable character),
|
||||
// except for 9-13 range, assume binary
|
||||
for (int i=0;i<4;i++) {
|
||||
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
|
||||
}
|
||||
|
||||
#ifdef WITH_UNIVCHARDET
|
||||
// Use universalchardet library to detect charset
|
||||
CharSetDetect det;
|
||||
return det.GetEncoding(filename);
|
||||
#else
|
||||
// Fall back to local
|
||||
return L"local";
|
||||
#endif
|
||||
wxString str(det.GetEncoding(filename));
|
||||
wxLogDebug("Encoding: %s", str);
|
||||
return str;
|
||||
}
|
||||
|
||||
wchar_t TextFileReader::GetWChar() {
|
||||
|
|
Loading…
Reference in a new issue