Switch to using libaegisub for character set detection. There are some bugs here, but it seems to be more consistent than the last version. Two issues remain. First, when the character set is unknown we need to provide a full list to choose from. Second, if the file is detected as US-ASCII we need to give the user a parse error if we run into problems; right now we'll load the file fine and disable a lot of options, with no message to the user.
Originally committed to SVN as r4370.
This commit is contained in:
parent
9d854b69f3
commit
e7b859b9f7
3 changed files with 32 additions and 172 deletions
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
// Copyright (c) 2010, Amar Takhar
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -39,7 +39,6 @@
|
||||||
// Headers
|
// Headers
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#ifdef WITH_UNIVCHARDET
|
|
||||||
#ifndef AGI_PRE
|
#ifndef AGI_PRE
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
@ -49,132 +48,47 @@
|
||||||
#include <wx/intl.h>
|
#include <wx/intl.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "../universalchardet/nsCharSetProber.h"
|
#include <libaegisub/charset.h>
|
||||||
|
|
||||||
#include "charset_detect.h"
|
#include "charset_detect.h"
|
||||||
#include "text_file_reader.h"
|
#include "text_file_reader.h"
|
||||||
|
#include "compat.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
struct CharDetResult {
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
float confidence;
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
wxString name;
|
|
||||||
|
|
||||||
|
|
||||||
/// @brief DOCME
|
|
||||||
/// @param par
|
|
||||||
/// @return
|
|
||||||
///
|
|
||||||
bool operator < (CharDetResult &par) { return confidence > par.confidence; }
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/// @brief Get encoding
|
/// @brief Get encoding
|
||||||
/// @param filename
|
/// @param filename
|
||||||
/// @return
|
/// @return
|
||||||
///
|
///
|
||||||
wxString CharSetDetect::GetEncoding(wxString filename) {
|
wxString CharSetDetect::GetEncoding(wxString filename) {
|
||||||
std::ifstream file;
|
wxLogDebug("Filename: %s", filename);
|
||||||
#ifdef __WINDOWS__
|
bool unknown = 0;
|
||||||
file.open(filename.wc_str(),std::ios::in | std::ios::binary);
|
|
||||||
#else
|
agi::charset::CharsetListDetected list;
|
||||||
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
|
agi::charset::CharsetListDetected::const_iterator i_lst;
|
||||||
#endif
|
|
||||||
if (!file.is_open()) {
|
try {
|
||||||
throw _T("Failed opening file for reading.");
|
agi::charset::DetectAll(STD_STR(filename), list);
|
||||||
|
} catch (const agi::charset::UnknownCharset&) {
|
||||||
|
unknown = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop through it until it finds interesting lines
|
/// @todo If the charset is unknown we need to display a complete list of character sets.
|
||||||
while (!file.eof() && !done()) {
|
if (list.size() > 1) {
|
||||||
char buffer[512];
|
|
||||||
file.read(buffer, 512);
|
|
||||||
size_t bytesRead = file.gcount();
|
|
||||||
HandleData(buffer, bytesRead);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Flag as finished
|
|
||||||
DataEnd();
|
|
||||||
|
|
||||||
// Grab every result obtained
|
|
||||||
wxString local = wxLocale::GetSystemEncodingName();
|
|
||||||
std::list<CharDetResult> results;
|
|
||||||
bool gotLocal = false;
|
|
||||||
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
|
||||||
if (mCharSetProbers[i]) {
|
|
||||||
float conf = mCharSetProbers[i]->GetConfidence();
|
|
||||||
|
|
||||||
// Only bother with those whose confidence is at least 1%
|
|
||||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
|
|
||||||
if (conf > 0.01f || curName == local) {
|
|
||||||
results.push_back(CharDetResult());
|
|
||||||
results.back().name = curName;
|
|
||||||
results.back().confidence = conf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If you got more than one valid result, ask the user which he wants
|
|
||||||
if (results.size() > 1) {
|
|
||||||
// Add local
|
|
||||||
if (!gotLocal) {
|
|
||||||
results.push_back(CharDetResult());
|
|
||||||
results.back().name = local;
|
|
||||||
results.back().confidence = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort by confidence
|
|
||||||
results.sort();
|
|
||||||
|
|
||||||
// Get choice from user
|
// Get choice from user
|
||||||
wxArrayString choices;
|
wxArrayString choices;
|
||||||
wxArrayString picked;
|
|
||||||
int i = 0;
|
|
||||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++) {
|
|
||||||
wxString name = (*cur).name;
|
|
||||||
if (picked.Index(name) == wxNOT_FOUND) {
|
|
||||||
picked.Add(name);
|
|
||||||
|
|
||||||
// Generate name
|
for (i_lst = list.begin(); i_lst != list.end(); ++i_lst) {
|
||||||
wxString choiceStr;
|
choices.Add(lagi_wxString(i_lst->second));
|
||||||
if ((*cur).confidence > 0.0f) choiceStr = wxString::Format(_T("%f%% - "),(*cur).confidence*100.0f);
|
}
|
||||||
else choiceStr = _T("Unknown - ");
|
|
||||||
choiceStr += name;
|
|
||||||
if (name == local) choiceStr += _T(" (local)");
|
|
||||||
|
|
||||||
// Insert
|
|
||||||
choices.Add(choiceStr);
|
|
||||||
i++;
|
|
||||||
if (i == 20) break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
|
int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
|
||||||
if (choice == -1) throw _T("Canceled");
|
if (choice == -1) throw _T("Canceled");
|
||||||
|
return choices.Item(choice);
|
||||||
// Retrieve name
|
|
||||||
i = 0;
|
|
||||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++,i++) {
|
|
||||||
if (i == choice) result = (*cur).name;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return whatever it got
|
i_lst = list.begin();
|
||||||
return result;
|
return i_lst->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// @brief Report
|
|
||||||
/// @param aCharset
|
|
||||||
///
|
|
||||||
void CharSetDetect::Report(const char* aCharset) {
|
|
||||||
// Store the result reported
|
|
||||||
result = wxString(aCharset,wxConvUTF8);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // WITH_UNIVCHARDET
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
// Copyright (c) 2010, Amar Takhar
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -34,34 +34,17 @@
|
||||||
/// @ingroup utility
|
/// @ingroup utility
|
||||||
///
|
///
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
///////////
|
|
||||||
// Headers
|
|
||||||
#include "../universalchardet/nscore.h"
|
|
||||||
#include "../universalchardet/nsUniversalDetector.h"
|
|
||||||
#include "../universalchardet/nsMBCSGroupProber.h"
|
|
||||||
|
|
||||||
|
|
||||||
/// DOCME
|
/// DOCME
|
||||||
/// @class CharSetDetect
|
/// @class CharSetDetect
|
||||||
/// @brief DOCME
|
/// @brief Detect character set of a file
|
||||||
///
|
class CharSetDetect {
|
||||||
/// DOCME
|
|
||||||
class CharSetDetect : public nsUniversalDetector {
|
|
||||||
private:
|
private:
|
||||||
|
/// Character set
|
||||||
/// DOCME
|
|
||||||
wxString result;
|
wxString result;
|
||||||
void Report(const char* aCharset);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
|
/// @brief Get character set name.
|
||||||
|
/// @param filename File to check
|
||||||
|
/// @return Character set name
|
||||||
wxString GetEncoding(wxString filename);
|
wxString GetEncoding(wxString filename);
|
||||||
|
|
||||||
/// @brief DOCME
|
|
||||||
///
|
|
||||||
PRBool done() const { return mDone; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -74,49 +74,12 @@ TextFileReader::~TextFileReader() {
|
||||||
}
|
}
|
||||||
|
|
||||||
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
||||||
// Prepare
|
|
||||||
unsigned char b[4];
|
|
||||||
memset(b, 0, sizeof(b));
|
|
||||||
|
|
||||||
// Read four bytes from file
|
|
||||||
std::ifstream ifile;
|
|
||||||
#ifdef __WINDOWS__
|
|
||||||
ifile.open(filename.wc_str());
|
|
||||||
#else
|
|
||||||
ifile.open(wxFNCONV(filename));
|
|
||||||
#endif
|
|
||||||
if (!ifile.is_open()) {
|
|
||||||
return L"unknown";
|
|
||||||
}
|
|
||||||
ifile.read(reinterpret_cast<char *>(b),4);
|
|
||||||
ifile.close();
|
|
||||||
|
|
||||||
// Try to get the byte order mark from them
|
|
||||||
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
|
|
||||||
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
|
|
||||||
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
|
|
||||||
else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
|
|
||||||
else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
|
|
||||||
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
|
|
||||||
|
|
||||||
// Try to guess UTF-16
|
|
||||||
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
|
|
||||||
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
|
|
||||||
|
|
||||||
// If any of the first four bytes are under 0x20 (the first printable character),
|
|
||||||
// except for 9-13 range, assume binary
|
|
||||||
for (int i=0;i<4;i++) {
|
|
||||||
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef WITH_UNIVCHARDET
|
|
||||||
// Use universalchardet library to detect charset
|
// Use universalchardet library to detect charset
|
||||||
CharSetDetect det;
|
CharSetDetect det;
|
||||||
return det.GetEncoding(filename);
|
wxString str(det.GetEncoding(filename));
|
||||||
#else
|
wxLogDebug("Encoding: %s", str);
|
||||||
// Fall back to local
|
return str;
|
||||||
return L"local";
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
wchar_t TextFileReader::GetWChar() {
|
wchar_t TextFileReader::GetWChar() {
|
||||||
|
|
Loading…
Reference in a new issue