forked from mia/Aegisub
Switch to using libaegisub for character set detection. There are some bugs here, but it seems to be more consistent than the last version. Two issues remain. First, when the character set is unknown we need to provide a full list to choose from. Second, if the file is detected as US-ASCII we need to give a parse error to the user if we run into problems; right now we'll load the file fine and disable a lot of options, with no message to the user.
Originally committed to SVN as r4370.
This commit is contained in:
parent
9d854b69f3
commit
e7b859b9f7
3 changed files with 32 additions and 172 deletions
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
// Copyright (c) 2010, Amar Takhar
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -39,7 +39,6 @@
|
||||||
// Headers
|
// Headers
|
||||||
#include "config.h"
|
#include "config.h"
|
||||||
|
|
||||||
#ifdef WITH_UNIVCHARDET
|
|
||||||
#ifndef AGI_PRE
|
#ifndef AGI_PRE
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <list>
|
#include <list>
|
||||||
|
@ -49,132 +48,47 @@
|
||||||
#include <wx/intl.h>
|
#include <wx/intl.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#include "../universalchardet/nsCharSetProber.h"
|
#include <libaegisub/charset.h>
|
||||||
|
|
||||||
#include "charset_detect.h"
|
#include "charset_detect.h"
|
||||||
#include "text_file_reader.h"
|
#include "text_file_reader.h"
|
||||||
|
#include "compat.h"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
struct CharDetResult {
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
float confidence;
|
|
||||||
|
|
||||||
/// DOCME
|
|
||||||
wxString name;
|
|
||||||
|
|
||||||
|
|
||||||
/// @brief DOCME
|
|
||||||
/// @param par
|
|
||||||
/// @return
|
|
||||||
///
|
|
||||||
bool operator < (CharDetResult &par) { return confidence > par.confidence; }
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
/// @brief Get encoding
|
/// @brief Get encoding
|
||||||
/// @param filename
|
/// @param filename
|
||||||
/// @return
|
/// @return
|
||||||
///
|
///
|
||||||
wxString CharSetDetect::GetEncoding(wxString filename) {
|
wxString CharSetDetect::GetEncoding(wxString filename) {
|
||||||
std::ifstream file;
|
wxLogDebug("Filename: %s", filename);
|
||||||
#ifdef __WINDOWS__
|
bool unknown = 0;
|
||||||
file.open(filename.wc_str(),std::ios::in | std::ios::binary);
|
|
||||||
#else
|
agi::charset::CharsetListDetected list;
|
||||||
file.open(wxFNCONV(filename),std::ios::in | std::ios::binary);
|
agi::charset::CharsetListDetected::const_iterator i_lst;
|
||||||
#endif
|
|
||||||
if (!file.is_open()) {
|
try {
|
||||||
throw _T("Failed opening file for reading.");
|
agi::charset::DetectAll(STD_STR(filename), list);
|
||||||
|
} catch (const agi::charset::UnknownCharset&) {
|
||||||
|
unknown = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Loop through it until it finds interesting lines
|
/// @todo If the charset is unknown we need to display a complete list of character sets.
|
||||||
while (!file.eof() && !done()) {
|
if (list.size() > 1) {
|
||||||
char buffer[512];
|
|
||||||
file.read(buffer, 512);
|
|
||||||
size_t bytesRead = file.gcount();
|
|
||||||
HandleData(buffer, bytesRead);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Flag as finished
|
|
||||||
DataEnd();
|
|
||||||
|
|
||||||
// Grab every result obtained
|
|
||||||
wxString local = wxLocale::GetSystemEncodingName();
|
|
||||||
std::list<CharDetResult> results;
|
|
||||||
bool gotLocal = false;
|
|
||||||
for (int i=0;i<NUM_OF_CHARSET_PROBERS;i++) {
|
|
||||||
if (mCharSetProbers[i]) {
|
|
||||||
float conf = mCharSetProbers[i]->GetConfidence();
|
|
||||||
|
|
||||||
// Only bother with those whose confidence is at least 1%
|
|
||||||
wxString curName = wxString(mCharSetProbers[i]->GetCharSetName(),wxConvUTF8);
|
|
||||||
if (conf > 0.01f || curName == local) {
|
|
||||||
results.push_back(CharDetResult());
|
|
||||||
results.back().name = curName;
|
|
||||||
results.back().confidence = conf;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// If you got more than one valid result, ask the user which he wants
|
|
||||||
if (results.size() > 1) {
|
|
||||||
// Add local
|
|
||||||
if (!gotLocal) {
|
|
||||||
results.push_back(CharDetResult());
|
|
||||||
results.back().name = local;
|
|
||||||
results.back().confidence = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort by confidence
|
|
||||||
results.sort();
|
|
||||||
|
|
||||||
// Get choice from user
|
// Get choice from user
|
||||||
wxArrayString choices;
|
wxArrayString choices;
|
||||||
wxArrayString picked;
|
|
||||||
int i = 0;
|
|
||||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++) {
|
|
||||||
wxString name = (*cur).name;
|
|
||||||
if (picked.Index(name) == wxNOT_FOUND) {
|
|
||||||
picked.Add(name);
|
|
||||||
|
|
||||||
// Generate name
|
for (i_lst = list.begin(); i_lst != list.end(); ++i_lst) {
|
||||||
wxString choiceStr;
|
choices.Add(lagi_wxString(i_lst->second));
|
||||||
if ((*cur).confidence > 0.0f) choiceStr = wxString::Format(_T("%f%% - "),(*cur).confidence*100.0f);
|
|
||||||
else choiceStr = _T("Unknown - ");
|
|
||||||
choiceStr += name;
|
|
||||||
if (name == local) choiceStr += _T(" (local)");
|
|
||||||
|
|
||||||
// Insert
|
|
||||||
choices.Add(choiceStr);
|
|
||||||
i++;
|
|
||||||
if (i == 20) break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
|
int choice = wxGetSingleChoiceIndex(_("Aegisub could not narrow down the character set to a single one.\nPlease pick one below:"),_("Choose character set"),choices);
|
||||||
if (choice == -1) throw _T("Canceled");
|
if (choice == -1) throw _T("Canceled");
|
||||||
|
return choices.Item(choice);
|
||||||
// Retrieve name
|
|
||||||
i = 0;
|
|
||||||
for (std::list<CharDetResult>::iterator cur=results.begin();cur!=results.end();cur++,i++) {
|
|
||||||
if (i == choice) result = (*cur).name;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return whatever it got
|
i_lst = list.begin();
|
||||||
return result;
|
return i_lst->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// @brief Report
|
|
||||||
/// @param aCharset
|
|
||||||
///
|
|
||||||
void CharSetDetect::Report(const char* aCharset) {
|
|
||||||
// Store the result reported
|
|
||||||
result = wxString(aCharset,wxConvUTF8);
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // WITH_UNIVCHARDET
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2007, Rodrigo Braz Monteiro
|
// Copyright (c) 2010, Amar Takhar
|
||||||
// All rights reserved.
|
// All rights reserved.
|
||||||
//
|
//
|
||||||
// Redistribution and use in source and binary forms, with or without
|
// Redistribution and use in source and binary forms, with or without
|
||||||
|
@ -34,34 +34,17 @@
|
||||||
/// @ingroup utility
|
/// @ingroup utility
|
||||||
///
|
///
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
///////////
|
|
||||||
// Headers
|
|
||||||
#include "../universalchardet/nscore.h"
|
|
||||||
#include "../universalchardet/nsUniversalDetector.h"
|
|
||||||
#include "../universalchardet/nsMBCSGroupProber.h"
|
|
||||||
|
|
||||||
|
|
||||||
/// DOCME
|
/// DOCME
|
||||||
/// @class CharSetDetect
|
/// @class CharSetDetect
|
||||||
/// @brief DOCME
|
/// @brief Detect character set of a file
|
||||||
///
|
class CharSetDetect {
|
||||||
/// DOCME
|
|
||||||
class CharSetDetect : public nsUniversalDetector {
|
|
||||||
private:
|
private:
|
||||||
|
/// Character set
|
||||||
/// DOCME
|
|
||||||
wxString result;
|
wxString result;
|
||||||
void Report(const char* aCharset);
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
CharSetDetect() : nsUniversalDetector(NS_FILTER_ALL) { };
|
/// @brief Get character set name.
|
||||||
|
/// @param filename File to check
|
||||||
|
/// @return Character set name
|
||||||
wxString GetEncoding(wxString filename);
|
wxString GetEncoding(wxString filename);
|
||||||
|
|
||||||
/// @brief DOCME
|
|
||||||
///
|
|
||||||
PRBool done() const { return mDone; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -74,49 +74,12 @@ TextFileReader::~TextFileReader() {
|
||||||
}
|
}
|
||||||
|
|
||||||
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
wxString TextFileReader::GetEncoding(wxString const& filename) {
|
||||||
// Prepare
|
|
||||||
unsigned char b[4];
|
|
||||||
memset(b, 0, sizeof(b));
|
|
||||||
|
|
||||||
// Read four bytes from file
|
|
||||||
std::ifstream ifile;
|
|
||||||
#ifdef __WINDOWS__
|
|
||||||
ifile.open(filename.wc_str());
|
|
||||||
#else
|
|
||||||
ifile.open(wxFNCONV(filename));
|
|
||||||
#endif
|
|
||||||
if (!ifile.is_open()) {
|
|
||||||
return L"unknown";
|
|
||||||
}
|
|
||||||
ifile.read(reinterpret_cast<char *>(b),4);
|
|
||||||
ifile.close();
|
|
||||||
|
|
||||||
// Try to get the byte order mark from them
|
|
||||||
if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) return L"UTF-8";
|
|
||||||
else if (b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00) return L"UTF-32LE";
|
|
||||||
else if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) return L"UTF-32BE";
|
|
||||||
else if (b[0] == 0xFF && b[1] == 0xFE) return L"UTF-16LE";
|
|
||||||
else if (b[0] == 0xFE && b[1] == 0xFF) return L"UTF-16BE";
|
|
||||||
else if (b[0] == 0x2B && b[1] == 0x2F && b[2] == 0x76) return L"UTF-7";
|
|
||||||
|
|
||||||
// Try to guess UTF-16
|
|
||||||
else if (b[0] == 0 && b[1] >= 32 && b[2] == 0 && b[3] >= 32) return L"UTF-16BE";
|
|
||||||
else if (b[0] >= 32 && b[1] == 0 && b[2] >= 32 && b[3] == 0) return L"UTF-16LE";
|
|
||||||
|
|
||||||
// If any of the first four bytes are under 0x20 (the first printable character),
|
|
||||||
// except for 9-13 range, assume binary
|
|
||||||
for (int i=0;i<4;i++) {
|
|
||||||
if (b[i] < 9 || (b[i] > 13 && b[i] < 32)) return L"binary";
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef WITH_UNIVCHARDET
|
|
||||||
// Use universalchardet library to detect charset
|
// Use universalchardet library to detect charset
|
||||||
CharSetDetect det;
|
CharSetDetect det;
|
||||||
return det.GetEncoding(filename);
|
wxString str(det.GetEncoding(filename));
|
||||||
#else
|
wxLogDebug("Encoding: %s", str);
|
||||||
// Fall back to local
|
return str;
|
||||||
return L"local";
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
wchar_t TextFileReader::GetWChar() {
|
wchar_t TextFileReader::GetWChar() {
|
||||||
|
|
Loading…
Reference in a new issue