forked from mia/Aegisub
Detect byte order
Since uchardet does not do that for us. Fixes wangqr/Aegisub#38.
This commit is contained in:
parent
799d96602d
commit
638f1eb99e
2 changed files with 52 additions and 15 deletions
|
@ -23,6 +23,7 @@
|
|||
|
||||
#ifdef WITH_UCHARDET
|
||||
#include <uchardet/uchardet.h>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
#endif
|
||||
|
||||
namespace agi { namespace charset {
|
||||
|
@ -32,12 +33,21 @@ std::string Detect(agi::fs::path const& file) {
|
|||
// FIXME: It is an empty file. Treat as ascii
|
||||
if (fp.size() == 0) return "ascii";
|
||||
|
||||
// FIXME: Dirty hack for Matroska. These 4 bytes are the magic
|
||||
// number of EBML which is used by mkv and webm
|
||||
// First check for known magic bytes which identify the file type
|
||||
if (fp.size() >= 4) {
|
||||
const char* buf = fp.read(0, 4);
|
||||
if (!strncmp(buf, "\x1a\x45\xdf\xa3", 4))
|
||||
return "binary";
|
||||
const char* header = fp.read(0, 4);
|
||||
if (!strncmp(header, "\xef\xbb\xbf", 3))
|
||||
return "utf-8";
|
||||
if (!strncmp(header, "\x00\x00\xfe\xff", 4))
|
||||
return "utf-32be";
|
||||
if (!strncmp(header, "\xff\xfe\x00\x00", 4))
|
||||
return "utf-32le";
|
||||
if (!strncmp(header, "\xfe\xff", 2))
|
||||
return "utf-16be";
|
||||
if (!strncmp(header, "\xff\xfe", 2))
|
||||
return "utf-16le";
|
||||
if (!strncmp(header, "\x1a\x45\xdf\xa3", 4))
|
||||
return "binary"; // Actually EBML/Matroska
|
||||
}
|
||||
|
||||
#ifdef WITH_UCHARDET
|
||||
|
@ -50,16 +60,42 @@ std::string Detect(agi::fs::path const& file) {
|
|||
}
|
||||
uchardet_data_end(ud);
|
||||
std::string encoding = uchardet_get_charset(ud);
|
||||
|
||||
// uchardet does not tell us the byte order of UTF-16 / UTF-32, so we detect it ourselves
|
||||
std::string encoding_lower{ encoding };
|
||||
boost::to_lower(encoding_lower);
|
||||
if (encoding_lower == "utf-16") {
|
||||
uint64_t le_score = 0, be_score = 0;
|
||||
for (uint64_t offset = 0; offset < fp.size(); ) {
|
||||
auto read = std::min<uint64_t>(65536, fp.size() - offset);
|
||||
auto buf = fp.read(offset, read);
|
||||
for (uint64_t i = 0; i + 1 < read; i += 2) {
|
||||
if (!buf[i])
|
||||
++be_score;
|
||||
if (!buf[i + 1])
|
||||
++le_score;
|
||||
}
|
||||
offset += read;
|
||||
}
|
||||
return le_score < be_score ? "utf-16be" : "utf-16le";
|
||||
}
|
||||
else if (encoding_lower == "utf-32") {
|
||||
uint64_t le_score = 0, be_score = 0;
|
||||
for (uint64_t offset = 0; offset < fp.size(); ) {
|
||||
auto read = std::min<uint64_t>(65536, fp.size() - offset);
|
||||
auto buf = fp.read(offset, read);
|
||||
for (uint64_t i = 0; i + 3 < read; i += 2) {
|
||||
if (!buf[i])
|
||||
++be_score;
|
||||
if (!buf[i + 3])
|
||||
++le_score;
|
||||
}
|
||||
offset += read;
|
||||
}
|
||||
return le_score < be_score ? "utf-32be" : "utf-32le";
|
||||
}
|
||||
return encoding.empty() ? "binary" : encoding;
|
||||
#else
|
||||
|
||||
// Look for utf-8 BOM
|
||||
if (fp.size() >= 3) {
|
||||
const char* buf = fp.read(0, 3);
|
||||
if (!strncmp(buf, "\xef\xbb\xbf", 3))
|
||||
return "utf-8";
|
||||
}
|
||||
|
||||
// If it's over 100 MB it's either binary or big enough that we won't
|
||||
// be able to do anything useful with it anyway
|
||||
if (fp.size() > 100 * 1024 * 1024)
|
||||
|
|
|
@ -22,8 +22,9 @@ namespace agi {
|
|||
line_iterator_base::line_iterator_base(std::istream &stream, std::string encoding)
|
||||
: stream(&stream)
|
||||
{
|
||||
boost::to_lower(encoding);
|
||||
if (encoding != "utf-8") {
|
||||
std::string encoding_lower{ encoding };
|
||||
boost::to_lower(encoding_lower);
|
||||
if (encoding_lower != "utf-8") {
|
||||
agi::charset::IconvWrapper c("utf-8", encoding.c_str());
|
||||
c.Convert("\r", 1, reinterpret_cast<char *>(&cr), sizeof(int));
|
||||
c.Convert("\n", 1, reinterpret_cast<char *>(&lf), sizeof(int));
|
||||
|
|
Loading…
Reference in a new issue