Aegisub/libaegisub/common/charset.cpp
wangqr f733297499 Rewrite encoding detection
Now feeds all data to uchardet, when uchardet is available. The file
size limit is removed.

When uchardet is not available, we look for UTF-8 BOM.
This should make loading UTF-8-BOM files faster.
Because Aegisub always saves files as UTF-8 with BOM, this should also
guarantee that Aegisub can load large (>100MB) files it saved itself.

See Aegisub/Aegisub#110
2019-05-18 22:13:26 -04:00

70 lines
2.2 KiB
C++

// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/// @file charset.cpp
/// @brief Character set detection and manipulation utilities.
/// @ingroup libaegisub
#include "libaegisub/charset.h"
#include "libaegisub/file_mapping.h"
#include "libaegisub/scoped_ptr.h"
#ifdef WITH_UCHARDET
#include <uchardet/uchardet.h>
#endif
namespace agi { namespace charset {

/// @brief Guess the character encoding of a file.
///
/// When built with WITH_UCHARDET the whole file is fed to uchardet in
/// fixed-size chunks and the detector's answer is returned. Otherwise a
/// UTF-8 BOM is looked for first; failing that, the file is classified as
/// "binary" or "utf-8" from its size and from the share of control bytes in
/// its first chunk.
///
/// @param file Path of the file to inspect.
/// @return Name of the detected charset, or "binary" when the contents do not
///         look like text or the detector could not name a charset.
std::string Detect(agi::fs::path const& file) {
	agi::read_file_mapping fp(file);

	// Number of bytes requested from the file mapping per read
	const uint64_t chunk_size = 65536;

#ifdef WITH_UCHARDET
	// uchardet_delete is handed to the holder as the handle's cleanup function
	agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
	for (uint64_t offset = 0; offset < fp.size(); ) {
		auto read = std::min<uint64_t>(chunk_size, fp.size() - offset);
		auto buf = fp.read(offset, read);
		// uchardet reports failure with a non-zero result; the detector's
		// answer can't be trusted after that, so treat it the same way as a
		// failed detection instead of silently carrying on.
		if (uchardet_handle_data(ud, buf, read) != 0)
			return "binary";
		offset += read;
	}
	uchardet_data_end(ud);

	// An empty name means uchardet could not decide. Also guard against a
	// null pointer so a std::string is never constructed from one.
	const char* detected = uchardet_get_charset(ud);
	if (!detected || !*detected)
		return "binary";
	return detected;
#else
	// Look for utf-8 BOM
	if (fp.size() >= 3) {
		const char* head = fp.read(0, 3);
		if (!strncmp(head, "\xef\xbb\xbf", 3))
			return "utf-8";
	}

	// If it's over 100 MB it's either binary or big enough that we won't
	// be able to do anything useful with it anyway
	const uint64_t max_text_size = 100 * 1024 * 1024;
	if (fp.size() > max_text_size)
		return "binary";

	// Count control bytes (other than CR, LF and TAB) in the first chunk
	uint64_t binaryish = 0;
	auto read = std::min<uint64_t>(chunk_size, fp.size());
	auto buf = fp.read(0, read);
	for (size_t i = 0; i < read; ++i) {
		const auto c = static_cast<unsigned char>(buf[i]);
		if (c < 32 && c != '\r' && c != '\n' && c != '\t')
			++binaryish;
	}

	// More than one control byte in eight is taken to mean "not text"
	if (binaryish > read / 8)
		return "binary";

	return "utf-8";
#endif
}

} }