Aegisub/libaegisub/common/charset.cpp
Myaamori 84ac716972 meson: add uchardet port
Meson port instead of using CMake as I ran into issues with the
src directory (where uchardet.h is located) not being appended
to the include path, and on Windows I ran into a Meson issue
where a -D macro definition was being interpreted as a filename.
In the end a Meson port seemed simpler than working out the CMake
issues, as the CMakeLists.txt files were straightforward and easy
to port.

Note that because of the directory structure of the uchardet source
I had to change the include directive from uchardet/uchardet.h
to just uchardet.h. This is actually more in line with the uchardet
pkg-config file, which appends /usr/include/uchardet to the include path.
2021-05-21 21:09:36 -04:00

90 lines
2.8 KiB
C++

// Copyright (c) 2010, Amar Takhar <verm@aegisub.org>
//
// Permission to use, copy, modify, and distribute this software for any
// purpose with or without fee is hereby granted, provided that the above
// copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
/// @file charset.cpp
/// @brief Character set detection and manipulation utilities.
/// @ingroup libaegisub
#include "libaegisub/charset.h"
#include "libaegisub/file_mapping.h"
#include "libaegisub/scoped_ptr.h"
#ifdef WITH_UCHARDET
#include <uchardet.h>
#endif
namespace agi { namespace charset {
std::string Detect(agi::fs::path const& file) {
agi::read_file_mapping fp(file);
// First check for known magic bytes which identify the file type
if (fp.size() >= 4) {
const char* header = fp.read(0, 4);
if (!strncmp(header, "\xef\xbb\xbf", 3))
return "utf-8";
if (!strncmp(header, "\x00\x00\xfe\xff", 4))
return "utf-32be";
if (!strncmp(header, "\xff\xfe\x00\x00", 4))
return "utf-32le";
if (!strncmp(header, "\xfe\xff", 2))
return "utf-16be";
if (!strncmp(header, "\xff\xfe", 2))
return "utf-16le";
if (!strncmp(header, "\x1a\x45\xdf\xa3", 4))
return "binary"; // Actually EBML/Matroska
}
// If it's over 100 MB it's either binary or big enough that we won't
// be able to do anything useful with it anyway
if (fp.size() > 100 * 1024 * 1024)
return "binary";
uint64_t binaryish = 0;
#ifdef WITH_UCHARDET
agi::scoped_holder<uchardet_t> ud(uchardet_new(), uchardet_delete);
for (uint64_t offset = 0; offset < fp.size(); ) {
auto read = std::min<uint64_t>(4096, fp.size() - offset);
auto buf = fp.read(offset, read);
uchardet_handle_data(ud, buf, read);
offset += read;
// A dumb heuristic to detect binary files
for (size_t i = 0; i < read; ++i) {
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
++binaryish;
}
if (binaryish > offset / 8)
return "binary";
}
uchardet_data_end(ud);
return uchardet_get_charset(ud);
#else
auto read = std::min<uint64_t>(4096, fp.size());
auto buf = fp.read(0, read);
for (size_t i = 0; i < read; ++i) {
if ((unsigned char)buf[i] < 32 && (buf[i] != '\r' && buf[i] != '\n' && buf[i] != '\t'))
++binaryish;
}
if (binaryish > read / 8)
return "binary";
return "utf-8";
#endif
}
} }