forked from mia/Aegisub
292 lines
5.8 KiB
C++
292 lines
5.8 KiB
C++
|
#include <cstdlib>
|
||
|
#include <cstring>
|
||
|
#include <cstdio>
|
||
|
#include <ctype.h>
|
||
|
|
||
|
#include "../hunspell/csutil.hxx"
|
||
|
#include "textparser.hxx"
|
||
|
|
||
|
#ifndef W32
|
||
|
using namespace std;
|
||
|
#endif
|
||
|
|
||
|
// ISO-8859-1 HTML character entities
|
||
|
|
||
|
static char * LATIN1[] = {
|
||
|
"À",
|
||
|
"Ã",
|
||
|
"Å",
|
||
|
"Æ",
|
||
|
"È",
|
||
|
"Ê",
|
||
|
"Ì",
|
||
|
"Ï",
|
||
|
"Ð",
|
||
|
"Ñ",
|
||
|
"Ò",
|
||
|
"Ø",
|
||
|
"Ù",
|
||
|
"Þ",
|
||
|
"à",
|
||
|
"ã",
|
||
|
"å",
|
||
|
"æ",
|
||
|
"è",
|
||
|
"ê",
|
||
|
"ì",
|
||
|
"ï",
|
||
|
"ð",
|
||
|
"ñ",
|
||
|
"ò",
|
||
|
"ø",
|
||
|
"ù",
|
||
|
"þ",
|
||
|
"ÿ"
|
||
|
};
|
||
|
|
||
|
#define LATIN1_LEN (sizeof(LATIN1) / sizeof(char *))
|
||
|
|
||
|
TextParser::TextParser() {
|
||
|
init((char *) NULL);
|
||
|
}
|
||
|
|
||
|
TextParser::TextParser(const char * wordchars)
|
||
|
{
|
||
|
init(wordchars);
|
||
|
}
|
||
|
|
||
|
TextParser::TextParser(unsigned short * wordchars, int len)
|
||
|
{
|
||
|
init(wordchars, len);
|
||
|
}
|
||
|
|
||
|
TextParser::~TextParser()
|
||
|
{
|
||
|
}
|
||
|
|
||
|
int TextParser::is_wordchar(char * w)
|
||
|
{
|
||
|
if (*w == '\0') return 0;
|
||
|
if (utf8) {
|
||
|
w_char wc;
|
||
|
unsigned short idx;
|
||
|
u8_u16(&wc, 1, w);
|
||
|
idx = (wc.h << 8) + wc.l;
|
||
|
return (unicodeisalpha(idx) || (wordchars_utf16 && flag_bsearch(wordchars_utf16, *((unsigned short *) &wc), wclen)));
|
||
|
} else {
|
||
|
return wordcharacters[(*w + 256) % 256];
|
||
|
}
|
||
|
}
|
||
|
|
||
|
char * TextParser::get_latin1(char * s)
|
||
|
{
|
||
|
if (s[0] == '&') {
|
||
|
unsigned int i = 0;
|
||
|
while ((i < LATIN1_LEN) &&
|
||
|
strncmp(LATIN1[i], s, strlen(LATIN1[i]))) i++;
|
||
|
if (i != LATIN1_LEN) return LATIN1[i];
|
||
|
}
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
void TextParser::init(const char * wordchars)
|
||
|
{
|
||
|
for (int i = 0; i < MAXPREVLINE; i++) {
|
||
|
line[i][0] = '\0';
|
||
|
}
|
||
|
actual = 0;
|
||
|
head = 0;
|
||
|
token = 0;
|
||
|
state = 0;
|
||
|
utf8 = 0;
|
||
|
checkurl = 0;
|
||
|
unsigned int j;
|
||
|
for (j = 0; j < 256; j++) {
|
||
|
wordcharacters[j] = 0;
|
||
|
}
|
||
|
if (!wordchars) wordchars = "qwertzuiopasdfghjklyxcvbnmQWERTZUIOPASDFGHJKLYXCVBNM";
|
||
|
for (j = 0; j < strlen(wordchars); j++) {
|
||
|
wordcharacters[(wordchars[j] + 256) % 256] = 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
void TextParser::init(unsigned short * wc, int len)
|
||
|
{
|
||
|
for (int i = 0; i < MAXPREVLINE; i++) {
|
||
|
line[i][0] = '\0';
|
||
|
}
|
||
|
actual = 0;
|
||
|
head = 0;
|
||
|
token = 0;
|
||
|
state = 0;
|
||
|
utf8 = 1;
|
||
|
checkurl = 0;
|
||
|
wordchars_utf16 = wc;
|
||
|
wclen = len;
|
||
|
}
|
||
|
|
||
|
int TextParser::next_char(char * line, int * pos) {
|
||
|
if (*(line + *pos) == '\0') return 1;
|
||
|
if (utf8) {
|
||
|
if (*(line + *pos) >> 7) {
|
||
|
// jump to next UTF-8 character
|
||
|
for((*pos)++; (*(line + *pos) & 0xc0) == 0x80; (*pos)++);
|
||
|
} else {
|
||
|
(*pos)++;
|
||
|
}
|
||
|
} else (*pos)++;
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
void TextParser::put_line(char * word)
|
||
|
{
|
||
|
actual = (actual + 1) % MAXPREVLINE;
|
||
|
strcpy(line[actual], word);
|
||
|
token = 0;
|
||
|
head = 0;
|
||
|
check_urls();
|
||
|
}
|
||
|
|
||
|
char * TextParser::get_prevline(int n)
|
||
|
{
|
||
|
return mystrdup(line[(actual + MAXPREVLINE - n) % MAXPREVLINE]);
|
||
|
}
|
||
|
|
||
|
char * TextParser::get_line()
|
||
|
{
|
||
|
return get_prevline(0);
|
||
|
}
|
||
|
|
||
|
char * TextParser::next_token()
|
||
|
{
|
||
|
char * latin1;
|
||
|
|
||
|
for (;;) {
|
||
|
switch (state)
|
||
|
{
|
||
|
case 0: // non word chars
|
||
|
if (is_wordchar(line[actual] + head)) {
|
||
|
state = 1;
|
||
|
token = head;
|
||
|
} else if ((latin1 = get_latin1(line[actual] + head))) {
|
||
|
state = 1;
|
||
|
token = head;
|
||
|
head += strlen(latin1);
|
||
|
}
|
||
|
break;
|
||
|
case 1: // wordchar
|
||
|
if ((latin1 = get_latin1(line[actual] + head))) {
|
||
|
head += strlen(latin1);
|
||
|
} else if (! is_wordchar(line[actual] + head)) {
|
||
|
state = 0;
|
||
|
char * t = alloc_token(token, &head);
|
||
|
if (t) return t;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
if (next_char(line[actual], &head)) return NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int TextParser::get_tokenpos()
|
||
|
{
|
||
|
return token;
|
||
|
}
|
||
|
|
||
|
int TextParser::change_token(const char * word)
|
||
|
{
|
||
|
if (word) {
|
||
|
char * r = mystrdup(line[actual] + head);
|
||
|
strcpy(line[actual] + token, word);
|
||
|
strcat(line[actual], r);
|
||
|
head = token;
|
||
|
free(r);
|
||
|
return 1;
|
||
|
}
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
void TextParser::check_urls()
|
||
|
{
|
||
|
int url_state = 0;
|
||
|
int url_head = 0;
|
||
|
int url_token = 0;
|
||
|
int url = 0;
|
||
|
for (;;) {
|
||
|
switch (url_state)
|
||
|
{
|
||
|
case 0: // non word chars
|
||
|
if (is_wordchar(line[actual] + url_head)) {
|
||
|
url_state = 1;
|
||
|
url_token = url_head;
|
||
|
// Unix path
|
||
|
} else if (*(line[actual] + url_head) == '/') {
|
||
|
url_state = 1;
|
||
|
url_token = url_head;
|
||
|
url = 1;
|
||
|
}
|
||
|
break;
|
||
|
case 1: // wordchar
|
||
|
char ch = *(line[actual] + url_head);
|
||
|
// e-mail address
|
||
|
if ((ch == '@') ||
|
||
|
// MS-DOS, Windows path
|
||
|
(strncmp(line[actual] + url_head, ":\\", 2) == 0) ||
|
||
|
// URL
|
||
|
(strncmp(line[actual] + url_head, "://", 3) == 0)) {
|
||
|
url = 1;
|
||
|
} else if (! (is_wordchar(line[actual] + url_head) ||
|
||
|
(ch == '-') || (ch == '_') || (ch == '\\') ||
|
||
|
(ch == '.') || (ch == ':') || (ch == '/') ||
|
||
|
(ch == '~') || (ch == '%') || (ch == '*') ||
|
||
|
(ch == '$') || (ch == '[') || (ch == ']') ||
|
||
|
(ch == '?') || (ch == '!') ||
|
||
|
((ch >= '0') && (ch <= '9')))) {
|
||
|
url_state = 0;
|
||
|
if (url == 1) {
|
||
|
for (int i = url_token; i < url_head; i++) {
|
||
|
*(urlline + i) = 1;
|
||
|
}
|
||
|
}
|
||
|
url = 0;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
*(urlline + url_head) = 0;
|
||
|
if (next_char(line[actual], &url_head)) return;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
int TextParser::get_url(int token_pos, int * head)
|
||
|
{
|
||
|
for (int i = *head; urlline[i] && *(line[actual]+i); i++, (*head)++);
|
||
|
return checkurl ? 0 : urlline[token_pos];
|
||
|
}
|
||
|
|
||
|
void TextParser::set_url_checking(int check)
|
||
|
{
|
||
|
checkurl = check;
|
||
|
}
|
||
|
|
||
|
|
||
|
char * TextParser::alloc_token(int token, int * head)
|
||
|
{
|
||
|
if (get_url(token, head)) return NULL;
|
||
|
char * t = (char *) malloc(*head - token + 1);
|
||
|
if (t) {
|
||
|
t[*head - token] = '\0';
|
||
|
strncpy(t, line[actual] + token, *head - token);
|
||
|
// remove colon for Finnish and Swedish language
|
||
|
if (t[*head - token - 1] == ':') {
|
||
|
t[*head - token - 1] = '\0';
|
||
|
if (!t[0]) {
|
||
|
free(t);
|
||
|
return NULL;
|
||
|
}
|
||
|
}
|
||
|
return t;
|
||
|
}
|
||
|
fprintf(stderr,"Error - Insufficient Memory\n");
|
||
|
return NULL;
|
||
|
}
|