Make unicode.lua a proper module and add basic tests
Also drop support for >4 byte sequences as they were eliminated in RFC 3629.
This commit is contained in:
parent
37e74b8274
commit
a40b1d0028
4 changed files with 199 additions and 125 deletions
58
aegisub/automation/include/aegisub/unicode.lua
Normal file
58
aegisub/automation/include/aegisub/unicode.lua
Normal file
|
@ -0,0 +1,58 @@
|
|||
-- UTF-8 helper functions.
-- The table is forward-declared so that its functions can call each other
-- through the `unicode` upvalue.
local unicode
unicode = {
  -- Return the number of bytes occupied by the character whose lead byte is
  -- at byte index `i` of `s` (`i` defaults to 1).
  -- Only the lead byte is inspected and it is not validated: any byte in
  -- 128..223 reports 2, 224..239 reports 3, and everything above reports 4.
  -- When there is no byte at `i` the function reports 1.
  charwidth = function(s, i)
    local b = s:byte(i or 1)
    if not b then
      return 1
    elseif b < 128 then
      return 1
    elseif b < 224 then
      return 2
    elseif b < 240 then
      return 3
    else
      return 4
    end
  end,

  -- Return an iterator over the characters of `s`.
  -- Each call yields the next character (as a substring of `s`) and its
  -- 1-based character index; it yields nothing once the end is reached.
  chars = function(s)
    local curchar, i = 0, 1
    return function()
      if i > s:len() then
        return
      end
      local j = i
      curchar = curchar + 1
      i = i + unicode.charwidth(s, i)
      return s:sub(j, i - 1), curchar
    end
  end,

  -- Return the number of characters in `s`, counted by walking the string
  -- with `unicode.chars`.
  len = function(s)
    local n = 0
    for _ in unicode.chars(s) do
      n = n + 1
    end
    return n
  end,

  -- Return the codepoint of the first character in `s`.
  -- Uses a naive decoder that does not validate the byte values.
  -- Returns nil when `s` is empty or ends before the sequence announced by
  -- its lead byte is complete, instead of raising an error on a nil byte.
  codepoint = function(s)
    local b = s:byte(1)
    if not b then
      return nil
    end
    -- Basic case, ASCII
    if b < 128 then
      return b
    end

    local res, w
    if b < 224 then
      -- lead byte is 110xxxxx
      res = b - 192
      w = 2
    elseif b < 240 then
      -- lead byte is 1110xxxx
      res = b - 224
      w = 3
    else
      -- lead byte is 11110xxx
      res = b - 240
      w = 4
    end

    -- Each continuation byte (10xxxxxx) contributes six payload bits.
    for i = 2, w do
      local c = s:byte(i)
      if not c then
        return nil
      end
      res = res * 64 + c - 128
    end
    return res
  end
}
|
||||
return unicode
|
87
aegisub/automation/include/aegisub/unicode.moon
Normal file
87
aegisub/automation/include/aegisub/unicode.moon
Normal file
|
@ -0,0 +1,87 @@
|
|||
-- Copyright (c) 2007, Niels Martin Hansen, Rodrigo Braz Monteiro
|
||||
-- All rights reserved.
|
||||
--
|
||||
-- Redistribution and use in source and binary forms, with or without
|
||||
-- modification, are permitted provided that the following conditions are met:
|
||||
--
|
||||
-- * Redistributions of source code must retain the above copyright notice,
|
||||
-- this list of conditions and the following disclaimer.
|
||||
-- * Redistributions in binary form must reproduce the above copyright notice,
|
||||
-- this list of conditions and the following disclaimer in the documentation
|
||||
-- and/or other materials provided with the distribution.
|
||||
-- * Neither the name of the Aegisub Group nor the names of its contributors
|
||||
-- may be used to endorse or promote products derived from this software
|
||||
-- without specific prior written permission.
|
||||
--
|
||||
-- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
-- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
-- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
-- ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
-- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
-- CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
-- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
-- INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
-- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
-- ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
-- POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
-- Unicode (UTF-8) support functions for Aegisub Automation 4 Lua
|
||||
-- https://www.ietf.org/rfc/rfc3629.txt (obsoletes RFC 2279)
|
||||
|
||||
local unicode
unicode =
  -- Return the number of bytes occupied by the character starting at the
  -- i'th byte in s (i defaults to 1). Only the lead byte is inspected and it
  -- is not validated; widths above 4 are never reported.
  charwidth: (s, i) ->
    b = s\byte i or 1
    -- FIXME, something in karaskel results in this case, shouldn't happen
    -- What would "proper" behaviour be? Zero? Or just explode?
    if not b then 1
    elseif b < 128 then 1
    elseif b < 224 then 2
    elseif b < 240 then 3
    else 4

  -- Returns an iterator function for iterating over the characters in s.
  -- Each call yields the next character and its 1-based character index.
  chars: (s) ->
    curchar, i = 0, 1
    ->
      return if i > s\len()

      j = i
      curchar += 1
      i += unicode.charwidth s, i
      s\sub(j, i - 1), curchar

  -- Returns the number of characters in s
  -- Runs in O(s:len()) time!
  len: (s) ->
    n = 0
    n += 1 for _ in unicode.chars s
    n

  -- Get codepoint of first char in s.
  -- Returns nil when s is empty or ends before the sequence announced by its
  -- lead byte is complete, instead of raising an error on a nil byte.
  codepoint: (s) ->
    b = s\byte 1
    return if not b
    -- Basic case, ASCII
    return b if b < 128

    -- Use a naive decoding algorithm, and assume the byte values are valid
    local res, w

    if b < 224
      -- lead byte is 110xxxxx
      res = b - 192
      w = 2
    elseif b < 240
      -- lead byte is 1110xxxx
      res = b - 224
      w = 3
    else
      -- lead byte is 11110xxx
      res = b - 240
      w = 4

    -- Each continuation byte (10xxxxxx) contributes six payload bits
    for i = 2, w
      c = s\byte i
      return if not c
      res = res*64 + c - 128
    res
|
||||
|
||||
return unicode
|
|
@ -1,126 +1,16 @@
|
|||
--[[
|
||||
Copyright (c) 2007, Niels Martin Hansen, Rodrigo Braz Monteiro
|
||||
All rights reserved.
|
||||
-- Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
||||
--
|
||||
-- Permission to use, copy, modify, and distribute this software for any
|
||||
-- purpose with or without fee is hereby granted, provided that the above
|
||||
-- copyright notice and this permission notice appear in all copies.
|
||||
--
|
||||
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
* Neither the name of the Aegisub Group nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
]]
|
||||
|
||||
-- Unicode (UTF-8) support functions for Aegisub Automation 4 Lua
|
||||
-- http://www.ietf.org/rfc/rfc2279.txt
|
||||
|
||||
module("unicode")
|
||||
|
||||
-- Width in bytes of the character whose lead byte sits at byte index `i`
-- of `s` (`i` defaults to 1). Only the lead byte is examined; widths of up
-- to 6 bytes are reported.
function charwidth(s, i)
	local lead = s:byte(i or 1)

	-- FIXME: something in karaskel asks for a position with no byte, which
	-- should not happen. Such a request is answered with 1, the same as
	-- plain ASCII.
	if not lead or lead < 128 then
		return 1
	end

	if lead < 224 then return 2 end
	if lead < 240 then return 3 end
	if lead < 248 then return 4 end
	if lead < 252 then return 5 end

	-- Lead bytes from 252 upwards are all reported as 6, although not all of
	-- them are really legal.
	return 6
end
|
||||
|
||||
-- Build an iterator over the characters of `s`.
-- Every call of the returned function produces the next character (as a
-- substring of `s`) together with its 1-based character index, and nil once
-- the byte position has moved past the end of the string.
function chars(s)
	local index, pos = 0, 1

	return function()
		if pos > s:len() then
			return nil
		end

		local start = pos
		pos = pos + charwidth(s, pos)
		index = index + 1
		return s:sub(start, pos - 1), index
	end
end
|
||||
|
||||
-- Count the characters in `s` by draining the iterator from chars(s).
-- The whole string is walked, so the cost grows with s:len().
function len(s)
	local next_char = chars(s)
	local count = 0
	while next_char() ~= nil do
		count = count + 1
	end
	return count
end
|
||||
|
||||
-- Get codepoint of first char in s.
-- Uses a naive decoder that does not validate the byte values. Returns nil
-- when s is empty or ends before the sequence announced by its lead byte is
-- complete, instead of raising an error on a nil byte.
function codepoint(s)
	local b = s:byte(1)
	if not b then
		return nil
	end

	-- Basic case, ASCII
	if b < 128 then
		return b
	end

	local res, w

	if b < 224 then
		-- lead byte is 110xxxxx
		res = b - 192
		w = 2
	elseif b < 240 then
		-- lead byte is 1110xxxx
		res = b - 224
		w = 3
	elseif b < 248 then
		-- lead byte is 11110xxx
		res = b - 240
		w = 4
	elseif b < 252 then
		-- lead byte is 111110xx
		res = b - 248
		w = 5
	else
		-- lead byte is 1111110x
		res = b - 252
		w = 6
	end

	-- Each continuation byte (10xxxxxx) contributes six payload bits.
	for i = 2, w do
		local c = s:byte(i)
		if not c then
			return nil
		end
		res = res*64 + c - 128
	end

	return res
end
|
||||
-- Load the implementation from 'aegisub.unicode', publish it as the global
-- `unicode`, and hand the same table back to whoever loaded this file.
local unicode = require 'aegisub.unicode'
_G.unicode = unicode
return unicode
|
||||
|
|
39
aegisub/automation/tests/modules/unicode.moon
Normal file
39
aegisub/automation/tests/modules/unicode.moon
Normal file
|
@ -0,0 +1,39 @@
|
|||
-- Copyright (c) 2013, Thomas Goyne <plorkyeran@aegisub.org>
|
||||
--
|
||||
-- Permission to use, copy, modify, and distribute this software for any
|
||||
-- purpose with or without fee is hereby granted, provided that the above
|
||||
-- copyright notice and this permission notice appear in all copies.
|
||||
--
|
||||
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
require 'lunatest'
|
||||
unicode = require 'aegisub.unicode'
|
||||
|
||||
-- charwidth must report 1-4 bytes for samples of each UTF-8 sequence length.
-- The 3-byte sample is fullwidth 'ｃ' (U+FF43), not ASCII 'c'.
export test_char_widths = ->
  assert_equal 1, unicode.charwidth 'a'
  assert_equal 2, unicode.charwidth 'ß'
  assert_equal 3, unicode.charwidth 'ｃ'
  assert_equal 4, unicode.charwidth '🄓'
|
||||
|
||||
-- chars must split a string mixing 1-, 2-, 3- and 4-byte characters into
-- exactly those four characters, in order.
-- The 3-byte sample is fullwidth 'ｃ' (U+FF43), not ASCII 'c'.
export test_char_iterator = ->
  chars = [c for c in unicode.chars 'aßｃ🄓']
  assert_equal 4, #chars
  assert_equal 'a', chars[1]
  assert_equal 'ß', chars[2]
  assert_equal 'ｃ', chars[3]
  assert_equal '🄓', chars[4]
|
||||
|
||||
-- len counts characters, not bytes: the sample holds one character of each
-- UTF-8 width from 1 to 4 (the third is fullwidth 'ｃ', U+FF43).
export test_len = ->
  assert_equal 4, unicode.len 'aßｃ🄓'
|
||||
|
||||
-- codepoint must decode sequences of every supported length.
-- The 3-byte sample is fullwidth 'ｃ' (U+FF43), not ASCII 'c' (which is 99).
export test_codepoint = ->
  assert_equal 97, unicode.codepoint 'a'
  assert_equal 223, unicode.codepoint 'ß'
  assert_equal 0xFF43, unicode.codepoint 'ｃ'
  assert_equal 0x1F113, unicode.codepoint '🄓'
|
Loading…
Reference in a new issue