Strip a few more MB of unused stuff from ICU's data files
This commit is contained in:
parent
571c9d9b7a
commit
9f8a10b014
2 changed files with 318 additions and 3 deletions
|
@ -81,12 +81,11 @@
|
||||||
Outputs="$(AegisubObjectDir)build.timestamp"
|
Outputs="$(AegisubObjectDir)build.timestamp"
|
||||||
>
|
>
|
||||||
|
|
||||||
<!-- Generated with http://apps.icu-project.org/datacustom/ -->
|
<!-- Generated with tools/strip-icu.py -->
|
||||||
<!-- Includes Break Iterator and Display Name data only -->
|
|
||||||
<DownloadTgzFile
|
<DownloadTgzFile
|
||||||
Url="http://www.aegisub.org/~plorkyeran/icudt53l.dat.gz"
|
Url="http://www.aegisub.org/~plorkyeran/icudt53l.dat.gz"
|
||||||
OutputFile="$(MSBuildThisFileDirectory)..\..\vendor\icu\source\data\in\icudt53l.dat"
|
OutputFile="$(MSBuildThisFileDirectory)..\..\vendor\icu\source\data\in\icudt53l.dat"
|
||||||
Hash="6b4244cdad08804fbb9c4f53e5fbf212321b80d1"
|
Hash="2a351e7753fb1e92910dc2d68721b8f41bbf33b0"
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<ExecShellScript
|
<ExecShellScript
|
||||||
|
|
316
tools/strip-icu.py
Normal file
316
tools/strip-icu.py
Normal file
|
@ -0,0 +1,316 @@
|
||||||
|
# Copyright (c) 2014, Thomas Goyne <plorkyeran@aegisub.org>
|
||||||
|
#
|
||||||
|
# Permission to use, copy, modify, and distribute this software for any
|
||||||
|
# purpose with or without fee is hereby granted, provided that the above
|
||||||
|
# copyright notice and this permission notice appear in all copies.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||||
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||||
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||||
|
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||||
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||||
|
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||||
|
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
|
#
|
||||||
|
# Aegisub Project http://www.aegisub.org/
|
||||||
|
|
||||||
|
# A script to strip all of the data we don't use out of ICU's data files
|
||||||
|
# Run from $ICU_ROOT/source/data
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
|
||||||
|
# Remove stuff we don't use at all from the Makefile
def delete_matching(filename, strs):
    """Rewrite `filename` in place, dropping every line that matches any
    regular expression in `strs` (anchored at the start of the line).

    Lines are read and written as bytes and only decoded for matching, so
    surviving lines are preserved byte-for-byte and this works on both
    Python 2 and Python 3 (the original text-mode read + `str.decode` was
    Python-2-only).
    """
    exprs = [re.compile(s) for s in strs]

    with open(filename, 'rb') as f:
        lines = [line for line in f
                 if not any(r.match(line.decode('utf-8')) for r in exprs)]

    with open(filename, 'wb') as f:
        f.writelines(lines)
||||||
|
|
||||||
|
# Data categories whose makefiles we drop from the build entirely.
REMOVE_SUBDIRS = ['LOCSRCDIR', 'CURRSRCDIR', 'ZONESRCDIR', 'COLSRCDIR', 'RBNFSRCDIR', 'TRANSLITSRCDIR']

# One pass over Makefile.in instead of two full read/rewrite cycles:
# drop the -include lines for the unused data subdirectories along with
# the converter file list in a single call.
delete_matching('Makefile.in',
                ['^-include .*%s' % s for s in REMOVE_SUBDIRS] + ['^CNV_FILES'])
|
||||||
|
|
||||||
|
# Remove data we don't need from the lang and region files
|
||||||
|
def parse_txt(filename):
    """Parse an ICU resource bundle .txt file into nested dicts.

    Returns a dict mapping table names to either nested dicts (for
    ``name{`` ... ``}`` tables) or strings (for ``name{"value"}`` leaf
    entries).  Comment and blank lines are skipped; any other line that
    fails to parse is printed and otherwise ignored.
    """
    root = {}
    cur = root
    stack = [root]
    comment = False
    # Binary mode + explicit UTF-8 decode works on both Python 2 and 3,
    # and the with-statement closes the file (the original leaked it).
    with open(filename, 'rb') as f:
        for raw in f:
            line = raw.decode('utf-8').strip()
            if not line:
                continue
            # NOTE(review): a '//' anywhere discards the whole line, even
            # after real data -- matches the original behavior; ICU's own
            # data files don't mix data and comments on one line.
            if '//' in line:
                continue
            if '/*' in line:
                # A one-line "/* ... */" must not leave us stuck in
                # comment mode (the original set comment=True forever).
                comment = '*/' not in line
                continue
            if comment:
                if '*/' in line:
                    comment = False
                continue

            if line == '}':
                # Close the current table and pop back to its parent.
                stack.pop()
                cur = stack[-1]
                continue
            if line.endswith('{'):
                # Open a new nested table named by everything before '{'.
                obj = {}
                cur[line[:-1]] = obj
                cur = obj
                stack.append(obj)
                continue

            m = re.match('(.*){"(.*)"}', line)
            if not m:
                print(line)  # unparseable line: report and skip
            else:
                cur[m.group(1)] = m.group(2)

    return root
|
||||||
|
|
||||||
|
def remove_sections(root):
    """Strip the display-name sections Aegisub never reads from each
    locale table in `root` (mutates `root` in place)."""
    # Everything except Languages/Scripts/Countries is unused.
    unused = ('Keys', 'LanguagesShort', 'Types', 'Variants',
              'codePatterns', 'localeDisplayPattern', 'CountriesShort',
              'Scripts%stand-alone')
    for child in root.values():  # values() works on Python 2 and 3
        for section in unused:
            child.pop(section, None)
|
||||||
|
|
||||||
|
def remove_languages(root):
    """Drop every language display name except each locale's autonym
    (the locale's own language named in itself); mutates `root`."""
    for lang, child in root.items():  # items() works on Python 2 and 3
        # We only care about a language's name in that language
        lang = lang.split('_')[0]
        trimmed = {}
        name = child.get('Languages', {}).get(lang)
        if name:
            trimmed[lang] = name
        # Locales with no autonym end up with an empty Languages table,
        # matching the original behavior.
        child['Languages'] = trimmed
|
||||||
|
|
||||||
|
# Scripts which are actually used by stuff
SCRIPTS = ['Cyrl', 'Latn', 'Arab', 'Vaii', 'Hans', 'Hant']

def remove_scripts(root):
    """Drop script display names for scripts not in SCRIPTS; mutates
    `root` in place.  Locales with no Scripts table are untouched."""
    # The locale key itself is unused here, so iterate values only.
    for child in root.values():  # values() works on Python 2 and 3
        scripts = child.get('Scripts')
        if not scripts:
            continue
        child['Scripts'] = {s: scripts[s] for s in SCRIPTS if scripts.get(s)}
|
||||||
|
|
||||||
|
def write_dict(name, value, out, indent):
    """Recursively write `value` as an ICU resource table named `name`.

    `out` must be a binary stream; all text is encoded as UTF-8 before
    writing (the original mixed unencoded and encoded writes, which
    breaks on Python 3).  Empty tables are omitted entirely.  Keys are
    emitted in sorted order for deterministic output.
    """
    if not value:
        return

    child_indent = indent + '    '

    out.write((indent + name + '{\n').encode('utf-8'))
    for k in sorted(value.keys()):
        v = value[k]
        if isinstance(v, dict):
            write_dict(k, v, out, child_indent)
        else:
            out.write(('%s%s{"%s"}\n' % (child_indent, k, v)).encode('utf-8'))
    out.write((indent + '}\n').encode('utf-8'))
|
||||||
|
|
||||||
|
def write_file(root, filename):
    """Serialize a parsed resource tree back out to `filename`.

    Opens the file in binary mode so the UTF-8 bytes produced by
    write_dict land untouched (and so this works on Python 3, where the
    original iteritems()/text-mode combination fails).
    """
    with open(filename, 'wb') as f:
        for k, v in root.items():  # items() works on Python 2 and 3
            write_dict(k, v, f, '')
|
||||||
|
|
||||||
|
def minify_lang(filename):
    """Shrink a lang/*.txt display-name file, in place, down to the data
    Aegisub actually uses."""
    table = parse_txt(filename)
    remove_sections(table)
    remove_languages(table)
    remove_scripts(table)
    write_file(table, filename)
|
||||||
|
|
||||||
|
# Minify every resource file in the lang/ directory.
for name in os.listdir('lang'):
    if name.endswith('.txt'):
        minify_lang('lang/' + name)
|
||||||
|
|
||||||
|
# gather information about which language+region combinations actually exist,
# so that we can drop all others
def gather_regions():
    """Return a dict mapping language code -> list of region codes that
    are actually paired with that language.

    Starts from a hard-coded table and extends it with every
    ``lang_REGION.txt`` file found in the region/ directory of the
    current working directory.
    """
    # NOTE(review): duplicated entries (e.g. 'az', 'uz') appear harmless
    # -- downstream lookups tolerate repeats -- so they are kept as-is.
    langs = {
        'af': ['ZA'],
        'am': ['ET'],
        'ar': ['AE', 'BH', 'DZ', 'EG', 'IQ', 'JO', 'KW', 'LB', 'LY', 'MA', 'OM', 'QA', 'SA', 'SY', 'TN', 'YE'],
        'arn': ['CL'],
        'as': ['IN'],
        'az': ['AZ', 'AZ'],
        'ba': ['RU'],
        'be': ['BY'],
        'bg': ['BG'],
        'bn': ['BD', 'IN'],
        'bo': ['CN'],
        'br': ['FR'],
        'bs': ['BA', 'BA'],
        'ca': ['ES'],
        'co': ['FR'],
        'cs': ['CZ'],
        'cy': ['GB'],
        'da': ['DK'],
        'de': ['AT', 'CH', 'DE', 'LI', 'LU'],
        'div': ['MV'],
        'el': ['GR'],
        'en': ['029', 'AU', 'BZ', 'CA', 'GB', 'IE', 'IN', 'JM', 'MY', 'NZ', 'PH', 'SG', 'TT', 'US', 'ZA', 'ZW'],
        'es': ['AR', 'BO', 'CL', 'CO', 'CR', 'DO', 'EC', 'ES', 'GT', 'HN', 'MX', 'NI', 'PA', 'PE', 'PR', 'PY', 'SV', 'US', 'UY', 'VE'],
        'et': ['EE'],
        'eu': ['ES'],
        'fa': ['IR'],
        'fi': ['FI'],
        'fil': ['PH'],
        'fo': ['FO'],
        'fr': ['BE', 'CA', 'CH', 'FR', 'LU', 'MC'],
        'fy': ['NL'],
        'ga': ['IE'],
        'gl': ['ES'],
        'gsw': ['FR'],
        'gu': ['IN'],
        'ha': ['NG'],
        'he': ['IL'],
        'hi': ['IN'],
        'hr': ['BA', 'HR'],
        'hu': ['HU'],
        'hy': ['AM'],
        'id': ['ID'],
        'ig': ['NG'],
        'ii': ['CN'],
        'is': ['IS'],
        'it': ['CH', 'IT'],
        'iu': ['CA', 'CA'],
        'ja': ['JP'],
        'ka': ['GE'],
        'kk': ['KZ'],
        'kl': ['GL'],
        'km': ['KH'],
        'kn': ['IN'],
        'ko': ['KR'],
        'kok': ['IN'],
        'ky': ['KG'],
        'lb': ['LU'],
        'lo': ['LA'],
        'lt': ['LT'],
        'lv': ['LV'],
        'mi': ['NZ'],
        'mk': ['MK'],
        'ml': ['IN'],
        'mn': ['CN', 'MN'],
        'moh': ['CA'],
        'mr': ['IN'],
        'ms': ['BN', 'MY'],
        'mt': ['MT'],
        'nb': ['NO'],
        'ne': ['NP'],
        'nl': ['BE', 'NL'],
        'nn': ['NO'],
        'nso': ['ZA'],
        'oc': ['FR'],
        'or': ['IN'],
        'pa': ['IN'],
        'pl': ['PL'],
        'prs': ['AF'],
        'ps': ['AF'],
        'pt': ['BR', 'PT'],
        'qut': ['GT'],
        'quz': ['BO', 'EC', 'PE'],
        'rm': ['CH'],
        'ro': ['RO'],
        'ru': ['RU'],
        'rw': ['RW'],
        'sa': ['IN'],
        'sah': ['RU'],
        'se': ['FI', 'NO', 'SE'],
        'si': ['LK'],
        'sk': ['SK'],
        'sl': ['SI'],
        'sma': ['NO', 'SE'],
        'smj': ['NO', 'SE'],
        'smn': ['FI'],
        'sms': ['FI'],
        'sq': ['AL'],
        'sr': ['BA', 'BA', 'SP', 'YU'],
        'sv': ['FI', 'SE'],
        'sw': ['KE', 'TZ'],
        'syr': ['SY'],
        'ta': ['IN'],
        'te': ['IN'],
        'tg': ['TJ'],
        'th': ['TH'],
        'tk': ['TM'],
        'tn': ['ZA'],
        'tr': ['TR'],
        'tt': ['RU'],
        'tzm': ['DZ'],
        'ug': ['CN'],
        'uk': ['UA'],
        'ur': ['PK'],
        'uz': ['UZ', 'UZ'],
        'vi': ['VN'],
        'wee': ['DE'],
        'wen': ['DE'],
        'wo': ['SN'],
        'xh': ['ZA'],
        'yo': ['NG'],
        'zh': ['CN', 'HK', 'MO', 'SG', 'TW'],
        'zu': ['ZA']
    }
    for name in os.listdir('region'):
        if not name.endswith('.txt'):
            continue
        parts = name[:-4].split('_')
        if len(parts) == 1:
            continue
        # setdefault replaces the manual membership test; `parts[1:]` may
        # include script subtags as well as regions, as in the original.
        langs.setdefault(parts[0], []).extend(parts[1:])
    return langs
|
||||||
|
|
||||||
|
# Computed once at import time: language code -> region codes whose
# display names we keep (consumed by remove_countries below).
REGIONS = gather_regions()
|
||||||
|
def remove_countries(root, regions=None):
    """Drop country display names for regions where each language is not
    actually used; mutates `root` in place.

    `regions` maps language code -> list of kept region codes.  It
    defaults to the module-level REGIONS table; the parameter is new and
    backward-compatible, and exists so the function can be exercised
    without scanning the region/ directory at import time.
    """
    if regions is None:
        regions = REGIONS
    for lang, child in root.items():  # items() works on Python 2 and 3
        names = child.get('Countries', {})
        if not names:
            continue

        # We only care about the names for regions in the languages used in
        # those regions
        keep = regions.get(lang.split('_')[0])
        if not keep:
            del child['Countries']
            continue

        child['Countries'] = {r: names[r] for r in keep if names.get(r)}
|
||||||
|
|
||||||
|
def minify_region(filename):
    """Shrink a region/*.txt display-name file, in place, down to the
    data Aegisub actually uses."""
    table = parse_txt(filename)
    remove_sections(table)
    remove_countries(table)
    write_file(table, filename)
|
||||||
|
|
||||||
|
# Minify every resource file in the region/ directory.
for name in os.listdir('region'):
    if name.endswith('.txt'):
        minify_region('region/' + name)
|
||||||
|
|
Loading…
Reference in a new issue