push
This commit is contained in:
174
scripts/gen_decomp.py
Executable file
174
scripts/gen_decomp.py
Executable file
@@ -0,0 +1,174 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2010-2013 Various Authors
|
||||
# Copyright 2010 Johannes Weißl
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU General Public License as
|
||||
# published by the Free Software Foundation; either version 2 of the
|
||||
# License, or (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful, but
|
||||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import sys
|
||||
import re
|
||||
import os.path
|
||||
import urllib2
|
||||
from optparse import OptionParser
|
||||
|
||||
# Some letters don't have a decomposition, but can't be composed on all
|
||||
# keyboards. This dictionary maps them to an ASCII character which
|
||||
# *looks* similar.
|
||||
special_decompositions = {
|
||||
u'Æ': u'A',
|
||||
u'Ð': u'D',
|
||||
u'×': u'x',
|
||||
u'Ø': u'O',
|
||||
u'Þ': u'P',
|
||||
u'ß': u'B',
|
||||
u'æ': u'a',
|
||||
u'ð': u'd',
|
||||
u'ø': u'o',
|
||||
u'þ': u'p',
|
||||
# Various punctation/quotation characters
|
||||
u'‐': u'-',
|
||||
u'‒': u'-',
|
||||
u'–': u'-',
|
||||
u'−': u'-',
|
||||
u'—': u'-',
|
||||
u'―': u'-',
|
||||
u'‘': u"'",
|
||||
u'’': u"'",
|
||||
u'′': u"'",
|
||||
u'“': u'"',
|
||||
u'”': u'"',
|
||||
u'″': u'"',
|
||||
u'〃': u'"',
|
||||
u'…': u'.',
|
||||
}
|
||||
|
||||
def parse_unidata(f):
|
||||
u = {}
|
||||
for line in f:
|
||||
d = line.rstrip('\n').split(';')
|
||||
cp = int(d[0], 16)
|
||||
u[cp] = {}
|
||||
u[cp]['name'] = d[1]
|
||||
decomp = d[5]
|
||||
if decomp:
|
||||
m = re.match(r'<.*> (.*)', decomp)
|
||||
u[cp]['compat'] = bool(m)
|
||||
if m:
|
||||
decomp = m.group(1)
|
||||
u[cp]['decomp'] = [int(x, 16) for x in decomp.split(' ')]
|
||||
else:
|
||||
u[cp]['decomp'] = []
|
||||
return u
|
||||
|
||||
def unidata_expand_decomp(unidata):
|
||||
def recurse(k):
|
||||
if k not in unidata or not unidata[k]['decomp']:
|
||||
return [k]
|
||||
exp = []
|
||||
for d in unidata[k]['decomp']:
|
||||
exp += recurse(d)
|
||||
return exp
|
||||
for k in unidata.keys():
|
||||
exp = recurse(k)
|
||||
if exp != [k]:
|
||||
unidata[k]['decomp'] = exp
|
||||
|
||||
def unidata_add_mapping(unidata, mapping):
|
||||
for k, v in mapping.items():
|
||||
unidata[ord(k)]['decomp'] = [ord(v)]
|
||||
|
||||
def is_diacritical_mark(c):
|
||||
return c >= 0x0300 and c <= 0x036F
|
||||
|
||||
def filter_unidata(unidata, include):
|
||||
for k, v in unidata.items():
|
||||
if k in include:
|
||||
continue
|
||||
if not v['decomp']:
|
||||
del unidata[k]
|
||||
continue
|
||||
b = v['decomp'][0]
|
||||
if unichr(b) == u' ' or is_diacritical_mark(b):
|
||||
del unidata[k]
|
||||
continue
|
||||
has_accents = False
|
||||
for d in v['decomp'][1:]:
|
||||
if is_diacritical_mark(d):
|
||||
has_accents = True
|
||||
break
|
||||
if not has_accents:
|
||||
del unidata[k]
|
||||
|
||||
def output(unidata, f):
|
||||
buf = '''/* This file is automatically generated. DO NOT EDIT!
|
||||
Instead, edit %s and re-run. */
|
||||
|
||||
static struct {
|
||||
uchar composed;
|
||||
uchar base;
|
||||
} unidecomp_map[] = {
|
||||
''' % os.path.basename(sys.argv[0])
|
||||
for k in sorted(unidata.keys()):
|
||||
b = unidata[k]['decomp'][0]
|
||||
buf += ('\t{ %#6x, %#6x },\t// %s -> %s,\t%s' % \
|
||||
(k, b,
|
||||
unichr(k).encode('utf-8'),
|
||||
unichr(b).encode('utf-8'),
|
||||
', '.join([' %s (%x)' %
|
||||
(unichr(d).encode('utf-8'), d)
|
||||
for d in unidata[k]['decomp'][1:]]))).rstrip() + '\n'
|
||||
buf += '};'
|
||||
f.write(buf+'\n')
|
||||
|
||||
def main(argv=None):
|
||||
|
||||
if not argv:
|
||||
argv = sys.argv
|
||||
|
||||
parser = OptionParser(usage='usage: %prog [-w] [-o unidecomp.h]')
|
||||
parser.add_option('-w', '--wget', action='store_true',
|
||||
help='get unicode data from unicode.org')
|
||||
parser.add_option('-o', '--output',
|
||||
help='output file, default stdout')
|
||||
(options, args) = parser.parse_args(argv[1:])
|
||||
|
||||
urlbase = 'http://unicode.org/Public/UNIDATA/'
|
||||
unidata_filename = 'UnicodeData.txt'
|
||||
|
||||
if not os.path.exists(unidata_filename) and not options.wget:
|
||||
parser.error('''need %s in the current directory, download
|
||||
from unicode.org or use `--wget' option.''' % unidata_filename)
|
||||
|
||||
if options.wget:
|
||||
unidata_file = urllib2.urlopen(urlbase+unidata_filename)
|
||||
else:
|
||||
unidata_file = open(unidata_filename, 'rb')
|
||||
|
||||
unidata = parse_unidata(unidata_file)
|
||||
unidata_file.close()
|
||||
|
||||
unidata_add_mapping(unidata, special_decompositions)
|
||||
unidata_expand_decomp(unidata)
|
||||
filter_unidata(unidata, [ord(x) for x in special_decompositions])
|
||||
|
||||
outfile = sys.stdout
|
||||
if options.output:
|
||||
outfile = open(options.output, 'wb')
|
||||
output(unidata, outfile)
|
||||
if options.output:
|
||||
outfile.close()
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
Reference in New Issue
Block a user