push

2026-03-29 14:01:52 +03:00
commit 0611279128
210 changed files with 60454 additions and 0 deletions
--- a/scripts/gen_decomp.py
+++ b/scripts/gen_decomp.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright 2010-2013 Various Authors
+# Copyright 2010 Johannes Weißl
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+import sys
+import re
+import os.path
+import urllib2
+from optparse import OptionParser
+
+# Some letters don't have a decomposition, but can't be composed on all
+# keyboards. This dictionary maps them to an ASCII character which
+# *looks* similar.
+# main() installs every entry as a one-element decomposition and passes the
+# keys to filter_unidata() as its include list, so these characters are
+# always part of the generated table.
+special_decompositions = {
+    u'Æ': u'A',
+    u'Ð': u'D',
+    u'×': u'x',
+    u'Ø': u'O',
+    u'Þ': u'P',
+    u'ß': u'B',
+    u'æ': u'a',
+    u'ð': u'd',
+    u'ø': u'o',
+    u'þ': u'p',
+# Various punctuation/quotation characters
+    u'‐': u'-',
+    u'‒': u'-',
+    u'–': u'-',
+    u'−': u'-',
+    u'—': u'-',
+    u'―': u'-',
+    u'‘': u"'",
+    u'’': u"'",
+    u'′': u"'",
+    u'“': u'"',
+    u'”': u'"',
+    u'″': u'"',
+    u'〃': u'"',
+    u'…': u'.',
+}
+
+def parse_unidata(f):
+    """Parse UnicodeData.txt-style records into a dict keyed by code point.
+
+    Each non-blank line of ``f`` is split on ';'.  Field 0 is the code point
+    in hexadecimal, field 1 the character name and field 5 the decomposition
+    mapping, optionally preceded by a ``<tag>`` marker.
+
+    Args:
+        f: iterable of text lines (an open file or a list of strings).
+
+    Returns:
+        dict mapping each integer code point to a dict with the keys
+        'name' (field 1), 'decomp' (list of integer code points, empty when
+        the record has no decomposition) and 'compat' (True only when the
+        decomposition carried a ``<tag>`` prefix).
+
+    Raises:
+        ValueError: if a code point field is not valid hexadecimal.
+        IndexError: if a non-blank line has fewer than six fields.
+    """
+    # Compile once instead of looking the pattern up for every record.
+    tagged = re.compile(r'<.*> (.*)')
+    u = {}
+    for line in f:
+        line = line.rstrip('\r\n')
+        if not line.strip():
+            # Tolerate blank lines (e.g. a stray trailing newline) instead
+            # of failing on int('', 16).
+            continue
+        d = line.split(';')
+        cp = int(d[0], 16)
+        # Every entry gets the same three keys, with or without a
+        # decomposition, so consumers never hit a missing 'compat'.
+        entry = {'name': d[1], 'compat': False, 'decomp': []}
+        decomp = d[5]
+        if decomp:
+            m = tagged.match(decomp)
+            if m:
+                entry['compat'] = True
+                decomp = m.group(1)
+            # split() without an argument ignores repeated spaces.
+            entry['decomp'] = [int(x, 16) for x in decomp.split()]
+        u[cp] = entry
+    return u
+
+def unidata_expand_decomp(unidata):
+    """Replace every decomposition by its fully expanded form, in place.
+
+    Each element of an entry's 'decomp' list that itself has a non-empty
+    decomposition is substituted by that decomposition, recursively, until
+    only code points without a decomposition (or unknown to ``unidata``)
+    remain.  Entries whose expansion is just the code point itself are left
+    untouched.
+    """
+    def flatten(cp):
+        # A code point that is unknown or has no decomposition is a leaf.
+        entry = unidata.get(cp)
+        if entry is None or not entry['decomp']:
+            return [cp]
+        flat = []
+        for part in entry['decomp']:
+            flat.extend(flatten(part))
+        return flat
+
+    for cp in list(unidata):
+        flat = flatten(cp)
+        if flat != [cp]:
+            unidata[cp]['decomp'] = flat
+
+def unidata_add_mapping(unidata, mapping):
+    """Override decompositions with a hand-written character mapping.
+
+    For every ``char -> replacement`` pair in ``mapping`` the existing
+    entry for ``ord(char)`` gets the single-element decomposition
+    ``[ord(replacement)]``.  The entry must already exist in ``unidata``;
+    otherwise the lookup raises KeyError.
+    """
+    for char in mapping:
+        replacement = mapping[char]
+        entry = unidata[ord(char)]
+        entry['decomp'] = [ord(replacement)]
+
+def is_diacritical_mark(c):
+    """Return True if code point ``c`` lies in the range U+0300..U+036F."""
+    return 0x0300 <= c <= 0x036F
+
+def filter_unidata(unidata, include):
+    """Remove, in place, every entry that is not "base character + accents".
+
+    An entry is kept when its code point is listed in ``include``, or when
+    its decomposition starts with something other than a space (U+0020) or
+    a combining diacritical mark and contains at least one combining
+    diacritical mark after that first element.  Everything else is deleted
+    from ``unidata``.
+
+    Args:
+        unidata: dict of code point -> entry dict with a 'decomp' list.
+        include: iterable of code points that must never be removed.
+    """
+    # Build the lookup set once; callers typically pass a list.
+    keep = set(include)
+    # Iterate over a snapshot of the keys: entries are deleted while
+    # looping, which must not depend on items() returning a copy.
+    for cp in list(unidata):
+        if cp in keep:
+            continue
+        decomp = unidata[cp]['decomp']
+        if not decomp:
+            del unidata[cp]
+            continue
+        base = decomp[0]
+        # Compare the code point directly; unichr() would raise ValueError
+        # for values above U+FFFF on narrow Python 2 builds.
+        if base == 0x20 or is_diacritical_mark(base):
+            del unidata[cp]
+            continue
+        if not any(is_diacritical_mark(d) for d in decomp[1:]):
+            del unidata[cp]
+
+def output(unidata, f):
+    """Write the decomposition table as a C array definition to ``f``.
+
+    The text starts with a generated-file notice naming this script
+    (basename of sys.argv[0]), followed by one ``{ composed, base }`` row
+    per code point in ascending order.  Each row ends in a comment showing
+    the UTF-8 encoded composed and base characters and the remaining
+    decomposition elements.  Everything is written with a single
+    ``f.write`` call.
+    """
+    def utf8(cp):
+        return unichr(cp).encode('utf-8')
+
+    header = '''/* This file is automatically generated. DO NOT EDIT!
+Instead, edit %s and re-run. */
+
+static struct {
+	uchar composed;
+	uchar base;
+} unidecomp_map[] = {
+''' % os.path.basename(sys.argv[0])
+
+    rows = []
+    for cp in sorted(unidata):
+        decomp = unidata[cp]['decomp']
+        base = decomp[0]
+        composed_char = utf8(cp)
+        base_char = utf8(base)
+        marks = ', '.join([' %s (%x)' % (utf8(d), d) for d in decomp[1:]])
+        row = '\t{ %#6x, %#6x },\t// %s -> %s,\t%s' % \
+            (cp, base, composed_char, base_char, marks)
+        # Entries without extra elements would otherwise end in ",\t".
+        rows.append(row.rstrip() + '\n')
+
+    f.write(header + ''.join(rows) + '};' + '\n')
+
+def main(argv=None):
+    """Command-line entry point: generate the decomposition table.
+
+    Reads UnicodeData.txt from the current directory, or downloads it from
+    unicode.org when -w/--wget is given, applies the special mappings,
+    expands and filters the decompositions, and writes the resulting C
+    table to the -o/--output file (standard output by default).
+
+    Args:
+        argv: full argument vector including the program name; falls back
+            to sys.argv when None or empty.
+    """
+
+    if not argv:
+        argv = sys.argv
+
+    parser = OptionParser(usage='usage: %prog [-w] [-o unidecomp.h]')
+    parser.add_option('-w', '--wget', action='store_true',
+        help='get unicode data from unicode.org')
+    parser.add_option('-o', '--output',
+        help='output file, default stdout')
+    (options, args) = parser.parse_args(argv[1:])
+
+    urlbase = 'http://unicode.org/Public/UNIDATA/'
+    unidata_filename = 'UnicodeData.txt'
+
+    if not os.path.exists(unidata_filename) and not options.wget:
+        parser.error('''need %s in the current directory, download
+from unicode.org or use `--wget' option.''' % unidata_filename)
+
+    if options.wget:
+        unidata_file = urllib2.urlopen(urlbase+unidata_filename)
+    else:
+        unidata_file = open(unidata_filename, 'rb')
+
+    # Close the input even when parsing fails; try/finally is used because
+    # the urllib2 response object is not a context manager.
+    try:
+        unidata = parse_unidata(unidata_file)
+    finally:
+        unidata_file.close()
+
+    unidata_add_mapping(unidata, special_decompositions)
+    unidata_expand_decomp(unidata)
+    filter_unidata(unidata, [ord(x) for x in special_decompositions])
+
+    if options.output:
+        outfile = open(options.output, 'wb')
+        # Make sure the output file is closed if writing raises.
+        try:
+            output(unidata, outfile)
+        finally:
+            outfile.close()
+    else:
+        output(unidata, sys.stdout)
+
+# Run only when executed as a script.  main() has no return statement, so
+# sys.exit() receives None and a normal run exits with status 0.
+if __name__ == '__main__':
+    sys.exit(main())