cmus/scripts/gen_decomp.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2010-2013 Various Authors
# Copyright 2010 Johannes Weißl
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

import sys
import re
import os.path
import urllib2
from optparse import OptionParser

# Some letters don't have a decomposition, but can't be composed on all
# keyboards. This dictionary maps them to an ASCII character which
# *looks* similar.
special_decompositions = {
    u'Æ': u'A',
    u'Ð': u'D',
    u'×': u'x',
    u'Ø': u'O',
    u'Þ': u'P',
    u'ß': u'B',
    u'æ': u'a',
    u'ð': u'd',
    u'ø': u'o',
    u'þ': u'p',
# Various punctation/quotation characters
    u'‐': u'-',
    u'‒': u'-',
    u'–': u'-',
    u'−': u'-',
    u'—': u'-',
    u'―': u'-',
    u'‘': u"'",
    u'’': u"'",
    u'′': u"'",
    u'“': u'"',
    u'”': u'"',
    u'″': u'"',
    u'〃': u'"',
    u'…': u'.',
}

def parse_unidata(f):
    u = {}
    for line in f:
        d = line.rstrip('\n').split(';')
        cp = int(d[0], 16)
        u[cp] = {}
        u[cp]['name'] = d[1]
        decomp = d[5]
        if decomp:
            m = re.match(r'<.*> (.*)', decomp)
            u[cp]['compat'] = bool(m)
            if m:
                decomp = m.group(1)
            u[cp]['decomp'] = [int(x, 16) for x in decomp.split(' ')]
        else:
            u[cp]['decomp'] = []
    return u

def unidata_expand_decomp(unidata):
    def recurse(k):
        if k not in unidata or not unidata[k]['decomp']:
            return [k]
        exp = []
        for d in unidata[k]['decomp']:
            exp += recurse(d)
        return exp
    for k in unidata.keys():
        exp = recurse(k)
        if exp != [k]:
            unidata[k]['decomp'] = exp

def unidata_add_mapping(unidata, mapping):
    for k, v in mapping.items():
        unidata[ord(k)]['decomp'] = [ord(v)]

def is_diacritical_mark(c):
    return c >= 0x0300 and c <= 0x036F

def filter_unidata(unidata, include):
    for k, v in unidata.items():
        if k in include:
            continue
        if not v['decomp']:
            del unidata[k]
            continue
        b = v['decomp'][0]
        if unichr(b) == u' ' or is_diacritical_mark(b):
            del unidata[k]
            continue
        has_accents = False
        for d in v['decomp'][1:]:
            if is_diacritical_mark(d):
                has_accents = True
                break
        if not has_accents:
            del unidata[k]

def output(unidata, f):
    buf = '''/* This file is automatically generated. DO NOT EDIT!
Instead, edit %s and re-run. */

static struct {
	uchar composed;
	uchar base;
} unidecomp_map[] = {
''' % os.path.basename(sys.argv[0])
    for k in sorted(unidata.keys()):
        b = unidata[k]['decomp'][0]
        buf += ('\t{ %#6x, %#6x },\t// %s -> %s,\t%s' % \
            (k, b,
            unichr(k).encode('utf-8'),
            unichr(b).encode('utf-8'),
            ', '.join([' %s (%x)' %
                (unichr(d).encode('utf-8'), d)
                    for d in unidata[k]['decomp'][1:]]))).rstrip()  + '\n'
    buf += '};'
    f.write(buf+'\n')

def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = OptionParser(usage='usage: %prog [-w] [-o unidecomp.h]')
    parser.add_option('-w', '--wget', action='store_true',
        help='get unicode data from unicode.org')
    parser.add_option('-o', '--output',
        help='output file, default stdout')
    (options, args) = parser.parse_args(argv[1:])

    urlbase = 'http://unicode.org/Public/UNIDATA/'
    unidata_filename = 'UnicodeData.txt'

    if not os.path.exists(unidata_filename) and not options.wget:
        parser.error('''need %s in the current directory, download
from unicode.org or use `--wget' option.''' % unidata_filename)

    if options.wget:
        unidata_file = urllib2.urlopen(urlbase+unidata_filename)
    else:
        unidata_file = open(unidata_filename, 'rb')

    unidata = parse_unidata(unidata_file)
    unidata_file.close()

    unidata_add_mapping(unidata, special_decompositions)
    unidata_expand_decomp(unidata)
    filter_unidata(unidata, [ord(x) for x in special_decompositions])

    outfile = sys.stdout
    if options.output:
        outfile = open(options.output, 'wb')
    output(unidata, outfile)
    if options.output:
        outfile.close()

if __name__ == '__main__':
    sys.exit(main())