blob: b3e2aab466d15bce8f138d6982035606be7a405d [file] [log] [blame]
;;; Copyright (C) 2023 Free Software Foundation, Inc.
;;; This file is part of the GNU LIBICONV Tools.
;;;
;;; This program is free software: you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 3 of the License, or
;;; (at your option) any later version.
;;;
;;; This program is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with this program; if not, see <https://www.gnu.org/licenses/>.
;;; Transform UnicodeData.txt to a part of translit.def.
;;; translit.def consists of lines of the form
;;; Unicode <tab> utf-8 replacement <tab> # comment
;;; This program produces the lines for the code points >= U+3200
;;; and reasonable approximations for the code points < U+3200.
(defun starts-with (s prefix)
(locally (declare (compile)))
(and (>= (length s) (length prefix))
(equal (substring s 0 (length prefix)) prefix)))
(defun generate-part (infilename outfilename)
(locally (declare (compile)))
(with-open-file (in infilename)
(with-open-file (out outfilename :direction :output
:external-format CHARSET:UTF-8)
(loop
(let ((line (read-line in nil nil)))
(unless line (return))
(let* ((semicolon1 (position #\; line))
(semicolon2 (position #\; line :start (+ semicolon1 1)))
(semicolon3 (position #\; line :start (+ semicolon2 1)))
(semicolon4 (position #\; line :start (+ semicolon3 1)))
(semicolon5 (position #\; line :start (+ semicolon4 1)))
(semicolon6 (position #\; line :start (+ semicolon5 1)))
(code (let ((*read-base* 16)) (read-from-string (substring line 0 semicolon1))))
(name (substring line (+ semicolon1 1) semicolon2))
(category (substring line (+ semicolon2 1) semicolon3))
(combining (substring line (+ semicolon3 1) semicolon4))
(bidi (substring line (+ semicolon4 1) semicolon5))
(decomposition (substring line (+ semicolon5 1) semicolon6)))
(declare (ignore category combining bidi))
(when (or (starts-with decomposition "<circle> ")
(starts-with decomposition "<compat> ")
(starts-with decomposition "<font> ")
(starts-with decomposition "<narrow> ")
(starts-with decomposition "<small> ")
(starts-with decomposition "<square> ")
(starts-with decomposition "<wide> ")
(and (starts-with name "CJK COMPATIBILITY IDEOGRAPH-")
(> (length decomposition) 0))
)
(let ((replacement '()))
(let ((i (+ (or (position #\> decomposition) -2) 2)))
(loop
(let* ((space (position #\Space decomposition :start i))
(hexcode (substring decomposition i (or space (length decomposition)))))
(push
(int-char (let ((*read-base* 16)) (read-from-string hexcode)))
replacement)
(unless space
(return))
(setq i (+ space 1)))))
(setq replacement (nreverse replacement))
(when (starts-with decomposition "<circle> ")
(setq replacement (concatenate 'list '(#\() replacement '(#\)))))
(setq replacement (mapcan (lambda (c)
(cond ((eql c #\DIVISION_SLASH) (list #\/))
((eql c #\SUPERSCRIPT_TWO) (list #\^ #\2))
((eql c #\SUPERSCRIPT_THREE) (list #\^ #\3))
((eql c #\SCRIPT_SMALL_L) (list #\l))
(t (list c))))
replacement))
(format out "~4,'0X~A~{~A~}~A# ~A~%" code #\Tab replacement #\Tab name)
) )
) ) ) ) ) )
; (generate-part "/home/bruno/notes/UnicodeData-15.0.0.txt" "translit-part-15.0.0.def")