#-------------------------------------------------------------------- | |
# Copyright (c) 1999-2001, International Business Machines | |
# Corporation and others. All Rights Reserved. | |
#-------------------------------------------------------------------- | |
# Date: Tue Jan 23 2001 | |
#-------------------------------------------------------------------- | |
# Hiragana-Katana | |
# This is largely a one-to-one mapping, but it has a | |
# few kinks: | |
# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no | |
# Hiragana equivalents. We use Hiragana wa/wi/we/wo | |
# (308F-3092) with a voicing mark (3099), which is | |
# semantically equivalent. However, this is a non- | |
# roundtripping transformation. | |
# 2. The Katakana small ka/ke (30F5,30F6) have no | |
# Hiragana equiavlents. We convert them to normal | |
# Hiragana ka/ke (304B,3051). This is a one-way | |
# information-losing transformation and precludes | |
# round-tripping of 30F5 and 30F6. | |
# 3. The combining marks 3099-309C are in the Hiragana | |
# block, but they apply to Katakana as well, so we | |
# leave them untouched. | |
# 4. The Katakana prolonged sound mark 30FC doubles the | |
# preceding vowel. This is a one-way information- | |
# losing transformation from Katakana to Hiragana. | |
# 5. The Katakana middle dot separates words in foreign | |
# expressions; we leave this unmodified. | |
# The above points preclude successful round-trip | |
# transformations of arbitrary input text. However, | |
# they provide naturalistic results that should conform | |
# to user expectations. | |
# Combining equivalents va/vi/ve/vo.
# Katakana va/vi/ve/vo (30F7-30FA) have no precomposed Hiragana
# counterparts, so each is paired with Hiragana wa/wi/we/wo (308F-3092)
# followed by the combining voicing mark (3099).
# These two-character rules are listed before the plain wa/wi/we/wo
# rules so that the longer sequence is tried first.
わ゙ <> ヷ;
ゐ゙ <> ヸ;
ゑ゙ <> ヹ;
を゙ <> ヺ;
# One-to-one mappings, main block.
#   3041:3094 <> 30A1:30F4   (kana letters)
#   309D,E    <> 30FD,E      (iteration marks)
# Each rule is bidirectional (<>): Hiragana on the left, the Katakana
# letter at the same offset in its block on the right.

# Vowels (small and normal forms)
ぁ <> ァ;
あ <> ア;
ぃ <> ィ;
い <> イ;
ぅ <> ゥ;
う <> ウ;
ぇ <> ェ;
え <> エ;
ぉ <> ォ;
お <> オ;
# K / G row
か <> カ;
が <> ガ;
き <> キ;
ぎ <> ギ;
く <> ク;
ぐ <> グ;
け <> ケ;
げ <> ゲ;
こ <> コ;
ご <> ゴ;
# S / Z row
さ <> サ;
ざ <> ザ;
し <> シ;
じ <> ジ;
す <> ス;
ず <> ズ;
せ <> セ;
ぜ <> ゼ;
そ <> ソ;
ぞ <> ゾ;
# T / D row (including small tsu)
た <> タ;
だ <> ダ;
ち <> チ;
ぢ <> ヂ;
っ <> ッ;
つ <> ツ;
づ <> ヅ;
て <> テ;
で <> デ;
と <> ト;
ど <> ド;
# N row
な <> ナ;
に <> ニ;
ぬ <> ヌ;
ね <> ネ;
の <> ノ;
# H / B / P row
は <> ハ;
ば <> バ;
ぱ <> パ;
ひ <> ヒ;
び <> ビ;
ぴ <> ピ;
ふ <> フ;
ぶ <> ブ;
ぷ <> プ;
へ <> ヘ;
べ <> ベ;
ぺ <> ペ;
ほ <> ホ;
ぼ <> ボ;
ぽ <> ポ;
# M row
ま <> マ;
み <> ミ;
む <> ム;
め <> メ;
も <> モ;
# Y row (small and normal forms)
ゃ <> ャ;
や <> ヤ;
ゅ <> ュ;
ゆ <> ユ;
ょ <> ョ;
よ <> ヨ;
# R row
ら <> ラ;
り <> リ;
る <> ル;
れ <> レ;
ろ <> ロ;
# W row, syllabic n, and vu
ゎ <> ヮ;
わ <> ワ;
ゐ <> ヰ;
ゑ <> ヱ;
を <> ヲ;
ん <> ン;
ゔ <> ヴ;
# Iteration marks
ゝ <> ヽ;
ゞ <> ヾ;
# One-way Katakana-to-Hiragana transform of the small Katakana ka/ke
# (30F5, 30F6) to the normal Hiragana ka/ke (304B, 3051).
# The "<" operator makes these rules apply only in the
# Katakana-to-Hiragana direction; the distinction between small and
# normal size is lost, so 30F5 and 30F6 do not round-trip.
か < ヵ;
け < ヶ;
# Katakana followed by a prolonged sound mark 30FC has | |
# its final vowel doubled. This is a Katakana-Hiragana | |
# one-way information-losing transformation. We | |
# include the small Katakana (e.g., small A 3041) and | |
# do not distinguish them from their large | |
# counterparts. It doesn't make sense to double a | |
# small counterpart vowel as a small Hiragana vowel, so | |
# we don't do so. In natural text this should never | |
# occur anyway. If a 30FC is seen without a preceding | |
# vowel sound (e.g., after n 30F3) we do not change it. | |
# The prolonged sound mark is written literally as ー (30FC) in the
# rules that use these sets; the variable below is left disabled.
### $long = ー;
# Vowel classes used to expand the prolonged sound mark.  The members
# are Hiragana, not Katakana as might be expected, since by the time we
# get to the 30FC, the preceding character will have already been
# transformed to Hiragana.
# {The following mechanically generated from the Unicode 3.0 data:}
#
# $xa: Hiragana whose final vowel is "a".  Written on a single line so
# the definition does not depend on backslash line continuation.
$xa = [ ぁ あ か が さ ざ た だ な は ば ぱ ま ゃ や ら ゎ わ ];
# $xi: Hiragana whose final vowel is "i".  Written on a single line so
# the definition does not depend on backslash line continuation.
$xi = [ ぃ い き ぎ し じ ち ぢ に ひ び ぴ み り ゐ ];
# $xu: Hiragana whose final vowel is "u".  Written on a single line so
# the definition does not depend on backslash line continuation.
# NOTE(review): small tsu (3063) is a member of this set, presumably
# because the set was generated mechanically; confirm that doubling
# it as "u" is intended.
$xu = [ ぅ う く ぐ す ず っ つ づ ぬ ふ ぶ ぷ む ゅ ゆ る ゔ ];
# $xe: Hiragana whose final vowel is "e".  Written on a single line so
# the definition does not depend on backslash line continuation.
$xe = [ ぇ え け げ せ ぜ て で ね へ べ ぺ め れ ゑ ];
# $xo: Hiragana whose final vowel is "o".  Written on a single line so
# the definition does not depend on backslash line continuation.
$xo = [ ぉ お こ ご そ ぞ と ど の ほ ぼ ぽ も ょ よ ろ を ];
# Prolonged sound mark expansion (Katakana-to-Hiragana direction only).
# In each rule the set before the braces is left context and the
# braced ー (30FC) is the text actually replaced: a 30FC that follows
# a Hiragana from the given vowel class becomes that vowel's normal
# (large) Hiragana letter.  A 30FC with no such preceding character
# matches none of these rules and is left unchanged.
あ < $xa {ー};
い < $xi {ー};
う < $xu {ー};
え < $xe {ー};
お < $xo {ー};
# eof |