| #-------------------------------------------------------------------- |
| # Copyright (c) 1999-2004, International Business Machines |
| # Corporation and others. All Rights Reserved. |
| #-------------------------------------------------------------------- |
| |
| # Forward (Hiragana-to-Katakana) pre-processing. |
| # note: a global filter is more efficient, but MUST include all source chars |
| # (characters outside this set are not offered to the rules that follow). |
| :: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ; |
| # Normalize to NFKC in the forward direction only; the empty parentheses |
| # give this step no reverse-direction counterpart (ICU "::" rule syntax). |
| :: NFKC (); |
| |
| # Hiragana-Katakana |
| |
| # This is largely a one-to-one mapping, but it has a |
| # few kinks: |
| |
| # 1. The Katakana va/vi/ve/vo (30F7-30FA) have no |
| # Hiragana equivalents. We use Hiragana wa/wi/we/wo |
| # (308F-3092) with a voicing mark (3099), which is |
| # semantically equivalent. However, this is a non- |
| # roundtripping transformation. |
| |
| # 2. The Katakana small ka/ke (30F5,30F6) have no |
| # Hiragana equivalents. We convert them to normal |
| # Hiragana ka/ke (304B,3051). This is a one-way |
| # information-losing transformation and precludes |
| # round-tripping of 30F5 and 30F6. |
| |
| # 3. The combining marks 3099-309C are in the Hiragana |
| # block, but they apply to Katakana as well, so we |
| # leave them untouched. |
| |
| # 4. The Katakana prolonged sound mark 30FC doubles the |
| # preceding vowel. This is a one-way information- |
| # losing transformation from Katakana to Hiragana. |
| |
| # 5. The Katakana middle dot separates words in foreign |
| # expressions; we leave this unmodified. |
| |
| # The above points preclude successful round-trip |
| # transformations of arbitrary input text. However, |
| # they provide naturalistic results that should conform |
| # to user expectations. |
| |
| |
| # Combining equivalents va/vi/ve/vo |
| # Hiragana wa/wi/we/wo followed by the combining voicing mark (3099) |
| # map to and from the precomposed Katakana va/vi/ve/vo (30F7-30FA). |
| # NOTE(review): these rules are listed ahead of the plain wa/wi/we/wo |
| # mappings in the main block, presumably so that the two-character |
| # sequence is matched before the bare base letter -- confirm against |
| # ICU rule-ordering semantics before moving them. |
| わ゙ <> ヷ; |
| ゐ゙ <> ヸ; |
| ゑ゙ <> ヹ; |
| を゙ <> ヺ; |
| |
| # One-to-one mappings, main block |
| # 3041:3094 <> 30A1:30F4 |
| # 309D,E <> 30FD,E |
| # Each "<>" rule is bidirectional: the Hiragana on the left becomes the |
| # Katakana on the right going forward, and vice versa in reverse. |
| ぁ <> ァ; |
| あ <> ア; |
| ぃ <> ィ; |
| い <> イ; |
| ぅ <> ゥ; |
| う <> ウ; |
| ぇ <> ェ; |
| え <> エ; |
| ぉ <> ォ; |
| お <> オ; |
| か <> カ; |
| が <> ガ; |
| き <> キ; |
| ぎ <> ギ; |
| く <> ク; |
| ぐ <> グ; |
| け <> ケ; |
| げ <> ゲ; |
| こ <> コ; |
| ご <> ゴ; |
| さ <> サ; |
| ざ <> ザ; |
| し <> シ; |
| じ <> ジ; |
| す <> ス; |
| ず <> ズ; |
| せ <> セ; |
| ぜ <> ゼ; |
| そ <> ソ; |
| ぞ <> ゾ; |
| た <> タ; |
| だ <> ダ; |
| ち <> チ; |
| ぢ <> ヂ; |
| っ <> ッ; |
| つ <> ツ; |
| づ <> ヅ; |
| て <> テ; |
| で <> デ; |
| と <> ト; |
| ど <> ド; |
| な <> ナ; |
| に <> ニ; |
| ぬ <> ヌ; |
| ね <> ネ; |
| の <> ノ; |
| は <> ハ; |
| ば <> バ; |
| ぱ <> パ; |
| ひ <> ヒ; |
| び <> ビ; |
| ぴ <> ピ; |
| ふ <> フ; |
| ぶ <> ブ; |
| ぷ <> プ; |
| へ <> ヘ; |
| べ <> ベ; |
| ぺ <> ペ; |
| ほ <> ホ; |
| ぼ <> ボ; |
| ぽ <> ポ; |
| ま <> マ; |
| み <> ミ; |
| む <> ム; |
| め <> メ; |
| も <> モ; |
| ゃ <> ャ; |
| や <> ヤ; |
| ゅ <> ュ; |
| ゆ <> ユ; |
| ょ <> ョ; |
| よ <> ヨ; |
| ら <> ラ; |
| り <> リ; |
| る <> ル; |
| れ <> レ; |
| ろ <> ロ; |
| ゎ <> ヮ; |
| わ <> ワ; |
| ゐ <> ヰ; |
| ゑ <> ヱ; |
| を <> ヲ; |
| ん <> ン; |
| ゔ <> ヴ; |
| ゝ <> ヽ; |
| ゞ <> ヾ; |
| |
| # One-way Katakana-to-Hiragana transform of the small Katakana |
| # ka/ke (30F5, 30F6) to the normal Hiragana ka/ke (304B, 3051). |
| # These "<" rules are used in the reverse direction only, so the |
| # forward direction still turns か/け into full-size カ/ケ via the |
| # main-block mappings. |
| か < ヵ; |
| け < ヶ; |
| |
| # Katakana followed by a prolonged sound mark 30FC has |
| # its final vowel doubled. This is a Katakana-Hiragana |
| # one-way information-losing transformation. We |
| # include the small Katakana (e.g., small A 30A1) and |
| # do not distinguish them from their large |
| # counterparts. It doesn't make sense to double a |
| # small counterpart vowel as a small Hiragana vowel, so |
| # we don't do so. In natural text this should never |
| # occur anyway. If a 30FC is seen without a preceding |
| # vowel sound (e.g., after n 30F3) we do not change it. |
| |
| ### $long = ー; |
| |
| # The following categories are Hiragana, not Katakana |
| # as might be expected, since by the time we get to the |
| # 30FC, the preceding character will have already been |
| # transformed to Hiragana. |
| |
| # {The following sets were mechanically generated |
| # from the Unicode 3.0 data:} |
| |
| # Vowel-class sets: each $x<vowel> lists the Hiragana (small forms |
| # included) after which a prolonged sound mark is rewritten as that |
| # vowel by the rules that follow these definitions. |
| |
| # Hiragana treated as ending in the vowel "a". |
| $xa = [ \ |
| ぁ あ か が さ ざ \ |
| た だ な は ば ぱ \ |
| ま ゃ や ら ゎ わ \ |
| ]; |
| |
| # Hiragana treated as ending in the vowel "i". |
| $xi = [ \ |
| ぃ い き ぎ し じ \ |
| ち ぢ に ひ び ぴ \ |
| み り ゐ \ |
| ]; |
| |
| # Hiragana treated as ending in the vowel "u". |
| $xu = [ \ |
| ぅ う く ぐ す ず \ |
| っ つ づ ぬ ふ ぶ \ |
| ぷ む ゅ ゆ る ゔ \ |
| ]; |
| |
| # Hiragana treated as ending in the vowel "e". |
| $xe = [ \ |
| ぇ え け げ せ ぜ \ |
| て で ね へ べ ぺ \ |
| め れ ゑ \ |
| ]; |
| |
| # Hiragana treated as ending in the vowel "o". |
| $xo = [ \ |
| ぉ お こ ご そ ぞ \ |
| と ど の ほ ぼ ぽ \ |
| も ょ よ ろ を \ |
| ]; |
| |
| # Reverse-only ("<") rules for the prolonged sound mark (30FC): |
| # when the mark follows a Hiragana from one of the vowel sets, the |
| # mark is replaced by the full-size Hiragana vowel of that set. |
| # The set in front of "{" is context only; just the text inside the |
| # braces is replaced. A mark not preceded by a member of any set |
| # matches none of these rules. |
| あ < $xa {ー}; |
| い < $xi {ー}; |
| う < $xu {ー}; |
| え < $xe {ー}; |
| お < $xo {ー}; |
| |
| # Reverse (Katakana-to-Hiragana) counterparts of the two "::" rules |
| # at the top of the file: the parenthesized forms apply only in the |
| # reverse direction (ICU "::" rule syntax). |
| :: (NFKC) ; |
| |
| # note: a global filter is more efficient, but MUST include all source chars!! |
| # NOTE(review): the reverse filter sits at the very end, mirroring the |
| # forward filter at the very start -- presumably required by ICU for |
| # reverse global filters; confirm before relocating it. |
| :: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]); |
| |
| # eof |