source/data/translit/Hiragana_Katakana.txt - external/github.com/unicode-org/icu - Git at Google

 #--------------------------------------------------------------------
 # Copyright (c) 1999-2004, International Business Machines
 # Corporation and others. All Rights Reserved.
 #--------------------------------------------------------------------

 # note: a global filter is more efficient, but MUST include all source chars
 # Forward-direction global filter: only characters in this set are
 # handed to the rules below.  The set spells out ASCII (0000-007E),
 # the ideographic comma and full stop (3001, 3002), the kana voicing
 # marks (3099-309C), Katakana 30A1-30FC, the halfwidth forms
 # FF61-FF9F, the prolonged sound mark, and the Hiragana, Katakana and
 # nonspacing-mark properties.
 :: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;
 # Forward direction: normalize to NFKC here.  The empty "()" leaves
 # this position a no-op in the reverse direction.
 :: NFKC ();

 # Hiragana-Katakana

 # This is largely a one-to-one mapping, but it has a
 # few kinks:

 # 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
 # Hiragana equivalents.  We use Hiragana wa/wi/we/wo
 # (308F-3092) with a voicing mark (3099), which is
 # semantically equivalent.  However, this is a non-
 # roundtripping transformation.

 # 2. The Katakana small ka/ke (30F5,30F6) have no
 # Hiragana equivalents.  We convert them to normal
 # Hiragana ka/ke (304B,3051).  This is a one-way
 # information-losing transformation and precludes
 # round-tripping of 30F5 and 30F6.

 # 3. The combining marks 3099-309C are in the Hiragana
 # block, but they apply to Katakana as well, so we
 # leave them untouched.

 # 4. The Katakana prolonged sound mark 30FC doubles the
 # preceding vowel.  This is a one-way information-
 # losing transformation from Katakana to Hiragana.

 # 5. The Katakana middle dot separates words in foreign
 # expressions; we leave this unmodified.

 # The above points preclude successful round-trip
 # transformations of arbitrary input text.  However,
 # they provide naturalistic results that should conform
 # to user expectations.


 # Combining equivalents va/vi/ve/vo
 # The Hiragana side of each rule is a two-character sequence: the base
 # kana followed by the combining voicing mark (3099).  These rules are
 # listed ahead of the single-character wa/wi/we/wo rules -- presumably
 # so the longer sequence is matched first; confirm against the ICU
 # rule-ordering documentation before reordering.
 わ゙ <> ヷ;
 ゐ゙ <> ヸ;
 ゑ゙ <> ヹ;
 を゙ <> ヺ;

 # One-to-one mappings, main block
 # 3041:3094 <> 30A1:30F4
 # 309D,E <> 30FD,E
 # Every rule here is bidirectional (<>): Hiragana on the left,
 # Katakana on the right.  Each pair differs by the same code point
 # offset (3041 <> 30A1, i.e. +0x60).

 # Vowels (small form, then normal form)
 ぁ <> ァ;
 あ <> ア;
 ぃ <> ィ;
 い <> イ;
 ぅ <> ゥ;
 う <> ウ;
 ぇ <> ェ;
 え <> エ;
 ぉ <> ォ;
 お <> オ;
 # K/G row
 か <> カ;
 が <> ガ;
 き <> キ;
 ぎ <> ギ;
 く <> ク;
 ぐ <> グ;
 け <> ケ;
 げ <> ゲ;
 こ <> コ;
 ご <> ゴ;
 # S/Z row
 さ <> サ;
 ざ <> ザ;
 し <> シ;
 じ <> ジ;
 す <> ス;
 ず <> ズ;
 せ <> セ;
 ぜ <> ゼ;
 そ <> ソ;
 ぞ <> ゾ;
 # T/D row (includes small tsu)
 た <> タ;
 だ <> ダ;
 ち <> チ;
 ぢ <> ヂ;
 っ <> ッ;
 つ <> ツ;
 づ <> ヅ;
 て <> テ;
 で <> デ;
 と <> ト;
 ど <> ド;
 # N row
 な <> ナ;
 に <> ニ;
 ぬ <> ヌ;
 ね <> ネ;
 の <> ノ;
 # H/B/P row
 は <> ハ;
 ば <> バ;
 ぱ <> パ;
 ひ <> ヒ;
 び <> ビ;
 ぴ <> ピ;
 ふ <> フ;
 ぶ <> ブ;
 ぷ <> プ;
 へ <> ヘ;
 べ <> ベ;
 ぺ <> ペ;
 ほ <> ホ;
 ぼ <> ボ;
 ぽ <> ポ;
 # M row
 ま <> マ;
 み <> ミ;
 む <> ム;
 め <> メ;
 も <> モ;
 # Y row (small form, then normal form)
 ゃ <> ャ;
 や <> ヤ;
 ゅ <> ュ;
 ゆ <> ユ;
 ょ <> ョ;
 よ <> ヨ;
 # R row
 ら <> ラ;
 り <> リ;
 る <> ル;
 れ <> レ;
 ろ <> ロ;
 # W row (includes small wa), then n and vu
 ゎ <> ヮ;
 わ <> ワ;
 ゐ <> ヰ;
 ゑ <> ヱ;
 を <> ヲ;
 ん <> ン;
 ゔ <> ヴ;
 # Iteration marks (309D,E <> 30FD,E)
 ゝ <> ヽ;
 ゞ <> ヾ;

 # One-way Katakana-Hiragana xform of small K ka/ke to
 # normal H ka/ke.
 # "<" (not "<>") makes these one-directional: they only produce
 # Hiragana from the small Katakana; no rule in this file maps
 # Hiragana back to the small Katakana ka/ke.
 か < ヵ;
 け < ヶ;

 # Katakana followed by a prolonged sound mark 30FC has
 # its final vowel doubled.  This is a Katakana-Hiragana
 # one-way information-losing transformation.  We
 # include the small Katakana (e.g., small A 30A1, which is 3041 as Hiragana) and
 # do not distinguish them from their large
 # counterparts.  It doesn't make sense to double a
 # small counterpart vowel as a small Hiragana vowel, so
 # we don't do so.  In natural text this should never
 # occur anyway.  If a 30FC is seen without a preceding
 # vowel sound (e.g., after n 30F3) we do not change it.

 ### $long = ー;

 # The following categories are Hiragana, not Katakana
 # as might be expected, since by the time we get to the
 # 30FC, the preceding character will have already been
 # transformed to Hiragana.

 # {The following mechanically generated from the
 # Unicode 3.0 data:}

 # $xa: Hiragana treated as ending in the vowel "a" (small forms
 # included).  Used as the left context of the rule that turns a
 # following prolonged sound mark into the Hiragana vowel a.
 $xa = [ \
 ぁ あ か が さ ざ \
 た だ な は ば ぱ \
 ま ゃ や ら ゎ わ \
 ];

 # $xi: Hiragana treated as ending in the vowel "i" (small forms
 # included).  Used as the left context of the rule that turns a
 # following prolonged sound mark into the Hiragana vowel i.
 $xi = [ \
 ぃ い き ぎ し じ \
 ち ぢ に ひ び ぴ \
 み り ゐ \
 ];

 # $xu: Hiragana treated as ending in the vowel "u" (small forms
 # included).  Note that the set also contains the small tsu.  Used as
 # the left context of the rule that turns a following prolonged sound
 # mark into the Hiragana vowel u.
 $xu = [ \
 ぅ う く ぐ す ず \
 っ つ づ ぬ ふ ぶ \
 ぷ む ゅ ゆ る ゔ \
 ];

 # $xe: Hiragana treated as ending in the vowel "e" (small forms
 # included).  Used as the left context of the rule that turns a
 # following prolonged sound mark into the Hiragana vowel e.
 $xe = [ \
 ぇ え け げ せ ぜ \
 て で ね へ べ ぺ \
 め れ ゑ \
 ];

 # $xo: Hiragana treated as ending in the vowel "o" (small forms
 # included).  Used as the left context of the rule that turns a
 # following prolonged sound mark into the Hiragana vowel o.
 $xo = [ \
 ぉ お こ ご そ ぞ \
 と ど の ほ ぼ ぽ \
 も ょ よ ろ を \
 ];

 # One-directional ("<") rules, so they apply only when converting
 # Katakana to Hiragana.  The set in front of the braces is context
 # and is left as is; only the prolonged sound mark inside { } is
 # replaced, by the Hiragana vowel that matches the preceding kana's
 # set.  A mark whose preceding character is in none of the five sets
 # matches no rule here.
 あ < $xa {ー};
 い < $xi {ー};
 う < $xu {ー};
 え < $xe {ー};
 お < $xo {ー};

 # Parenthesized, so this applies only in the reverse
 # (Katakana-to-Hiragana) direction, where the text is normalized to
 # NFKC; it is a no-op in the forward direction.
 :: (NFKC) ;

 # note: a global filter is more efficient, but MUST include all source chars!!
 # Reverse-direction global filter (note the enclosing parentheses).
 # It lists the same character set as the forward filter at the top of
 # the file.
 :: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);

 # eof
	#--------------------------------------------------------------------
	# Copyright (c) 1999-2004, International Business Machines
	# Corporation and others. All Rights Reserved.
	#--------------------------------------------------------------------

	# note: a global filter is more efficient, but MUST include all source chars
	# Forward-direction global filter: only characters in this set are
	# handed to the rules below. The set spells out ASCII (0000-007E),
	# the ideographic comma and full stop (3001, 3002), the kana voicing
	# marks (3099-309C), Katakana 30A1-30FC, the halfwidth forms
	# FF61-FF9F, the prolonged sound mark, and the Hiragana, Katakana and
	# nonspacing-mark properties.
	:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;
	# Forward direction: normalize to NFKC here. The empty "()" leaves
	# this position a no-op in the reverse direction.
	:: NFKC ();

	# Hiragana-Katakana

	# This is largely a one-to-one mapping, but it has a
	# few kinks:

	# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
	# Hiragana equivalents. We use Hiragana wa/wi/we/wo
	# (308F-3092) with a voicing mark (3099), which is
	# semantically equivalent. However, this is a non-
	# roundtripping transformation.

	# 2. The Katakana small ka/ke (30F5,30F6) have no
	# Hiragana equivalents. We convert them to normal
	# Hiragana ka/ke (304B,3051). This is a one-way
	# information-losing transformation and precludes
	# round-tripping of 30F5 and 30F6.

	# 3. The combining marks 3099-309C are in the Hiragana
	# block, but they apply to Katakana as well, so we
	# leave them untouched.

	# 4. The Katakana prolonged sound mark 30FC doubles the
	# preceding vowel. This is a one-way information-
	# losing transformation from Katakana to Hiragana.

	# 5. The Katakana middle dot separates words in foreign
	# expressions; we leave this unmodified.

	# The above points preclude successful round-trip
	# transformations of arbitrary input text. However,
	# they provide naturalistic results that should conform
	# to user expectations.


	# Combining equivalents va/vi/ve/vo
	# The Hiragana side of each rule is a two-character sequence: the base
	# kana followed by the combining voicing mark (3099). These rules are
	# listed ahead of the single-character wa/wi/we/wo rules -- presumably
	# so the longer sequence is matched first; confirm against the ICU
	# rule-ordering documentation before reordering.
	わ゙ <> ヷ;
	ゐ゙ <> ヸ;
	ゑ゙ <> ヹ;
	を゙ <> ヺ;

	# One-to-one mappings, main block
	# 3041:3094 <> 30A1:30F4
	# 309D,E <> 30FD,E
	# Every rule here is bidirectional (<>): Hiragana on the left,
	# Katakana on the right. Each pair differs by the same code point
	# offset (3041 <> 30A1, i.e. +0x60).

	# Vowels (small form, then normal form)
	ぁ <> ァ;
	あ <> ア;
	ぃ <> ィ;
	い <> イ;
	ぅ <> ゥ;
	う <> ウ;
	ぇ <> ェ;
	え <> エ;
	ぉ <> ォ;
	お <> オ;
	# K/G row
	か <> カ;
	が <> ガ;
	き <> キ;
	ぎ <> ギ;
	く <> ク;
	ぐ <> グ;
	け <> ケ;
	げ <> ゲ;
	こ <> コ;
	ご <> ゴ;
	# S/Z row
	さ <> サ;
	ざ <> ザ;
	し <> シ;
	じ <> ジ;
	す <> ス;
	ず <> ズ;
	せ <> セ;
	ぜ <> ゼ;
	そ <> ソ;
	ぞ <> ゾ;
	# T/D row (includes small tsu)
	た <> タ;
	だ <> ダ;
	ち <> チ;
	ぢ <> ヂ;
	っ <> ッ;
	つ <> ツ;
	づ <> ヅ;
	て <> テ;
	で <> デ;
	と <> ト;
	ど <> ド;
	# N row
	な <> ナ;
	に <> ニ;
	ぬ <> ヌ;
	ね <> ネ;
	の <> ノ;
	# H/B/P row
	は <> ハ;
	ば <> バ;
	ぱ <> パ;
	ひ <> ヒ;
	び <> ビ;
	ぴ <> ピ;
	ふ <> フ;
	ぶ <> ブ;
	ぷ <> プ;
	へ <> ヘ;
	べ <> ベ;
	ぺ <> ペ;
	ほ <> ホ;
	ぼ <> ボ;
	ぽ <> ポ;
	# M row
	ま <> マ;
	み <> ミ;
	む <> ム;
	め <> メ;
	も <> モ;
	# Y row (small form, then normal form)
	ゃ <> ャ;
	や <> ヤ;
	ゅ <> ュ;
	ゆ <> ユ;
	ょ <> ョ;
	よ <> ヨ;
	# R row
	ら <> ラ;
	り <> リ;
	る <> ル;
	れ <> レ;
	ろ <> ロ;
	# W row (includes small wa), then n and vu
	ゎ <> ヮ;
	わ <> ワ;
	ゐ <> ヰ;
	ゑ <> ヱ;
	を <> ヲ;
	ん <> ン;
	ゔ <> ヴ;
	# Iteration marks (309D,E <> 30FD,E)
	ゝ <> ヽ;
	ゞ <> ヾ;

	# One-way Katakana-Hiragana xform of small K ka/ke to
	# normal H ka/ke.
	# "<" (not "<>") makes these one-directional: they only produce
	# Hiragana from the small Katakana; no rule in this file maps
	# Hiragana back to the small Katakana ka/ke.
	か < ヵ;
	け < ヶ;

	# Katakana followed by a prolonged sound mark 30FC has
	# its final vowel doubled. This is a Katakana-Hiragana
	# one-way information-losing transformation. We
	# include the small Katakana (e.g., small A 30A1, which is 3041 as Hiragana) and
	# do not distinguish them from their large
	# counterparts. It doesn't make sense to double a
	# small counterpart vowel as a small Hiragana vowel, so
	# we don't do so. In natural text this should never
	# occur anyway. If a 30FC is seen without a preceding
	# vowel sound (e.g., after n 30F3) we do not change it.

	### $long = ー;

	# The following categories are Hiragana, not Katakana
	# as might be expected, since by the time we get to the
	# 30FC, the preceding character will have already been
	# transformed to Hiragana.

	# {The following mechanically generated from the
	# Unicode 3.0 data:}

	# $xa: Hiragana treated as ending in the vowel "a" (small forms
	# included). Used as the left context of the rule that turns a
	# following prolonged sound mark into the Hiragana vowel a.
	$xa = [ \
	ぁあかがさざ \
	ただなはばぱ \
	まゃやらゎわ \
	];

	# $xi: Hiragana treated as ending in the vowel "i" (small forms
	# included). Used as the left context of the rule that turns a
	# following prolonged sound mark into the Hiragana vowel i.
	$xi = [ \
	ぃいきぎしじ \
	ちぢにひびぴ \
	みりゐ \
	];

	# $xu: Hiragana treated as ending in the vowel "u" (small forms
	# included). Note that the set also contains the small tsu. Used as
	# the left context of the rule that turns a following prolonged sound
	# mark into the Hiragana vowel u.
	$xu = [ \
	ぅうくぐすず \
	っつづぬふぶ \
	ぷむゅゆるゔ \
	];

	# $xe: Hiragana treated as ending in the vowel "e" (small forms
	# included). Used as the left context of the rule that turns a
	# following prolonged sound mark into the Hiragana vowel e.
	$xe = [ \
	ぇえけげせぜ \
	てでねへべぺ \
	めれゑ \
	];

	# $xo: Hiragana treated as ending in the vowel "o" (small forms
	# included). Used as the left context of the rule that turns a
	# following prolonged sound mark into the Hiragana vowel o.
	$xo = [ \
	ぉおこごそぞ \
	とどのほぼぽ \
	もょよろを \
	];

	# One-directional ("<") rules, so they apply only when converting
	# Katakana to Hiragana. The set in front of the braces is context
	# and is left as is; only the prolonged sound mark inside { } is
	# replaced, by the Hiragana vowel that matches the preceding kana's
	# set. A mark whose preceding character is in none of the five sets
	# matches no rule here.
	あ < $xa {ー};
	い < $xi {ー};
	う < $xu {ー};
	え < $xe {ー};
	お < $xo {ー};

	# Parenthesized, so this applies only in the reverse
	# (Katakana-to-Hiragana) direction, where the text is normalized to
	# NFKC; it is a no-op in the forward direction.
	:: (NFKC) ;

	# note: a global filter is more efficient, but MUST include all source chars!!
	# Reverse-direction global filter (note the enclosing parentheses).
	# It lists the same character set as the forward filter at the top of
	# the file.
	:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);

	# eof