| # © 2016 and later: Unicode, Inc. and others. |
| # License & terms of use: http://www.unicode.org/copyright.html |
| # Generated using tools/cldr/cldr-to-icu/build-icu-data.xml |
| # |
| # File: Latn_Kana.txt |
| # Generated from CLDR |
| # |
| |
| # note: a global filter is more efficient, but MUST include all source chars |
| #:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ; |
| # MINIMAL FILTER GENERATED FOR: Latin-Katakana |
| ### WARNING -- must add width filter, both here and below!!! ### |
| # Forward global filter: the set of characters the forward (Latin → Katakana) rules are allowed to touch. |
| # It is a generated minimal set; if a rule below gains a new source character, it must be added here too. |
| :: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ; |
| # Forward: fold fullwidth Latin letters to their halfwidth forms. The empty () means no transform is applied at this step in the reverse direction. |
| :: [:Latin:] fullwidth-halfwidth (); |
| # Forward: decompose to NFD so the rules below can match base characters and combining marks (\u0304, \u3099, \u309A) separately. Reverse: recompose to NFC. |
| :: NFD (NFC); |
| :: Lower (); # whenever transliterating from cased to uncased script, include this |
| # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese |
| # Uses modified Hepburn. Small changes to make unambiguous. |
| # | Kunrei-shiki: Hepburn/MHepburn |
| # | ------------------------------ |
| # | si: shi |
| # | si ~ya: sha |
| # | si ~yu: shu |
| # | si ~yo: sho |
| # | zi: ji |
| # | zi ~ya: ja |
| # | zi ~yu: ju |
| # | zi ~yo: jo |
| # | ti: chi |
| # | ti ~ya: cha |
| # | ti ~yu: chu |
| # | ti ~yo: cho |
| # | tu: tsu |
| # | di: ji/dji |
| # | du: zu/dzu |
| # | hu: fu |
| # | For foreign words: |
| # | ----------------- |
| # | se ~i si |
| # | si ~e she |
| # | |
| # | ze ~i zi |
| # | zi ~e je |
| # | |
| # | te ~i ti |
| # | ti ~e che |
| # | te ~u tu |
| # | |
| # | de ~i di |
| # | de ~u du |
| # | de ~i di |
| # | |
| # | he ~u: hu |
| # | hu ~a fa |
| # | hu ~i fi |
| # | hu ~e fe |
| # | hu ~o fo |
| # Most small forms are generated, but if necessary |
| # explicit small forms are given with ~a, ~ya, etc. |
| #------------------------------------------------------ |
| # Variables |
| # Latin vowel and consonant letters (lowercase only; input is lowercased before the rules run). |
| $vowel = [aeiou] ; |
| $consonant = [bcdfghjklmnpqrstvwxyz] ; |
| # Combining macron; mapped to and from the prolonged sound mark ー further down. |
| $macron = \u0304 ; |
| # Variables used for doubled-consonants with tsu |
| # NOTE: $kana is only referenced by the commented-out iteration-mark TODO rule. |
| $kana = [ぁ-ゔ] ; |
| # Voiced / semi-voiced sound marks, each in combining (\u3099, \u309A) and spacing form. |
| $voice = [\u3099゛]; |
| $semivoice = [\u309A゜]; |
| # $X_start: kana whose romaji begins with consonant X. They serve as the context following ッ |
| # in the double-consonant rules, so the reverse direction knows which consonant to double. |
| $k_start = [カキクケコかきくけこ] ; |
| $s_start = [サシスセソさしすせそ] ; |
| $j_start = [シし] $voice ; |
| $t_start = [タチツテトたちつてと] ; |
| $n_start = [ナニヌネノンなにぬねの] ; |
| $h_start = [ハヒヘホはひへほ] ; |
| $f_start = [フふ] ; |
| $m_start = [マミムメモまみむめも] ; |
| $y_start = [ヤユヨやゆよ] ; |
| $r_start = [ラリルレロらりるれろ] ; |
| $w_start = [ワヰヱヲわゐゑを] ; |
| # v-syllables are written as a w-row kana followed by the combining voiced mark. |
| $v_start = [ワヰヱヲ]\u3099 ; |
| # Base kana after which a Latin "h" is turned into the prolonged sound mark (see the h- rule). |
| $voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ; |
| # if ン is followed by $n_quoter, then it needs an |
| # apostrophe after its romaji form to disambiguate it. |
| # e.g., ン ア ≠ ナ, so represent as "n'a", not "na". |
| $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ; |
| # Small kana that may follow an i-row kana; used as look-ahead context by the reverse "X | '~'" rules. |
| $small_y = [ャィュェョ] ; |
| # NOTE: $iteration is only referenced by the commented-out iteration-mark TODO rule. |
| $iteration = ゝ ; |
| #------------------------------------------------------ |
| # katakana rules |
| # Punctuation |
| # Latin full stop / comma map to and from the Japanese full stop / comma. |
| '.' ↔ 。; |
| ',' ↔ 、; |
| # ' ' } [a-z] → ; # delete spaces before latin |
| # ' ' ← [^' '゠-ヿ] {} ['゠-ヿ] ; #insert spaces before hiragana |
| # Iteration Mark |
| # Copy previous letter § marks |
| # TODO |
| # | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration |
| # Specials for katakana -- not shared with hiragana |
| # va/vi/ve/vo: a w-row kana followed by the combining voiced sound mark. |
| va ↔ ワ\u3099 ; |
| vi ↔ ヰ\u3099 ; |
| ve ↔ ヱ\u3099 ; |
| vo ↔ ヲ\u3099 ; |
| # A leading '~' in the romaji marks an explicitly small kana (here small ka and small ke). |
| '~ka' ↔ ヵ ; |
| '~ke' ↔ ヶ ; |
| # ~~~ begin shared rules ~~~ |
| #special |
| # Reverse only: a '~' left behind by one of the "X | '~' ← kana } $small_y" rules, followed by a |
| # small kana, becomes the y-glide romaji (e.g. '~'ャ → ya), giving "bya", "kya", ... |
| ya ← '~'ャ; |
| yi ← '~'ィ ; |
| yu ← '~'ュ; |
| ye ← '~'ェ; |
| yo ← '~'ョ; |
| #normal |
| a ↔ ア ; |
| # Reverse: ヒ + voiced mark before a small y-kana emits "b~" with the cursor (|) placed before |
| # the '~', so the '~' and the following small kana are matched again by the rules above. |
| b | '~' ← ヒ \u3099} $small_y ; |
| # Forward: "by" before a vowel emits the i-row kana and backs up over '~y', so that |
| # '~y' + vowel is then converted to the matching small kana. |
| by } $vowel → ヒ\u3099 | '~y' ; |
| ba ↔ ハ\u3099 ; |
| bi ↔ ヒ\u3099 ; |
| bu ↔ フ\u3099 ; |
| be ↔ ヘ\u3099 ; |
| bo ↔ ホ\u3099 ; |
| # Forward only: soft c (before i or e) is rewritten to s and re-processed. |
| c } i → | s ; |
| c } e → | s ; |
| da ↔ タ\u3099 ; |
| # di / du are spelled with テ + voiced mark plus a small vowel (foreign-word forms). |
| di ↔ テ\u3099ィ ; |
| du ↔ テ\u3099ゥ ; |
| de ↔ テ\u3099 ; |
| do ↔ ト\u3099 ; |
| dzu ↔ ツ\u3099 ; |
| # Reverse only: チ + voiced mark followed by a small kana. These must precede the plain |
| # "dji ↔" rule so the two-kana sequences are matched as a unit. |
| dja ← チ\u3099ャ ; |
| dji'~i' ← チ\u3099ィ ; # liu |
| dju ← チ\u3099ュ ; |
| dje ← チ\u3099ェ ; |
| djo ← チ\u3099ョ ; |
| dji ↔ チ\u3099 ; |
| # Forward: "dj" before any remaining vowel emits the kana and backs up over '~y' (see "by" above). |
| dj } $vowel → チ\u3099 | '~y' ; |
| # TODO: QUESTION: use ĵĴżŻ instead of dj, dz |
| # ch-: the reverse-only rules for チ + small kana come first; the plain "chi ↔ チ" rule and |
| # the forward "ch } $vowel" rule follow, so longer kana sequences are matched before shorter ones. |
| cha ← チャ ; |
| chi'~i' ← チィ ; # liu |
| chu ← チュ ; |
| che ← チェ ; |
| cho ← チョ ; |
| chi ↔ チ ; |
| ch } $vowel → チ | '~y' ; |
| e ↔ エ ; |
| # g-: same layout as the b- rules (reverse small-y rule, forward "gy" rule, then the five syllables). |
| g | '~' ← キ\u3099} $small_y ; |
| gy } $vowel → キ\u3099 | '~y' ; |
| ga ↔ カ\u3099 ; |
| gi ↔ キ\u3099 ; |
| gu ↔ ク\u3099 ; |
| ge ↔ ケ\u3099 ; |
| go ↔ コ\u3099 ; |
| i ↔ イ ; |
| # j-: spelled out explicitly as シ + voiced mark (+ small kana); the generic forward rule is disabled. |
| # j } $vowel → シ\u3099 | '~y' ; |
| ja ↔ シ\u3099ャ ; |
| ji'~i' ← シ\u3099ィ ; # liu |
| ju ↔ シ\u3099ュ ; |
| je ↔ シ\u3099ェ ; |
| jo ↔ シ\u3099ョ ; |
| ji ↔ シ\u3099 ; |
| k | '~' ← キ} $small_y ; |
| ky } $vowel → キ | '~y' ; |
| ka ↔ カ ; |
| ki ↔ キ ; |
| ku ↔ ク ; |
| ke ↔ ケ ; |
| ko ↔ コ ; |
| m | '~' ← ミ} $small_y ; |
| my } $vowel → ミ | '~y' ; |
| ma ↔ マ ; |
| mi ↔ ミ ; |
| mu ↔ ム ; |
| me ↔ メ ; |
| mo ↔ モ ; |
| # Forward only: m before p, b, f or v is written as the syllabic nasal ン. |
| m } [pbfv] → ン ; |
| n | '~' ← ニ } $small_y ; |
| ny } $vowel → ニ | '~y' ; |
| na ↔ ナ ; |
| ni ↔ ニ ; |
| nu ↔ ヌ ; |
| ne ↔ ネ ; |
| no ↔ ノ ; |
| o ↔ オ ; |
| # p-: h-row kana followed by the combining semi-voiced mark (\u309A). |
| p | '~' ← ヒ\u309A } $small_y ; |
| py } $vowel → ヒ\u309A | '~y' ; |
| pa ↔ ハ\u309A ; |
| pi ↔ ヒ\u309A ; |
| pu ↔ フ\u309A ; |
| pe ↔ ヘ\u309A ; |
| po ↔ ホ\u309A ; |
| h | '~' ← ヒ } $small_y ; |
| hy } $vowel → ヒ | '~y' ; |
| ha ↔ ハ ; |
| hi ↔ ヒ ; |
| # "hu" is the foreign-word form ヘ + small ゥ; plain フ is romanized as "fu" below. |
| hu ↔ ヘゥ ; |
| he ↔ ヘ ; |
| ho ↔ ホ ; |
| # f | '~' ← フ } $small_y ; |
| # f } $vowel → フ | '~' ; |
| # f-: フ followed by a small vowel; these precede "fu ↔ フ" so the two-kana forms match first in reverse. |
| fa ↔ ファ ; |
| fi ↔ フィ ; |
| fe ↔ フェ ; |
| fo ↔ フォ ; |
| fu ↔ フ ; |
| # r-: reverse small-y rule, forward "ry" rule, then the five syllables. |
| r | '~' ← リ } $small_y ; |
| ry } $vowel → リ | '~y' ; |
| ra ↔ ラ ; |
| ri ↔ リ ; |
| ru ↔ ル ; |
| re ↔ レ ; |
| ro ↔ ロ ; |
| # z-: s-row kana followed by the combining voiced mark. "zi" is the foreign-word form セ+voice+small ィ |
| # (シ + voiced mark is romanized as "ji"). |
| za ↔ サ\u3099 ; |
| zi ↔ セ\u3099ィ ; |
| zu ↔ ス\u3099 ; |
| ze ↔ セ\u3099 ; |
| zo ↔ ソ\u3099 ; |
| # s-: "si" is the foreign-word form セ + small ィ (plain シ is romanized as "shi" below). |
| sa ↔ サ ; |
| si ↔ セィ ; |
| su ↔ ス ; |
| se ↔ セ ; |
| so ↔ ソ ; |
| # sh-: reverse-only rules for シ + small kana, then plain "shi ↔ シ" and the forward "sh } $vowel" rule. |
| sha ← シャ ; |
| shi'~i' ← シィ ; # liu |
| shu ← シュ ; |
| she ← シェ ; |
| sho ← ショ ; |
| shi ↔ シ ; |
| sh } $vowel → シ | '~y' ; |
| # t-: "ti" / "tu" are the foreign-word forms テ + small ィ / ゥ; チ and ツ are "chi" and "tsu". |
| ta ↔ タ ; |
| ti ↔ ティ ; |
| tu ↔ テゥ ; |
| te ↔ テ ; |
| to ↔ ト ; |
| tsu ↔ ツ ; |
| # v } $vowel → ウ\u3099 | '~' ; |
| #'v~a' ← ウ\u3099ァ ; # liu |
| #'v~i' ← ウ\u3099ィ ; # liu |
| #'v~e' ← ウ\u3099ェ ; # liu |
| #'v~o' ← ウ\u3099ォ ; # liu |
| vu ↔ ウ\u3099 ; |
| u ↔ ウ ; |
| # w } $vowel → ウ | '~' ; |
| # w-: "wu" is forward-only because ウ is already romanized as "u" in reverse. |
| wa ↔ ワ ; |
| wi ↔ ヰ ; |
| wu → ウ ; |
| we ↔ ヱ ; |
| wo ↔ ヲ ; |
| # y-: "yi" and "ye" are forward-only because イ and エ are already romanized as "i" and "e" in reverse. |
| ya ↔ ヤ ; |
| yi → イ ; |
| yu ↔ ユ ; |
| ye → エ ; |
| yo ↔ ヨ ; |
| # double consonants |
| #specials |
| # A doubled consonant is written with small tsu (ッ). In "X } X ↔ ッ } <kana>" only the first |
| # consonant / the ッ itself is converted; the text after "}" is context and is left for later rules. |
| # Forward only: "ssh" and "tch" also produce ッ before sh- / ch- syllables. |
| s } sh → ッ ; |
| t } ch → ッ ; |
| #voiced |
| # The voiced / semi-voiced rules are listed ahead of the plain ones so that, in reverse, |
| # the more specific context (kana + sound mark) is tried before the bare kana. |
| j } j ↔ ッ } $j_start ; |
| b } b ↔ ッ } [$h_start$f_start] $voice; |
| d } d ↔ ッ } $t_start $voice; |
| g } g ↔ ッ } $k_start $voice; |
| p } p ↔ ッ } [$h_start$f_start] $semivoice; |
| # v } v ↔ ッ } [ワヰウヱヲう] $voice ; |
| z } z ↔ ッ } $s_start $voice; |
| v } v ↔ ッ } $v_start; |
| # normal |
| k } k ↔ ッ } $k_start ; |
| m } m ↔ ッ } $m_start ; |
| n } n ↔ ッ } $n_start ; |
| h } h ↔ ッ } $h_start ; |
| f } f ↔ ッ } $f_start ; |
| r } r ↔ ッ } $r_start ; |
| t } t ↔ ッ } $t_start ; |
| s } s ↔ ッ } $s_start ; |
| w } w ↔ ッ } $w_start; |
| y } y ↔ ッ } $y_start; |
| # completeness |
| # Forward only: doubled letters that have no kana row of their own still yield ッ. |
| x } x → ッ ; |
| c } k → ッ ; |
| c } c → ッ ; |
| c } q → ッ ; |
| l } l → ッ ; |
| q } q → ッ ; |
| # y } y → ッ ; |
| # w } w → ッ ; |
| # prolonged vowel mark. this indicates a doubling of |
| # the preceding vowel sound |
| #a ← a { ー ; # liu |
| #e ← e { ー ; # liu |
| #i ← i { ー ; # liu |
| #o ← o { ー ; # liu |
| #u ← u { ー ; # liu |
| # The combining macron on a romaji vowel corresponds to the prolonged sound mark ー. |
| $macron ↔ ー ; |
| # small forms |
| # Explicit small kana are written in romaji with a leading '~'. |
| '~a' ↔ ァ ; |
| '~i' ↔ ィ ; |
| '~u' ↔ ゥ ; |
| '~e' ↔ ェ ; |
| '~o' ↔ ォ ; |
| '~tsu' ↔ ッ ; |
| '~wa' ↔ ヮ ; |
| '~ya' ↔ ャ ; |
| # '~yi' / '~ye' are forward-only: ィ and ェ are already covered in reverse by '~i' and '~e' above. |
| '~yi' → ィ ; |
| '~yu' ↔ ュ ; |
| '~ye' → ェ ; |
| '~yo' ↔ ョ ; |
| # iteration marks |
| # TODO: make more accurate |
| # Reverse only. Each rule has the shape |
| #     <romaji to emit> ← <romaji already produced for the previous syllable> { <iteration mark> |
| # i.e. the text before "{" is context (the preceding, already romanized syllable), $1 is its |
| # captured "y* vowel" tail, and the mark itself is replaced by the text on the left. |
| # |
| # Voiced iteration mark (ヽ + $voice) after an UNVOICED syllable: repeat the syllable with the |
| # voiced counterpart of its consonant. The digraph rules (sh, ch, ts) must stay ahead of the |
| # single-letter rules (s, h, t) because the first matching rule wins. |
| j $1 ← sh (y* $vowel) {ヽ$voice ; |
| dj $1 ← ch (y* $vowel) {ヽ$voice ; |
| dz $1 ← ts (y* $vowel) {ヽ$voice ; |
| g $1 ← k (y* $vowel) {ヽ$voice ; |
| z $1 ← s (y* $vowel) {ヽ$voice ; |
| d $1 ← t (y* $vowel) {ヽ$voice ; |
| # Fixed: this rule used to read "h $1 ← b", which is the inverse of every sibling rule and |
| # turned an already-voiced "b" syllable into "h". The voiced counterpart of h is b. |
| b $1 ← h (y* $vowel) {ヽ$voice ; |
| v $1 ← w (y* $vowel) {ヽ$voice ; |
| # Multi-letter consonants that are repeated as-is (the generic rule below only captures a |
| # single consonant letter). |
| # NOTE(review): the sh / ch / ts rules here repeat exactly the match pattern of the first three |
| # rules above, so they appear to be unreachable; they may have been intended for the unvoiced |
| # mark (ヽ without $voice) -- confirm before changing. |
| sh $1 ← sh (y* $vowel) {ヽ$voice ; |
| j $1 ← j (y* $vowel) {ヽ$voice ; |
| ch $1 ← ch (y* $vowel) {ヽ$voice ; |
| dj $1 ← dj(y* $vowel) {ヽ$voice ; |
| ts $1 ← ts (y* $vowel) {ヽ$voice ; |
| dz $1 ← dz (y* $vowel) {ヽ$voice ; |
| # Generic case: repeat the previous single-consonant syllable unchanged, with or without the voice mark. |
| $1 ← ($consonant y* $vowel) {ヽ$voice? ; |
| $1 ← (.) {ヽ $voice? ; # otherwise repeat last character |
| ← ヽ $voice? ; # delete if no characters found |
| # h- rule: lengthens vowel if not followed by a vowel. |
| # At the point this is applied, latin [cons]?vowel sequences |
| # have been converted to katakana in NFD form. |
| # Forward only: an "h" left over directly after a kana (optionally followed by a combining |
| # voiced or semi-voiced mark) becomes the prolonged sound mark. The text before "{" is context only. |
| $voweled_basekana [\u3099 \u309A]? { h → ー ; |
| # one-way latin- → kana rules. these do not occur in |
| # well-formed romaji representing actual japanese text. |
| # their purpose is to make all romaji map to kana of |
| # some sort. |
| # the following are not really necessary, but produce |
| # slightly more natural results. |
| # Forward only: consonant + y with no following vowel (the "Xy } $vowel" rules did not apply). |
| cy → セィ ; |
| dy → テ\u3099ィ ; |
| hy → ヒ ; |
| sy → セィ ; |
| ty → ティ ; |
| zy → セ\u3099ィ ; |
| h → ヘ ; |
| # isolated consonants listed here so as not to mask |
| # longer rules above. |
| ch → チ; |
| sh → シ ; |
| dz → ツ\u3099 ; |
| dj → チ\u3099; |
| b → フ\u3099 ; |
| d → テ\u3099 ; |
| g → ク\u3099 ; |
| k → ク ; |
| m → ム ; |
| # Reverse: ン before a kana in $n_quoter is written n' (the doubled '' is an escaped apostrophe) |
| # so it cannot be misread as the start of a na-row or vowel syllable. Must precede "n ↔ ン". |
| n'' ← ン } $n_quoter ; |
| n ↔ ン ; |
| p → フ\u309A ; |
| r → ル ; |
| s → ス ; |
| t → テ ; |
| y → イ ; |
| z → ス\u3099 ; |
| v → ウ\u3099 ; |
| f → フ; |
| j → シ\u3099; |
| w → ウ; |
| # Forward only: non-ASCII Latin letters are rewritten to ASCII. The "|" before the replacement |
| # places the cursor in front of it, so the replacement is run through the rules again. |
| ß → | ss ; |
| æ → | e ; |
| ð → | d ; |
| ø → | u ; |
| þ → | th ; |
| # simple substitutions using backup |
| c → | k ; |
| l → | r ; |
| q → | k ; |
| x → | ks ; |
| # ~~~ END shared rules ~~~ |
| #------------------------------------------------------ |
| # Final cleanup |
| # Forward only: any '~' that no rule consumed is dropped. |
| '~' → ; # delete stray tildes between letters |
| # Forward only: an apostrophe ('' is an escaped quote) between a katakana and a Latin letter is dropped. |
| [:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters |
| # [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use |
| # Forward: recompose the result to NFC. Reverse: decompose the kana input to NFD so the rules |
| # above see base kana and combining sound marks separately. |
| :: NFC (NFD) ; |
| # Reverse only (the whole transform is inside parentheses): widen halfwidth katakana and the |
| # listed sound / length marks to fullwidth before the reverse rules run. |
| :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth); |
| # note: a global filter is more efficient, but MUST include all source chars!! |
| #:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]); |
| # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD |
| # Reverse global filter (inside parentheses, so it applies only to Katakana → Latin): the set of |
| # characters the reverse rules are allowed to touch. Generated minimal set; keep it in sync with the rules. |
| :: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ; |
| # eof |
| |