| #-------------------------------------------------------------------- |
| # Copyright (c) 1999-2004, International Business Machines |
| # Corporation and others. All Rights Reserved. |
| #-------------------------------------------------------------------- |
| |
| # note: a global filter is more efficient, but MUST include all source chars |
| #:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ; |
| # MINIMAL FILTER GENERATED FOR: Latin-Katakana |
| ### WARNING -- must add width filter, both here and below!!! ### |
| # Forward (Latin -> Katakana) global filter: only characters in this set are |
| # handed to the rules below; all other characters pass through unchanged. |
| :: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ; |
| |
| # Pre-processing steps. In a "::" rule the part outside the parentheses is the |
| # forward transform and the part inside the parentheses is the reverse transform; |
| # empty parentheses mean nothing is applied in the reverse direction. |
| :: [:Latin:] fullwidth-halfwidth (); |
| # Forward: decompose (NFD) before the rules run. Reverse: recompose (NFC). |
| :: NFD (NFC); |
| :: Lower (); # whenever transliterating from cased to uncased script, include this |
| # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese |
| |
| # Uses modified Hepburn. Small changes to make unambiguous. |
| |
| # | Kunrei-shiki: Hepburn/MHepburn |
| # | ------------------------------ |
| # | si: shi |
| # | si ~ya: sha |
| # | si ~yu: shu |
| # | si ~yo: sho |
| # | zi: ji |
| # | zi ~ya: ja |
| # | zi ~yu: ju |
| # | zi ~yo: jo |
| # | ti: chi |
| # | ti ~ya: cha |
| # | ti ~yu: chu |
| # | ti ~yo: cho |
| # | tu: tsu |
| # | di: ji/dji |
| # | du: zu/dzu |
| # | hu: fu |
| |
| # | For foreign words: |
| # | ----------------- |
| # | se ~i si |
| # | si ~e she |
| # | |
| # | ze ~i zi |
| # | zi ~e je |
| # | |
| # | te ~i ti |
| # | ti ~e che |
| # | te ~u tu |
| # | |
| # | de ~i di |
| # | de ~u du |
| # | de ~i di |
| # | |
| # | he ~u: hu |
| # | hu ~a fa |
| # | hu ~i fi |
| # | hu ~e fe |
| # | hu ~o fo |
| |
| # Most small forms are generated, but if necessary |
| # explicit small forms are given with ~a, ~ya, etc. |
| |
| #------------------------------------------------------ |
| # Variables |
| |
| # Latin letter classes used as contexts by the rules below. |
| $vowel = [aeiou] ; |
| $consonant = [bcdfghjklmnpqrstvwxyz] ; |
| # U+0304 (combining macron); mapped to and from the prolonged sound mark by the |
| # "$macron <> ー" rule. |
| $macron = \u0304 ; |
| |
| # Variables used for doubled-consonants with tsu |
| |
| # NOTE(review): U+3041-U+3094 is the Hiragana range; in the part of the file |
| # visible here $kana is referenced only by the commented-out iteration TODO rule. |
| $kana = [\u3041-\u3094] ; |
| |
| # Voiced / semi-voiced sound marks: combining form first, spacing form second. |
| $voice = [\u3099\u309B]; |
| $semivoice = [\u309A\u309C]; |
| |
| # $X_start sets: kana (katakana and usually also hiragana forms) whose romaji |
| # begins with consonant X. They are used as the after-context of the |
| # doubled-consonant (ッ) rules. |
| $k_start = [カキクケコかきくけこ] ; |
| |
| $s_start = [サシスセソさしすせそ] ; |
| |
| # シ/し followed by a voiced sound mark. |
| $j_start = [シし] $voice ; |
| |
| $t_start = [タチツテトたちつてと] ; |
| |
| # Also contains the syllabic ン. |
| $n_start = [ナニヌネノンなにぬねの] ; |
| |
| # フ/ふ are left out of $h_start and listed separately in $f_start. |
| $h_start = [ハヒヘホはひへほ] ; |
| $f_start = [フふ] ; |
| |
| $m_start = [マミムメモまみむめも] ; |
| |
| $y_start = [ヤユヨやゆよ] ; |
| |
| $r_start = [ラリルレロらりるれろ] ; |
| |
| $w_start = [ワヰヱヲわゐゑを] ; |
| |
| # One of ワ ヰ ヱ ヲ followed by a literal voiced sound mark (katakana only). |
| $v_start = [ワヰヱヲ]゙ ; |
| |
| # if ン is followed by $n_quoter, then it needs an |
| # apostrophe after its romaji form to disambiguate it. |
| # e.g., ン ア ! = ナ, so represent as "n'a", not "na". |
| |
| # Kana that, when they follow ン, make the reverse rule "n'' < ン } $n_quoter" |
| # emit an apostrophe after the "n". |
| $n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ; |
| |
| # Small kana used as the after-context of the "X | '~' < Xi } $small_y" reverse rules. |
| $small_y = [ャィュェョ] ; |
| |
| # NOTE(review): U+309D is the hiragana iteration mark. In the part of the file |
| # visible here $iteration is referenced only by the commented-out TODO rule; the |
| # live iteration rules match ヽ literally. |
| $iteration = \u309D ; |
| |
| #------------------------------------------------------ |
| # katakana rules |
| |
| # Punctuation |
| |
| # ASCII full stop / comma <-> ideographic full stop / comma, in both directions. |
| '.' <> 。; |
| ',' <> 、; |
| # ' ' } [a-z] > ; # delete spaces before latin |
| # ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana |
| |
| # Iteration Mark |
| # Copy previous letter & marks |
| |
| # TODO |
| # | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration |
| |
| # Specials for katakana -- not shared with hiragana |
| |
| # v + vowel (a, i, e, o) <-> the single katakana letters ヷ ヸ ヹ ヺ. |
| va <> ヷ ; |
| vi <> ヸ ; |
| ve <> ヹ ; |
| vo <> ヺ ; |
| # Small ka / ke use the '~' prefix that marks an explicit small form. |
| '~ka' <> ヵ ; |
| '~ke' <> ヶ ; |
| |
| # ~~~ begin shared rules ~~~ |
| |
| #special |
| |
| # Reverse only: a literal '~' followed by a small kana becomes a plain y-syllable. |
| # The '~' is the one emitted by the "X | '~' < Xi } $small_y" rules further down, |
| # so e.g. キ + ャ ends up as "k" + "ya". |
| ya < '~'ャ; |
| yi < '~'ィ ; |
| yu < '~'ュ; |
| ye < '~'ェ; |
| yo < '~'ョ; |
| |
| #normal |
| |
| # Bare vowel. "<>" rules apply in both directions. |
| a <> ア ; |
| |
| # b-row. Reverse: the i-row kana (written here as ヒ plus a voiced sound mark) |
| # before a small y-kana yields "b" plus '~'; the "|" leaves the cursor before the |
| # '~' so the '~' + small-kana rules finish the syllable. |
| b | '~' < ヒ ゙} $small_y ; |
| # Forward: "by" before a vowel yields ビ and re-feeds '~y' so that the following |
| # vowel produces the small kana. |
| by } $vowel > ビ | '~y' ; |
| |
| ba <> バ ; |
| bi <> ビ ; |
| bu <> ブ ; |
| be <> ベ ; |
| bo <> ボ ; |
| |
| # Forward only: "c" before i or e is rewritten to "s"; the cursor is left before |
| # the "s" so the s-rules process the result. |
| c } i > | s ; |
| c } e > | s ; |
| |
| # d-row. "di" and "du" use デ + small vowel (see the foreign-word table in the |
| # header); ヂ and ヅ are romanized "dji" and "dzu". |
| da <> ダ ; |
| di <> ディ ; |
| du <> デゥ ; |
| de <> デ ; |
| do <> ド ; |
| dzu <> ヅ ; |
| # Reverse only: ヂ + small kana. These come before "dji <> ヂ" so the two-kana |
| # sequences are matched first. |
| dja < ヂャ ; |
| dji'~i' < ヂィ ; # liu |
| dju < ヂュ ; |
| dje < ヂェ ; |
| djo < ヂョ ; |
| dji <> ヂ ; |
| # Forward only: "dj" before a vowel yields ヂ and re-feeds '~y' to build the small kana. |
| dj } $vowel > ヂ | '~y' ; |
| |
| # TODO: QUESTION: use ĵĴżŻ instead of dj, dz |
| |
| # ch-row. Reverse only: チ + small kana, listed before "chi <> チ" so the |
| # two-kana sequences are matched first. |
| cha < チャ ; |
| chi'~i' < チィ ; # liu |
| chu < チュ ; |
| che < チェ ; |
| cho < チョ ; |
| chi <> チ ; |
| # Forward only: "ch" before a vowel yields チ and re-feeds '~y' to build the small kana. |
| ch } $vowel > チ | '~y' ; |
| |
| # Bare vowel, both directions. |
| e <> エ ; |
| |
| # g-row. Reverse: ギ before a small y-kana yields "g" plus '~' (cursor left before |
| # the '~'). Forward: "gy" before a vowel yields ギ and re-feeds '~y'. |
| g | '~' < ギ} $small_y ; |
| gy } $vowel > ギ | '~y' ; |
| |
| ga <> ガ ; |
| gi <> ギ ; |
| gu <> グ ; |
| ge <> ゲ ; |
| go <> ゴ ; |
| |
| # Bare vowel, both directions. |
| i <> イ ; |
| |
| # j } $vowel > ジ | '~y' ; |
| |
| # j-row: ジ + small kana is mapped directly in both directions, except ジィ which |
| # is reverse only. The longer sequences come before "ji <> ジ". |
| ja <> ジャ ; |
| ji'~i' < ジィ ; # liu |
| ju <> ジュ ; |
| je <> ジェ ; |
| jo <> ジョ ; |
| ji <> ジ ; |
| |
| # k-row. Reverse: キ before a small y-kana yields "k" plus '~'; the "|" leaves the |
| # cursor before the '~' so the '~' + small-kana rules finish the syllable. |
| # Forward: "ky" before a vowel yields キ and re-feeds '~y' to build the small kana. |
| k | '~' < キ} $small_y ; |
| ky } $vowel > キ | '~y' ; |
| |
| ka <> カ ; |
| ki <> キ ; |
| ku <> ク ; |
| ke <> ケ ; |
| ko <> コ ; |
| |
| # m-row, same pattern with ミ. |
| m | '~' < ミ} $small_y ; |
| my } $vowel > ミ | '~y' ; |
| |
| ma <> マ ; |
| mi <> ミ ; |
| mu <> ム ; |
| me <> メ ; |
| mo <> モ ; |
| |
| # Forward only: "m" directly before p, b, f or v is written as ン. |
| m } [pbfv] > ン ; |
| |
| # n-row. Reverse: ニ before a small y-kana yields "n" plus '~' (cursor left before |
| # the '~'). Forward: "ny" before a vowel yields ニ and re-feeds '~y'. |
| n | '~' < ニ } $small_y ; |
| ny } $vowel > ニ | '~y' ; |
| |
| na <> ナ ; |
| ni <> ニ ; |
| nu <> ヌ ; |
| ne <> ネ ; |
| no <> ノ ; |
| |
| # Bare vowel, both directions. |
| o <> オ ; |
| |
| # p-row, same pattern with ピ. |
| p | '~' < ピ } $small_y ; |
| py } $vowel > ピ | '~y' ; |
| |
| pa <> パ ; |
| pi <> ピ ; |
| pu <> プ ; |
| pe <> ペ ; |
| po <> ポ ; |
| |
| # h-row. Reverse: ヒ before a small y-kana yields "h" plus '~' (cursor left before |
| # the '~'). Forward: "hy" before a vowel yields ヒ and re-feeds '~y'. |
| h | '~' < ヒ } $small_y ; |
| hy } $vowel > ヒ | '~y' ; |
| |
| # "hu" is ヘ + small ゥ; plain フ is romanized "fu" in the f-row below. |
| ha <> ハ ; |
| hi <> ヒ ; |
| hu <> ヘゥ ; |
| he <> ヘ ; |
| ho <> ホ ; |
| |
| # f | '~' < フ } $small_y ; |
| # f } $vowel > フ | '~' ; |
| |
| # f-row: フ + small vowel for fa/fi/fe/fo, listed before "fu <> フ". |
| fa <> ファ ; |
| fi <> フィ ; |
| fe <> フェ ; |
| fo <> フォ ; |
| fu <> フ ; |
| |
| # r-row. Reverse: リ before a small y-kana yields "r" plus '~' (cursor left before |
| # the '~'). Forward: "ry" before a vowel yields リ and re-feeds '~y'. |
| r | '~' < リ } $small_y ; |
| ry } $vowel > リ | '~y' ; |
| |
| ra <> ラ ; |
| ri <> リ ; |
| ru <> ル ; |
| re <> レ ; |
| ro <> ロ ; |
| |
| # z-row. "zi" is ゼ + small ィ; ジ itself is romanized "ji" by the j-rules. |
| za <> ザ ; |
| zi <> ゼィ ; |
| zu <> ズ ; |
| ze <> ゼ ; |
| zo <> ゾ ; |
| |
| # s-row. "si" is セ + small ィ; シ itself is romanized "shi" below. |
| sa <> サ ; |
| si <> セィ ; |
| su <> ス ; |
| se <> セ ; |
| so <> ソ ; |
| |
| # sh-row. Reverse only: シ + small kana, listed before "shi <> シ" so the |
| # two-kana sequences are matched first. |
| sha < シャ ; |
| shi'~i' < シィ ; # liu |
| shu < シュ ; |
| she < シェ ; |
| sho < ショ ; |
| shi <> シ ; |
| # Forward only: "sh" before a vowel yields シ and re-feeds '~y' to build the small kana. |
| sh } $vowel > シ | '~y' ; |
| |
| # t-row. "ti" and "tu" use テ + small vowel; チ is "chi" (ch-rules) and ツ is "tsu". |
| ta <> タ ; |
| ti <> ティ ; |
| tu <> テゥ ; |
| te <> テ ; |
| to <> ト ; |
| |
| tsu <> ツ ; |
| |
| # v } $vowel > ヴ | '~' ; |
| |
| #'v~a' < ヴァ ; # liu |
| #'v~i' < ヴィ ; # liu |
| #'v~e' < ヴェ ; # liu |
| #'v~o' < ヴォ ; # liu |
| # ヴ is romanized "vu" in both directions. |
| vu <> ヴ ; |
| |
| # Bare vowel, both directions. |
| u <> ウ ; |
| |
| # w } $vowel > ウ | '~' ; |
| |
| # w-row. "wu" is forward only (">"): it produces ウ, which reverses to "u". |
| wa <> ワ ; |
| wi <> ヰ ; |
| wu > ウ ; |
| we <> ヱ ; |
| wo <> ヲ ; |
| |
| # y-row. "yi" and "ye" are forward only: they produce イ / エ, which reverse to "i" / "e". |
| ya <> ヤ ; |
| yi > イ ; |
| yu <> ユ ; |
| ye > エ ; |
| yo <> ヨ ; |
| |
| # double consonants |
| |
| #specials |
| # Forward only: "s" before "sh" and "t" before "ch" are written as small ッ. |
| s } sh > ッ ; |
| t } ch > ッ ; |
| |
| #voiced |
| |
| # Doubled voiced consonants. Forward: a consonant directly before the same |
| # consonant becomes ッ. Reverse: ッ before a kana of the matching set (followed by |
| # a voiced / semi-voiced mark) becomes that consonant. |
| j } j <> ッ } $j_start ; |
| b } b <> ッ } [$h_start$f_start] $voice; |
| d } d <> ッ } $t_start $voice; |
| g } g <> ッ } $k_start $voice; |
| p } p <> ッ } [$h_start$f_start] $semivoice; |
| # v } v <> ッ } [ワヰウヱヲう] $voice ; |
| z } z <> ッ } $s_start $voice; |
| # NOTE(review): $v_start does not contain ウ, so ッ before ヴ is not covered by |
| # this rule in the reverse direction -- confirm that is intended. |
| v } v <> ッ } $v_start; |
| |
| # normal |
| |
| # Doubled unvoiced consonants. Forward: a consonant directly before the same |
| # consonant becomes ッ. Reverse: ッ before a kana of the matching $X_start set |
| # becomes that consonant. |
| k } k <> ッ } $k_start ; |
| m } m <> ッ } $m_start ; |
| n } n <> ッ } $n_start ; |
| h } h <> ッ } $h_start ; |
| f } f <> ッ } $f_start ; |
| r } r <> ッ } $r_start ; |
| t } t <> ッ } $t_start ; |
| s } s <> ッ } $s_start ; |
| |
| w } w <> ッ } $w_start; |
| y } y <> ッ } $y_start; |
| |
| # completeness |
| # Forward only: doubled letters that have no kana row of their own (x, c, l, q) |
| # still produce ッ for the first letter. |
| x } x > ッ ; |
| c } k > ッ ; |
| c } c > ッ ; |
| c } q > ッ ; |
| l } l > ッ ; |
| q } q > ッ ; |
| # y } y > ッ ; |
| # w } w > ッ ; |
| |
| # prolonged vowel mark. this indicates a doubling of |
| # the preceding vowel sound |
| |
| #a < a { ー ; # liu |
| #e < e { ー ; # liu |
| #i < i { ー ; # liu |
| #o < o { ー ; # liu |
| #u < u { ー ; # liu |
| |
| # The combining macron ($macron) and the prolonged sound mark ー map to each |
| # other in both directions. |
| $macron <> ー ; |
| |
| # small forms |
| |
| # Explicit small kana: '~' followed by the romaji of the full-size kana. |
| # '~yi' and '~ye' are forward only; ィ and ェ reverse to '~i' and '~e'. |
| '~a' <> ァ ; |
| '~i' <> ィ ; |
| '~u' <> ゥ ; |
| '~e' <> ェ ; |
| '~o' <> ォ ; |
| '~tsu' <> ッ ; |
| '~wa' <> ヮ ; |
| '~ya' <> ャ ; |
| '~yi' > ィ ; |
| '~yu' <> ュ ; |
| '~ye' > ェ ; |
| '~yo' <> ョ ; |
| |
| # iteration marks |
| # TODO: make more accurate |
| |
| # Reverse-only (Katakana -> Latin) rules. The text left of "{" is context: the |
| # romaji already produced for the preceding syllable. The iteration mark ヽ |
| # (with its voiced sound mark, where required) is replaced by a copy of that |
| # syllable; $1 is the captured "y* vowel" tail. |
| |
| # Voiced mark after an unvoiced digraph: sh -> j, ch -> dj, ts -> dz. |
| j $1 < sh (y* $vowel) {ヽ$voice ; |
| dj $1 < ch (y* $vowel) {ヽ$voice ; |
| dz $1 < ts (y* $vowel) {ヽ$voice ; |
| |
| # Voiced mark after an unvoiced single consonant: k -> g, s -> z, t -> d, |
| # h -> b, w -> v. (The h/b pair was previously written the wrong way round, |
| # which turned "ba" + voiced mark into "ha" and left "ha" unvoiced.) |
| g $1 < k (y* $vowel) {ヽ$voice ; |
| z $1 < s (y* $vowel) {ヽ$voice ; |
| d $1 < t (y* $vowel) {ヽ$voice ; |
| b $1 < h (y* $vowel) {ヽ$voice ; |
| v $1 < w (y* $vowel) {ヽ$voice ; |
| |
| # Voiced mark where the syllable is repeated with the same consonant. |
| # NOTE(review): the sh / ch / ts entries here can never fire because the first |
| # group above matches the same input earlier. |
| sh $1 < sh (y* $vowel) {ヽ$voice ; |
| j $1 < j (y* $vowel) {ヽ$voice ; |
| ch $1 < ch (y* $vowel) {ヽ$voice ; |
| dj $1 < dj(y* $vowel) {ヽ$voice ; |
| ts $1 < ts (y* $vowel) {ヽ$voice ; |
| dz $1 < dz (y* $vowel) {ヽ$voice ; |
| |
| # Generic fallbacks, voiced mark optional: repeat consonant + vowel as is. |
| $1 < ($consonant y* $vowel) {ヽ$voice? ; |
| $1 < (.) {ヽ $voice? ; # otherwise repeat last character |
| < ヽ $voice? ; # delete if no characters found |
| |
| # h- rule: lengthens vowel if not followed by a vowel |
| |
| # Forward only. |
| # NOTE(review): as written, the vowel is the key and "h" is the after-context |
| # (the text after "}"), so a match would replace the vowel itself with ー. The |
| # single vowels are also matched by the earlier a / e / i / o / u rules, so this |
| # rule may never fire -- confirm the intended form. |
| [aeiou] } h > ー ; |
| |
| # one-way latin- > kana rules. these do not occur in |
| # well-formed romaji representing actual japanese text. |
| # their purpose is to make all romaji map to kana of |
| # some sort. |
| |
| # the following are not really necessary, but produce |
| # slightly more natural results. |
| |
| # Forward-only fallbacks for consonant + "y" sequences not consumed by an |
| # earlier rule. |
| cy > セィ ; |
| dy > ディ ; |
| hy > ヒ ; |
| sy > セィ ; |
| ty > ティ ; |
| zy > ゼィ ; |
| |
| # Forward only: an "h" that no earlier rule consumed. |
| h > ヘ ; |
| |
| # isolated consonants listed here so as not to mask |
| # longer rules above. |
| |
| # Forward only: digraphs not followed by a vowel; listed before the |
| # single-letter fallbacks so the two-letter forms are tried first. |
| ch > チ; |
| sh > シ ; |
| dz > ヅ ; |
| dj > ヂ; |
| |
| # Forward only: a lone consonant is written with a default kana. |
| b > ブ ; |
| d > デ ; |
| g > グ ; |
| k > ク ; |
| m > ム ; |
| # Reverse only: ン before a kana in $n_quoter becomes "n" plus an apostrophe |
| # ('' is an escaped single quote). Otherwise n and ン map to each other. |
| n'' < ン } $n_quoter ; |
| n <> ン ; |
| p > プ ; |
| r > ル ; |
| s > ス ; |
| t > テ ; |
| y > イ ; |
| z > ズ ; |
| v > ヴ ; |
| |
| f > フ; |
| j > ジ; |
| w > ウ; |
| |
| # Forward only: special Latin letters are rewritten to plain letters; the "|" |
| # puts the cursor before the replacement so it is processed by the rules above. |
| ß > | ss ; |
| æ > | e ; |
| ð > | d ; |
| ø > | u ; |
| þ > | th ; |
| |
| # simple substitutions using backup |
| |
| # Forward only: letters with no kana row of their own are rewritten to a |
| # substitute; the "|" puts the cursor before the substitute so it is processed |
| # again by the rules above. |
| c > | k ; |
| l > | r ; |
| q > | k ; |
| x > | ks ; |
| |
| # ~~~ END shared rules ~~~ |
| |
| #------------------------------------------------------ |
| # Final cleanup |
| |
| # Forward only: remove helper characters left over after the rules above. |
| '~' > ; # delete stray tildes between letters |
| [:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters |
| # [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use |
| |
| # Forward: recompose (NFC) after the rules have run. Reverse: "::" rules are |
| # applied in the opposite order, so NFD and the parenthesised width conversion |
| # run before the rules. |
| :: NFC (NFD) ; |
| # Reverse only: apply halfwidth-fullwidth to Katakana characters. |
| :: ([:Katakana:] halfwidth-fullwidth); |
| |
| # note: a global filter is more efficient, but MUST include all source chars!! |
| #:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]); |
| # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD |
| # Reverse (Katakana -> Latin) global filter, written inside parentheses: only |
| # characters in this set are handed to the reverse rules. |
| :: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ; |
| |
| # eof |