| // -*- Coding: utf-8; -*- |
| //-------------------------------------------------------------------- |
| // Copyright (c) 1999-2001, International Business Machines |
| // Corporation and others. All Rights Reserved. |
| //-------------------------------------------------------------------- |
| // THIS IS A MACHINE-GENERATED FILE |
| // Tool: dumpicurules.bat |
| // Source: ../../text/resources/Transliterator_Latin_Katakana.txt |
| // Date: Mon Dec 3 11:44:30 2001 |
| //-------------------------------------------------------------------- |
| |
| // Latin_Katakana |
| |
| translit_Latin_Katakana { |
| Rule { |
| //-------------------------------------------------------------------- |
| // Copyright (c) 1999-2001, International Business Machines |
| // Corporation and others. All Rights Reserved. |
| //-------------------------------------------------------------------- |
| // $Source: /xsrl/Nsvn/icu/icu/data/Attic/translit_Latin_Katakana.txt,v $ |
| // $Date: 2001/12/03 20:51:19 $ |
| // $Revision: 1.8 $ |
| //-------------------------------------------------------------------- |
| |
| // note: a global filter is more efficient, but MUST include all source chars |
| //:: [\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ; |
| // MINIMAL FILTER GENERATED FOR: Latin-Katakana |
| //## WARNING -- must add width filter, both here and below!!! ### |
| ":: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;" |
| |
| // Pre-processing passes. In a ":: A (B);" line, A is the transform applied |
| // in the forward (Latin -> Katakana) direction and B the one applied in the |
| // reverse direction; empty parentheses mean there is no reverse counterpart. |
| // Forward order: fullwidth Latin to halfwidth, decompose (NFD), lowercase. |
| ":: [:Latin:] fullwidth-halfwidth ();" |
| ":: NFD (NFC);" |
| ":: Lower ();" // whenever transliterating from cased to uncased script, include this |
| // :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese |
| |
| // Uses modified Hepburn. Small changes to make unambiguous. |
| |
| // | Kunrei-shiki: Hepburn/MHepburn |
| // | ------------------------------ |
| // | si: shi |
| // | si ~ya: sha |
| // | si ~yu: shu |
| // | si ~yo: sho |
| // | zi: ji |
| // | zi ~ya: ja |
| // | zi ~yu: ju |
| // | zi ~yo: jo |
| // | ti: chi |
| // | ti ~ya: cha |
| // | ti ~yu: chu |
| // | ti ~yo: cho |
| // | tu: tsu |
| // | di: ji/dji |
| // | du: zu/dzu |
| // | hu: fu |
| |
| // | For foreign words: |
| // | ----------------- |
| // | se ~i si |
| // | si ~e she |
| // | |
| // | ze ~i zi |
| // | zi ~e je |
| // | |
| // | te ~i ti |
| // | ti ~e che |
| // | te ~u tu |
| // | |
| // | de ~i di |
| // | de ~u du |
| // | de ~i di |
| // | |
| // | he ~u: hu |
| // | hu ~a fa |
| // | hu ~i fi |
| // | hu ~e fe |
| // | hu ~o fo |
| |
| // Most small forms are generated, but if necessary |
| // explicit small forms are given with ~a, ~ya, etc. |
| |
| //------------------------------------------------------ |
| // Variables |
| |
| // Character classes for the Latin (romaji) side of the rules. |
| "$vowel = [aeiou] ;" |
| "$consonant = [bcdfghjklmnpqrstvwxyz] ;" |
| // U+0304 COMBINING MACRON; mapped to the prolonged sound mark further down. |
| "$macron = \u0304 ;" |
| |
| // Variables used for doubled-consonants with tsu |
| |
| // U+3041-U+3094 lies in the Hiragana block. |
| // NOTE(review): the only visible use is a commented-out TODO rule -- confirm |
| // whether this variable is still needed in the katakana rule set. |
| "$kana = [\u3041-\u3094] ;" |
| |
| // Voiced sound marks: U+3099 (combining) and U+309B (spacing). |
| "$voice = [\u3099\u309B];" |
| // Semi-voiced sound marks: U+309A (combining) and U+309C (spacing). |
| "$semivoice = [\u309A\u309C];" |
| |
| "$k_start = [カキクケコかきくけこ] ;" |
| |
| "$s_start = [サシスセソさしすせそ] ;" |
| |
| "$j_start = [シし] $voice ;" |
| |
| "$t_start = [タチツテトたちつてと] ;" |
| |
| "$n_start = [ナニヌネノンなにぬねの] ;" |
| |
| "$h_start = [ハヒヘホはひへほ] ;" |
| "$f_start = [フふ] ;" |
| |
| "$m_start = [マミムメモまみむめも] ;" |
| |
| "$y_start = [ヤユヨやゆよ] ;" |
| |
| "$r_start = [ラリルレロらりるれろ] ;" |
| |
| "$w_start = [ワヰヱヲわゐゑを] ;" |
| |
| "$v_start = [ワヰヱヲ]゙ ;" |
| |
| // if ン is followed by $n_quoter, then it needs an |
| // apostrophe after its romaji form to disambiguate it. |
| // e.g., ン ア != ナ, so represent as "n'a", not "na". |
| |
| "$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;" |
| |
| "$small_y = [ャィュェョ] ;" |
| |
| "$iteration = \u309D ;" |
| |
| //------------------------------------------------------ |
| // katakana rules |
| |
| // Punctuation |
| |
| "'.' <> 。;" |
| "',' <> 、;" |
| // ' ' } [a-z] > ; # delete spaces before latin |
| // ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana |
| |
| // Iteration Mark |
| // Copy previous letter & marks |
| |
| // TODO |
| // | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration |
| |
| // Specials for katakana -- not shared with hiragana |
| |
| "va <> ヷ ;" |
| "vi <> ヸ ;" |
| "ve <> ヹ ;" |
| "vo <> ヺ ;" |
| "'~ka' <> ヵ ;" |
| "'~ke' <> ヶ ;" |
| |
| // ~~~ begin shared rules ~~~ |
| |
| //special |
| |
| "ya < '~'ャ;" |
| "yi < '~'ィ ;" |
| "yu < '~'ュ;" |
| "ye < '~'ェ;" |
| "yo < '~'ョ;" |
| |
| //normal |
| |
| "a <> ア ;" |
| |
| "b | '~' < ヒ ゙} $small_y ;" |
| "by } $vowel > ビ | '~y' ;" |
| |
| "ba <> バ ;" |
| "bi <> ビ ;" |
| "bu <> ブ ;" |
| "be <> ベ ;" |
| "bo <> ボ ;" |
| |
| "c } i > | s ;" |
| "c } e > | s ;" |
| |
| "da <> ダ ;" |
| "di <> ディ ;" |
| "du <> デゥ ;" |
| "de <> デ ;" |
| "do <> ド ;" |
| "dzu <> ヅ ;" |
| "dja < ヂャ ;" |
| "dji'~i' < ヂィ ;" // liu |
| "dju < ヂュ ;" |
| "dje < ヂェ ;" |
| "djo < ヂョ ;" |
| "dji <> ヂ ;" |
| "dj } $vowel > ヂ | '~y' ;" |
| |
| // TODO: QUESTION: use ĵĴżŻ instead of dj, dz |
| |
| "cha < チャ ;" |
| "chi'~i' < チィ ;" // liu |
| "chu < チュ ;" |
| "che < チェ ;" |
| "cho < チョ ;" |
| "chi <> チ ;" |
| "ch } $vowel > チ | '~y' ;" |
| |
| "e <> エ ;" |
| |
| "g | '~' < ギ} $small_y ;" |
| "gy } $vowel > ギ | '~y' ;" |
| |
| "ga <> ガ ;" |
| "gi <> ギ ;" |
| "gu <> グ ;" |
| "ge <> ゲ ;" |
| "go <> ゴ ;" |
| |
| "i <> イ ;" |
| |
| // j } $vowel > ジ | '~y' ; |
| |
| "ja <> ジャ ;" |
| "ji'~i' < ジィ ;" // liu |
| "ju <> ジュ ;" |
| "je <> ジェ ;" |
| "jo <> ジョ ;" |
| "ji <> ジ ;" |
| |
| "k | '~' < キ} $small_y ;" |
| "ky } $vowel > キ | '~y' ;" |
| |
| "ka <> カ ;" |
| "ki <> キ ;" |
| "ku <> ク ;" |
| "ke <> ケ ;" |
| "ko <> コ ;" |
| |
| "m | '~' < ミ} $small_y ;" |
| "my } $vowel > ミ | '~y' ;" |
| |
| "ma <> マ ;" |
| "mi <> ミ ;" |
| "mu <> ム ;" |
| "me <> メ ;" |
| "mo <> モ ;" |
| |
| "m } [pbfv] > ン ;" |
| |
| "n | '~' < ニ } $small_y ;" |
| "ny } $vowel > ニ | '~y' ;" |
| |
| "na <> ナ ;" |
| "ni <> ニ ;" |
| "nu <> ヌ ;" |
| "ne <> ネ ;" |
| "no <> ノ ;" |
| |
| "o <> オ ;" |
| |
| "p | '~' < ピ } $small_y ;" |
| "py } $vowel > ピ | '~y' ;" |
| |
| "pa <> パ ;" |
| "pi <> ピ ;" |
| "pu <> プ ;" |
| "pe <> ペ ;" |
| "po <> ポ ;" |
| |
| "h | '~' < ヒ } $small_y ;" |
| "hy } $vowel > ヒ | '~y' ;" |
| |
| "ha <> ハ ;" |
| "hi <> ヒ ;" |
| "hu <> ヘゥ ;" |
| "he <> ヘ ;" |
| "ho <> ホ ;" |
| |
| // f | '~' < フ } $small_y ; |
| // f } $vowel > フ | '~' ; |
| |
| "fa <> ファ ;" |
| "fi <> フィ ;" |
| "fe <> フェ ;" |
| "fo <> フォ ;" |
| "fu <> フ ;" |
| |
| "r | '~' < リ } $small_y ;" |
| "ry } $vowel > リ | '~y' ;" |
| |
| "ra <> ラ ;" |
| "ri <> リ ;" |
| "ru <> ル ;" |
| "re <> レ ;" |
| "ro <> ロ ;" |
| |
| "za <> ザ ;" |
| "zi <> ゼィ ;" |
| "zu <> ズ ;" |
| "ze <> ゼ ;" |
| "zo <> ゾ ;" |
| |
| "sa <> サ ;" |
| "si <> セィ ;" |
| "su <> ス ;" |
| "se <> セ ;" |
| "so <> ソ ;" |
| |
| "sha < シャ ;" |
| "shi'~i' < シィ ;" // liu |
| "shu < シュ ;" |
| "she < シェ ;" |
| "sho < ショ ;" |
| "shi <> シ ;" |
| "sh } $vowel > シ | '~y' ;" |
| |
| "ta <> タ ;" |
| "ti <> ティ ;" |
| "tu <> テゥ ;" |
| "te <> テ ;" |
| "to <> ト ;" |
| |
| "tsu <> ツ ;" |
| |
| // v } $vowel > ヴ | '~' ; |
| |
| //'v~a' < ヴァ ; # liu |
| //'v~i' < ヴィ ; # liu |
| //'v~e' < ヴェ ; # liu |
| //'v~o' < ヴォ ; # liu |
| "vu <> ヴ ;" |
| |
| "u <> ウ ;" |
| |
| // w } $vowel > ウ | '~' ; |
| |
| "wa <> ワ ;" |
| "wi <> ヰ ;" |
| "wu > ウ ;" |
| "we <> ヱ ;" |
| "wo <> ヲ ;" |
| |
| "ya <> ヤ ;" |
| "yi > イ ;" |
| "yu <> ユ ;" |
| "ye > エ ;" |
| "yo <> ヨ ;" |
| |
| // double consonants |
| // |
| // A doubled consonant is written with small tsu (ッ) followed by the kana |
| // for the second consonant. In "x } y <> ッ } $start": |
| //   forward: an x that is followed by y becomes ッ (the y is only context |
| //            and is left for the syllable rules); |
| //   reverse: a ッ that is followed by a kana from $start becomes x. |
| // Rules written with ">" only apply in the forward direction. |
| |
| //specials |
| // forward only: the first consonant of "ssh" / "tch" becomes ッ |
| "s } sh > ッ ;" |
| "t } ch > ッ ;" |
| |
| //voiced |
| // These come before the unvoiced rules below: their reverse context is a |
| // base kana plus a (semi-)voiced mark, which the unvoiced rule for the same |
| // base kana would otherwise match first. |
| |
| "j } j <> ッ } $j_start ;" |
| "b } b <> ッ } [$h_start$f_start] $voice;" |
| "d } d <> ッ } $t_start $voice;" |
| "g } g <> ッ } $k_start $voice;" |
| "p } p <> ッ } [$h_start$f_start] $semivoice;" |
| // v } v <> ッ } [ワヰウヱヲう] $voice ; |
| "z } z <> ッ } $s_start $voice;" |
| "v } v <> ッ } $v_start;" |
| |
| // normal |
| |
| "k } k <> ッ } $k_start ;" |
| "m } m <> ッ } $m_start ;" |
| "n } n <> ッ } $n_start ;" |
| "h } h <> ッ } $h_start ;" |
| "f } f <> ッ } $f_start ;" |
| "r } r <> ッ } $r_start ;" |
| "t } t <> ッ } $t_start ;" |
| "s } s <> ッ } $s_start ;" |
| |
| "w } w <> ッ } $w_start;" |
| "y } y <> ッ } $y_start;" |
| |
| // completeness |
| // forward only: doubled letters that have no kana row of their own |
| "x } x > ッ ;" |
| "c } k > ッ ;" |
| "c } c > ッ ;" |
| "c } q > ッ ;" |
| "l } l > ッ ;" |
| "q } q > ッ ;" |
| // y } y > ッ ; |
| // w } w > ッ ; |
| |
| // prolonged vowel mark. this indicates a doubling of |
| // the preceding vowel sound |
| |
| //a < a { ー ; # liu |
| //e < e { ー ; # liu |
| //i < i { ー ; # liu |
| //o < o { ー ; # liu |
| //u < u { ー ; # liu |
| |
| "$macron <> ー ;" |
| |
| // small forms |
| |
| "'~a' <> ァ ;" |
| "'~i' <> ィ ;" |
| "'~u' <> ゥ ;" |
| "'~e' <> ェ ;" |
| "'~o' <> ォ ;" |
| "'~tsu' <> ッ ;" |
| "'~wa' <> ヮ ;" |
| "'~ya' <> ャ ;" |
| "'~yi' > ィ ;" |
| "'~yu' <> ュ ;" |
| "'~ye' > ェ ;" |
| "'~yo' <> ョ ;" |
| |
| // iteration marks |
| // TODO: make more accurate |
| // |
| // Reverse-only rules (Katakana -> Latin). The text before "{" is context: |
| // the already-romanized previous syllable. The iteration mark ヽ repeats |
| // that syllable; when it carries a voiced mark ($voice) the repeated |
| // syllable uses the voiced consonant, and $1 re-emits the captured |
| // "y* vowel" tail. |
| |
| // Digraph rows first, so the single-letter rules below (whose context "s", |
| // "t" or "h" is a suffix of "sh"/"ts"/"ch") cannot match them by accident. |
| "j $1 < sh (y* $vowel) {ヽ$voice ;" |
| "dj $1 < ch (y* $vowel) {ヽ$voice ;" |
| "dz $1 < ts (y* $vowel) {ヽ$voice ;" |
| |
| // Unvoiced consonant -> voiced counterpart. |
| "g $1 < k (y* $vowel) {ヽ$voice ;" |
| "z $1 < s (y* $vowel) {ヽ$voice ;" |
| "d $1 < t (y* $vowel) {ヽ$voice ;" |
| // FIX: was "h $1 < b ...", i.e. reversed relative to every sibling rule. |
| // Voicing an h-row syllable gives the b-row (ha + voiced repeat = "haba"). |
| // A preceding b-row syllable is handled by the generic repeat rule below. |
| // This file is machine-generated; apply the same fix to the source rules. |
| "b $1 < h (y* $vowel) {ヽ$voice ;" |
| "v $1 < w (y* $vowel) {ヽ$voice ;" |
| |
| // Already-voiced (or explicitly listed) consonants repeat unchanged. |
| // NOTE(review): the sh/ch/ts lines have the same match pattern as the |
| // digraph rules at the top of this section and so appear to be shadowed |
| // by them -- confirm whether they were meant for the unvoiced mark. |
| "sh $1 < sh (y* $vowel) {ヽ$voice ;" |
| "j $1 < j (y* $vowel) {ヽ$voice ;" |
| "ch $1 < ch (y* $vowel) {ヽ$voice ;" |
| "dj $1 < dj(y* $vowel) {ヽ$voice ;" |
| "ts $1 < ts (y* $vowel) {ヽ$voice ;" |
| "dz $1 < dz (y* $vowel) {ヽ$voice ;" |
| |
| // Generic fallback: repeat the previous consonant + vowel syllable as is. |
| "$1 < ($consonant y* $vowel) {ヽ$voice? ;" |
| "$1 < (.) {ヽ $voice? ;" // otherwise repeat last character |
| "< ヽ $voice? ;" // delete if no characters found |
| |
| // h- rule: lengthens vowel if not followed by a vowel |
| |
| "[aeiou] } h > ー ;" |
| |
| // one-way latin- > kana rules. these do not occur in |
| // well-formed romaji representing actual japanese text. |
| // their purpose is to make all romaji map to kana of |
| // some sort. |
| |
| // the following are not really necessary, but produce |
| // slightly more natural results. |
| |
| "cy > セィ ;" |
| "dy > ディ ;" |
| "hy > ヒ ;" |
| "sy > セィ ;" |
| "ty > ティ ;" |
| "zy > ゼィ ;" |
| |
| "h > ヘ ;" |
| |
| // isolated consonants listed here so as not to mask |
| // longer rules above. |
| |
| "ch > チ;" |
| "sh > シ ;" |
| "dz > ヅ ;" |
| "dj > ヂ;" |
| |
| "b > ブ ;" |
| "d > デ ;" |
| "g > グ ;" |
| "k > ク ;" |
| "m > ム ;" |
| "n'' < ン } $n_quoter ;" |
| "n <> ン ;" |
| "p > プ ;" |
| "r > ル ;" |
| "s > ス ;" |
| "t > テ ;" |
| "y > イ ;" |
| "z > ズ ;" |
| "v > ヴ ;" |
| |
| "f > フ;" |
| "j > ジ;" |
| "w > ウ;" |
| |
| // Forward-only rewrites of Latin letters that have no kana rule of their |
| // own. The "|" in "> | text" places the cursor before the inserted text, |
| // so the replacement is itself run through the rules. |
| "ß > | ss ;" |
| "æ > | e ;" |
| "ð > | d ;" |
| "ø > | u ;" |
| "þ > | th ;" |
| |
| // simple substitutions using backup |
| // (same cursor-before-replacement form: c, l, q, x are re-read as k, r, k, ks) |
| |
| "c > | k ;" |
| "l > | r ;" |
| "q > | k ;" |
| "x > | ks ;" |
| |
| // ~~~ END shared rules ~~~ |
| |
| //------------------------------------------------------ |
| // Final cleanup |
| // Forward-only rules that remove helper characters left in the output, |
| // followed by the closing normalization / width passes. |
| |
| // No context is given, so every tilde that survived the rules above is removed. |
| "'~' > ;" // delete stray tildes between letters |
| // '' is an escaped apostrophe: it is removed only when it sits between a |
| // Katakana character (before) and a Latin letter (after). |
| "[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters |
| // [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use |
| |
| // Forward: recompose to NFC. Reverse: decompose to NFD before the rules run. |
| ":: NFC (NFD) ;" |
| // Reverse only: halfwidth-fullwidth, filtered to Katakana characters. |
| ":: ([:Katakana:] halfwidth-fullwidth);" |
| |
| // note: a global filter is more efficient, but MUST include all source chars!! |
| //:: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]); |
| // MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD |
| ":: ( [[\\\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;" |
| |
| // eof |
| } |
| } |