#-------------------------------------------------------------------- | |
# Copyright (c) 1999-2001, International Business Machines | |
# Corporation and others. All Rights Reserved. | |
#-------------------------------------------------------------------- | |
# Latin-Jamo | |
# Transliteration from Latin characters to Korean script is done in | |
# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul | |
# transliteration is done algorithmically following Unicode 3.0 | |
# section 3.11. This file implements the Latin to Jamo | |
# transliteration using rules. | |
# Jamo occupy the block 1100-11FF. Within this block there are three | |
# groups of characters: initial consonants or choseong (I), medial | |
# vowels or jungseong (M), and trailing consonants or jongseong (F). | |
# Standard Korean syllables are of the form I+M+F*. | |
# Section 3.11 describes the use of 'filler' jamo to convert
# nonstandard syllables to standard form: the choseong filler 115F and
# the jungseong filler 1160. In this transliterator, we will not use
# 115F or 1160.
# We will, however, insert two 'null' jamo to make foreign words | |
# conform to Korean syllable structure. These are the null initial | |
# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text, | |
# we will use the hyphen in order to disambiguate strings, | |
# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G). | |
# We will not use all of the characters in the jamo block. We will | |
# only use the 19 initials, 21 medials, and 27 finals possessing a | |
# jamo short name as defined in section 4.4 of the Unicode book. | |
# Rules of thumb. These guidelines provide the basic framework | |
# for the rules. They are phrased in terms of Latin-Jamo transliteration. | |
# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are | |
# just context-free transliteration of jamo to corresponding short names, | |
# with the addition of hyphens to maintain round-trip integrity | |
# in the context of the Latin-Jamo rules. | |
# A sequence of vowels:
# - Take the longest sequence you can. If there are too many, or you don't
# have a starting consonant, introduce a 110B as necessary.
# A sequence of consonants. | |
# - First join the double consonants: G + G -> GG | |
# - In the remaining list, | |
# -- If there is no preceding vowel, take the first consonant, and insert EU | |
# after it. Continue with the rest of the consonants. | |
# -- If there is one consonant, attach to the following vowel | |
# -- If there are two consonants and a following vowel, attach one to the
# preceding vowel, and one to the following vowel.
# -- If there are more than two consonants, join the first two together if you | |
# can: L + G => LG | |
# -- If you still end up with more than 2 consonants, insert EU after the | |
# first one, and continue with the rest of the consonants. | |
#---------------------------------------------------------------------- | |
# Variables | |
# Some latin consonants or consonant pairs only occur as initials, and | |
# some only as finals, but some occur as both. This makes some jamo | |
# consonants ambiguous when transliterated into latin. | |
# Initial only: IEUNG BB DD JJ R | |
# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ | |
# Initial and Final: B C D G GG H J K M N P S SS T | |
# Initial consonants (choseong), U+1100..U+1112. A trailing 'i' marks a
# consonant whose Latin name is also used for a final (see the 'f'
# variables); names without a suffix occur only as initials.
$Gi = \u1100;
$GGi = \u1101;
$Ni = \u1102;
$Di = \u1103;
$DD = \u1104;
$R = \u1105;
$Mi = \u1106;
$Bi = \u1107;
$BB = \u1108;
$Si = \u1109;
$SSi = \u110A;
$IEUNG = \u110B; # null initial, inserted during Latin-Jamo
$Ji = \u110C;
$JJ = \u110D;
$Ci = \u110E;
$Ki = \u110F;
$Ti = \u1110;
$Pi = \u1111;
$Hi = \u1112;
# Medial vowels (jungseong), U+1161..U+1175.
$A = \u1161;
$AE = \u1162;
$YA = \u1163;
$YAE = \u1164;
$EO = \u1165;
$E = \u1166;
$YEO = \u1167;
$YE = \u1168;
$O = \u1169;
$WA = \u116A;
$WAE = \u116B;
$OE = \u116C;
$YO = \u116D;
$U = \u116E;
$WEO = \u116F;
$WE = \u1170;
$WI = \u1171;
$YU = \u1172;
$EU = \u1173; # null medial, inserted during Latin-Jamo
$YI = \u1174;
$I = \u1175;
# Trailing consonants (jongseong), U+11A8..U+11C2. A trailing 'f' marks a
# consonant whose Latin name is also used for an initial (see the 'i'
# variables); names without a suffix occur only as finals.
$Gf = \u11A8;
$GGf = \u11A9;
$GS = \u11AA;
$Nf = \u11AB;
$NJ = \u11AC;
$NH = \u11AD;
$Df = \u11AE;
$L = \u11AF;
$LG = \u11B0;
$LM = \u11B1;
$LB = \u11B2;
$LS = \u11B3;
$LT = \u11B4;
$LP = \u11B5;
$LH = \u11B6;
$Mf = \u11B7;
$Bf = \u11B8;
$BS = \u11B9;
$Sf = \u11BA;
$SSf = \u11BB;
$NG = \u11BC;
$Jf = \u11BD;
$Cf = \u11BE;
$Kf = \u11BF;
$Tf = \u11C0;
$Pf = \u11C1;
$Hf = \u11C2;
# Any initial jamo (the full range of the 'i' variables, incl. IEUNG)
$jamoInitial = [\u1100-\u1112];
# Any medial jamo
$jamoMedial = [\u1161-\u1175];
# Any latin letter that can begin the transliteration of an initial
$latinInitial = [bcdghjkmnprst];
# Any character in the latin transliteration of a medial
$latinMedial = [aeiouwy];
# The last character of the latin transliteration of a medial
$latinMedialEnd = [aeiou];
#----------------------------------------------------------------------
# Jamo-Latin
# Jamo to latin is relatively simple, since it is the latin that is
# ambiguous. Most rules are straightforward, and we encode them below
# as a simple add-on back rule, e.g.:
# $jamoMedial {bs} > $BS;
# becomes
# $jamoMedial {bs} <> $BS;
# Furthermore, we don't care about the ordering for Jamo-Latin because
# we are going from single characters, so we can very easily piggyback
# on the Latin-Jamo.
# The main issue with Jamo-Latin is when to insert hyphens.
# Hyphens are inserted to obtain correct round trip behavior. For
# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
# would then round trip to Ki A GGi E. To prevent this, we insert a
# hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
# very specifically on the behavior of the Latin-Jamo rules. A change
# in the Latin-Jamo behavior can completely change the way the
# hyphen insertion must be done.
# First try to preserve actual hyphens in the jamo text by doubling
# them. This fixes problems like:
# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
# -- if we don't care about losing hyphens in the jamo, we can delete
# this rule.
'--' <> '-';
# Triple consonants. For three consonants "axxx" we insert a
# hyphen between the first and second "x" if XXf, Xf, and Xi all
# exist, and we have A Xf XXi. This prevents the reverse
# transliteration to A XXf Xi.
# Only 'g' and 's' are listed: GG and SS are the only doubled
# consonants that occur both as initials and as finals.
'-' < $latinMedialEnd g {} $GGi;
'-' < $latinMedialEnd s {} $SSi;
# For vowels the rule is similar. If there is a vowel "ae" such that
# "a" by itself and "e" by itself are vowels, then we want to map A E
# to "a-e" so as not to round trip to AE. However, in the text Ki EO
# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
# tested. NOTE: These rules used to have a left context of
# $latinInitial instead of [^$latinMedial]. The problem with this is
# sequences where an initial IEUNG is transliterated away:
# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
'-' < [^$latinMedial] [y w] e {} [$O $OE];
'-' < [^$latinMedial] e {} [$O $OE $U];
'-' < [^$latinMedial] [o a] {} [$E $EO $EU];
'-' < [^$latinMedial] [w y] a {} [$E $EO $EU];
# Similar to the above, but with an intervening $IEUNG.
'-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];
'-' < [^$latinMedial] e {} $IEUNG [$O $OE $U];
'-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
'-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
# where Xi also exists, must be transliterated as "ax-e" to prevent
# the round trip conversion to A Xi E.
# One rule per latin consonant that names both a final and an initial.
'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;
# Double finals followed by IEUNG. Similar to the single finals
# followed by IEUNG. Any latin consonant pair X Y, between medials,
# that we would split by Latin-Jamo, we must handle when it occurs as
# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
# E.
'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;
'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;
# Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
# we transliterate as "ax-xe" to prevent round trip transliteration as
# A XXi E.
# The left context is the already-emitted latin for A Xf; the right
# context is the not-yet-converted Xi followed by a medial.
'-' < $latinMedialEnd b {} $Bi $jamoMedial;
'-' < $latinMedialEnd d {} $Di $jamoMedial;
'-' < $latinMedialEnd j {} $Ji $jamoMedial;
'-' < $latinMedialEnd g {} $Gi $jamoMedial;
'-' < $latinMedialEnd s {} $Si $jamoMedial;
# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
# "xyy" forms that correspond to XYf Yi must be transliterated as
# "xy-y".
'-' < $latinMedialEnd b s {} [$Si $SSi];
'-' < $latinMedialEnd g s {} [$Si $SSi];
'-' < $latinMedialEnd l b {} [$Bi $BB];
'-' < $latinMedialEnd l g {} [$Gi $GGi];
'-' < $latinMedialEnd l s {} [$Si $SSi];
'-' < $latinMedialEnd n g {} [$Gi $GGi];
'-' < $latinMedialEnd n j {} [$Ji $JJ];
# Deletion of IEUNG is handled below.
#----------------------------------------------------------------------
# Latin-Jamo
# [Basic, context-free Jamo-Latin rules are embedded here too. See
# above.]
# Split digraphs: Text of the form 'axye', where 'xy' is a final
# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
# 'e' are medials, we want to transliterate this as A Xf Yi E rather
# than A XYf IEUNG E. We do NOT include text of the form "axxe",
# since that is handled differently below. These rules are generated
# programmatically from the jamo data.
$jamoMedial {b s} $latinMedial > $Bf $Si;
$jamoMedial {g s} $latinMedial > $Gf $Si;
$jamoMedial {l b} $latinMedial > $L $Bi;
$jamoMedial {l g} $latinMedial > $L $Gi;
$jamoMedial {l h} $latinMedial > $L $Hi;
$jamoMedial {l m} $latinMedial > $L $Mi;
$jamoMedial {l p} $latinMedial > $L $Pi;
$jamoMedial {l s} $latinMedial > $L $Si;
$jamoMedial {l t} $latinMedial > $L $Ti;
$jamoMedial {n g} $latinMedial > $Nf $Gi;
$jamoMedial {n h} $latinMedial > $Nf $Hi;
$jamoMedial {n j} $latinMedial > $Nf $Ji;
# Single consonants are initials: Text of the form 'axe', where 'x'
# can be an initial or a final, and 'a' and 'e' are medials, we want
# to transliterate as A Xi E rather than A Xf IEUNG E.
$jamoMedial {b} $latinMedial > $Bi;
$jamoMedial {c} $latinMedial > $Ci;
$jamoMedial {d} $latinMedial > $Di;
$jamoMedial {g} $latinMedial > $Gi;
$jamoMedial {h} $latinMedial > $Hi;
$jamoMedial {j} $latinMedial > $Ji;
$jamoMedial {k} $latinMedial > $Ki;
$jamoMedial {m} $latinMedial > $Mi;
$jamoMedial {n} $latinMedial > $Ni;
$jamoMedial {p} $latinMedial > $Pi;
$jamoMedial {s} $latinMedial > $Si;
$jamoMedial {t} $latinMedial > $Ti;
# Doubled initials. The sequence "axxe", where XX exists as an initial
# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
# to transliterate as A XXi E, rather than split to A Xf Xi E.
$jamoMedial {b b} $latinMedial > $BB;
$jamoMedial {d d} $latinMedial > $DD;
$jamoMedial {j j} $latinMedial > $JJ;
$jamoMedial {g g} $latinMedial > $GGi;
$jamoMedial {s s} $latinMedial > $SSi;
# XYY. Because doubled consonants bind more strongly than XY
# consonants, we must handle the sequence "axyy" specially. Here XYf
# and YYi must exist. In these cases, we map to Xf YYi rather than
# XYf.
# Only the first consonant is consumed; the doubled pair is right
# context and is left for later rules.
$jamoMedial {b} s s > $Bf;
$jamoMedial {g} s s > $Gf;
$jamoMedial {l} b b > $L;
$jamoMedial {l} g g > $L;
$jamoMedial {l} s s > $L;
$jamoMedial {n} g g > $Nf;
$jamoMedial {n} j j > $Nf;
# Finals: Attach consonant with preceding medial to preceding medial.
# Do this BEFORE mapping consonants to initials. Longer keys must
# precede shorter keys that they start with, e.g., the rule for 'bs'
# must precede 'b'.
# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
# block for Jamo-Latin.]
$jamoMedial {bs} <> $BS;
$jamoMedial {b} <> $Bf;
$jamoMedial {c} <> $Cf;
$jamoMedial {d} <> $Df;
$jamoMedial {gg} <> $GGf;
$jamoMedial {gs} <> $GS;
$jamoMedial {g} <> $Gf;
$jamoMedial {h} <> $Hf;
$jamoMedial {j} <> $Jf;
$jamoMedial {k} <> $Kf;
$jamoMedial {lb} <> $LB;
$jamoMedial {lg} <> $LG;
$jamoMedial {lh} <> $LH;
$jamoMedial {lm} <> $LM;
$jamoMedial {lp} <> $LP;
$jamoMedial {ls} <> $LS;
$jamoMedial {lt} <> $LT;
$jamoMedial {l} <> $L;
$jamoMedial {m} <> $Mf;
$jamoMedial {ng} <> $NG;
$jamoMedial {nh} <> $NH;
$jamoMedial {nj} <> $NJ;
$jamoMedial {n} <> $Nf;
$jamoMedial {p} <> $Pf;
$jamoMedial {ss} <> $SSf;
$jamoMedial {s} <> $Sf;
$jamoMedial {t} <> $Tf;
# Initials: Attach single consonant to following medial. Do this
# AFTER mapping finals. Longer keys must precede shorter keys that
# they start with, e.g., the rule for 'gg' must precede 'g'.
# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
# this block for Jamo-Latin.]
{gg} $latinMedial <> $GGi;
{g} $latinMedial <> $Gi;
{n} $latinMedial <> $Ni;
{dd} $latinMedial <> $DD;
{d} $latinMedial <> $Di;
{r} $latinMedial <> $R;
{m} $latinMedial <> $Mi;
{bb} $latinMedial <> $BB;
{b} $latinMedial <> $Bi;
{ss} $latinMedial <> $SSi;
{s} $latinMedial <> $Si;
{jj} $latinMedial <> $JJ;
{j} $latinMedial <> $Ji;
{c} $latinMedial <> $Ci;
{k} $latinMedial <> $Ki;
{t} $latinMedial <> $Ti;
{p} $latinMedial <> $Pi;
{h} $latinMedial <> $Hi;
# 'r' in final position. Because of the equivalency of the 'l' and
# 'r' jamo (the glyphs are the same), we try to provide the same
# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
# below. If we see an 'r' in an apparent final position, treat it
# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
# Instead, we want Ki A L Ki A.
# The cursor marker '|' before 'l' makes the rewritten 'l' get
# re-processed by the rules.
$jamoMedial {r} $latinInitial > | l;
# Initial + Final: If we match the next rule, we have initial then
# final consonant with no intervening medial. We insert the null
# vowel BEFORE it to create a well-formed syllable. (In the next rule
# we insert a null vowel AFTER an anomalous initial.)
$jamoInitial {} [bcdghjklmnpst] > $EU;
# Initial + X: This block matches an initial consonant not followed by
# a medial. We insert the null vowel after it. We handle double
# initials explicitly here; for single initial consonants we insert EU
# (as Latin) after them and let standard rules do the rest.
# BREAKS ROUND TRIP INTEGRITY
gg > $GGi $EU;
dd > $DD $EU;
bb > $BB $EU;
ss > $SSi $EU;
jj > $JJ $EU;
# Cursor is placed before the consonant so "<consonant>eu" is re-processed.
([bcdghjkmnprst]) > | $1 eu;
# X + Final: Finally we have to deal with a consonant that can only be
# interpreted as a final (not an initial) and which is preceded
# neither by an initial nor a medial. It is the start of the
# syllable, but cannot be. Most of these will already be handled by
# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
# For this isolated case, we could add a null initial and medial,
# which would give "la" => IEUNG EU L IEUNG A, for example. A more
# economical solution is to transliterate isolated "l" (that is,
# initial "l") to "r". (Other similar conversions of consonants that
# occur neither as initials nor as finals are handled below.)
l > | r;
# Medials. If a medial is preceded by an initial, then we proceed
# normally. As usual, longer keys must precede shorter ones.
# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
# this block for Jamo-Latin.]
$jamoInitial {ae} <> $AE;
$jamoInitial {a} <> $A;
$jamoInitial {eo} <> $EO;
$jamoInitial {eu} <> $EU;
$jamoInitial {e} <> $E;
$jamoInitial {i} <> $I;
$jamoInitial {oe} <> $OE;
$jamoInitial {o} <> $O;
$jamoInitial {u} <> $U;
$jamoInitial {wae} <> $WAE;
$jamoInitial {wa} <> $WA;
$jamoInitial {weo} <> $WEO;
$jamoInitial {we} <> $WE;
$jamoInitial {wi} <> $WI;
$jamoInitial {yae} <> $YAE;
$jamoInitial {ya} <> $YA;
$jamoInitial {yeo} <> $YEO;
$jamoInitial {ye} <> $YE;
$jamoInitial {yi} <> $YI;
$jamoInitial {yo} <> $YO;
$jamoInitial {yu} <> $YU;
# We may see an anomalous isolated 'w' or 'y'. In that case, we
# interpret it as 'wi' and 'yu', respectively.
# BREAKS ROUND TRIP INTEGRITY
$jamoInitial {w} > | wi;
$jamoInitial {y} > | yu;
# Otherwise, insert a null consonant IEUNG before the medial (which is
# still an untransliterated latin vowel). The cursor is left after the
# IEUNG so the vowel is then matched by the medial rules.
($latinMedial) > $IEUNG | $1;
# Convert non-jamo latin consonants to equivalents. These occur as
# neither initials nor finals in jamo. 'l' occurs as a final, but not
# an initial; it is handled above. The following letters (left hand
# side) will never be output by Jamo-Latin.
# Each replacement is re-processed (cursor placed before it).
f > | p;
q > | k;
v > | b;
x > | ks;
z > | s;
# Delete hyphens (Latin-Jamo).
'-' > ;
# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
# since these may also occur in text.
< $IEUNG;
# eof