| #-------------------------------------------------------------------- |
| # Copyright (c) 1999-2001, International Business Machines |
| # Corporation and others. All Rights Reserved. |
| #-------------------------------------------------------------------- |
| # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Greek_Latin.txt,v $ |
| # $Date: 2002/07/21 08:39:23 $ |
| # $Revision: 1.21 $ |
| #-------------------------------------------------------------------- |
| |
| # Rules are predicated on running NFD first, and NFC afterwards |
| # :: [\u0000-\u007F \u0370-\u03FF [:Greek:] [:nonspacing mark:]] ; |
| # MINIMAL FILTER GENERATED FOR: Greek-Latin |
| # NOTE: ':' and '?' are added by hand to the generated set. The punctuation |
| # rules (\: <> \: $under and \? <> \? $under) key on these two characters in |
| # the Greek-to-Latin direction; a character that the filter excludes is never |
| # matched as a rule key, so without them an original ':' or '?' is not tagged |
| # and does not survive a round trip. |
| :: [;\:\?\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] ; |
| |
| :: NFD (NFC) ; |
| |
| # TEST CASES |
| |
| # Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος |
| # ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ |
| # ᾳ ῃ ῳ ὃ ὄ |
| # ὠς ὡς ὢς ὣς |
| # Ὠς Ὡς Ὢς Ὣς |
| # ὨΣ ὩΣ ὪΣ ὫΣ |
| # Ạ, ạ, Ẹ, ẹ, Ọ, ọ |
| |
| # Useful variables: character sets and combining-mark shortcuts referenced by the rules below |
| |
| $lower = [[:latin:][:greek:] & [:Ll:]]; |
| $glower = [[:greek:] & [:Ll:]]; |
| $upper = [[:latin:][:greek:] & [:Lu:]] ; |
| $accent = [:M:] ; |
| |
| # NOTE: restrict $accentMinus to just the Greek & Latin accents that we care about (marks in U+0300-U+0345, minus U+0338) |
| # TODO: broaden out once iteration is fixed |
| $accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ; |
| |
| $macron = \u0304 ; |
| $ddot = \u0308 ; |
| $ddotmac = [$ddot$macron]; |
| |
| $lcgvowel = [αεηιουω] ; |
| $ucgvowel = [ΑΕΗΙΟΥΩ] ; |
| $gvowel = [$lcgvowel $ucgvowel] ; |
| $lcgvowelC = [$lcgvowel $accent] ; |
| |
| $evowel = [aeiouyAEIOUY]; |
| $evowel2 = [iuyIUY]; |
| $vowel = [ $evowel $gvowel] ; |
| |
| $gammaLike = [ΓΚΞΧγκξχϰ] ; |
| $egammaLike = [GKXCgkxc] ; |
| $smooth = ̓ ; |
| $rough = ̔ ; |
| $iotasub = ͅ ; |
| |
| $evowel_i = [$evowel-[iI]] ; |
| $evowel2_i = [uyUY]; |
| |
| $underbar = \u0331; |
| |
| $afterLetter = [:L:] [[:M:]\']* ; |
| $beforeLetter = [[:M:]\']* [:L:] ; |
| $beforeLower = $accent * $lower ; |
| |
| $notLetter = [^[:L:][:M:]] ; |
| $under = ̱; |
| |
| # Fix punctuation: Greek ';' pairs with Latin '?', and Greek '·' pairs with Latin ':' |
| # preserve an original ':' or '?' by tagging it with $under going to Latin; the tag is stripped again going back |
| \: <> \: $under ; |
| \? <> \? $under ; |
| |
| \; <> \? ; |
| · <> \: ; |
| |
| # CIRCUMFLEX: map the Greek circumflex (U+0342) to the ordinary combining circumflex (U+0302), and back. Could use tilde or inverted breve instead |
| |
| \u0342 <> \u0302 ; |
| |
| # IOTA: convert iota subscript ($iotasub) to a full letter i/I going to Latin, and back to $iotasub in reverse |
| # first make previous alpha long: add $macron after Α/α when $iotasub follows; '|' puts the cursor back before the alpha, and $accent_minus excludes $macron so the rule cannot match a second time |
| |
| $accent_minus = [[$accent]-[$iotasub$macron]]; |
| |
| Α } $accent_minus * $iotasub > | Α $macron ; |
| α } $accent_minus * $iotasub > | α $macron ; |
| |
| # now convert to uppercase I if after an uppercase letter (plus any accents), otherwise to lowercase i |
| |
| $upper $accent * { $iotasub > I ; |
| $iotasub > i ; |
| |
| | $1 $iotasub < ($evowel $macron $accentMinus *) i ; |
| | $1 $iotasub < ($evowel $macron $accentMinus *) I ; |
| |
| # BREATHING (rough breathing first; smooth breathing follows in its own section) |
| |
| # Convert rough breathing ($rough) to h/H, and move it before the vowel letters it follows. |
| |
| # Make A ` x => H a x: uppercase vowel + rough, followed by a lowercase letter, becomes H + the lowercased vowel; '|' leaves the cursor before that vowel |
| |
| Α ($macron?) $rough } $beforeLower > H | α $1; |
| Ε $rough } $beforeLower > H | ε; |
| Η $rough } $beforeLower > H | η ; |
| Ι ($ddot?) $rough } $beforeLower > H | ι $1; |
| Ο $rough } $beforeLower > H | ο ; |
| Υ $rough } $beforeLower > H | υ ; |
| Ω ($ddot?) $rough } $beforeLower > H | ω $1; |
| |
| # Make A x ` => H a x: same result when the rough breathing follows a second, lowercase Greek letter ($glower) |
| |
| Α ($glower $macron?) $rough > H | α $1 ; |
| Ε ($glower) $rough > H | ε $1 ; |
| Η ($glower) $rough > H | η $1 ; |
| Ι ($glower $ddot?) $rough > H | ι $1 ; |
| Ο ($glower) $rough > H | ο $1 ; |
| Υ ($glower) $rough > H | υ $1 ; |
| Ω ($glower $ddot?) $rough > H | ω $1 ; |
| |
| # Otherwise, make x ` into h x and X ` into H X (the lowercase-only rule is tried first; '|' leaves the cursor before the vowels) |
| |
| ($lcgvowel + $ddotmac? ) $rough > h | $1 ; |
| ($gvowel + $ddotmac? ) $rough > H | $1 ; |
| |
| # Go backwards with H: going to Greek, drop h/H and put $rough after the vowel group that follows it; two-vowel patterns are listed before the single-vowel one |
| |
| | $1 $rough < h ($evowel $macron $ddot? $evowel2_i $macron?) ; |
| | $1 $rough < h ($evowel $ddot? $evowel2 $macron?) ; |
| | $1 $rough < h ($evowel $macron? $ddot?) ; |
| |
| | $1 $rough < H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ; |
| | $1 $rough < H ([AEIOUY] $ddot? $evowel2 $macron?) ; |
| | $1 $rough < H ([AEIOUY] $macron? $ddot?) ; |
| |
| # titlecase (H + lowercase vowel), have to fix individually: each rule writes the vowel back in uppercase |
| # in the future, we should add &uppercase() to make this easier |
| |
| | A $1 $rough < H a ($macron $ddot? $evowel2_i $macron?) ; |
| | E $1 $rough < H e ($macron $ddot? $evowel2_i $macron?) ; |
| | I $1 $rough < H i ($macron $ddot? $evowel2_i $macron?) ; |
| | O $1 $rough < H o ($macron $ddot? $evowel2_i $macron?) ; |
| | U $1 $rough < H u ($macron $ddot? $evowel2_i $macron?) ; |
| | Y $1 $rough < H y ($macron $ddot? $evowel2_i $macron?) ; |
| |
| | A $1 $rough < H a ($ddot? $evowel2 $macron?) ; |
| | E $1 $rough < H e ($ddot? $evowel2 $macron?) ; |
| | I $1 $rough < H i ($ddot? $evowel2 $macron?) ; |
| | O $1 $rough < H o ($ddot? $evowel2 $macron?) ; |
| | U $1 $rough < H u ($ddot? $evowel2 $macron?) ; |
| | Y $1 $rough < H y ($ddot? $evowel2 $macron?) ; |
| |
| | A $1 $rough < H a ($macron? $ddot? ) ; |
| | E $1 $rough < H e ($macron? $ddot? ) ; |
| | I $1 $rough < H i ($macron? $ddot? ) ; |
| | O $1 $rough < H o ($macron? $ddot? ) ; |
| | U $1 $rough < H u ($macron? $ddot? ) ; |
| | Y $1 $rough < H y ($macron? $ddot? ) ; |
| |
| # Now do smooth breathing ($smooth) |
| |
| # delete smooth breathing when going to Latin |
| $smooth > ; |
| |
| # insert when going to Greek: after a non-letter ($notLetter), add $smooth to r/R not followed by h/H, or to an initial vowel group, unless a breathing mark already follows |
| # the assumption is that all Marks are on letters. |
| |
| | $1 $smooth < $notLetter { ([rR]) } [^hH$smooth$rough] ; |
| | $1 $smooth < $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ; |
| | $1 $smooth < $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ; |
| |
| # TODO: preserve smooth/rough breathing if it is not |
| # on the initial vowel sequence |
| |
| # need to have these up here so later rules don't mask them (presumably the shorter single-letter rules under NORMAL, e.g. for e, o, p) |
| |
| # remove now superfluous macron when returning (Latin-to-Greek only): A/a + $macron becomes plain Α/α |
| |
| Α < A $macron ; |
| α < a $macron ; |
| |
| η <> e $macron ; |
| Η <> E $macron ; |
| |
| φ <> ph ; |
| Ψ } $beforeLower <> Ps ; |
| Ψ <> PS ; |
| |
| Φ } $beforeLower <> Ph ; |
| Φ <> PH ; |
| ψ <> ps ; |
| |
| ω <> o $macron ; |
| Ω <> O $macron; |
| |
| # NORMAL: letter-by-letter correspondences; context-restricted rules are listed before the plain rule for the same letter |
| |
| α <> a ; |
| Α <> A ; |
| |
| β <> b ; |
| Β <> B ; |
| |
| γ } $gammaLike <> n } $egammaLike ; |
| γ <> g ; |
| Γ } $gammaLike <> N } $egammaLike ; |
| Γ <> G ; |
| |
| δ <> d ; |
| Δ <> D ; |
| |
| ε <> e ; |
| Ε <> E ; |
| |
| ζ <> z ; |
| Ζ <> Z ; |
| |
| θ <> th ; |
| Θ } $beforeLower <> Th ; |
| Θ <> TH ; |
| |
| ι <> i ; |
| Ι <> I ; |
| |
| κ <> k ; |
| Κ <> K ; |
| |
| λ <> l ; |
| Λ <> L ; |
| |
| μ <> m ; |
| Μ <> M ; |
| |
| ν } $gammaLike > n\' ; |
| ν <> n ; |
| Ν } $gammaLike <> N\' ; |
| Ν <> N ; |
| |
| ξ <> x ; |
| Ξ <> X ; |
| |
| ο <> o ; |
| Ο <> O ; |
| |
| π <> p ; |
| Π <> P ; |
| |
| ρ $rough <> rh; |
| Ρ $rough } $beforeLower <> Rh ; |
| Ρ $rough <> RH ; |
| ρ <> r ; |
| Ρ <> R ; |
| |
| # insert separator: going to Latin, put ' between p/P and a following ς or σ (presumably so it is not read back as the ps of ψ) |
| |
| [Pp] { } ς > \' ; |
| [Pp] { } σ > \' ; |
| |
| # underbar means exception: ς before a letter, or σ after a letter with none following, is written s + $underbar |
| |
| # before a letter, initial (σ is the plain form here) |
| ς } $beforeLetter <> s $underbar } $beforeLetter; |
| σ } $beforeLetter <> s } $beforeLetter; |
| |
| # otherwise, after a letter = final (ς is the plain form here) |
| $afterLetter { σ <> $afterLetter { s $underbar; |
| $afterLetter { ς <> $afterLetter { s ; |
| |
| # otherwise (isolated) = initial, so σ is again the plain form |
| ς <> s $underbar; |
| σ <> s ; |
| |
| [Pp] { Σ <> \'S ; |
| Σ <> S ; |
| |
| τ <> t ; |
| Τ <> T ; |
| |
| $vowel {υ } <> u ; |
| υ <> y ; |
| $vowel { Υ <> U ; |
| Υ <> Y ; |
| |
| χ <> ch ; |
| Χ } $beforeLower <> Ch ; |
| Χ <> CH ; |
| |
| # Completeness for ASCII: going to Greek, Latin c f j q v w (and capitals) are rewritten to Latin spellings with '|' placed before them; leftover h/H and $rough are paired up last |
| |
| $ignore = [[:Mark:]''] * ; |
| |
| | k < c ; |
| | ph < f ; |
| | i < j ; |
| | k < q ; |
| | b < v } $vowel ; |
| | b < w } $vowel; |
| | u < v ; |
| | u < w; |
| | K < C ; |
| | Ph < F ; |
| | I < J ; |
| | K < Q ; |
| | B < V } $vowel ; |
| | B < W } $vowel ; |
| | U < V ; |
| | U < W ; |
| |
| $rough } $ignore [:UppercaseLetter:] > H ; |
| $ignore [:UppercaseLetter:] { $rough > H ; |
| $rough < H ; |
| $rough <> h ; |
| |
| # Completeness for Greek (Greek-to-Latin only): most variant letter forms are replaced by an ordinary Greek letter with '|' placed before it; ϳ and ͺ go straight to j and i |
| |
| ϐ > | β ; |
| ϑ > | θ ; |
| ϒ > | Υ ; |
| ϕ > | φ ; |
| ϖ > | π ; |
| |
| ϰ > | κ ; |
| ϱ > | ρ ; |
| ϲ > | σ ; |
| ϳ > j ; |
| ϴ > | Θ ; |
| ϵ > | ε ; |
| |
| µ > | μ ; |
| |
| ͺ > i; |
| |
| # delete any trailing ' marks used for roundtripping (Latin-to-Greek only): after Π/π before s/S, and after Ν/ν before $egammaLike |
| |
| < [Ππ] { \' } [Ss] ; |
| < [Νν] { \' } $egammaLike ; |
| |
| ::NFC (NFD) ; |
| # ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ; |
| # ([\u0000-\u007F \u00B7 [:Latin:] [:nonspacing mark:]]) ; |
| # MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD (the parenthesized set below is the filter for the reverse direction) |
| :: ( [':?A-Za-z\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0300-\u0337\u0339-\u0345\u0385-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u1E00-\u1E99\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FC1-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u1FF2-\u1FF4\u1FF6-\u1FFC\u212A-\u212B] ) ; |