# blob: 6ef8ff6c82bcb0794a1ad07e41f4cc8a64a2cd63 [file] [log] [blame]
#--------------------------------------------------------------------
# Copyright (c) 1999-2001, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_ThaiLogical_Latin.txt,v $
# $Date: 2002/07/26 19:56:55 $
# $Revision: 1.5 $
#--------------------------------------------------------------------
# Thai-Latin
# This set of rules follows ISO 11940
# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
# except that the standard does not mention an implicit vowel, so we use ọ
#
# The transcription is fairly ugly, so we ought to also do the UNGEGN version
# see: http://www.eki.ee/wgrs/rom1_th.pdf
# and probably make that the main variant.
# Note: this is an internal file. The NFD/NFC is handled externally, in the index
# The insertion of spaces between words, the reversal of the vowels
# and the conversion of space to semicolon are done *outside* of these rules.
# So as far as these rules are concerned, the vowels are in logical order!
# insert implicit vowel (and remove it going the other way)
# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
#$consonant = [ก-ฮ];
#$vowel = [ะ-ฺเ-ไ็];
#{ ( $consonant ) } [^$vowel ] > | $1  ;
# > ọ ;
# < ọ ;
# Sets of combining marks that can sit between a base letter and a later
# accent. Each set excludes characters with combining class 0 and excludes
# marks of the named combining class (above / below).
# $notAbove is used in the "backward case" rules to capture marks that appear
# between a Latin base letter and its accent above.
# $notBelow is only referenced from a comment in the portion of the file
# visible here.
$notAbove = [^\p{ccc=0}\p{ccc=above}] ;
$notBelow = [^\p{ccc=0}\p{ccc=below}] ;
# Consonants
# Warning: the 'h's need to be handled carefully!
# What we really want to say is the following, but we can't
# $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ;
# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
# Helper sets used as right-hand context on the bare "kh", "ph", "ch" and "th"
# rules, so that an 'h' carrying its own accent (h̄, ḥ) is not swallowed by
# the digraph.
#   $freeStandingBelow - U+0325, the free-standing accent below that may
#                        legitimately follow the digraph
#   $hAccent           - the two marks (macron, dot below) that turn 'h' into
#                        a separate letter
#   $notHAccent0       - any single character that is neither of the above
#   $notHAccent1       - a free-standing accent below followed by a character
#                        that is not an h-accent
# Every definition must be terminated by ';'.
$freeStandingBelow = [\u0325 ];
$hAccent = [ ̄ ̣];
$notHAccent0 = [^$freeStandingBelow$hAccent];
$notHAccent1 = $freeStandingBelow [^$hAccent];
# Rule forms used in this section:
#   T <> L ;             T and L convert to each other
#   T >  L ;             forward (Thai to Latin) only
#   T <  L } ctx ;       reverse (Latin to Thai) only, and only when L is
#                        followed by ctx
#   T | $1 < L ($notAbove*) ACCENT ;
#                        reverse only: L and ACCENT may be separated by other
#                        marks after reordering; emit T, then leave the
#                        captured marks ($1) after the cursor (|) so they are
#                        converted by later rules
# Every rule needs the Thai character named in its trailing comment on the
# Thai side; a "backward case" rule uses the same Thai character as the rule
# directly above it.
# Rule order matters: accented digraphs come before the bare digraph, and the
# bare digraph comes before the single letter.
ห > h̄ ; # THAI CHARACTER HO HIP
ห | $1 < h ($notAbove*) ̄; # backward case, account for reordering
ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK
ข <> k̄h ; # THAI CHARACTER KHO KHAI
ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT
ฅ <> kʹh ; # THAI CHARACTER KHO KHON
ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG
ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI
ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI
ก <> k ; # THAI CHARACTER KO KAI
ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO
ผ <> p̄h ; # THAI CHARACTER PHO PHUNG
พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN
พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN
ป <> p ; # THAI CHARACTER PO PLA
ฉ <> c̄h ; # THAI CHARACTER CHO CHING
ฌ <> c̣h ; # THAI CHARACTER CHO CHOE
ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG
ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG
จ <> c ; # THAI CHARACTER CHO CHAN
ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN
ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO
ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO
ถ <> t̄h ; # THAI CHARACTER THO THUNG
ธ <> ṭh ; # THAI CHARACTER THO THONG
ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
# Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick.
ฏ <> t̩ ; # THAI CHARACTER TO PATAK
ต <> t ; # THAI CHARACTER TO TAO
# since there is no singleton g (generated), don't worry about that.
ง <> ng ; # THAI CHARACTER NGO NGU
ณ <> ṇ ; # THAI CHARACTER NO NEN
น <> n ; # THAI CHARACTER NO NU
ญ <> ỵ ; # THAI CHARACTER YO YING
ฎ <> ḍ ; # THAI CHARACTER DO CHADA
ด <> d ; # THAI CHARACTER DO DEK
บ <> b ; # THAI CHARACTER BO BAIMAI
ฝ <> f̄ ; # THAI CHARACTER FO FA
ฝ | $1 < f ($notAbove*) ̄; # backward case, account for reordering
ม <> m ; # THAI CHARACTER MO MA
ย <> y ; # THAI CHARACTER YO YAK
ร <> r ; # THAI CHARACTER RO RUA
ฤ <> v ; # THAI CHARACTER RU
ฦ <> ł ; # THAI CHARACTER LU
ว <> w ; # THAI CHARACTER WO WAEN
ศ <> ṣ̄ ; # THAI CHARACTER SO SALA***
ศ | $1 < s ̣ ($notAbove*) ̄; # backward case, account for reordering
ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI
ส > s̄ ; # THAI CHARACTER SO SUA***
ส | $1 < s ($notAbove*) ̄; # backward case, account for reordering
ฬ <> ḷ ; # THAI CHARACTER LO CHULA
ล <> l ; # THAI CHARACTER LO LING
ฟ <> f ; # THAI CHARACTER FO FAN
อ <> x ; # THAI CHARACTER O ANG
ซ <> s ; # THAI CHARACTER SO SO
# vowels
# Vowel signs, in logical order. Each rule carries the Thai character named
# in its trailing comment on the Thai side. A "backward case" rule is
# reverse-only: it emits the same Thai vowel as the rule above it, then leaves
# the intervening marks captured in $1 after the cursor (|) so they are
# converted by later rules.
ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT
า > ā ; # THAI CHARACTER SARA AA
า | $1 < a ($notAbove*) ̄; # backward case, account for reordering
# We deviate from ISO for SARA AM for disambiguation
ำ > a ̉; # THAI CHARACTER SARA AM
ำ | $1 < a ($notAbove*) ̉ ; # backward case, account for reordering
ะ <> a ; # THAI CHARACTER SARA A
ี <> ī ; # THAI CHARACTER SARA II
ี | $1 < i ($notAbove*) ̄ ; # backward case, account for reordering
ื <> ụ̄ ; # THAI CHARACTER SARA UEE
ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case, account for reordering
ึ <> ụ ; # THAI CHARACTER SARA UE
ู <> ū ; # THAI CHARACTER SARA UU
ู | $1 < u ($notAbove*) ̄ ; # backward case, account for reordering
ุ <> u ; # THAI CHARACTER SARA U
# NOTE(review): both sides of this rule are empty in this copy. The trailing
# comment says it is the rule for PAIYANNOI (U+0E2F), but neither the Thai
# character nor its Latin counterpart is present. Restore both from the
# upstream data; the intended Latin symbol cannot be determined from this
# file - TODO confirm.
<> ; # THAI CHARACTER PAIYANNOI
# ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT
# Leading vowels, tone and other marks, digits and punctuation. All rules are
# simple two-way mappings; each carries the Thai character named in its
# trailing comment on the Thai side. Latin targets that are rule syntax
# characters ('~', '||') are quoted.
เ <> e ; # THAI CHARACTER SARA E
แ <> æ ; # THAI CHARACTER SARA AE
โ <> o ; # THAI CHARACTER SARA O
ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN
ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI
ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO
็ <> ̆ ; # THAI CHARACTER MAITAIKHU
่ <> ̀ ; # THAI CHARACTER MAI EK
้ <> ̂ ; # THAI CHARACTER MAI THO
๊ <> ́ ; # THAI CHARACTER MAI TRI
๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA
์ <> ̒ ; # THAI CHARACTER THANTHAKHAT
๎ <> '~' ; # THAI CHARACTER YAMAKKAN
# We deviate from ISO for disambiguation
ํ <> ̊ ; # THAI CHARACTER NIKHAHIT
๏ <> § ; # THAI CHARACTER FONGMAN
๐ <> 0 ; # THAI DIGIT ZERO
๑ <> 1 ; # THAI DIGIT ONE
๒ <> 2 ; # THAI DIGIT TWO
๓ <> 3 ; # THAI DIGIT THREE
๔ <> 4 ; # THAI DIGIT FOUR
๕ <> 5 ; # THAI DIGIT FIVE
๖ <> 6 ; # THAI DIGIT SIX
๗ <> 7 ; # THAI DIGIT SEVEN
๘ <> 8 ; # THAI DIGIT EIGHT
๙ <> 9 ; # THAI DIGIT NINE
๚ <> '||' ; # THAI CHARACTER ANGKHANKHU
๛ <> » ; # THAI CHARACTER KHOMUT
ๆ <> « ; # THAI CHARACTER MAIYAMOK
# moved down to make shorter first
# Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
ฺ <> ˌ ; # THAI CHARACTER PHINTHU
ิ <> i ; # THAI CHARACTER SARA I
# fallbacks
# Reverse-only rules for Latin letters that no rule above produces. Each one
# rewrites the letter to a Latin letter that does have a Thai mapping and
# places the cursor (|) in front of it, so the replacement is then matched by
# the rules above (per ICU rule syntax).
| k < g ;
| k < h ;
| c < j ;
| k < q ;
| s < z ;
# The parenthesised form names a transform for the reverse direction only:
# Latin input is lowercased when converting back to Thai.
:: (lower);