# Source blob: 03d8e4d1b7db805a60962fe1b8383c5de16b295f
# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
# ***************************************************************************
# *
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Arab_Latn.txt
# Generated from CLDR
#
# Generally follows UNGEGN
# http://www.eki.ee/wgrs/rom1_ar.pdf
# Occasionally deviates in the direction of ISO 233
# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
# a) where required for disambiguation.
# b) with underdot instead of cedilla for letters like SAD,
# since the underdot forms are explicitly in Unicode for transliteration.
# c) with extra non-Arabic-language letters, like PEH
#
# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
# Forward global filter: only characters in this set are touched when
# transforming in the Arabic-to-Latin direction.
:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ;
# Normalize to NFKD before the forward rules run; the parenthesized NFC is
# the normalization step used by the reverse (Latin-to-Arabic) direction.
:: NFKD (NFC);
# Combining marks appended to Latin output so that distinct Arabic
# characters keep distinct romanizations.
$disambig = \u0331 ; # U+0331 COMBINING MACRON BELOW
$disambig2 = \u0330 ; # U+0330 COMBINING TILDE BELOW
$under = \u0323 ; # U+0323 COMBINING DOT BELOW
$descender = ˌ; # U+02CC MODIFIER LETTER LOW VERTICAL LINE
# Characters whose canonical combining class is neither 0 (starter) nor 230 (above).
$notAbove = [[:^ccc=0:] & [:^ccc=230:]];
# non-letters
# Every rule here is bidirectional (↔): Arabic on the left, Latin on the right.
# The rule operator is required by the rule syntax; without it the line is not
# a valid rule.
# Separators between two digits map to plain ',' / '.'; the braces mark the
# part that is replaced, the [:Nd:] on either side is context only.
[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR
[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR
# Outside a digit context the separators carry $disambig so that they do not
# collide with ARABIC COMMA (',') on the way back.
٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR
٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR
# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
، ↔ ',' ; # ARABIC COMMA
؛ ↔ ';' ; # ARABIC SEMICOLON
؟ ↔ '?' ; # ARABIC QUESTION MARK
٪ ↔ '%' ; # ARABIC PERCENT SIGN
# Extended Arabic-Indic digits come first and carry $disambig; the plain
# Arabic-Indic digits below map to the bare ASCII digit. The longer Latin
# form must precede the shorter one so the reverse direction is not masked.
۰ ↔ 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
۱ ↔ 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
۲ ↔ 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
۳ ↔ 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
۴ ↔ 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
۵ ↔ 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
۶ ↔ 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
۷ ↔ 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
۸ ↔ 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
۹ ↔ 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
٠ ↔ 0 ; # ARABIC-INDIC DIGIT ZERO
١ ↔ 1 ; # ARABIC-INDIC DIGIT ONE
٢ ↔ 2 ; # ARABIC-INDIC DIGIT TWO
٣ ↔ 3 ; # ARABIC-INDIC DIGIT THREE
٤ ↔ 4 ; # ARABIC-INDIC DIGIT FOUR
٥ ↔ 5 ; # ARABIC-INDIC DIGIT FIVE
٦ ↔ 6 ; # ARABIC-INDIC DIGIT SIX
٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
# letters
# long vowels
# A short-vowel mark followed by its matching letter becomes the Latin vowel
# plus U+0304 COMBINING MACRON. These must precede the single-character rules
# for the same mark and letter, otherwise the shorter rules would match first.
\u064Eا ↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
# longer items moved here to prevent masking
# (in the Latin-to-Arabic direction a multi-character Latin sequence has to be
# tried before the rule for its first letter alone)
ث ↔ t h $disambig ; # ARABIC LETTER THEH
ذ ↔ d h $disambig ; # ARABIC LETTER THAL
ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
ص ↔ s $under ; # ARABIC LETTER SAD
ض ↔ d $under ; # ARABIC LETTER DAD
ط ↔ t $under ; # ARABIC LETTER TAH
ظ ↔ z $under ; # ARABIC LETTER ZAH
غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
# WARNING: special case
# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
# Plain case: TEH MARBUTA <-> t + U+0308 COMBINING DIAERESIS, both directions.
ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
# Reverse-only case: one or more non-above marks sit between the t and the
# diaeresis. They are captured as $1 and re-emitted after TEH MARBUTA; the
# cursor (|) is placed in front of them so later rules still convert them.
ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
# non-Arabic language
# Bidirectional rules for letters used by other languages written in Arabic
# script. Each Latin form carries a distinguishing mark so it does not collide
# with the plain z / n / v / y / s rules.
ژ ↔ z h $disambig ; # ARABIC LETTER JEH
ڭ ↔ n $disambig g ; # ARABIC LETTER NG
ۋ ↔ v $disambig ; # ARABIC LETTER VE
ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
ښ ↔ s $descender ; # ARABIC LETTER SEEN WITH DOT BELOW AND DOT ABOVE
# Arabic language
# One bidirectional rule per letter. Where two letters share a Latin base, the
# marked form ($under / $disambig) is listed before the bare form so that the
# Latin-to-Arabic direction tries the longer sequence first.
ء ↔ ʾ ; # ARABIC LETTER HAMZA
ا ↔ a $under ; # ARABIC LETTER ALEF
ب ↔ b ; # ARABIC LETTER BEH
ت ↔ t ; # ARABIC LETTER TEH
ج ↔ j ; # ARABIC LETTER JEEM
ح ↔ h $under ; # ARABIC LETTER HAH
خ ↔ k h $disambig ; # ARABIC LETTER KHAH
د ↔ d ; # ARABIC LETTER DAL
ر ↔ r ; # ARABIC LETTER REH
ز ↔ z ; # ARABIC LETTER ZAIN
س ↔ s ; # ARABIC LETTER SEEN
ع ↔ ʿ ; # ARABIC LETTER AIN
# Tatweel is dropped; forward-only, because an empty Latin side cannot be
# mapped back.
ـ → ; # ARABIC TATWEEL
ف ↔ f ; # ARABIC LETTER FEH
ق ↔ q ; # ARABIC LETTER QAF
ک ↔ k $disambig ; # ARABIC LETTER KEHEH
ك ↔ k ; # ARABIC LETTER KAF
ل ↔ l ; # ARABIC LETTER LAM
م ↔ m ; # ARABIC LETTER MEEM
ن ↔ n ; # ARABIC LETTER NOON
ه ↔ h ; # ARABIC LETTER HEH
و ↔ w ; # ARABIC LETTER WAW
ى ↔ y $disambig ; # ARABIC LETTER ALEF MAKSURA
ي ↔ y ; # ARABIC LETTER YEH
# Tanwin marks (FATHATAN / DAMMATAN / KASRATAN).
# NOTE(review): these three lines have no rule operator, so they are not valid
# rules as written. The operator cannot be recovered from this file alone: the
# FATHA / DAMMA / KASRA rules further down use the same Latin targets a / u / i,
# so a plain bidirectional mapping here would shadow them in the
# Latin-to-Arabic direction. Either the rules are forward-only, or part of the
# Latin side is missing (possibly the superscript n that the fallback section
# handles) - TODO confirm against the upstream CLDR transform.
\u064B a ; # ARABIC FATHATAN
\u064C u ; # ARABIC DAMMATAN
\u064D i ; # ARABIC KASRATAN
# Short vowel marks map to the bare Latin vowel (bidirectional).
\u064E ↔ a ; # ARABIC FATHA
\u064F ↔ u ; # ARABIC DAMMA
\u0650 ↔ i ; # ARABIC KASRA
# Shadda and sukun map to Latin combining marks rather than letters.
\u0651 ↔ \u0303 ; # ARABIC SHADDA <-> COMBINING TILDE
\u0652 ↔ \u030A ; # ARABIC SUKUN <-> COMBINING RING ABOVE
# special combining marks
\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE <-> COMBINING CIRCUMFLEX ACCENT
\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE <-> COMBINING HOOK ABOVE
\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW <-> COMBINING RIGHT HALF RING BELOW
# Some non-Arabic language (not in UNGEGN)
# Bidirectional rules; the two FEH variants below stay disabled.
پ ↔ p ; # ARABIC LETTER PEH
چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
ڤ ↔ v ; # ARABIC LETTER VEH
# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
گ ↔ g ; # ARABIC LETTER GAF
# fallbacks
# Reverse-only (←) rewrites for Latin input that no rule above produces.
# The right-hand side is what is matched; the left-hand side replaces it, and
# the leading cursor (|) puts the position back in front of the replacement so
# that the regular Latin-to-Arabic rules then convert it.
| s ← c } [eiy]; # c before e, i or y is read as s
| k ← c ; # any other c is read as k
| i ← e ;
| u ← o ;
| ks ← x ;
| n ← ‎ⁿ;
# Trailing :: steps. The parenthesized part of each line applies to the
# reverse (Latin-to-Arabic) direction, where the :: lines are taken bottom-up.
# Reverse only: lowercase the Latin input (no forward step on this line).
:: (lower) ;
# Forward: recompose the Latin output to NFC. Reverse: decompose input to NFD.
::NFC (NFD);
# Reverse global filter: only Latin-script characters and the listed
# punctuation, digits, modifier letters and combining marks are processed.
:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] );