| #-------------------------------------------------------------------- |
| # Copyright (c) 1999-2001, International Business Machines |
| # Corporation and others. All Rights Reserved. |
| #-------------------------------------------------------------------- |
| # $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/Attic/Transliterator_Arabic_Latin.txt,v $ |
| # $Date: 2002/07/15 23:26:26 $ |
| # $Revision: 1.3 $ |
| #-------------------------------------------------------------------- |
| |
| # Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf> |
| # Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf> |
| # a) where required for disambiguation. |
| # b) with underdot instead of cedilla for letters like SAD, since |
| # the underdot forms are explicitly in Unicode for transliteration. |
| # c) with extra non-Arabic-language letters, like PEH |
| |
| # Does *not* do assimilation of "al", nor hyphenation. |
| # While it could be done, we need to determine whether a prefix "al" could |
| # occur other than as the definite article (since no space is used). |
| |
| # Forward (Arabic -> Latin) filter: only characters in this set are handed to the |
| # forward rules; anything outside it is left untouched. |
| :: [[:Arabic:] [ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ; |
| # Forward direction: apply NFKD before the conversion rules below. |
| # The parenthesised NFC belongs to the reverse (Latin -> Arabic) direction. |
| :: NFKD (NFC); |
| # Combining marks attached to the Latin output so that Arabic characters which |
| # would otherwise share a romanization stay distinguishable on the way back |
| # (e.g. the two digit sets, or THEH "t h $disambig" versus TEH + HEH "t h"). |
| # NOTE(review): the three literals look like U+0331, U+0330 and U+0323, which are |
| # all admitted by the reverse filter on the last line of this file; confirm the |
| # actual code points with a hex viewer. |
| $disambig = ̱ ; |
| $disambig2 = ̰ ; |
| $under = ̣ ; |
| |
| # Characters whose canonical combining class is neither 0 nor 230. |
| # Used by the reverse-only TEH MARBUTA rule to step over marks that end up |
| # between the base letter t and the diaeresis. |
| $notAbove = [[:^ccc=0:]&[:^ccc=230:]]; |
| |
| # non-letters |
| # Every active rule in this section uses <>, i.e. it applies in both directions. |
| |
| # The numeric separators carry $disambig so that, on the reverse pass, they are |
| # not confused with the plain '.' and ',' (the latter is ARABIC COMMA below). |
| ٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR |
| ٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR |
| # ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate |
| |
| ، <> ',' ; # ARABIC COMMA |
| ؛ <> ';' ; # ARABIC SEMICOLON |
| ؟ <> '?' ; # ARABIC QUESTION MARK |
| ٪ <> '%' ; # ARABIC PERCENT SIGN |
| |
| # Extended Arabic-Indic digits map to ASCII digit + $disambig; they are listed |
| # before the plain-digit rules so that, in reverse, "0 $disambig" is tried |
| # before the shorter "0". |
| ۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO |
| ۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE |
| ۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO |
| ۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE |
| ۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR |
| ۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE |
| ۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX |
| ۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN |
| ۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT |
| ۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE |
| |
| # Arabic-Indic digits map to the bare ASCII digits. |
| ٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO |
| ١ <> 1 ; # ARABIC-INDIC DIGIT ONE |
| ٢ <> 2 ; # ARABIC-INDIC DIGIT TWO |
| ٣ <> 3 ; # ARABIC-INDIC DIGIT THREE |
| ٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR |
| ٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE |
| ٦ <> 6 ; # ARABIC-INDIC DIGIT SIX |
| ٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN |
| ٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT |
| ٩ <> 9 ; # ARABIC-INDIC DIGIT NINE |
| |
| # letters |
| # Rules are tried in file order, so every rule in this section must stay ahead |
| # of the shorter single-character rules further down that share a prefix with it. |
| |
| # long vowels |
| # Two-character Arabic sequences (vowel sign + letter); they precede the rules |
| # for the individual vowel signs and letters so the pair is matched as a unit. |
| َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF |
| ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW |
| ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH |
| |
| # longer items moved here to prevent masking |
| # (in reverse, "t h $disambig" or "s $under" must be tried before the plain |
| # "t" / "s" rules, which would otherwise match the first letter on their own) |
| ث <> t h $disambig ; # ARABIC LETTER THEH |
| ذ <> d h $disambig ; # ARABIC LETTER THAL |
| ش <> s h $disambig ; # ARABIC LETTER SHEEN |
| ص <> s $under ; # ARABIC LETTER SAD |
| ض <> d $under ; # ARABIC LETTER DAD |
| ط <> t $under ; # ARABIC LETTER TAH |
| ظ <> z $under ; # ARABIC LETTER ZAH |
| غ <> g h $disambig ; # ARABIC LETTER GHAIN |
| |
| # WARNING: special case |
| # <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut> |
| # so on the return, we have to skip over (but preserve) the half-ring below (or others like it) |
| # ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS |
| |
| # Plain case: t immediately followed by the diaeresis (U+0308), both directions. |
| ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA |
| # Reverse only: one or more $notAbove marks sit between t and the diaeresis. |
| # Emit TEH MARBUTA, then re-emit the captured marks ($1) after the cursor (|) |
| # so that they are still converted by the rules that follow. |
| ة | $1 < t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA |
| |
| # non-Arabic language |
| # These use $disambig / $disambig2 and therefore come before the bare |
| # "z", "n", "v" and "y" rules below. |
| ژ <> z h $disambig ; # ARABIC LETTER JEH |
| # $disambig sits between n and g, keeping NG distinct from NOON + GAF ("n g"). |
| ڭ <> n $disambig g ; # ARABIC LETTER NG |
| ۋ <> v $disambig ; # ARABIC LETTER VE |
| # "y $disambig" is taken by ALEF MAKSURA below, hence the second mark here. |
| ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH |
| |
| # Arabic language |
| # One rule per letter, all bidirectional except TATWEEL. |
| # Where two rules share a Latin base letter, the one carrying a mark |
| # ($under / $disambig / ⁿ) is listed before the bare one. |
| |
| ء <> ʾ ; # ARABIC LETTER HAMZA |
| ا <> a $under; # ARABIC LETTER ALEF |
| ب <> b ; # ARABIC LETTER BEH |
| ت <> t ; # ARABIC LETTER TEH |
| ج <> j ; # ARABIC LETTER JEEM |
| ح <> h $under ; # ARABIC LETTER HAH |
| خ <> k h $disambig ; # ARABIC LETTER KHAH |
| د <> d ; # ARABIC LETTER DAL |
| ر <> r ; # ARABIC LETTER REH |
| ز <> z ; # ARABIC LETTER ZAIN |
| س <> s ; # ARABIC LETTER SEEN |
| ع <> ʿ ; # ARABIC LETTER AIN |
| # Forward only (>): TATWEEL is deleted and nothing in reverse recreates it. |
| ـ > ; # ARABIC TATWEEL |
| ف <> f ; # ARABIC LETTER FEH |
| ق <> q ; # ARABIC LETTER QAF |
| ك <> k ; # ARABIC LETTER KAF |
| ل <> l ; # ARABIC LETTER LAM |
| م <> m ; # ARABIC LETTER MEEM |
| ن <> n ; # ARABIC LETTER NOON |
| ه <> h ; # ARABIC LETTER HEH |
| و <> w ; # ARABIC LETTER WAW |
| ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA |
| ي <> y ; # ARABIC LETTER YEH |
| # Tanwin forms: vowel + superscript n; listed before the bare short vowels. |
| ً <> aⁿ ; # ARABIC FATHATAN |
| ٌ <> uⁿ ; # ARABIC DAMMATAN |
| ٍ <> iⁿ ; # ARABIC KASRATAN |
| َ <> a ; # ARABIC FATHA |
| ُ <> u ; # ARABIC DAMMA |
| ِ <> i ; # ARABIC KASRA |
| # SHADDA and SUKUN map to Latin combining marks rather than to letters. |
| ّ <> ̃ ; # ARABIC SHADDA |
| ْ <> ̊ ; # ARABIC SUKUN |
| |
| # special combining marks |
| # Each Arabic combining mark maps to a single Latin combining mark. |
| ٓ <> ̂ ; # ARABIC MADDAH ABOVE |
| ٔ <> ̉ ; # ARABIC HAMZA ABOVE |
| ٕ <> ̹ ; # ARABIC HAMZA BELOW |
| |
| # Some non-Arabic language (not in UNGEGN) |
| پ <> p ; # ARABIC LETTER PEH |
| چ <> c h $disambig ; # ARABIC LETTER TCHEH |
| # Bare "v"; the marked form "v $disambig" is ARABIC LETTER VE above. |
| ڤ <> v ; # ARABIC LETTER VEH |
| # ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW |
| # ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW |
| گ <> g ; # ARABIC LETTER GAF |
| |
| # fallbacks |
| # Reverse-only (<) rules for Latin letters that no rule above produces. |
| # Each rewrites its input to Latin text that the rules above do handle; the |
| # leading | puts the cursor before the replacement so it is processed again. |
| # The context-sensitive c rule (c before e, i or y; the context after } is not |
| # consumed) must stay ahead of the general c rule. |
| s < c } [eiy];
| k < c ;
| i < e ;
| u < o ;
| ks < x ;
| n < ⁿ;
| |
# Trailing global rules. The parenthesised part of each line applies only to the
# reverse (Latin -> Arabic) direction, where global rules run bottom-up: the
# filter on the last line first, then NFD, then lowercasing, then the
# conversion rules above.

# Reverse only: lowercase the Latin input (all Latin rule text above is lowercase).
:: (lower) ;
# Forward: recompose the Latin output with NFC. Reverse: decompose the input with NFD.
::NFC (NFD);
# Reverse filter: Latin-script characters plus the punctuation, ASCII digits and
# combining/modifier characters listed here; anything else is left untouched.
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );