# blob: c57a898fbf67bd8164e037822577b2e48021291e [file] [log] [blame]
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: grapheme.txt
#
# Reference Grapheme Break rules for intltest rbbi/RBBIMonkeyTest
#
#
# Note: Rule syntax and the monkey test itself are still a work in progress.
# They are expected to change with review and the addition of support for rule tailoring.
# Test settings: the kind of break rules this file describes, and the locale
# associated with them.
type = grapheme; # one of grapheme | word | line | sentence
locale = en;
# Character classes, one per Grapheme_Cluster_Break property value.
# Each name defined here is referenced by the GB rules (and, for Extend and
# ZWJ, by the ExtCccZwj set) later in this file.
CR = [\p{Grapheme_Cluster_Break = CR}];
LF = [\p{Grapheme_Cluster_Break = LF}];
# NOTE(review): Control and Extend carry an extra pair of set brackets that the
# other classes do not; this looks redundant - confirm before simplifying.
Control = [[\p{Grapheme_Cluster_Break = Control}]];
Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
#
# Korean Syllable Definitions
#
# One class per Grapheme_Cluster_Break value L, V, T, LV and LVT.
# Rules GB6, GB7 and GB8 combine these classes.
L = [\p{Grapheme_Cluster_Break = L}];
V = [\p{Grapheme_Cluster_Break = V}];
T = [\p{Grapheme_Cluster_Break = T}];
LV = [\p{Grapheme_Cluster_Break = LV}];
LVT = [\p{Grapheme_Cluster_Break = LVT}];
# Emoji definitions
# Extended_Pict: the characters selected by [:ExtPict:]; used on both sides of
# the ZWJ in rule GB11.
# NOTE(review): ExtPict is presumably the short alias of the
# Extended_Pictographic property - confirm.
Extended_Pict = [:ExtPict:];
# Indic Sequences
# Virama_ and LinkingConsonant each intersect (&) the same six-script set
# (Gujr, Telu, Mlym, Orya, Beng, Deva) with a single Indic_Syllabic_Category
# value: Virama for the first, Consonant for the second.
# NOTE(review): \p{Gujr} is written without the "sc=" prefix used for the other
# five scripts; presumably it selects the same Script value - confirm.
Virama_ = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Virama}]];
LinkingConsonant = [[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}] & [\p{Indic_Syllabic_Category=Consonant}]];
# ExtCccZwj: the Extend class with ccc=0 characters removed, together with ZWJ.
# Rule GB9c allows any number of these on either side of the Virama_.
ExtCccZwj = [[Extend-[\p{ccc=0}]] ZWJ];
# Grapheme break rules. Each rule is "label: pattern;". The labels appear to
# follow the rule numbering of UAX #29 (TODO confirm against the spec version
# in use).
# A "÷" marks a position where the rule places a boundary; rules written
# without "÷" presumably describe sequences that are kept together - confirm
# against the RBBIMonkeyTest rule parser.
# "." presumably matches any single character - confirm.

# CR followed by LF (no boundary marker between them).
GB3: CR LF;
# Boundary after any Control, CR or LF.
GB4: (Control | CR | LF) ÷;
# Boundary before any Control, CR or LF.
GB5: . ÷ (Control | CR | LF);
# Korean syllable sequences built from the L / V / T / LV / LVT classes.
GB6: L (L | V | LV | LVT);
GB7: (LV | V) (V | T);
GB8: (LVT | T) T;
# NOTE(review): GB11 and GB9c are listed ahead of GB9. Presumably rules are
# tried in file order, so these longer sequences must match before GB9 absorbs
# the Extend / ZWJ characters - confirm before reordering.
# Extended_Pict, any number of Extend, a ZWJ, then another Extended_Pict.
GB11: Extended_Pict Extend* ZWJ Extended_Pict;
# LinkingConsonant, Virama_, LinkingConsonant, with optional ExtCccZwj runs on
# either side of the Virama_.
GB9c: LinkingConsonant ExtCccZwj* Virama_ ExtCccZwj* LinkingConsonant;
# Any character followed by Extend or ZWJ.
GB9: . (Extend | ZWJ);
# Any character followed by SpacingMark.
GB9a: . SpacingMark;
# Prepend followed by any character.
GB9b: Prepend .;
# Regional Indicators, split into pairs.
# Note that a pair of RIs that is not followed by a third RI will fall into
# the normal rules for Extend, etc.
#
# GB12: two RIs followed by a third RI; the boundary is placed after the pair.
GB12: Regional_Indicator Regional_Indicator ÷ Regional_Indicator;
# GB13: two adjacent RIs with no boundary marker between them.
GB13: Regional_Indicator Regional_Indicator;
# GB999: boundary after any character. As the last rule in the file it is
# presumably the fallback when no earlier rule applies - confirm.
GB999: . ÷;