# blob: 5f6843c615e2df980299d2116dc1dd012f48b0e5 [file] [log] [blame]
#
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
# Copyright (c) 2016, International Business Machines Corporation and others. All Rights Reserved.
# file: sentence.txt
# Header settings for this rule set.
#   type   - the kind of boundary these rules describe (sentence boundaries here).
#   locale - the locale tag attached to this rule set (en here).
#            NOTE(review): presumably selects the locale the rules are checked
#            against; confirm with the code that reads this file.
type = sentence; # one of grapheme | word | line | sentence
locale = en;
# Character classes, one per value of the Unicode Sentence_Break property.
# Each name is bound to the set of code points carrying that property value;
# the names are then used as operands in the SB rules.
CR = [\p{Sentence_Break = CR}];
LF = [\p{Sentence_Break = LF}];
Extend = [\p{Sentence_Break = Extend}];
Sep = [\p{Sentence_Break = Sep}];
Format = [\p{Sentence_Break = Format}];
Sp = [\p{Sentence_Break = Sp}];
Lower = [\p{Sentence_Break = Lower}];
Upper = [\p{Sentence_Break = Upper}];
OLetter = [\p{Sentence_Break = OLetter}];
Numeric = [\p{Sentence_Break = Numeric}];
ATerm = [\p{Sentence_Break = ATerm}];
SContinue = [\p{Sentence_Break = SContinue}];
STerm = [\p{Sentence_Break = STerm}];
Close = [\p{Sentence_Break = Close}];
# Composite classes built from the sets above.
# ParaSep: any paragraph separator (Sep, CR or LF).
ParaSep = [Sep CR LF];
# SATerm: any sentence terminator (STerm or ATerm).
SATerm = [STerm ATerm];
# ExtFmt: Extend or Format characters, i.e. the characters SB5 says to ignore.
ExtFmt = [Extend Format];
# Sentence break rules. Each rule is labelled with the number of the sentence
# boundary rule it implements. A '÷' marks the position where a boundary is
# placed; a rule without '÷' describes a sequence that is kept together.
# NOTE(review): rule order looks significant (earlier rules presumably take
# precedence over later ones) - confirm with the rule interpreter before
# reordering anything here.
#
# SB2: ÷ eot
# Conventional regular expression matching for '$' as end-of-text also matches
# at a line separator just preceding the physical end of text.
# Instead, use a look-ahead assertion that there is no following character.
SB2: . ÷ (?!.);
# SB3: keep a CR LF pair together.
SB3: CR LF;
# SB4: break after any paragraph separator.
SB4: ParaSep ÷;
# SB5: ignore Format and Extend characters.
# There is no separate SB5 rule; the rules below allow ExtFmt* after each
# element instead.
# SB6: no break between an ATerm and a following Numeric.
SB6: ATerm ExtFmt* Numeric;
# SB7: no break after an ATerm that sits between a letter (Upper or Lower)
# and a following Upper.
SB7: (Upper | Lower) ExtFmt* ATerm ExtFmt* Upper;
# SB8: no break after an ATerm (plus optional Close and Sp runs) when the next
# letter reached is a Lower; the characters skipped on the way must not be
# OLetter, Upper, Lower, ParaSep or SATerm.
SB8: ATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* ([^OLetter Upper Lower ParaSep SATerm ExtFmt] ExtFmt *)* Lower;
# SB8a: no break after a terminator (plus optional Close and Sp runs) when it
# is followed by an SContinue or by another terminator.
SB8a: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (SContinue | SATerm);
# SB9: break after a terminator together with its trailing Close run, Sp run
# and at most one paragraph separator (CR LF counts as one).
SB9: SATerm ExtFmt* (Close ExtFmt*)* (Sp ExtFmt*)* (CR LF | ParaSep)? ÷;
# The SB9 rule above also covers SB10 and SB11.
# SB12: otherwise, no break: any character, the ExtFmt run that follows it,
# and optionally the next non-ExtFmt character.
SB12: . ExtFmt* [^ExtFmt]?;