ICU-21592 Update cj normal/loose linebreak per CSS
diff --git a/icu4c/source/data/brkitr/rules/line_loose_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
index 880d558..e921a94 100644
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -17,7 +17,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * between ID and hyphens 2010 & 2013 (both BA)
+# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -238,7 +239,7 @@
# See issue ICU-20303
-$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
+$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@@ -294,8 +295,10 @@
# LB 21 x (BA | HY | NS)
# BB x
#
-# DO allow breaks here before $BAX and $NSX, so don't include them
-$LB20NonBreaks $CM* ($BA | $HY | $NS);
+# DO allow breaks here before $NSX, so don't include it.
+# Also DO allow breaks between $ID and $BAX: $ID gets its own rule below, which omits $BAX.
+[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
+$ID $CM* ($BA | $HY | $NS);
^$CM+ ($BA | $HY | $NS);
diff --git a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
index 3eab1f7..43d116a 100644
--- a/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_phrase_cj.txt
@@ -15,7 +15,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * between ID and hyphens 2010 & 2013 (both BA)
+# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -251,7 +252,7 @@
# See issue ICU-20303
-$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
+$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
$SP $IS / [^ $CanFollowIS $NU $CM];
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
@@ -307,8 +308,10 @@
# LB 21 x (BA | HY | NS)
# BB x
#
-# DO allow breaks here before $BAX and $NSX, so don't include them
-$LB20NonBreaks $CM* ($BA | $HY | $NS);
+# DO allow breaks here before $NSX, so don't include it.
+# Also DO allow breaks between $ID and $BAX: $ID gets its own rule below, which omits $BAX.
+[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
+$ID $CM* ($BA | $HY | $NS);
^$CM+ ($BA | $HY | $NS);
diff --git a/icu4c/source/data/brkitr/rules/line_normal_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
index be5ce1d..7ed8b35 100644
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -17,7 +17,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
@@ -29,8 +29,7 @@
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
-$BAX = [\u2010 \u2013];
-$BA = [[:LineBreak = Break_After:] - $BAX];
+$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@@ -184,7 +183,7 @@
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
-[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;
@@ -282,7 +281,7 @@
# LB 21 x (BA | HY | NS)
# BB x
#
-# DO allow breaks here before $BAX and $NSX, so don't include them
+# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);
@@ -294,7 +293,7 @@
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
-$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
+$HL $CM* ($HY | $BA) $CM* [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
diff --git a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
index 55a12ff..1aeafdf 100644
--- a/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_phrase_cj.txt
@@ -15,7 +15,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
#
# The content is the same as line_normal_cj.txt except the following
@@ -31,8 +31,7 @@
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
-$BAX = [\u2010 \u2013];
-$BA = [[:LineBreak = Break_After:] - $BAX];
+$BA = [:LineBreak = Break_After:];
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
@@ -197,7 +196,7 @@
# LB 12a Do not break before NBSP and related characters ...
# [^SP BA HY] x GL
#
-[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
+[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
^$CM+ $GL;
@@ -295,7 +294,7 @@
# LB 21 x (BA | HY | NS)
# BB x
#
-# DO allow breaks here before $BAX and $NSX, so don't include them
+# DO allow breaks here before $NSX, so don't include it
$LB20NonBreaks $CM* ($BA | $HY | $NS);
@@ -307,7 +306,7 @@
# LB 21a Don't break after Hebrew + Hyphen
# HL (HY | BA) x
#
-$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
+$HL $CM* ($HY | $BA) $CM* [^$CB]?;
# LB 21b (forward) Don't break between SY and HL
# (break between HL and SY already disallowed by LB 13 above)
diff --git a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
index d0693d5..7d1a025 100644
--- a/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_loose_cj.txt
@@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * between ID and hyphens 2010 & 2013 (both BA)
+# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -200,8 +201,10 @@
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
-LB21.1: . CM* [BA HY NS];
-LB21.2: BB CM* [^CM CB];
+LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
+LB21.2: ID CM* [BA HY NS];
+LB21.3: CM+ [BA HY NS];
+LB21.4: BB CM* [^CM CB];
LB21b: SY CM* HL;
diff --git a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
index a270482..a4e1428 100644
--- a/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
+++ b/icu4c/source/test/testdata/break_rules/line_normal_cj.txt
@@ -20,7 +20,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
type = line;
@@ -28,8 +28,7 @@
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
-BAX = [\u2010 \u2013];
-BA = [[:LineBreak = Break_After:] - BAX];
+BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
@@ -154,7 +153,7 @@
LB12: GL CM* [^CM];
-LB12a: [^SP BA BAX HY] CM* GL;
+LB12a: [^SP BA HY] CM* GL;
# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
@@ -182,12 +181,9 @@
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
-# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
-# should "HL BAX" not break when followed by a CB? Thats what the current
-# rules do, which is why "[^CM CB]?" includes the ?.
-LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
-# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+# DO allow breaks here before NSX, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
index 8a88974..efe8a32 100644
--- a/icu4c/source/test/testdata/rbbitst.txt
+++ b/icu4c/source/test/testdata/rbbitst.txt
@@ -1646,11 +1646,17 @@
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+# •no brk before 2010 •
+<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
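-<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•u30A2•\uFF01\u0020•</data>
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>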
+# •no brk before 2010 except ok after ID •
+<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
diff --git a/icu4j/main/shared/data/icudata.jar b/icu4j/main/shared/data/icudata.jar
index 2e55bab..1183ece 100644
--- a/icu4j/main/shared/data/icudata.jar
+++ b/icu4j/main/shared/data/icudata.jar
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:44e45ef8b72f3a5aaa1dd9e2a7db91d3a4612dc40198d63aa39e099d7a3e4755
-size 13873068
+oid sha256:8c831eb36e00ffcf96d0fecd1076ed8adb1c95ed1733196eb1bde6fb3d41dcc1
+size 13873113
diff --git a/icu4j/main/shared/data/icutzdata.jar b/icu4j/main/shared/data/icutzdata.jar
index 426473f..45fc6cb 100644
--- a/icu4j/main/shared/data/icutzdata.jar
+++ b/icu4j/main/shared/data/icutzdata.jar
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:3465755959812f1cccb9cff48bfa34fd0f7e56388c886bc344dfc2b4c5168f01
-size 96440
+oid sha256:41b28caf401ac7baa8a7cea7a903a7c4b5b5d07c6f90dbfd96341fe6969a8eeb
+size 96439
diff --git a/icu4j/main/shared/data/testdata.jar b/icu4j/main/shared/data/testdata.jar
index ca53c26..cd0adff 100644
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:4aa5bcc6d593b17ca69ab237e7347c69142d9bcab3fdca66956dad8a0e17c4bb
-size 826074
+oid sha256:6f2c9cabe519fa9d30169b730104f2d4d8a18b2c92ec1f257e60b2830bd6b0b8
+size 826073
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
index d0693d5..7d1a025 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_loose_cj.txt
@@ -20,7 +20,8 @@
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * between ID and hyphens 2010 & 2013 (both BA)
+# * before 301C, 30A0 (both NS)
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
# * between characters of LineBreak class IN such as 2026
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
@@ -200,8 +201,10 @@
LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
-LB21.1: . CM* [BA HY NS];
-LB21.2: BB CM* [^CM CB];
+LB21.1: [^BK CR LF NL CM ZW SP CB ID] CM* [BA BAX HY NS];
+LB21.2: ID CM* [BA HY NS];
+LB21.3: CM+ [BA HY NS];
+LB21.4: BB CM* [^CM CB];
LB21b: SY CM* HL;
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
index a270482..a4e1428 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/break_rules/line_normal_cj.txt
@@ -20,7 +20,7 @@
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
# It sets characters of class CJ to behave like ID.
# In addition, it allows breaks:
-# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
+# * before 301C, 30A0 (both NS)
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
type = line;
@@ -28,8 +28,7 @@
AI = [:LineBreak = Ambiguous:];
AL = [:LineBreak = Alphabetic:];
-BAX = [\u2010 \u2013];
-BA = [[:LineBreak = Break_After:] - BAX];
+BA = [:LineBreak = Break_After:];
HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
BB = [:LineBreak = Break_Before:];
BK = [:LineBreak = Mandatory_Break:];
@@ -154,7 +153,7 @@
LB12: GL CM* [^CM];
-LB12a: [^SP BA BAX HY] CM* GL;
+LB12a: [^SP BA HY] CM* GL;
# LB 13 Do not break before ‘]’ or ‘!’ or ‘/’, even after spaces.
LB13.1: [^SP] CM* [CL CP EX SY];
@@ -182,12 +181,9 @@
# Note: Rule 21a must come before 21 to prevent 21.1 from matching HL BA, then
# not picking up the continuing match after the BA from 21a.
-# TODO: For CJ tailorings (with BAX) does this rule want to include BAX? If so,
-# should "HL BAX" not break when followed by a CB? Thats what the current
-# rules do, which is why "[^CM CB]?" includes the ?.
-LB21a: HL CM* (HY | BA | BAX) CM* [^CM CB]?;
+LB21a: HL CM* (HY | BA) CM* [^CM CB]?;
-# DO allow breaks here before $BAXcm and $NSXcm, so don't include them
+# DO allow breaks here before NSX, so don't include it
LB21.1: . CM* [BA HY NS];
LB21.2: BB CM* [^CM CB];
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
index e226e34..efe8a32 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
@@ -1646,11 +1646,17 @@
# •brk OK before 3063 •brk OK before 301C •no brk btw 2026 •no brk before FF01•
<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026\u2026\u0020•\u30A2\uFF01\u0020•</data>
+# •no brk before 2010 •
+<data>•\u3042\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
<locale ja@lb=loose>
<line>
# •brk OK before 3063 •brk OK before 301C •brk OK btw 2026 •brk OK before FF01•
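-<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•u30A2•\uFF01\u0020•</data>
+<data>•\u3084•\u3063•\u3071•\u308A\u0020•\u0031•\u301C\u0020•\u2026•\u2026\u0020•\u30A2•\uFF01\u0020•</data>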
+# •no brk before 2010 except ok after ID •
+<data>•\u3042•\u2010•\u0031\u0020•\u0061\u2010•\u0031\u0020•</data>
+
<locale en@lb=strict>
<line>
# •no brk before 3063 •no brk before 301C•no brk btw 2026 •no brk before FF01•
@@ -1888,7 +1894,7 @@
<line>
#[京都観光]時雨殿に行った。-> [京都•観光]•時雨•殿に•行った。•
<data>•\uff3b\u4eac\u90fd•\u89b3\u5149\uff3d•\u6642\u96e8•\u6bbf\u306b•\u884c\u3063\u305f\u3002•</data>
-#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た
+#9月に東京から友達が遊びに来た -> 9月に•東京から•友達が•遊びに•来た•
<data>•\uff19\u6708\u306b•\u6771\u4eac\u304b\u3089•\u53cb\u9054\u304c•\u904a\u3073\u306b•\u6765\u305f•</data>
#る文字「そうだ、京都」-> る•文字•「そうだ、•京都」•
<data>•\u308b•\u6587\u5b57•\u300c\u305d\u3046\u3060\u3001•\u4eac\u90fd\u300d•</data>