 # Source: main/tests/translit/src/com/ibm/icu/dev/test/translit/langtagRegex.txt - external/github.com/unicode-org/icu - Git at Google

 # Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.
 # Regex for recognizing RFC 4646 well-formed tags
 # http://www.rfc-editor.org/rfc/rfc4646.txt
 # http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21

 # The structure requires no forward references, so it reverses the order.
 # It uses Java/Perl syntax instead of the old ABNF
 # The uppercase comments are fragments copied from RFC 4646

 # Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.

 # Character-class building blocks for the subtag rules below.
 # Only lowercase letters are listed because $root turns on (?i).
 # NOTE(review): the blank inside [a-z 0-9] and [a-w y-z 0-9] is harmless only if
 # the tool compiles the pattern in a whitespace-ignoring mode -- confirm.
 $alpha  = [a-z] ;   # ALPHA
 $digit  = [0-9] ;   # DIGIT
 $alphanum   = [a-z 0-9] ;   # ALPHA / DIGIT
 $x  = x ;   # private use singleton
 $singleton = [a-w y-z 0-9] ; # other singleton -- RFC 4646 allows DIGIT as well as any letter except x
 $s  = [-_] ; # separator -- lenient parsers will use [-_] -- strict will use [-]

 # Now do the components. The structure is slightly different to allow for capturing the right components.
 # The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.

 # Language: either 2-8 letters, or 2-3 letters, a separator and one 3-letter extlang.
 # The alternation is not wrapped in (?: ) here. It is referenced only as ( $language )
 # in $langtag, where the capturing parentheses delimit it
 # (assuming the tool substitutes variables textually -- confirm).
 $language   = $alpha{2,8} | $alpha{2,3} $s $alpha{3};

    # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA  --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8}
    # We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).

 # Note: extlang invalid in Unicode language tags

 # Script: exactly four letters.
 $script = $alpha{4} ;   # 4ALPHA

 # Region: two letters or three digits. Ungrouped alternation, used only as ( $region ) in $langtag.
 $region = $alpha{2} | $digit{3} ;    # 2ALPHA / 3DIGIT

 # Variant: 5-8 alphanumerics, or a digit followed by three alphanumerics.
 # Wrapped in (?: ) because $variantList repeats it.
 $variant    = (?: $alphanum{5,8} | $digit $alphanum{3} ) ;  # 5*8alphanum / (DIGIT 3alphanum)

 # Extension: one singleton followed by one or more separator-prefixed subtags of 2-8 alphanumerics.
 $extension  = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum))

 # Private use: x followed by one or more separator-prefixed subtags of 1-8 alphanumerics.
 $privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum))

 # Define certain grandfathered codes, since otherwise the regex is pretty useless.
 # Since these are limited, this is safe even with later changes to the registry --
 # the only oddity is that it might change the type of the tag, and thus
 # the results from the capturing groups.
 # http://www.iana.org/assignments/language-subtag-registry
 # Note that these have to be compared case insensitively, requiring (?i) below.

 # Fixed list of grandfathered tags, written out literally subtag by subtag.
 # The mixed-case literals (GB, BE, CH) rely on the (?i) flag set in $root.
 # The top-level alternation is not wrapped in (?: ) here. It is referenced only as
 # ( $grandfathered ) in $root, where the capturing parentheses delimit it
 # (assuming the tool substitutes variables textually -- confirm).
 $grandfathered  = en $s GB $s oed
       | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
       | no $s (?: bok | nyn )
       | sgn $s (?: BE $s (?: fr | nl) | CH $s de )
       | zh $s min $s nan;

 # old:         | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
 # For well-formedness, we don't need the ones that would otherwise pass.
 # For validity, they need to be checked.

 # $grandfatheredWellFormed = (?:
 #         art $s lojban
 #     | cel $s gaulish
 #     | zh $s (?: guoyu | hakka | xiang )
 # );

 # Unicode locales: but we are shifting to a compatible form
 # $keyvalue = (?: $alphanum+ \= $alphanum+);
 # $keywords = ($keyvalue (?: \; $keyvalue)*);

 # We separate items that we want to capture as a single group

 # One or more variants (respectively extensions) joined by the separator.
 # Defined as their own variables so that $langtag can capture the whole run in a single group.
 $variantList   = $variant (?: $s $variant )* ; # special for multiples
 $extensionList = $extension (?: $s $extension )* ;   # special for multiples

 # Full language tag: a mandatory language followed by optional parts, each introduced by a separator.
 # Capturing groups in order: 1 language, 2 script, 3 region, 4 variant list,
 # 5 extension list, 6 private use (the referenced variables contain no capturing groups,
 # assuming the tool substitutes variables textually -- confirm).
 # The NN% tokens are not regex syntax.
 # NOTE(review): presumably they are weights that the consuming tool reads or strips -- confirm.
 $langtag = (?: ( $language )
       (?: $s ( $script ) )? 40%
       (?: $s ( $region ) )? 40%
       (?: $s ( $variantList ) )? 10%
       (?: $s ( $extensionList ) )? 5%
       (?: $s ( $privateUse ) )? 5%);

 # Here is the final breakdown, with capturing groups for each of these components
 # The variants, extensions, grandfathered, and private-use may have interior '-'

 # Top-level pattern: a regular language tag, a stand-alone private-use tag, or a grandfathered tag.
 # (?i) makes the whole pattern case-insensitive, which the lowercase character classes
 # and the mixed-case grandfathered literals depend on.
 # After the six groups contributed by $langtag, ( $privateUse ) is group 7 and
 # ( $grandfathered ) is group 8 (assuming textual variable substitution -- confirm).
 # NOTE(review): the NN% tokens are not regex syntax, presumably weights for the consuming tool -- confirm.
 $root = (?i) # case-insensitive
   (?:
       $langtag 90%
     | ( $privateUse ) 5%
     | ( $grandfathered ) 5%)
 #    (?: \@ $keywords )? 5%
     ;
	# Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.
	# Regex for recognizing RFC 4646 well-formed tags
	# http://www.rfc-editor.org/rfc/rfc4646.txt
	# http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21

	# The structure requires no forward references, so it reverses the order.
	# It uses Java/Perl syntax instead of the old ABNF
	# The uppercase comments are fragments copied from RFC 4646

	# Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.

	# Character-class building blocks for the subtag rules below.
	# Only lowercase letters are listed because $root turns on (?i).
	# NOTE(review): the blank inside [a-z 0-9] and [a-w y-z 0-9] is harmless only if
	# the tool compiles the pattern in a whitespace-ignoring mode -- confirm.
	$alpha = [a-z] ; # ALPHA
	$digit = [0-9] ; # DIGIT
	$alphanum = [a-z 0-9] ; # ALPHA / DIGIT
	$x = x ; # private use singleton
	$singleton = [a-w y-z 0-9] ; # other singleton -- RFC 4646 allows DIGIT as well as any letter except x
	$s = [-_] ; # separator -- lenient parsers will use [-_] -- strict will use [-]

	# Now do the components. The structure is slightly different to allow for capturing the right components.
	# The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.

	# Language: either 2-8 letters, or 2-3 letters, a separator and one 3-letter extlang.
	# The alternation must be a bare | (a backslash-escaped pipe would match a literal pipe character).
	# It is not wrapped in (?: ) here. It is referenced only as ( $language ) in $langtag,
	# where the capturing parentheses delimit it
	# (assuming the tool substitutes variables textually -- confirm).
	$language = $alpha{2,8} | $alpha{2,3} $s $alpha{3};

	# ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8}
	# We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).

	# Note: extlang invalid in Unicode language tags

	# Script: exactly four letters.
	$script = $alpha{4} ; # 4ALPHA

	# Region: two letters or three digits. Ungrouped alternation, used only as ( $region ) in $langtag.
	$region = $alpha{2} | $digit{3} ; # 2ALPHA / 3DIGIT

	# Variant: 5-8 alphanumerics, or a digit followed by three alphanumerics.
	# Wrapped in (?: ) because $variantList repeats it.
	$variant = (?: $alphanum{5,8} | $digit $alphanum{3} ) ; # 5*8alphanum / (DIGIT 3alphanum)

	# Extension: one singleton followed by one or more separator-prefixed subtags of 2-8 alphanumerics.
	$extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum))

	# Private use: x followed by one or more separator-prefixed subtags of 1-8 alphanumerics.
	$privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum))

	# Define certain grandfathered codes, since otherwise the regex is pretty useless.
	# Since these are limited, this is safe even with later changes to the registry --
	# the only oddity is that it might change the type of the tag, and thus
	# the results from the capturing groups.
	# http://www.iana.org/assignments/language-subtag-registry
	# Note that these have to be compared case insensitively, requiring (?i) below.

	# Fixed list of grandfathered tags, written out literally subtag by subtag.
	# The mixed-case literals (GB, BE, CH) rely on the (?i) flag set in $root.
	# Alternatives are separated by a bare | (a backslash-escaped pipe would match a literal pipe character).
	# The top-level alternation is not wrapped in (?: ) here. It is referenced only as
	# ( $grandfathered ) in $root, where the capturing parentheses delimit it
	# (assuming the tool substitutes variables textually -- confirm).
	$grandfathered = en $s GB $s oed
	| i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
	| no $s (?: bok | nyn )
	| sgn $s (?: BE $s (?: fr | nl) | CH $s de )
	| zh $s min $s nan;

	# old: | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
	# For well-formedness, we don't need the ones that would otherwise pass.
	# For validity, they need to be checked.

	# $grandfatheredWellFormed = (?:
	# art $s lojban
	# | cel $s gaulish
	# | zh $s (?: guoyu | hakka | xiang )
	# );

	# Unicode locales: but we are shifting to a compatible form
	# $keyvalue = (?: $alphanum+ \= $alphanum+);
	# $keywords = ($keyvalue (?: \; $keyvalue)*);

	# We separate items that we want to capture as a single group

	# One or more variants (respectively extensions) joined by the separator.
	# Defined as their own variables so that $langtag can capture the whole run in a single group.
	$variantList = $variant (?: $s $variant )* ; # special for multiples
	$extensionList = $extension (?: $s $extension )* ; # special for multiples

	# Full language tag: a mandatory language followed by optional parts, each introduced by a separator.
	# Capturing groups in order: 1 language, 2 script, 3 region, 4 variant list,
	# 5 extension list, 6 private use (the referenced variables contain no capturing groups,
	# assuming the tool substitutes variables textually -- confirm).
	# The NN% tokens are not regex syntax.
	# NOTE(review): presumably they are weights that the consuming tool reads or strips -- confirm.
	$langtag = (?: ( $language )
	(?: $s ( $script ) )? 40%
	(?: $s ( $region ) )? 40%
	(?: $s ( $variantList ) )? 10%
	(?: $s ( $extensionList ) )? 5%
	(?: $s ( $privateUse ) )? 5%);

	# Here is the final breakdown, with capturing groups for each of these components
	# The variants, extensions, grandfathered, and private-use may have interior '-'

	# Top-level pattern: a regular language tag, a stand-alone private-use tag, or a grandfathered tag.
	# (?i) makes the whole pattern case-insensitive, which the lowercase character classes
	# and the mixed-case grandfathered literals depend on.
	# The three alternatives are separated by a bare | (a backslash-escaped pipe would match a literal pipe character).
	# After the six groups contributed by $langtag, ( $privateUse ) is group 7 and
	# ( $grandfathered ) is group 8 (assuming textual variable substitution -- confirm).
	# NOTE(review): the NN% tokens are not regex syntax, presumably weights for the consuming tool -- confirm.
	$root = (?i) # case-insensitive
	(?:
	$langtag 90%
	| ( $privateUse ) 5%
	| ( $grandfathered ) 5%)
	# (?: \@ $keywords )? 5%
	;