ICU-6039 Fixed! Along with removing a redundant operation from CharsetUTF8 and porting a new test from icu4c to icu4j. X-SVN-Rev: 22973

commit: d1599f47712aabaa9a37258ad96197c549c78823 [log] [tgz]
author: Andrew J Macheret <ajmacher@svn.icu-project.org> Tue Nov 27 23:29:02 2007 +0000
committer: Andrew J Macheret <ajmacher@svn.icu-project.org> Tue Nov 27 23:29:02 2007 +0000
tree: 1c215c014fdac89a4366bcfb0f9a88b3449442d8
parent: 8cd61ae501f6485d9e6ce711cb24f2dd5a44d75a [diff]
diff --git a/icu4j/src/com/ibm/icu/charset/CharsetASCII.java b/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
index cfc5477..e8777af 100644
--- a/icu4j/src/com/ibm/icu/charset/CharsetASCII.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetASCII.java

@@ -347,10 +347,7 @@
                 }
             } else {
                 fromUChar32 = lead;
-                if (flush)
-                    return CoderResult.malformedForLength(1);
-                else
-                    return CoderResult.UNDERFLOW;
+                return CoderResult.UNDERFLOW;
             }
         }
 

diff --git a/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java b/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java
index 0893d90..9d46c17a 100644
--- a/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetUTF8.java

@@ -112,9 +112,7 @@
                             toUnicodeStatus = char32;
                             mode = bytes;
                             toULength = i;
-                            cr = (flush)
-                                    ? CoderResult.malformedForLength(i)
-                                    : CoderResult.UNDERFLOW;
+                            cr = CoderResult.UNDERFLOW;
                             break;
                         }
                         if (((ch = sourceArray[sourceIndex++]) & 0xc0) != 0x80) {
@@ -232,9 +230,7 @@
                             toUnicodeStatus = char32;
                             mode = bytes;
                             toULength = i;
-                            cr = (flush)
-                                    ? CoderResult.malformedForLength(i)
-                                    : CoderResult.UNDERFLOW;
+                            cr = CoderResult.UNDERFLOW;
                             break;
                         }
                         if (((ch = source.get(sourceIndex++)) & 0xc0) != 0x80) {
@@ -405,7 +401,7 @@
                     }
 
                     /* reach the next char into char32 */
-                    char32 = sourceArray[sourceIndex++] & 0xffff;
+                    char32 = sourceArray[sourceIndex++];
 
                     if (char32 <= 0x7f) {
                         /* 1 byte to encode from char32 */
@@ -504,7 +500,7 @@
                     }
 
                     /* reach the next char into char32 */
-                    char32 = source.get(sourceIndex++) & 0xffff;
+                    char32 = source.get(sourceIndex++);
 
                     if (char32 <= 0x7f) {
                         /* 1 byte to encode from char32 */
@@ -570,7 +566,7 @@
             /* we need to read another char to match up the surrogate stored in char32 */
             if (sourceIndex >= sourceLimit) {
                 fromUChar32 = char32;
-                return (flush) ? CoderResult.malformedForLength(1) : CoderResult.UNDERFLOW;
+                return CoderResult.UNDERFLOW;
             }
 
             try {
@@ -618,7 +614,7 @@
             /* we need to read another char to match up the surrogate stored in char32 */
             if (sourceIndex >= sourceLimit) {
                 fromUChar32 = char32;
-                return (flush) ? CoderResult.malformedForLength(1) : CoderResult.UNDERFLOW;
+                return CoderResult.UNDERFLOW;
             }
 
             try {

diff --git a/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java b/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java
index 8b071f2..6224a47 100644
--- a/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java

@@ -514,7 +514,7 @@
         char[] chars = new char[size[0] + size[1] + size[2]];
         int i = 0;
         int x, y;
-
+        
         // 0 to 1 << 7 (1 byters)
         for (; i < size[0]; i++) {
             bytes[i] = (byte) i;
@@ -628,6 +628,81 @@
     }
     
     
+    public void TestUTF8Surrogates() {
+        byte[][] in = new byte[][] {
+            { (byte)0x61, },
+            { (byte)0xc2, (byte)0x80, },
+            { (byte)0xe0, (byte)0xa0, (byte)0x80, },
+            { (byte)0xf0, (byte)0x90, (byte)0x80, (byte)0x80, },
+            { (byte)0xf4, (byte)0x84, (byte)0x8c, (byte)0xa1, },
+            { (byte)0xf0, (byte)0x90, (byte)0x90, (byte)0x81, },
+        };
+
+        /* expected test results */
+        char[][] results = new char[][] {
+            /* number of bytes read, code point */
+            { '\u0061', },
+            { '\u0080', },
+            { '\u0800', },
+            { '\ud800', '\udc00', },      //  10000
+            { '\udbd0', '\udf21', },      // 104321
+            { '\ud801', '\udc01', },      //  10401
+        };
+
+        /* error test input */
+        byte[][] in2 = new byte[][] {
+            { (byte)0x61, },
+            { (byte)0xc0, (byte)0x80,                                     /* illegal non-shortest form */
+            (byte)0xe0, (byte)0x80, (byte)0x80,                           /* illegal non-shortest form */
+            (byte)0xf0, (byte)0x80, (byte)0x80, (byte)0x80,               /* illegal non-shortest form */
+            (byte)0xc0, (byte)0xc0,                                       /* illegal trail byte */
+            (byte)0xf4, (byte)0x90, (byte)0x80, (byte)0x80,               /* 0x110000 out of range */
+            (byte)0xf8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80,   /* too long */
+            (byte)0xfe,                                                   /* illegal byte altogether */
+            (byte)0x62, },
+        };
+
+        /* expected error test results */
+        char[][] results2 = new char[][] {
+            /* number of bytes read, code point */
+            { '\u0062', },
+            { '\u0062', },
+        };
+        
+        String converter = "UTF-8";
+        CharsetProvider icu = new CharsetProviderICU();
+        Charset icuChar = icu.charsetForName(converter);
+        CharsetDecoder decoder = icuChar.newDecoder();
+        
+        int i;
+        try {
+            for (i = 0; i < in.length; i++) {
+                ByteBuffer source = ByteBuffer.wrap(in[i]);
+                CharBuffer expected = CharBuffer.wrap(results[i]);
+                smBufDecode(decoder, converter, source, expected, true, false,
+                        true);
+                smBufDecode(decoder, converter, source, expected, true, false,
+                        false);
+            }
+        } catch (Exception ex) {
+            errln("Incorrect result in " + converter);
+        }
+        try {
+            for (i = 0; i < in2.length; i++) {
+                ByteBuffer source = ByteBuffer.wrap(in2[i]);
+                CharBuffer expected = CharBuffer.wrap(results2[i]);
+                decoder.onMalformedInput(CodingErrorAction.IGNORE);
+                smBufDecode(decoder, converter, source, expected, true, false,
+                        true);
+                smBufDecode(decoder, converter, source, expected, true, false,
+                        false);
+            }
+        } catch (Exception ex) {
+            errln("Incorrect result in " + converter);
+        }
+    }
+    
+    
 //    public void TestCharsetCallback() {
 //        String currentTest = "initialization";
 //        try {
commit	d1599f47712aabaa9a37258ad96197c549c78823	[log] [tgz]
author	Andrew J Macheret <ajmacher@svn.icu-project.org>	Tue Nov 27 23:29:02 2007 +0000
committer	Andrew J Macheret <ajmacher@svn.icu-project.org>	Tue Nov 27 23:29:02 2007 +0000
tree	1c215c014fdac89a4366bcfb0f9a88b3449442d8
parent	8cd61ae501f6485d9e6ce711cb24f2dd5a44d75a [diff]